Bioconductor RNA-Seq workflow
-Michael Love, dept. biostat., HSPH/DFCI
- preparing gene models
- read counting
- EDA (exploratory data analysis)
- differential expression analysis
- annotating results
slides and code:
-Michael Love, dept. biostat., HSPH/DFCI
slides and code:
library( "GenomicFeatures" )
# takes ~10 min
txdb <- makeTranscriptDbFromBiomart( biomart="ensembl",
dataset="hsapiens_gene_ensembl" )
smart to use saveDb()
to only do this once
makeTranscriptDbFromGFF()
accepts GTF fileslibrary(TxDb.Hsapiens.UCSC.hg19.knownGene)
ready to goAnnotationHub
will offer ready to go# takes ~30 seconds
exonsByGene <- exonsBy( txdb, by="gene" )
exonsByGene[[ 1 ]][ 1:4 ]
## GRanges with 4 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X [99883667, 99884983] - | 664095 ENSE00001459322
## [2] X [99885756, 99885863] - | 664096 ENSE00000868868
## [3] X [99887482, 99887565] - | 664097 ENSE00000401072
## [4] X [99887538, 99887565] - | 664098 ENSE00001849132
## ---
## seqlengths:
## 1 2 ... LRG_99
## 249250621 243199373 ... 13294
To convert from chrX
to X
and back:
seqlevelsStyle(gr) <- "NCBI"
seqlevelsStyle(gr) <- "UCSC"
yieldSize
for controlling memory:
samples <- read.csv( "sample_data.csv" )
fls <- samples$filename
library( "Rsamtools" )
bamLst <- BamFileList( fls, yieldSize=2000000 )
library( "GenomicAlignments" )
register( MulticoreParam( workers=4 ) )
# takes e.g. ~30 minutes per sample for 40 million PE reads
se <- summarizeOverlaps( features=exonsByGene,
reads=bamLst,
mode="Union",
singleEnd=FALSE,
ignore.strand=TRUE,
fragments=TRUE )
featureCounts()
metadata( rowData( se ) )[[ 1 ]][ 1:6 ]
## $`Db type`
## [1] "TranscriptDb"
##
## $`Supporting package`
## [1] "GenomicFeatures"
##
## $`Data source`
## [1] "BioMart"
##
## $Organism
## [1] "Homo sapiens"
##
## $`Resource URL`
## [1] "www.biomart.org:80"
##
## $`BioMart database`
## [1] "ensembl"
DataFrame
is a powerful data frame defined in IRanges
:
colData( se ) <- DataFrame( samples )
dds <- DESeqDataSet( se, ~ group + condition )
rld <- rlog( dds )
plotPCA( rld, intgroup="condition" )
\[ K_{ij} \sim \text{NB}( \mu_{ij}, \alpha_i ) \]
\[ \mu_{ij} = s_{ij} q_{ij} \]
\[ \log q_{ij} = \sum_r x_{jr} \beta_{ir} \]
\[ \begin{array}{c} \log q_1 \\ \log q_2 \\ \log q_3 \\ \log q_4 \end{array} = \left( \begin{array}{cc} 1 & 0 \\ 1 & 0 \\ 1 & 1 \\ 1 & 1 \end{array} \right) \left( \begin{array}{c} \beta_0 \\ \beta_1 \end{array} \right) \]
\[ \begin{array}{c} \log q_1 \\ \log q_2 \\ \log q_3 \\ \log q_4 \\ \log q_5 \\ \log q_6 \end{array} = \left( \begin{array}{cccc} 1 & 1 & 0 & 0 \\ 1 & 1 & 0 & 0 \\ 1 & 0 & 1 & 0 \\ 1 & 0 & 1 & 0 \\ 1 & 0 & 0 & 1 \\ 1 & 0 & 0 & 1 \\ \end{array} \right) \left( \begin{array}{c} \beta_0 \\ \beta_1 \\ \beta_2 \\ \beta_3 \end{array} \right) \]
# takes e.g. ~25 seconds for 8 samples, 35,000 genes
dds <- DESeq( dds )
res <- results( dds )
res <- results( dds, contrast=c("condition","trt","untrt") )
plotMA( res )
gene <- rownames( res )[ order( res$pvalue ) ][ 1 ]
plotCounts( dds, gene, "condition" )
in DESeq2 >= v1.5
-cqn package vignette
return a matrix with columns which are surrogate variables
rprt <- HTMLReport(shortName = "analysis",
title = "RNA-Seq analysis",
reportDirectory = "./reports")
publish(res, rprt,
DataSet=dds,
annotation.db="org.Hs.eg.db")
finish(rprt)
select()
works with the annotation packages org.Hs.eg.db
:tab <- select(org.Hs.eg.db, genes, "SYMBOL", "ENSEMBL")
browseVignettes("pkg")
?function