# R RNA-seq workflow

# Locate the example input files shipped in the extdata directory of the
# "airway" package and read the sample table describing each run.
# (Typographic quotes from the original paste are replaced with plain ASCII
# quotes; R cannot parse the curly ones.)
library("airway")
indir <- system.file("extdata", package = "airway", mustWork = TRUE)
list.files(indir)
csvfile <- file.path(indir, "sample_table.csv")
sampleTable <- read.csv(csvfile, row.names = 1)

# One BAM path per run, named <Run>_subset.bam; report which of them exist.
filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam"))
file.exists(filenames)

# Wrap the paths in a BamFileList, with yieldSize set to 2,000,000.
# NOTE(review): yieldSize presumably caps records read per chunk -- confirm
# in ?BamFile.
library("Rsamtools")
bamfiles <- BamFileList(filenames, yieldSize = 2000000)
# Build a transcript database from the GTF annotation and group its exons by
# gene; these per-gene exon sets are the features used for counting below.
library("GenomicFeatures")

gtffile <- file.path(indir, "Homo_sapiens.GRCh37.75_subset.gtf")
# circ_seqs = character(): pass an empty set of circular sequence names.
txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character())
ebg <- exonsBy(txdb, by = "gene")

# Summarize the BAM files against the per-gene exons.
# singleEnd = FALSE treats the input as paired-end; ignore.strand = TRUE
# disregards strand when matching reads to features.
# NOTE(review): fragments = TRUE presumably also counts reads without a
# mapped mate -- confirm in ?summarizeOverlaps.
library("GenomicAlignments")
library("BiocParallel")
se <- summarizeOverlaps(
  features = ebg,
  reads = bamfiles,
  mode = "Union",
  singleEnd = FALSE,
  ignore.strand = TRUE,
  fragments = TRUE
)

# Load the pre-computed "airway" dataset and use it as `se` from here on;
# this overwrites any `se` assigned earlier in the script.
data("airway")
se <- airway

# Make "untrt" the reference level of the dex factor. Plain assignment is used
# instead of the magrittr compound-assignment pipe (%<>%), because magrittr is
# not attached at this point in the script.
se$dex <- relevel(se$dex, "untrt")

# Build the DESeq2 data set with cell and dex as design terms.
library("DESeq2")
dds <- DESeqDataSet(se, design = ~ cell + dex)

# Alternative construction from a bare count matrix plus column annotation.
countdata <- assay(se)
coldata <- colData(se)
ddsMat <- DESeqDataSetFromMatrix(
  countData = countdata,
  colData = coldata,
  design = ~ cell + dex
)

# Keep only rows whose counts sum to more than 1 across all samples.
dds <- dds[rowSums(counts(dds)) > 1, ]
# Simulate Poisson counts to compare mean/SD behaviour before and after a
# log2(x + 1) transform.
# lambda holds 1000 rates spaced evenly on a log10 scale from 0.1 to 100.
# `length.out` is spelled out rather than relying on partial matching.
lambda <- 10^seq(from = -1, to = 2, length.out = 1000)

# rpois() recycles the 1000 rates over 100,000 draws and matrix() fills
# column-wise, so row i of `cts` is drawn with rate lambda[i] in every column.
cts <- matrix(rpois(1000 * 100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

# Same plot after adding a pseudocount of 1 and taking log2.
log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

# Apply the two DESeq2 transformations (vst and rlog) with blind = FALSE.
vsd <- vst(dds, blind = FALSE)
rld <- rlog(dds, blind = FALSE)
library("dplyr")
library("ggplot2")

# Size factors are required for counts(dds, normalized = TRUE) below.
dds <- estimateSizeFactors(dds)

# Stack the first two samples under each transformation into one long table.
# as_tibble() replaces the deprecated as_data_frame().
df <- bind_rows(
  as_tibble(log2(counts(dds, normalized = TRUE)[, 1:2] + 1)) %>%
    mutate(transformation = "log2(x + 1)"),
  as_tibble(assay(vsd)[, 1:2]) %>%
    mutate(transformation = "vst"),
  as_tibble(assay(rld)[, 1:2]) %>%
    mutate(transformation = "rlog")
)

# The two sample columns become the plot axes.
colnames(df)[1:2] <- c("x", "y")

# Hex-binned scatter of sample 1 vs sample 2, one panel per transformation.
ggplot(df, aes(x = x, y = y)) +
  geom_hex(bins = 80) +
  coord_fixed() +
  facet_grid(. ~ transformation)

# Distances between samples, computed by dist() on the transposed
# VST-transformed assay so that samples are the rows.
sampleDists <- dist(t(assay(vsd)))

library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
# Label rows as "<dex> - <cell>"; the garbled `$` accessors are restored.
rownames(sampleDistMatrix) <- paste(vsd$dex, vsd$cell, sep = " - ")
colnames(sampleDistMatrix) <- NULL
# 255-step palette from the reversed 9-colour "Blues" scheme.
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

# Pass the dist object itself for clustering so rows and columns are ordered
# by the same distances that are displayed. `color` is spelled out rather
# than relying on partial matching of `col`.
pheatmap(
  sampleDistMatrix,
  clustering_distance_rows = sampleDists,
  clustering_distance_cols = sampleDists,
  color = colors
)

# Poisson distance between samples, computed on the transposed counts so that
# samples are the rows.
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))

# The distances are in the `dd` element of the result. The statements below
# were collapsed onto one garbled line in the original paste and are restored.
samplePoisDistMatrix <- as.matrix(poisd$dd)
rownames(samplePoisDistMatrix) <- paste(dds$dex, dds$cell, sep = " - ")
colnames(samplePoisDistMatrix) <- NULL

# `colors` and pheatmap() come from the sample-distance heatmap step earlier
# in the script.
pheatmap(
  samplePoisDistMatrix,
  clustering_distance_rows = poisd$dd,
  clustering_distance_cols = poisd$dd,
  color = colors
)

# PCA of the VST data. returnData = TRUE yields the data used for the plot,
# with the per-component variance stored in the "percentVar" attribute.
pcaData <- plotPCA(vsd, intgroup = c("dex", "cell"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

# Custom plot: colour by dex, shape by cell, axes labelled with % variance.
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed()

# Multidimensional scaling of the sample distances. cbind() of a data frame
# with the unnamed two-column cmdscale() matrix yields columns named `1` and
# `2`, which must be back-quoted inside aes(). Without the backticks, x = 1
# and y = 2 are constants and every point is drawn at (1, 2).
mds <- as.data.frame(colData(vsd)) %>%
  cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) +
  coord_fixed()

# Same plot using the Poisson distance matrix.
mdsPois <- as.data.frame(colData(dds)) %>%
  cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
  geom_point(size = 3) +
  coord_fixed()

# Run the DESeq2 differential expression pipeline on the data set.
dds <- DESeq(dds)

# Default results table, then the same call with the contrast stated
# explicitly (dex: trt vs untrt); the second assignment replaces the first.
res <- results(dds)
res <- results(dds, contrast = c("dex", "trt", "untrt"))
# Show the metadata attached to the result columns.
mcols(res, use.names = TRUE)

# Results computed with alpha = 0.05, tabulated by whether the adjusted
# p-value falls below 0.05.
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)

# Plot the counts of a single gene across samples.
# `topGene` is not assigned anywhere above this point in the script, so it is
# defined here as the row of `res` with the smallest adjusted p-value.
topGene <- rownames(res)[which.min(res$padj)]

library("ggbeeswarm")
# returnData = TRUE returns the data frame instead of drawing the plot.
geneCounts <- plotCounts(
  dds,
  gene = topGene,
  intgroup = c("dex", "cell"),
  returnData = TRUE
)

# Beeswarm of counts (log10 y-axis) by dex, coloured by cell.
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
  scale_y_log10() +
  geom_beeswarm(cex = 3)

# Same data with one line per cell joining its points across dex levels.
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
  scale_y_log10() +
  geom_point(size = 3) +
  geom_line()

# Shrink the log2 fold changes for the dex_trt_vs_untrt coefficient with the
# "apeglm" method; `res` now holds the shrunken estimates.
library("apeglm")
res <- lfcShrink(dds, coef = "dex_trt_vs_untrt", type = "apeglm")
plotMA(res, ylim = c(-5, 5))

# MA plot of the unshrunken results for comparison.
res.noshr <- results(dds, name = "dex_trt_vs_untrt")
plotMA(res.noshr, ylim = c(-5, 5))

# Redraw the shrunken MA plot and mark the gene with the smallest adjusted
# p-value with a circle and a text label.
plotMA(res, ylim = c(-5, 5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col = "dodgerblue", cex = 2, lwd = 2)
  text(baseMean, log2FoldChange, topGene, pos = 2, col = "dodgerblue")
})

# Histogram of p-values for rows with baseMean greater than 1, in 20
# equal-width bins on [0, 1]. The garbled `$` accessors are restored.
hist(
  res$pvalue[res$baseMean > 1],
  breaks = 0:20 / 20,
  col = "grey50",
  border = "white"
)

# Heatmap of the 20 rows with the highest variance in the VST data.
library("genefilter")
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)

mat <- assay(vsd)[topVarGenes, ]
# Centre each row on its mean so the heatmap shows deviations from it.
mat <- mat - rowMeans(mat)
# Annotate heatmap columns with the cell and dex columns of colData.
anno <- as.data.frame(colData(vsd)[, c("cell", "dex")])
pheatmap(mat, annotation_col = anno)

# Fraction of small p-values within bins of mean normalized count.
# `resLFC1` is not assigned anywhere earlier in the visible script.
# NOTE(review): it is defined here as results with lfcThreshold = 1, as the
# name suggests -- confirm this matches the intended analysis.
resLFC1 <- results(dds, lfcThreshold = 1)

# Bin edges: 0 plus the 0/6 .. 6/6 quantiles of the positive baseMean values.
qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6 / 6))
bins <- cut(resLFC1$baseMean, qs)
# Label each bin by the midpoint of its edges, rounded to 2 significant digits.
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)]) / 2, 2)))
# Per-bin share of p-values below 0.05, ignoring NA p-values.
fractionSig <- tapply(
  resLFC1$pvalue,
  bins,
  function(p) mean(p < .05, na.rm = TRUE)
)
barplot(
  fractionSig,
  xlab = "mean normalized count",
  ylab = "fraction of small p values"
)

# Add gene symbol and Entrez ID columns to `res`, looked up from the row names
# (used as ENSEMBL keys) in the org.Hs.eg.db annotation package.
library("AnnotationDbi")
library("org.Hs.eg.db")

# multiVals = "first": keep the first match when a key maps to several values.
res$symbol <- mapIds(
  org.Hs.eg.db,
  keys = row.names(res),
  column = "SYMBOL",
  keytype = "ENSEMBL",
  multiVals = "first"
)
res$entrez <- mapIds(
  org.Hs.eg.db,
  keys = row.names(res),
  column = "ENTREZID",
  keytype = "ENSEMBL",
  multiVals = "first"
)

# Sort results by p-value (ascending).
resOrdered <- res[order(res$pvalue), ]

# Keep the top 100 rows. head() is used instead of [1:100, ] so that a table
# with fewer than 100 rows is not padded with all-NA rows.
resOrderedDF <- head(as.data.frame(resOrdered), 100)
write.csv(resOrderedDF, file = "results.csv")

# Publish the same table as an HTML report under ./report and open it in the
# browser.
library("ReportingTools")
htmlRep <- HTMLReport(
  shortName = "report",
  title = "My report",
  reportDirectory = "./report"
)
publish(resOrderedDF, htmlRep)
url <- finish(htmlRep)
browseURL(url)

# Results in GRanges format, so each row carries genomic coordinates.
# Several statements below were collapsed onto garbled lines in the original
# paste and are restored here.
resGR <- results(dds, name = "dex_trt_vs_untrt", format = "GRanges")
# Replace the fold changes with the values currently held in `res`.
resGR$log2FoldChange <- res$log2FoldChange
resGR$symbol <- mapIds(org.Hs.eg.db, names(resGR), "SYMBOL", "ENSEMBL")

library("Gviz")
# Window of 1e6 bases on either side of the top gene, on any strand.
window <- resGR[topGene] + 1e6
strand(window) <- "*"
resGRsub <- resGR[resGR %over% window]
# Label each range by symbol, falling back to its name when the symbol is
# missing or duplicated.
naOrDup <- is.na(resGRsub$symbol) | duplicated(resGRsub$symbol)
resGRsub$group <- ifelse(naOrDup, names(resGRsub), resGRsub$symbol)
# "sig" when padj is available and below 0.1, otherwise "notsig".
status <- factor(ifelse(resGRsub$padj < 0.1 & !is.na(resGRsub$padj),
                        "sig", "notsig"))

options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = "gene ranges", feature = status)
d <- DataTrack(
  resGRsub,
  data = "log2FoldChange",
  baseline = 0,
  type = "h",
  name = "log2 fold change",
  strand = "+"
)
# notsig / sig give the colours for the two `status` feature levels.
plotTracks(
  list(g, d, a),
  groupAnnotation = "group",
  notsig = "grey",
  sig = "hotpink"
)

# Estimate two surrogate variables with svaseq() from the normalized counts.
library("sva")

dat <- counts(dds, normalized = TRUE)
# Keep rows whose mean normalized count exceeds 1.
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
# Full model (with dex) and null model (intercept only).
mod <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)

# One strip chart per surrogate variable, split by cell, with a zero line.
par(mfrow = c(2, 1), mar = c(3, 5, 3, 1))
for (i in seq_len(2)) {
  stripchart(svseq$sv[, i] ~ dds$cell, vertical = TRUE, main = paste0("SV", i))
  abline(h = 0)
}

# Copy the data set, attach the surrogate variables as columns, and put them
# in the design ahead of dex.
ddssva <- dds
ddssva$SV1 <- svseq$sv[, 1]
ddssva$SV2 <- svseq$sv[, 2]
design(ddssva) <- ~ SV1 + SV2 + dex
# Time-course analysis on the "fission" dataset: likelihood ratio test of the
# full model (with the strain:minute interaction) against the reduced model
# without it.
library("fission")
data("fission")
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)
ddsTC <- DESeq(ddsTC, test = "LRT", reduced = ~ strain + minute)
resTC <- results(ddsTC)
resTC$symbol <- mcols(ddsTC)$symbol
head(resTC[order(resTC$padj), ], 4)

# Counts over time for the gene with the smallest adjusted p-value.
fiss <- plotCounts(
  ddsTC,
  which.min(resTC$padj),
  intgroup = c("minute", "strain"),
  returnData = TRUE
)
# Convert minute to numeric via character, not directly from the factor.
fiss$minute <- as.numeric(as.character(fiss$minute))

# NOTE(review): the original line was partly destroyed by a rendering error
# ("KaTeX parse error"). The ggplot call and the res30 statements below are a
# reconstruction from the surviving fragments (`... + scale_y_log10()`,
# `res30 <...padj),]`) -- confirm against the intended analysis.
ggplot(fiss, aes(x = minute, y = count, color = strain, group = strain)) +
  geom_point() +
  stat_summary(fun = mean, geom = "line") +
  scale_y_log10()

# List the available coefficient names, then extract one with a Wald test.
# NOTE(review): the coefficient name "strainmut.minute30" is assumed; verify
# it against the resultsNames() output.
resultsNames(ddsTC)
res30 <- results(ddsTC, name = "strainmut.minute30", test = "Wald")
res30[which.min(resTC$padj), ]
# Heatmap of model coefficients for the 20 rows with the smallest adjusted
# p-values in the time-course results.
betas <- coef(ddsTC)

topGenes <- head(order(resTC$padj), 20)
# Drop the first two coefficient columns.
mat <- betas[topGenes, -c(1, 2)]
# Clamp values to [-thr, thr] so extreme coefficients do not dominate the
# colour scale.
thr <- 3
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
# Argument names are spelled out in full (`length.out`, `cluster_cols`)
# rather than relying on partial matching. Columns keep their original order.
pheatmap(
  mat,
  breaks = seq(from = -thr, to = thr, length.out = 101),
  cluster_cols = FALSE
)

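# Blog page footer (translated from Chinese):
#   The author has published 27 original articles · 3 likes · 10k+ views
#
#   You may also like
#
#   Reposted from blog.csdn.net/qq_39306047/article/details/90636755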