R rna-seq workflow

library(“airway”)
indir <- system.file(“extdata”, package=“airway”, mustWork=TRUE)
list.files(indir)
csvfile <- file.path(indir, “sample_table.csv”)
sampleTable <- read.csv(csvfile, row.names = 1)

filenames <- file.path(indir, paste0(sampleTable$Run, “_subset.bam”))
file.exists(filenames)
library(“Rsamtools”)
bamfiles <- BamFileList(filenames, yieldSize=2000000)
library(“GenomicFeatures”)

gtffile <- file.path(indir,“Homo_sapiens.GRCh37.75_subset.gtf”)
txdb <- makeTxDbFromGFF(gtffile, format = “gtf”, circ_seqs = character())
ebg <- exonsBy(txdb, by=“gene”)
library(“GenomicAlignments”)
library(“BiocParallel”)
se <- summarizeOverlaps(features=ebg, reads=bamfiles,
mode=“Union”,
singleEnd=FALSE,
ignore.strand=TRUE,
fragments=TRUE )

data(“airway”)
se <- airway
se$dex %<>% relevel(“untrt”)
library(“DESeq2”)
dds <- DESeqDataSet(se, design = ~ cell + dex)
countdata <- assay(se)
coldata <- colData(se)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ cell + dex)

dds <- dds[ rowSums(counts(dds)) > 1, ]
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library(“vsn”)
meanSdPlot(cts, ranks = FALSE)

log.cts.one <- log2(cts + 1)
meanSdPlot(log.cts.one, ranks = FALSE)

vsd <- vst(dds, blind = FALSE)
rld <- rlog(dds, blind = FALSE)
library(“dplyr”)
library(“ggplot2”)

dds <- estimateSizeFactors(dds)

df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = “log2(x + 1)”),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = “vst”),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = “rlog”))

colnames(df)[1:2] <- c(“x”, “y”)

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)

sampleDists <- dist(t(assay(vsd)))

library(“pheatmap”)
library(“RColorBrewer”)
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd $dex, vsd$ cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, “Blues”)) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)

library(“PoiClaClu”)
poisd <- PoissonDistance(t(counts(dds)))

samplePoisDistMatrix <- as.matrix( poisd $dd ) rownames(samplePoisDistMatrix) <- paste( dds$ dex, dds $cell, sep=" - " ) colnames(samplePoisDistMatrix) <- NULL pheatmap(samplePoisDistMatrix, clustering_distance_rows = poisd$ dd,
clustering_distance_cols = poisd$dd,
col = colors)

pcaData <- plotPCA(vsd, intgroup = c( “dex”, “cell”), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, “percentVar”))
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], “% variance”)) +
ylab(paste0("PC2: ", percentVar[2], “% variance”)) +
coord_fixed()

mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = 1, y = 2, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed()

mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = 1, y = 2, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed()

dds <- DESeq(dds)
res <- results(dds)
res <- results(dds, contrast=c(“dex”,“trt”,“untrt”))
mcols(res, use.names = TRUE)

res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)

library(“ggbeeswarm”)
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c(“dex”,“cell”),
returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
scale_y_log10() + geom_beeswarm(cex = 3)

ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
scale_y_log10() + geom_point(size = 3) + geom_line()

library(“apeglm”)
res <- lfcShrink(dds, coef=“dex_trt_vs_untrt”, type=“apeglm”)
plotMA(res, ylim = c(-5, 5))

res.noshr <- results(dds, name=“dex_trt_vs_untrt”)
plotMA(res.noshr, ylim = c(-5, 5))
plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col=“dodgerblue”, cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col=“dodgerblue”)
})

hist(res $pvalue[res$ baseMean > 1], breaks = 0:20/20,
col = “grey50”, border = “white”)

library(“genefilter”)
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)

mat <- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c(“cell”,“dex”)])
pheatmap(mat, annotation_col = anno)

qs <- c(0, quantile(resLFC1 $baseMean[resLFC1$ baseMean > 0], 0:6/6))
bins <- cut(resLFC1 $baseMean, qs) levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2))) fractionSig <- tapply(resLFC1$ pvalue, bins, function§
mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = “mean normalized count”,
ylab = “fraction of small p values”)

library(“AnnotationDbi”)
library(“org.Hs.eg.db”)

res $symbol <- mapIds(org.Hs.eg.db, keys=row.names(res), column="SYMBOL", keytype="ENSEMBL", multiVals="first") res$ entrez <- mapIds(org.Hs.eg.db,
keys=row.names(res),
column=“ENTREZID”,
keytype=“ENSEMBL”,
multiVals=“first”)

resOrdered <- res[order(res$pvalue),]

resOrderedDF <- as.data.frame(resOrdered)[1:100, ]
write.csv(resOrderedDF, file = “results.csv”)

library(“ReportingTools”)
htmlRep <- HTMLReport(shortName=“report”, title=“My report”,
reportDirectory="./report")
publish(resOrderedDF, htmlRep)
url <- finish(htmlRep)
browseURL(url)

resGR <- results(dds, name=“dex_trt_vs_untrt”, format=“GRanges”)
resGR $log2FoldChange <- res$ log2FoldChange
resGR $symbol <- mapIds(org.Hs.eg.db, names(resGR), "SYMBOL", "ENSEMBL") library("Gviz") window <- resGR[topGene] + 1e6 strand(window) <- "*" resGRsub <- resGR[resGR %over% window] naOrDup <- is.na(resGRsub$ symbol) | duplicated(resGRsub $symbol) resGRsub$ group <- ifelse(naOrDup, names(resGRsub), resGRsub $symbol) status <- factor(ifelse(resGRsub$ padj < 0.1 & !is.na(resGRsub$padj),
“sig”, “notsig”))

options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = “gene ranges”, feature = status)
d <- DataTrack(resGRsub, data = “log2FoldChange”, baseline = 0,
type = “h”, name = “log2 fold change”, strand = “+”)
plotTracks(list(g, d, a), groupAnnotation = “group”,
notsig = “grey”, sig = “hotpink”)

library(“sva”)

dat <- counts(dds, normalized = TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)

par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(svseq $sv[, i] ~ dds$ cell, vertical = TRUE, main = paste0(“SV”, i))
abline(h = 0)
}

ddssva <- dds
ddssva $SV1 <- svseq$ sv[,1]
ddssva $SV2 <- svseq$ sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex
library(“fission”)
data(“fission”)
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)
ddsTC <- DESeq(ddsTC, test=“LRT”, reduced = ~ strain + minute)
resTC <- results(ddsTC)
resTC $symbol <- mcols(ddsTC)$ symbol
head(resTC[order(resTC$padj),], 4)

fiss <- plotCounts(ddsTC, which.min(resTC $padj), intgroup = c("minute","strain"), returnData = TRUE) fiss$ minute <- as.numeric(as.character(fiss $KaTeX parse error: Double subscript at position 154: \dotse") + scale_y_̲log10() res30 <\dots$ padj),]
betas <- coef(ddsTC)

topGenes <- head(order(resTC$padj),20)
mat <- betas[topGenes, -c(1,2)]
thr <- 3
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101),
cluster_col=FALSE)

qq_39306047

发布了27 篇原创文章 · 获赞 3 · 访问量 1万+

私信关注

猜你喜欢