library(ggplot2)
library(reshape)
library(gplots)
library(edgeR)
library(CHBUtils)
project_summary = "/Volumes/Clotho/Users/rory/cache/bcbio.rnaseq/resources/project/project-summary.csv"
counts_file = "/Volumes/Clotho/Users/rory/cache/bcbio.rnaseq/resources/project/combined.counts"
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
"#D55E00", "#CC79A7")
summarydata = read.table(project_summary, header = TRUE, sep = ",")
rownames(summarydata) = summarydata$Name
summarydata = summarydata[order(rownames(summarydata)), ]
counts = read.table(counts_file, header = TRUE, row.names = "id")
counts = counts[, order(colnames(counts))]
ggplot(summarydata, aes(x = Name, y = Mapped)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("mapped reads") + xlab("")
ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("mapping rate") + xlab("")
ggplot(summarydata, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
xlab("")
ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
xlab("")
ggplot(summarydata, aes(x = Name, y = rRNA.rate)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") + xlab("")
ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) + ylab("fragment length") +
xlab("")
melted = melt(counts)
colnames(melted) = c("sample", "count")
melted$sample = factor(melted$sample)
melted$sample = reorder(melted$sample, colnames(counts))
melted$count = log(melted$count)
ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + xlab("")
Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
y = DGEList(counts = counts)
y = calcNormFactors(y)
normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
melted = melt(normalized_counts)
colnames(melted) = c("gene", "sample", "count")
melted$sample = factor(melted$sample)
melted$sample = reorder(melted$sample, colnames(counts))
melted$count = log(melted$count)
ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + xlab("")
select = order(rowMeans(counts), decreasing = TRUE)[1:30]
p = ggheatmap(as.matrix(counts[select, ]))
ggheatmap.show(p)
library(DESeq2)
library(vsn)
design = ~panel
condition = "panel"
dataset = "hsapiens_gene_ensembl"
filter = "ensembl_gene_id"
symbol = "hgnc_symbol"
dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = design)
dds = DESeq(dds)
contrasts = combn(levels(summarydata[, condition]), 2, simplify = FALSE)
par(mfrow = c(1, 3))
notAllZero <- (rowSums(counts(dds)) > 0)
rld <- rlog(dds)
vsd <- varianceStabilizingTransformation(dds)
rlogMat <- assay(rld)
vstMat <- assay(vsd)
meanSdPlot(log2(counts(dds, normalized = TRUE)[notAllZero, ] + 1), ylim = c(0,
2.5))
meanSdPlot(assay(rld[notAllZero, ]), ylim = c(0, 2.5))
meanSdPlot(assay(vsd[notAllZero, ]), ylim = c(0, 2.5))
handle_deseq2 = function(dds, summarydata, column) {
all_combs = combn(levels(summarydata[, column]), 2, simplify = FALSE)
all_results = list()
contrast_strings = list()
for (comb in all_combs) {
contrast_string = paste(comb, collapse = " vs ")
contrast = c(column, comb)
res = results(dds, contrast = contrast)
res = res[order(res$padj), ]
all_results = c(all_results, res)
contrast_strings = c(contrast_strings, contrast_string)
}
names(all_results) = contrast_strings
return(all_results)
}
annotate_df = function(df, dataset, filter, symbol) { require(biomaRt) ensembl = useMart('ensembl', dataset = dataset) annot.df = getBM(attributes=c(filter, symbol), filters=c(filter), values=rownames(df), mart=ensembl) m = merge(df, annot.df, by.x="row.names", by.y=filter) colnames(m)[1] = "id" return(m) }
all_results = handle_deseq2(dds, summarydata, condition)
for (i in seq(length(all_results))) {
plotMA(all_results[[i]])
title(paste("MA plot for contrast", names(all_results)[i]))
}
for (i in seq(length(all_results))) {
cat(paste("Lowest adjusted p-value hits for", names(all_results)[i]))
# out_df = annotate_df(data.frame(all_results[[i]]), dataset, filter,
# symbol)
out_df = all_results[[i]]
knitr::kable(head(out_df))
write.table(out_df, file = paste(names(all_results)[i], ".tsv", sep = ""),
quote = FALSE, sep = "\t", row.names = TRUE, col.names = TRUE)
cat("\n")
}
Lowest adjusted p-value hits for HBRR vs UHRR
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
AAK1 | 1633.3 | 2.629 | 0.0525 | 50.02 | 0 | 0 |
ABAT | 2156.1 | 3.732 | 0.0699 | 53.42 | 0 | 0 |
ABLIM1 | 4101.2 | 3.199 | 0.0406 | 78.78 | 0 | 0 |
ACSL6 | 1178.5 | 6.862 | 0.1514 | 45.33 | 0 | 0 |
ADAM11 | 920.7 | 4.782 | 0.1034 | 46.23 | 0 | 0 |
ADAM22 | 1576.4 | 4.372 | 0.0727 | 60.17 | 0 | 0 |