--- title: "Classification of QTL from RR approach" author: "Malachy Campbell" date: "8/7/2018" output: rmdformats::html_clean: fig_width: 6 fig_height: 6 highlight: kate thumbnails: true lightbox: true gallery: true --- ```{r setup, include=FALSE, echo = F, eval = T} knitr::opts_knit$set(root.dir = '/Users/malachycampbell/Documents/Dropbox/Work/Manuscripts/2018_RandomRegressionGWAS/ThePlantGenome/Revision/New Analysis/') ``` # Background Here, we'll classify QTLs into persistent QTL (QTL detected at all 20 time points), long-duration QTL (those with significant associations at more than 12, but less than 20 time points), mid-duration (QTL with associations at 6 - 12 time points), and short-duration QTL (those with associations at fewer than 6 time points). Significant SNPs from the RR approach are selected, and those within a 200 kb window are merged to a single QTL with the most signifiacnt SNP at each time point selected to represent the QTL. # QTL identification using bedtools To runbedtools we need to generate a .BED file for the signifcant SNPs. ```{r make bed files, echo = T, eval = F} library(reshape2) TP.p <- as.matrix( read.table("DataAndCode/CompCluster/TP/pval.mat.txt", sep = "\t", header = T, row.names = 1) ) RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) ) MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t") row.names(TP.p) <- MAP$V2 row.names(RR.p) <- MAP$V2 colnames(TP.p) <- paste0("D", 1:20) colnames(RR.p) <- paste0("D", 1:20) TP.p <- melt(TP.p, id.vars = 0) RR.p <- melt(RR.p, id.vars = 0) TP.p <- TP.p[TP.p$value < 1E-4 ,] RR.p <- RR.p[RR.p$value < 1E-4 ,] TP.p <- merge(TP.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F) TP.p$V5 <- TP.p$V4 + 1 RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F) RR.p$V5 <- RR.p$V4 + 1 TP.p <- TP.p[c("V1", "V4", "V5", "Var2", "Var1")] RR.p <- RR.p[c("V1", "V4", "V5", "Var2", "Var1")] TP.p <- TP.p[order(TP.p$V1, TP.p$V4) ,] RR.p <- RR.p[order(RR.p$V1, RR.p$V4) ,] length(unique(RR.p$Var1)) #31 length(unique(TP.p$Var1)) #38 write.table(TP.p[c("V1", "V4", "V5", "Var2")], "DataAndCode/TP_GP/bedtools/TP.p.BED", sep = "\t", col.names = F, row.names = F, quote = F) write.table(RR.p[c("V1", "V4", "V5", "Var2")], "DataAndCode/RR_GP/bedtools/RR.p.BED", sep = "\t", col.names = F, row.names = F, quote = F) write.table(TP.p[c("V1", "V4", "V5", "Var1")], "DataAndCode/TP_GP/bedtools/TP.pSNPs.BED", sep = "\t", col.names = F, row.names = F, quote = F) write.table(RR.p[c("V1", "V4", "V5", "Var1")], "DataAndCode/RR_GP/bedtools/RR.pSNPs.BED", sep = "\t", col.names = F, row.names = F, quote = F) ``` ```{bash find QTLs, echo = T, eval = F} cd DataAndCode/TP_GP/bedtools/ bedtools merge -i TP.p.BED -d 200000 -nms > TP.p.QTL.txt bedtools merge -i TP.pSNPs.BED -d 200000 -nms > TP.p.QTLSNPs.txt less TP.p.QTL.txt | wc -l #15 cd DataAndCode/RR_GP/bedtools/ bedtools merge -i RR.p.BED -d 200000 -nms > RR.p.QTL.txt bedtools merge -i RR.pSNPs.BED -d 200000 -nms > RR.p.QTLSNPs.txt less RR.p.QTL.txt | wc -l #26 ``` This produces a file that lists the QTLs and has the days in which each SNP was detected in a string delimited by semicolons. This next chunk of code will take the fourth column with the string of days, split the string, and only keep non redundant entries. ```{r split column, echo = T, eval = F} RR.QTLs <- read.table("DataAndCode/RR_GP/bedtools/RR.p.QTL.txt", header = F, sep = "\t") RR.list <- list() for (i in 1:nrow(RR.QTLs)){ tmp <- as.character(RR.QTLs[i,]$V4) RR.list[[i]] <- unique(unlist(strsplit(tmp, ";"))) } #Summarise RR.QTLs$NoDays <- unlist(lapply(RR.list, function(x) length(x))) dim(RR.QTLs[RR.QTLs$NoDays == 20 ,]) #13 QTL detected on all days dim(RR.QTLs[(RR.QTLs$NoDays < 20) & (RR.QTLs$NoDays > 12) ,]) #5 long-duration (13-19 days) dim(RR.QTLs[(RR.QTLs$NoDays < 13) & (RR.QTLs$NoDays > 5) ,]) #6 mid-duration (6-12 days) dim(RR.QTLs[(RR.QTLs$NoDays < 6) ,]) #2 short-duration QTL (< 6 days) ``` Now I'll figure out which SNPs are most significant for each QTL. First find out which significant SNPs are in the QTL intervals with bedtools. ```{r find sig SNPs in QTLs, echo = T, eval = F} library(plyr) RR.QTLs <- read.table("DataAndCode/RR_GP/bedtools/RR.p.QTL.txt", header = F, sep = "\t") RR.list <- list() for (i in 1:nrow(RR.QTLs)){ tmp <- as.character(RR.QTLs[i,]$V4) RR.list[[i]] <- unique(unlist(strsplit(tmp, ";"))) } RR.QTLs$NoDays <- unlist(lapply(RR.list, function(x) length(x))) RR.QTLs.perst <- RR.QTLs[RR.QTLs$NoDays == 20 ,]; dim(RR.QTLs.perst) #QTLs detected on all days, 13 RR.QTLs.mid.long <- RR.QTLs[(RR.QTLs$NoDays > 12 & RR.QTLs$NoDays < 20) ,]; dim(RR.QTLs.mid.long) #5 RR.QTLs.mid.sh <- RR.QTLs[(RR.QTLs$NoDays > 5 & RR.QTLs$NoDays < 13) ,]; dim(RR.QTLs.mid.sh) #6 RR.QTLs.short <- RR.QTLs[RR.QTLs$NoDays < 6 ,]; dim(RR.QTLs.short) #127 #Get the signficant SNPs library(reshape2) RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) ) MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t") row.names(RR.p) <- MAP$V2 colnames(RR.p) <- paste0(1:20) RR.p2 <- RR.p RR.p <- melt(RR.p, id.vars = 0) #RR.p <- RR.p[RR.p$value < 10E-4 ,] RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F) colnames(RR.p) <- c("SNP", "Day", "P", "CHR", "BP2", "BP1") RR.p$BP2 <- RR.p$BP1 + 1 RR.p <- RR.p[c("SNP", "Day", "P", "CHR", "BP1", "BP2")] RR.p <- RR.p[order(RR.p$CHR, RR.p$BP1, RR.p$Day) ,] RR.p <- dcast(RR.p, SNP + CHR + BP1 + BP2 ~ Day, value.var = "P") #Persistent SNPs ##for each row of persistent RR.QTLs find the most significant SNPs in the interval on each day RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.perst)) for (i in 1:nrow(RR.QTLs.perst)){ tmp.QTL <- RR.QTLs.perst[i,] tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] #subset for p values RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) #find smallest p value across all time points } RR.mat <- cbind(RR.QTLs.perst[1:3], RR.mat) RR.mat$min <- apply(RR.mat[4:23], 1, min) #find the lowest p-value for each QTL RR.mat.per <- RR.mat[order(RR.mat$min) ,] #Mid-long duration SNPs RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.long)) for (i in 1:nrow(RR.QTLs.mid.long)){ tmp.QTL <- RR.QTLs.mid.long[i,] tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) } RR.mat <- cbind(RR.QTLs.mid.long[1:3], RR.mat) RR.mat$min <- apply(RR.mat[4:23], 1, min) Cnt.day.ML <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/5) plot(1:20, Cnt.day.ML, type = "l", ylim = c(0, 1)) RR.mat.ML <- RR.mat[order(RR.mat$min) ,] #Mid-short duration SNPs ##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.sh)) for (i in 1:nrow(RR.QTLs.mid.sh)){ tmp.QTL <- RR.QTLs.mid.sh[i,] tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) } RR.mat <- cbind(RR.QTLs.mid.sh[1:3], RR.mat) RR.mat$min <- apply(RR.mat[4:23], 1, min) Cnt.day.MS <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/6) plot(1:20, Cnt.day.MS, type = "l", ylim = c(0, 1)) RR.mat.MS <- RR.mat[order(RR.mat$min) ,] #Short duration SNPs ##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.short)) for (i in 1:nrow(RR.QTLs.short)){ tmp.QTL <- RR.QTLs.short[i,] tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) } RR.mat <- cbind(RR.QTLs.short[1:3], RR.mat) RR.mat$min <- apply(RR.mat[4:23], 1, min) Cnt.day.S <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/2) plot(1:20, Cnt.day.S, type = "l", ylim = c(0, 1)) RR.mat.S <- RR.mat[order(RR.mat$min) ,] ``` ## Plot frequencies for each class ```{r plot frequencies for each class, echo = T, eval = F} pdf("DataAndCode/Figures/QTL.freq.pdf", h = 1.9, w=3.2, useDingbats = F, pointsize = 10) par(xpd = T, mar=c(3,3,.1,.75)+ c(0,0,0,4), mgp=c(1.8,0.5,0)) plot(1:20, Cnt.day.ML, type = "l", ylim = c(0, 1), col = "black", ylab = "Frequency", xlab = "Days of Imaging") lines(1:20, Cnt.day.MS, col = "grey60", lty = 2) lines(1:20, Cnt.day.S, col = "red", lty = 3) legend(21, 0.75, c("Long", "Mid", "Short"), col = c("black", "grey60", "red"), lty = 1:3, bty = 'n', cex = 0.75, pt.cex = 0.65) dev.off() ``` ## Heatmap ```{r heatmap for QTL, echo = T, eval = F} P.mat.per <- round(-log(RR.mat.per[4:23], 10), 2) P.mat.ML <- round(-log(RR.mat.ML[4:23], 10), 2) P.mat.MS <- round(-log(RR.mat.MS[4:23], 10), 2) P.mat.S <- round(-log(RR.mat.S[4:23], 10), 2) P.mat.all <- rbind(P.mat.per, P.mat.ML, P.mat.MS, P.mat.S) RR.mat.all <- rbind(RR.mat.per, RR.mat.ML, RR.mat.MS, RR.mat.S) colnames(RR.mat.all) <- c("CHR", "QTLstart", "QTLend", paste0("p.value_Day", 1:20, "MIN")) RR.mat.all$MIN <- NULL write.csv(RR.mat.all, "DataandCode/Figures/RR_GWASresults_QTL.csv", row.names = F) library(RColorBrewer) library(pheatmap) HMcolors = c(colorRampPalette(c("white", "grey60"))(39), colorRampPalette(c("orangered", "orangered4"))(50) ) myBreaks = c(seq(0, 3.999999, length.out=40), seq(4, 10, length.out=50)) pheatmap(P.mat.all, color = HMcolors, border_color = NA, cellwidth = 15, cellheight = 10, fontsize = 6, scale = "none", cluster_rows = T, cluster_cols = F, legend = T, display_numbers = F, number_format = "%.2f", number_color = "black", fontsize_number = 3.5, breaks = myBreaks, filename = "DataAndCode/Figures/Fig3.pdf", width = 7, height = 5, show_rownames = T, show_colnames = T, labels_row = paste0(RR.mat.all$V1, ": ", RR.mat.all$V2, " - ", RR.mat.all$V3), labels_col = seq(1:20) ) ```