Background

Here, we’ll classify QTLs into persistent QTL (QTL detected at all 20 time points), long-duration QTL (those with significant associations at more than 12, but less than 20 time points), mid-duration (QTL with associations at 6 - 12 time points), and short-duration QTL (those with associations at fewer than 6 time points). Significant SNPs from the RR approach are selected, and those within a 200 kb window are merged to a single QTL with the most signifiacnt SNP at each time point selected to represent the QTL.

QTL identification using bedtools

To runbedtools we need to generate a .BED file for the signifcant SNPs.

library(reshape2)

TP.p <- as.matrix( read.table("DataAndCode/CompCluster/TP/pval.mat.txt", sep = "\t", header = T, row.names = 1) )
RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) )

MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t")

row.names(TP.p) <- MAP$V2
row.names(RR.p) <- MAP$V2

colnames(TP.p) <- paste0("D", 1:20)
colnames(RR.p) <- paste0("D", 1:20)

TP.p <- melt(TP.p, id.vars = 0)
RR.p <- melt(RR.p, id.vars = 0)

TP.p <- TP.p[TP.p$value < 1E-4 ,]
RR.p <- RR.p[RR.p$value < 1E-4 ,]

TP.p <- merge(TP.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
TP.p$V5 <- TP.p$V4 + 1

RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
RR.p$V5 <- RR.p$V4 + 1

TP.p <- TP.p[c("V1", "V4", "V5", "Var2", "Var1")]
RR.p <- RR.p[c("V1", "V4", "V5", "Var2", "Var1")]

TP.p <- TP.p[order(TP.p$V1, TP.p$V4) ,]
RR.p <- RR.p[order(RR.p$V1, RR.p$V4) ,]

length(unique(RR.p$Var1)) #31
length(unique(TP.p$Var1)) #38

write.table(TP.p[c("V1", "V4", "V5", "Var2")],
            "DataAndCode/TP_GP/bedtools/TP.p.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(RR.p[c("V1", "V4", "V5", "Var2")],
            "DataAndCode/RR_GP/bedtools/RR.p.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(TP.p[c("V1", "V4", "V5", "Var1")],
            "DataAndCode/TP_GP/bedtools/TP.pSNPs.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(RR.p[c("V1", "V4", "V5", "Var1")],
            "DataAndCode/RR_GP/bedtools/RR.pSNPs.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)

cd DataAndCode/TP_GP/bedtools/
bedtools merge -i TP.p.BED -d 200000 -nms > TP.p.QTL.txt
bedtools merge -i TP.pSNPs.BED -d 200000 -nms > TP.p.QTLSNPs.txt
less TP.p.QTL.txt | wc -l #15

cd DataAndCode/RR_GP/bedtools/
bedtools merge -i RR.p.BED -d 200000 -nms > RR.p.QTL.txt
bedtools merge -i RR.pSNPs.BED -d 200000 -nms > RR.p.QTLSNPs.txt
less RR.p.QTL.txt | wc -l #26

This produces a file that lists the QTLs and has the days in which each SNP was detected in a string delimited by semicolons. This next chunk of code will take the fourth column with the string of days, split the string, and only keep non redundant entries.

RR.QTLs <- read.table("DataAndCode/RR_GP/bedtools/RR.p.QTL.txt",
                      header = F, sep = "\t")

RR.list <- list()
for (i in 1:nrow(RR.QTLs)){
  tmp <- as.character(RR.QTLs[i,]$V4)
  RR.list[[i]] <- unique(unlist(strsplit(tmp, ";")))
}

#Summarise
RR.QTLs$NoDays <- unlist(lapply(RR.list, function(x) length(x)))

dim(RR.QTLs[RR.QTLs$NoDays == 20 ,]) #13 QTL detected on all days
dim(RR.QTLs[(RR.QTLs$NoDays < 20) & (RR.QTLs$NoDays > 12) ,]) #5 long-duration (13-19 days)
dim(RR.QTLs[(RR.QTLs$NoDays < 13) & (RR.QTLs$NoDays > 5) ,]) #6 mid-duration (6-12 days)
dim(RR.QTLs[(RR.QTLs$NoDays < 6) ,]) #2 short-duration QTL (< 6 days)

Now I’ll figure out which SNPs are most significant for each QTL. First find out which significant SNPs are in the QTL intervals with bedtools.

library(plyr)
RR.QTLs <- read.table("DataAndCode/RR_GP/bedtools/RR.p.QTL.txt", header = F, sep = "\t")

RR.list <- list()
for (i in 1:nrow(RR.QTLs)){
  tmp <- as.character(RR.QTLs[i,]$V4)
  RR.list[[i]] <- unique(unlist(strsplit(tmp, ";")))
}

RR.QTLs$NoDays <- unlist(lapply(RR.list, function(x) length(x)))

RR.QTLs.perst <- RR.QTLs[RR.QTLs$NoDays == 20 ,]; dim(RR.QTLs.perst) #QTLs detected on all days, 13
RR.QTLs.mid.long <- RR.QTLs[(RR.QTLs$NoDays > 12 & RR.QTLs$NoDays < 20) ,]; dim(RR.QTLs.mid.long) #5
RR.QTLs.mid.sh <- RR.QTLs[(RR.QTLs$NoDays > 5 & RR.QTLs$NoDays < 13) ,]; dim(RR.QTLs.mid.sh) #6
RR.QTLs.short <- RR.QTLs[RR.QTLs$NoDays < 6 ,]; dim(RR.QTLs.short) #127


#Get the signficant SNPs
library(reshape2)

RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) )

MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t")

row.names(RR.p) <- MAP$V2

colnames(RR.p) <- paste0(1:20)

RR.p2 <- RR.p

RR.p <- melt(RR.p, id.vars = 0)

#RR.p <- RR.p[RR.p$value < 10E-4 ,]

RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
colnames(RR.p) <- c("SNP", "Day", "P", "CHR", "BP2", "BP1")
RR.p$BP2 <- RR.p$BP1 + 1
RR.p <- RR.p[c("SNP", "Day", "P", "CHR", "BP1", "BP2")]
RR.p <- RR.p[order(RR.p$CHR, RR.p$BP1, RR.p$Day) ,]
RR.p <- dcast(RR.p, SNP + CHR + BP1 + BP2 ~ Day, value.var = "P")

#Persistent SNPs
##for each row of persistent RR.QTLs find the most significant SNPs in the interval on each day
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.perst))
for (i in 1:nrow(RR.QTLs.perst)){
  tmp.QTL <- RR.QTLs.perst[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] #subset for p values
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) #find smallest p value across all time points
}

RR.mat <- cbind(RR.QTLs.perst[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min) #find the lowest p-value for each QTL
RR.mat.per <- RR.mat[order(RR.mat$min) ,]


#Mid-long duration SNPs
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.long))
for (i in 1:nrow(RR.QTLs.mid.long)){
  tmp.QTL <- RR.QTLs.mid.long[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.mid.long[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.ML <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/5)
plot(1:20, Cnt.day.ML, type = "l", ylim = c(0, 1))

RR.mat.ML <- RR.mat[order(RR.mat$min) ,]



#Mid-short duration SNPs
##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.sh))
for (i in 1:nrow(RR.QTLs.mid.sh)){
  tmp.QTL <- RR.QTLs.mid.sh[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.mid.sh[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.MS <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/6)
plot(1:20, Cnt.day.MS, type = "l", ylim = c(0, 1))

RR.mat.MS <- RR.mat[order(RR.mat$min) ,]

            

#Short duration SNPs
##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.short))
for (i in 1:nrow(RR.QTLs.short)){
  tmp.QTL <- RR.QTLs.short[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.short[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.S <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/2)
plot(1:20, Cnt.day.S, type = "l", ylim = c(0, 1))

RR.mat.S <- RR.mat[order(RR.mat$min) ,]

Plot frequencies for each class

pdf("DataAndCode/Figures/QTL.freq.pdf", h = 1.9, w=3.2, useDingbats = F, pointsize = 10)
par(xpd = T, mar=c(3,3,.1,.75)+ c(0,0,0,4), mgp=c(1.8,0.5,0))

plot(1:20, Cnt.day.ML, type = "l", ylim = c(0, 1), col = "black", ylab = "Frequency", xlab = "Days of Imaging")
lines(1:20, Cnt.day.MS, col = "grey60", lty = 2)
lines(1:20, Cnt.day.S, col = "red", lty = 3)

legend(21, 0.75, c("Long", "Mid", "Short"), col = c("black", "grey60", "red"), lty = 1:3, bty = 'n', cex = 0.75, pt.cex = 0.65)
dev.off()

Heatmap

P.mat.per <- round(-log(RR.mat.per[4:23], 10), 2)
P.mat.ML <- round(-log(RR.mat.ML[4:23], 10), 2)
P.mat.MS <- round(-log(RR.mat.MS[4:23], 10), 2)
P.mat.S <- round(-log(RR.mat.S[4:23], 10), 2)

P.mat.all <- rbind(P.mat.per, P.mat.ML, P.mat.MS, P.mat.S)
RR.mat.all <- rbind(RR.mat.per, RR.mat.ML, RR.mat.MS, RR.mat.S)

colnames(RR.mat.all) <- c("CHR", "QTLstart", "QTLend", paste0("p.value_Day", 1:20, "MIN"))
RR.mat.all$MIN <- NULL
write.csv(RR.mat.all, "DataandCode/Figures/RR_GWASresults_QTL.csv", row.names = F)

library(RColorBrewer)
library(pheatmap)
HMcolors = c(colorRampPalette(c("white", "grey60"))(39), colorRampPalette(c("orangered", "orangered4"))(50) )
myBreaks = c(seq(0, 3.999999, length.out=40), 
              seq(4, 10, length.out=50))

pheatmap(P.mat.all,
         color = HMcolors,
         border_color = NA,
         cellwidth = 15,
         cellheight = 10,
         fontsize = 6,
         scale = "none",
         cluster_rows = T,
         cluster_cols = F,
         legend = T,
         display_numbers = F,
         number_format = "%.2f",
         number_color = "black",
         fontsize_number = 3.5,
         breaks = myBreaks,
         filename = "DataAndCode/Figures/Fig3.pdf",
         width = 7,
         height = 5,
         show_rownames = T,
         show_colnames = T,
         labels_row = paste0(RR.mat.all$V1, ": ", RR.mat.all$V2, " - ", RR.mat.all$V3),
         labels_col = seq(1:20)
)

Classification of QTL from RR approach

Malachy Campbell

8/7/2018

Background

QTL identification using bedtools

Plot frequencies for each class

Heatmap