Background

Here, we’ll classify QTLs into persistent QTL (QTL detected at all 20 time points), long-duration QTL (those with significant associations at more than 12, but less than 20 time points), mid-duration (QTL with associations at 6 - 12 time points), and short-duration QTL (those with associations at fewer than 6 time points). Significant SNPs from the RR approach are selected, and those within a 200 kb window are merged to a single QTL with the most signifiacnt SNP at each time point selected to represent the QTL.

QTL identification using bedtools

To runbedtools we need to generate a .BED file for the signifcant SNPs.

library(reshape2)

TP.p <- as.matrix( read.table("DataAndCode/CompCluster/TP/pval.mat.txt", sep = "\t", header = T, row.names = 1) )
RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) )

MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t")

row.names(TP.p) <- MAP$V2
row.names(RR.p) <- MAP$V2

colnames(TP.p) <- paste0("D", 1:20)
colnames(RR.p) <- paste0("D", 1:20)

TP.p <- melt(TP.p, id.vars = 0)
RR.p <- melt(RR.p, id.vars = 0)

TP.p <- TP.p[TP.p$value < 1E-4 ,]
RR.p <- RR.p[RR.p$value < 1E-4 ,]

TP.p <- merge(TP.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
TP.p$V5 <- TP.p$V4 + 1

RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
RR.p$V5 <- RR.p$V4 + 1

TP.p <- TP.p[c("V1", "V4", "V5", "Var2", "Var1")]
RR.p <- RR.p[c("V1", "V4", "V5", "Var2", "Var1")]

TP.p <- TP.p[order(TP.p$V1, TP.p$V4) ,]
RR.p <- RR.p[order(RR.p$V1, RR.p$V4) ,]

length(unique(RR.p$Var1)) #31
length(unique(TP.p$Var1)) #38

write.table(TP.p[c("V1", "V4", "V5", "Var2")],
            "DataAndCode/TP_GP/bedtools/TP.p.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(RR.p[c("V1", "V4", "V5", "Var2")],
            "DataAndCode/RR_GP/bedtools/RR.p.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(TP.p[c("V1", "V4", "V5", "Var1")],
            "DataAndCode/TP_GP/bedtools/TP.pSNPs.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)
write.table(RR.p[c("V1", "V4", "V5", "Var1")],
            "DataAndCode/RR_GP/bedtools/RR.pSNPs.BED",
            sep = "\t", col.names = F, row.names = F, quote = F)

This produces a file that lists the QTLs and has the days in which each SNP was detected in a string delimited by semicolons. This next chunk of code will take the fourth column with the string of days, split the string, and only keep non redundant entries.

Now I’ll figure out which SNPs are most significant for each QTL. First find out which significant SNPs are in the QTL intervals with bedtools.

library(plyr)
RR.QTLs <- read.table("DataAndCode/RR_GP/bedtools/RR.p.QTL.txt", header = F, sep = "\t")

RR.list <- list()
for (i in 1:nrow(RR.QTLs)){
  tmp <- as.character(RR.QTLs[i,]$V4)
  RR.list[[i]] <- unique(unlist(strsplit(tmp, ";")))
}

RR.QTLs$NoDays <- unlist(lapply(RR.list, function(x) length(x)))

RR.QTLs.perst <- RR.QTLs[RR.QTLs$NoDays == 20 ,]; dim(RR.QTLs.perst) #QTLs detected on all days, 13
RR.QTLs.mid.long <- RR.QTLs[(RR.QTLs$NoDays > 12 & RR.QTLs$NoDays < 20) ,]; dim(RR.QTLs.mid.long) #5
RR.QTLs.mid.sh <- RR.QTLs[(RR.QTLs$NoDays > 5 & RR.QTLs$NoDays < 13) ,]; dim(RR.QTLs.mid.sh) #6
RR.QTLs.short <- RR.QTLs[RR.QTLs$NoDays < 6 ,]; dim(RR.QTLs.short) #127


#Get the signficant SNPs
library(reshape2)

RR.p <- as.matrix( read.table("DataAndCode/CompCluster/RR/pval.mat.txt", sep = "\t", header = T, row.names = 1) )

MAP <- read.table("DataAndCode/CompCluster/MAP_maf0.05.txt", header = F, sep = "\t")

row.names(RR.p) <- MAP$V2

colnames(RR.p) <- paste0(1:20)

RR.p2 <- RR.p

RR.p <- melt(RR.p, id.vars = 0)

#RR.p <- RR.p[RR.p$value < 10E-4 ,]

RR.p <- merge(RR.p, MAP, by.x = "Var1", by.y = "V2", all.x = T, all.y = F)
colnames(RR.p) <- c("SNP", "Day", "P", "CHR", "BP2", "BP1")
RR.p$BP2 <- RR.p$BP1 + 1
RR.p <- RR.p[c("SNP", "Day", "P", "CHR", "BP1", "BP2")]
RR.p <- RR.p[order(RR.p$CHR, RR.p$BP1, RR.p$Day) ,]
RR.p <- dcast(RR.p, SNP + CHR + BP1 + BP2 ~ Day, value.var = "P")

#Persistent SNPs
##for each row of persistent RR.QTLs find the most significant SNPs in the interval on each day
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.perst))
for (i in 1:nrow(RR.QTLs.perst)){
  tmp.QTL <- RR.QTLs.perst[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,] #subset for p values
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T) #find smallest p value across all time points
}

RR.mat <- cbind(RR.QTLs.perst[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min) #find the lowest p-value for each QTL
RR.mat.per <- RR.mat[order(RR.mat$min) ,]


#Mid-long duration SNPs
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.long))
for (i in 1:nrow(RR.QTLs.mid.long)){
  tmp.QTL <- RR.QTLs.mid.long[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.mid.long[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.ML <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/5)
plot(1:20, Cnt.day.ML, type = "l", ylim = c(0, 1))

RR.mat.ML <- RR.mat[order(RR.mat$min) ,]



#Mid-short duration SNPs
##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.mid.sh))
for (i in 1:nrow(RR.QTLs.mid.sh)){
  tmp.QTL <- RR.QTLs.mid.sh[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.mid.sh[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.MS <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/6)
plot(1:20, Cnt.day.MS, type = "l", ylim = c(0, 1))

RR.mat.MS <- RR.mat[order(RR.mat$min) ,]

            

#Short duration SNPs
##for each row of mid short -duration RR.QTLs find the significant SNPs in the interval
RR.mat <- matrix(0, ncol = 20, nrow = nrow(RR.QTLs.short))
for (i in 1:nrow(RR.QTLs.short)){
  tmp.QTL <- RR.QTLs.short[i,]
  tmp.P <- RR.p[(RR.p$CHR %in% tmp.QTL$V1 & RR.p$BP1 <= tmp.QTL$V3 & RR.p$BP1 >= tmp.QTL$V2) ,]
  RR.mat[i,] <- apply(tmp.P[5:24], 2, min, rm.na = T)
}

RR.mat <- cbind(RR.QTLs.short[1:3], RR.mat)
RR.mat$min <- apply(RR.mat[4:23], 1, min)

Cnt.day.S <- apply(RR.mat[4:23], 2, function(x) sum(x < 1e-04)/2)
plot(1:20, Cnt.day.S, type = "l", ylim = c(0, 1))

RR.mat.S <- RR.mat[order(RR.mat$min) ,]