Background

In scenario A, PSA for 357 lines at all twenty time points were used to fit a RR model and phenotypes were predicted for a set of lines without phenotypic records. This can be thought of as a standard genomic selection approach, where the training set is a set of existing lines with phenotypic records and the test population as a new set of lines without records. Here the aim is to determine whether the longitudinal RR approach provides greater prediction accuracy than a cross-sectional GS approach in which a mixed model is fit at each time point.

To assess the accuracy of gBLUPs for the TP GS as well as scenarios A, C, and D, a two-fold cross validation approach was used. Briefly, the 357 lines were split into two sets, with one serving as a training set with known phenotypes and the second serving as a testing set with unknown phenotypes. Since the number of lines were not even the remaining line was assigned to the training set. The accuracy of prediction was assessed by comparing predicted gBLUPs with observed PSA at each of the three experiments using Pearson’s correlation method. The lines were randomly assigned to each fold, and the process was repeated 20 times. For each fold, the average correlation over the three experiments was used.

Generate the dataset for cross validation

This clunky chunk of code is to prepare a dataset for cross validation. Here, half of the accessions are randomly selected, and the obervations for these accessions are masked. N controls the number of resamplings that are done. For each resampling run the columns of masked observations are added to the original dataframe and named Y1, Y2… This data will be used for CV for thr RR approach as well as the TP approach, However for TP we will convert the dataset to a wide format, and write a seperate file for each resampling run.

PSA.df <- read.csv("~/Desktop/RR/RR/FinalFiles/PSA.cleaned.csv")

##Get the accession names
Acc <- unique(PSA.df$NSFTV.ID)

#Set the number of resampling runs
N <- 10
for (i in 1:N){
  
  set.seed(i)
  ########################################
  #1. Split the accessions into two folds#
  ########################################
  
  ##Create a new dataframe to mask observations
  new.ph <- PSA.df
  
  ##Assign accessions to folds
  setlabels <- sample(1:2, size=length(Acc), replace = T)
  names(setlabels) <- Acc
  setlabels <- as.matrix(setlabels, ncol = 1, nrow = length(Acc))
  
  ##Merge the original dataframe and the column with fold IDs
  new.ph <- merge(new.ph, setlabels, by.x = "NSFTV.ID", by.y = 0, all = T)
  colnames(new.ph)[ncol(new.ph)] <- "Fold"
  
  ##Make a fold x n matrix that has all the origninal observations 
  new.y <- as.matrix(new.ph[,5, drop = F]) %*% t(rep(1, 2))
  colnames(new.y) <- paste0("Y", 1:2)
  
  ##merge the fold x n matrix and the phenotypic data
  new.ph1 <- cbind(new.ph, new.y)[,ncol(new.ph):(ncol(new.ph)+2)]
  
  #######################################
  #2. Mask the observations in each fold#
  #######################################
  ##This function will assign NA to all obervations from the accessions belonging to a given fold
  missingInFold <- function(r){
    fold = r["Fold"]
    index = fold + 1
    r[index] = NA
    return(r)
  }
  
  ##Assign the NA's
  new.ph2 <- t(apply(new.ph1, 1, missingInFold))
  
  ##Merge the masked dataset with the original dataframe
  if(i == 1){
    final <- cbind(new.ph[,1:(ncol(new.ph)-1)], new.ph2[,2:3])
  }else{
    final <- cbind(final, new.ph2[,2:3])
  }
}

CV file for RR approach

Here, the RR CV file is ordered by NSFTV.ID, Experiment, Rep, and DayOfImaging, and wrtie it to a file.

###############################################
#3. Order the dataframe and write it to a file#
###############################################

final <- final[order(final$NSFTV.ID, final$Exp, final$Rep, final$DayOfImaging) ,]
colnames(final)[6:ncol(final)] <- paste0("Y", 1:(N*2))
write.csv(final, "~/Desktop/RR/ScenarioA/RR/RR_CV.csv", row.names = F)

CV file for TP approach

Here, the same file for RR CV is loaded, and converted to a wide format where each column is a day of imaging. Finally, a loop is used to split the file based on sampling run and a file is writeen for each.

library(reshape2)

final <- read.csv("~/Desktop/RR/ScenarioA/RR/RR_CV.csv")

#Convert it to wide format
final <- melt(final, id.vars=c("NSFTV.ID", "Rep", "Exp", "DayOfImaging"))
final <- dcast(final, NSFTV.ID + Rep + Exp + variable ~ DayOfImaging)
colnames(final)[4] = "Run"

final <- final[order(final$NSFTV.ID, final$Exp, final$Rep) ,]

colnames(final)[5:ncol(final)] <- paste0("D", 1:20)

#Split up the dataset and write a seperate file for each run
Sets <- unique(final$Run)
for (i in 1:length(Sets)){
  tmp <- final[final$Run == Sets[i] ,]
  write.csv(tmp, paste0("~/Desktop/RR/ScenarioA/TP/", Sets[i], ".csv"), row.names=F)
}

write.csv(final, "~/Desktop/RR/ScenarioA/TP/TP_CV.csv", row.names=F)

Running asreml for CV

RR approach

Since the RR model can take quite some time to run, CV was done on our computing cluster at UNL. Four jobs were created, where the RR model was fit for five resampling runs. Below is an example .as file, as well as the slurm script used to submit the job.

!RENAME !ARG Y1 Y2 Y3 Y4 Y5
RR scenario A
 NID !A
 PSA !/100000
 DayOfImaging 20 !I
 Rep 2
 Exp !A
 Y1 !/100000
 Y2 !/100000
 Y3 !/100000
 Y4 !/100000
 Y5 !/100000
 Y6 !/100000
 Y7 !/100000
 Y8 !/100000
 Y9 !/100000
 Y10 !/100000
 Y11 !/100000
 Y12 !/100000
 Y13 !/100000
 Y14 !/100000
 Y15 !/100000
 Y16 !/100000
 Y17 !/100000
 Y18 !/100000
 Y19 !/100000
 Y20 !/100000
G2.grm
RR_CV.csv !SKIP 1 !MAXITER 1000 !EXTRA 100 !WORKSPACE 6144 !ASUV

!ASSIGN USe !< !INIT
1.37E-03 1.22E-03 2.07E-03 3.32E-03 5.32E-03 7.88E-03 1.18E-02 1.53E-02 2.04E-02 2.76E-02 4.03E-02 5.80E-02 8.19E-02 0.114627 0.164452 0.224128 0.336555 0.522506 0.790597 1.17392
!>
!ASSIGN USg !< !INIT
0.629183
0.495574 0.394557
0.129607 0.104540 0.281278E-01
!>
!ASSIGN USp !< !INIT
0.594161E-01
0.392295E-01 0.260640E-01
!>
$1 ~ mu leg(DayOfImaging,2) !r us(leg(DayOfImaging,1) $USp).Exp us(leg(DayOfImaging,2) $USg).grm(NID) !f mv
residual id(2142).idh(DayOfImaging $USe)
#!/bin/bash
#SBATCH --time=24:00:00
#SBATCH --mem=4gb
#SBATCH --ntasks-per-node=1
#SBATCH --job-name=ASREML_Y1

errlog="logs/log_${SLURM_JOB_ID}.err"
outlog="logs/log_${SLURM_JOB_ID}.out"

module load asreml

asreml CV1.as

TP approach

For each of the 20 resampling runs a .as file was created. Below is an example for one run.

TP CV
 ID !A
 Rep 2
 Exp !A
 Run !A
 D1 !/100000
 D2 !/100000
 D3 !/100000
 D4 !/100000
 D5 !/100000
 D6 !/100000
 D7 !/100000
 D8 !/100000
 D9 !/100000
 D10 !/100000
 D11 !/100000
 D12 !/100000
 D13 !/100000
 D14 !/100000
 D15 !/100000
 D16 !/100000
 D17 !/100000
 D18 !/100000
 D19 !/100000
 D20 !/100000
G2.grm
Y1.csv !SKIP 1 !MAXITER 200 !WORKSPACE 6144

!CYCLE D1 D2 D3 D4 D5 D6 D7 D8 D9 D10,
D11 D12 D13 D14 D15 D16 D17 D18 D19 D20

$I ~ mu !r grm(ID) Exp !f mv

This script will run CV for each of the 20 .as files.

cd ~/Desktop/RR/ScenarioA/TP/

asreml *.as

Cross validation results

RR Approach

For the CV of the RR model, 20 files were written for each resampling run. Each file consists of the BV for the legendre polynomials. Recall for the random regressiuon model a second order polynomal was used to model the additive genetic component, thus there will be three BVs for each accession. For a given line, \(j\), at time \(t\) the gBLUPs can be obtained by \(\text{gBLUP}_{jt} = \phi_t\hat{u}_j\); where \(\phi_t\) is the row vector of the matrix of Legendre polynomials of order 2.

The accuracy of prediction was assessed by comparing predicted gBLUPs with observed PSA at each of the three experiments using Pearson’s correlation method. For each fold, the average correlation over the three experiments was used.

The functions below were adapted from Mrode (2005) by Gota Morota.

##Return coefficient matrix (lambda) of n-th order Legendre polynomials. Scaling method implemented by Gengler et. al. (1999) converts constant Legendre polynomial coefficients into 1
`legendre` <-
  function(n, gengler){
    
    if (nargs()==1){
      gengler <- TRUE   
    }
    
    if (gengler != TRUE & gengler != FALSE){
      gengler=TRUE  
    }
    
    N <- n+1
    L <- matrix(0,nrow=N, ncol=N)
    
    for(i in (1:N)){
      if(i==1){
        L[i,i] <- 1
      }
      else if(i==2){
        L[i,i] <- 1
      }
      else  {
        tmp <- L[i-1,]
        tmp2 <- as.numeric()
        tmp2 <- c(0,tmp[1:(N-1)])
        L[i,] <- (1/(i-2+1))*( (2*(i-2) + 1)*tmp2 -(i-2)*L[i-2,] )
      }
    }
    
    # Normalize
    for (j in (1:N)){   
      L[j,] <- (sqrt( (2*(j-1)+1)/2)  )*L[j,]
    }
    
    
    # Gengler (1999)
    if (gengler==TRUE){
      L <- sqrt(2)*L
    }
    
    return(L)
    
  }

##Given time points covariate and order of fit for Legendre polynomials, return matrix 'M' containing the polynomials of standardized time. 'M' is order t (number of time points) by k (order of Legendre polynomials)
stdtime <- function(t, n, tmax, tmin){
  if(missing(tmax)) {
    tmax <- t[which.max(t)]
  }
  if(missing(tmin)) {
    tmin <- t[which.min(t)]
  }
  
  N <- n+1
  M <- matrix(0, nrow=length(t), ncol=N)
  a <- -1 + 2*(t-tmin)/(tmax - tmin)
  M[,1] <- 1
  
  for (i in 2:N){
    M[,i] <- a^(i-1)
  }
  
  return(M)
}
#This code will calucalte the gBLUPs at each time point from the asreml .sln files.
library(plyr)
library(reshape2)

setwd("~/Desktop/RR/ScenarioA/RR/")

final <- read.csv("~/Desktop/RR/ScenarioA/RR/RR_CV.csv")

PSA <- final[1:5]
PSA <- ddply(PSA, .(NSFTV.ID, Exp, DayOfImaging), summarise, PSA=mean(PSA, na.rm = T))
PSA <- dcast(PSA, NSFTV.ID + DayOfImaging ~ Exp)
## Using PSA as value column: use value.var to override.
PSA <- PSA[order(PSA$DayOfImaging, PSA$NSFTV.ID) ,]

#Create an object that lists the file names for all CV runs. Recall that the CV was split up into four batches, each with 5 jobs.
Files <- c(paste0("CV1Y", 1:5, ".sln"),
           paste0("CV2Y", 6:10, ".sln"),
           paste0("CV3Y", 11:15, ".sln"),
           paste0("CV4Y", 16:20, ".sln"))

for (i in 1:length(Files)){
  sln <- read.delim(Files[i], sep="", header=T)
  
  #gBLUPs for legendre polynomials
  g.hat.y <- t(cbind(sln[grep("1.NSFTV", sln$Level) ,][,3],
                     sln[grep("2.NSFTV", sln$Level) ,][,3],
                     sln[grep("3.NSFTV", sln$Level) ,][,3]))
  
  colnames(g.hat.y) <- sub("1.", "", sln[grep("1.NSFTV", sln$Level) ,][,2])
  
  #Calculated gBLUPs at each time point
  Phi <- stdtime(1:20, 2) %*% t(legendre(2, gengler = F))
  ghat.t.y <- t(apply(g.hat.y, 2, function (x) Phi %*% x))
  colnames(ghat.t.y) <- 1:20
  gBLUP <- melt(ghat.t.y)
  colnames(gBLUP) <- c("NSFTV.ID", "DayOfImaging", "gBLUP")
  
  if(i == 1 ){
    final.blups <- gBLUP
  }else{
    final.blups <- cbind(final.blups, gBLUP[,3])
  }
}

#This file contains all the gBLUP values for ach accession, run and time point
colnames(final.blups)[3:ncol(final.blups)] <- sub(".sln", "", sub("CV[:1-4:]", "", Files) )
library(plyr)

#Now we do the correlation and get the actual CV results
Cor.res <- matrix(0, ncol=20, nrow=20)

for(j in 1:length(Files)){
  tmp <- final[c("NSFTV.ID", colnames(final.blups)[j+2])]
  colnames(tmp)[2] <- "Y"
  tmp <- ddply(tmp, .(NSFTV.ID), summarise, Cnt=sum(is.na(Y)))
  
  #Find which accessions are in the test set
  test.acc <- tmp[ tmp$Cnt == 120 ,]$NSFTV.ID
  
  #Merge the gBLUPs and observed PSA datasets
  tmp.df <- merge(PSA, final.blups, by=c("DayOfImaging", "NSFTV.ID"))
  
  #Only keep the data for individuals in the testing set
  tmp.df <- tmp.df[tmp.df$NSFTV.ID %in% test.acc ,]
  tmp.df <- cbind(tmp.df[1:5], tmp.df[, (j + 5)])
  tmp.df <- tmp.df[order(tmp.df$DayOfImaging, tmp.df$NSFTV.ID) ,]
  colnames(tmp.df)[6] <- "gBLUP"
  
  #Do the correlations
  res <- rbind(ldply(dlply(tmp.df, .(DayOfImaging), function(x) cor(x$E1, x$gBLUP, use="complete.obs") ) )[,2],
               ldply(dlply(tmp.df, .(DayOfImaging), function(x) cor(x$E2, x$gBLUP, use="complete.obs") ) )[,2],
               ldply(dlply(tmp.df, .(DayOfImaging), function(x) cor(x$E3, x$gBLUP, use="complete.obs") ) )[,2] )
  
  #Store the results
  Cor.res[,j] <- colMeans(res)
}

TP approach

For the CV at each individual time point, 20 files were written for each resampling and one for PSA. Each file has the day of imaging in seperate columns. Asreml loops over each day and writes the random coefficients to a .sln file. Each section in the .sln file is a time point. Therefore, each resampling run will need to be read into R and compared to the predicted PSA from the full, unmasked dataset.

library(plyr)
library(reshape2)

#Read in the CV file
final <- read.csv("~/Desktop/RR/ScenarioA/TP/TP_CV.csv")

PSA <- final[1:5]
PSA <- ddply(PSA, .(NSFTV.ID, Exp, DayOfImaging), summarise, PSA=mean(PSA, na.rm = T))
PSA <- dcast(PSA, NSFTV.ID + DayOfImaging ~ Exp)
## Using PSA as value column: use value.var to override.
PSA <- PSA[order(PSA$DayOfImaging, PSA$NSFTV.ID) ,]


#Split up fo each experiment
E1 <- dcast(PSA[c("NSFTV.ID", "DayOfImaging", "E1")], NSFTV.ID ~ DayOfImaging)
## Using E1 as value column: use value.var to override.
row.names(E1) <- E1$NSFTV.ID
E1$NSFTV.ID <- NULL

E2 <- dcast(PSA[c("NSFTV.ID", "DayOfImaging", "E2")], NSFTV.ID ~ DayOfImaging)
## Using E2 as value column: use value.var to override.
row.names(E2) <- E2$NSFTV.ID
E2$NSFTV.ID <- NULL

E3 <- dcast(PSA[c("NSFTV.ID", "DayOfImaging", "E3")], NSFTV.ID ~ DayOfImaging)
## Using E3 as value column: use value.var to override.
row.names(E3) <- E3$NSFTV.ID
E3$NSFTV.ID <- NULL


#Cross validation
setwd("~/Desktop/RR/ScenarioA/TP/")
files <- list.files(pattern=".sln")
#Matrix to store results
Acc.mat <- matrix(0, ncol=20, nrow=20)

for (i in 1:length(files)){
  #Each sln file will have the all the gBLUPs for all accessions at each time point. Each time point will be in a seperate section. The 'Chunks' part of the code finds where these sections end.
  sln <- read.delim(files[i], sep="", header=T)
  Chunks <- which(sln$Model_Term == "Model_Term")
  
  #Matrix to store gBLUPs
  Y.mat <- matrix(0, ncol=20, nrow=357)
  
  tmp.final <- data.frame(NSFTV.ID = final$NSFTV.ID,
                          Run = final[, colnames(final) %in% sub(".sln", "", files[i])] )
  #Find out which accessions are part of the testing set
  tmp.final <- ddply(tmp.final, .(NSFTV.ID), summarise, Cnt=sum(is.na(Run)) )
  test.acc <- tmp.final[ tmp.final$Cnt == 120 ,]$NSFTV.ID
  
  #Get the gBLUPs for each day and store them in Y.mat
  for (j in 1:20){
    if(j == 1){
      tmp.sln <- sln[1:(Chunks[j]-1) ,]
      Y.mat[,j] <- matrix(as.numeric(tmp.sln[grep("NSFTV", tmp.sln$Level) ,][,3]), ncol=1, nrow=357)
      row.names(Y.mat) <- tmp.sln[grep("NSFTV", tmp.sln$Level) ,][,2]
    }else{
      if(j < 20){
        tmp.sln <- sln[Chunks[j-1]:(Chunks[j]-1) ,]
        Y.mat[,j] <- matrix(as.numeric(tmp.sln[grep("NSFTV", tmp.sln$Level) ,][,3]), ncol=1, nrow=357)
      }else{
        tmp.sln <- sln[Chunks[j-1]:nrow(sln) ,]
        Y.mat[,j] <- matrix(as.numeric(tmp.sln[grep("NSFTV", tmp.sln$Level) ,][,3]), ncol=1, nrow=357)
      }
    }
  }
  
  #Get the correlation between predicted and actual PSA at each experiment
  res <- rbind(diag(cor(Y.mat[row.names(Y.mat) %in% test.acc ,], E1[row.names(E1) %in% test.acc ,], "complete")),
            diag(cor(Y.mat[row.names(Y.mat) %in% test.acc ,], E2[row.names(E2) %in% test.acc ,], "complete")),
            diag(cor(Y.mat[row.names(Y.mat) %in% test.acc ,], E3[row.names(E3) %in% test.acc ,], "complete")))
  
  #Store the average across experiments in Acc.Mat
  Acc.mat[,i] <- colMeans(res)
}

Plot the results

Here is the code use to generate Figure 5B.

Acc.mat.mean <- apply(Acc.mat, 1, mean)
Acc.mat.sd <- apply(Acc.mat, 1, sd)

Cor.res.mean <- apply(Cor.res, 1, mean)
Cor.res.sd <- apply(Cor.res, 1, sd)

(mean(Cor.res.mean) - mean(Acc.mat.mean))/mean(Acc.mat.mean)
## [1] 0.1159557
(Cor.res.mean - Acc.mat.mean)/Acc.mat.mean
##  [1]  0.123023258  0.154199430  0.133677392  0.208408715  0.245487156
##  [6]  0.234236440  0.304148319  0.348399359  0.321535204  0.244689056
## [11]  0.184622848  0.136258780  0.084625530  0.053958553  0.011443079
## [16] -0.008765618 -0.015445016 -0.027167371 -0.035265222 -0.041280697
#pdf("~/Desktop/RR/RR/Figures/ScenarioA.pdf", h=2.1, w=3.5, useDingbats = F, pointsize = 10)
#par(mar=c(3,3,1,.2), mgp=c(1.8,0.5,0))
plot(1:20, Cor.res.mean, ylim=c(0,1), cex=0.3, pch=19, xlab="Day of Imaging", ylab=expression(italic("r")))
lines(1:20, Cor.res.mean)

segments(1:20, Cor.res.mean - Cor.res.sd, 1:20, Cor.res.mean + Cor.res.sd, lwd=1)
segments(1:20 - 0.1, Cor.res.mean - Cor.res.sd, 1:20 + 0.1, Cor.res.mean - Cor.res.sd, lwd=1)
segments(1:20 - 0.1, Cor.res.mean + Cor.res.sd, 1:20 + 0.1, Cor.res.mean + Cor.res.sd, lwd=1)

points(1:20+0.1, Acc.mat.mean, pch=21, cex=0.3, col="grey")
lines(1:20+0.1, Acc.mat.mean, col="grey", lty=2)

segments(1:20+0.1, Acc.mat.mean - Acc.mat.sd, 1:20+0.1, Acc.mat.mean + Acc.mat.sd, lwd=1, col="grey")
segments(1:20+0.1 - 0.1, Acc.mat.mean - Acc.mat.sd, 1:20+0.1 + 0.1, Acc.mat.mean - Acc.mat.sd, lwd=1, col="grey")
segments(1:20+0.1 - 0.1, Acc.mat.mean + Acc.mat.sd, 1:20+0.1 + 0.1, Acc.mat.mean + Acc.mat.sd, lwd=1, col="grey")

legend("bottomright", c("TP", "RR"), col=c("grey", "black"), pch=c(21, 19), lty=c(2, 1), cex=0.6, pt.cex = 0.3)