Background

Here we will prep the genotypic data used for all analyses. The 44K SNP data Zhao et al (2011) was obtained from ricediversity.org.

library(BGLR)

Impute missing data

This code loads to the 44k SNP data, and imputes the missing markers. The missing markers were replaced with the mean values.

setwd("~/Desktop/")

FAM <- read.table("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.fam")[1:2]
MAP <- read.table("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.map")

PED <- read_ped("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.ped")
m <- PED$p
n <- PED$n
PED <- PED$x

##SNPs in PED are coded as 0, 1, 2, 3. 2 is missing data. 1 are heterozygous, 0 and 3 are homozygous for 1/1 and 2/2 for major allele and minor allele respectively
PED[PED == 2] <- NA 
PED[PED == 0] <- 0
PED[PED == 1] <- 1
PED[PED == 3] <- 2


W <- t(matrix(PED, nrow=m, ncol=n, byrow = T))
colnames(W) <- MAP$V2
rownames(W) <- paste0("NSFTV_", FAM$V2)

#Imputation
for (j in 1:ncol(W)) {
  W[, j] = ifelse(is.na(W[, j]), mean(W[, j], na.rm = TRUE), W[, j])
}

Remove markers with low MAF

Markers with a MAF less than 0.05 were removed. This reoved 3,211 markers, leaving a total of 33,690 markers for computing the genomic relationship matrix (GRM).

#load phenotypic data for 378 accessions
PSA.df <- read.csv("RR/RR/Phenotypes_cleaned/PSA.cleaned.csv")

W.orig <- W

W <- W.orig[row.names(W.orig) %in% PSA.df$NSFTV.ID ,]
freq <- colMeans(W) / 2
maf <- ifelse(freq > 0.5, 1-freq, freq)
maf.index <- which(maf < 0.05)
length(maf.index)

## [1] 3211

W <- W[, -maf.index]

Compute GRM using VanRaden’s method

A genomic relationship matrix (\(\mathbf{G}\)) was calculated using VanRaden (2008). \[\mathbf{G} = \frac{\mathbf{Z_{cs} Z_{cs}}' }{m}\] Here, \(\mathbf{Z_{cs}}\) is a centered and scaled \(n \times m\) matrix, where \(m\) is 33,674 SNPs and \(n\) is the 357 genotyped rice lines.

##NOTE that in the standalone of asreml the inverse of G is
##done after loading. So DO NOT take the inverse of G here!!!
Zsc <- scale(x=W,center=T,scale=T)
G <- tcrossprod(Zsc)/ncol(W)
G <- G + diag(nrow(W))*0.001
G <- G[match(unique(PSA.df$NSFTV.ID), row.names(G) ) ,]
G <- G[, match(unique(PSA.df$NSFTV.ID), colnames(G) )]

G.final <- as.data.frame(which(row(G) >= col(G),arr.ind=TRUE))
G.final$G <- G[lower.tri(G, diag=T)]
G.final <- G.final[,c(2,1,3)]
G.final <- G.final[order(G.final[,2], G.final[,1]),]
G.final <- G.final[,c(2,1,3)]
colnames(G.final)[1:2]=c("Row", "Column")
attr(G.final, "rowNames") <- row.names(G)

write.table(G.final, "RR/RR/ModelSelection/G2.grm", col.names=F, row.names=F, quote=F, sep="\t")

Genotypic data

Malachy Campbell

8/7/2017

Background

Impute missing data

Remove markers with low MAF

Compute GRM using VanRaden’s method