--- title: "Genotypic data" author: "Malachy Campbell" date: "8/7/2017" output: rmdformats::html_clean: fig_width: 6 fig_height: 6 highlight: kate thumbnails: true lightbox: true gallery: true --- #Background Here we will prep the genotypic data used for all analyses. The 44K SNP data [Zhao et al (2011)](https://www.nature.com/articles/ncomms1467) was obtained from [ricediversity.org](http://ricediversity.org/data/sets/44kgwas/RiceDiversity.44K.MSU6.Genotypes_PLINK.zip). ```{r setup, include=FALSE} knitr::opts_knit$set(root.dir = '~/Desktop/') ``` ```{r load packages, echo=T} library(BGLR) ``` #Impute missing data This code loads to the 44k SNP data, and imputes the missing markers. The missing markers were replaced with the mean values. ```{r impute missing data} setwd("~/Desktop/") FAM <- read.table("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.fam")[1:2] MAP <- read.table("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.map") PED <- read_ped("RR/RR/RiceDiversity_44K_Genotypes_PLINK/sativas413.ped") m <- PED$p n <- PED$n PED <- PED$x ##SNPs in PED are coded as 0, 1, 2, 3. 2 is missing data. 1 are heterozygous, 0 and 3 are homozygous for 1/1 and 2/2 for major allele and minor allele respectively PED[PED == 2] <- NA PED[PED == 0] <- 0 PED[PED == 1] <- 1 PED[PED == 3] <- 2 W <- t(matrix(PED, nrow=m, ncol=n, byrow = T)) colnames(W) <- MAP$V2 rownames(W) <- paste0("NSFTV_", FAM$V2) #Imputation for (j in 1:ncol(W)) { W[, j] = ifelse(is.na(W[, j]), mean(W[, j], na.rm = TRUE), W[, j]) } ``` #Remove markers with low MAF Markers with a MAF less than 0.05 were removed. This reoved 3,211 markers, leaving a total of 33,690 markers for computing the genomic relationship matrix (GRM). ```{r remove markers with low MAF} #load phenotypic data for 378 accessions PSA.df <- read.csv("RR/RR/Phenotypes_cleaned/PSA.cleaned.csv") W.orig <- W W <- W.orig[row.names(W.orig) %in% PSA.df$NSFTV.ID ,] freq <- colMeans(W) / 2 maf <- ifelse(freq > 0.5, 1-freq, freq) maf.index <- which(maf < 0.05) length(maf.index) W <- W[, -maf.index] ``` #Compute GRM using VanRaden's method A genomic relationship matrix ($\mathbf{G}$) was calculated using [VanRaden (2008)](https://www.ncbi.nlm.nih.gov/pubmed/18946147). $$\mathbf{G} = \frac{\mathbf{Z_{cs} Z_{cs}}' }{m}$$ Here, $\mathbf{Z_{cs}}$ is a centered and scaled $n \times m$ matrix, where $m$ is 33,674 SNPs and $n$ is the 357 genotyped rice lines. ```{r GRM, eval = F} ##NOTE that in the standalone of asreml the inverse of G is ##done after loading. So DO NOT take the inverse of G here!!! Zsc <- scale(x=W,center=T,scale=T) G <- tcrossprod(Zsc)/ncol(W) G <- G + diag(nrow(W))*0.001 G <- G[match(unique(PSA.df$NSFTV.ID), row.names(G) ) ,] G <- G[, match(unique(PSA.df$NSFTV.ID), colnames(G) )] G.final <- as.data.frame(which(row(G) >= col(G),arr.ind=TRUE)) G.final$G <- G[lower.tri(G, diag=T)] G.final <- G.final[,c(2,1,3)] G.final <- G.final[order(G.final[,2], G.final[,1]),] G.final <- G.final[,c(2,1,3)] colnames(G.final)[1:2]=c("Row", "Column") attr(G.final, "rowNames") <- row.names(G) write.table(G.final, "RR/RR/ModelSelection/G2.grm", col.names=F, row.names=F, quote=F, sep="\t") ```