---
title: "Genotypic data"
author: "Malachy Campbell"
date: "8/7/2018"
output:
  rmdformats::html_clean:
    fig_width: 6
    fig_height: 6
    highlight: kate
    thumbnails: true
    lightbox: true
    gallery: true
---

```{r setup, include=FALSE}
knitr::opts_knit$set(root.dir = '~/Documents/Dropbox/Work/Manuscripts/2018_RandomRegressionGWAS/ThePlantGenome/Revision/New Analysis/')
```

# Background
Here we will prep the genotypic data used for all analyses. The 44K SNP data [Zhao et al (2011)](https://www.nature.com/articles/ncomms1467) was obtained from [ricediversity.org](http://ricediversity.org/data/sets/44kgwas/RiceDiversity.44K.MSU6.Genotypes_PLINK.zip).

# Impute missing data
This code loads to the 44k SNP data, and imputes the missing markers. The missing markers were replaced with the mode of SNP values.
```{r impute missing data, echo = T, eval = F}
library(BGLR)

FAM <- read.table("DataandCode/RiceDiversity_44K_Genotypes_PLINK/sativas413.fam")[1:2]
MAP <- read.table("DataandCode/RiceDiversity_44K_Genotypes_PLINK/sativas413.map")

PED <- read_ped("DataandCode/RiceDiversity_44K_Genotypes_PLINK/sativas413.ped")
m <- PED$p
n <- PED$n
PED <- PED$x

##SNPs in PED are coded as 0, 1, 2, 3. 2 is missing data. 1 are heterozygous, 0 and 3 are homozygous for 1/1 and 2/2 for major allele and minor allele respectively
PED[PED == 2] <- NA 
PED[PED == 0] <- 0
PED[PED == 1] <- 1
PED[PED == 3] <- 2


W <- t(matrix(PED, nrow=m, ncol=n, byrow = T))
colnames(W) <- MAP$V2
rownames(W) <- paste0("NSFTV_", FAM$V2)

#Imputation
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

for (j in 1:ncol(W)) {
  W[, j] = ifelse(is.na(W[, j]), getmode(W[, j]), W[, j])
}
```

# Remove markers with low MAF
Markers with a MAF less than 0.05 were removed. This removed 3,211 markers, leaving a total of 33,690 markers for computing the genomic relationship matrix (GRM).
```{r remove markers with low MAF, echo = T, eval = F}
#load phenotypic data for 378 accessions
PSA.df <- read.csv("DataandCode/Phenotypes/PSA.cleaned.mn.csv")

W <- W[row.names(W) %in% PSA.df$NSFTV.ID ,]
W <- W[order(row.names(W)) ,]

freq <- colMeans(W) / 2
maf <- ifelse(freq > 0.5, 1-freq, freq)
maf.index <- which(maf < 0.05)
length(maf.index)
W <- W[, -maf.index]

MAP <- MAP[-maf.index ,]

write.table(MAP, "DataandCode/CompCluster/MAP_maf0.05.txt", row.names = F, col.names = F, quote = F, sep = "\t")
```

# Compute GRM using VanRaden's method
A genomic relationship matrix ($\mathbf{G}$) was calculated using [VanRaden (2008)](https://www.ncbi.nlm.nih.gov/pubmed/18946147).
$$\mathbf{G} = \frac{\mathbf{Z_{sc} Z_{sc}}' }{m}$$
Here, $\mathbf{Z_{c}}$ is a centered $n \times m$ matrix, where $m$ is 33,564 SNPs and $n$ is the 357 genotyped rice lines. 

```{r GRM, echo = T, eval = F}
Zsc <- scale(x=W,center=T,scale=T)
G <- tcrossprod(Zsc)/ncol(W)
G <- G + diag(nrow(W))*0.001
G <- G[match(unique(PSA.df$NSFTV.ID), row.names(G) ) ,]
G <- G[, match(unique(PSA.df$NSFTV.ID), colnames(G) )]

write.table(t(Zsc), "DataandCode/CompCluster/Zsc.txt", row.names = F, col.names = T, quote = F, sep = "\t")
write.table(G, "DataandCode/CompCluster/G.txt", row.names = F, col.names = T, quote = F, sep = "\t")

G.final <- as.data.frame(which(row(G) >= col(G),arr.ind=TRUE))
G.final$G <- G[lower.tri(G, diag=T)]
G.final <- G.final[,c(2,1,3)]
G.final <- G.final[order(G.final[,2], G.final[,1]),]
G.final <- G.final[,c(2,1,3)]
colnames(G.final)[1:2]=c("Row", "Column")
attr(G.final, "rowNames") <- row.names(G)

write.table(G.final, "DataandCode/CompCluster/RR/G2.grm", col.names=F, row.names=F, quote=F, sep="\t")
write.table(G.final, "DataandCode/CompCluster/TP/G2.grm", col.names=F, row.names=F, quote=F, sep="\t")
```