--- title: "Cleaning up the phenotype files" author: "Malachy Campbell" date: "8/7/2017" output: rmdformats::html_clean: fig_width: 6 fig_height: 6 highlight: kate thumbnails: true lightbox: true gallery: true --- ```{r setup, include=FALSE} knitr::opts_knit$set(root.dir = '~/Desktop/RR') ``` #Background The purpose of this script is to check for outlier plants (e.g. those exhibiting abnormal growth), remove these abnormal plants in the dataset,and write the cleaned data to a file. ```{r load packages, echo=T} library(reshape2) library(plyr) library(stringr) ``` #Outlier detection Here, I will use the 1.5 IQR rule to identify outliers. For each plant that is flagged, I check the plot of the drought and control plants to look for abnormal behavior (i.e. no growth/straight lines, sharp decline in growth, etc.). Those with abnormal behavior were removed. ```{r load data, echo=T} #Load data allData <- read.csv("Phenotypes_raw/AllExp.csv") #Drop all the unnecessary columns PSA.df <- allData[c("Exp", "Snapshot.ID.Tag", "Genotype.ID", "Replicate", "Key", "DayOfImaging", "Projected.Shoot.Area")] rm(allData) ``` ```{r outlier detection, echo=T} #Flags large plants outlier.detection.up <- function(x){ lowerq = quantile(x)[2] upperq = quantile(x)[4] iqr = upperq - lowerq extreme.threshold.upper = (iqr * 1.5) + upperq return(extreme.threshold.upper) } #Flags small plants outlier.detection.low <- function(x){ lowerq = quantile(x)[2] upperq = quantile(x)[4] iqr = upperq - lowerq extreme.threshold.lower = lowerq - (iqr * 1.5) return(extreme.threshold.lower) } #Apply the outliers functions to each day and experiment. Here we will determine the threshold for each day and experiment ddout <- ddply(PSA.df, .(Exp, DayOfImaging), summarise, Upper=outlier.detection.up(Projected.Shoot.Area), Lower=outlier.detection.low(Projected.Shoot.Area)) #Now go through the dataset and flag the outliers by comparing the PSA values for each plant and the thresholds determined above. out.list=list() for (i in 1:3){ tmp.e <- PSA.df[PSA.df$Exp %in% paste0("E", i) ,] #Subsets the PSA dataset for each experiment out.e <- ddout[ddout$Exp %in% paste0("E", i) ,] #Subsets the outlier dataset for each experiment for(j in 1:20){ #Subset for each day of imaging tmp.d <- tmp.e[tmp.e$DayOfImaging %in% j ,] out.d <- out.e[out.e$DayOfImaging %in% j ,] foo <- tmp.d[tmp.d$Projected.Shoot.Area <= out.d$Lower ,] foo <- rbind(foo, tmp.d[tmp.d$Projected.Shoot.Area >= out.d$Upper ,]) out.list[[paste0(i,j)]] <- foo$Key } } out.list=unique(unlist(out.list)) #Generate plots for plants flagged as outliers #pdf("RR/Phenotypes_raw/outliersPSA.pdf") #for (i in 1:length(out.list)){ # tmp <- PSA.df[PSA.df$Key %in% out.list[i] ,] # plot(tmp$DayOfImaging, # tmp$Projected.Shoot.Area, # type="l", ylim=c(0, 1126838), # main=out.list[i], col="red") # } #dev.off() #These are the plants that had abnormal growth patterns bad.plants <- c("E1_UNL068_1", "E1_UNL137_2", "E1_UNL146_1", "E1_UNL015_1", "E1_UNL015_1", "E1_UNL001_1", "E1_UNL248_2", "E1_UNL047_1", "E1_UNL298_1", "E1_UNL042_1", "E1_UNL361_1", "E1_UNL215_2", "E2_UNL120_2", "E2_UNL146_2", "E2_UNL232_2", "E2_UNL001_1", "E2_UNL299_1", "E2_UNL210_1", "E3_UNL322_1", "E1_UNL142_2", "E2_UNL323_1") PSA.df <- PSA.df[!PSA.df$Key %in% bad.plants ,] ``` #Write cleaned data to files A total of 378 accessions were phenotyped for this project. The accessions were assigned an identified in the form of "UNLXXX". Here we will link those UNL IDs to the RDP1 identifiers ("NSFTV_XX""). We will drop the accessions without an NSFTV ID. ```{r get NSFTV.IDs for all genotypes, eval = F} #Read in the ID list Acc.ID <- read.csv("Phenotypes_raw/IdList.csv") #Merge with the phenotype file PSA.df <- merge(PSA.df, Acc.ID, by="Genotype.ID", all=F) #Drop accessions without and NSFTV ID PSA.df <- PSA.df[!PSA.df$NSFTV.ID %in% "NSFTV_NA" ,] length(unique(PSA.df$NSFTV.ID)) #357 accessions PSA.df$Genotype.ID=NULL PSA.df <- PSA.df[PSA.df$DayOfImaging > 0 ,] PSA.df <- PSA.df[PSA.df$DayOfImaging < 21 ,] #Drop useless columns PSA.df <- PSA.df[c("NSFTV.ID", "Exp", "Replicate", "DayOfImaging", "Projected.Shoot.Area")] colnames(PSA.df)[5] <- "PSA" #Since some accessions only have a single replicate we will pad the phenotype data with NA's. This way all accessions have an equal number of observations. This is a pretty hacky way of doing it, but it works. PSA.df <- dcast(PSA.df, NSFTV.ID ~ DayOfImaging + Replicate + Exp, value.var = "PSA") PSA.df <- melt(PSA.df, id.vars="NSFTV.ID") PSA.meta <- str_split_fixed(PSA.df$variable, "_", 3) PSA.df <- data.frame(NSFTV.ID = PSA.df$NSFTV.ID, Exp = PSA.meta[,3], Rep = PSA.meta[,2], DayOfImaging = PSA.meta[,1], PSA = PSA.df$value) PSA.df$DayOfImaging <- as.character(PSA.df$DayOfImaging) PSA.df$DayOfImaging <- as.numeric(PSA.df$DayOfImaging) PSA.df <- PSA.df[order(PSA.df$NSFTV.ID, PSA.df$Exp, PSA.df$Rep, PSA.df$DayOfImaging) ,] write.csv(PSA.df, "RR/Phenotypes_cleaned/PSA.cleaned.csv", row.names=F) #For the TP analyses we will need a file that is in a wide format (e.g. PSA at each day will be in each column) PSA.df <- dcast(PSA.df, NSFTV.ID + Exp + Rep ~ DayOfImaging, value.var = "PSA") colnames(PSA.df)[4:ncol(PSA.df)] = paste0("Y", colnames(PSA.df)[4:ncol(PSA.df)]) write.csv(PSA.df, "TP/Phenotypes_cleaned/PSA.cleaned.csv", row.names=F) ``` #Plot mean PSA This is the code used to make figure 2 in the manuscript. ```{r plot mean, echo=T} PSA.df <- read.csv("RR/FinalFiles/PSA.cleaned.csv") meanPSA <- ddply(PSA.df, .(DayOfImaging), summarise, Mean = mean(PSA, na.rm = T), SD = sd(PSA, na.rm = T)) #pdf("RR/Figures/PSA.pdf", h=2.1, w=3.5, useDingbats = F) #par(mar=c(3,3,1,.2), mgp=c(1.8,0.5,0), ps=9) plot(meanPSA$DayOfImaging, meanPSA$Mean, col="lightsteelblue4", ylim=c(0, 550000), pch=19, cex=0.5, ylab="PSA", xlab="Day of Imaging") lines(meanPSA$DayOfImaging, meanPSA$Mean, col="lightsteelblue4", cex=0.5) polygon(c(meanPSA$DayOfImaging, rev(meanPSA$DayOfImaging)), c(meanPSA$Mean - meanPSA$SD, rev(meanPSA$Mean + meanPSA$SD)), col = rgb(0.4313725,0.4823529,0.5450980,alpha=0.3), border = rgb(0.4313725,0.4823529,0.5450980,alpha=0.3)) #dev.off() ```