This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document.
# CURRENT WORKING DIRECTORY MUST BE SET AS THE PARENT disease-similarity-fusion FOLDER
#setwd("~/Documents/disease-similarity-fusion")
# Folders used throughout this document, given relative to the working
# directory (the parent disease-similarity-fusion folder).
filePathToData = "Data"
filePathToSimilarityMatrices = "Data/Similarity Matrices"
filePathToScripts = "Scripts"

# Load the objects stored in diseaseDatasetInfo.Rda into the workspace.
load(file.path(filePathToData, "diseaseDatasetInfo.Rda"))

# Load drug-sharing data (evaluation matrices based on createDrugSimilarityMatrix.R)
load(file.path(filePathToData, "shareApprovedAndPhaseThreeDrugs.Rda"))
load(file.path(filePathToData, "shareApprovedDrugs.Rda"))

# Load similarity fusion and evaluation functions:
source(file.path(filePathToScripts, "Perform Similarity Fusion.R"))
## Warning: package 'limma' was built under R version 3.3.3
source(file.path(filePathToScripts, "Functions for Evaluation.R"))
# Names of the similarity spaces to load; each one is looked up as
# "<name>.Rda" inside filePathToSimilarityMatrices.
inputMatrices = c("ontologicalSimilarity","phenotypicSimilarity","litCoOccurrenceSimilarity","geneticSimilarity","transcriptomicSimilarity","drugSimilarity")
# list.files() treats `pattern` as a regular expression (not a shell glob), so
# match the literal ".Rda" extension anchored at the end of the file name.
allFiles = list.files(path = filePathToSimilarityMatrices, pattern = "\\.Rda$")
for(file in allFiles){
  # Strip only the trailing extension before comparing against the expected
  # names, so that other .Rda files in the folder are left unloaded.
  if(sub("\\.Rda$", "", file) %in% inputMatrices){
    load(file.path(filePathToSimilarityMatrices, file))
  }
}
# Code chunk not run
# Requires installation of DOSE package (biocLite("DOSE"))
# Each pair of lines below sources one creation script and then calls a
# create*() function, assigning the result to one of the names listed in
# inputMatrices. Presumably this rebuilds the same matrices that are otherwise
# loaded from the saved .Rda files -- TODO confirm.
# The script paths are relative to the working directory.
source("Creating the Similarity Matrices/Scripts/createDiseaseOntologySimilarityMatrix.R")
ontologicalSimilarity = createDiseaseOntologySimilarityMatrix()
source("Creating the Similarity Matrices/Scripts/createPhenotypicSimilarityMatrix.R")
phenotypicSimilarity = createPhenotypicSimilarityMatrix()
# NOTE(review): the script is named "...LiteratureCoOccurrence..." but the
# function called is createCoOccurrenceSimilarityMatrix() -- confirm the
# script defines that name.
source("Creating the Similarity Matrices/Scripts/createLiteratureCoOccurrenceSimilarityMatrix.R")
litCoOccurrenceSimilarity = createCoOccurrenceSimilarityMatrix()
source("Creating the Similarity Matrices/Scripts/createGeneticSimilarityMatrix.R")
geneticSimilarity = createGeneticSimilarityMatrix() # may take ~1 min
source("Creating the Similarity Matrices/Scripts/createTranscriptomicSimilarityMatrix.R")
transcriptomicSimilarity = createTranscriptomicSimilarityMatrix()
# NOTE(review): the script is named "createDrugSharingSimilarityMatrix.R" but
# the function called is createDrugSimilarityMatrix() -- confirm the script
# defines that name.
source("Creating the Similarity Matrices/Scripts/createDrugSharingSimilarityMatrix.R")
drugSimilarity = createDrugSimilarityMatrix()
# Fused matrix built from every space except the ontological one, with equal
# weights.
fusedMatrixMinusDO = createFusedMatrix(
  filePathToSimilarityMatrices,
  inputMatrices = c(
    "phenotypicSimilarity",
    "litCoOccurrenceSimilarity",
    "geneticSimilarity",
    "transcriptomicSimilarity",
    "drugSimilarity"
  ),
  weights = rep(1, 5)
)

# Fused matrix built from every space except the drug space, with equal
# weights.
fusedMatrixMinusDrug = createFusedMatrix(
  filePathToSimilarityMatrices,
  inputMatrices = c(
    "ontologicalSimilarity",
    "phenotypicSimilarity",
    "litCoOccurrenceSimilarity",
    "geneticSimilarity",
    "transcriptomicSimilarity"
  ),
  weights = rep(1, 5)
)

# Fused matrix built from all six spaces. No weights are passed here, so
# whatever default createFusedMatrix() defines is used.
fusedMatrix = createFusedMatrix(
  filePathToSimilarityMatrices,
  inputMatrices = c(
    "ontologicalSimilarity",
    "phenotypicSimilarity",
    "litCoOccurrenceSimilarity",
    "geneticSimilarity",
    "transcriptomicSimilarity",
    "drugSimilarity"
  )
)

# Load the saved random fused matrices (presumably the objects used below as
# the null distribution for the significance threshold -- TODO confirm the
# object names stored in these files).
load(file.path(filePathToData, "fusedRandomMatricesDistWeighted.Rda"))
load(file.path(filePathToData, "fusedRandomMatricesWeightedDistWeighted.Rda"))
# Function to find the proportion of values in a similarity matrix that exceed
# the significance threshold derived from a set of random matrices.
#
# Arguments:
#   similarityMatrix     Square numeric matrix; only the values below the
#                        diagonal are used.
#   randomMatrices       List of random similarity matrices. The threshold
#                        index assumes each has the same dimensions as
#                        similarityMatrix.
#   verbose              If TRUE, print the resulting proportion with cat().
#   significanceQuantile Quantile of the pooled random values used as the
#                        threshold (default 0.9999, i.e. the 99.99% level).
#
# Returns:
#   The proportion (between 0 and 1) of below-diagonal values of
#   similarityMatrix that are strictly greater than the threshold.
getProportionCutOffs = function(similarityMatrix,randomMatrices,verbose = FALSE,significanceQuantile = 0.9999){
  # Extract the below-diagonal values once instead of recomputing them.
  observedValues = similarityMatrix[lower.tri(similarityMatrix)]
  # Pool and sort the below-diagonal values of all random matrices.
  randomValues = sort(unlist(lapply(randomMatrices,function(x) x[lower.tri(x)]),use.names = FALSE))
  # Position of the requested quantile among the pooled random values. The
  # number of random matrices is taken from the input rather than assumed to
  # be 1000, so the index stays in range for any number of matrices; it is
  # floored (as numeric indexing would do) and kept at 1 or above.
  threshIndex = max(1,floor(length(randomMatrices)*significanceQuantile*length(observedValues)))
  thresh = randomValues[threshIndex]
  # Missing comparisons are not counted as significant, but they stay in the
  # denominator.
  proportionOfValuesWhichAreSignificant = sum(observedValues>thresh,na.rm = TRUE)/length(observedValues)
  if(verbose){cat(paste0("Proportion of values which are above the significance threshold: ",proportionOfValuesWhichAreSignificant))}
  return(proportionOfValuesWhichAreSignificant)
}
# Proportion of pairwise values in fusedMatrix that exceed the threshold
# computed from fusedRandomMatrices; verbose = TRUE also prints it.
# NOTE(review): fusedRandomMatrices is not created in this section --
# presumably it comes from one of the .Rda files loaded earlier; confirm.
proportionOfValuesWhichAreSignificantInFullMatrix = getProportionCutOffs(fusedMatrix,fusedRandomMatrices,verbose = TRUE)
## Proportion of values which are above the significance threshold: 0.0691336775674125
# applySignificanceThreshold() is defined outside this section (presumably in
# the sourced evaluation/fusion scripts). It is given 1 minus the significant
# proportion -- presumably the fraction of values to treat as non-significant;
# TODO confirm against its definition.
fusedMatrixSig = applySignificanceThreshold(fusedMatrix,1-proportionOfValuesWhichAreSignificantInFullMatrix)
# Hierarchically cluster the thresholded matrix, using 1 - similarity as the
# distance, and plot the resulting dendrogram.
plot(hclust(as.dist(1-fusedMatrixSig)))
# Code chunk not run
# Exports the thresholded fused matrix as a node list and an edge list
# (nodelist.csv and edgelist.csv, written to the working directory).
library(igraph) #for generating cytoscape graph
# Create graph representation from significant similarity matrix using igraph function
# (undirected, weighted by the matrix entries, diagonal ignored).
binarygraph = graph_from_adjacency_matrix(fusedMatrixSig,mode = "undirected",weighted = TRUE,diag = FALSE)
# Define nodes
# Two columns: the vertex name wrapped in literal double quotes, and the
# disont.label of the row of diseaseDatasetInfo whose condition equals that
# name.
# NOTE(review): assumes every vertex name matches exactly one condition;
# with zero or several matches sapply() returns a list rather than a vector.
write.csv(cbind(paste0("\"",V(binarygraph)$name,"\""),sapply(V(binarygraph)$name,function(x) diseaseDatasetInfo[which(diseaseDatasetInfo$condition==x),"disont.label"])),file="nodelist.csv",quote = FALSE)
# Define edges (including edge weight and whether the edge connects two diseases in different DO classes, i.e. a novel link)
# NOTE(review): sameDOClass is not defined in this section -- presumably a
# disease-by-disease matrix from a loaded file or sourced script, with 0
# meaning "different class"; confirm.
write.csv(cbind(get.edgelist(binarygraph),E(binarygraph)$weight,apply(get.edgelist(binarygraph),1,function(x) sameDOClass[x[1],x[2]]==0)),file = "edgelist.csv",quote = FALSE)
Code chunks below this point generate the plots used in ‘Understanding and predicting disease relationships through similarity fusion’; they are not required to create the disease map.
Note that as there are ties in the data, the quantile normalization doesn’t make the distributions exactly equal, but it brings them much closer.
Use the Pearson correlation between spaces to determine how similar the relationships between diseases are in different feature spaces.
Create a weighted fused matrix where the three highly similar spaces (ontology, phenotype, and literature co-occurrence) account for 33% of the fused matrix, instead of half.
# Weighted fusion of all six spaces: the first three spaces get weight 1 and
# the last three get weight 2, so the first three contribute 3 of 9 weight
# units in total.
fusedMatrixWeighted = createFusedMatrix(
  filePathToSimilarityMatrices,
  inputMatrices = c(
    "ontologicalSimilarity",
    "phenotypicSimilarity",
    "litCoOccurrenceSimilarity",
    "geneticSimilarity",
    "transcriptomicSimilarity",
    "drugSimilarity"
  ),
  weights = c(1, 1, 1, 2, 2, 2)
)
## Proportion of links in the significant disease map which are in different top-level classes: 0.152892561983471
## Proportion of links in the significant weighted disease map which are in different top-level classes: 0.170616113744076
## AUC, average of individual spaces (minus ontological space): 0.8522010416667
## AUC, fused matrix (minus ontological space): 0.9528232638889
## AUC Disease of Anatomical Entity, average of individual spaces (minus ontological space): 0.79450625
## AUC Disease of Anatomical Entity, fused matrix (minus ontological space): 0.91974375
## AUC Cellular Proliferation, average of individual spaces (minus ontological space): 0.909895833333333
## AUC Cellular Proliferation, fused matrix (minus ontological space): 0.985902777777778
Note that the ROC plots might look a little different from what the calculated AUC values suggest, because the plots are based on only a subset of the 1000 runs (the runs aren’t all evaluated at the same number of cut-offs).
## Median RR for diseases connected in the map: 2.352
## Median RR for diseases not connected in the map: 1.057
## What percentage of pairs in the disease map are comorbid?: 71.4%
## What percentage of pairs not connected in the disease map are comorbid?: 27.1%
## How many times higher is the percentage of comorbid disease map pairs than in non-connected pairs? 2.637
## What percentage of pairs in the disease map are comorbid?: 58.7%
## What percentage of pairs not connected in the disease map are comorbid?: 12.7%
## How many times higher is the percentage of comorbid disease map pairs than in non-connected pairs? 4.63
## What percentage of pairs in the disease map are comorbid?: 25.4%
## What percentage of pairs not connected in the disease map are comorbid?: 2.4%
## How many times higher is the percentage of comorbid disease map pairs than in non-connected pairs? 10.582