library(dplyr)
library(ggplot2)
library(DT)
library(scales)
options(stringsAsFactors=FALSE)
write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
write.table(x = x, file = file, sep=sep, quote=quote, row.names=row.names, na=na, ...)
}
# Read bindingdb and remove non-human interactions
binding.db <- file.path('data', 'binding.tsv.gz') %>%
read.delim(stringsAsFactors=FALSE) %>%
dplyr::filter(organism == 'Homo sapiens')
# View a subset of the data.frame
binding.db %>% dplyr::sample_n(200) %>% dplyr::select(-c(pubmed, doi)) %>% DT::datatable()
# Read the drugbank to bindingDB fuzzy mappings produced using UniChem
map.df <- 'http://git.dhimmel.com/drugbank/data/mapping/bindingdb.tsv' %>%
read.delim(stringsAsFactors=FALSE)
# Restrict to compounds in drugbank
joined.df <- map.df %>%
dplyr::inner_join(binding.db)
## Joining by: "bindingdb_id"
51164 compound–protein binding measurements are extracted for humans when restricting to DrugBank-mapped compounds.
geom.mean <- function(x) {
# Returns the geometric mean
exp(mean(log(x)))
}
ResolveAffinity <- function(df) {
# Preferentially selects the affinity measure. If multiple meansurements
# exist for the same compound-protein pair, the geometric mean is taken.
measures <- df$measure
for (measure in c('Kd', 'Ki', 'IC50')) {
if (is.element(measure, measures)) {
values <- df$affinity_nM[measures == measure]
return.df <- data.frame(
measure = measure,
affinity_nM = geom.mean(values),
n_measures = length(values))
return(return.df)
}
}
}
# Create a single affinity measure for each compound-protein pair
collapse.df <- joined.df %>%
dplyr::group_by(drugbank_id, bindingdb_id, uniprot, entrez_gene) %>%
dplyr::do(ResolveAffinity(.)) %>%
dplyr::ungroup()
collapse.df %>%
write.delim('data/bindings-drugbank-collapsed.tsv')
# View a subset of the data.frame
collapse.df %>% dplyr::sample_n(200) %>% DT::datatable()
23131 compound–protein pairs were assayed.
gene.df <- collapse.df %>%
dplyr::group_by(drugbank_id, entrez_gene) %>%
dplyr::summarize(
affinity_nM = min(affinity_nM),
n_pairs = n()) %>%
dplyr::ungroup()
gene.df %>%
write.delim('data/bindings-drugbank-gene.tsv')
# View a subset of the data.frame
gene.df %>% dplyr::sample_n(200) %>% DT::datatable()
21617 drugbank–gene pairs have measured binding affinities.
exp.range <- -5:11
gene.df %>%
ggplot(aes(x = affinity_nM)) +
geom_histogram(alpha = 0.6) +
scale_x_log10(
breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
labels = scales::trans_format("log10", math_format(10^.x))) +
theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
gene.df %>%
ggplot(aes(x = affinity_nM)) +
stat_ecdf() +
scale_x_log10(
breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
labels = scales::trans_format("log10", math_format(10^.x))) +
theme_bw()
gene.df %>%
dplyr::filter(affinity_nM <= 1000) %>%
dplyr::group_by(drugbank_id) %>%
dplyr::summarize(n_genes = n()) %>%
ggplot(aes(x=n_genes)) +
geom_histogram(alpha=0.6) +
scale_x_log10(breaks=c(1:3, 5, 10, 20, 50, 100)) +
xlab('Genes bound per compound') +
theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
gene.df %>%
dplyr::filter(affinity_nM <= 1000) %>%
dplyr::group_by(entrez_gene) %>%
dplyr::summarize(n_compounds = n()) %>%
ggplot(aes(x=n_compounds)) +
geom_histogram(alpha=0.6) +
scale_x_log10(breaks=c(1:5, 7, 10, 15, 25, 50)) +
xlab('Compounds binding per gene') +
theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.