library(dplyr)
library(ggplot2)
library(DT)
library(scales)

options(stringsAsFactors=FALSE)

write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
  write.table(x = x, file = file, sep=sep, quote=quote, row.names=row.names, na=na, ...)
}
# Read bindingdb and remove non-human interactions
binding.db <- file.path('data', 'binding.tsv.gz') %>%
  read.delim(stringsAsFactors=FALSE) %>%
  dplyr::filter(organism == 'Homo sapiens')

# View a subset of the data.frame
binding.db %>% dplyr::sample_n(200) %>% dplyr::select(-c(pubmed, doi)) %>% DT::datatable()

# Read the drugbank to bindingDB fuzzy mappings produced using UniChem
map.df <- 'http://git.dhimmel.com/drugbank/data/mapping/bindingdb.tsv' %>%
  read.delim(stringsAsFactors=FALSE)

# Restrict to compounds in drugbank
joined.df <- map.df %>%
  dplyr::inner_join(binding.db)
## Joining by: "bindingdb_id"

51164 compound–protein binding measurements are extracted for humans when restricting to DrugBank-mapped compounds.

geom.mean <- function(x) {
  # Returns the geometric mean
  exp(mean(log(x)))
}

ResolveAffinity <- function(df) {
  # Preferentially selects the affinity measure. If multiple meansurements
  # exist for the same compound-protein pair, the geometric mean is taken.
  measures <- df$measure
  for (measure in c('Kd', 'Ki', 'IC50')) {
    if (is.element(measure, measures)) {
      values <- df$affinity_nM[measures == measure]
      return.df <- data.frame(
        measure = measure,
        affinity_nM = geom.mean(values),
        n_measures = length(values))
      return(return.df)
    }
  }
}

# Create a single affinity measure for each compound-protein pair
collapse.df <- joined.df %>%
  dplyr::group_by(drugbank_id, bindingdb_id, uniprot, entrez_gene) %>%
  dplyr::do(ResolveAffinity(.)) %>%
  dplyr::ungroup()

collapse.df %>%
  write.delim('data/bindings-drugbank-collapsed.tsv')

# View a subset of the data.frame
collapse.df %>% dplyr::sample_n(200) %>% DT::datatable()

23131 compound–protein pairs were assayed.

gene.df <- collapse.df %>%
  dplyr::group_by(drugbank_id, entrez_gene) %>%
  dplyr::summarize(
    affinity_nM = min(affinity_nM),
    n_pairs = n()) %>%
  dplyr::ungroup()

gene.df %>%
  write.delim('data/bindings-drugbank-gene.tsv')

# View a subset of the data.frame
gene.df %>% dplyr::sample_n(200) %>% DT::datatable()

21617 drugbank–gene pairs have measured binding affinities.

Interaction retention based on affinity threshold

exp.range <- -5:11

gene.df %>%
  ggplot(aes(x = affinity_nM)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
    labels = scales::trans_format("log10", math_format(10^.x))) +
  theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

gene.df %>%
  ggplot(aes(x = affinity_nM)) +
  stat_ecdf() +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", n=10, function(x) 10^x),
    labels = scales::trans_format("log10", math_format(10^.x))) +
  theme_bw()

Interactions per compound and per gene when restricting to micromolar or stronger affinities.

gene.df %>%
  dplyr::filter(affinity_nM <= 1000) %>%
  dplyr::group_by(drugbank_id) %>%
  dplyr::summarize(n_genes = n()) %>%
  ggplot(aes(x=n_genes)) +
  geom_histogram(alpha=0.6) +
  scale_x_log10(breaks=c(1:3, 5, 10, 20, 50, 100)) +
  xlab('Genes bound per compound') +
  theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

gene.df %>%
  dplyr::filter(affinity_nM <= 1000) %>%
  dplyr::group_by(entrez_gene) %>%
  dplyr::summarize(n_compounds = n()) %>%
  ggplot(aes(x=n_compounds)) +
  geom_histogram(alpha=0.6) +
  scale_x_log10(breaks=c(1:5, 7, 10, 15, 25, 50)) +
  xlab('Compounds binding per gene') +
  theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.