Collapsing BindingDB compound–gene relationships

library(dplyr)
library(ggplot2)
library(DT)
library(scales)

options(stringsAsFactors=FALSE)

write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
  # Writes x to file as delimited text by forwarding to write.table.
  #
  # Args:
  #   x: object to write, passed to write.table as x.
  #   file: output destination, passed to write.table as file.
  #   sep: field separator; defaults to a tab.
  #   quote: passed to write.table as quote; defaults to FALSE.
  #   row.names: passed to write.table as row.names; defaults to FALSE.
  #   na: string written for missing values; defaults to the empty string.
  #   ...: any further arguments, forwarded to write.table unchanged.
  #
  # Returns:
  #   Whatever write.table returns.
  write.table(
    x, file,
    sep = sep,
    quote = quote,
    row.names = row.names,
    na = na,
    ...
  )
}

# Load the BindingDB extract and keep only the rows whose organism is
# 'Homo sapiens'
binding.db <- dplyr::filter(
  read.delim(file.path('data', 'binding.tsv.gz'), stringsAsFactors = FALSE),
  organism == 'Homo sapiens')

# Show 200 randomly sampled rows, without the pubmed and doi columns, as an
# interactive table
binding.db %>%
  dplyr::sample_n(200) %>%
  dplyr::select(-pubmed, -doi) %>%
  DT::datatable()

Show entries

Search:

reaction_id	bindingdb_id	uniprot	entrez_gene	measure	affinity_nM	source	organism
50909899	50311670	P14061	3292	IC50	392	ChEMBL	Homo sapiens
51021796	50418384	P37288	552	Ki	5.62	ChEMBL	Homo sapiens
50735318	50099174	P31391	6754	Ki	3840	ChEMBL	Homo sapiens
50288085	50156710	P78536	6868	IC50	51	ChEMBL	Homo sapiens
50043856	50020503	P14416	1813	Ki	3.3	ChEMBL	Homo sapiens
50339166	50183229	O43174	1592	IC50	1.6	ChEMBL	Homo sapiens
50193460	50106744	P10635	1565	IC50	10500	ChEMBL	Homo sapiens
50340925	50183945	P03951	2160	IC50	250001	ChEMBL	Homo sapiens
51065486	50398230	Q9Y2T6	9290	IC50	10001	ChEMBL	Homo sapiens
50705496	50359035	P15121	231	IC50	193501	ChEMBL	Homo sapiens

Showing 1 to 10 of 200 entries

Previous1 2 3 4 5…20Next

# Read the DrugBank to BindingDB fuzzy mappings produced using UniChem
map.df <- 'http://git.dhimmel.com/drugbank/data/mapping/bindingdb.tsv' %>%
  read.delim(stringsAsFactors=FALSE)

# Restrict to compounds in DrugBank. The join key is named explicitly so the
# join cannot silently widen to any other column name the two tables come to
# share, and so dplyr does not have to guess the key.
joined.df <- map.df %>%
  dplyr::inner_join(binding.db, by = 'bindingdb_id')

## Joining by: "bindingdb_id"

51164 compound–protein binding measurements are extracted for humans when restricting to DrugBank-mapped compounds.

geom.mean <- function(x, na.rm = FALSE) {
  # Returns the geometric mean of x, computed as exp(mean(log(x))).
  #
  # Args:
  #   x: numeric vector. A zero gives a result of 0 (log(0) is -Inf) and a
  #      negative value gives NaN, because the computation goes through log().
  #   na.rm: if TRUE, missing values are dropped before averaging. The default
  #          (FALSE) keeps the previous behaviour: any NA in x gives NA.
  #
  # Returns:
  #   A single numeric value.
  exp(mean(log(x), na.rm = na.rm))
}

ResolveAffinity <- function(df) {
  # Collapses the measurements in df to a single affinity row.
  #
  # The measure types are tried in the order Kd, Ki, IC50 and the first one
  # present in df is used. If multiple measurements of that type exist, their
  # geometric mean is taken.
  #
  # Args:
  #   df: data.frame with a measure column and an affinity_nM column.
  #
  # Returns:
  #   A one-row data.frame with columns measure, affinity_nM and n_measures
  #   (the number of measurements that were averaged). If df holds none of the
  #   three measure types, a zero-row data.frame with the same columns is
  #   returned instead of NULL, so the result is always a data.frame (which is
  #   what the dplyr::do() call that uses this function requires).
  measures <- df$measure
  for (measure in c('Kd', 'Ki', 'IC50')) {
    # which() skips rows whose measure is NA; indexing with the raw logical
    # comparison would pull an NA affinity into the mean for each such row.
    matched <- which(measures == measure)
    if (length(matched) > 0) {
      values <- df$affinity_nM[matched]
      return(data.frame(
        measure = measure,
        affinity_nM = geom.mean(values),
        n_measures = length(values),
        stringsAsFactors = FALSE))
    }
  }
  # No preferred measure type was found
  data.frame(
    measure = character(0),
    affinity_nM = numeric(0),
    n_measures = integer(0),
    stringsAsFactors = FALSE)
}

# Collapse to one row per compound-protein pair by applying ResolveAffinity
# to each (drugbank_id, bindingdb_id, uniprot, entrez_gene) group
collapse.df <- joined.df %>%
  dplyr::group_by(
    drugbank_id,
    bindingdb_id,
    uniprot,
    entrez_gene
  ) %>%
  dplyr::do(ResolveAffinity(.)) %>%
  dplyr::ungroup()

# Save the collapsed table as tab-delimited text
write.delim(collapse.df, 'data/bindings-drugbank-collapsed.tsv')

# Show 200 randomly sampled rows as an interactive table
collapse.df %>%
  dplyr::sample_n(200) %>%
  DT::datatable()

Show entries

Search:

drugbank_id	bindingdb_id	uniprot	entrez_gene	measure	affinity_nM	n_measures
DB01103	50015214	P00390	2936	IC50	1000001	1
DB04849	50331096	P46734	5606	Kd	10001.00000000001	1
DB06288	81790	P23975	6530	Ki	10001.00000000001	1
DB07790	50263028	P11802	1019	IC50	199.9999999999999	1
DB06589	26474	P54646	5563	Kd	10001.00000000001	2
DB07138	15244	Q14164	9641	Kd	10001.00000000001	2
DB08183	50193995	Q15118	5163	IC50	10001.00000000001	1
DB02010	50059889	Q99759	4215	Kd	2800	1
DB08584	50355498	Q96L34	57787	Kd	10001.00000000001	1
DB08584	50355498	Q5S007	120892	Kd	10001.00000000001	2

Showing 1 to 10 of 200 entries

Previous1 2 3 4 5…20Next

23131 compound–protein pairs were assayed.

# Collapse to one row per (drugbank_id, entrez_gene) pair, keeping the
# smallest affinity_nM among the pair's rows and counting those rows
gene.df <- collapse.df %>%
  dplyr::group_by(drugbank_id, entrez_gene) %>%
  dplyr::summarise(affinity_nM = min(affinity_nM), n_pairs = n()) %>%
  dplyr::ungroup()

# Save the gene-level table as tab-delimited text
write.delim(gene.df, 'data/bindings-drugbank-gene.tsv')

# Show 200 randomly sampled rows as an interactive table
gene.df %>%
  dplyr::sample_n(200) %>%
  DT::datatable()

Show entries

Search:

drugbank_id	entrez_gene	affinity_nM	n_pairs
DB08183	79837	10001.00000000001	1
DB07138	2263	10001.00000000001	1
DB08183	3702	10001.00000000001	1
DB08299	5141	60.7837906928259	1
DB00738	146802	5299.056519796705	1
DB00194	1803	62.00000000000001	1
DB05424	2049	10001.00000000001	1
DB00734	1816	563.0000000000001	1
DB01268	9833	1968.194638166221	1
DB00398	5596	10001.00000000001	1

Showing 1 to 10 of 200 entries

Previous1 2 3 4 5…20Next

21617 DrugBank compound–gene pairs have measured binding affinities.

Interaction retention based on affinity threshold

# NOTE(review): exp.range is not referenced by any code visible in this
# document -- confirm whether it is still needed
exp.range <- -5:11

# Histogram of the gene-level affinities on a log10 x-axis whose tick labels
# are written as powers of ten
ggplot(gene.df, aes(x = affinity_nM)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10(
    breaks = scales::trans_breaks('log10', function(x) 10^x, n = 10),
    labels = scales::trans_format('log10', scales::math_format(10^.x))) +
  theme_bw()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Empirical cumulative distribution of the gene-level affinities on a log10
# x-axis whose tick labels are written as powers of ten
ggplot(gene.df, aes(x = affinity_nM)) +
  stat_ecdf() +
  scale_x_log10(
    breaks = scales::trans_breaks('log10', function(x) 10^x, n = 10),
    labels = scales::trans_format('log10', scales::math_format(10^.x))) +
  theme_bw()

Interactions per compound and per gene when restricting to micromolar or stronger affinities.

# Histogram of the number of genes per compound, counting only rows with
# affinity_nM <= 1000
dplyr::filter(gene.df, affinity_nM <= 1000) %>%
  dplyr::group_by(drugbank_id) %>%
  dplyr::summarise(n_genes = n()) %>%
  ggplot(aes(x = n_genes)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10(breaks = c(1, 2, 3, 5, 10, 20, 50, 100)) +
  labs(x = 'Genes bound per compound') +
  theme_bw()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Histogram of the number of compounds per gene, counting only rows with
# affinity_nM <= 1000
dplyr::filter(gene.df, affinity_nM <= 1000) %>%
  dplyr::group_by(entrez_gene) %>%
  dplyr::summarise(n_compounds = n()) %>%
  ggplot(aes(x = n_compounds)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10(breaks = c(1, 2, 3, 4, 5, 7, 10, 15, 25, 50)) +
  labs(x = 'Compounds binding per gene') +
  theme_bw()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.