Introduction

This is a (re-)analysis by Harold Pimentel of the simulation Simon Cockell presented on his blog.

Note that Rob Patro has also done a (re-)analysis of this data (link on Simon’s Blog). There are some differences in the correlation coefficients. These are likely attributable to implementation differences between Pandas and R.

Preliminaries

Load packages

library("dplyr")
library("sleuth")

Some constants

# Mean fragment length assumed by the simulation. This is the default from
# 'polyester' (see ?polyester::simulate_experiment ) and is used below to
# derive effective lengths (eff_len = len - MEAN_FL).
MEAN_FL <- 250.0
# Small pseudocount for plotting log-transformed values without -Inf
# (presumably added before taking logs -- not used in the visible excerpt).
SMALL <- 1e-2

Load kallisto results

# Load kallisto's HDF5 abundance estimates via sleuth. The second argument
# (FALSE) presumably disables reading bootstrap samples -- confirm against
# ?sleuth::read_kallisto_h5.
kal <- read_kallisto_h5("../../sim_test_kallisto/abundance.h5", FALSE)

Load Salmon results

# Read the quant.sf output of each of the ten Salmon runs into a list,
# one data frame per run.
salmon_dirs <- paste0("sim_test_salmon_", 1:10)
all_salmon <- lapply(salmon_dirs, function(run_dir) {
  read_salmon(file.path("..", "..", run_dir, "quant.sf"))
})

Ensure that all Salmon results are ordered correctly:

# Verify every Salmon run lists transcripts in the same order as the first.
# isTRUE() guards against all.equal() returning a character diff message on
# mismatch, which would make a raw `&&` reduction error out rather than
# report FALSE; vapply() pins the result type, unlike the original
# list-then-unlist round trip.
ids_match <- vapply(all_salmon,
  function(x) isTRUE(all.equal(x$target_id, all_salmon[[1]]$target_id)),
  logical(1))
all(ids_match)
## [1] TRUE

Take averages

# Average TPM and estimated counts across the ten Salmon runs.
# rowMeans() replaces apply(X, 1, mean): apply() first coerces the data
# frame to a matrix, which is slower and less direct for a row mean.
salmon_tpm <- lapply(all_salmon, function(x) x$tpm) %>%
  as.data.frame() %>%
  rowMeans()
salmon_counts <- lapply(all_salmon, function(x) x$est_counts) %>%
  as.data.frame() %>%
  rowMeans()
# One averaged table keyed by target_id (all runs share the same order,
# as checked above).
salmon_avg <- data.frame(target_id = all_salmon[[1]]$target_id,
  tpm = salmon_tpm, est_counts = salmon_counts)

Load oracle

Get the simulated target names

# Read the simulated transcript sequences and derive their IDs.
transcripts <- Biostrings::readDNAStringSet("../../data/select_transcripts.fa")
transcript_names <- names(transcripts)
# substr() is vectorized, so the lapply()/unlist() loop is unnecessary.
# The first 15 characters of each FASTA header are taken as the transcript
# ID (Ensembl-style, e.g. ENST00000623396).
transcript_ids <- substr(transcript_names, 1, 15)
target_ids <- data.frame(target_id = transcript_ids, stringsAsFactors = FALSE)

Load the oracle data and munge into a form that sleuth likes:

# Read the Salmon "oracle" quantification table (tab-separated, no header
# row; read.table's default comment.char = "#" skips any comment lines).
oracle <- read.table("../../data/quant_bias_corrected.sf", sep = "\t",
  stringsAsFactors = FALSE)
colnames(oracle) <- c("target_id", "len", "salmon_tpm", "fpkm", "counts")
# Derive the effective length, round counts to integers, and drop FPKM.
oracle <- oracle %>%
  mutate(eff_len = len - MEAN_FL, counts = round(counts)) %>%
  select(-fpkm)

The number of targets differs from the FASTA file to the number quantified against (thanks, Rob). Because of this issue, we’ll do a left join instead of an inner join.

# Left join keeps every simulated transcript; transcripts missing from the
# oracle table get NA in all oracle-derived columns.
oracle <- left_join(target_ids, oracle, by = "target_id")

Thus, rows whose target_id did not match the oracle table have NA values in the oracle-derived columns.

head(oracle)
##         target_id  len salmon_tpm counts eff_len
## 1 ENST00000623396   NA         NA     NA      NA
## 2 ENST00000302622 1002  0.0000000      0     752
## 3 ENST00000629937   NA         NA     NA      NA
## 4 ENST00000377996 2473  0.6632600     37    2223
## 5 ENST00000439587 2150  0.9274760     45    1900
## 6 ENST00000377991 3441  0.0457422      4    3191

Let’s just assign zeroes to the effective length and to the counts for the transcripts that didn’t appear in the dataset that was quantified against.

# Replace the NAs produced by the left join with zeros in the length,
# effective length, and count columns (salmon_tpm is intentionally left
# as-is here).
na_to_zero <- function(v) ifelse(is.na(v), 0, v)
oracle <- oracle %>%
  mutate(
    len = na_to_zero(len),
    eff_len = na_to_zero(eff_len),
    counts = na_to_zero(counts)
    )

Sanity check to ensure we’re not getting fragments from things that have effective length <= 0:

# Sanity check: no transcript should have reads (counts > 0) while its
# effective length is non-positive. Expect an empty result.
oracle %>%
  filter(counts > 0, eff_len <= 0)
## [1] target_id  len        salmon_tpm counts     eff_len   
## <0 rows> (or 0-length row.names)

Compute TPM from the rounded counts to get the true distribution. Note, this is going to be slightly different than the one read in.

# Recompute TPM from the rounded true counts and renormalize the Salmon
# TPMs so both columns are on the same 1e6 scale.
oracle <- oracle %>%
  mutate(tpm = counts_to_tpm(counts, eff_len),
    # salmon_tpm is NA for transcripts absent from the quantified set
    # (the NA fill above only touched len/eff_len/counts). Without
    # na.rm = TRUE the sum is NA, which propagates NA into every
    # renormalized value -- exactly the all-NA rel_diff summary seen in
    # the original output below.
    salmon_tpm = (salmon_tpm / sum(salmon_tpm, na.rm = TRUE)) * 1e6 ) %>%
  # Relative error is undefined where the true TPM is zero.
  mutate(rel_diff = ifelse(tpm > 0, abs(tpm - salmon_tpm) / tpm, NA))
oracle$rel_diff %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      NA      NA      NA     NaN      NA      NA    1129
# Split the oracle in two: a copy where `tpm` holds the Salmon estimate,
# and the main oracle stripped of the Salmon-specific columns.
oracle_salmon_tpm <- oracle %>%
  select(-tpm) %>%
  rename(tpm = salmon_tpm)
oracle <- oracle %>%
  select(-salmon_tpm, -rel_diff)

Final sanity check:

# Final sanity check: summary of the cleaned-up oracle table.
summary(oracle)
##   target_id              len            counts           eff_len     
##  Length:1129        Min.   :    0   Min.   :    0.0   Min.   : -208  
##  Class :character   1st Qu.:  412   1st Qu.:    0.0   1st Qu.:  162  
##  Mode  :character   Median :  693   Median :    0.0   Median :  443  
##                     Mean   : 1279   Mean   :  174.1   Mean   : 1076  
##                     3rd Qu.: 1608   3rd Qu.:    6.0   3rd Qu.: 1358  
##                     Max.   :12409   Max.   :21436.0   Max.   :12159  
##       tpm          
##  Min.   :     0.0  
##  1st Qu.:     0.0  
##  Median :     0.0  
##  Mean   :   885.7  
##  3rd Qu.:    76.5  
##  Max.   :120417.2
# kallisto and the oracle must cover exactly the same set of transcripts.
kal_ids <- sort(kal$abundance$target_id)
oracle_ids <- sort(oracle$target_id)
all.equal(kal_ids, oracle_ids)
## [1] TRUE

Definition of error

The relative difference is defined as:

\[ d_i = \frac{x_i - y_i}{\frac{1}{2}\left| x_i + y_i\right|} \]

where \(y_i\) is the true value and \(x_i\) is the estimate.

The relative error is similarly defined:

\[ e_i = \frac{x_i - y_i}{y_i} \]

Note that the relative error is undefined when \(y_i\) is zero.

# Pairwise relative difference: d_i = (x_i - y_i) / (|x_i + y_i| / 2).
#
# @param x numeric vector of estimates.
# @param y numeric vector of true values (same length as x).
# @return numeric vector the same length as x: 0 where both values are
#   exactly zero (by convention), NA where either input is NA. Note that
#   x_i == -y_i with both nonzero yields +/-Inf (zero denominator).
rel_diff <- function(x, y) {
  stopifnot(length(x) == length(y))

  result <- rep(NA_real_, length(x))

  # A pair of exact zeros has zero relative difference by convention.
  # which() drops NA comparisons, so NA inputs stay NA instead of being
  # silently mapped to 0 as in the original setdiff-based version.
  result[which(x == 0 & y == 0)] <- 0.0

  # Test against != 0 rather than > 0 so negative values are handled
  # correctly; the original `x > 0 | y > 0` classified any pair of
  # negative values as "both zero".
  nonzero <- which(x != 0 | y != 0)
  result[nonzero] <- 2 * (x[nonzero] - y[nonzero]) /
    abs(x[nonzero] + y[nonzero])

  result
}

Results

Summarize the data using sleuth

# Combine the kallisto estimates and the averaged Salmon estimates with the
# oracle truth into a single sleuth results object for the summaries below.
res <- merge_results(list(kal$abundance, salmon_avg), c("kallisto", "salmon_avg"), oracle)

filtered_summary() summarizes the results (comparing to oracle). When there is no filter given, it uses everything. We used this function in the kallisto preprint.

# Summarize accuracy against the oracle with no filter (all transcripts),
# printing each summary table wide; invisible() suppresses the return
# value of lapply() so only the printed tables appear.
invisible(lapply(filtered_summary(res), print, width = 300))
## Source: local data table [2 x 6]
## 
##           method   pearson  spearman med_rel_diff_no_zeroes med_rel_diff
## 1   tpm_kallisto 0.9908756 0.8355979              0.2735451    0.0000000
## 2 tpm_salmon_avg 0.9782799 0.7932114              1.7932829    0.3392771
##   med_per_err
## 1          NA
## 2          NA
## Source: local data table [2 x 6]
## 
##                  method   pearson  spearman med_rel_diff_no_zeroes
## 1   est_counts_kallisto 0.9860605 0.8360959              0.2702079
## 2 est_counts_salmon_avg 0.9768138 0.7907503              1.7907574
##   med_rel_diff med_per_err
## 1    0.0000000          NA
## 2    0.1901163          NA

med_rel_diff_no_zeroes ignores when both the estimate and the truth are zero. This is technically perfect, but if many transcripts have no reads mapping to them, then it will skew the distribution causing the median to be artificially low on the “interesting” transcripts (with expression > 0).

We can use filters similar to what Rob used in his notebook.

# Same summary, restricted to transcripts the oracle says are expressed
# (oracle TPM >= 0.01 and more than one true count), similar to the
# filters Rob used in his notebook. The condition is passed unevaluated
# to filtered_summary() (non-standard evaluation).
res %>%
  filtered_summary(tpm_oracle >= 0.01 & est_counts_oracle > 1) %>%
  lapply(print, width = 300) %>%
  invisible()
## Source: local data table [2 x 6]
## 
##           method   pearson  spearman med_rel_diff_no_zeroes med_rel_diff
## 1   tpm_kallisto 0.9948634 0.9280120              0.1601435    0.1601435
## 2 tpm_salmon_avg 0.9812736 0.9161121              0.3765674    0.3765674
##   med_per_err
## 1   0.1522873
## 2   0.3359627
## Source: local data table [2 x 6]
## 
##                  method   pearson  spearman med_rel_diff_no_zeroes
## 1   est_counts_kallisto 0.9912652 0.9350712              0.1426242
## 2 est_counts_salmon_avg 0.9825172 0.9391779              0.2161616
##   med_rel_diff med_per_err
## 1    0.1426242   0.1400430
## 2    0.2161616   0.2008727

Figures

TPM

Correlation

# Estimated vs. true TPM on log2(x + 1) axes, one panel per method, with
# the y = x identity line drawn underneath the points. Note xlim()/ylim()
# drop points outside [0, 17.5] (with a warning) rather than just zooming.
tpm_scatter <- ggplot(res$m_tpm, aes(log2(oracle + 1), log2(estimate + 1))) +
  geom_abline(intercept = 0, slope = 1, alpha = 0.2) +
  geom_point(alpha = 0.3) +
  facet_wrap(~ method, ncol = 1) +
  theme_bw(20) +
  xlim(0, 17.5) +
  ylim(0, 17.5)
tpm_scatter