Introduction

In sctransform::vst we support several methods to estimate the parameters of the per-gene linear models. Here we briefly go over the methods and compare their results and runtime.

Methods

Comparison

We are going to process a PBMC dataset with all the methods listed above. The dataset is available from 10x Genomics.

cm <- Seurat::Read10X_h5(file = "~/Projects/data_warehouse/raw_public_10x/pbmc_10k_v3_filtered_feature_bc_matrix.h5")
# downsample to speed up compilation of this vignette
set.seed(42)
cm <- cm[, sample(x = ncol(cm), size = 5000)]
message(nrow(cm), " genes across ", ncol(cm), " cells")
#> 33538 genes across 5000 cells
result_lst <- list()
estimation_methods <- c("poisson", "qpoisson", "nb_fast", "nb", "glmGamPoi")
for (estimation_method in estimation_methods) {
    if (estimation_method %in% c("poisson", "nb_fast")) {
        for (theta_estimation_fun in c("theta.ml", "theta.mm")) {
            method_name <- paste(estimation_method, theta_estimation_fun, sep = "-")
            message(method_name)
            set.seed(33)
            vst_out <- vst(umi = cm, method = estimation_method, theta_estimation_fun = theta_estimation_fun, 
                verbosity = 0)
            vst_out$y <- NULL
            result_lst[[method_name]] <- vst_out
        }
    } else {
        method_name <- estimation_method
        message(method_name)
        set.seed(33)
        vst_out <- vst(umi = cm, method = estimation_method, verbosity = 0)
        vst_out$y <- NULL
        result_lst[[method_name]] <- vst_out
    }
}
#> poisson-theta.ml
#> poisson-theta.mm
#> qpoisson
#> nb_fast-theta.ml
#> nb_fast-theta.mm
#> nb
#> glmGamPoi

Show how the residual variances compare between methods (plot below shows log10-transformed values)

mat <- sapply(result_lst, function(x) x$gene_attr$residual_variance)
colnames(mat) <- names(result_lst)
ggpairs(data.frame(log10(mat)), progress = FALSE)

Show how the residual means compare between methods

mat <- sapply(result_lst, function(x) x$gene_attr$residual_mean)
colnames(mat) <- names(result_lst)
ggpairs(data.frame(mat), progress = FALSE)

Show the model parameters for all methods

plot_lst <- lapply(names(result_lst), function(method_name) {
    plot_model_pars(result_lst[[method_name]], show_theta = TRUE) + ggtitle(method_name)
})
plot(cowplot::plot_grid(plotlist = plot_lst, ncol = 2))

Overall, the regularized parameters and the resulting residuals are very similar across the methods. It is unlikely that the small differences we see above would lead to big differences in downstream analyses like dimensionality reduction and clustering.

How do the methods compare with respect to runtime?

tmp <- lapply(names(result_lst), function(method_name) {
    times <- result_lst[[method_name]]$times
    delta_t_total <- as.numeric(times$done) - as.numeric(times$start_time)
    delta_t_model <- as.numeric(times$reg_model_pars) - as.numeric(times$get_model_pars)
    data.frame(method_name, delta_t_total, delta_t_model)
})
df <- do.call(rbind, tmp) %>% mutate(delta_t_rest = delta_t_total - delta_t_model)
df$method_name <- factor(df$method_name, levels = df$method_name, ordered = TRUE)
df <- arrange(df, delta_t_total) %>% mutate(method_name = factor(method_name, unique(method_name)))
melt(df, id.vars = "method_name", measure.vars = c("delta_t_model", "delta_t_rest")) %>% 
    ggplot(aes(method_name, value, fill = variable)) + geom_bar(position = "stack", 
    stat = "identity") + theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    hjust = 1)) + xlab("Method") + ylab("Wall clock time in seconds") + scale_fill_discrete(name = "Part of algorithm", 
    labels = c("Model fitting", "Rest"))

df
#>        method_name delta_t_total delta_t_model delta_t_rest
#> 1         qpoisson      15.02742      3.181082     11.84634
#> 2 poisson-theta.mm      19.03439      6.650548     12.38385
#> 3        glmGamPoi      33.93400     22.256309     11.67769
#> 4 nb_fast-theta.mm      64.04613     52.501900     11.54423
#> 5 poisson-theta.ml      69.13704     57.056848     12.08020
#> 6 nb_fast-theta.ml     118.30736    106.537052     11.77031
#> 7               nb     626.24893    614.482909     11.76602

The runtime comparison above is not an in-depth benchmark, but it gives a good idea about the relative speed of the different methods. All times above were obtained using a single CPU core. However, the model fitting step supports the Future API for parallel processing.