library(spacexr)
library(Matrix)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(ggplot2)
datadir <- system.file("extdata",'SpatialRNA/VisiumVignette',package = 'spacexr') # directory for sample Visium dataset
if(!dir.exists(datadir))
  dir.create(datadir)
savedir <- 'RCTD_results'
if(!dir.exists(savedir))
  dir.create(savedir)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cache = TRUE,
  out.width = "100%"
)

Introduction

Generalized Linear Admixture Models for Differential Expression, or CSIDE, is part of the spacexr R package for learning cell type-specific differential expression from spatial transcriptomics data. In this Vignette, we will use CSIDE to test for differential expression across multiple regions in a toy hippocampus Visium dataset. First, we will first use RCTD to assign cell types to a hippocampus Visium dataset. We will define cell type profiles using an annotated single cell RNA-sequencing (scRNA-seq) hippocampus dataset. We will test for differential expression across three spatial regions.

Data Preprocessing and running RCTD

First, we run RCTD on the data to annotated cell types. Please note that this follows exactly the content of the spatial transcriptomics RCTD vignette, except that we use RCTD on full mode. Since full mode can discover any number of cell types per pixel, it is a reasonable choice for technologies such as Visium that can have spatial resolution much larger than single cells. Please refer to the spatial transcriptomics vignette for more explanation on the RCTD algorithm.

### Load in/preprocess your data, this might vary based on your file type
if(!file.exists(file.path(savedir,'myRCTD_visium_full.rds'))) {
  counts <- as.data.frame(readr::read_csv(file.path(datadir,"counts.csv"))) # load in counts matrix
  coords <- read.csv(file.path(datadir,"coords.csv"))
  rownames(counts) <- counts[,1]; counts[,1] <- NULL # Move first column to rownames
  rownames(coords) <- coords[,1]; coords[,1] <- NULL # Move first column to rownames
  nUMI <- colSums(counts) # In this case, total counts per pixel is nUMI
  puck <- SpatialRNA(coords, counts, nUMI)
  barcodes <- colnames(puck@counts) # pixels to be used (a list of barcode names). 
  plot_puck_continuous(puck, barcodes, puck@nUMI, ylimit = c(0,round(quantile(puck@nUMI,0.9))), 
                       title ='plot of nUMI') 
  refdir <- system.file("extdata",'Reference/Visium_Ref',package = 'spacexr') # directory for the reference
  counts <- read.csv(file.path(refdir,"counts.csv")) # load in cell types
  rownames(counts) <- counts[,1]; counts[,1] <- NULL # Move first column to rownames
  cell_types <- read.csv(file.path(refdir,"cell_types.csv")) # load in cell types
  cell_types <- setNames(cell_types[[2]], cell_types[[1]])
  cell_types <- as.factor(cell_types) # convert to factor data type
  nUMI <- read.csv(file.path(refdir,"nUMI.csv")) # load in cell types
  nUMI <- setNames(nUMI[[2]], nUMI[[1]])
  reference <- Reference(counts, cell_types, nUMI)
  myRCTD <- create.RCTD(puck, reference, max_cores = 2)
  myRCTD <- run.RCTD(myRCTD, doublet_mode = 'full')
  saveRDS(myRCTD,file.path(savedir,'myRCTD_visium_full.rds'))
}

Exploring the full mode results

The results of RCTD full mode are stored in @results$weights. We next normalize the weights using normalize_weights so that they sum to one. Each entry represents the estimated proportion of each cell type on each pixel.

myRCTD <- readRDS(file.path(savedir,'myRCTD_visium_full.rds'))
barcodes <- colnames(myRCTD@spatialRNA@counts)
weights <- myRCTD@results$weights
norm_weights <- normalize_weights(weights)
cell_types <- c('Denate', 'Neurogenesis','Cajal_Retzius')
print(head(norm_weights[,cell_types])) # observe weight values
#> 6 x 3 Matrix of class "dgeMatrix"
#>                          Denate Neurogenesis Cajal_Retzius
#> AAAGGGATGTAGCAAG-1 5.091559e-02    0.2973533    0.06513845
#> AAAGGGCAGCTTGAAT-1 2.217595e-01    0.1662856    0.22292711
#> AACAACTGGTAGTTGC-1 6.172927e-05    0.1972219    0.07028278
#> AACCCAGAGACGGAGA-1 1.923783e-04    0.3324192    0.05714832
#> AACCGAGCTTGGTCAT-1 1.598240e-04    0.2902799    0.09502877
#> AACGATAGAAGGGCCG-1 7.180834e-07    0.4253288    0.01287445
plot_puck_continuous(myRCTD@spatialRNA, barcodes, norm_weights[,'Denate'], ylimit = c(0,0.5), 
                     title ='plot of Dentate weights') # plot Dentate weights

Choose regions

Now that we have successfully run RCTD, we can create a set of regions used for predicting differential expression in CSIDE. In this case, we will create three lists representing membership in the left, middle, or right regions. We construct region_list which is a list of these three lists.

### Create SpatialRNA object
region_left <- barcodes[which(myRCTD@spatialRNA@coords$x < quantile(myRCTD@spatialRNA@coords$x, 1/3))]
region_right <- barcodes[which(myRCTD@spatialRNA@coords$x > quantile(myRCTD@spatialRNA@coords$x, 2/3))]
region_middle <- setdiff(barcodes, union(region_left, region_right))
region_list <- list(region_left, region_right, region_middle)

# Differentially downregulate one gene
gene_list <- c('Fth1', 'Bsg')
myRCTD@originalSpatialRNA@counts <- myRCTD@spatialRNA@counts[gene_list,]
class_num <- rep(0, length(barcodes)); names(class_num) <- barcodes
class_num[region_middle] <- 1; class_num[region_right] <- 2 
plot_class(myRCTD@spatialRNA, barcodes, factor(class_num),  title ='plot of regions') # plot regions

Running CSIDE

After creating the list of regions, we are now ready to run CSIDE using the run.CSIDE.regions function. We will use two cores, and a false discovery rate of 0.25. Next, we will set a gene threshold (i.e. minimum gene expression) of 0.01, and we will set a cell_type_threshold (minimum instances per cell type) of 10.

Warning: On this toy dataset, we have made several choices of parameters that are not recommended for regular use. On real datasets, we recommend first consulting the CSIDE default parameters. This includes gene_threshold (default 5e-5), cell_type_threshold (default 125), fdr (default 0.01), and weight_threshold (default NULL). Please see ?run.CSIDE.regions for more information on these parameters.

#de
myRCTD@config$max_cores <- 2
myRCTD <- run.CSIDE.regions(myRCTD, region_list,cell_types = cell_types, gene_threshold = .01, cell_type_threshold = 10, fdr = 0.25, doublet_mode = F, weight_threshold = 0.1) 
#> choose_cell_types: running de with cell types Denate, Neurogenesis, Cajal_Retzius
#> run.CSIDE.general: configure params_to_test = 1run.CSIDE.general: configure params_to_test = 2run.CSIDE.general: configure params_to_test = 3
saveRDS(myRCTD,file.path(savedir,'myRCTDde_visium_regions.rds'))

Equivalently to using the run.CSIDE.regions function, those who want to have more precise control can build the design matrix directly. In this case, we use the build.designmatrix.regions function, which constructs the design matrix given a list of regions. After constructing the design matrix, one can use the run.CSIDE function to run CSIDE with this general design matrix.

X <- build.designmatrix.regions(myRCTD,region_list)
print(head(X))
#>                    [,1] [,2] [,3]
#> AACCCAGAGACGGAGA-1    1    0    0
#> AACGATAGAAGGGCCG-1    1    0    0
#> AACTGGGTCCCGACGT-1    1    0    0
#> AAGTAGAAGACCGGGT-1    1    0    0
#> AATGGTTCTCACAAGC-1    1    0    0
#> ACACCCGAGAAATCCG-1    1    0    0
barcodes <- rownames(X)
myRCTD_identical <- run.CSIDE(myRCTD, X, barcodes, cell_types, gene_threshold = .01, doublet_mode = F,
                        cell_type_threshold = 10, fdr = 0.25, weight_threshold = 0.1)
#> choose_cell_types: running de with cell types Denate, Neurogenesis, Cajal_Retzius
#> run.CSIDE.general: configure params_to_test = 2

CSIDE results

After running CSIDE, it is time to examine CSIDE results. We will focus on cell type Dentate. Furthermore, we will examine the original Fth1 gene.

#print results for cell type 'Dentate'
cell_type <- 'Denate'
gene <- 'Fth1'
print(paste("following results hold for", gene))
#> [1] "following results hold for Fth1"
print("check for covergence of each cell type")
#> [1] "check for covergence of each cell type"
print(myRCTD@de_results$gene_fits$con_mat[gene, ]) # did not converge (dataset was too small)
#>        Denate  Neurogenesis Cajal_Retzius 
#>         FALSE         FALSE         FALSE
print('estimated expression for Dentate in each region')
#> [1] "estimated expression for Dentate in each region"
cell_type_ind <- which(cell_type == cell_types)
print(myRCTD@de_results$gene_fits$all_vals[gene, , cell_type_ind]) 
#> [1] -3.926944 -5.262367 -5.376826

Load in significant genes from another dataset

Unfortunately, this toy dataset did not yield any significant genes. To illustrate the results of a normal dataset, we will examine the myRCTD@de_results$sig_gene_list object from another dataset. sig_gene_list is a list, for each cell type, of dataframes that contain the hypothesis testing results for significant genes. In particular, notice the following columns: p_val (p value), log_fc (log-fold-change between two regions) and X1…X4 (the estimated expression within each region):

sig_gene_list <- readRDS(file.path(datadir,"testes_sig_gene_list.rds")) # load in counts matrix
print(head(sig_gene_list$`2`))
#>                  sd    sd_lfc p_val    log_fc        X1        X2        X3
#> Acrv1    0.07274442 0.3037137     0 0.6795464 -7.369364 -7.264492 -6.998050
#> Actl7b   0.05362279 0.2483192     0 0.5575609 -6.468663 -6.384588 -6.330300
#> Antxrl   0.16001753 0.7084539     0 1.5944706 -9.165735 -8.567636 -7.908107
#> Aqp7     0.11520419 0.4276247     0 1.0264325 -8.302994 -7.767773 -7.276561
#> Arf4     0.08276788 0.4481691     0 0.9293800 -8.338457 -8.303514 -8.080831
#> BC049635 0.07362217 0.3386442     0 0.6470246 -7.542562 -6.895538 -6.775634
#>                 X4
#> Acrv1    -6.689818
#> Actl7b   -5.911103
#> Antxrl   -7.571264
#> Aqp7     -7.613193
#> Arf4     -7.374133
#> BC049635 -7.140916

Finally, we will plot CSIDE results in the savedir directory!

The following plot shows a spatial visualization of the Fth1 gene.

The function make_all_de_plots will automatically generate several types of plots displaying the CSIDE results across all genes and cell types. Typically, one would want to plot one or multiple significant genes. For this toy dataset, no genes were significant.

myRCTD <- readRDS(file.path(savedir,'myRCTDde_visium_regions.rds'))
plot_gene_regions(myRCTD, cell_type, gene, pixel_weight_thresh = 0.15, expr_thresh = 8)

make_all_de_plots(myRCTD, savedir)
#> Warning in plot_sig_genes_regions(cell_type, myRCTD@internal_vars_de$barcodes, :
#> plot_sig_genes_regions: Not plotting because less than 10 singlet barcodes found
#> for cell type Denate
#> Warning in plot_sig_genes_regions(cell_type, myRCTD@internal_vars_de$barcodes, :
#> plot_sig_genes_regions: Not plotting because less than 10 singlet barcodes found
#> for cell type Neurogenesis
#> Warning in plot_sig_genes_regions(cell_type, myRCTD@internal_vars_de$barcodes, :
#> plot_sig_genes_regions: Not plotting because less than 10 singlet barcodes found
#> for cell type Cajal_Retzius