CSIDE on the Slide-seq KP tumor (immune cell-dependent DE)

Load in CSIDE Results and calculate significant genes

library(spacexr)
library(Matrix)
library(devtools)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(dplyr)
library(ggrepel)
library(fields)
library(stringr)
library(GSA)
load_all()
# Load in spatialRNA data and Reference data
pwd = getwd()
datadir <- paste0('~/Documents/MIT/Research/Rafalab/Projects/slideseq/Cell Demixing/ContentStructure/CSIDE/data/tumor','/')
resultsdir <- paste0('~/Documents/MIT/Research/Rafalab/Projects/slideseq/Cell Demixing/ContentStructure/CSIDE/results/ResultsTumor','/')
myRCTD = readRDS(paste0(resultsdir,'myRCTDde.rds'))
cell_types = c("CAF","hepatocyte 2","vascular smooth mc")
cell_types_present = c("CAF","hepatocyte 2","vascular smooth mc", 'monocyte/DC')
myRCTD@de_results$gene_fits$s_mat <- myRCTD@de_results$gene_fits$I_mat
de_results = myRCTD@de_results
gene_fits <- de_results$gene_fits
cell_type <- 'CAF'
gene_list_type <- get_gene_list_type_wrapper(myRCTD, cell_type, cell_types_present)
res_genes <- find_sig_genes_individual(cell_type, cell_types, myRCTD@de_results$gene_fits, gene_list_type,myRCTD@internal_vars_de$X2, fdr = 0.01, p_thresh = 0.001, log_fc_thresh = 0.4)
my_genes <- gene_list_type[grep("^(Rps|Rpl|mt-)",gene_list_type)]
my_genes <- intersect(rownames(res_genes),my_genes)
res_genes[my_genes,]
my_beta <- myRCTD@internal_vars_de$my_beta
cell_type <- 'CAF'
barcodes_sing <- names(which(my_beta[myRCTD@internal_vars_de$all_barc,cell_type] > 0.999))
big_sing <- intersect(barcodes_sing,names(which(myRCTD@internal_vars_de$X2[,2] > 0.5)))
sm_sing <- intersect(barcodes_sing,names(which(myRCTD@internal_vars_de$X2[,2] < 0.5)))
Y <- colSums(myRCTD@spatialRNA@counts[my_genes,])
Yn <- Y / myRCTD@spatialRNA@nUMI
p <- plot_puck_continuous(myRCTD@spatialRNA, barcodes_sing, Y, ylimit= c(0,100))
p <- plot_puck_continuous(myRCTD@spatialRNA, barcodes_sing, Yn, ylimit= c(0,0.05))
mean(Y[big_sing])
mean(Y[sm_sing])
mean(Yn[big_sing])
mean(Yn[sm_sing])
mean(myRCTD@spatialRNA@nUMI[big_sing])
mean(myRCTD@spatialRNA@nUMI[sm_sing])
gene_list_type <- gene_list_type[-grep("^(Rps|Rpl|mt-)",gene_list_type)]
res_genes <- res_genes[intersect(rownames(res_genes),gene_list_type),]
dim(res_genes)

Plot predictive-variable and cell types

my_barc <- rownames(myRCTD@results$results_df)[which((myRCTD@results$results_df$first_type == 'monocyte/DC' & myRCTD@results$results_df$spot_class != 'reject') | (myRCTD@results$results_df$second_type == 'monocyte/DC' & myRCTD@results$results_df$spot_class == 'doublet_certain' ))]
p1 <- plot_puck_continuous(myRCTD@spatialRNA, colnames(myRCTD@spatialRNA@counts) , myRCTD@internal_vars_de$X2[,2], 
                     title ='') + geom_point(data = myRCTD@spatialRNA@coords[my_barc,], size = 0.5, color = "#D55E00") +  ggplot2::scale_size_identity() + coord_fixed() + theme(legend.position="top")+ 
  scale_y_continuous(limits = c(1100,3750)) + scale_x_continuous( limits = c(2300,4800))+ geom_segment(aes(x = 2400, y = 1300, xend = 2784.6, yend = 1300), color = "black") + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank()) + scale_colour_gradientn("Myeloid Density",breaks = c(0,1), labels = c(0,1), colors = pals::brewer.blues(20)[2:20], lim = c(0,1))
results_df <- myRCTD@results$results_df
puck <- myRCTD@spatialRNA
barcodes = rownames(results_df[results_df$spot_class != "reject" & puck@nUMI >= 100,])
my_table = puck@coords[barcodes,]
my_table$class = results_df[barcodes,]$first_type
n_levels = myRCTD@cell_type_info$info[[3]]
my_pal = pals::kelly(n_levels+1)[2:(n_levels+1)]
names(my_pal) = myRCTD@cell_type_info$info[[2]]
my_pal_curr <- my_pal
my_pal_curr["vascular smooth mc"] <- "#CC79A7"
my_pal_curr["LSEC"] <- "#E69F00"
my_pal_curr["monocyte/DC"] <- "#D55E00"
my_pal_curr["CAF"] <- "#009E73"
my_pal_curr["hepatocyte 2"] <- "#0072B2"
pres = unique(as.integer(my_table$class))
pres = pres[order(pres)]
p2 <- ggplot2::ggplot(my_table, ggplot2::aes(x=x, y=y)) + ggplot2::geom_point(ggplot2::aes(size = .4, shape=19,color=class)) + ggplot2::scale_color_manual("",values = my_pal_curr[pres], breaks = c('CAF','monocyte/DC','hepatocyte 2','vascular smooth mc', 'LSEC'), labels = c('Cancer cells','Myeloid cells','Hepatocytes','Vascular Smooth MC', 'LSEC'))+ ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + coord_fixed() + theme(legend.position="top")+ guides(colour = guide_legend(override.aes = list(size=2)))+ 
  scale_y_continuous(limits = c(1100,3750)) + scale_x_continuous( limits = c(2300,4800))+ geom_segment(aes(x = 2400, y = 1300, xend = 2784.6, yend = 1300), color = "black") + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())
ggarrange(p1,p2,nrow = 2)

Gene ontology testing

# Get a list of the overexpressed and under expressed genes in CAF cells
over_genes <- tolower(rownames(res_genes[res_genes$log_fc > 0,]))
under_genes <- tolower(rownames(res_genes[res_genes$log_fc < 0,]))
# Load hallmark gene sets and change the genes to all lowercase so it's easier to work with.
gene_sets = GSA.read.gmt(file.path(datadir,'hallmark_genesets.gmt'))
length(intersect(Reduce(union,gene_sets), tolower(rownames(myRCTD@cell_type_info$info[[1]])))) / length(Reduce(union,gene_sets)) #check for overlap of genes
gene_set_names = gene_sets$geneset.names
gene_set_descriptions = gene_sets$geneset.descriptions
gene_sets = gene_sets$genesets
names(gene_sets)=gene_set_names
gene_sets = lapply(gene_sets, tolower)
n_sets = length(gene_sets)
#### new GSA
NUM_GENE_THRESH <- 5
Q_THRESH <- 0.05
N_samp <- 100000
log_fc_thresh <- 0.4
data_res <- matrix(0, n_sets, 3)
for(i in 1:n_sets) {
  my_genes <- gene_list_type[tolower(gene_list_type) %in% gene_sets[[i]]]
  my_genes_big <- names(which(abs(myRCTD@de_results$gene_fits$mean_val[my_genes,'CAF']) > log_fc_thresh))
  my_Z_big <- (myRCTD@de_results$gene_fits$mean_val[my_genes_big,'CAF'] / myRCTD@de_results$gene_fits$I_mat[my_genes_big,2])
  Z_mean <- mean(my_Z_big)
  n <- length(my_Z_big)
  my_genes <- gene_list_type[tolower(gene_list_type) %in% Reduce(union, gene_sets)]
  my_genes_big <- names(which(abs(myRCTD@de_results$gene_fits$mean_val[my_genes,'CAF']) > log_fc_thresh))
  my_Z_big <- (myRCTD@de_results$gene_fits$mean_val[my_genes_big,'CAF'] / myRCTD@de_results$gene_fits$I_mat[my_genes_big,2])
  sample_res <- numeric(N_samp)
  for(j in 1:N_samp)
    sample_res[j] <- mean(sample(my_Z_big, n))
  if(n > 0) {
    if(Z_mean > 0)
      p_val <- mean(Z_mean < sample_res)*2
    else
      p_val <- mean(Z_mean > sample_res)*2
  } else {
    p_val <- 1
    Z_mean <- 0
  }

  data_res[i,] <- c(n, p_val, Z_mean)
}

# NULL
q_vals <- p.adjust(data_res[data_res[,1] >= NUM_GENE_THRESH,2], method = 'BH')
sig_gene_sets <- which(data_res[,1] >= NUM_GENE_THRESH)[
  which(q_vals < Q_THRESH)]
final_df <- data.frame(data_res[sig_gene_sets,])
colnames(final_df) <- c('n', 'p', 'Z')
final_df$name <- gene_set_names[sig_gene_sets]
final_df$q <- q_vals[q_vals < Q_THRESH]
saveRDS(final_df, file.path(resultsdir, 'gsa_results.rds'))
final_df <- readRDS(file.path(resultsdir, 'gsa_results.rds'))

Make volcano plot

cell_type <- 'CAF'
log_fc <- myRCTD@de_results$gene_fits$mean_val[gene_list_type,cell_type]
cell_type_ind <- which(myRCTD@internal_vars_de$cell_types == cell_type)*2
z_score <- log_fc / myRCTD@de_results$gene_fits$I_mat[gene_list_type, cell_type_ind]
p_vals <- 2*(1-pnorm(abs(z_score)))
p_vals[p_vals == 0] <- 1e-16

plot_df <- data.frame(log_fc, -log(p_vals,10))
colnames(plot_df) <- c('log_fc','p_val')
MAX_P_VAL <- -log(max(res_genes$p_val),10)
plot_df$gene <- rownames(plot_df)
#plot_df$label <- plot_df$gene
#plot_df$label[!(plot_df$label %in% rownames(res_genes))] <- NA
plot_df$sig <- (rownames(plot_df) %in% rownames(res_genes))
plot_df$label <- NA
plot_df[c('Dpysl3', 'Col7a1','Col6a1','Col6a2','Lrp1','Inhba','Pmepa1','Timp3','Lrp1','Mgp','S100a13','Krt19','Krt79','S100a1','Ccl2','Spred1','Ecm1','Nfkb1','S100a10','S100a6','Krt18','Krt8','Krt14','Krtcap2'), 'label'] <- c('Dpysl3', 'Col7a1','Col6a1','Col6a2','Lrp1','Inhba','Pmepa1','Timp3','Lrp1','Mgp','S100a13','Krt19','Krt79','S100a1','Ccl2','Spred1','Ecm1','Nfkb1','S100a10','S100a6','Krt18','Krt8','Krt14','Krtcap2')
plot_df$log_fc <- plot_df$log_fc*log(exp(1),2)
plot_df$group <- 'reg'
gene_sets = GSA.read.gmt(file.path(datadir,'hallmark_genesets.gmt'))
gene_set_names = gene_sets$geneset.names
gene_set_descriptions = gene_sets$geneset.descriptions
gene_sets = gene_sets$genesets
names(gene_sets)=gene_set_names
gene_sets = lapply(gene_sets, tolower)
my_genes <- gene_list_type[tolower(gene_list_type) %in% gene_sets[[30]]]
my_gl <-  intersect(my_genes,rownames(res_genes))
plot_df[rownames(plot_df) %in% my_gl, 'group'] <- 'emt'
p <- ggplot(plot_df, aes(x=log_fc, y = p_val, color = group, alpha = sig)) + geom_point() + theme_classic()  +
  geom_vline(xintercept = 0.4*log(exp(1),2), linetype = 'dotted') + geom_vline(xintercept = -0.4*log(exp(1),2), linetype = 'dotted') +
  geom_label_repel(data = plot_df,aes(label = label),nudge_x = 0,na.rm = TRUE, show.legend = F, size = 4, label.padding = 0.1, max.overlaps = 15)+ xlab('Estimated cell type-specific DE by CSIDE') + ylab('CSIDE p-value') + scale_color_manual("", labels = c('EMT', 'Other'), values = c('#D55E00','black')) + xlab('Estimated cell type-specific DE by CSIDE') + ylab('CSIDE p-value') + scale_alpha_manual("", labels = c('Not significant', 'Significant'), values = c(0.2,1))#+ geom_label_repel(data = plot_df[plot_df$group == 'emt',],aes(label = label),nudge_x = 0,na.rm = TRUE, show.legend = F, size = 4, label.padding = 0.1, max.overlaps = 15) #plot_df[plot_df$group != 'emt',]
p

Plot EMT genes

emt_list <- my_gl
X2 <- myRCTD@internal_vars_de$X2
gene_fits <- myRCTD@de_results$gene_fits
all_barc <- myRCTD@internal_vars_de$all_barc
my_beta <- myRCTD@internal_vars_de$my_beta
puck <- myRCTD@spatialRNA
cell_type <- 'CAF'
barcodes_sing <- names(which(my_beta[all_barc,cell_type] > 0.999))
MULT = 500
density_thresh <- 0.2
barc_plot <- intersect(barcodes_sing,colnames(puck@counts)[puck@nUMI >= 200])
Y_plot <- MULT*colSums(puck@counts[emt_list,])/puck@nUMI
ge_thresh <- 2.5
my_class <- rep(0,length(barc_plot)); names(my_class) <- barc_plot
my_class[(X2[barc_plot,2] <= density_thresh) & (Y_plot[barc_plot] <= ge_thresh)] <- 1
my_class[(X2[barc_plot,2] <= density_thresh) & (Y_plot[barc_plot] > ge_thresh)] <- 3
my_class[(X2[barc_plot,2] > density_thresh) & (Y_plot[barc_plot] <= ge_thresh)] <- 2
my_class[(X2[barc_plot,2] > density_thresh) & (Y_plot[barc_plot] > ge_thresh)] <- 4
p3 <- plot_class(puck, barc_plot[order(my_class[barc_plot])], factor(my_class))
suppressMessages(p3 <- p3 + scale_color_manual(values=c("#CCE2EF","#F6DECC","#0072B2","#D55E00"))+ ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + coord_fixed() + theme(legend.position="top")+ guides(colour = guide_legend(override.aes = list(size=2)))+ 
  scale_y_continuous(limits = c(1100,3750)) + scale_x_continuous( limits = c(2300,4800))+ geom_segment(aes(x = 2400, y = 1300, xend = 2784.6, yend = 1300), color = "black") + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank()))

p3