CSIDE on the Slide-seq testes data

Pre-process testes data

library(spacexr)
library(Matrix)
library(devtools)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(dplyr)
library(ggrepel)
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/testes_helper.R')
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/merge_de_helper.R')
load_all()
datadir <- '../../../spacexr/data/SpatialRNA/Testes'
puck <- readRDS(file.path(datadir,'puck.rds'))
### END PRELUDE
myRCTD <- readRDS(file.path(datadir,'myRCTD_testes.rds'))
myRCTD@originalSpatialRNA <- puck
codes_1 <- c()
stadir <- file.path(datadir,'Stage/I-III')
id <- 0
cvx_df <- data.frame(x=numeric(),y=numeric(),stage=numeric(),id = numeric())
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_1 <- c(codes_1,tubule_codes)
  print(length(codes_1))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 1,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_2 <- c()
stadir <- file.path(datadir,'Stage/IV-VI')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_2 <- c(codes_2,tubule_codes)
  print(length(codes_2))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 2,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_3 <- c()
stadir <- file.path(datadir,'Stage/VII-VIII')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_3 <- c(codes_3,tubule_codes)
  print(length(codes_3))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 3,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_4 <- c()
stadir <- file.path(datadir,'Stage/IX-XII')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_4 <- c(codes_4,tubule_codes)
  print(length(codes_4))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 4,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

barcodes_1 <- intersect(codes_1,names(myRCTD@spatialRNA@nUMI))
barcodes_2 <- intersect(codes_2,names(myRCTD@spatialRNA@nUMI))
barcodes_3 <- intersect(codes_3,names(myRCTD@spatialRNA@nUMI))
barcodes_4 <- intersect(codes_4,names(myRCTD@spatialRNA@nUMI))
region_list <- list(barcodes_1, barcodes_2, barcodes_3, barcodes_4)
saveRDS(cvx_df, file.path(datadir, 'cvx_df.rds'))
saveRDS(region_list, file.path(datadir, 'region_list.rds'))
region_list <- readRDS(file.path(datadir, 'region_list.rds'))
cvx_df <- readRDS(file.path(datadir, 'cvx_df.rds'))
cvx_df$stage <- factor(cvx_df$stage)
cvx_df <- bind_rows(cvx_df, cvx_df[which(!duplicated(cvx_df$id)),])
all_barc <- intersect(Reduce(union,region_list), colnames(myRCTD@spatialRNA@counts))
aggregate_cell_types(myRCTD,all_barc,doublet_mode = F)
myRCTD@config$max_cores <- 4
cur_cell_types = c('1','2','4','5','6')
cell_types <- cur_cell_types

Plot regions and cell types

region_no <- rep(0, length(names(puck@nUMI)))
names(region_no) <- names(puck@nUMI)
for(i in 1:4)
  region_no[region_list[[i]]] <- i
p1 <- plot_class(puck, names(region_no), factor(region_no))+ ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + coord_fixed() + theme(legend.position="top")+ guides(colour = guide_legend(override.aes = list(size=2)))+ ggplot2::scale_color_manual("Stage",values = c('grey', '#E69F00','#CC79A7', '#EFCB00',"#000000"), breaks = c(0,1,2,3,4), labels = c('Not Classified','I-III','IV-VI', 'VII-VIII', 'IX-XII'))+ geom_segment(aes(x = 1300, y = 1000, xend = 1684.6, yend = 1000), color = "black")+  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())
my_barc <- rownames(myRCTD@results$results_df)
results_df <- myRCTD@results$results_df
puck <- myRCTD@spatialRNA
barcodes = rownames(results_df[results_df$spot_class != "reject" & puck@nUMI >= 100,])
my_table = puck@coords[barcodes,]
my_table$class = results_df[barcodes,]$first_type
n_levels = myRCTD@cell_type_info$info[[3]]
my_pal = pals::kelly(n_levels+1)[2:(n_levels+1)]
names(my_pal) = myRCTD@cell_type_info$info[[2]]
my_pal_curr <- my_pal
my_pal_curr[cell_types[4]] <- "#CC79A7"
my_pal_curr[cell_types[5]] <- "#E69F00"
my_pal_curr[cell_types[3]] <- "#D55E00"
my_pal_curr[cell_types[1]] <- "#009E73"
my_pal_curr[cell_types[2]] <- "#0072B2"
pres = unique(as.integer(my_table$class))
pres = pres[order(pres)]
p2 <- ggplot2::ggplot(my_table, ggplot2::aes(x=x, y=y)) + ggplot2::geom_point(ggplot2::aes(size = .1, shape=19,color=class)) + ggplot2::scale_color_manual(values = my_pal_curr[pres], breaks = cell_types, labels = c('ES','RS','SPC','SPG','Sertoli'))+ ggplot2::scale_shape_identity() + ggplot2::theme_classic() + ggplot2::scale_size_identity() + coord_fixed() + theme(legend.position="top")+ guides(colour = guide_legend(override.aes = list(size=2)))+  geom_segment(aes(x = 1300, y = 1000, xend = 1684.6, yend = 1000), color = "black")+labs(color='') +  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())
ggarrange(p1,p2,nrow = 2)

Run CSIDE

myRCTDde <- run.de.regions(myRCTD, region_list, datadir = datadir, cell_types = cur_cell_types,
                           doublet_mode = F)
myRCTDde@internal_vars_de$delta <- 0; myRCTDde@internal_vars_de$test_mode <- 'multi'
myRCTDde@reference <- gen_small_reference()
myRCTDde <- add_res_genes(myRCTDde, datadir = datadir, plot_genes = F)
saveRDS(myRCTDde, file.path(datadir,'myRCTDde_updated2.rds'))
make_all_de_plots(myRCTDde, datadir)
myRCTDde <- readRDS(file.path(datadir,'myRCTDde_updated2.rds'))
cell_type_mapping <- c('1'="ES", '2'="RS", '3'='Myoid', '4'="SPC", '5'="SPG",
  '6'="Sertoli", '7'="Leydig",'8'="Endothelial",'9'="Macrophage")
#extract relevant variables
X2 <- myRCTDde@internal_vars_de$X2
my_beta <- myRCTDde@internal_vars_de$my_beta
cell_types_present <- myRCTDde@internal_vars_de$cell_types_present
cur_cell_types <- myRCTDde@internal_vars_de$cell_types
all_barc <- myRCTDde@internal_vars_de$all_barc
n_regions <- dim(X2)[2]
gene_fits <- myRCTDde@de_results$gene_fits

Validate SPC DE Genes

res_genes <- myRCTDde@de_results$res_gene_list$`4`
get_best_pair <- function(gene) {
  Z_max <- 0
  imax <- 0
  jmax <- 0
  for(i in 1:3) {
    for(j in (i+1):4) {
      se <- sqrt(gene_fits$I_mat[gene, i+8]^2 + gene_fits$I_mat[gene, j+8]^2)
      Z <- abs(res_genes[gene,i + 4] - res_genes[gene,j+4]) / se
      if(Z > Z_max) {
        imax <- i
        jmax <- j
        Z_max <- Z
      }
    }
  }
  return(c(imax,jmax,Z_max))
}
gl_type12 <- intersect(get_gene_list_type_wrapper(myRCTDde, '1', cell_types_present),
  get_gene_list_type_wrapper(myRCTDde, '2', cell_types_present))
res_genes <- myRCTDde@de_results$res_gene_list$`4`
int_genes <- setdiff(intersect(rownames(res_genes), gl_type12),
        rownames(myRCTDde@de_results$res_gene_list$`1`))
int_genes <- setdiff(int_genes, rownames(myRCTDde@de_results$res_gene_list$`2`))
cur_cell_types <- c('1','2', '4')
ct_thresh <- 0.9
cur_ress <- matrix(0, nrow = length(int_genes), ncol = 9)
rownames(cur_ress) <- int_genes
for(gene in int_genes) {
  bp <- get_best_pair(gene)
  se <- gene_fits$I_mat[gene, c(bp[1], bp[2], bp[1] + 4, bp[2] + 4, bp[1] + 8, bp[2]+ 8)]^2
  se <- sqrt(c(se[1] + se[2], se[3] + se[4], se[5]+se[6]))
  cur_ress[gene, ] <- c(bp,(myRCTDde@de_results$gene_fits$all_vals[gene,bp[1],1:3] -
          myRCTDde@de_results$gene_fits$all_vals[gene,bp[2],1:3]), se)
}
is_good <- cur_ress[,6] > 2*pmax(abs(cur_ress[,4]),abs(cur_ress[,5]))
int_genes <- int_genes[is_good]
int_genes <- c('Prss40','Snx3') 
detect_thresh <- .25
results_df <- data.frame(gene = character(), one_pres = logical(), two_pres = logical(), four_pres = logical(), region = integer(),
                         N = integer(), Y = numeric(), pred = numeric(), se = numeric())
for(gene in int_genes) {
  p_df <- get_quant_df(myRCTDde, myRCTDde@de_results$gene_fits, myRCTDde@internal_vars_de$cell_types, 
                       cur_cell_types, gene, multi_region = T, prop_thresh = ct_thresh)
  for(op in c(F,T))
    for(fp in c(F,T)) {
      for(region in c(cur_ress[gene,1],cur_ress[gene,2])) {
        tp <- op
        cur_barcs <- rownames(p_df)
        if(op) {
          cur_barcs <- cur_barcs[which(p_df[cur_barcs,'X1'] + p_df[cur_barcs,'X2'] > detect_thresh)] 
        } else {
          cur_barcs <- cur_barcs[which(p_df[cur_barcs,'X1'] + p_df[cur_barcs,'X2'] < detect_thresh)] 
        }
        if(fp) {
          cur_barcs <- cur_barcs[which(p_df[cur_barcs,'X4'] > detect_thresh)] 
        } else {
          cur_barcs <- cur_barcs[which(p_df[cur_barcs,'X4'] < detect_thresh)] 
        }
        cur_barcs <- cur_barcs[which(p_df[cur_barcs,'region'] == region)] 
        n <- length(cur_barcs)
        Y <- mean(p_df[cur_barcs, 'Y'])
        pred <- mean(p_df[cur_barcs, 'pred'])
        se <- sqrt(mean(p_df[cur_barcs,'var'])/n)
        de<-data.frame(gene, op, tp, fp, region, n, Y, pred, se)
        names(de)<-c('gene',"one_pres","two_pres", 'four_pres','region','N','Y','pred', 'se')
        if(op | fp)
          results_df <- bind_rows(results_df, de)
      }
    }
}
results_df$region <- factor(results_df$region)
results_df$category <- factor(as.integer(results_df$four_pres)*2 + as.integer(results_df$one_pres), levels = c(1,3,2))
results_df$category <- plyr::mapvalues(results_df$category,
                                 from = c(1,2,3),
                                 to = c('S+, SPC-', 'S-, SPC+', 'S+, SPC+'))

p<- ggplot(results_df, aes(x = category, y = log(pmax(500*Y, 500*10^(-4.5)),2), color = region)) + geom_point(position = position_dodge(0.2)) + geom_point(aes(group = region, y = log(500*pred,2)),position = position_dodge(0.2), shape = 5)+ facet_wrap(gene ~.) + 
  geom_errorbar(aes(ymin = log(pmax(500*(Y - 1.96*se),500*10^(-4.5)),2), ymax = log(500*(Y + 1.96*se),2)), width = 0.2, position = position_dodge(0.2)) + 
  theme_classic() + ylab('Log average expression') + xlab('Cell types present') + scale_color_manual("Stage", breaks = c(1,4), labels = c('I-III','IX-XII'), values = c('#E69F00',"#000000"))
p_df <- cbind(results_df, 'Raw data')
p_df_2 <- cbind(results_df, 'CSIDE model')
p_df_2$Y <- p_df_2$pred
colnames(p_df_2)[11] <- 'model'
colnames(p_df)[11] <- 'model'
plot_df <- rbind(p_df, p_df_2)
p<- ggplot(plot_df, aes(x = category, y = log(pmax(500*Y, 500*10^(-4.5)),2), color = region, shape = model, group = region)) + geom_point(position = position_dodge(0.2)) + facet_wrap(gene ~.) + 
  geom_errorbar(data = plot_df[plot_df$model == 'Raw data',], aes(ymin = log(pmax(500*(Y - 1.96*se),500*10^(-4.5)),2), ymax = log(500*(Y + 1.96*se),2)), width = 0.2, position = position_dodge(0.2)) + 
  theme_classic() + ylab('Log average expression') + xlab('Cell types present') + scale_color_manual("Stage", breaks = c(1,4), labels = c('I-III','IX-XII'), values = c('#E69F00',"#000000")) + scale_shape_discrete("")
p

Detect cell type and stage specific marker genes

all_ct <- c('1','2','4')
info_1 <- get_marker_by_region(all_ct, '1', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_1, file.path(datadir, 'info_1.rds'))
info_1 <- readRDS(file.path(datadir, 'info_1.rds'))
info_2 <- get_marker_by_region(all_ct, '2', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_2, file.path(datadir, 'info_2.rds'))
info_2 <- readRDS(file.path(datadir, 'info_2.rds'))
info_4 <- get_marker_by_region(all_ct, '4', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_4, file.path(datadir, 'info_4.rds'))
info_4 <- readRDS(file.path(datadir, 'info_4.rds'))
table(info_1$first_region)
table(info_2$first_region)
table(info_4$first_region)
info_1[order(info_1$first_region),]
infodf <- info_1[info_1$first_region == 4,]
infodf[order(-infodf$diff),]

Plot marker genes

final_gene_list <- c('Ms4a5', 'Iqcf4', 'Hdac11', 'Tnp1','Prr22', 'Tesk2','Acbd3', 'Tmx4', 
                   'Aurka', 'Syt11', 'Piwil1', 'Smg9')
table(gene_fits$con_all[final_gene_list,1:12])
fit_df <- cbind(gene_fits$all_vals[final_gene_list,,1], gene_fits$all_vals[final_gene_list,,2], gene_fits$all_vals[final_gene_list,,3])
fit_df <- exp(fit_df)
norm_df <- sweep(fit_df, 1, apply(fit_df,1,max),'/')
plot_df <- reshape2::melt(norm_df)
colnames(plot_df) <- c('gene', 'region_id', 'expr')
plot_df$region <- ((plot_df$region_id - 1) %% n_regions)+ 1
plot_df$cell_type <- cur_cell_types[floor((plot_df$region_id - 1) / n_regions) + 1]
cur_range = c(0,1)
p <- ggplot(plot_df, aes(region_id, gene, fill = expr)) + geom_tile() +
  scale_fill_gradientn(colors = pals::brewer.blues(20)[2:20],name = "Normalized estimated expression", labels = c(0,1),breaks = c(0.001,1)) + theme_classic() + ylab('Gene')+ ggplot2::scale_size_identity() + coord_fixed()
p