knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir="E:/DISC/reproducibility")
utilities_path = "./source/utilities.r"
source(utilities_path)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
reldist: Relative Distribution Methods
Version 1.6-6 created on 2016-10-07.
copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
For citation information, type citation("reldist").
Type help(package="reldist") to get started.
The fraction of sampling is set to 30% of the original library size across cells. Only 1 replicate will be generated here as an example. Only imputed genes will be kept for comparison.
raw_data = readh5_loom("./data/PBMC/raw.loom")
[1] "./data/PBMC/raw.loom"
[1] 32738 2638
cell_type = readRDS("./data/PBMC/cell_type.rds")
ds_dir = "./data/PBMC/ds_0.3/r2"
dir.create(ds_dir, showWarnings = F, recursive = T)
observed_path = paste0(ds_dir, "/observed.loom")
if(!file.exists(observed_path)){
observed_data = downsampling_cell(0.3, raw_data)
save_h5(observed_path, t(observed_data))
stop(paste0("Please run imputation using ", observed_path, " first."))
}else{
observed_data = readh5_loom(observed_path)
}
[1] "./data/PBMC/ds_0.3/r2/observed.loom"
[1] 32738 2638
Here, we use genes filtered by our gene selection for cell type identification evaluation.
gene_filter = gene_selection(observed_data, 10)
data_list = list(Raw = raw_data[gene_filter, ], Observed = observed_data[gene_filter, ])
rm(raw_data, observed_data)
We run imputation using downsampling data. Here, we load the imputation results, which can be found here.
data_list[["DISC"]] = readh5_loom(paste0(ds_dir, "/DISC.loom"))[gene_filter, ]
[1] "./data/PBMC/ds_0.3/r2/DISC.loom"
[1] 32738 2638
data_list[["scVI"]] = readh5_imputation(paste0(ds_dir, "/scVI.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/scVI.hdf5"
[1] 4911 2638
data_list[["MAGIC"]] = readh5_imputation(paste0(ds_dir, "/MAGIC.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/MAGIC.hdf5"
[1] 4911 2638
data_list[["DCA"]] = readh5_imputation(paste0(ds_dir, "/DCA.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/DCA.hdf5"
[1] 4911 2638
data_list[["scScope"]] = readh5_imputation(paste0(ds_dir, "/scScope.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/scScope.hdf5"
[1] 4911 2638
data_list[["DeepImpute"]] = readh5_imputation(paste0(ds_dir, "/DeepImpute.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/DeepImpute.hdf5"
[1] 4911 2638
data_list[["VIPER"]] = readh5_imputation(paste0(ds_dir, "/VIPER.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/VIPER.hdf5"
[1] 4911 2638
data_list[["scImpute"]] = readh5_imputation(paste0(ds_dir, "/scImpute.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/scImpute.hdf5"
[1] 4911 2638
for(ii in data_list){
print(dim(ii))
}
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
[1] 4911 2638
Settings
method_names = c("Raw", "Observed", "DISC", "scVI", "MAGIC", "DCA", "scScope", "DeepImpute", "VIPER", "scImpute")
method_color = c("black", "#A5A5A5", "#E83828", "#278BC4", "#EADE36", "#198B41", "#920783", "#F8B62D", "#8E5E32", "#1E2B68")
text_color = rep("black", length(method_names))
names(text_color) = method_names
text_color["DISC"] = "red"
cell_type_identification_result_list = list()
for(ii in names(data_list)){
print(ii)
cell_type_identification_result_list[[ii]] = seurat_classification(gene_bc_mat = data_list[[ii]], cell_type = cell_type, pca_dim = 10, res = 0.5, min_pct = 0.25, show_plots = T, cell_type_identification_fun = cell_type_identification_pbmc)
}
[1] "Raw"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
PC_ 1
Positive: MALAT1, RPS27A, RPL3, RPS3A, RPS6, LTB, CD3D, IL32, RPS18, RPS12
RPL13, CD3E, LDHB, CXCR4, RPL10A, RPS5, TPT1, IL7R, AES, CD7
BTG1, RPL19, GLTSCR2, CD2, CD69, CD27, ACAP1, RPS2, STK17A, MYL12A
Negative: CST3, TYROBP, LST1, AIF1, FTL, FCER1G, FTH1, FCN1, TYMP, LYZ
S100A9, CFD, LGALS1, CD68, SERPINA1, CTSS, IFITM3, SPI1, S100A8, LGALS2
PSAP, CFP, IFI30, SAT1, S100A11, COTL1, NPC2, GRN, LGALS3, AP1S2
PC_ 2
Positive: CD79A, MS4A1, HLA-DRA, TCL1A, HLA-DQA1, HLA-DQB1, LINC00926, VPREB3, CD79B, HLA-DRB1
CD74, HLA-DMA, HLA-DPB1, RPL13, HLA-DRB5, FCER2, HLA-DPA1, HLA-DQA2, HLA-DMB, CD37
BANK1, RPL8, LY86, RPS2, HVCN1, FCRLA, RPS18, RPS5, IRF8, EAF2
Negative: NKG7, CST7, PRF1, GZMA, GZMB, B2M, FGFBP2, CTSW, GNLY, GZMH
SPON2, CCL5, CCL4, CD247, FCGR3A, XCL2, CLIC3, AKR1C3, RARRES3, IL32
HOPX, PFN1, SRGN, TTC38, APMAP, CTSC, MYL12A, CD7, ANXA1, ACTB
PC_ 3
Positive: CD3D, LDHB, VIM, IL7R, CD3E, NOSIP, S100A6, FOS, IL32, S100A8
RPS12, S100A4, FYB, JUNB, GIMAP7, RGS10, TMSB4X, S100A10, MAL, AQP3
S100A9, CD2, RPL13, GIMAP4, TCF7, ANXA1, TPT1, LGALS2, CD14, RPS2
Negative: CD79A, CD79B, HLA-DQA1, MS4A1, HLA-DQB1, CD74, TCL1A, HLA-DPB1, HLA-DPA1, LINC00926
HLA-DRB1, HLA-DRA, VPREB3, HLA-DQA2, HLA-DRB5, BANK1, FCER2, HVCN1, HLA-DMA, FCRLA
PDLIM1, HLA-DMB, GZMB, FGFBP2, EAF2, PRF1, IRF8, BLNK, PLAC8, NKG7
PC_ 4
Positive: PPBP, PF4, SDPR, SPARC, GNG11, NRGN, CD9, HIST1H2AC, RGS18, CLU
TUBB1, CCL5, CA2, F13A1, PGRMC1, GZMB, MYL9, MMD, NKG7, CST7
FGFBP2, ACRBP, GZMA, CCL4, PRF1, S100A8, TSC22D1, RUFY1, GNLY, SPON2
Negative: RPS2, RPS12, RPS6, RPL19, RPS5, RPL8, GNB2L1, RPS18, RPL10A, NACA
LTB, JUNB, RPL13, RPL3, TPT1, GLTSCR2, RPS27A, LDHB, VIM, RPS3A
FOS, SRSF5, TMSB10, YBX1, CD48, BTG1, GDI2, CCNI, EIF3H, IL7R
PC_ 5
Positive: RPS2, NKG7, S100A8, FGFBP2, CST7, GNLY, RPL13, GZMA, GZMB, CCL4
PRF1, LGALS2, TMSB10, RPL19, SPON2, CTSW, CYBA, RPS6, CCL3, GZMH
S100A9, GSTP1, CD14, CEBPD, MS4A6A, TYROBP, XCL2, CLIC3, RPL3, IGFBP7
Negative: SDPR, PF4, PPBP, SPARC, GNG11, HIST1H2AC, TUBB1, RGS18, CLU, NRGN
CA2, CD9, MMD, ACRBP, MPP1, NGFRAP1, RUFY1, F13A1, PGRMC1, TSC22D1
MYL9, NCOA4, TPM4, RGS10, GRAP2, MAP3K7CL, NAP1L1, ODC1, PLA2G12A, TUBA4A
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 94018
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8759
Number of communities: 9
Elapsed time: 0 seconds
[1] "Observed"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 96003
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8550
Number of communities: 9
Elapsed time: 0 seconds
[1] "DISC"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 98366
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8506
Number of communities: 8
Elapsed time: 0 seconds
[1] "scVI"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 83439
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8427
Number of communities: 6
Elapsed time: 0 seconds
[1] "MAGIC"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 59623
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9215
Number of communities: 12
Elapsed time: 0 seconds
[1] "DCA"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 83096
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8438
Number of communities: 6
Elapsed time: 0 seconds
[1] "scScope"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 85316
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8087
Number of communities: 6
Elapsed time: 0 seconds
[1] "DeepImpute"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 92838
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8619
Number of communities: 9
Elapsed time: 0 seconds
[1] "VIPER"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 92438
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8735
Number of communities: 10
Elapsed time: 0 seconds
[1] "scImpute"
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 95519
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8614
Number of communities: 9
Elapsed time: 0 seconds
acc_result = rep(NA, length(cell_type_identification_result_list))
names(acc_result) = names(cell_type_identification_result_list)
ari_result = rep(NA, length(cell_type_identification_result_list))
names(ari_result) = names(cell_type_identification_result_list)
for(ii in names(cell_type_identification_result_list)){
acc_result[ii] = cell_type_identification_result_list[[ii]][["summary"]]["ACC"]
ari_result[ii] = cell_type_identification_result_list[[ii]][["summary"]]["ARI"]
}
barplot_usage(acc_result, main = "", cex.main = 1.5, bar_color = method_color, text_color = text_color, use_data_order = T, ylab = "ACC", cex.lab = 1.5, font.main = 1, ylim = c(-0.1, 1), decreasing = TRUE, use_border = F)
barplot_usage(ari_result, main = "", cex.main = 1.5, bar_color = method_color, text_color = text_color, use_data_order = T, ylab = "ARI", cex.lab = 1.5, font.main = 1, ylim = c(-0.1, 1), decreasing = TRUE, use_border = F)
unique_cell_type = unique(cell_type)
jaccard_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
f1_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
acc_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
for(ii in names(cell_type_identification_result_list)){
jaccard_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "Jaccard"]
f1_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "F1-score"]
acc_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "ACC"]
}
cell_type_heatmap(jaccard_mat, "Jaccard")
cell_type_heatmap(f1_mat, "F1-score")
cell_type_heatmap(acc_mat, "ACC")