knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir="E:/DISC/reproducibility")
utilities_path = "./source/utilities.r"
source(utilities_path)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
reldist: Relative Distribution Methods
Version 1.6-6 created on 2016-10-07.
copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
For citation information, type citation("reldist").
Type help(package="reldist") to get started.
DISC outputs both values and compressed features after imputation. Here, we demostrate how to DISC compressed features for Seurat clustering using PBMC data. Genes filtered by our gene selection are used for cell type identification evaluation.
ds_dir = "./data/PBMC/ds_0.3/r2"
observed_path = paste0(ds_dir, "/observed.loom")
observed_data = readh5_loom(observed_path)
[1] "./data/PBMC/ds_0.3/r2/observed.loom"
[1] 32738 2638
gene_filter = gene_selection(observed_data, 10)
observed_filt = observed_data[gene_filter, ]
DISC_values = readh5_loom(paste0(ds_dir, "/DISC.loom"))[gene_filter, ]
[1] "./data/PBMC/ds_0.3/r2/DISC.loom"
[1] 32738 2638
DISC_features = readh5_feature(paste0(ds_dir, "/DISC_feature.hdf5"))
[1] "./data/PBMC/ds_0.3/r2/DISC_feature.hdf5"
[1] 50 2638
cell_type = readRDS("./data/PBMC/cell_type.rds")
cell_type_identification_result_list = list()
cell_type_identification_result_list[["Observed"]] = seurat_classification(gene_bc_mat = observed_filt, cell_type = cell_type, pca_dim = 10, res = 0.5, min_pct = 0.25, show = T, cell_type_identification_fun = cell_type_identification_pbmc)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
PC_ 1
Positive: MALAT1, RPS27A, LTB, RPS3A, RPL3, RPL13A, IL32, RPS6, RPSA, CD3D
RPL13, RPS12, RPS18, CD3E, LDHB, CXCR4, RPS5, IL7R, RPL10A, CD7
B2M, CD2, CD69, TPT1, AES, RPL19, BTG1, JUN, GLTSCR2, CCL5
Negative: CST3, TYROBP, LYZ, AIF1, LST1, S100A9, FCER1G, LGALS1, FCN1, TYMP
FTL, COTL1, CTSS, FTH1, S100A8, CFD, IFITM3, SAT1, PSAP, S100A11
SERPINA1, LGALS2, S100A6, CD68, SPI1, S100A4, GSTP1, NPC2, CFP, GPX1
PC_ 2
Positive: HLA-DRA, CD79A, CD74, HLA-DQA1, CD79B, TCL1A, RPL13, HLA-DQB1, HLA-DRB1, MS4A1
HLA-DPB1, RPS2, LINC00926, RPL13A, HLA-DPA1, RPS18, HLA-DRB5, HLA-DMA, RPS5, CD37
VPREB3, HLA-DQA2, LTB, RPL8, S100A8, RPS12, FCER2, HLA-DMB, S100A9, NCF1
Negative: NKG7, GZMB, PRF1, GNLY, FGFBP2, GZMA, CST7, CCL5, CTSW, GZMH
B2M, SPON2, CCL4, FCGR3A, CD247, AKR1C3, CLIC3, HOPX, TTC38, ACTB
APMAP, RARRES3, PFN1, IGFBP7, XCL2, SRGN, IL32, GPR56, CD7, APOBEC3G
PC_ 3
Positive: RPS12, JUNB, RPSA, RPS6, RPS2, RPS18, RPL19, RPS27A, RPL10A, RPS5
LTB, RPS3A, RPL13A, RPL3, RPL13, LDHB, TPT1, GNB2L1, RPL8, FOS
NACA, RPL34, VIM, CD3D, IL7R, GLTSCR2, DUSP1, HINT1, BTG1, NOSIP
Negative: S100A8, PPBP, PF4, GZMB, NKG7, S100A9, GNG11, FGFBP2, GNLY, CD9
SDPR, CCL5, CLU, HIST1H2AC, CCL4, SPON2, CST7, PRF1, SPARC, CCL3
NRGN, RGS18, TYROBP, GZMH, GPX1, GZMA, TSC22D1, TUBB1, CA2, CD14
PC_ 4
Positive: S100A8, S100A9, TMSB4X, CD3D, S100A4, IL7R, VIM, LDHB, S100A6, IL32
JUNB, CD3E, LYZ, NOSIP, LGALS2, CD14, FCN1, FOS, FYB, CD2
GIMAP7, IL8, S100A10, RGS10, S100A12, MAL, AQP3, RBP7, FOLR3, GAPDH
Negative: CD79A, HLA-DQA1, CD79B, HLA-DRA, CD74, HLA-DPB1, HLA-DQB1, HLA-DRB1, HLA-DPA1, MS4A1
TCL1A, LINC00926, HLA-DQA2, HLA-DRB5, VPREB3, HLA-DMA, CD37, FCER2, HVCN1, GZMB
HLA-DMB, IGLL5, EAF2, FCRLA, PLAC8, BLNK, FCGR3A, PRF1, SMIM14, NKG7
PC_ 5
Positive: S100A8, RPS2, RPL13, NKG7, S100A9, GNLY, LGALS2, FGFBP2, LYZ, PRF1
RPL19, MALAT1, CD14, CCL3, GZMB, CCL4, TYROBP, CST7, MS4A6A, GZMA
RPL13A, TMSB10, GSTP1, CTSW, CEBPD, RPS6, GZMH, CYBA, IGFBP7, SPON2
Negative: PPBP, PF4, GNG11, SDPR, CLU, HIST1H2AC, CD9, RGS18, SPARC, CA2
TUBB1, NRGN, TSC22D1, RUFY1, PGRMC1, NCOA4, ACRBP, TUBA4A, ODC1, MPP1
NGFRAP1, MMD, GAS2L1, MFSD1, NAP1L1, MYL9, RGS10, GRAP2, ETFA, MAP3K7CL
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 96003
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8550
Number of communities: 9
Elapsed time: 0 seconds
cell_type_identification_result_list[["DISC"]] = seurat_classification(gene_bc_mat = DISC_values, cell_type = cell_type, pca_dim = 10, res = 0.5, min_pct = 0.25, show = T, cell_type_identification_fun = cell_type_identification_pbmc)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
PC_ 1
Positive: CST3, TYROBP, FTL, LYZ, LST1, FTH1, AIF1, FCER1G, S100A9, FCN1
LGALS1, TYMP, CFD, S100A8, IFITM3, SERPINA1, LGALS2, CD68, CTSS, SPI1
CFP, COTL1, SAT1, PSAP, S100A11, S100A6, IFI30, S100A4, GRN, LGALS3
Negative: MALAT1, RPS27A, RPS3A, LTB, RPL3, CD3D, IL32, RPSA, RPS6, RPS12
CD3E, LDHB, CXCR4, IL7R, RPS5, RPL10A, CD7, CD2, CD27, TPT1
CD69, AES, AQP3, ACAP1, RPL19, BTG1, GLTSCR2, RPL18, RPL34, NOSIP
PC_ 2
Positive: NKG7, GZMB, PRF1, FGFBP2, GZMA, GNLY, CST7, GZMH, CCL5, CTSW
B2M, SPON2, CCL4, CLIC3, CD247, AKR1C3, FCGR3A, TTC38, HOPX, S1PR5
XCL2, HLA-A, IGFBP7, APMAP, IL32, SRGN, RARRES3, APOBEC3G, ACTB, PLEKHF1
Negative: CD79A, HLA-DRA, HLA-DQA1, MS4A1, CD74, HLA-DQB1, LINC00926, TCL1A, HLA-DRB1, CD79B
RPS2, HLA-DQA2, HLA-DPB1, VPREB3, RPS5, HLA-DMA, HLA-DPA1, CD37, HLA-DRB5, HLA-DMB
RPL19, RPS9, PPAPDC1B, LTB, GNB2L1, RPS12, RPL8, HVCN1, LY86, RPL10A
PC_ 3
Positive: IL7R, VIM, LDHB, CD3D, JUNB, NOSIP, RPS12, TMSB4X, RGS10, S100A6
IL32, FYB, S100A4, MAL, CD3E, AQP3, GIMAP7, CD2, S100A10, FOS
S100A8, TPT1, RPS2, S100A9, CD27, RPS6, S100A11, GIMAP1, GIMAP4, LGALS2
Negative: CD79A, CD79B, HLA-DQA1, CD74, HLA-DQB1, HLA-DRA, TCL1A, HLA-DPB1, MS4A1, HLA-DRB1
LINC00926, HLA-DPA1, VPREB3, HLA-DQA2, HLA-DRB5, GZMB, BANK1, HLA-DMA, HVCN1, TSPAN13
FCER2, FGFBP2, NKG7, HLA-DMB, GNLY, IGLL5, PRF1, GZMH, CST7, CCL4
PC_ 4
Positive: S100A8, S100A9, CD14, LGALS2, MS4A6A, CCL3, GPX1, FOLR3, IL8, LYZ
S100A12, CEBPD, ID1, GSTP1, BLVRB, RNASE6, CSF3R, RBP7, ASGR1, GRN
CD99, GAPDH, ALDH2, IGFBP7, VCAN, CCL4, IER3, QPCT, PLBD1, GNLY
Negative: FCGR3A, HES4, RP11-290F20.3, RPS19, MS4A7, IFITM2, CKB, RHOC, SIGLEC10, CTSL
HMOX1, CSF1R, LILRA3, PILRA, CDKN1C, MS4A4A, ADA, NAP1L1, CTD-2006K23.1, LRRC25
IFITM3, CYTIP, OAS1, WARS, B2M, CEBPB, FAM110A, ABI3, SPN, DRAP1
PC_ 5
Positive: ATP5G2, ID2, GNLY, RPS2, MALAT1, NKG7, HOPX, RPL19, CYBA, IFI35
CTSW, SLC25A11, PSME1, RPL3, XBP1, UBE2L6, RPS6, TMSB10, CHMP4A, VAMP8
NCR3, PRF1, MX1, GZMA, DBI, NKIRAS2, RPS19, MT-CO1, ANXA1, C9orf142
Negative: PPBP, PF4, GNG11, SDPR, HIST1H2AC, CD9, CLU, CA2, TUBB1, NRGN
TSC22D1, PGRMC1, ACRBP, RUFY1, NCOA4, TPM4, ODC1, TUBA4A, MMD, TUBA1C
GSN, NGFRAP1, MPP1, MFSD1, GAS2L1, PACSIN2, MAP3K7CL, GPX1, FERMT3, GRAP2
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 98366
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8506
Number of communities: 8
Elapsed time: 1 seconds
cell_type_identification_result_list[["DISC_feature"]] = seurat_classification(gene_bc_mat = DISC_values, feature_bc_mat = DISC_features, cell_type = cell_type, pca_dim = 10, res = 0.5, min_pct = 0.25, show = T, cell_type_identification_fun = cell_type_identification_pbmc)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:23:23 UMAP embedding parameters a = 0.9922 b = 1.112
16:23:23 Read 2638 rows and found 50 numeric columns
16:23:23 Using Annoy for neighbor search, n_neighbors = 30
16:23:23 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:23:25 Writing NN index file to temp file C:\Users\yh\AppData\Local\Temp\RtmpQflKbQ\file2f7c256a11a3
16:23:25 Searching Annoy index using 1 thread, search_k = 3000
16:23:26 Annoy recall = 100%
16:23:27 Commencing smooth kNN distance calibration using 1 thread
16:23:28 Initializing from normalized Laplacian + noise
16:23:28 Commencing optimization for 500 epochs, with 95304 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:23:38 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 48769
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8006
Number of communities: 6
Elapsed time: 0 seconds
acc_result = rep(NA, length(cell_type_identification_result_list))
names(acc_result) = names(cell_type_identification_result_list)
ari_result = rep(NA, length(cell_type_identification_result_list))
names(ari_result) = names(cell_type_identification_result_list)
for(ii in names(cell_type_identification_result_list)){
acc_result[ii] = cell_type_identification_result_list[[ii]][["summary"]]["ACC"]
ari_result[ii] = cell_type_identification_result_list[[ii]][["summary"]]["ARI"]
}
method_names = names(cell_type_identification_result_list)
text_color = rep("black", length(method_names))
names(text_color) = method_names
text_color["DISC_feature"] = "red"
bar_color = rep("gray50", length(method_names))
names(bar_color) = method_names
bar_color["Observed"] = "white"
bar_color["DISC_feature"] = "red"
barplot_usage(acc_result, main = "", cex.main = 1.5, bar_color = bar_color, text_color = text_color, use_data_order = T, ylab = "ACC", cex.lab = 1.5, font.main = 1, ylim = c(-0.1, 1), decreasing = TRUE)
barplot_usage(ari_result, main = "", cex.main = 1.5, bar_color = bar_color, text_color = text_color, use_data_order = T, ylab = "ARI", cex.lab = 1.5, font.main = 1, ylim = c(-0.1, 1), decreasing = TRUE)
unique_cell_type = unique(cell_type)
jaccard_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
f1_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
acc_mat = matrix(ncol = length(unique_cell_type), nrow = length(cell_type_identification_result_list), dimnames = list(names(cell_type_identification_result_list), unique_cell_type))
for(ii in names(cell_type_identification_result_list)){
jaccard_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "Jaccard"]
f1_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "F1-score"]
acc_mat[ii, ] = cell_type_identification_result_list[[ii]][["cell_type_result"]][, "ACC"]
}
cell_type_heatmap(jaccard_mat, "Jaccard")
cell_type_heatmap(f1_mat, "F1-score")
cell_type_heatmap(acc_mat, "ACC")