This package is a single cell Cluster-based auto-Annotation Toolkit for Cellular Heterogeneity (scCATCH). It covers the workflow from the identification of potential marker genes for each cluster to cluster annotation with an evidence-based score, obtained by matching the potential marker genes with known cell markers in a tissue-specific cell taxonomy reference database (CellMatch).
scCATCH mainly includes two functions, `findmarkergene()` and `findcelltype()`, to realize the automatic annotation of each identified cluster.
scCATCH can be used to annotate scRNA-seq data from tissue with cancer and without cancer.
General usage
[1] For scRNA-seq data, we suggest revising the gene symbols with `rev_gene()`. `geneinfo` is the system data.frame containing the information of human and mouse genes from NCBI Gene (updated on June 19, 2022). To use your own `geneinfo` data.frame, please refer to `demo_geneinfo()` to build a new one, e.g., for rat, zebrafish, Drosophila, C. elegans, etc.
library(scCATCH)
load(paste0(system.file(package = "scCATCH"), "/extdata/mouse_kidney_203.rda"))
# demo_geneinfo
demo_geneinfo()
#> symbol synonyms species
#> 1 A1BG A1B Human
#> 2 A1BG ABG Human
#> 3 A2MP1 A2MP Human
#> 4 Aco1 Aco Mouse
# revise gene symbols
mouse_kidney_203 <- rev_gene(data = mouse_kidney_203, data_type = "data", species = "Mouse", geneinfo = geneinfo)
#> Loading required package: Matrix
[2] Create an scCATCH object with `createscCATCH()`. Users need to provide the normalized data and the cluster for each cell.
obj <- createscCATCH(data = mouse_kidney_203, cluster = mouse_kidney_203_cluster)
[3] Find highly expressed genes with `findmarkergene()`. Users need to provide the species, tissue, or cancer information. `cellmatch` is the system data.frame containing the known markers of human and mouse. To use your own marker data.frame, please refer to `demo_marker()` to build a new one, e.g., for rat, zebrafish, Drosophila, C. elegans, etc.
# demo_marker
demo_marker()
#> species tissue cancer condition subtype1 subtype2
#> 1 Human Liver Normal Normal cell NA CD4+
#> 2 Human Liver Normal Normal cell NA CD8+
#> 3 Human Liver Hepatocellular Cancer Cancer cell NA NA
#> 4 Human Liver Hepatocellular Cancer Cancer cell Regulatory NA
#> subtype3 celltype gene resource pmid
#> 1 NA T Cell CD4 Experiment 27781378
#> 2 NA T Cell CD8A Experiment 27781378
#> 3 Exhausted T Cell ABCG1 Single-cell sequencing 28622514
#> 4 NA T Cell ACP5 Single-cell sequencing 28622514
# find highly expressed genes
obj <- findmarkergene(object = obj, species = "Mouse", marker = cellmatch, tissue = "Kidney")
#> There are 1764 potential marker genes in CellMatch database for Mouse on Kidney.
[4] Evidence-based score and annotation for each cluster with `findcelltype()`
obj <- findcelltype(object = obj)
# Results are stored in obj@celltype
obj@celltype
#> cluster
#> 1 1
#> 2 2
#> 3 3
#> cluster_marker
#> 1 Sdhd, Ckmt1, Rabac1, Clcn5, Kit, Clcnkb, Atp6v1b1, Pgrmc1, Atp6v0e, Eps8, Cpsf4l, Atp6ap1, Atp6v1e1, Epb41l2, Avpr1a, Uqcr11, Mdh1, Atp6v1c2, Slc26a4, Atp6v1d, Uqcrb, Hexb, Emb, Prkcd, Aco2, Alcam, Pros1, Guca2a, Atp6v0c, Slc4a9, Efhd1, Pam, Rgs2, Atp6v1g3, Mkks, Jag1, Edn3, Car2, Mme, Atp6v0d2, Ociad2, Stap1, Pcolce, Tmem213, Aldh1l1, Itpr2, Ctsc, Atp6ap2, Slc25a4, Plet1, Cox7a2, Car12, Gng11, Mif, Atp6v0b, Atp6v1h, Gadd45a, Etl4, Cox6b1, Fam13a, Slc43a2, Atp6v0a4, Rcan2, Ncoa7, S100g, Serpinb6b, Aqp6, Hepacam2, Oxgr1, Serpinb9, Wnk1, Foxi1, Chchd10, Tdrp, Spink8, Atp6v1a, 9130008F23Rik, Adgrf5, Pde1a, Kng2, Tmem117, Tmem61, Rhbg, Mindy1
#> 2 Cav2, Scnn1g, Calm1, Gstt1, Aif1l, Fxyd4, Wdr1, Crip2, Crip1, Cav1, Ctsd, Pea15a, Stc1, Gata3, Dstn, Pls3, Sdc4, Serinc3, Wfdc2, Nudt4, Sptbn1, Btg2, Pgam2, Cdkl1, Fos, Clu, Fstl1, Aqp2, Krt7, Krt18, Ivns1abp, Myl12a, Anxa1, Ptp4a1, Atp1b1, Olfm1, Cst3, S100a11, Tbck, Calb1, Aqp3, Hacd4, Pdzk1ip1, Tmem52b, Arhgdib, Iqgap1, Scnn1b, Avpr2, Sorbs2, Adgrg1, Cdh16, Hsd11b2, Aplp2, Tpm1, Cdo1, Tspan8, R3hdm4, Btg1, Mcoln3, Kcne1, Arglu1, Npnt, Kcnj1, Tmem45b, S100a1, Cystm1, Cd24a, Cldn4, Tmem229a, Tmsb4x, Lgals3, Tacstd2, Junb, Phactr1, Blnk, Pcnp, Akr1c19, Adh1, S100a16, Apela, Btc, Neat1, Cavin1
#> 3 Slc22a18, Gm2a, Slc7a7, Folr1, Rhoc, Apoe, Dnase1, Htra1, Cyba, Neu1, Slc47a1, Pipox, Igfbp4, Mapt, Sparc, Cxcl16, Timp3, Tcn2, Acox1, Akr1c21, Slc17a1, Slc34a1, Slc6a19, Ephx2, Cyp2d26, Miox, Cela1, Fhl1, Slc22a1, Mep1a, Lims2, Cndp2, Slc22a6, Fth1, Keg1, Acy3, Cyp2e1, Pecr, Dbi, Lrp2, Cat, Sord, Slc27a2, Pck1, Hao2, Hmgcs2, Bdh2, Ddah1, Ccdc107, Scp2, Akr1a1, Cda, Errfi1, Prom1, Khk, Tmem176b, Nox4, Fah, Acsm2, Msn, Ces1f, 4931406C07Rik, Slc37a4, Calml4, Pth1r, Guca2b, Xylb, Slc17a3, Pdzk1, Degs1, Ttc36, Lap3, Id1, Defb29, Tmem174, Ugt3a2, Cyp2j5, Slc22a30, Them7, Fut9, Tpt1, Slc4a4, Slc22a12, Ugt2b38, Kcnj15, Slc22a28, Glyat, Gpx1, Lgals1, Fbp1, Gm10804, Ass1, G6pc, Rida, Mettl26, Nat8f1, Selenop
#> cell_type celltype_score
#> 1 Collecting duct intercalated cell 0.86
#> 2 Collecting duct principal cell 0.86
#> 3 Proximal Proximal tubule cell 0.70
#> celltype_related_marker
#> 1 9130008F23Rik, Atp6v0d2, Atp6v1g3, Car2, Chchd10, Fam13a, Mif, Mme, Oxgr1, Slc25a4, Tmem61, Uqcrb, Aco2, Aldh1l1, Aqp6, Atp6ap1, Atp6ap2, Atp6v0a4, Atp6v0b, Atp6v0c, Atp6v0e, Atp6v1a, Atp6v1b1, Atp6v1c2, Atp6v1d, Atp6v1e1, Atp6v1h, Avpr1a, Car12, Ckmt1, Clcn5, Clcnkb, Cox6b1, Cox7a2, Cpsf4l, Efhd1, Emb, Eps8, Foxi1, Guca2a, Hepacam2, Hexb, Itpr2, Kit, Mdh1, Mkks, Ncoa7, Ociad2, Pam, Pde1a, Pgrmc1, Rcan2, Rgs2, S100g, Sdhd, Serpinb6b, Serpinb9, Slc26a4, Slc4a9, Tdrp, Tmem117, Tmem213, Uqcr11, Mindy1, Adgrf5
#> 2 Adgrg1, Apela, Aqp2, Aqp3, Avpr2, Cav1, Cav2, Cdh16, Cdo1, Cldn4, Fxyd4, Gstt1, Hsd11b2, Ivns1abp, Kcne1, Kcnj1, Npnt, Nudt4, Olfm1, Pdzk1ip1, Phactr1, Scnn1b, Scnn1g, Stc1, Tacstd2, Tbck, Tmem229a, Tmem45b, Tspan8, Wfdc2, Adh1, Aif1l, Akr1c19, Arhgdib, Atp1b1, Btc, Cdkl1, Ctsd, Cystm1, Fstl1, Krt18, Krt7, Lgals3, Neat1, Cd24a
#> 3 4931406C07Rik, Acox1, Acsm2, Acy3, Akr1a1, Akr1c21, Ass1, Bdh2, Calml4, Cat, Ccdc107, Cda, Cela1, Ces1f, Cndp2, Cyp2d26, Cyp2e1, Cyp2j5, Defb29, Dnase1, Ephx2, Errfi1, Fah, Fbp1, Folr1, Fth1, Fut9, G6pc, Glyat, Gm10804, Gpx1, Guca2b, Hao2, Kcnj15, Keg1, Khk, Lap3, Lrp2, Mep1a, Miox, Neu1, Nox4, Pck1, Pdzk1, Pecr, Pipox, Scp2, Slc17a1, Slc17a3, Slc22a1, Slc22a12, Slc22a18, Slc22a28, Slc22a30, Slc22a6, Slc27a2, Slc34a1, Slc37a4, Slc47a1, Slc4a4, Slc6a19, Slc7a7, Sord, Tcn2, Them7, Tmem174, Ttc36, Ugt2b38, Ugt3a2, Xylb, Mettl26, Nat8f1, Rida
#> PMID
#> 1 29622724, 28851704, 29775597
#> 2 29622724, 28851704, 29775597
#> 3 29622724
Note: There are two methods to find marker genes. Set `use_method` to `"1"` to compare each cluster with every other cluster, or to `"2"` to compare each cluster with all other clusters together, like the strategy in Seurat. Besides, when `use_method` is `"1"`, users can set `comp_cluster`, which represents the number of clusters to compare with. The default is to compare each cluster with all other clusters. Set it between 1 and the number of unique clusters. More marker genes will be obtained for a smaller `comp_cluster`.
# The strictest condition to identify marker genes
obj <- findmarkergene(object = obj, species = "Mouse", marker = cellmatch, tissue = "Kidney", use_method = "1")
#> There are 1764 potential marker genes in CellMatch database for Mouse on Kidney.
# The loosest condition to identify marker genes
obj <- findmarkergene(object = obj, species = "Mouse", marker = cellmatch, tissue = "Kidney", use_method = "2")
#> There are 1764 potential marker genes in CellMatch database for Mouse on Kidney.
# Other conditions to identify marker genes
obj <- findmarkergene(object = obj, species = "Mouse", marker = cellmatch, tissue = "Kidney", use_method = "1", comp_cluster = 1)
#> There are 1764 potential marker genes in CellMatch database for Mouse on Kidney.
Moreover, users can adjust `cell_min_pct`, `logfc`, and `pvalue` to identify different marker genes.
Custom usage
Users are allowed to use a custom `cellmatch` for cell type prediction when [1] users want to select a different combination of tissues or cancers for annotation; [2] users want to add more marker genes to `cellmatch` for annotation; [3] users want to use markers from species other than human and mouse. In these cases, please set `if_use_custom_marker = TRUE` in the `findmarkergene()` function; `species`, `tissue`, and `cancer` do not need to be set.
[1] Different combination of tissues or cancers
# Example
cellmatch_new <- cellmatch[cellmatch$species == "Mouse" & cellmatch$tissue %in% c("Kidney", "Liver", "Lung", "Brain"), ]
obj <- findmarkergene(object = obj, if_use_custom_marker = TRUE, marker = cellmatch_new)
obj <- findcelltype(obj)
# Example
cellmatch_new <- cellmatch[cellmatch$species == "Mouse" & cellmatch$cancer %in% c("Lung Cancer", "Lymph node", "Renal Cell Carcinoma", "Prostate Cancer"), ]
obj <- findmarkergene(object = obj, if_use_custom_marker = TRUE, marker = cellmatch_new)
obj <- findcelltype(obj)
# Example
cellmatch_new <- cellmatch[cellmatch$species == "Mouse", ]
cellmatch_new <- cellmatch_new[cellmatch_new$cancer %in% c("Lung Cancer", "Lymph node", "Renal Cell Carcinoma", "Prostate Cancer") | cellmatch_new$tissue %in% c("Kidney", "Liver", "Lung", "Brain"), ]
obj <- findmarkergene(object = obj, if_use_custom_marker = TRUE, marker = cellmatch_new)
obj <- findcelltype(obj)
[2] Add more marker genes to `cellmatch` for annotation
# Example
# cellmatch_new is provided by users
# cellmatch_new <- rbind(cellmatch, cellmatch_new)
# Then use the new cellmatch
# a. define the species, tissue, and cancer
obj <- findmarkergene(object = obj, species = "Mouse", marker = cellmatch_new, tissue = "Kidney")
#> There are 1764 potential marker genes in CellMatch database for Mouse on Kidney.
obj <- findcelltype(obj)
# b. directly use custom cellmatch
obj <- findmarkergene(object = obj, if_use_custom_marker = TRUE, marker = cellmatch_new)
obj <- findcelltype(obj)
[3] Use markers from different species
# Please refer to demo_marker to build a marker data.frame (cellmatch_new) for another species, e.g., rat
# Then use the new marker
obj <- findmarkergene(object = obj, species = "Rat", if_use_custom_marker = TRUE, marker = cellmatch_new, tissue = "Kidney")
obj <- findcelltype(obj)
About
Please refer to scCATCH on GitHub for more information. For the available tissues and cancers, see the wiki page.
Cite
Shao et al., scCATCH: Automatic Annotation on Cell Types of Clusters from Single-Cell RNA Sequencing Data, iScience, Volume 23, Issue 3, 27 March 2020. doi: 10.1016/j.isci.2020.100882. PMID: 32062421