knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir="E:/DISC/reproducibility")
utilities_path = "./source/utilities.r"
source(utilities_path)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
reldist: Relative Distribution Methods
Version 1.6-6 created on 2016-10-07.
copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
For citation information, type citation("reldist").
Type help(package="reldist") to get started.
The original BONE_MARROW data was downloaded from HCA website (https://data.humancellatlas.org/explore/projects/cc95ff89-2e68-4a08-a234-480eca21ce79). The authors of this paper (https://www.biorxiv.org/content/10.1101/2020.01.29.925974v1) aligned the original sequence data (in .fastq format) to hg19 reference genome using CellRanger instead of using the count matrix provided by HCA. Their cell-filtered count matrix can be found at https://doc-04-6g-docs.googleusercontent.com/docs/securesc/rm132bl2k8nvnlftqa8a8d5p239lbngf/6o5dsruhjpmecgnkd0nn4b1ak3ss8ufd/1588554075000/07888005335114604629/01857410241295225190/1euh8YB8ThSLHJNQMTCuuKp_nRiME1KzN?e=download&authuser=0&nonce=7apqnnaq9bch8&user=01857410241295225190&hash=a60rd66gq56e0af1vc5ua60146t3gq7m or https://github.com/iyhaoo/DISC/blob/master/reproducibility/data/HCA_tissue/original_data/MantonBM6_count.rds.
if(!file.exists("./data/BONE_MARROW/raw.loom")){
original_data <- readRDS("./data/BONE_MARROW/original_data/MantonBM6_count.rds")
save_h5("./data/BONE_MARROW/original.loom", as.matrix(t(original_data)))
seurat_obj <- CreateSeuratObject(counts = as.data.frame(original_data), min.features = 500)
cell_names = colnames(seurat_obj[["RNA"]]@counts)
gene_names = rownames(original_data)
raw_data = as.matrix(original_data[, cell_names])
save_h5("./data/BONE_MARROW/raw.loom", t(raw_data))
}else{
raw_data = readh5_loom("./data/BONE_MARROW/raw.loom")
cell_names = colnames(raw_data)
gene_names = rownames(raw_data)
}
[1] "./data/BONE_MARROW/raw.loom"
[1] 32738 6939
print(dim(raw_data))
[1] 32738 6939
print("BONE_MARROW...OK!")
[1] "BONE_MARROW...OK!"
if(!file.exists("./data/BONE_MARROW/bulk.loom")){
gene_bulk_all = as.matrix(read.table("./data/BONE_MARROW/original_data/GSE74246_RNAseq_All_Counts.txt.gz", header = T, row.names = 1))
gene_bulk_mat = gene_bulk_all[, grep("^X", colnames(gene_bulk_all))]
save_h5("./data/BONE_MARROW/bulk.loom", as.matrix(t(gene_bulk_mat)))
}else{
gene_bulk_mat = readh5_loom("./data/BONE_MARROW/bulk.loom")
}
[1] "./data/BONE_MARROW/bulk.loom"
[1] 25498 49
print(dim(gene_bulk_mat))
[1] 25498 49
print("BONE_MARROW_bulk...OK!")
[1] "BONE_MARROW_bulk...OK!"
# We used the raw data after gene selection for cell identification.
gene_bc_mat = readh5_loom("./data/BONE_MARROW/raw.loom")
[1] "./data/BONE_MARROW/raw.loom"
[1] 32738 6939
gene_bc_filt = gene_bc_mat[gene_selection(gene_bc_mat, 10), ]
dim(gene_bc_filt) # 13813, 6939
[1] 13813 6939
used_genes = rownames(gene_bc_filt)
output_dir = "./results/BONE_MARROW"
dir.create(output_dir, showWarnings = F, recursive = T)
Following this script (https://github.com/Winnie09/imputationBenchmark/blob/master/data/code/process/07_hca_assign_celltype.R), we use the bulk-sequence data (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE74246) of 13 normal hematopoietic cell types and 3 acute myeloid leukemia cell types for cell identification, the file is downloaded from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE74246&format=file&file=GSE74246%5FRNAseq%5FAll%5FCounts%2Etxt%2Egz.
if(!file.exists("./data/BONE_MARROW/cell_type.rds")){
library(scran)
scran_normalization = function(gene_bc_mat){
# The source of this function is https://github.com/Winnie09/imputationBenchmark/blob/master/data/code/process/06_make_hca_MantonBM6.R
dimnames_gene_bc_mat = dimnames(gene_bc_mat)
dimnames(gene_bc_mat) = list()
sce = SingleCellExperiment(list(counts = gene_bc_mat))
no_cores = max(c(detectCores() - 1, 1))
if(ncol(gene_bc_mat) < 21){
sce = computeSumFactors(sce, BPPARAM = MulticoreParam(workers = no_cores), sizes = c(5, 10, 15, 20))
} else {
sce = computeSumFactors(sce, BPPARAM = MulticoreParam(workers = no_cores))
}
sf = sizeFactors(sce)
dimnames(gene_bc_mat) = dimnames_gene_bc_mat
return(log2(sweep(gene_bc_mat, 2, sf, "/") + 1))
}
scalematrix = function(data){
cm = rowMeans(data)
csd = apply(data, 1, sd)
(data - cm) / csd
}
corfunc = function(m1, m2){
scalematrix(t(m1)) %*% t(scalematrix(t(m2))) / (nrow(m1) - 1)
}
# Use annotation information
gz_path = "./data/hg19/Homo_sapiens.GRCh37.87.gtf.gz"
annotation_mat = get_map(gz_path)
tgngl = tapply(annotation_mat[, "gene_length"] / 1000, annotation_mat[, "gene_name"], max)
gngl = as.vector(tgngl)
names(gngl) = names(tgngl)
gene_bulk_filt = gene_bulk_mat[row.names(gene_bulk_mat) %in% names(gngl),]
gene_bulk_norm = sweep(gene_bulk_filt / gngl[rownames(gene_bulk_filt)], 2, colSums(gene_bulk_filt) / 1e6, "/")
bulk_data = log2(gene_bulk_norm[rowSums(gene_bulk_norm) > 0,] + 1)
bulk_cell_type = sapply(colnames(bulk_data), function(x){
strsplit(x,"\\.")[[1]][2]
}, USE.NAMES = F)
gene_bc_filt = gene_bc_mat[gene_selection(gene_bc_mat, 10), ]
dim(gene_bc_filt) # 13813, 6939
sc_data = scran_normalization(gene_bc_filt)
rownames(sc_data) = sub(".*:", "", rownames(gene_bc_filt))
used_genes = intersect(rownames(bulk_data), rownames(sc_data))
bulk_filt = bulk_data[used_genes, ]
sc_filt = sc_data[used_genes, ]
# The expression level for each cell type in bulk sequencing
bulk_mean = sapply(unique(bulk_cell_type),function(x) {
rowMeans(bulk_filt[ , bulk_cell_type == x])
})
# Find 100 top postive differentially expressed genes for each celltype pair.
DEG_list = list()
top_number = 100
unique_celltype_pairs = combn(ncol(bulk_mean), 2)
for(ii in seq(ncol(unique_celltype_pairs))){
celltype_1 = colnames(bulk_mean)[unique_celltype_pairs[1, ii]]
celltype_2 = colnames(bulk_mean)[unique_celltype_pairs[2, ii]]
sort_result = sort(bulk_mean[ , celltype_1] - bulk_mean[ , celltype_2], decreasing = FALSE)
DEG_list[[paste(celltype_2, celltype_1, sep = "-")]] = names(sort_result)[seq(top_number)]
DEG_list[[paste(celltype_1, celltype_2, sep = "-")]] = names(sort_result)[seq(from = length(sort_result), to = length(sort_result) - (top_number - 1))]
}
# Calculate the mean expression of these top-gene combinations across cell types (bulk) or cells (single-cell).
expression_mean_function = function(gene_bc_norm, DEG_list){
return(t(sapply(DEG_list, function(x){
colMeans(gene_bc_norm[x, ])
})))
}
bulk_DEG_expression_mean = expression_mean_function(bulk_mean, DEG_list)
sc_DEG_expression_mean = expression_mean_function(sc_filt, DEG_list)
# Calculate the expression variation of these top-gene combinations across cell types (bulk) or cells (single-cell).
expression_variation_function = function(x){
return((x - rowMeans(x)) / apply(x, 1, sd))
}
bulk_DEG_expression_variation = expression_variation_function(bulk_DEG_expression_mean)
sc_DEG_expression_variation = expression_variation_function(sc_DEG_expression_mean)
# Each top-gene combination correspond a cell type.
bulk_DEG_combination_rank = apply(bulk_DEG_expression_variation, 2, rank)
sc_DEG_combination_rank = apply(sc_DEG_expression_variation, 2, rank)
# Cell type identification.
maxcorcut = 0.6
difcorcut = 0
cormat = corfunc(sc_DEG_combination_rank, bulk_DEG_combination_rank)
maxcor = apply(cormat, 1, max)
max2cor = apply(cormat, 1, function(x){
sort(x, decreasing = T)[2]
})
cell_type = colnames(cormat)[apply(cormat, 1, which.max)]
cell_type[maxcor < maxcorcut] = NA
cell_type[maxcor - max2cor < difcorcut] = NA
names(cell_type) = colnames(sc_data)
saveRDS(cell_type, "./data/BONE_MARROW/cell_type.rds")
}else{
cell_type = readRDS("./data/BONE_MARROW/cell_type.rds")
}
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡SingleCellExperiment
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡SummarizedExperiment
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡GenomicRanges
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡stats4
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡BiocGenerics
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥BiocGenerics愼㸱愼㹦
The following objects are masked from 愼㸱愼㹥package:parallel愼㸱愼㹦:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from 愼㸱愼㹥package:Matrix愼㸱愼㹦:
which
The following objects are masked from 愼㸱愼㹥package:stats愼㸱愼㹦:
IQR, mad, sd, var, xtabs
The following objects are masked from 愼㸱愼㹥package:base愼㸱愼㹦:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq, Filter,
Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int,
pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique,
unsplit, which, which.max, which.min
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡S4Vectors
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥S4Vectors愼㸱愼㹦
The following object is masked from 愼㸱愼㹥package:future愼㸱愼㹦:
values
The following object is masked from 愼㸱愼㹥package:Matrix愼㸱愼㹦:
expand
The following object is masked from 愼㸱愼㹥package:base愼㸱愼㹦:
expand.grid
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡IRanges
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥IRanges愼㸱愼㹦
The following object is masked from 愼㸱愼㹥package:grDevices愼㸱愼㹦:
windows
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡GenomeInfoDb
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡DelayedArray
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡matrixStats
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥matrixStats愼㸱愼㹦
The following objects are masked from 愼㸱愼㹥package:Biobase愼㸱愼㹦:
anyMissing, rowMedians
The following object is masked from 愼㸱愼㹥package:reldist愼㸱愼㹦:
iqr
搼㸴搼㸸挼㸸攼㹢搼㸰攼㸸Ҫ戼㸵ij̼愼㹤戼㸰昼㹣愼㸳戼㹡BiocParallel
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥DelayedArray愼㸱愼㹦
The following objects are masked from 愼㸱愼㹥package:matrixStats愼㸱愼㹦:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following objects are masked from 愼㸱愼㹥package:base愼㸱愼㹦:
aperm, apply, rowsum
搼㸴搼㸸挼㸸攼㹢戼㸳̼愼㹤戼㸰昼㹣愼㸳戼㹡愼㸱愼㹥SummarizedExperiment愼㸱愼㹦
The following object is masked from 愼㸱愼㹥package:Seurat愼㸱愼㹦:
Assays
Treat input as file
|--------------------------------------------------|
|==================================================|
MulticoreParam() not supported on Windows, use SnowParam()
print("Cell Type ... OK!")
[1] "Cell Type ... OK!"