Setup knitr and load utility functions

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir="E:/DISC/reproducibility")
utilities_path = "./source/utilities.r"
source(utilities_path)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
reldist: Relative Distribution Methods
Version 1.6-6 created on 2016-10-07.
copyright (c) 2003, Mark S. Handcock, University of California-Los Angeles
 For citation information, type citation("reldist").
 Type help(package="reldist") to get started.

Generate data file

The original 10X_5CL data can be found at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126906.
The 10X_5CL metadata can be found at https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.metadata.csv.gz.
We use 3918 filtered cells and all genes to make raw.loom (genes will be filtered in the following step gene selection as our DISC paper).

if(!file.exists("./data/10X_5CL/raw.loom")){
  original_data = as.matrix(read.csv("./data/10X_5CL/original_data/GSM3618014_gene_count.csv.gz", header = T, sep = ",", row.names = 1))# 32895  5001
  gz_path = "./data/annotation/Homo_sapiens.GRCh37.87.gtf.gz"
  annotation_mat = get_map(gz_path)
  mapping = annotation_mat$gene_name
  names(mapping) = annotation_mat$gene_id
  gene_name_candidate = mapping[rownames(original_data)]
  gene_name = rownames(original_data)
  gene_name[!is.na(gene_name_candidate)] = gene_name_candidate[!is.na(gene_name_candidate)]
  for(ii in gene_name[duplicated(gene_name)]){
    index = 0
    for(jj in which(gene_name == ii)){
      gene_name[jj] = paste0(gene_name[jj], "_duplicate_", index)
      index = index + 1
    }
  }
  rownames(original_data) = gene_name
  print(dim(original_data))
  save_h5("./data/10X_5CL/original.loom", t(original_data))
  metadata = as.matrix(read.csv("./data/10X_5CL/original_data/sc_10x_5cl.metadata.csv.gz", header = T, row.names = 1))
  cell_id = rownames(metadata)
  raw_data = as.matrix(original_data[, cell_id])
  cell_id_new = paste(metadata[, "cell_line"], cell_id, sep = "_")
  colnames(raw_data) = cell_id_new
  print(dim(raw_data))
  save_h5("./data/10X_5CL/raw.loom", t(raw_data))
}else{
  raw_data = readh5_loom("./data/10X_5CL/raw.loom")
  cell_id = colnames(raw_data)
  gene_name = rownames(raw_data)
  print(dim(raw_data))
}
Treat input as file
|--------------------------------------------------|
|==================================================|
[1] 32895  5001
[1] 32895  3918
print("10X_5CL...OK!")
[1] "10X_5CL...OK!"

The bulk data can be found at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE86337.

if(!file.exists("./data/10X_5CL/bulk.loom")){
  library(org.Hs.eg.db)
  gene_bulk_mat_raw = as.matrix(read.table("./data/10X_5CL/original_data/GSE86337_reverse.stranded.unfiltered.count.matrix.txt.gz", header = T, row.names = 1, sep = "\t"))
  sample_mapping = as.matrix(read.csv("./data/10X_5CL/original_data/GSE86337_sample_id_sample_name.txt", sep = "\t"))
  sample_name = sample_mapping[, 2]
  names(sample_name) = sample_mapping[, 1]
  colnames(gene_bulk_mat_raw) = sample_name[colnames(gene_bulk_mat_raw)]
  gene_name_all = select(org.Hs.eg.db, key = as.character(rownames(gene_bulk_mat_raw)), columns = c("SYMBOL"), keytype = "ENTREZID")$SYMBOL
  rownames(gene_bulk_mat_raw) = gene_name_all
  gene_bulk_filt_0 = gene_bulk_mat_raw[!is.na(gene_name_all), ]
  gene_name_used = rownames(gene_bulk_filt_0)
  used_index = sapply(unique(gene_name_used), function(x){
    this_index = which(gene_name_used == x)
    if(length(this_index) > 1){
      return(this_index[which.max(rowSums(gene_bulk_filt_0[this_index, ]))])
    }else{
      return(this_index)
    }
  })
  gene_bulk_mat = gene_bulk_filt_0[used_index, ]
  save_h5("./data/10X_5CL/bulk.loom", as.matrix(t(gene_bulk_mat)))
}else{
  gene_bulk_mat = readh5_loom("./data/10X_5CL/bulk.loom")
}
[1] "./data/10X_5CL/bulk.loom"
[1] 23753    10
print(dim(gene_bulk_mat))
[1] 23753    10
print("10X_5CL_Bulk...OK!")
[1] "10X_5CL_Bulk...OK!"
LS0tDQp0aXRsZTogIkRhdGEgcHJlcGFyYXRpb24gZm9yIDEwWF81Q0wiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KIyMjIFNldHVwIGtuaXRyIGFuZCBsb2FkIHV0aWxpdHkgZnVuY3Rpb25zDQpgYGB7ciBzZXR1cH0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkNCmtuaXRyOjpvcHRzX2tuaXQkc2V0KHJvb3QuZGlyPSJFOi9ESVNDL3JlcHJvZHVjaWJpbGl0eSIpDQpgYGANCmBgYHtyfQ0KdXRpbGl0aWVzX3BhdGggPSAiLi9zb3VyY2UvdXRpbGl0aWVzLnIiDQpzb3VyY2UodXRpbGl0aWVzX3BhdGgpDQpgYGANCiMjIyBHZW5lcmF0ZSBkYXRhIGZpbGUNClRoZSBvcmlnaW5hbCAxMFhfNUNMIGRhdGEgY2FuIGJlIGZvdW5kIGF0IGh0dHBzOi8vd3d3Lm5jYmkubmxtLm5paC5nb3YvZ2VvL3F1ZXJ5L2FjYy5jZ2k/YWNjPUdTRTEyNjkwNi48L2JyPg0KVGhlIDEwWF81Q0wgbWV0YWRhdGEgY2FuIGJlIGZvdW5kIGF0IGh0dHBzOi8vZ2l0aHViLmNvbS9MdXlpVGlhbi9zY19taXhvbG9neS9ibG9iL21hc3Rlci9kYXRhL2Nzdi9zY18xMHhfNWNsLm1ldGFkYXRhLmNzdi5nei4gPC9icj4NCldlIHVzZSAzOTE4IGZpbHRlcmVkIGNlbGxzIGFuZCBhbGwgZ2VuZXMgdG8gbWFrZSByYXcubG9vbSAoZ2VuZXMgd2lsbCBiZSBmaWx0ZXJlZCBpbiB0aGUgZm9sbG93aW5nIHN0ZXAgZ2VuZSBzZWxlY3Rpb24gYXMgb3VyIERJU0MgcGFwZXIpLg0KYGBge3J9DQppZighZmlsZS5leGlzdHMoIi4vZGF0YS8xMFhfNUNML3Jhdy5sb29tIikpew0KICBvcmlnaW5hbF9kYXRhID0gYXMubWF0cml4KHJlYWQuY3N2KCIuL2RhdGEvMTBYXzVDTC9vcmlnaW5hbF9kYXRhL0dTTTM2MTgwMTRfZ2VuZV9jb3VudC5jc3YuZ3oiLCBoZWFkZXIgPSBULCBzZXAgPSAiLCIsIHJvdy5uYW1lcyA9IDEpKSMgMzI4OTUgIDUwMDENCiAgZ3pfcGF0aCA9ICIuL2RhdGEvYW5ub3RhdGlvbi9Ib21vX3NhcGllbnMuR1JDaDM3Ljg3Lmd0Zi5neiINCiAgYW5ub3RhdGlvbl9tYXQgPSBnZXRfbWFwKGd6X3BhdGgpDQogIG1hcHBpbmcgPSBhbm5vdGF0aW9uX21hdCRnZW5lX25hbWUNCiAgbmFtZXMobWFwcGluZykgPSBhbm5vdGF0aW9uX21hdCRnZW5lX2lkDQogIGdlbmVfbmFtZV9jYW5kaWRhdGUgPSBtYXBwaW5nW3Jvd25hbWVzKG9yaWdpbmFsX2RhdGEpXQ0KICBnZW5lX25hbWUgPSByb3duYW1lcyhvcmlnaW5hbF9kYXRhKQ0KICBnZW5lX25hbWVbIWlzLm5hKGdlbmVfbmFtZV9jYW5kaWRhdGUpXSA9IGdlbmVfbmFtZV9jYW5kaWRhdGVbIWlzLm5hKGdlbmVfbmFtZV9jYW5kaWRhdGUpXQ0KICBmb3IoaWkgaW4gZ2VuZV9uYW1lW2R1cGxpY2F0ZWQoZ2VuZV9uYW1lKV0pew0KICAgIGluZGV4ID0gMA0KICAgIGZvcihqaiBpbiB3aGljaChnZW5lX25hbWUgPT0gaWkpKXsNCiAgICAgIGdlbmVfbmFtZVtqal0gPSBwYXN0ZTAoZ2VuZV9uYW1lW2pqXSwgIl9kdXBsaWNhdGVfIiwgaW5kZXgpDQogICAgICBpbmRleCA9IGluZGV4ICsgMQ0KICAgIH0NCiAgfQ0KICByb3duYW1lcyhvcmlnaW5hbF9kYXRhKSA9IGdlbmVfbmFtZQ0KICBwcmludChkaW0ob3JpZ2luYWxfZGF0YSkpDQogIHNhdmVfaDUoIi4vZGF0YS8xMFhfNUNML29yaWdpbmFsLmxvb20iLCB0KG9yaWdpbmFsX2RhdGEpKQ0KICBtZXRhZGF0YSA9IGFzLm1hdHJpeChyZWFkLmNzdigiLi9kYXRhLzEwWF81Q0wvb3JpZ2luYWxfZGF0YS9zY18xMHhfNWNsLm1ldGFkYXRhLmNzdi5neiIsIGhlYWRlciA9IFQsIHJvdy5uYW1lcyA9IDEpKQ0KICBjZWxsX2lkID0gcm93bmFtZXMobWV0YWRhdGEpDQogIHJhd19kYXRhID0gYXMubWF0cml4KG9yaWdpbmFsX2RhdGFbLCBjZWxsX2lkXSkNCiAgY2VsbF9pZF9uZXcgPSBwYXN0ZShtZXRhZGF0YVssICJjZWxsX2xpbmUiXSwgY2VsbF9pZCwgc2VwID0gIl8iKQ0KICBjb2xuYW1lcyhyYXdfZGF0YSkgPSBjZWxsX2lkX25ldw0KICBwcmludChkaW0ocmF3X2RhdGEpKQ0KICBzYXZlX2g1KCIuL2RhdGEvMTBYXzVDTC9yYXcubG9vbSIsIHQocmF3X2RhdGEpKQ0KfWVsc2V7DQogIHJhd19kYXRhID0gcmVhZGg1X2xvb20oIi4vZGF0YS8xMFhfNUNML3Jhdy5sb29tIikNCiAgY2VsbF9pZCA9IGNvbG5hbWVzKHJhd19kYXRhKQ0KICBnZW5lX25hbWUgPSByb3duYW1lcyhyYXdfZGF0YSkNCiAgcHJpbnQoZGltKHJhd19kYXRhKSkNCn0NCnByaW50KCIxMFhfNUNMLi4uT0shIikNCmBgYA0KVGhlIGJ1bGsgZGF0YSBjYW4gYmUgZm91bmQgYXQgaHR0cHM6Ly93d3cubmNiaS5ubG0ubmloLmdvdi9nZW8vcXVlcnkvYWNjLmNnaT9hY2M9R1NFODYzMzcuIDwvYnI+DQpgYGB7cn0NCmlmKCFmaWxlLmV4aXN0cygiLi9kYXRhLzEwWF81Q0wvYnVsay5sb29tIikpew0KICBsaWJyYXJ5KG9yZy5Icy5lZy5kYikNCiAgZ2VuZV9idWxrX21hdF9yYXcgPSBhcy5tYXRyaXgocmVhZC50YWJsZSgiLi9kYXRhLzEwWF81Q0wvb3JpZ2luYWxfZGF0YS9HU0U4NjMzN19yZXZlcnNlLnN0cmFuZGVkLnVuZmlsdGVyZWQuY291bnQubWF0cml4LnR4dC5neiIsIGhlYWRlciA9IFQsIHJvdy5uYW1lcyA9IDEsIHNlcCA9ICJcdCIpKQ0KICBzYW1wbGVfbWFwcGluZyA9IGFzLm1hdHJpeChyZWFkLmNzdigiLi9kYXRhLzEwWF81Q0wvb3JpZ2luYWxfZGF0YS9HU0U4NjMzN19zYW1wbGVfaWRfc2FtcGxlX25hbWUudHh0Iiwgc2VwID0gIlx0IikpDQogIHNhbXBsZV9uYW1lID0gc2FtcGxlX21hcHBpbmdbLCAyXQ0KICBuYW1lcyhzYW1wbGVfbmFtZSkgPSBzYW1wbGVfbWFwcGluZ1ssIDFdDQogIGNvbG5hbWVzKGdlbmVfYnVsa19tYXRfcmF3KSA9IHNhbXBsZV9uYW1lW2NvbG5hbWVzKGdlbmVfYnVsa19tYXRfcmF3KV0NCiAgZ2VuZV9uYW1lX2FsbCA9IHNlbGVjdChvcmcuSHMuZWcuZGIsIGtleSA9IGFzLmNoYXJhY3Rlcihyb3duYW1lcyhnZW5lX2J1bGtfbWF0X3JhdykpLCBjb2x1bW5zID0gYygiU1lNQk9MIiksIGtleXR5cGUgPSAiRU5UUkVaSUQiKSRTWU1CT0wNCiAgcm93bmFtZXMoZ2VuZV9idWxrX21hdF9yYXcpID0gZ2VuZV9uYW1lX2FsbA0KICBnZW5lX2J1bGtfZmlsdF8wID0gZ2VuZV9idWxrX21hdF9yYXdbIWlzLm5hKGdlbmVfbmFtZV9hbGwpLCBdDQogIGdlbmVfbmFtZV91c2VkID0gcm93bmFtZXMoZ2VuZV9idWxrX2ZpbHRfMCkNCiAgdXNlZF9pbmRleCA9IHNhcHBseSh1bmlxdWUoZ2VuZV9uYW1lX3VzZWQpLCBmdW5jdGlvbih4KXsNCiAgICB0aGlzX2luZGV4ID0gd2hpY2goZ2VuZV9uYW1lX3VzZWQgPT0geCkNCiAgICBpZihsZW5ndGgodGhpc19pbmRleCkgPiAxKXsNCiAgICAgIHJldHVybih0aGlzX2luZGV4W3doaWNoLm1heChyb3dTdW1zKGdlbmVfYnVsa19maWx0XzBbdGhpc19pbmRleCwgXSkpXSkNCiAgICB9ZWxzZXsNCiAgICAgIHJldHVybih0aGlzX2luZGV4KQ0KICAgIH0NCiAgfSkNCiAgZ2VuZV9idWxrX21hdCA9IGdlbmVfYnVsa19maWx0XzBbdXNlZF9pbmRleCwgXQ0KICBzYXZlX2g1KCIuL2RhdGEvMTBYXzVDTC9idWxrLmxvb20iLCBhcy5tYXRyaXgodChnZW5lX2J1bGtfbWF0KSkpDQp9ZWxzZXsNCiAgZ2VuZV9idWxrX21hdCA9IHJlYWRoNV9sb29tKCIuL2RhdGEvMTBYXzVDTC9idWxrLmxvb20iKQ0KfQ0KcHJpbnQoZGltKGdlbmVfYnVsa19tYXQpKQ0KcHJpbnQoIjEwWF81Q0xfQnVsay4uLk9LISIpDQpgYGANCg==