Calculation of domain-level correlation with purity

Load some packages

rm(list=ls())
if(!require(DADo))
  devtools::install_github("marzuf/MANUSCRIPT_FIGURES", subdir="DADo")
  # alternatively: 
  # install.packages("DADo_0.0.0.1.tar.gz", repos = NULL, type ="source")
 # data("norm_ID")
library(DADo)
library(doMC)
library(foreach)
nCpu <- 2
registerDoMC(nCpu)

Prepare the data from the pipeline

# list of genes used in the pipeline
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_pipeline_geneList") # this loads pipeline_geneList
# sample IDs
data("luad_ID") # this loads cond2_ID
data("norm_ID") # this loads cond1_ID

# table with gene-to-domain assignment
gene2tad_dt <- read.delim(system.file("extdata", "ENCSR489OCU_NCI-H460_all_genes_positions.txt", package = "DADo"),
                          stringsAsFactors = FALSE,
                          header=FALSE, 
                          col.names=c("entrezID", "chromosome", "start", "end", "region"))
gene2tad_dt$entrezID <- as.character(gene2tad_dt$entrezID)

# take only the genes used in the pipeline
pip_g2t_dt <- gene2tad_dt[gene2tad_dt$entrezID %in% pipeline_geneList,]

data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT") # this loads fpkmDT
#> Warning in data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT"): data set
#> 'ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT' not found
fpkm_dt <- fpkmDT
head_sq(fpkm_dt)
#>           TCGA-38-4625-11 TCGA-38-4626-11 TCGA-38-4627-11 TCGA-38-4632-11
#> 100130426    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00
#> 100133144    2.109895e-07    7.875051e-08   4.697807e-160    4.436309e-07
#> 100134869    4.146888e-07    3.540477e-07    3.075534e-07    3.535838e-07
#> 10357        1.553667e-05    1.598769e-05    1.860075e-05    1.322551e-05
#> 10431        3.599472e-05    4.673789e-05    5.279937e-05    4.914794e-05
#>           TCGA-44-2655-11
#> 100130426    0.000000e+00
#> 100133144    3.952489e-07
#> 100134869    4.409371e-07
#> 10357        1.804483e-05
#> 10431        3.341402e-05
# emulate FPKM
# normalize sample-wise
fpkm_dt2 <- apply(fpkm_dt, 2, function(x)x/sum(x))
# stopifnot(colSums(fpkm_dt2) == 1)
stopifnot(abs(colSums(fpkm_dt2) - 1) <= 10^-4)
# and then multiply by 10^6 to have FPKM
fpkm_dt2 <- fpkm_dt2*10^6
fpkm_dt2 <- data.frame(fpkm_dt2, check.names = FALSE)
stopifnot(dim(fpkm_dt) == dim(fpkm_dt2))
stopifnot(rownames(fpkm_dt) == rownames(fpkm_dt2))
stopifnot(colnames(fpkm_dt) == colnames(fpkm_dt2))
fpkm_dt <- fpkm_dt2

stopifnot(names(pipeline_geneList) %in% rownames(fpkm_dt))
stopifnot(!duplicated(pipeline_geneList[names(pipeline_geneList) %in% rownames(fpkm_dt)]))
fpkm_dt <- fpkm_dt[ rownames(fpkm_dt) %in% names(pipeline_geneList),]
rownames(fpkm_dt) <- sapply( rownames(fpkm_dt), function(x) pipeline_geneList[names(pipeline_geneList) == x ])
stopifnot(rownames(fpkm_dt) %in% pip_g2t_dt$entrezID)

Prepare purity data

library(TCGAbiolinks)
#> Registered S3 method overwritten by 'R.oo':
#>   method        from       
#>   throw.default R.methodsS3
#> Warning: multiple methods tables found for 'type'

#> Warning: multiple methods tables found for 'type'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'DelayedArray::type' when loading 'SummarizedExperiment'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'Biostrings::type' when loading 'GenomicAlignments'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'Biostrings::type' when loading 'ShortRead'
head_sq(Tumor.purity)
#>          Sample.ID Cancer.type ESTIMATE ABSOLUTE   LUMP
#> 1 TCGA-OR-A5J1-01A         ACC   0,9368      NaN 0,9774
#> 2 TCGA-OR-A5J2-01A         ACC   0,9175      NaN 0,6174
#> 3 TCGA-OR-A5J3-01A         ACC    0,967      NaN 0,9249
#> 4 TCGA-OR-A5J4-01A         ACC      NaN      NaN 0,9199
#> 5 TCGA-OR-A5J5-01A         ACC   0,9761      NaN      1
purityType <- "ESTIMATE"
Tumor.purity$Sample.ID <- as.character(Tumor.purity$Sample.ID)
Tumor.purity$Sample.ID <- substr(Tumor.purity$Sample.ID, start=1, stop=15)
purity_dt <- aggregate(.~Sample.ID, FUN=mean, data=Tumor.purity[c("Sample.ID", purityType)])
mean(cond1_ID %in% purity_dt$Sample.ID) # available samples for cond1
#> [1] 0
mean(cond2_ID %in% purity_dt$Sample.ID) # available samples for cond2
#> [1] 1

Compute correlation with purity

using the get_meanPurityCorr function:

tads_all_purity_dt <- get_meanPurityCorr(exprTable=fpkm_dt, purityTable=purity_dt, g2tTable=pip_g2t_dt, 
                                      all_samples=c(cond1_ID, cond2_ID), purityCol="ESTIMATE" )
#> ... found purity for:     502 / 561
head(tads_all_purity_dt)
#>   nSampWithPurity     region entrezID purityCorr
#> 1             502 chr10_TAD2    22982 0.01028636
#> 2             502 chr10_TAD2    23185 0.15617954
#> 3             502 chr10_TAD2    23560 0.27393326
#> 4             502 chr10_TAD2    91734 0.26660291
#> 5             502 chr10_TAD2    55853 0.08424493
#> 6             502 chr10_TAD2     3422 0.16022671

# average at the domain-level
aggTAD_purity_dt <- aggregate(purityCorr~region, FUN=mean, data=tads_all_purity_dt)
head(aggTAD_purity_dt)
#>        region   purityCorr
#> 1   chr1_TAD1  0.030081939
#> 2 chr1_TAD100 -0.002241675
#> 3 chr1_TAD101  0.067805156
#> 4 chr1_TAD102 -0.020305588
#> 5 chr1_TAD109 -0.381170455
#> 6 chr1_TAD110 -0.109189469