rm(list=ls())
if(!require(DADo))
devtools::install_github("marzuf/MANUSCRIPT_FIGURES", subdir="DADo")
# alternatively:
# install.packages("DADo_0.0.0.1.tar.gz", repos = NULL, type ="source")
# data("norm_ID")
library(DADo)
library(doMC)
library(foreach)
nCpu <- 2
registerDoMC(nCpu)
# list of genes used in the pipeline
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_pipeline_geneList") # this loads pipeline_geneList
# sample IDs
data("luad_ID") # this loads cond2_ID
data("norm_ID") # this loads cond1_ID
# table with gene-to-domain assignment
gene2tad_dt <- read.delim(system.file("extdata", "ENCSR489OCU_NCI-H460_all_genes_positions.txt", package = "DADo"),
stringsAsFactors = FALSE,
header=FALSE,
col.names=c("entrezID", "chromosome", "start", "end", "region"))
gene2tad_dt$entrezID <- as.character(gene2tad_dt$entrezID)
# take only the genes used in the pipeline
pip_g2t_dt <- gene2tad_dt[gene2tad_dt$entrezID %in% pipeline_geneList,]
data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT") # this loads fpkmDT
#> Warning in data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT"): data set
#> 'ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_fpkmDT' not found
fpkm_dt <- fpkmDT
head_sq(fpkm_dt)
#> TCGA-38-4625-11 TCGA-38-4626-11 TCGA-38-4627-11 TCGA-38-4632-11
#> 100130426 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#> 100133144 2.109895e-07 7.875051e-08 4.697807e-160 4.436309e-07
#> 100134869 4.146888e-07 3.540477e-07 3.075534e-07 3.535838e-07
#> 10357 1.553667e-05 1.598769e-05 1.860075e-05 1.322551e-05
#> 10431 3.599472e-05 4.673789e-05 5.279937e-05 4.914794e-05
#> TCGA-44-2655-11
#> 100130426 0.000000e+00
#> 100133144 3.952489e-07
#> 100134869 4.409371e-07
#> 10357 1.804483e-05
#> 10431 3.341402e-05
# emulate FPKM
# normalize sample-wise
fpkm_dt2 <- apply(fpkm_dt, 2, function(x)x/sum(x))
# stopifnot(colSums(fpkm_dt2) == 1)
stopifnot(abs(colSums(fpkm_dt2) - 1) <= 10^-4)
# and then multiply by 10^6 to have FPKM
fpkm_dt2 <- fpkm_dt2*10^6
fpkm_dt2 <- data.frame(fpkm_dt2, check.names = FALSE)
stopifnot(dim(fpkm_dt) == dim(fpkm_dt2))
stopifnot(rownames(fpkm_dt) == rownames(fpkm_dt2))
stopifnot(colnames(fpkm_dt) == colnames(fpkm_dt2))
fpkm_dt <- fpkm_dt2
stopifnot(names(pipeline_geneList) %in% rownames(fpkm_dt))
stopifnot(!duplicated(pipeline_geneList[names(pipeline_geneList) %in% rownames(fpkm_dt)]))
fpkm_dt <- fpkm_dt[ rownames(fpkm_dt) %in% names(pipeline_geneList),]
rownames(fpkm_dt) <- sapply( rownames(fpkm_dt), function(x) pipeline_geneList[names(pipeline_geneList) == x ])
stopifnot(rownames(fpkm_dt) %in% pip_g2t_dt$entrezID)
library(TCGAbiolinks)
#> Registered S3 method overwritten by 'R.oo':
#> method from
#> throw.default R.methodsS3
#> Warning: multiple methods tables found for 'type'
#> Warning: multiple methods tables found for 'type'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'DelayedArray::type' when loading 'SummarizedExperiment'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'Biostrings::type' when loading 'GenomicAlignments'
#> Warning: replacing previous import 'BiocGenerics::type' by
#> 'Biostrings::type' when loading 'ShortRead'
head_sq(Tumor.purity)
#> Sample.ID Cancer.type ESTIMATE ABSOLUTE LUMP
#> 1 TCGA-OR-A5J1-01A ACC 0,9368 NaN 0,9774
#> 2 TCGA-OR-A5J2-01A ACC 0,9175 NaN 0,6174
#> 3 TCGA-OR-A5J3-01A ACC 0,967 NaN 0,9249
#> 4 TCGA-OR-A5J4-01A ACC NaN NaN 0,9199
#> 5 TCGA-OR-A5J5-01A ACC 0,9761 NaN 1
purityType <- "ESTIMATE"
Tumor.purity$Sample.ID <- as.character(Tumor.purity$Sample.ID)
Tumor.purity$Sample.ID <- substr(Tumor.purity$Sample.ID, start=1, stop=15)
purity_dt <- aggregate(.~Sample.ID, FUN=mean, data=Tumor.purity[c("Sample.ID", purityType)])
mean(cond1_ID %in% purity_dt$Sample.ID) # available samples for cond1
#> [1] 0
mean(cond2_ID %in% purity_dt$Sample.ID) # available samples for cond2
#> [1] 1
using the get_meanPurityCorr function:
tads_all_purity_dt <- get_meanPurityCorr(exprTable=fpkm_dt, purityTable=purity_dt, g2tTable=pip_g2t_dt,
all_samples=c(cond1_ID, cond2_ID), purityCol="ESTIMATE" )
#> ... found purity for: 502 / 561
head(tads_all_purity_dt)
#> nSampWithPurity region entrezID purityCorr
#> 1 502 chr10_TAD2 22982 0.01028636
#> 2 502 chr10_TAD2 23185 0.15617954
#> 3 502 chr10_TAD2 23560 0.27393326
#> 4 502 chr10_TAD2 91734 0.26660291
#> 5 502 chr10_TAD2 55853 0.08424493
#> 6 502 chr10_TAD2 3422 0.16022671
# average at the domain-level
aggTAD_purity_dt <- aggregate(purityCorr~region, FUN=mean, data=tads_all_purity_dt)
head(aggTAD_purity_dt)
#> region purityCorr
#> 1 chr1_TAD1 0.030081939
#> 2 chr1_TAD100 -0.002241675
#> 3 chr1_TAD101 0.067805156
#> 4 chr1_TAD102 -0.020305588
#> 5 chr1_TAD109 -0.381170455
#> 6 chr1_TAD110 -0.109189469