rm(list=ls())
if(!require(DADo))
devtools::install_github("marzuf/MANUSCRIPT_FIGURES", subdir="DADo")
# alternatively:
# install.packages("DADo_0.0.0.1.tar.gz", repos = NULL, type ="source")
# data("norm_ID")
library(DADo)
library(doMC)
library(foreach)
nCpu <- 2
registerDoMC(nCpu)
# table from gene-level DE analysis:
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_DE_topTable") # this loads DE_topTable
DE_topTable$genes <- as.character(DE_topTable$genes)
# list of genes used in the pipeline
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_pipeline_geneList") # this loads pipeline_geneList
# for those genes, I have logFC data
stopifnot(names(pipeline_geneList) %in% DE_topTable$genes)
DE_topTable <- DE_topTable[DE_topTable$genes %in% names(pipeline_geneList),]
DE_topTable$entrezID <- pipeline_geneList[DE_topTable$genes]
stopifnot(!is.na(DE_topTable$entrezID))
# table with gene-to-domain assignment
gene2tad_dt <- read.delim(system.file("extdata", "ENCSR489OCU_NCI-H460_all_genes_positions.txt", package = "DADo"),
stringsAsFactors = FALSE,
header=FALSE,
col.names=c("entrezID", "chromosome", "start", "end", "region"))
gene2tad_dt$entrezID <- as.character(gene2tad_dt$entrezID)
# take only the genes used in the pipeline
pip_g2t_dt <- gene2tad_dt[gene2tad_dt$entrezID %in% pipeline_geneList,]
# merge to match gene-to-domain and logFC
merged_dt <- merge(pip_g2t_dt[,c("entrezID", "region")], DE_topTable[,c("logFC", "entrezID")], by="entrezID", all.x=TRUE, all.y=FALSE)
stopifnot(!is.na(merged_dt))
# compute the ratioDown for each domain
meanFC_dt <- aggregate(logFC ~ region, FUN=mean, data=merged_dt)
colnames(meanFC_dt)[colnames(meanFC_dt) == "logFC"] <- "meanFC"
obs_meanFC <- setNames(meanFC_dt$meanFC, meanFC_dt$region)
save(obs_meanFC, file="package_obs_meanFC.RData", version=2)
using the get_ratioDown function:
# compute the ratioDown for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_ratioDown, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "ratioDown"
obs_ratioDown <- setNames(FCC_dt$ratioDown, FCC_dt$region)
save(obs_ratioDown, file="package_obs_ratioDown.RData", version=2)
using the get_ratioFC function:
# compute the ratioFC for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_ratioFC, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "ratioFC"
obs_ratioNegFC <- setNames(FCC_dt$ratioFC, FCC_dt$region)
save(obs_ratioNegFC, file="package_obs_ratioNegFC.RData", version=2)
using the get_fcc function:
# compute the FCC for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_fcc, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "FCC"
obs_FCC <- setNames(FCC_dt$FCC, FCC_dt$region)
save(obs_FCC, file="package_obs_FCC.RData", version=2)
using the get_fcc function:
# compute the FCC for the permutation data
data("cut1000_ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_permutationsDT") # this loads permutationsDT
head_sq(permutationsDT)
#> region1 region2 region3 region4 region5
#> 8927 chr16_TAD282 chr9_TAD349 chr19_TAD48 chr2_TAD813 chr19_TAD193
#> 221400 chr15_TAD67 chr19_TAD195 chr1_TAD147 chr1_TAD413 chr13_TAD95
#> 2196 chr17_TAD110 chr13_TAD96 chr15_TAD123 chr10_TAD293 chr2_TAD277
#> 379013 chr15_TAD121 chr19_TAD197 chr9_TAD449 chr3_TAD597 chr18_TAD238
#> 130399 chr5_TAD282 chr12_TAD415 chr10_TAD153 chr16_TAD102 chr22_TAD92
stopifnot(setequal(rownames(permutationsDT), pipeline_geneList))
tad_levels <- as.character(FCC_dt$region)
all_permut_FCC_dt <- foreach(i = 1:ncol(permutationsDT), .combine='cbind') %dopar% {
perm_g2t <- data.frame(entrezID=as.character(rownames(permutationsDT)),
region = as.character(permutationsDT[,i]),
stringsAsFactors = FALSE)
perm_merged_dt <- merge(perm_g2t, DE_topTable[,c("logFC", "entrezID")], by="entrezID", all.x=TRUE, all.y=FALSE)
stopifnot(!is.na(perm_merged_dt))
# compute the FCC for each domain
permut_FCC_dt <- aggregate(logFC ~ region, FUN=get_fcc, data=perm_merged_dt)
colnames(permut_FCC_dt)[colnames(permut_FCC_dt) == "logFC"] <- "FCC"
rownames(permut_FCC_dt) <- as.character(permut_FCC_dt$region)
stopifnot(setequal(rownames(permut_FCC_dt), tad_levels))
permut_FCC_dt[tad_levels, "FCC", drop=FALSE]
}
head_sq(all_permut_FCC_dt)
#> FCC FCC.1 FCC.2 FCC.3 FCC.4
#> chr1_TAD1 -0.03021204 0.04464677 0.09681447 0.269371204 -0.005261017
#> chr1_TAD100 -0.15740921 0.23946044 0.00000000 0.207513788 0.204561552
#> chr1_TAD101 0.00000000 -0.03558564 1.00000000 0.000000000 0.000000000
#> chr1_TAD102 -0.13524587 -0.14239946 0.19725006 0.216173388 0.328637303
#> chr1_TAD109 -0.03835880 -0.22795748 0.20862475 -0.008080574 1.000000000
using the get_auc_ratio function:
stopifnot(setequal(names(obs_FCC), rownames(all_permut_FCC_dt)))
get_auc_ratio(fcc_vect=obs_FCC,
fcc_permDT=all_permut_FCC_dt,
doPlot=T,
main="FCC AUC cumsum curves")
#> ... found obs. domains: 1774
#> ... found permut. domains: 1774
mtext(text = paste0("ENCSR489OCU_NCI-H460_40kb - TCGAluad_norm_luad"), side=3)
using the get_meanCorr function:
data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_rna_qqnorm_rnaseqDT") # this loads rna_qqnorm_rnaseqDT
head_sq(rna_qqnorm_rnaseqDT)
#> TCGA-38-4625-11 TCGA-38-4626-11 TCGA-38-4627-11 TCGA-38-4632-11
#> 100130426 -3.1242596 -2.7853138 -2.615386 -2.4983379
#> 10357 0.8850789 0.9186307 1.147108 0.3654239
#> 136542 -3.1242596 -2.7853138 -2.615386 -2.4983379
#> 155060 -1.6371254 -0.9890511 -2.407941 -1.8150338
#> 388795 -3.1242596 0.3133554 -2.785314 -2.6153864
#> TCGA-44-2655-11
#> 100130426 -2.40794083
#> 10357 0.32746593
#> 136542 -2.40794083
#> 155060 -0.93234425
#> 388795 -0.08948213
stopifnot(names(pipeline_geneList) %in% rownames(rna_qqnorm_rnaseqDT))
rna_qqnorm_rnaseqDT <- rna_qqnorm_rnaseqDT[names(pipeline_geneList),]
rna_qqnorm_rnaseqDT <- data.frame(rna_qqnorm_rnaseqDT)
rna_qqnorm_rnaseqDT$newEntrez <- pipeline_geneList[rownames(rna_qqnorm_rnaseqDT)]
stopifnot(!is.na(rna_qqnorm_rnaseqDT$newEntrez))
stopifnot(!duplicated(rna_qqnorm_rnaseqDT$newEntrez))
rownames(rna_qqnorm_rnaseqDT) <- rna_qqnorm_rnaseqDT$newEntrez
rna_qqnorm_rnaseqDT$newEntrez <- NULL
stopifnot(pipeline_geneList %in% rownames(rna_qqnorm_rnaseqDT))
head_sq(rna_qqnorm_rnaseqDT)
#> TCGA.38.4625.11 TCGA.38.4626.11 TCGA.38.4627.11 TCGA.38.4632.11
#> 10357 0.8850789 0.9186307 1.1471082 0.3654239
#> 653553 -0.3274659 -0.5029872 0.3321838 -1.5278100
#> 2 1.1823253 0.5388102 1.8895100 1.2190732
#> 53947 0.8083401 0.6843390 0.2853173 -0.8589387
#> 22848 -1.2575456 -0.7537816 -1.6371254 -1.0886861
#> TCGA.44.2655.11
#> 10357 0.3274659
#> 653553 -0.5182583
#> 2 2.2705719
#> 53947 1.0186316
#> 22848 -0.2207181
stopifnot(is.character(pip_g2t_dt$entrezID))
stopifnot(pip_g2t_dt$entrezID %in% rownames(rna_qqnorm_rnaseqDT))
meanCorr_dt <- get_meanCorr(gene2tad_dt=pip_g2t_dt, exprd_dt=rna_qqnorm_rnaseqDT)
head(meanCorr_dt)
#> region meanCorr
#> 1 chr10_TAD2 0.35671000
#> 2 chr10_TAD16 0.85408689
#> 3 chr10_TAD18 0.34567982
#> 4 chr10_TAD20 0.11816802
#> 5 chr10_TAD27 0.03276069
#> 6 chr10_TAD42 0.43075550
obs_meanCorr <- setNames(meanCorr_dt$meanCorr, meanCorr_dt$region)
save(obs_meanCorr, file="package_meanCorr.RData", version=2)
using the stouffer function:
twoTailsStouffer <- FALSE
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_emp_pval_meanLogFC") # this loads emp_pval_meanLogFC
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_emp_pval_meanCorr") # this loads emp_pval_meanCorr
# retrieve domains for which I have empirical p-values for both logFC and meanCorr
intersectRegions <- intersect(names(emp_pval_meanLogFC), names(emp_pval_meanCorr))
emp_pval_meanCorr <- emp_pval_meanCorr[intersectRegions]
emp_pval_meanLogFC <- emp_pval_meanLogFC[intersectRegions]
stopifnot(!is.na(emp_pval_meanLogFC))
stopifnot(!is.na(emp_pval_meanCorr))
stopifnot(names(emp_pval_meanLogFC) == names(emp_pval_meanCorr))
emp_pval_combined <- unlist(sapply(seq_along(intersectRegions), function(x)
stouffer(c(emp_pval_meanCorr[x], emp_pval_meanLogFC[x]), two.tails = twoTailsStouffer)))
names(emp_pval_combined) <- intersectRegions
head(emp_pval_combined)
#> chr1_TAD1 chr1_TAD100 chr1_TAD101 chr1_TAD102 chr1_TAD109
#> 0.0004437895 0.0778178362 0.3428428112 0.7096025451 0.5362037924
#> chr1_TAD110
#> 0.1279638662
# and adjust for multiple testing
adj_emp_pval_combined <- p.adjust(emp_pval_combined, method="BH")
head(adj_emp_pval_combined)
#> chr1_TAD1 chr1_TAD100 chr1_TAD101 chr1_TAD102 chr1_TAD109 chr1_TAD110
#> 0.0232985 0.2308509 0.4614591 0.7409270 0.6051053 0.2859884
save(emp_pval_combined, file="package_emp_pval_combined.RData", version=2)
save(adj_emp_pval_combined, file="package_adj_emp_pval_combined.RData", version=2)
stopifnot(setequal(names(obs_meanCorr), names(obs_meanFC)))
stopifnot(setequal(names(obs_meanCorr), names(adj_emp_pval_combined)))
plot_volcanoTADsCorrFC(meanCorr=obs_meanCorr,
meanFC=obs_meanFC,
comb_pval=adj_emp_pval_combined,
padjusted=TRUE,
tads_to_annot = "chr11_TAD390")
#> ... kept meanCorr values: 1774/1774
#> ... kept meanFC values: 1774/1774
#> ... kept comb. p-val values: 1774/1774