Calculation of some domain-level statistics

Load some packages

rm(list=ls())
if(!require(DADo))
  devtools::install_github("marzuf/MANUSCRIPT_FIGURES", subdir="DADo")
  # alternatively: 
  # install.packages("DADo_0.0.0.1.tar.gz", repos = NULL, type ="source")
 # data("norm_ID")
library(DADo)
library(doMC)
library(foreach)
nCpu <- 2
registerDoMC(nCpu)

Prepare the observed data

# table from gene-level DE analysis:
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_DE_topTable") # this loads DE_topTable
DE_topTable$genes <- as.character(DE_topTable$genes)

# list of genes used in the pipeline
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_pipeline_geneList") # this loads pipeline_geneList

# for those genes, I have logFC data
stopifnot(names(pipeline_geneList) %in% DE_topTable$genes)
DE_topTable <- DE_topTable[DE_topTable$genes %in% names(pipeline_geneList),]
DE_topTable$entrezID <- pipeline_geneList[DE_topTable$genes]
stopifnot(!is.na(DE_topTable$entrezID))

# table with gene-to-domain assignment
gene2tad_dt <- read.delim(system.file("extdata", "ENCSR489OCU_NCI-H460_all_genes_positions.txt", package = "DADo"),
                          stringsAsFactors = FALSE,
                          header=FALSE, 
                          col.names=c("entrezID", "chromosome", "start", "end", "region"))
gene2tad_dt$entrezID <- as.character(gene2tad_dt$entrezID)


# take only the genes used in the pipeline
pip_g2t_dt <- gene2tad_dt[gene2tad_dt$entrezID %in% pipeline_geneList,]

# merge to match gene-to-domain and logFC
merged_dt <- merge(pip_g2t_dt[,c("entrezID", "region")], DE_topTable[,c("logFC", "entrezID")], by="entrezID", all.x=TRUE, all.y=FALSE)
stopifnot(!is.na(merged_dt))

Compute meanLogFC for a list of domains

# compute the ratioDown for each domain
meanFC_dt <- aggregate(logFC ~ region, FUN=mean, data=merged_dt)
colnames(meanFC_dt)[colnames(meanFC_dt) == "logFC"] <- "meanFC"
obs_meanFC <- setNames(meanFC_dt$meanFC, meanFC_dt$region)
save(obs_meanFC, file="package_obs_meanFC.RData", version=2)

Compute ratioDown for a list of domains

using the get_ratioDown function:

# compute the ratioDown for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_ratioDown, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "ratioDown"
obs_ratioDown <- setNames(FCC_dt$ratioDown, FCC_dt$region)
save(obs_ratioDown, file="package_obs_ratioDown.RData", version=2)

Compute ratioFC for a list of domains

using the get_ratioFC function:

# compute the ratioFC for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_ratioFC, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "ratioFC"
obs_ratioNegFC <- setNames(FCC_dt$ratioFC, FCC_dt$region)
save(obs_ratioNegFC, file="package_obs_ratioNegFC.RData", version=2)

Compute FCC for a list of domains

using the get_fcc function:

# compute the FCC for each domain
FCC_dt <- aggregate(logFC ~ region, FUN=get_fcc, data=merged_dt)
colnames(FCC_dt)[colnames(FCC_dt) == "logFC"] <- "FCC"
obs_FCC <- setNames(FCC_dt$FCC, FCC_dt$region)
save(obs_FCC, file="package_obs_FCC.RData", version=2)

Compute FCC for permutation data

using the get_fcc function:

# compute the FCC for the permutation data
data("cut1000_ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_permutationsDT") # this loads permutationsDT
head_sq(permutationsDT)
#>             region1      region2      region3      region4      region5
#> 8927   chr16_TAD282  chr9_TAD349  chr19_TAD48  chr2_TAD813 chr19_TAD193
#> 221400  chr15_TAD67 chr19_TAD195  chr1_TAD147  chr1_TAD413  chr13_TAD95
#> 2196   chr17_TAD110  chr13_TAD96 chr15_TAD123 chr10_TAD293  chr2_TAD277
#> 379013 chr15_TAD121 chr19_TAD197  chr9_TAD449  chr3_TAD597 chr18_TAD238
#> 130399  chr5_TAD282 chr12_TAD415 chr10_TAD153 chr16_TAD102  chr22_TAD92

stopifnot(setequal(rownames(permutationsDT), pipeline_geneList))

tad_levels <- as.character(FCC_dt$region)

all_permut_FCC_dt <- foreach(i = 1:ncol(permutationsDT), .combine='cbind') %dopar% {
  perm_g2t <- data.frame(entrezID=as.character(rownames(permutationsDT)),
                        region = as.character(permutationsDT[,i]),
                        stringsAsFactors = FALSE)
  perm_merged_dt <- merge(perm_g2t, DE_topTable[,c("logFC", "entrezID")], by="entrezID", all.x=TRUE, all.y=FALSE)
  stopifnot(!is.na(perm_merged_dt))

  # compute the FCC for each domain
  permut_FCC_dt <- aggregate(logFC ~ region, FUN=get_fcc, data=perm_merged_dt)
  colnames(permut_FCC_dt)[colnames(permut_FCC_dt) == "logFC"] <- "FCC"
  rownames(permut_FCC_dt) <- as.character(permut_FCC_dt$region)
  stopifnot(setequal(rownames(permut_FCC_dt), tad_levels))
  permut_FCC_dt[tad_levels, "FCC", drop=FALSE]
}
head_sq(all_permut_FCC_dt)
#>                     FCC       FCC.1      FCC.2        FCC.3        FCC.4
#> chr1_TAD1   -0.03021204  0.04464677 0.09681447  0.269371204 -0.005261017
#> chr1_TAD100 -0.15740921  0.23946044 0.00000000  0.207513788  0.204561552
#> chr1_TAD101  0.00000000 -0.03558564 1.00000000  0.000000000  0.000000000
#> chr1_TAD102 -0.13524587 -0.14239946 0.19725006  0.216173388  0.328637303
#> chr1_TAD109 -0.03835880 -0.22795748 0.20862475 -0.008080574  1.000000000

Compute FCC AUC ratio and plot FCC cumsum curves

using the get_auc_ratio function:

stopifnot(setequal(names(obs_FCC), rownames(all_permut_FCC_dt)))
get_auc_ratio(fcc_vect=obs_FCC, 
              fcc_permDT=all_permut_FCC_dt, 
              doPlot=T, 
              main="FCC AUC cumsum curves")
#> ... found obs. domains:  1774
#> ... found permut. domains:   1774
mtext(text = paste0("ENCSR489OCU_NCI-H460_40kb - TCGAluad_norm_luad"), side=3)

plot of chunk FCC_AUC_ratio_example

Compute intra-domain mean correlation

using the get_meanCorr function:

data("ENCSR489OCU_NCI-H460_TCGAluad_norm_luad_rna_qqnorm_rnaseqDT") # this loads rna_qqnorm_rnaseqDT
head_sq(rna_qqnorm_rnaseqDT)
#>           TCGA-38-4625-11 TCGA-38-4626-11 TCGA-38-4627-11 TCGA-38-4632-11
#> 100130426      -3.1242596      -2.7853138       -2.615386      -2.4983379
#> 10357           0.8850789       0.9186307        1.147108       0.3654239
#> 136542         -3.1242596      -2.7853138       -2.615386      -2.4983379
#> 155060         -1.6371254      -0.9890511       -2.407941      -1.8150338
#> 388795         -3.1242596       0.3133554       -2.785314      -2.6153864
#>           TCGA-44-2655-11
#> 100130426     -2.40794083
#> 10357          0.32746593
#> 136542        -2.40794083
#> 155060        -0.93234425
#> 388795        -0.08948213
stopifnot(names(pipeline_geneList) %in% rownames(rna_qqnorm_rnaseqDT))
rna_qqnorm_rnaseqDT <- rna_qqnorm_rnaseqDT[names(pipeline_geneList),]
rna_qqnorm_rnaseqDT <- data.frame(rna_qqnorm_rnaseqDT)
rna_qqnorm_rnaseqDT$newEntrez <- pipeline_geneList[rownames(rna_qqnorm_rnaseqDT)]
stopifnot(!is.na(rna_qqnorm_rnaseqDT$newEntrez))
stopifnot(!duplicated(rna_qqnorm_rnaseqDT$newEntrez))
rownames(rna_qqnorm_rnaseqDT) <- rna_qqnorm_rnaseqDT$newEntrez
rna_qqnorm_rnaseqDT$newEntrez <- NULL
stopifnot(pipeline_geneList %in% rownames(rna_qqnorm_rnaseqDT))
head_sq(rna_qqnorm_rnaseqDT)
#>        TCGA.38.4625.11 TCGA.38.4626.11 TCGA.38.4627.11 TCGA.38.4632.11
#> 10357        0.8850789       0.9186307       1.1471082       0.3654239
#> 653553      -0.3274659      -0.5029872       0.3321838      -1.5278100
#> 2            1.1823253       0.5388102       1.8895100       1.2190732
#> 53947        0.8083401       0.6843390       0.2853173      -0.8589387
#> 22848       -1.2575456      -0.7537816      -1.6371254      -1.0886861
#>        TCGA.44.2655.11
#> 10357        0.3274659
#> 653553      -0.5182583
#> 2            2.2705719
#> 53947        1.0186316
#> 22848       -0.2207181
stopifnot(is.character(pip_g2t_dt$entrezID))
stopifnot(pip_g2t_dt$entrezID %in% rownames(rna_qqnorm_rnaseqDT))
meanCorr_dt <- get_meanCorr(gene2tad_dt=pip_g2t_dt, exprd_dt=rna_qqnorm_rnaseqDT)
head(meanCorr_dt)
#>        region   meanCorr
#> 1  chr10_TAD2 0.35671000
#> 2 chr10_TAD16 0.85408689
#> 3 chr10_TAD18 0.34567982
#> 4 chr10_TAD20 0.11816802
#> 5 chr10_TAD27 0.03276069
#> 6 chr10_TAD42 0.43075550
obs_meanCorr <- setNames(meanCorr_dt$meanCorr, meanCorr_dt$region)
save(obs_meanCorr, file="package_meanCorr.RData", version=2)

Combine empirical p-values

using the stouffer function:

twoTailsStouffer <- FALSE
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_emp_pval_meanLogFC") # this loads emp_pval_meanLogFC
data("ENCSR489OCU_NCI-H460_40kb_TCGAluad_norm_luad_emp_pval_meanCorr") # this loads emp_pval_meanCorr
# retrieve domains for which I have empirical p-values for both logFC and meanCorr
intersectRegions <- intersect(names(emp_pval_meanLogFC), names(emp_pval_meanCorr))
emp_pval_meanCorr <- emp_pval_meanCorr[intersectRegions]
emp_pval_meanLogFC <- emp_pval_meanLogFC[intersectRegions]
stopifnot(!is.na(emp_pval_meanLogFC))
stopifnot(!is.na(emp_pval_meanCorr))
stopifnot(names(emp_pval_meanLogFC) == names(emp_pval_meanCorr))
emp_pval_combined <- unlist(sapply(seq_along(intersectRegions), function(x) 
                  stouffer(c(emp_pval_meanCorr[x], emp_pval_meanLogFC[x]), two.tails = twoTailsStouffer)))
names(emp_pval_combined) <- intersectRegions
head(emp_pval_combined)
#>    chr1_TAD1  chr1_TAD100  chr1_TAD101  chr1_TAD102  chr1_TAD109 
#> 0.0004437895 0.0778178362 0.3428428112 0.7096025451 0.5362037924 
#>  chr1_TAD110 
#> 0.1279638662
# and adjust for multiple testing
adj_emp_pval_combined <- p.adjust(emp_pval_combined, method="BH")
head(adj_emp_pval_combined)
#>   chr1_TAD1 chr1_TAD100 chr1_TAD101 chr1_TAD102 chr1_TAD109 chr1_TAD110 
#>   0.0232985   0.2308509   0.4614591   0.7409270   0.6051053   0.2859884
save(emp_pval_combined, file="package_emp_pval_combined.RData", version=2)
save(adj_emp_pval_combined, file="package_adj_emp_pval_combined.RData", version=2)

Volcano-like intra-domain corr. vs. domain meanLogFC

stopifnot(setequal(names(obs_meanCorr), names(obs_meanFC)))
stopifnot(setequal(names(obs_meanCorr), names(adj_emp_pval_combined)))

plot_volcanoTADsCorrFC(meanCorr=obs_meanCorr, 
                       meanFC=obs_meanFC, 
                       comb_pval=adj_emp_pval_combined,
                       padjusted=TRUE,
                       tads_to_annot = "chr11_TAD390")
#> ... kept meanCorr values:    1774/1774
#> ... kept meanFC values:  1774/1774
#> ... kept comb. p-val values: 1774/1774

plot of chunk plot_volcano