Recurrently differentially activated domains (conserved regions)

rm(list=ls())

if(!require(DADo))
  devtools::install_github("CSOgroup/DADo", subdir="DADo")
#> Loading required package: DADo
  # alternatively: 
  # install.packages("DADo_0.0.0.1.tar.gz", repos = NULL, type ="source")
 # data("norm_ID")
library(DADo)

Retrieve recurrently differentially activated regions

data("allSignif_dt") # this loads allSignif_dt
head(allSignif_dt)
#>                                    dataset       region
#> 1514 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr6_TAD633
#> 121  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr1_TAD551
#> 302  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas chr11_TAD238
#> 344  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas chr11_TAD384
#> 692  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas   chr16_TAD6
#> 932  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas chr19_TAD244

data("allSignif_g2t_dt") # this loads allSignif_g2t_dt
head(allSignif_g2t_dt)
#>                                    dataset entrezID chromo    start
#> 147  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas    51182  chr10 14880159
#> 149  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas    79723  chr10 14920782
#> 150  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas    64421  chr10 14948870
#> 2671 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas    10647  chr11 62009102
#> 2672 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas     4250  chr11 62037630
#> 2675 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas    80150  chr11 62104774
#>           end       region  symbol
#> 147  14913740  chr10_TAD67  HSPA14
#> 149  14946314  chr10_TAD67 SUV39H2
#> 150  14996106  chr10_TAD67 DCLRE1C
#> 2671 62012280 chr11_TAD238 SCGB1D2
#> 2672 62040628 chr11_TAD238 SCGB2A2
#> 2675 62160887 chr11_TAD238  ASRGL1


data("allSignif_tad_pos_dt") # this loads allSignif_dt
head(allSignif_tad_pos_dt)
#>                                    dataset chromo       region     start
#> 68   Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr10  chr10_TAD67  14880001
#> 821  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr11 chr11_TAD238  61920001
#> 971  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr11 chr11_TAD384  95720001
#> 996  Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr11 chr11_TAD409 102160001
#> 1006 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr11 chr11_TAD419 104680001
#> 2364 Barutcu_MCF-10A_40kb/TCGAbrca_lum_bas  chr14 chr14_TAD227  74360001
#>            end
#> 68    15040000
#> 821   62160000
#> 971   96160000
#> 996  102560000
#> 1006 105080000
#> 2364  74560000

conserv_minIntersectGenes <- 3
conserv_geneMatching <- 0.8
conserv_minOverlapBp <- 0.8

signif_conserv_data <- get_conservedRegion(
  signif_dt=allSignif_dt, all_tad_pos_dt=allSignif_tad_pos_dt, all_g2t_dt=allSignif_g2t_dt, 
  minOverlapBpRatio=conserv_minOverlapBp,
  minIntersectGenes=conserv_minIntersectGenes,
  gene_matching_fuse_threshold = conserv_geneMatching,
  verbose=FALSE
)
save(signif_conserv_data, file="package_signif_conserv_data.Rdata", version=2)

Overall conservation

library(ggplot2)

minConserv <- 2

conserved_dt <- data.frame(
      region = names(signif_conserv_data[["conserved_signif_tads"]]),
      nConserv = as.numeric(lengths(signif_conserv_data[["conserved_signif_tads"]])),
      stringsAsFactors=FALSE)
stopifnot(conserved_dt$nConserv >= 2)

conserved_dt <- conserved_dt[order(conserved_dt$nConserv),]
conserved_dt$region <- factor(conserved_dt$region, levels=as.character(conserved_dt$region))
conserved_dt$region_rank <- as.numeric(conserved_dt$region)
conserved_dt <- conserved_dt[conserved_dt$nConserv >= minConserv,]

nCons <- nrow(conserved_dt)

my_ylab <- "# datasets conserved"
my_xlab <- paste0("Ranked conserved regions (", nCons, " with >= ", minConserv, " DS cons.)")
myTit <- "Signif. conserved regions"
subTit <- paste0("conserv. match ratio bp >= ", conserv_minOverlapBp, " and intersect genes >= ", conserv_minIntersectGenes)

ggplot(conserved_dt, aes(x=region_rank, y=nConserv)) +
  ggtitle(myTit, subtitle = subTit)+
  geom_bar(stat="identity", color="darkblue", fill="gray80") +
  scale_y_continuous(name=my_ylab, breaks= scales::pretty_breaks(n = 5))+
  scale_x_continuous(name=my_xlab, breaks= noZero_breaks(n=5), expand=c(0,0))+
  labs(x=my_xlab, y=my_ylab, color="", fill="")+
   theme(
  panel.grid.major.y =  element_line(colour = "grey", size = 0.5, linetype=1),
  panel.grid.minor.y =  element_line(colour = "grey", size = 0.5, linetype=1),
  panel.background = element_rect(fill = "transparent"),
    panel.grid.major.x =  element_blank(),
    panel.grid.minor.x  =  element_blank(),
  axis.title.x = element_text(size=14, hjust=0.5, vjust=0.5),
  axis.title.y = element_text(size=14, hjust=0.5, vjust=0.5),
  axis.text.y = element_text(size=12, hjust=0.5, vjust=0.5),
  axis.text.x = element_text(size=12, hjust=0.5, vjust=0.5),
        axis.line.x = element_line(),
  plot.title = element_text(hjust=0.5, size = 16, face="bold"),
  plot.subtitle = element_text(hjust=0.5, size = 14, face="italic"),
  legend.title = element_text(face="bold"),
        legend.text = element_text(size=12),
)

plot of chunk plot_barplot

Heatmap

library(reshape2)

myTit <- "Conserved regions across datasets"

# transform and plot to a binary matrix
all_datasets <- as.character(unique(allSignif_dt$dataset))

dsByReg_dt <- do.call(rbind, lapply(signif_conserv_data[["conserved_signif_tads"]], function(x) as.numeric(all_datasets %in% unique(dirname(x)))))
colnames(dsByReg_dt) <- all_datasets
stopifnot(max(rowSums(dsByReg_dt)) == max(conserved_dt$nConserv))

plot_dt <- dsByReg_dt[names(sort(rowSums(dsByReg_dt), decreasing = TRUE)),names(sort(colSums(dsByReg_dt), decreasing = TRUE))]
plot_dt_m <- melt(plot_dt)
plot_dt_m$Var1 <- factor(plot_dt_m$Var1, levels=names(sort(rowSums(dsByReg_dt), decreasing = TRUE)))
stopifnot(!is.na(plot_dt_m$Var1))
plot_dt_m$Var2 <- factor(plot_dt_m$Var2, levels=names(sort(colSums(dsByReg_dt), decreasing = TRUE)))
stopifnot(!is.na(plot_dt_m$Var2))
plot_dt_m$value <- as.character(plot_dt_m$value)

ggplot(plot_dt_m, aes(x=Var2, y=Var1, fill=value)) + 
  ggtitle(myTit)+
  geom_raster()+
  labs(x="Ranked datasets", y="Ranked conserved regions")+
  guides(fill=F)+
  scale_fill_manual(values=setNames(c("black", "white"), c("1", "0")))+
  theme(
    plot.title=element_text(size=14, face="bold", hjust=0.5),
    axis.text = element_blank()
  )

plot of chunk plot_heatmap

Conserved region

region_to_plot <- conserved_dt$region[conserved_dt$nConserv == max(conserved_dt$nConserv)][1]
stopifnot(region_to_plot %in% names(signif_conserv_data[["conserved_signif_intersect_genes"]]))
symbols_to_plot <- signif_conserv_data[["conserved_signif_intersect_genes"]][[paste0(region_to_plot)]]
stopifnot(region_to_plot %in% names(signif_conserv_data[["conserved_signif_tads"]]))
tads_to_plot <- signif_conserv_data[["conserved_signif_tads"]][[paste0(region_to_plot)]]
stopifnot(symbols_to_plot %in% allSignif_g2t_dt$symbol)

genes_plot_dt <- allSignif_g2t_dt[allSignif_g2t_dt$symbol %in% symbols_to_plot,c("symbol", "start", "end", "chromo")]
genes_plot_dt <- unique(genes_plot_dt)
tads_plot_dt <- data.frame(dataset = dirname(tads_to_plot),
                           region = basename(tads_to_plot), regID = tads_to_plot,
                           stringsAsFactors=FALSE)
allSignif_tad_pos_dt$regID <- file.path(allSignif_tad_pos_dt$dataset, allSignif_tad_pos_dt$region)
stopifnot( tads_to_plot %in% allSignif_tad_pos_dt$regID)
allSignif_tad_pos_dt$regID <- file.path(allSignif_tad_pos_dt$dataset, allSignif_tad_pos_dt$region)
tads_plot_dt <- allSignif_tad_pos_dt[allSignif_tad_pos_dt$regID %in% tads_to_plot,]
tads_plot_dt <- unique(tads_plot_dt)
tads_plot_dt$dsCat <- 1 # draw all 
tads_plot_dt$cond1 <- gsub("TCGA.+_(.+)_.+", "\\1", basename(tads_plot_dt$dataset))
tads_plot_dt$cond2 <- gsub("TCGA.+_.+_(.+)", "\\1", basename(tads_plot_dt$dataset))
tads_plot_dt$dataset <- gsub("_40kb", "", dirname(as.character(tads_plot_dt$dataset)))
plot_conservedRegions(genes_dt=genes_plot_dt, 
                      tads_dt=tads_plot_dt)

plot of chunk plot_conserved