MOBAA (Multi-Omic Bicluster Association Analysis)

MOBAA is an R-based tool for discovering biologically meaningful subgroups within a population using multi-omics data. The method applies biclustering independently to each omics dataset (e.g., gene expression, protein abundance, DNA methylation) to detect locally coherent patterns of features and samples. It then integrates these biclusters across omic layers by quantifying sample overlap with the Jaccard index, grouping them into bicluster modules. Within each module, MOBAA identifies multi-omic relations, sets of biclusters from different omics that share the largest sample overlap, and tests their statistical significance using permutation-based methods. The resulting subgroups represent coherent, multi-layered molecular profiles that may correspond to disease subtypes, phenotypes, or other biologically relevant states. MOBAA also provides downstream analyses, including phenotype enrichment, inter-omics correlation, differential feature analysis, and pathway enrichment. Unlike many existing tools that integrate only two omics types, MOBAA is scalable to handle multiple omic layers simultaneously. This makes it a powerful framework for uncovering complex biological heterogeneity and advancing precision medicine research.

Example Workflow Using TCGA-BRCA Data

In this tutorial, we demonstrate how to use MOBAA with a multi-omic dataset from TCGA-BRCA. We’ll retrieve gene expression, methylation, and protein abundance data, and walk through the steps of biclustering, multi-omic integration and downstream analyses.

We begin by loading the necessary packages and downloading the BRCA dataset from TCGA. This includes RNA sequencing, methylation, and protein expression data.

This step may take a few minutes depending on your internet speed and disk space.

library(curatedTCGAData)
library(MultiAssayExperiment)

## Download BRCA dataset for this example analysis 
availableTCGA <- curatedTCGAData(dry.run = TRUE,version = "2.1.1")

brca_multi <- curatedTCGAData(
  diseaseCode = "BRCA",
  assays = c("RNASeq2GeneNorm", "Methylation_methyl450", "RPPAArray"),
  version = "2.1.1",
  dry.run = FALSE
)


rnaseq <- assays(brca_multi)[["BRCA_RNASeq2GeneNorm-20160128"]] 
rnaseq <- na.omit(rnaseq) 

prot <- assays(brca_multi)[["BRCA_RPPAArray-20160128"]]
prot <- na.omit(prot) 

meth <- as.matrix(assays(brca_multi)[["BRCA_Methylation_methyl450-20160128"]])
meth <- na.omit(meth)

clinical_data <- colData(brca_multi)
table(clinical_data$PAM50.mRNA)

# keep only those patients who have BRCA subtype info
clinical_data <- clinical_data[complete.cases(clinical_data$PAM50.mRNA), ] 
table(clinical_data$PAM50.mRNA)

Prepare data for mobaa

# extract patient IDs (the first 12 characters),
colnames(rnaseq) <- substr(colnames(rnaseq), 1, 12)
colnames(meth) <- substr(colnames(meth), 1, 12)
colnames(prot) <- substr(colnames(prot), 1, 12)

common.patients <- Reduce(intersect, list(colnames(rnaseq),colnames(meth), colnames(prot), clinical_data$patientID)) # 641
rnaseq_final <- rnaseq[,match(common.patients,colnames(rnaseq))]
meth_final <- meth[,match(common.patients,colnames(meth))]
prot_final <- prot[,match(common.patients,colnames(prot))]
clinical_final <- clinical_data[match(common.patients,clinical_data$patientID),]
clinical_final$patientID==colnames(rnaseq_final)
clinical_final$patientID==colnames(prot_final)
clinical_final$sampleID <- clinical_final$patientID # for the mobaa code to recognize
table(clinical_final$PAM50.mRNA)

# Keep only the 20000 most variable CpGs in the methylation data
row_vars <- apply(meth_final, 1, var, na.rm = TRUE)
top_var_indices <- order(row_vars, decreasing = TRUE)[1:20000]
meth_top20k <- as.matrix(meth_final[top_var_indices, ])

# standardize the data
# Adapted from:
# Rose, T. D. (2025). Example workflow (mosbi package vignette, Bioconductor).
# Retrieved April 15, 2025, from https://bioconductor.org/packages/devel/bioc/vignettes/mosbi/inst/doc/example-workflow.html

z_score <- function(x, margin = 2) {
    z_fun <- function(y) {
        (y - mean(y, na.rm = TRUE)) / sd(y, na.rm = TRUE)
    }
    if (margin == 2) {
        apply(x, margin, z_fun)
    } else if (margin == 1) {
        t(apply(x, margin, z_fun))
    }
}

rnaseq_zscore <- z_score(rnaseq_final)
prot_zscore <- z_score(prot_final)
dim(rnaseq_zscore)

## [1] 12652   179

dim(prot_zscore)

## [1] 216 179

dim(meth_top20k)

## [1] 20000   179

Install mobaa R package directly from github

remotes::install_github("pmishra912/MOBAA")
library(mobaa)

Or, install from local zip file provided in release/

remotes::install_local("mobaa.zip")
library(mobaa)

Run ensembel biclustering using a wrapper function for mosbi

prot_ensemble_bicluster_list <- find_ensemble_bc(
  prot_zscore,methods = c("fabia", "qubic"),
  seed = 1111
  )

## Esimated cut-off:  0.004004004

rnaseq_ensemble_bicluster_list <- find_ensemble_bc(
  rnaseq_zscore,methods = c("fabia", "qubic"),
  seed = 2222
  )

## Esimated cut-off:  0.002002002

meth_ensemble_bicluster_list <- find_ensemble_bc(
  meth_top20k,methods = c("fabia", "qubic"),
  seed = 3333
  )

## Esimated cut-off:  0.003003003

Calculate Jaccard distance among the biclusters

bc_dist_res <- generate_jaccard_dist(
  mosbi_outputs=list(prot_ensemble_bicluster_list,rnaseq_ensemble_bicluster_list,meth_ensemble_bicluster_list),
  omic_names=c("prot","rnaseq","meth"),
  omic_data=list(prot_zscore,rnaseq_zscore,meth_top20k)
  )

Hierarchical clustering of biclusters across different omics

bc_module_res <- hclust_bcs(bc_dist_res[[1]],plots=TRUE)

##  ..done.

# identified bicluster modules, named as colours.
bc_module_res$multiomic_modules

## $grey
## [1] "prot_2"   "prot_3"   "prot_6"   "prot_10"  "rnaseq_2" "rnaseq_6"
## 
## $turquoise
## [1] "prot_4"   "prot_5"   "prot_7"   "rnaseq_1" "meth_1"

Find the best multi-omic relation. For example, the grey bicluster module contains 4 proteomic biclusters and 2 rnaseq biclusters. The ‘find_relations’ function selects the best proteomic-rnaseq bicluster pair (relation) with maximum sample overlap.

# grey relation
grey_relation <- find_relations(
  biclusters_list=list(prot=prot_ensemble_bicluster_list,rnaseq=rnaseq_ensemble_bicluster_list),
  module="grey",
  pheno=clinical_final,
  bc_dist_obj=bc_dist_res,
  bc_hclust_obj=bc_module_res
  )

# turquoise relation
turquoise_relation <- find_relations(
  biclusters_list=list( meth=meth_ensemble_bicluster_list,rnaseq=rnaseq_ensemble_bicluster_list,prot=prot_ensemble_bicluster_list),
  module="turquoise",
  pheno=clinical_final,
  bc_dist_obj=bc_dist_res,
  bc_hclust_obj=bc_module_res
  )

How many patients are present in all the biclusters in each of the relations (patient overlap)?

# grey
patients_overlap_grey <- Reduce(intersect, grey_relation$sample_ids)
length(patients_overlap_grey)

## [1] 22

# turquouse
patients_overlap_turquoise <- Reduce(intersect, turquoise_relation$sample_ids)
length(patients_overlap_turquoise)

## [1] 41

Are the shared number of patients across biclusters in the relations statistically significant?

# grey
perm.overlap.sig(grey_relation$sample_ids,clinical_final$sampleID)

## [1] 1e-04

# turquouse
perm.overlap.sig(turquoise_relation$sample_ids,clinical_final$sampleID)

## [1] 0.5178

Check number of overlapped patients across different subtypes.

sum(patients_overlap_grey %in% clinical_final$sampleID[clinical_final$PAM50.mRNA=="Basal-like"])

## [1] 22

sum(patients_overlap_grey %in% clinical_final$sampleID[clinical_final$PAM50.mRNA=="HER2-enriched"])

## [1] 0

sum(patients_overlap_grey %in% clinical_final$sampleID[clinical_final$PAM50.mRNA=="Luminal A"])

## [1] 0

sum(patients_overlap_grey %in% clinical_final$sampleID[clinical_final$PAM50.mRNA=="Luminal B"])

## [1] 0

sum(patients_overlap_grey %in% clinical_final$sampleID[clinical_final$PAM50.mRNA=="Normal-like"])

## [1] 0

The participant overlap (n=22) is statistically significant in grey relation but not in turquoise. Also, all of the participants who are present in all the biclusters in grey relation (overlap) belong to ‘Basal-like’ subtype. Therefore, lets continue analyzing grey relation further.

Check patient distribution across different subtypes in the whole data

table(clinical_final$PAM50.mRNA)

## 
##    Basal-like HER2-enriched     Luminal A     Luminal B   Normal-like 
##            39            12            85            38             5

Is the proportion of patients with subtype ‘Basal-like’ in the grey relation greater than the background proportion?

prop.test(x = 22, n = 22, p = 39/179, alternative = "greater")

## 
##  1-sample proportions test with continuity correction
## 
## data:  22 out of 22, null probability 39/179
## X-squared = 74.451, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.2178771
## 95 percent confidence interval:
##  0.8531437 1.0000000
## sample estimates:
## p 
## 1

## or, in case of lower sample size, binom.test is the exact test and preferred
binom.test(x = 22, n = 22, p = 39/179, alternative = "greater")

## 
##  Exact binomial test
## 
## data:  22 and 22
## number of successes = 22, number of trials = 22, p-value = 2.758e-15
## alternative hypothesis: true probability of success is greater than 0.2178771
## 95 percent confidence interval:
##  0.8726946 1.0000000
## sample estimates:
## probability of success 
##                      1

Summary level (eigenfeatures) correlation between different omics in the grey relation.

corr_result <- correlate_eigenfeatures(
  omic_list = list(rnaseq = rnaseq_zscore,prot = prot_zscore),
  multiomic_relation = grey_relation,
  output_prefix = "bic_relation"
  )

# correlation among summary expression levels 
corr_result$correlation_matrix

##             prot    rnaseq
## prot    1.000000 -0.372775
## rnaseq -0.372775  1.000000

# statistical significance (p-values) of the correlation
corr_result$p_value_matrix

##              prot     rnaseq
## prot   0.00000000 0.08752067
## rnaseq 0.08752067 0.00000000

# visualize the correlation
corr_result$plots

## $prot_vs_rnaseq

In this example, there is no statistically significant (p < 0.05) correlation between eigengene (rnaseq PC1) and eigenprotein (prot PC1). This suggests that the relationship between the two omic feature sets might not be direct.

Next, lets analyze differential expression/methylation of omic features between patients in bicluster as compared to patients not in the bicluster

res <- analyze_multiomic_relation(
  omics_list=list(rnaseq=rnaseq_final,prot=prot_final), 
  multiomic_relation=grey_relation, 
  test = "t-test"
  )

res_rnaseq <- res$rnaseq
res_prot <- res$prot

res_rnaseq$p.adjust <- p.adjust(res_rnaseq$p_value,method="bonferroni")

# proportion of genes differentially expressed in the rnaseq bicluster
length(which(res_rnaseq$p.adjust<0.05))/length(grey_relation$features$rnaseq)

## [1] 1

# proportion of proteins differentially expressed in the protein bicluster
res_prot$p.adjust <- p.adjust(res_prot$p_value,method="bonferroni")
length(which(res_prot$p.adjust<0.05))/length(grey_relation$features$prot)

## [1] 0.9375

Visulatize selected features

plot_feature_boxplot(
  omics_list = list(rnaseq=rnaseq_zscore,prot=prot_zscore),
  result_list = res,
  multiomic_relation = grey_relation,
  omic_type = "rnaseq",rank = 1
  )

plot_feature_boxplot(
  omics_list = list(rnaseq=rnaseq_zscore,prot=prot_zscore),
  result_list = res,
  multiomic_relation = grey_relation,
  omic_type = "prot",rank = 1
  )

Patients in biclusters are referred as ‘related’ and those in the rest of the omic data are referred as ‘unrelated’.

Biological pathway enrichment analysis among the omic features in the grey multi-omic relation.

library(clusterProfiler)
library(org.Hs.eg.db)

gene_id_map <- mapIds(org.Hs.eg.db, rownames(rnaseq_final), 'ENTREZID', 'SYMBOL')
gene_id_map <- data.frame(symbol=names(gene_id_map),entrez=as.character(gene_id_map))

grey_gene_id_map <- mapIds(org.Hs.eg.db, grey_relation$features$rnaseq, 'ENTREZID', 'SYMBOL')
grey_gene_id_map <- data.frame(symbol=names(grey_gene_id_map),entrez=as.character(grey_gene_id_map))

reference.genes.entrez <- na.omit(gene_id_map$entrez)
grey_genes_entrez <- na.omit(grey_gene_id_map$entrez)

grey_genes_go_bp <- enrichGO(
  gene = grey_genes_entrez,
  universe = reference.genes.entrez, 
  OrgDb= org.Hs.eg.db, 
  ont = "BP", 
  pAdjustMethod = "bonferroni",
  pvalueCutoff  = 0.05,
  qvalueCutoff =0.01,
  readable = TRUE)

grey_genes_go_bp_tb <- grey_genes_go_bp@result 
head(grey_genes_go_bp_tb)

##                    ID
## GO:0006260 GO:0006260
## GO:0031146 GO:0031146
## GO:0045137 GO:0045137
## GO:0021904 GO:0021904
## GO:2000679 GO:2000679
## GO:0007548 GO:0007548
##                                                                        Description
## GO:0006260                                                         DNA replication
## GO:0031146 SCF-dependent proteasomal ubiquitin-dependent protein catabolic process
## GO:0045137                           development of primary sexual characteristics
## GO:0021904                                   dorsal/ventral neural tube patterning
## GO:2000679      positive regulation of transcription regulatory region DNA binding
## GO:0007548                                                     sex differentiation
##            GeneRatio   BgRatio RichFactor FoldEnrichment   zScore       pvalue
## GO:0006260    12/152 219/10069 0.05479452       3.629776 4.871089 0.0001159591
## GO:0031146     5/152  36/10069 0.13888889       9.200475 6.102083 0.0001896710
## GO:0045137     9/152 143/10069 0.06293706       4.169166 4.725298 0.0003066086
## GO:0021904     3/152  12/10069 0.25000000      16.560855 6.677180 0.0006714099
## GO:2000679     3/152  12/10069 0.25000000      16.560855 6.677180 0.0006714099
## GO:0007548     9/152 163/10069 0.05521472       3.657612 4.234864 0.0007968893
##             p.adjust    qvalue
## GO:0006260 0.2887381 0.2301008
## GO:0031146 0.4722807 0.2301008
## GO:0045137 0.7634555 0.2479764
## GO:0021904 1.0000000 0.3062832
## GO:2000679 1.0000000 0.3062832
## GO:0007548 1.0000000 0.3062832
##                                                                              geneID
## GO:0006260 CCNE1/CDK2AP1/DACH1/DNAJC2/DONSON/DSCC1/FAF1/GTPBP4/KIN/NASP/RFC4/TOP1MT
## GO:0031146                                            FBXL17/FBXL5/FBXO38/SKP1/SKP2
## GO:0045137                      BCL2/CBX2/DACH1/ESR1/FOXC1/GATA3/HSD17B4/NASP/WDR19
## GO:0021904                                                         BMP4/FOXA1/WDR19
## GO:2000679                                                          FOXC1/GATA3/RB1
## GO:0007548                      BCL2/CBX2/DACH1/ESR1/FOXC1/GATA3/HSD17B4/NASP/WDR19
##            Count
## GO:0006260    12
## GO:0031146     5
## GO:0045137     9
## GO:0021904     3
## GO:2000679     3
## GO:0007548     9

grey_genes_kegg <- enrichKEGG(
  gene = grey_genes_entrez,
  organism="hsa", 
  pAdjustMethod = "bonferroni",
  pvalueCutoff  = 0.05,
  qvalueCutoff =0.01)

grey_genes_kegg_tb <- grey_genes_kegg@result
head(grey_genes_kegg_tb)

##                    category                     subcategory       ID
## hsa05222     Human Diseases          Cancer: specific types hsa05222
## hsa01522     Human Diseases Drug resistance: antineoplastic hsa01522
## hsa05215     Human Diseases          Cancer: specific types hsa05215
## hsa04110 Cellular Processes           Cell growth and death hsa04110
## hsa05132     Human Diseases   Infectious disease: bacterial hsa05132
## hsa00450         Metabolism Metabolism of other amino acids hsa00450
##                        Description GeneRatio  BgRatio RichFactor FoldEnrichment
## hsa05222    Small cell lung cancer      6/77  93/9446 0.06451613       7.914537
## hsa01522      Endocrine resistance      5/77  99/9446 0.05050505       6.195723
## hsa05215           Prostate cancer      5/77 106/9446 0.04716981       5.786572
## hsa04110                Cell cycle      6/77 158/9446 0.03797468       4.658557
## hsa05132      Salmonella infection      7/77 251/9446 0.02788845       3.421224
## hsa00450 Selenocompound metabolism      2/77  17/9446 0.11764706      14.432391
##            zScore       pvalue   p.adjust     qvalue
## hsa05222 6.074756 0.0001046393 0.01852115 0.01729302
## hsa01522 4.711153 0.0012432225 0.22005039 0.07332655
## hsa05215 4.492663 0.0016854173 0.29831887 0.07332655
## hsa04110 4.204138 0.0017747827 0.31413654 0.07332655
## hsa05132 3.524485 0.0042505899 0.75235441 0.14049318
## hsa00450 5.025100 0.0082405316 1.00000000 0.22697604
##                                       geneID Count
## hsa05222         596/898/1163/1871/5925/6502     6
## hsa01522             596/1871/2099/3480/5925     5
## hsa05215              596/898/1871/3480/5925     5
## hsa04110         898/902/1871/5925/6500/6502     6
## hsa05132 57180/596/79443/5869/388/6500/23339     7
## hsa00450                         22929/51091     2