library(tidyverse)
library(MOFA2)
library(reshape2)
library(cowplot)
library(magrittr)

cols4diet <- c("#1f78b4", "#b2df8a")
names(cols4diet) <- c("fd", "bd")

cols4delivery <- c("#e6ab02", "#d95f02")
names(cols4delivery) <- c("Cesarean", "Vaginal")

1 Load data

The data for this vignette can be downloaded from here and placed in a data/ directory. You should then have the following files:

  • data/microbiome_data.csv containing the microbiome data used as input
  • data/microbiome_features_metadata.csv containing taxonomic information for the features in the model
  • data/microbiome_model.hdf5 containing the pre-trained MEFISTO model

The original data was published by Bokulich et al. Note that the data used in this vignette slightly differs from the preprint on MEFISTO. We first load the dataframe that contains the preprocessed microbiome data for all children (groups) as well as the time annotation (month of life) for each sample.

microbiome <- read.csv("data/microbiome_data.csv")
head(microbiome)
##   group month                          feature      value       view sample
## 1  C001     0 ac5402de1ddf427ab8d2b0a8a0a44f19  0.6160216 microbiome C001_0
## 2  C001     0 2a2947125c677c6e27898ad4e9b9dca7         NA microbiome C001_0
## 3  C001     0 0cc2420a6a4698f8bf664d50b17d26b4         NA microbiome C001_0
## 4  C001     0 651794369aeb3db83839b81fe49c8b4e         NA microbiome C001_0
## 5  C001     0 e6a34eb113dba66df0b8bbec907a8f5d -0.4163788 microbiome C001_0
## 6  C001     0 79280cea51a6fe8a3432b2f266dd34db -1.1685587 microbiome C001_0
##   delivery diet    sex
## 1  Vaginal   bd Female
## 2  Vaginal   bd Female
## 3  Vaginal   bd Female
## 4  Vaginal   bd Female
## 5  Vaginal   bd Female
## 6  Vaginal   bd Female
feature_meta <- read.csv("data/microbiome_features_metadata.csv")
head(feature_meta)
##                           SampleID
## 1 ac5402de1ddf427ab8d2b0a8a0a44f19
## 2 2a2947125c677c6e27898ad4e9b9dca7
## 3 0cc2420a6a4698f8bf664d50b17d26b4
## 4 3d9838f12f6ff5591dbadeb427a855f1
## 5 651794369aeb3db83839b81fe49c8b4e
## 6 e6a34eb113dba66df0b8bbec907a8f5d
##                                                                                                             Taxon
## 1         k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__
## 2            k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__[Tissierellaceae]; g__WAL_1855D; s__
## 3 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__onderdonkii
## 4         k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__
## 5          k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__
## 6  k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__putredinis
##   Confidence
## 1  0.9997703
## 2  1.0000000
## 3  0.9680367
## 4  0.8596347
## 5  0.9955504
## 6  0.9741591

2 Prepare and train MOFA

Create the MOFA object

First, we need to create a MOFA object from this data. This step is analogous to use of MOFA without the time information and can be done using create_mofa, which results in an untrained MOFA object.

# create the MOFA object and add times as covariate
MOFAobject_untrained <- create_mofa(data = microbiome)
MOFAobject_untrained
## Untrained MOFA model with the following characteristics: 
##  Number of views: 1 
##  Views names: microbiome 
##  Number of features (per view): 969 
##  Number of groups: 43 
##  Groups names: C001 C002 C004 C005 C007 C008 C009 C010 C011 C012 C014 C016 C017 C018 C020 C021 C022 C023 C024 C025 C027 C030 C031 C032 C033 C034 C035 C036 C037 C038 C041 C042 C043 C044 C045 C046 C047 C049 C052 C053 C055 C056 C057 
##  Number of samples (per group): 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 
## 

Next, we want to add the time information for each sample, which we can do using set_covariates. As the information on the month for each sample is already contained in the data.frame that we passed to the MOFAobject in create_mofa, we can just specify to use this column as covariate. Alternatively, we could also supply a new matrix or data frame providing the sample names and covariates. See ?set_covariates.

head(samples_metadata(MOFAobject_untrained))
##        sample group month delivery diet    sex
## C001_0 C001_0  C001     0  Vaginal   bd Female
## C001_1 C001_1  C001     1  Vaginal   bd Female
## C001_2 C001_2  C001     2  Vaginal   bd Female
## C001_3 C001_3  C001     3  Vaginal   bd Female
## C001_4 C001_4  C001     4  Vaginal   bd Female
## C001_5 C001_5  C001     5  Vaginal   bd Female
MOFAobject_untrained <- set_covariates(MOFAobject_untrained,
                                       covariates = "month")
MOFAobject_untrained
## Untrained MEFISTO model with the following characteristics: 
##  Number of views: 1 
##  Views names: microbiome 
##  Number of features (per view): 969 
##  Number of groups: 43 
##  Groups names: C001 C002 C004 C005 C007 C008 C009 C010 C011 C012 C014 C016 C017 C018 C020 C021 C022 C023 C024 C025 C027 C030 C031 C032 C033 C034 C035 C036 C037 C038 C041 C042 C043 C044 C045 C046 C047 C049 C052 C053 C055 C056 C057 
##  Number of samples (per group): 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 
##  Number of covariates per sample: 1 
## 

Show data and time covariate Before moving towards the training, let’s first take a look at the data in the untrained object. The plot shows for children (groups) which time points are available. Note that there are many missing time points for several children in this data.

gg_input <- plot_data_overview(MOFAobject_untrained,
                               show_covariate = FALSE,
                               show_dimensions = FALSE) 
gg_input

Prepare the MOFA object Next, we prepare the model for training. For this we specify the different options. Similar to the standard MOFA workflow we define data, model and training options. Take a look at get_default_data_options, get_default_model_options and get_default_training_options to see what options can be specified. Here, we specify that we want to use 2 factors and set a seed for reproducibility. In addition we do not center the data from individual groups as the data was already centered across all samples. This can be preferably if differences between groups should be maintained and the feature levels are comparable between groups.

In addition, for MEFISTO we can specify MEFISTO-specific options, see get_default_mefisto_options. Here, we specify the optimisation frequency and number of grid points which can speed up inference. For a fast inference, we could also specify to not model inter-group relationships which can be slow in the presence of many groups.

data_opts <- get_default_data_options(MOFAobject_untrained)
data_opts$center_groups <- FALSE

model_opts <- get_default_model_options(MOFAobject_untrained)
model_opts$num_factors <- 2

mefisto_opts <- get_default_mefisto_options(MOFAobject_untrained)
mefisto_opts$n_grid <- 10
mefisto_opts$start_opt <- 50
mefisto_opts$opt_freq <- 50
# mefisto_opts$model_groups <- FALSE # fast option (no optimization of group relationships) 
mefisto_opts$new_values <- matrix(0:24, nrow =1) # set time points to interpolate factors to

train_opts <- get_default_training_options(MOFAobject_untrained)
train_opts$seed <- 2020

We then pass all these options to the object.

MOFAobject_untrained <- prepare_mofa(
  object = MOFAobject_untrained,
  data_options = data_opts,
  model_options = model_opts,
  training_options = train_opts,
  mefisto_options = mefisto_opts
) 

Train the MOFA model Now, we are ready to train the model.

outfile <- "data/microbiome_model.hdf5"
# MOFAobject <- run_mofa(MOFAobject_untrained, outfile = outfile)

For all further analysis we will use a pre-trained model. In addition to the factor values at the observed data points we can optionally load the interpolated data.

MOFAobject <- load_model(outfile, load_interpol_Z = TRUE)

The pretrained object does not yet contain the samples meta information, so we add this and the information on the taxonomy of the features to the model’s metadata and use more readable feature names based on the taxonomy.

samples_metadata(MOFAobject) <- left_join(samples_metadata(MOFAobject),
                                          samples_metadata(MOFAobject_untrained),
                                          by = c("sample", "group","month"))

features_metadata(MOFAobject) <- features_metadata(MOFAobject) %>%
  left_join(feature_meta, by = c("feature" = "SampleID")) %>%
  separate(col = Taxon, sep = ";",
           into = c("kingdom", "phylum", "class",
                    "order", "family", "genus", "species"),
           remove = FALSE)
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 163 rows [12,
## 27, 55, 71, 76, 83, 84, 91, 94, 96, 108, 109, 114, 130, 132, 141, 146, 157, 160,
## 173, ...].

3 Factor overview and visualization

3.1 Variance decomposition and factor correlation

To obtain a first overview of the factors we can take a look at the variance that a factor explains in each child.

p <- plot_variance_explained(MOFAobject, plot_total = T, x = "group", split_by = "view")
p[[1]] + theme(axis.text.x = element_blank()) + xlab("child")

p[[2]] + theme(axis.text.x = element_blank()) + xlab("child")