library(tidyverse)
library(MOFA2)
library(reshape2)
library(cowplot)
library(magrittr)
cols4diet <- c("#1f78b4", "#b2df8a")
names(cols4diet) <- c("fd", "bd")
cols4delivery <- c("#e6ab02", "#d95f02")
names(cols4delivery) <- c("Cesarean", "Vaginal")
The data for this vignette can be downloaded from here and placed in a data/
directory. You should then have the following files:
data/microbiome_data.csv
containing the microbiome data used as inputdata/microbiome_features_metadata.csv
containing taxonomic information for the features in the modeldata/microbiome_model.hdf5
containing the pre-trained MEFISTO modelThe original data was published by Bokulich et al. Note that the data used in this vignette slightly differs from the preprint on MEFISTO. We first load the dataframe that contains the preprocessed microbiome data for all children (groups) as well as the time annotation (month of life) for each sample.
microbiome <- read.csv("data/microbiome_data.csv")
head(microbiome)
## group month feature value view sample
## 1 C001 0 ac5402de1ddf427ab8d2b0a8a0a44f19 0.6160216 microbiome C001_0
## 2 C001 0 2a2947125c677c6e27898ad4e9b9dca7 NA microbiome C001_0
## 3 C001 0 0cc2420a6a4698f8bf664d50b17d26b4 NA microbiome C001_0
## 4 C001 0 651794369aeb3db83839b81fe49c8b4e NA microbiome C001_0
## 5 C001 0 e6a34eb113dba66df0b8bbec907a8f5d -0.4163788 microbiome C001_0
## 6 C001 0 79280cea51a6fe8a3432b2f266dd34db -1.1685587 microbiome C001_0
## delivery diet sex
## 1 Vaginal bd Female
## 2 Vaginal bd Female
## 3 Vaginal bd Female
## 4 Vaginal bd Female
## 5 Vaginal bd Female
## 6 Vaginal bd Female
feature_meta <- read.csv("data/microbiome_features_metadata.csv")
head(feature_meta)
## SampleID
## 1 ac5402de1ddf427ab8d2b0a8a0a44f19
## 2 2a2947125c677c6e27898ad4e9b9dca7
## 3 0cc2420a6a4698f8bf664d50b17d26b4
## 4 3d9838f12f6ff5591dbadeb427a855f1
## 5 651794369aeb3db83839b81fe49c8b4e
## 6 e6a34eb113dba66df0b8bbec907a8f5d
## Taxon
## 1 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__
## 2 k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__[Tissierellaceae]; g__WAL_1855D; s__
## 3 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__onderdonkii
## 4 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__
## 5 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__
## 6 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__putredinis
## Confidence
## 1 0.9997703
## 2 1.0000000
## 3 0.9680367
## 4 0.8596347
## 5 0.9955504
## 6 0.9741591
Create the MOFA object
First, we need to create a MOFA object from this data. This step is analogous to use of MOFA without the time information and can be done using create_mofa
, which results in an untrained MOFA object.
# create the MOFA object and add times as covariate
MOFAobject_untrained <- create_mofa(data = microbiome)
MOFAobject_untrained
## Untrained MOFA model with the following characteristics:
## Number of views: 1
## Views names: microbiome
## Number of features (per view): 969
## Number of groups: 43
## Groups names: C001 C002 C004 C005 C007 C008 C009 C010 C011 C012 C014 C016 C017 C018 C020 C021 C022 C023 C024 C025 C027 C030 C031 C032 C033 C034 C035 C036 C037 C038 C041 C042 C043 C044 C045 C046 C047 C049 C052 C053 C055 C056 C057
## Number of samples (per group): 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
##
Next, we want to add the time information for each sample, which we can do using set_covariates
. As the information on the month for each sample is already contained in the data.frame that we passed to the MOFAobject in create_mofa
, we can just specify to use this column as covariate. Alternatively, we could also supply a new matrix or data frame providing the sample names and covariates. See ?set_covariates
.
head(samples_metadata(MOFAobject_untrained))
## sample group month delivery diet sex
## C001_0 C001_0 C001 0 Vaginal bd Female
## C001_1 C001_1 C001 1 Vaginal bd Female
## C001_2 C001_2 C001 2 Vaginal bd Female
## C001_3 C001_3 C001 3 Vaginal bd Female
## C001_4 C001_4 C001 4 Vaginal bd Female
## C001_5 C001_5 C001 5 Vaginal bd Female
MOFAobject_untrained <- set_covariates(MOFAobject_untrained,
covariates = "month")
MOFAobject_untrained
## Untrained MEFISTO model with the following characteristics:
## Number of views: 1
## Views names: microbiome
## Number of features (per view): 969
## Number of groups: 43
## Groups names: C001 C002 C004 C005 C007 C008 C009 C010 C011 C012 C014 C016 C017 C018 C020 C021 C022 C023 C024 C025 C027 C030 C031 C032 C033 C034 C035 C036 C037 C038 C041 C042 C043 C044 C045 C046 C047 C049 C052 C053 C055 C056 C057
## Number of samples (per group): 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
## Number of covariates per sample: 1
##
Show data and time covariate Before moving towards the training, let’s first take a look at the data in the untrained object. The plot shows for children (groups) which time points are available. Note that there are many missing time points for several children in this data.
gg_input <- plot_data_overview(MOFAobject_untrained,
show_covariate = FALSE,
show_dimensions = FALSE)
gg_input
Prepare the MOFA object
Next, we prepare the model for training. For this we specify the different options. Similar to the standard MOFA workflow we define data, model and training options. Take a look at get_default_data_options
, get_default_model_options
and get_default_training_options
to see what options can be specified. Here, we specify that we want to use 2 factors and set a seed for reproducibility. In addition we do not center the data from individual groups as the data was already centered across all samples. This can be preferably if differences between groups should be maintained and the feature levels are comparable between groups.
In addition, for MEFISTO we can specify MEFISTO-specific options, see get_default_mefisto_options
. Here, we specify the optimisation frequency and number of grid points which can speed up inference. For a fast inference, we could also specify to not model inter-group relationships which can be slow in the presence of many groups.
data_opts <- get_default_data_options(MOFAobject_untrained)
data_opts$center_groups <- FALSE
model_opts <- get_default_model_options(MOFAobject_untrained)
model_opts$num_factors <- 2
mefisto_opts <- get_default_mefisto_options(MOFAobject_untrained)
mefisto_opts$n_grid <- 10
mefisto_opts$start_opt <- 50
mefisto_opts$opt_freq <- 50
# mefisto_opts$model_groups <- FALSE # fast option (no optimization of group relationships)
mefisto_opts$new_values <- matrix(0:24, nrow =1) # set time points to interpolate factors to
train_opts <- get_default_training_options(MOFAobject_untrained)
train_opts$seed <- 2020
We then pass all these options to the object.
MOFAobject_untrained <- prepare_mofa(
object = MOFAobject_untrained,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts,
mefisto_options = mefisto_opts
)
Train the MOFA model Now, we are ready to train the model.
outfile <- "data/microbiome_model.hdf5"
# MOFAobject <- run_mofa(MOFAobject_untrained, outfile = outfile)
For all further analysis we will use a pre-trained model. In addition to the factor values at the observed data points we can optionally load the interpolated data.
MOFAobject <- load_model(outfile, load_interpol_Z = TRUE)
The pretrained object does not yet contain the samples meta information, so we add this and the information on the taxonomy of the features to the model’s metadata and use more readable feature names based on the taxonomy.
samples_metadata(MOFAobject) <- left_join(samples_metadata(MOFAobject),
samples_metadata(MOFAobject_untrained),
by = c("sample", "group","month"))
features_metadata(MOFAobject) <- features_metadata(MOFAobject) %>%
left_join(feature_meta, by = c("feature" = "SampleID")) %>%
separate(col = Taxon, sep = ";",
into = c("kingdom", "phylum", "class",
"order", "family", "genus", "species"),
remove = FALSE)
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 163 rows [12,
## 27, 55, 71, 76, 83, 84, 91, 94, 96, 108, 109, 114, 130, 132, 141, 146, 157, 160,
## 173, ...].
To obtain a first overview of the factors we can take a look at the variance that a factor explains in each child.
p <- plot_variance_explained(MOFAobject, plot_total = T, x = "group", split_by = "view")
p[[1]] + theme(axis.text.x = element_blank()) + xlab("child")
p[[2]] + theme(axis.text.x = element_blank()) + xlab("child")
To investigate the inferred factors, we can plot them against the months of life and colour them by the metadata of the samples. Here, we color by delivery mode and diet.
# Factor 1 coloured by delivery
plot_factors_vs_cov(MOFAobject, color_by = "delivery", factor = 1) +
scale_fill_manual(values = cols4delivery) +
stat_summary(aes(col = color_by), geom="line", fun.y = "mean") +
scale_color_manual(values = cols4delivery)
## Warning: `fun.y` is deprecated. Use `fun` instead.
# Factor 2 coloured by diet
plot_factors_vs_cov(MOFAobject, color_by = "diet", factor = 2) +
scale_fill_manual(values = cols4diet) +
stat_summary(aes(col = color_by), geom="line", fun.y = "mean") +
scale_color_manual(values = cols4diet)
## Warning: `fun.y` is deprecated. Use `fun` instead.
Using the underlying Gaussian process for each factor we can interpolate to unseen time points for children that are missing data in these time points and intermediate time points. If you want to show uncertainties, set only_mean to FALSE. Note that these are only available if the new values for interpolation have been specified before training.
plot_interpolation_vs_covariate(MOFAobject,
only_mean = FALSE, factor = 1) +
facet_wrap(~group)
plot_interpolation_vs_covariate(MOFAobject,
only_mean = FALSE, factor = 2) +
facet_wrap(~group)
We can also look at the factor values on the sample level. Here each dot correspond to one time-point-child combination.
gg_scatter <- plot_grid(
plot_factors(MOFAobject, color_by = "delivery") +
theme(legend.position = "top") +
scale_fill_manual(values = cols4delivery),
plot_factors(MOFAobject, color_by = "diet") +
theme(legend.position = "top") +
scale_fill_manual(values = cols4diet),
plot_factors(MOFAobject, color_by = "month") +
theme(legend.position = "top"),
nrow = 1, align = "h", axis = "tb")
gg_scatter
Next we have a look at the microbial species associated with the factors, focusing on Factor 1. For this we have a look at the weights of the factor.
Let’s first have a look at the top positive and top negative species on factor 1. We find top negative weights for species of the genera Bacteroides and Faecalibacterium, meaning that their abundance varies in line with the negative of Factor 1, increasing over the first year of life and with higher abundance in vaginally delivered children.
# top negative ones (more dominant in vaginal)
get_weights(MOFAobject, factors = 1, as.data.frame = TRUE) %>%
arrange(value) %>% head(20) %>%
left_join(., features_metadata(MOFAobject), by = c("feature", "view")) %>%
select(feature, family, genus, species, value)
## feature family
## 1 c4f9ef34bd2919511069f409c25de6f1 f__Bacteroidaceae
## 2 010f0ac2691bc0be12d0633d4b5d2cc4 f__Ruminococcaceae
## 3 ac5402de1ddf427ab8d2b0a8a0a44f19 f__Bacteroidaceae
## 4 40c85a458df8d5a935122ba778d4f334 f__Bacteroidaceae
## 5 8937656c16c20701c107e715bad86732 f__Lachnospiraceae
## 6 2a99ec1157a90661db7ff643b82f1914 f__Bacteroidaceae
## 7 a66dd8d31584930c2b0f811f572dddcf f__Porphyromonadaceae
## 8 79280cea51a6fe8a3432b2f266dd34db f__Ruminococcaceae
## 9 2403a466dd6d1e4f551d80394c2aa994 f__[Paraprevotellaceae]
## 10 b0645cbec7653c887cb868a248d95874 f__Ruminococcaceae
## 11 e91f1c5b67ffecb5ca3ca3b393a50f00 f__
## 12 ea2b0e4a93c24c6c3661cbe347f93b74 f__Bacteroidaceae
## 13 697b3fb29268cba4b2a9eef0eab2de7e f__Ruminococcaceae
## 14 0cc2420a6a4698f8bf664d50b17d26b4 f__Rikenellaceae
## 15 f9b52205cb2d03153fd394a290212a10 f__Bacteroidaceae
## 16 d39f37b814ebd81091a949f8bd9d1710 f__Bacteroidaceae
## 17 5ae365ca527c17c22171f54421e75197 f__Bacteroidaceae
## 18 2364e23d05750415070f8df3ccb0c95b f__Bacteroidaceae
## 19 c162a4f3943238810eba8a25f0563cca f__Bacteroidaceae
## 20 e0fdbc7f687a7ea0cf13009c2e501197 f__
## genus species value
## 1 g__Bacteroides s__ -2.022511
## 2 g__Faecalibacterium s__prausnitzii -2.004185
## 3 g__Bacteroides s__ -1.911058
## 4 g__Bacteroides s__ -1.682951
## 5 g__Roseburia s__faecis -1.623216
## 6 g__Bacteroides s__fragilis -1.560774
## 7 g__Parabacteroides s__distasonis -1.547396
## 8 g__Faecalibacterium s__prausnitzii -1.533220
## 9 g__Paraprevotella s__ -1.344230
## 10 g__Faecalibacterium s__prausnitzii -1.308934
## 11 g__ s__ -1.197169
## 12 g__Bacteroides s__uniformis -1.192211
## 13 g__Faecalibacterium s__prausnitzii -1.157351
## 14 g__Alistipes s__onderdonkii -1.084373
## 15 g__Bacteroides s__ -1.081778
## 16 g__Bacteroides s__caccae -1.077649
## 17 g__Bacteroides s__eggerthii -1.044467
## 18 g__Bacteroides s__ -1.040409
## 19 g__Bacteroides <NA> -1.021549
## 20 g__ s__ -1.005623
# top positive ones (more dominant in C-section)
get_weights(MOFAobject, factors = 1, as.data.frame = TRUE) %>%
arrange(-value) %>% head(20) %>%
left_join(., features_metadata(MOFAobject), by = c("feature", "view")) %>%
select(feature, family, genus, species, value)
## feature family genus
## 1 aee0d6f2626271eacf89d2e803440657 f__S24-7 g__
## 2 0e637c81e25ffbf7b7e68a8fbcc4f2ea f__Alcaligenaceae g__Sutterella
## 3 7281d8476d29dd24b3e674ce9cd9e3b0 f__Ruminococcaceae g__Clostridium
## 4 3df77157a242066dfb4eb9c19062436a f__Corynebacteriaceae g__Corynebacterium
## 5 04195686f2b70585790ec75320de0d6f f__Enterobacteriaceae <NA>
## 6 d6bc69828ae0ef27e99c01c9e7df86b0 f__Lachnospiraceae g__Blautia
## 7 8e6e925baa81466352ff781a289e188d f__Clostridiaceae g__Clostridium
## 8 904f7b1ae29e6a78ff83eacd9ef9cda7 f__Lachnospiraceae g__
## 9 bae0d49b5eb22b0dfaecbcbf39a52265 f__Clostridiaceae g__Clostridium
## 10 f5eeebc67e28fc2cf0b75e4e8ea38647 f__Ruminococcaceae g__Ruminococcus
## 11 eaea5d7b84c08fdef5d8ab40f707e37e f__Lactobacillaceae g__Lactobacillus
## 12 c18afe570abfe82d2f746ecc6e291bab f__Enterobacteriaceae <NA>
## 13 0c4204a0e15fccc22a7456685898cd5c f__Enterobacteriaceae <NA>
## 14 a4dbf71f32a016e120fe6cf253dfa6c0 f__Porphyromonadaceae g__Dysgonomonas
## 15 536018032c5d0d65e6d25d3988f87421 f__Actinomycetaceae g__Varibaculum
## 16 f4b84d33e7e992fdcc68ce666f80d90a f__Lachnospiraceae <NA>
## 17 6ed7152332889b69d2d2f33c53b68934 f__Lactobacillaceae g__Lactobacillus
## 18 14791dac5b7bdbe605f730517ab343da f__Lachnospiraceae g__Blautia
## 19 6961c6244fb505547380d15bfee162a9 f__Lachnospiraceae g__[Ruminococcus]
## 20 6a5f9d9d0fbc066a57bb7798f9f371ff f__Clostridiaceae g__Clostridium
## species value
## 1 s__ 1.6834583
## 2 s__ 1.3481336
## 3 s__methylpentosum 1.3166047
## 4 s__ 1.2640855
## 5 <NA> 1.2509283
## 6 s__producta 1.2093829
## 7 s__neonatale 1.1328926
## 8 s__ 1.1137318
## 9 s__paraputrificum 1.0947469
## 10 s__ 1.0702765
## 11 s__iners 1.0405345
## 12 <NA> 1.0363628
## 13 <NA> 1.0283349
## 14 s__ 1.0095971
## 15 s__ 1.0049116
## 16 <NA> 1.0041819
## 17 s__zeae 0.9811867
## 18 s__ 0.9524899
## 19 s__gnavus 0.9515983
## 20 <NA> 0.9305485
We can also take a look at the data for the top features on the factor:
plot_data_vs_cov(MOFAobject, features = 3,
color_by = "delivery") +
scale_fill_manual(values = cols4delivery)
We now aggregate the weights on the genus level.
df_weights1 <- get_weights(MOFAobject, as.data.frame = T, factors = 1) %>%
left_join(., features_metadata(MOFAobject), by = c("feature", "view"))
# filter out genera with missing genera information and shorten name
df_weights1 %<>% filter(!is.na(genus), genus != "g__")
df_weights1 %<>% mutate(genus = gsub("g__","", genus))
df_weights1 %<>% mutate(genus = ifelse(genus == " Pseudoramibacter_Eubacterium",
"Pseudoramibacter_Eub.", genus))
df_weights1 %<>% mutate(genus = gsub("\\[","", genus))
df_weights1 %<>% mutate(genus = gsub("\\]","", genus))
# summarize by mean weights across all species in the genus and
# filter to top 10 positive and negative ones
df_top <- df_weights1 %>% group_by(genus) %>%
summarize(mean_weight = mean(value), n_spec = n()) %>%
ungroup() %>% filter(n_spec > 2) %>%
# cesarean is linked to higher latent values
mutate(type = ifelse(mean_weight > 0, "Cesarean", "Vaginal")) %>%
group_by(type) %>%
slice_max(abs(mean_weight), n= 5) %>% ungroup() %>% arrange(mean_weight) %>%
mutate(genus = factor(genus, levels = .$genus))
# plot
df_top %>%
ggplot(aes(x= genus, y = mean_weight, fill = type)) + geom_bar(stat="identity") +
coord_flip() + theme_bw() + scale_fill_manual(values = cols4delivery) +
theme(legend.position = "top") + xlab("") + guides(fill = guide_legend(title="")) +
geom_point(data = filter(df_weights1, genus %in% df_top$genus),
aes(x = genus, y = value), inherit.aes = FALSE, alpha = 0.3) +
ylab("Weight (Factor 1)")