From c5308a5072f0ca5771417ea7c2e20679728a68db Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Tue, 15 Jul 2025 01:36:24 +0800 Subject: [PATCH 1/7] Create multimedia.qmd --- inst/pages/multimedia.qmd | 590 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 590 insertions(+) create mode 100644 inst/pages/multimedia.qmd diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd new file mode 100644 index 00000000..439b2666 --- /dev/null +++ b/inst/pages/multimedia.qmd @@ -0,0 +1,590 @@ +--- +title: "Multimodal Mediation Analysis" +format: html +editor: visual +author: Yihan Liu, Himel Mallick +--- + +# **Multimodal Mediation Analysis** {#sec-MSEA} + +```{r setup, echo=FALSE, results="asis"} +remove(list = ls()) +invisible(gc()) +library(rebook) +chapterPreamble() + +``` + +Building upon the concepts and workflows presented in the previous chapter on +mediation analysis, here we demonstrate how to perform **multimodal mediation** +**analysis** to identify potential mediators across multiple omics layers. +**Multimodal mediation analysis** examines whether and how an exposure (e.g., +treatment, diet) affects an outcome (e.g., disease, phenotype) **through** +**intermediate variables (mediators) across multiple omics layers** +**simultaneously** (e.g., microbiome, metabolomics). It identifies which +features across modalities act as mediators, quantifies indirect (mediated) +effects while adjusting for covariates, and compares mediation strength across +layers, providing mechanistic insights into **how exposures impact outcomes** +**via molecular pathways**. + +```{r} +#| label: fig-multimodal-mediation +#| fig-cap: Directed acyclic graph illustrating multimodal mediation where an exposure affects multiple mediators across modalities (microbial species and metabolites), which in turn affect the outcome. A direct path from exposure to outcome is also included (dashed). While gut microbial species biologically influence metabolite levels, for tractability in high-dimensional mediation analysis we assume species and metabolites act as conditionally independent parallel mediators given exposure, acknowledging this as an approximation +#| echo: false + +library(DiagrammeR) + +grViz(" +digraph multimodal_mediation { + graph [layout = dot, rankdir = LR] + + node [fontname = Helvetica, fontsize = 12] + + Exposure [shape=box, style=filled, fillcolor=lightblue, color=black] + Outcome [shape=box, style=filled, fillcolor=palegreen, color=black] + + M1 [label='Species 1', shape=ellipse, style=filled, fillcolor=lightyellow, color=black] + M2 [label='Species 2', shape=ellipse, style=filled, fillcolor=lightyellow, color=black] + M3 [label='Metabolite 1', shape=ellipse, style=filled, fillcolor=lightpink, color=black] + M4 [label='Metabolite 2', shape=ellipse, style=filled, fillcolor=lightpink, color=black] + + Exposure -> M1 + Exposure -> M2 + Exposure -> M3 + Exposure -> M4 + M1 -> Outcome + M2 -> Outcome + M3 -> Outcome + M4 -> Outcome + Exposure -> Outcome [style=dashed] +} +") +``` + +In this chapter, we demonstrate multimodal mediation analysis using microbiome +data from the iHMP IBDMDB study from the R/Bioconductor package +curatedMetagenomicData. [@Lloyd-Price2019]. Unlike conventional mediation +analysis, which typically focuses on a single mediator or a small set of +mediators, we performed multimodal mediation analysis with R/Bioconductor +package \[multimedia\] [@Jiang2025] here, which can handle many potential +mediators across multiple data modalities. For example, species-level taxonomic +abundances, functional pathways, and metabolomic profiles can all serve as +potential mediators. This method allows us to estimate and rank each mediator’s +contribution while accounting for correlations between mediators in a +high-dimensional setting. + +In this example, time point (baseline vs. post-treatment) is used as the +treatment, and microbiome-based dysbiosis scores are used as the outcome. We +will first focus on the relative abundances of gut microbial species as +mediators, followed by functional pathways. + +Generally, we will proceed through the following key steps: + +1. Prepare the microbiome taxonomic data from the iHMP IBDMDB project. + +2. Calculate dysbiosis scores as the outcome. + +3. Define the mediation analysis data structure. + +4. Fit the multimodal mediation model using the R package multimedia. + +5. Interpret both the overall indirect effects and the mediator-specific +indirect effects. + +6. Visualize the results using forest plots, histograms, and rankings to +prioritize findings. + +7. Repeat the process for the iHMP microbial pathways. + +### Performing multimodal mediation analysis for iHMP species relative abundance {#sec-relative-abundance} + +We begin by loading the relative abundance data and calculating dysbiosis scores +for each sample based on Bray–Curtis dissimilarity from a healthy reference set. +This represents deviation from a healthy microbiome, where higher dysbiosis +scores indicate greater microbial imbalance. We then subset to IBD subjects with +samples collected at both baseline (visit 1) and post-treatment (visit 21). + +```{r} +################## +# Load libraries # +################## + +library(curatedMetagenomicData) +library(SummarizedExperiment) +library(dplyr) +library(vegan) +library(tidyverse) +library(multimedia) +library(stringr) +library(ggplot2) + + +################## +# Load iHMP data # +################## + +# Load data +se_relative <- curatedMetagenomicData( + "HMP_2019_ibdmdb.relative_abundance", + dryrun = FALSE + )[[1]] + +# Normalize to proportions (from %) +assay(se_relative) <- assay(se_relative) / 100 + +# Change the rownames names for variables of interest +rownames(se_relative) <- sub('.*s__', '', rownames(se_relative)) + + +######################## +# Reference nonIBD set # +######################## + +# Reference set: healthy +ref_set <- colData(se_relative)$disease == "healthy" + + +######################################## +# Calculate Bray-Curtis dissimilarity # +######################################## + +# Bray-Curtis dissimilarity +dist <- as.matrix(vegdist(t(assay(se_relative)), method = "bray", na.rm = TRUE)) + +# Assign SampleID for matching +colData(se_relative)$SampleID <- colnames(se_relative) + + +################################# +# Calculate the dysbiosis score # +################################# + +# Calculate dysbiosis score for each sample +colData(se_relative)$dysbiosis <- sapply(seq_along(ref_set), function(i) { + median(dist[i, ref_set & + (colData(se_relative)$SampleID != colData(se_relative)$SampleID[i])], + na.rm = TRUE) +}) + + +############################################################# +# Subset to IBD only and only one post-baseline time point # +############################################################# + +# Keep IBD only +se_relative <- se_relative[, colData(se_relative)$disease != "healthy"] + +# Visit 1 (baseline) or 25 (post) +se_relative <- se_relative[, colData(se_relative)$visit_number %in% c(1, 21)] + +# Keep subjects with both visits +keep_subjects <- names(which(table(colData(se_relative)$subject_id) == 2)) +se_relative <- se_relative[, colData(se_relative)$subject_id %in% keep_subjects] + +# Define time point label +colData(se_relative)$Time_point <- ifelse( + colData(se_relative)$visit_number == 1, "T0", "T1" +) + +``` + +Next, we will define the mediation anlaysis components. By using the processed +data, we define: + +- Treatment: time point (baseline vs. post-baseline) +- Outcome: dysbiosis score +- Mediators: scaled species abundances + +These are bundled into a mediation data object (exper), which is passed into the +multimedia function to estimate path models for the treatment-to-mediator +(α path), mediator-to-outcome (β path), and the combined indirect effect (α×β +path). + +```{r define-mediation-data-frame} + +############################# +# Define mediation data set # +############################# + +# Treatment: Pre and Post (Baseline vs. Post-baseline) +treatment <- factor(colData(se_relative)$Time_point, levels = c("T0", "T1")) + +# Outcome: Dysbiosis Score +dysbiosis <- as.numeric(colData(se_relative)$dysbiosis) + +# Mediators (scaled, directly from SE): Species +M <- t(assay(se_relative)) +M <- scale(M) +M_clean <- M[, colSums(is.na(M)) == 0] + +# clean mediator names +raw <- colnames(M_clean) +clean1 <- str_remove_all(raw, "\\[|\\]") +clean2 <- str_replace_all(clean1, "[: \\.,]", "_") + +# Save the final safe names +safe_names <- make.names(clean2, unique = TRUE) + +# Apply names +colnames(M_clean) <- safe_names + +# Convert to tibble +mediators <- as_tibble(M_clean) + +# Create the mediation data frame +df <- data.frame( + treatment = treatment, + dysbiosis = dysbiosis +) %>% + bind_cols(mediators) + +``` + +We now fit the multimodal mediation model and inspect the overall indirect and +direct effects to understand whether microbiome composition mediates the +treatment's effect on dysbiosis. + +```{r overall-mediation-analysis} + +############################# +# Run mediation (non-delta) # +############################# + +# Create the Mediation Data object +exper <- mediation_data( + df, + outcome = "dysbiosis", + treatment = "treatment", + mediators = colnames(mediators) +) + +# Fit and summarize +mdl <- multimedia(exper) +res <- estimate(mdl, exper) + +# Summarize overall indirect/direct +summary(res) +print(indirect_overall(res, exper)) +print(direct_effect(res, exper)) + +``` + +```{r specific-mediation} + +############################# +# Mediator-specific effects +############################# + +# Extract the treatment-to-mediator path coefficients +treatment_to_mediator_coef <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) +names(treatment_to_mediator_coef) <- sub("\\.treatmentT1$", "", names(treatment_to_mediator_coef)) + +# Extract the mediator-to-outcome path coefficients (adjusted for all mediators) +mediator_to_outcome_coef <- coef(res@outcome@estimates$dysbiosis)[names(treatment_to_mediator_coef)] + +# Calculate the indirect effect (product of path coefficients) +indirect_effects <- treatment_to_mediator_coef * mediator_to_outcome_coef + +# Summary table +mediator_effects <- data.frame( + mediator = names(treatment_to_mediator_coef), + treatment_to_mediator_coef = treatment_to_mediator_coef, + mediator_to_outcome_coef = mediator_to_outcome_coef, + indirect_effect = indirect_effects +) + +# Rank +top_mediators <- mediator_effects %>% + arrange(desc(abs(indirect_effect))) %>% + head(10) + +print(top_mediators) + +``` + +To quantify uncertainty around the indirect effects, we use non-parametric +bootstrap resampling to compute **95% confidence intervals** for both the +overall and mediator-specific indirect effects. + +```{r visualization-for-overall-indirect-effects} + +##################################### +# Bootstrap overall indirect effect # +##################################### + +set.seed(12345) +boot_overall <- bootstrap(mdl, exper, c(indirect = indirect_overall), B = 100) +summary(boot_overall$indirect) +quantile(boot_overall$indirect$indirect_effect, probs = c(0.025, 0.975)) + +# Histogram +ggplot(boot_overall$indirect) + + geom_histogram(aes(indirect_effect), bins = 20, fill = "#69b3a2", color = "black") + + theme_classic() + + labs( + x = "Overall Indirect Effect", + y = "Frequency", + title = "Bootstrap Distribution of Overall Indirect Effect (B = 100)" + ) + +``` + +Finally, we visualize the **mediator-specific indirect effects** using a forest +plot, displaying point estimates and 95% CIs to highlight the most influential +mediators. + +```{r visualization-for-mediation-specific-indirect-effects} + +############################################### +# Bootstrap mediator-specific indirect effect # +############################################### + +# Define an mediation-specific indirect effect function manually +indirect_each <- function(mdl, exper) { + res <- estimate(mdl, exper) + alpha <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) + names(alpha) <- sub("\\.treatmentT1$", "", names(alpha)) + beta <- coef(res@outcome@estimates$dysbiosis)[names(alpha)] + indirect <- alpha * beta + names(indirect) <- names(alpha) + indirect <- indirect[!is.na(indirect)] + return(indirect) +} + +set.seed(12345) +boot_each <- bootstrap(mdl, exper, c(indirect = indirect_each), B = 100) +summary(boot_each$indirect) + +# Summarize the bootstrap results +lower_upper <- apply(boot_each$indirect, 2, function(x) quantile(x, c(0.025, 0.975), na.rm=TRUE)) +means <- apply(boot_each$indirect, 2, mean, na.rm=TRUE) + +# Convert to data frame +summary_df <- data.frame( + mediator = colnames(boot_each$indirect), + estimate = means, + lower = lower_upper[1,], + upper = lower_upper[2,] +) +summary_df <- summary_df[-1, ] + +# Mark significance if CI does not cross zero +summary_df$significant <- with(summary_df, lower > 0 | upper < 0) + +# Remove the species with 0 confidence interval length +summary_df <- summary_df %>% + filter((upper - lower) != 0) + +# Add rankings +summary_df$rank <- ifelse(summary_df$significant, 1, 2) # 1 for significant, 2 for not +summary_df <- summary_df[order(summary_df$rank, -abs(summary_df$estimate)), ] + +# Forest plot +ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=significant)) + + geom_point() + + geom_errorbarh(aes(xmin=lower, xmax=upper), height=0.2) + + geom_vline(xintercept=0, linetype="dashed", color="grey") + + theme_classic() + + labs( + x = "Observed Indirect Effect with Bootstrap CI", + y = "Mediator", + title = "Forest Plot of Mediation-specific Indirect Effects for Species" + ) + + scale_color_manual(values=c("black","red")) + + theme(legend.position="bottom") + +``` + +Based on the forest plot results, we can see that the agathobaculum +butyriciproducens and escherichia coli pathways has significant indirect +mediation effects. + +Next we repeat the mediation analysis with the iHMP pathway abundances using the +same setup as before, using the time point as the treatment, dysbiosis score as +the outcome, but functional pathway abundance as the mediation variables instead +of the relative species abundance. + +### Performing multimodal mediation analysis for functional pathway abundance {#sec-pathway-abundance} + +Let's extract the pathway abundance data from the iHMP data first, as well as +other data pre-processing. + +```{r load-pkg-data} + +se_pathway <- curatedMetagenomicData( + "HMP_2019_ibdmdb.pathway_abundance", + dryrun = FALSE + )[[1]] + +rows_to_keep <- !grepl("\\|", rownames(se_pathway)) +rows_to_keep[which(rows_to_keep)[1:2]] <- FALSE +se_pathway <- se_pathway[rows_to_keep, ] +assay(se_pathway) <- assay(se_pathway) / 100 +ref_set <- colData(se_pathway)$disease == "healthy" + +dist <- as.matrix(vegdist(t(assay(se_pathway)), method = "bray", na.rm = TRUE)) +colData(se_pathway)$SampleID <- colnames(se_pathway) +colData(se_pathway)$dysbiosis <- sapply(seq_along(ref_set), function(i) { + median(dist[i, ref_set & + (colData(se_pathway)$SampleID != colData(se_pathway)$SampleID[i])], + na.rm = TRUE) +}) + +se_pathway <- se_pathway[, colData(se_pathway)$disease != "healthy"] +se_pathway <- se_pathway[, colData(se_pathway)$visit_number %in% c(1, 27)] +keep_subjects <- names(which(table(colData(se_pathway)$subject_id) == 2)) +se_pathway <- se_pathway[, colData(se_pathway)$subject_id %in% keep_subjects] + +colData(se_pathway)$Time_point <- ifelse( + colData(se_pathway)$visit_number == 1, "T0", "T1" +) + +``` + +We will define the mediation analysis data set then. + +```{r define-mediation-data-frame} + +treatment <- factor(colData(se_pathway)$Time_point, levels = c("T0", "T1")) +dysbiosis <- as.numeric(colData(se_pathway)$dysbiosis) + +M <- t(assay(se_pathway)) +M <- scale(M) +M_clean <- M[, colSums(is.na(M)) == 0] + +raw <- colnames(M_clean) +clean1 <- str_remove_all(raw, "\\[|\\]") +clean2 <- str_replace_all(clean1, "[: \\.,]", "_") +safe_names <- make.names(clean2, unique = TRUE) +colnames(M_clean) <- safe_names + +mediators <- as_tibble(M_clean) + +df <- data.frame( + treatment = treatment, + dysbiosis = dysbiosis +) %>% + bind_cols(mediators) + +``` + +Next, we continue to fit the multimodal mediation model, and extract the overall +and mediation-specific indirect effects. + +```{r mediation-analysis} + +exper <- mediation_data( + df, + outcome = "dysbiosis", + treatment = "treatment", + mediators = colnames(mediators) +) +mdl <- multimedia(exper) +res <- estimate(mdl, exper) +summary(res) +print(indirect_overall(res, exper)) +print(direct_effect(res, exper)) + + +treatment_to_mediator_coef <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) +names(treatment_to_mediator_coef) <- sub("\\.treatmentT1$", "", names(treatment_to_mediator_coef)) +mediator_to_outcome_coef <- coef(res@outcome@estimates$dysbiosis)[names(treatment_to_mediator_coef)] +indirect_effects <- treatment_to_mediator_coef * mediator_to_outcome_coef + +mediator_effects <- data.frame( + mediator = names(treatment_to_mediator_coef), + treatment_to_mediator_coef = treatment_to_mediator_coef, + mediator_to_outcome_coef = mediator_to_outcome_coef, + indirect_effect = indirect_effects +) + +top_mediators <- mediator_effects %>% + arrange(desc(abs(indirect_effect))) %>% + head(10) +print(top_mediators) + +``` + +The non-parametric bootstrap resampling is also used to compute **95% CIs** for +both the overall and mediator-specific pathway indirect effects. + +```{r visualization-for-overall-indirect-effects} + +set.seed(1234) +boot_overall <- bootstrap(mdl, exper, c(indirect = indirect_overall), B = 100) +summary(boot_overall$indirect) +quantile(boot_overall$indirect$indirect_effect, probs = c(0.025, 0.975)) + +ggplot(boot_overall$indirect) + + geom_histogram(aes(indirect_effect), bins = 20, fill = "#69b3a2", color = "black") + + theme_classic() + + labs( + x = "Overall Indirect Effect", + y = "Frequency", + title = "Bootstrap Distribution of Overall Indirect Effect (B = 100)" + ) + +``` + +We visualize the **mediator-specific indirect effects** using a forest plot for +pathway aubundance as well. with the point estimates and 95% CIs. + +```{r visualization-for-mediation-specific-indirect-effects} + +indirect_each <- function(mdl, exper) { + res <- estimate(mdl, exper) + alpha <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) + names(alpha) <- sub("\\.treatmentT1$", "", names(alpha)) + beta <- coef(res@outcome@estimates$dysbiosis)[names(alpha)] + indirect <- alpha * beta + names(indirect) <- names(alpha) + indirect <- indirect[!is.na(indirect)] + return(indirect) +} + +set.seed(1234) +boot_each <- bootstrap(mdl, exper, c(indirect = indirect_each), B = 100) + +lower_upper <- apply(boot_each$indirect, 2, function(x) quantile(x, c(0.025, 0.975), na.rm=TRUE)) +means <- apply(boot_each$indirect, 2, mean, na.rm=TRUE) +summary_df <- data.frame( + mediator = colnames(boot_each$indirect), + estimate = means, + lower = lower_upper[1,], + upper = lower_upper[2,] +) + +summary_df <- summary_df[-1, ] +summary_df$significant <- with(summary_df, lower > 0 | upper < 0) +summary_df <- summary_df[summary_df$upper != summary_df$lower, ] + +# Clean the pathway names +pwy <- rownames(summary_df) +pwy <- gsub("PWY0\\.", "PWY0-", pwy) # PWY0.xxx → PWY0-xxx +pwy <- gsub("PWY\\.", "PWY-", pwy) # PWY.xxx → PWY-xxx +pwy <- gsub("\\.", " ", pwy) # leftover dots → spaces +pwy <- gsub("__", ": ", pwy) # double underscores → colon +pwy <- gsub("_\\.", " ", pwy) # underscore then dot → space +pwy <- gsub("_", " ", pwy) # remaining underscores → space +pwy <- gsub("\\.\\.", " ", pwy) # double dots → space +pwy <- gsub("\\.$", "", pwy) # remove ending period +# pwy <- trimws(pwy) # final cleanup +rownames(summary_df) <- pwy + +ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=significant)) + + geom_point() + + geom_errorbarh(aes(xmin=lower, xmax=upper), height=0.2) + + geom_vline(xintercept=0, linetype="dashed", color="grey") + + theme_classic() + + labs( + x = "Observed Indirect Effect with Bootstrap CI", + y = "Mediator", + title = "Forest Plot of Mediation-specific Indirect Effects for Pathway" + ) + + scale_color_manual(values=c("black","red")) + + theme(legend.position="bottom") + +``` + +Based on the forest plot results, we can see that the tRNA charging pathway has +significant indirect mediation effects on the dysbiosis scores. From af9fafef80726ba50c8b6de03cc2d5dd44973f7d Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Thu, 7 Aug 2025 04:04:51 +0800 Subject: [PATCH 2/7] Update multimedia.qmd --- inst/pages/multimedia.qmd | 223 +++++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 88 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index 439b2666..c17292e5 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -7,7 +7,10 @@ author: Yihan Liu, Himel Mallick # **Multimodal Mediation Analysis** {#sec-MSEA} -```{r setup, echo=FALSE, results="asis"} +```{r} +#| label: setup +#| echo: false +#| results: asis remove(list = ls()) invisible(gc()) library(rebook) @@ -28,7 +31,7 @@ layers, providing mechanistic insights into **how exposures impact outcomes** **via molecular pathways**. ```{r} -#| label: fig-multimodal-mediation +#| label: fig_multimodal_mediation #| fig-cap: Directed acyclic graph illustrating multimodal mediation where an exposure affects multiple mediators across modalities (microbial species and metabolites), which in turn affect the outcome. A direct path from exposure to outcome is also included (dashed). While gut microbial species biologically influence metabolite levels, for tractability in high-dimensional mediation analysis we assume species and metabolites act as conditionally independent parallel mediators given exposure, acknowledging this as an approximation #| echo: false @@ -68,7 +71,7 @@ analysis, which typically focuses on a single mediator or a small set of mediators, we performed multimodal mediation analysis with R/Bioconductor package \[multimedia\] [@Jiang2025] here, which can handle many potential mediators across multiple data modalities. For example, species-level taxonomic -abundances, functional pathways, and metabolomic profiles can all serve as +abundances, pathways abundances, and metabolomic profiles can all serve as potential mediators. This method allows us to estimate and rank each mediator’s contribution while accounting for correlations between mediators in a high-dimensional setting. @@ -76,7 +79,7 @@ high-dimensional setting. In this example, time point (baseline vs. post-treatment) is used as the treatment, and microbiome-based dysbiosis scores are used as the outcome. We will first focus on the relative abundances of gut microbial species as -mediators, followed by functional pathways. +mediators, followed by pathways abundances. Generally, we will proceed through the following key steps: @@ -105,6 +108,9 @@ scores indicate greater microbial imbalance. We then subset to IBD subjects with samples collected at both baseline (visit 1) and post-treatment (visit 21). ```{r} +#| label: iHMP_data_preprocessing +#| message: false + ################## # Load libraries # ################## @@ -117,78 +123,94 @@ library(tidyverse) library(multimedia) library(stringr) library(ggplot2) - +library(mia) ################## # Load iHMP data # ################## # Load data -se_relative <- curatedMetagenomicData( +tse_relative <- curatedMetagenomicData( "HMP_2019_ibdmdb.relative_abundance", dryrun = FALSE )[[1]] -# Normalize to proportions (from %) -assay(se_relative) <- assay(se_relative) / 100 +# Assign SampleID for matching +colData(tse_relative)$SampleID <- colnames(tse_relative) -# Change the rownames names for variables of interest -rownames(se_relative) <- sub('.*s__', '', rownames(se_relative)) +# Convert relative_abundance assay to relabundance (which is in [0,1] interval) +tse_relative <- transformAssay(tse_relative, assay.type="relative_abundance", method="relabundance") + +# Optionally, remove the original assay to avoid confusion between the two relative abundance versions +assay(tse_relative, "relative_abundance") <- NULL +# Remove samples with NA in the relabundance assay +tse_relative <- tse_relative[, colSums(is.na(assay(tse_relative))) == 0] + +# Change the rownames names for variables of interest +rownames(tse_relative) <- sub('.*s__', '', rownames(tse_relative)) +rownames(tse_relative) <- str_remove_all(rownames(tse_relative), "\\[|\\]") +rownames(tse_relative) <- str_replace_all(rownames(tse_relative), "[: \\.,]", "_") +safe_names <- make.names(rownames(tse_relative), unique = TRUE) +rownames(tse_relative) <- safe_names ######################## # Reference nonIBD set # ######################## # Reference set: healthy -ref_set <- colData(se_relative)$disease == "healthy" - +tse_relative$disease_binary <- tse_relative$disease == "healthy" ######################################## # Calculate Bray-Curtis dissimilarity # ######################################## # Bray-Curtis dissimilarity -dist <- as.matrix(vegdist(t(assay(se_relative)), method = "bray", na.rm = TRUE)) - -# Assign SampleID for matching -colData(se_relative)$SampleID <- colnames(se_relative) - +diss <- as.matrix(getDissimilarity(tse_relative, method = "bray", na.rm=TRUE, assay.type = "relabundance")) ################################# # Calculate the dysbiosis score # ################################# # Calculate dysbiosis score for each sample +# For each sample i, we compute the median distance between sample i and all reference samples in `ref_set` +sample_ids <- colData(se_relative)$SampleID + colData(se_relative)$dysbiosis <- sapply(seq_along(ref_set), function(i) { - median(dist[i, ref_set & - (colData(se_relative)$SampleID != colData(se_relative)$SampleID[i])], - na.rm = TRUE) -}) + # Logical vector indicating all other reference samples (excluding i) + ref_others <- ref_set & (sample_ids != sample_ids[i]) + + # Compute median distance between sample i and these reference samples + median(dist[i, ref_others], na.rm = TRUE) + +}) ############################################################# # Subset to IBD only and only one post-baseline time point # ############################################################# # Keep IBD only -se_relative <- se_relative[, colData(se_relative)$disease != "healthy"] +tse_relative <- tse_relative[, colData(tse_relative)$disease != "healthy"] # Visit 1 (baseline) or 25 (post) -se_relative <- se_relative[, colData(se_relative)$visit_number %in% c(1, 21)] +tse_relative <- tse_relative[, colData(tse_relative)$visit_number %in% c(1, 21)] # Keep subjects with both visits -keep_subjects <- names(which(table(colData(se_relative)$subject_id) == 2)) -se_relative <- se_relative[, colData(se_relative)$subject_id %in% keep_subjects] +keep_subjects <- names(which(table(colData(tse_relative)$subject_id) == 2)) +tse_relative <- tse_relative[, colData(tse_relative)$subject_id %in% keep_subjects] # Define time point label -colData(se_relative)$Time_point <- ifelse( - colData(se_relative)$visit_number == 1, "T0", "T1" +colData(tse_relative)$Time_point <- ifelse( + colData(tse_relative)$visit_number == 1, "T0", "T1" ) +# Define treatment: Pre and Post (Baseline vs. Post-baseline) +treatment <- factor(colData(tse_relative)$Time_point, levels = c("T0", "T1")) + ``` -Next, we will define the mediation anlaysis components. By using the processed +Next, we will define the mediation analysis components. By using the processed data, we define: - Treatment: time point (baseline vs. post-baseline) @@ -200,41 +222,28 @@ multimedia function to estimate path models for the treatment-to-mediator (α path), mediator-to-outcome (β path), and the combined indirect effect (α×β path). -```{r define-mediation-data-frame} +```{r} +#| label: define_mediation_data_frame +#| message: false ############################# # Define mediation data set # ############################# -# Treatment: Pre and Post (Baseline vs. Post-baseline) -treatment <- factor(colData(se_relative)$Time_point, levels = c("T0", "T1")) - # Outcome: Dysbiosis Score -dysbiosis <- as.numeric(colData(se_relative)$dysbiosis) +colData(tse_relative)$dysbiosis <- as.numeric(colData(tse_relative)$dysbiosis) # Mediators (scaled, directly from SE): Species -M <- t(assay(se_relative)) -M <- scale(M) -M_clean <- M[, colSums(is.na(M)) == 0] - -# clean mediator names -raw <- colnames(M_clean) -clean1 <- str_remove_all(raw, "\\[|\\]") -clean2 <- str_replace_all(clean1, "[: \\.,]", "_") - -# Save the final safe names -safe_names <- make.names(clean2, unique = TRUE) - -# Apply names -colnames(M_clean) <- safe_names +tse_relative <- transformAssay(tse_relative, assay.type="relabundance", method="standardize", name="scaled") +M <- assay(tse_relative, "scaled") # Convert to tibble -mediators <- as_tibble(M_clean) +mediators <- as_tibble(M) # Create the mediation data frame df <- data.frame( treatment = treatment, - dysbiosis = dysbiosis + dysbiosis = tse_relative$dysbiosis ) %>% bind_cols(mediators) @@ -244,7 +253,9 @@ We now fit the multimodal mediation model and inspect the overall indirect and direct effects to understand whether microbiome composition mediates the treatment's effect on dysbiosis. -```{r overall-mediation-analysis} +```{r} +#| label: overall_mediation_analysis +#| message: false ############################# # Run mediation (non-delta) # @@ -269,7 +280,9 @@ print(direct_effect(res, exper)) ``` -```{r specific-mediation} +```{r} +#| label: specific_mediation +#| message: false ############################# # Mediator-specific effects @@ -306,7 +319,9 @@ To quantify uncertainty around the indirect effects, we use non-parametric bootstrap resampling to compute **95% confidence intervals** for both the overall and mediator-specific indirect effects. -```{r visualization-for-overall-indirect-effects} +```{r} +#| label: visualization_for_overall_indirect_effects +#| message: false ##################################### # Bootstrap overall indirect effect # @@ -333,7 +348,9 @@ Finally, we visualize the **mediator-specific indirect effects** using a forest plot, displaying point estimates and 95% CIs to highlight the most influential mediators. -```{r visualization-for-mediation-specific-indirect-effects} +```{r} +#| label: visualization_for_mediation_specific_indirect_effects +#| message: false ############################################### # Bootstrap mediator-specific indirect effect # @@ -395,74 +412,93 @@ ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=signific ``` -Based on the forest plot results, we can see that the agathobaculum -butyriciproducens and escherichia coli pathways has significant indirect +Based on the forest plot results, we can see that the Agathobaculum +butyriciproducens and Escherichia coli pathways has significant indirect mediation effects. Next we repeat the mediation analysis with the iHMP pathway abundances using the same setup as before, using the time point as the treatment, dysbiosis score as -the outcome, but functional pathway abundance as the mediation variables instead +the outcome, but pathways abundances as the mediation variables instead of the relative species abundance. -### Performing multimodal mediation analysis for functional pathway abundance {#sec-pathway-abundance} +### Performing multimodal mediation analysis for pathways abundances {#sec-pathways-abundances} Let's extract the pathway abundance data from the iHMP data first, as well as other data pre-processing. -```{r load-pkg-data} +```{r} +#| label: load_pkg_data +#| message: false -se_pathway <- curatedMetagenomicData( +tse_pathway <- curatedMetagenomicData( "HMP_2019_ibdmdb.pathway_abundance", dryrun = FALSE )[[1]] -rows_to_keep <- !grepl("\\|", rownames(se_pathway)) +colData(tse_pathway)$SampleID <- colnames(tse_pathway) +tse_pathway <- tse_pathway[, colSums(is.na(assay(tse_pathway))) == 0] + +# Clean the pathway names +pwy <- rownames(tse_pathway) +pwy <- gsub("PWY0\\.", "PWY0-", pwy) # PWY0.xxx → PWY0-xxx +pwy <- gsub("PWY\\.", "PWY-", pwy) # PWY.xxx → PWY-xxx +pwy <- gsub("\\.", " ", pwy) # leftover dots → spaces +pwy <- gsub("__", ": ", pwy) # double underscores → colon +pwy <- gsub("_\\.", " ", pwy) # underscore then dot → space +pwy <- gsub("_", " ", pwy) # remaining underscores → space +pwy <- gsub("\\.\\.", " ", pwy) # double dots → space +pwy <- gsub("\\.$", "", pwy) # remove ending period +# pwy <- trimws(pwy) # final cleanup +rownames(summary_df) <- pwy + +rows_to_keep <- !grepl("\\|", rownames(tse_pathway)) rows_to_keep[which(rows_to_keep)[1:2]] <- FALSE -se_pathway <- se_pathway[rows_to_keep, ] -assay(se_pathway) <- assay(se_pathway) / 100 -ref_set <- colData(se_pathway)$disease == "healthy" - -dist <- as.matrix(vegdist(t(assay(se_pathway)), method = "bray", na.rm = TRUE)) -colData(se_pathway)$SampleID <- colnames(se_pathway) -colData(se_pathway)$dysbiosis <- sapply(seq_along(ref_set), function(i) { - median(dist[i, ref_set & - (colData(se_pathway)$SampleID != colData(se_pathway)$SampleID[i])], +tse_pathway <- tse_pathway[rows_to_keep, ] +assay(tse_pathway) <- assay(tse_pathway) / 100 +tse_pathway$disease_binary <- colData(tse_pathway)$disease == "healthy" + +diss <- as.matrix(vegdist(t(assay(tse_pathway)), method = "bray", na.rm = TRUE)) +colData(tse_pathway)$dysbiosis <- sapply(seq_along(tse_pathway$disease_binary), function(i) { + median(diss[i, tse_pathway$disease_binary & + (colData(tse_pathway)$SampleID != colData(tse_pathway)$SampleID[i])], na.rm = TRUE) }) -se_pathway <- se_pathway[, colData(se_pathway)$disease != "healthy"] -se_pathway <- se_pathway[, colData(se_pathway)$visit_number %in% c(1, 27)] -keep_subjects <- names(which(table(colData(se_pathway)$subject_id) == 2)) -se_pathway <- se_pathway[, colData(se_pathway)$subject_id %in% keep_subjects] +tse_pathway <- tse_pathway[, colData(tse_pathway)$disease != "healthy"] +tse_pathway <- tse_pathway[, colData(tse_pathway)$visit_number %in% c(1, 27)] +keep_subjects <- names(which(table(colData(tse_pathway)$subject_id) == 2)) +tse_pathway <- tse_pathway[, colData(tse_pathway)$subject_id %in% keep_subjects] -colData(se_pathway)$Time_point <- ifelse( - colData(se_pathway)$visit_number == 1, "T0", "T1" +colData(tse_pathway)$Time_point <- ifelse( + colData(tse_pathway)$visit_number == 1, "T0", "T1" ) +treatment <- factor(colData(tse_pathway)$Time_point, levels = c("T0", "T1")) + ``` We will define the mediation analysis data set then. -```{r define-mediation-data-frame} +```{r} +#| label: define_mediation_data_frame +#| message: false -treatment <- factor(colData(se_pathway)$Time_point, levels = c("T0", "T1")) -dysbiosis <- as.numeric(colData(se_pathway)$dysbiosis) +colData(tse_pathway)$dysbiosis <- as.numeric(colData(tse_pathway)$dysbiosis) -M <- t(assay(se_pathway)) -M <- scale(M) -M_clean <- M[, colSums(is.na(M)) == 0] +tse_pathway <- transformAssay(tse_pathway, assay.type="relabundance", method="standardize", name="scaled") +M <- assay(tse_pathway, "scaled") -raw <- colnames(M_clean) +raw <- colnames(M) clean1 <- str_remove_all(raw, "\\[|\\]") clean2 <- str_replace_all(clean1, "[: \\.,]", "_") safe_names <- make.names(clean2, unique = TRUE) -colnames(M_clean) <- safe_names +colnames(M) <- safe_names -mediators <- as_tibble(M_clean) +mediators <- as_tibble(M) df <- data.frame( treatment = treatment, - dysbiosis = dysbiosis + dysbiosis = tse_pathway$dysbiosis ) %>% bind_cols(mediators) @@ -471,7 +507,9 @@ df <- data.frame( Next, we continue to fit the multimodal mediation model, and extract the overall and mediation-specific indirect effects. -```{r mediation-analysis} +```{r} +#| label: mediation_analysis +#| message: false exper <- mediation_data( df, @@ -508,7 +546,9 @@ print(top_mediators) The non-parametric bootstrap resampling is also used to compute **95% CIs** for both the overall and mediator-specific pathway indirect effects. -```{r visualization-for-overall-indirect-effects} +```{r} +#| label: visualization_for_overall_indirect_effects +#| message: false set.seed(1234) boot_overall <- bootstrap(mdl, exper, c(indirect = indirect_overall), B = 100) @@ -529,7 +569,9 @@ ggplot(boot_overall$indirect) + We visualize the **mediator-specific indirect effects** using a forest plot for pathway aubundance as well. with the point estimates and 95% CIs. -```{r visualization-for-mediation-specific-indirect-effects} +```{r} +#| label: visualization_for_mediation_specific_indirect_effects +#| message: false indirect_each <- function(mdl, exper) { res <- estimate(mdl, exper) @@ -571,6 +613,11 @@ pwy <- gsub("\\.$", "", pwy) # remove ending period # pwy <- trimws(pwy) # final cleanup rownames(summary_df) <- pwy + + + + + ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=significant)) + geom_point() + geom_errorbarh(aes(xmin=lower, xmax=upper), height=0.2) + From fb8c08057562ade8f6aebcbe0f5ada05ff0bde94 Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Sat, 30 Aug 2025 02:36:57 +0800 Subject: [PATCH 3/7] Update multimedia.qmd Just fixed some minor inconsistence in the name of variables --- inst/pages/multimedia.qmd | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index c17292e5..40c43a98 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -174,15 +174,15 @@ diss <- as.matrix(getDissimilarity(tse_relative, method = "bray", na.rm=TRUE, as # Calculate dysbiosis score for each sample # For each sample i, we compute the median distance between sample i and all reference samples in `ref_set` -sample_ids <- colData(se_relative)$SampleID +sample_ids <- colData(tse_relative)$SampleID -colData(se_relative)$dysbiosis <- sapply(seq_along(ref_set), function(i) { +colData(tse_relative)$dysbiosis <- sapply(seq_along(tse_relative$disease_binary), function(i) { # Logical vector indicating all other reference samples (excluding i) - ref_others <- ref_set & (sample_ids != sample_ids[i]) + ref_others <- tse_relative$disease_binary & (sample_ids != sample_ids[i]) # Compute median distance between sample i and these reference samples - median(dist[i, ref_others], na.rm = TRUE) + median(diss[i, ref_others], na.rm = TRUE) }) @@ -235,7 +235,7 @@ colData(tse_relative)$dysbiosis <- as.numeric(colData(tse_relative)$dysbiosis) # Mediators (scaled, directly from SE): Species tse_relative <- transformAssay(tse_relative, assay.type="relabundance", method="standardize", name="scaled") -M <- assay(tse_relative, "scaled") +M <- t(assay(tse_relative, "scaled")) # Convert to tibble mediators <- as_tibble(M) @@ -438,19 +438,6 @@ tse_pathway <- curatedMetagenomicData( colData(tse_pathway)$SampleID <- colnames(tse_pathway) tse_pathway <- tse_pathway[, colSums(is.na(assay(tse_pathway))) == 0] -# Clean the pathway names -pwy <- rownames(tse_pathway) -pwy <- gsub("PWY0\\.", "PWY0-", pwy) # PWY0.xxx → PWY0-xxx -pwy <- gsub("PWY\\.", "PWY-", pwy) # PWY.xxx → PWY-xxx -pwy <- gsub("\\.", " ", pwy) # leftover dots → spaces -pwy <- gsub("__", ": ", pwy) # double underscores → colon -pwy <- gsub("_\\.", " ", pwy) # underscore then dot → space -pwy <- gsub("_", " ", pwy) # remaining underscores → space -pwy <- gsub("\\.\\.", " ", pwy) # double dots → space -pwy <- gsub("\\.$", "", pwy) # remove ending period -# pwy <- trimws(pwy) # final cleanup -rownames(summary_df) <- pwy - rows_to_keep <- !grepl("\\|", rownames(tse_pathway)) rows_to_keep[which(rows_to_keep)[1:2]] <- FALSE tse_pathway <- tse_pathway[rows_to_keep, ] @@ -485,8 +472,8 @@ We will define the mediation analysis data set then. colData(tse_pathway)$dysbiosis <- as.numeric(colData(tse_pathway)$dysbiosis) -tse_pathway <- transformAssay(tse_pathway, assay.type="relabundance", method="standardize", name="scaled") -M <- assay(tse_pathway, "scaled") +tse_pathway <- transformAssay(tse_pathway, assay.type="pathway_abundance", method="standardize", name="scaled") +M <- t(assay(tse_pathway, "scaled")) raw <- colnames(M) clean1 <- str_remove_all(raw, "\\[|\\]") From 9981a192b46935e2fe6069b7fe8406f97c2563c0 Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Fri, 19 Sep 2025 06:20:46 +0800 Subject: [PATCH 4/7] Update multimedia.qmd --- inst/pages/multimedia.qmd | 224 ++++++++++++++------------------------ 1 file changed, 83 insertions(+), 141 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index 40c43a98..09cd2dda 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -18,17 +18,7 @@ chapterPreamble() ``` -Building upon the concepts and workflows presented in the previous chapter on -mediation analysis, here we demonstrate how to perform **multimodal mediation** -**analysis** to identify potential mediators across multiple omics layers. -**Multimodal mediation analysis** examines whether and how an exposure (e.g., -treatment, diet) affects an outcome (e.g., disease, phenotype) **through** -**intermediate variables (mediators) across multiple omics layers** -**simultaneously** (e.g., microbiome, metabolomics). It identifies which -features across modalities act as mediators, quantifies indirect (mediated) -effects while adjusting for covariates, and compares mediation strength across -layers, providing mechanistic insights into **how exposures impact outcomes** -**via molecular pathways**. +Building upon the concepts and workflows presented in the previous chapter on mediation analysis, here we demonstrate how to perform **multimodal mediation** **analysis** to identify potential mediators across multiple omics layers. **Multimodal mediation analysis** examines whether and how an exposure (e.g., treatment, diet) affects an outcome (e.g., disease, phenotype) **through** **intermediate variables (mediators) across multiple omics layers** **simultaneously** (e.g., microbiome, metabolomics). It identifies which features across modalities act as mediators, quantifies indirect (mediated) effects while adjusting for covariates, and compares mediation strength across layers, providing mechanistic insights into **how exposures impact outcomes** **via molecular pathways**. ```{r} #| label: fig_multimodal_mediation @@ -64,22 +54,9 @@ digraph multimodal_mediation { ") ``` -In this chapter, we demonstrate multimodal mediation analysis using microbiome -data from the iHMP IBDMDB study from the R/Bioconductor package -curatedMetagenomicData. [@Lloyd-Price2019]. Unlike conventional mediation -analysis, which typically focuses on a single mediator or a small set of -mediators, we performed multimodal mediation analysis with R/Bioconductor -package \[multimedia\] [@Jiang2025] here, which can handle many potential -mediators across multiple data modalities. For example, species-level taxonomic -abundances, pathways abundances, and metabolomic profiles can all serve as -potential mediators. This method allows us to estimate and rank each mediator’s -contribution while accounting for correlations between mediators in a -high-dimensional setting. - -In this example, time point (baseline vs. post-treatment) is used as the -treatment, and microbiome-based dysbiosis scores are used as the outcome. We -will first focus on the relative abundances of gut microbial species as -mediators, followed by pathways abundances. +In this chapter, we demonstrate multimodal mediation analysis using microbiome data from the iHMP IBDMDB study from the R/Bioconductor package curatedMetagenomicData. [@Lloyd-Price2019]. Unlike conventional mediation analysis, which typically focuses on a single mediator or a small set of mediators, we performed multimodal mediation analysis with R/Bioconductor package \[multimedia\] [@Jiang2025] here, which can handle many potential mediators across multiple data modalities. For example, species-level taxonomic abundances, pathways abundances, and metabolomic profiles can all serve as potential mediators. This method allows us to estimate and rank each mediator’s contribution while accounting for correlations between mediators in a high-dimensional setting. + +In this example, time point (baseline vs. post-treatment) is used as the treatment, and microbiome-based dysbiosis scores are used as the outcome. We will first focus on the relative abundances of gut microbial species as mediators, followed by pathways abundances. Generally, we will proceed through the following key steps: @@ -91,21 +68,15 @@ Generally, we will proceed through the following key steps: 4. Fit the multimodal mediation model using the R package multimedia. -5. Interpret both the overall indirect effects and the mediator-specific -indirect effects. +5. Interpret both the overall indirect effects and the mediator-specific indirect effects. -6. Visualize the results using forest plots, histograms, and rankings to -prioritize findings. +6. Visualize the results using forest plots, histograms, and rankings to prioritize findings. 7. Repeat the process for the iHMP microbial pathways. ### Performing multimodal mediation analysis for iHMP species relative abundance {#sec-relative-abundance} -We begin by loading the relative abundance data and calculating dysbiosis scores -for each sample based on Bray–Curtis dissimilarity from a healthy reference set. -This represents deviation from a healthy microbiome, where higher dysbiosis -scores indicate greater microbial imbalance. We then subset to IBD subjects with -samples collected at both baseline (visit 1) and post-treatment (visit 21). +We begin by loading the relative abundance data and calculating dysbiosis scores for each sample based on Bray–Curtis dissimilarity from a healthy reference set. This represents deviation from a healthy microbiome, where higher dysbiosis scores indicate greater microbial imbalance. We then subset to IBD subjects with samples collected at both baseline (visit 1) and post-treatment (visit 21). ```{r} #| label: iHMP_data_preprocessing @@ -114,7 +85,8 @@ samples collected at both baseline (visit 1) and post-treatment (visit 21). ################## # Load libraries # ################## - +remove(list = ls()) +gc() library(curatedMetagenomicData) library(SummarizedExperiment) library(dplyr) @@ -206,21 +178,17 @@ colData(tse_relative)$Time_point <- ifelse( ) # Define treatment: Pre and Post (Baseline vs. Post-baseline) -treatment <- factor(colData(tse_relative)$Time_point, levels = c("T0", "T1")) +colData(tse_relative)$treatment <- factor(colData(tse_relative)$Time_point, levels = c("T0", "T1")) ``` -Next, we will define the mediation analysis components. By using the processed -data, we define: +Next, we will define the mediation analysis components. By using the processed data, we define: - Treatment: time point (baseline vs. post-baseline) - Outcome: dysbiosis score - Mediators: scaled species abundances -These are bundled into a mediation data object (exper), which is passed into the -multimedia function to estimate path models for the treatment-to-mediator -(α path), mediator-to-outcome (β path), and the combined indirect effect (α×β -path). +These are bundled into a mediation data object (exper), which is passed into the multimedia function to estimate path models for the treatment-to-mediator (α path), mediator-to-outcome (β path), and the combined indirect effect (α×β path). ```{r} #| label: define_mediation_data_frame @@ -235,23 +203,11 @@ colData(tse_relative)$dysbiosis <- as.numeric(colData(tse_relative)$dysbiosis) # Mediators (scaled, directly from SE): Species tse_relative <- transformAssay(tse_relative, assay.type="relabundance", method="standardize", name="scaled") -M <- t(assay(tse_relative, "scaled")) - -# Convert to tibble -mediators <- as_tibble(M) - -# Create the mediation data frame -df <- data.frame( - treatment = treatment, - dysbiosis = tse_relative$dysbiosis -) %>% - bind_cols(mediators) +assays(tse_relative) <- SimpleList(mediators = assay(tse_relative, "scaled")) ``` -We now fit the multimodal mediation model and inspect the overall indirect and -direct effects to understand whether microbiome composition mediates the -treatment's effect on dysbiosis. +We now fit the multimodal mediation model and inspect the overall indirect and direct effects to understand whether microbiome composition mediates the treatment's effect on dysbiosis. ```{r} #| label: overall_mediation_analysis @@ -261,12 +217,24 @@ treatment's effect on dysbiosis. # Run mediation (non-delta) # ############################# +# Convert the TreeSummarizedExperiment to be SummarizedExperiment +se_relative <- tryCatch(as(tse_relative, "SummarizedExperiment"), error = function(e) NULL) +if (is.null(rownames(se_relative)) || any(rownames(se_relative) == "")) { + rn <- rownames(tse_relative) + stopifnot(length(rn) == nrow(se_relative)) + rownames(se_relative) <- rn + rownames(rowData(se_relative)) <- rn # keep rowData aligned too +} + +# Use indices for mediators +medi_idx <- seq_len(nrow(se_relative)) + # Create the Mediation Data object -exper <- mediation_data( - df, - outcome = "dysbiosis", - treatment = "treatment", - mediators = colnames(mediators) +exper <- multimedia::mediation_data( + se_relative, + outcomes = "dysbiosis", + treatments = "treatment", + mediators = medi_idx ) # Fit and summarize @@ -280,6 +248,7 @@ print(direct_effect(res, exper)) ``` + ```{r} #| label: specific_mediation #| message: false @@ -288,26 +257,27 @@ print(direct_effect(res, exper)) # Mediator-specific effects ############################# -# Extract the treatment-to-mediator path coefficients -treatment_to_mediator_coef <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) -names(treatment_to_mediator_coef) <- sub("\\.treatmentT1$", "", names(treatment_to_mediator_coef)) +# Helper function to extract the effects +extract_effects <- function(res, treat_term = "treatmentT1", outcome = "dysbiosis") { + # α: treatment -> each mediator (one model per mediator) + alpha <- sapply(res@mediation@estimates, function(m) unname(coef(m)[treat_term])) -# Extract the mediator-to-outcome path coefficients (adjusted for all mediators) -mediator_to_outcome_coef <- coef(res@outcome@estimates$dysbiosis)[names(treatment_to_mediator_coef)] + # β: mediator -> outcome (single model with all mediators) + beta <- unname(coef(res@outcome@estimates[[outcome]])[names(alpha)]) -# Calculate the indirect effect (product of path coefficients) -indirect_effects <- treatment_to_mediator_coef * mediator_to_outcome_coef + tibble::tibble( + mediator = names(alpha), + alpha = as.numeric(alpha), + beta = as.numeric(beta), + indirect_effect = alpha * beta + ) +} -# Summary table -mediator_effects <- data.frame( - mediator = names(treatment_to_mediator_coef), - treatment_to_mediator_coef = treatment_to_mediator_coef, - mediator_to_outcome_coef = mediator_to_outcome_coef, - indirect_effect = indirect_effects -) +# Extract the treatment-to-mediator path coefficients +effects_by_mediator <- extract_effects(res, treat_term = "treatmentT1", outcome = "dysbiosis") # Rank -top_mediators <- mediator_effects %>% +top_mediators <- effects_by_mediator %>% arrange(desc(abs(indirect_effect))) %>% head(10) @@ -315,9 +285,7 @@ print(top_mediators) ``` -To quantify uncertainty around the indirect effects, we use non-parametric -bootstrap resampling to compute **95% confidence intervals** for both the -overall and mediator-specific indirect effects. +To quantify uncertainty around the indirect effects, we use non-parametric bootstrap resampling to compute **95% confidence intervals** for both the overall and mediator-specific indirect effects. ```{r} #| label: visualization_for_overall_indirect_effects @@ -344,9 +312,7 @@ ggplot(boot_overall$indirect) + ``` -Finally, we visualize the **mediator-specific indirect effects** using a forest -plot, displaying point estimates and 95% CIs to highlight the most influential -mediators. +Finally, we visualize the **mediator-specific indirect effects** using a forest plot, displaying point estimates and 95% CIs to highlight the most influential mediators. ```{r} #| label: visualization_for_mediation_specific_indirect_effects @@ -412,19 +378,13 @@ ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=signific ``` -Based on the forest plot results, we can see that the Agathobaculum -butyriciproducens and Escherichia coli pathways has significant indirect -mediation effects. +Based on the forest plot results, we can see that the Agathobaculum butyriciproducens and Escherichia coli pathways has significant indirect mediation effects. -Next we repeat the mediation analysis with the iHMP pathway abundances using the -same setup as before, using the time point as the treatment, dysbiosis score as -the outcome, but pathways abundances as the mediation variables instead -of the relative species abundance. +Next we repeat the mediation analysis with the iHMP pathway abundances using the same setup as before, using the time point as the treatment, dysbiosis score as the outcome, but pathways abundances as the mediation variables instead of the relative species abundance. ### Performing multimodal mediation analysis for pathways abundances {#sec-pathways-abundances} -Let's extract the pathway abundance data from the iHMP data first, as well as -other data pre-processing. +Let's extract the pathway abundance data from the iHMP data first, as well as other data pre-processing. ```{r} #| label: load_pkg_data @@ -460,7 +420,7 @@ colData(tse_pathway)$Time_point <- ifelse( colData(tse_pathway)$visit_number == 1, "T0", "T1" ) -treatment <- factor(colData(tse_pathway)$Time_point, levels = c("T0", "T1")) +colData(tse_pathway)$treatment <- factor(colData(tse_pathway)$Time_point, levels = c("T0", "T1")) ``` @@ -471,67 +431,56 @@ We will define the mediation analysis data set then. #| message: false colData(tse_pathway)$dysbiosis <- as.numeric(colData(tse_pathway)$dysbiosis) - tse_pathway <- transformAssay(tse_pathway, assay.type="pathway_abundance", method="standardize", name="scaled") -M <- t(assay(tse_pathway, "scaled")) - -raw <- colnames(M) -clean1 <- str_remove_all(raw, "\\[|\\]") -clean2 <- str_replace_all(clean1, "[: \\.,]", "_") -safe_names <- make.names(clean2, unique = TRUE) -colnames(M) <- safe_names - -mediators <- as_tibble(M) - -df <- data.frame( - treatment = treatment, - dysbiosis = tse_pathway$dysbiosis -) %>% - bind_cols(mediators) +assays(tse_pathway) <- SimpleList(mediators = assay(tse_pathway, "scaled")) ``` -Next, we continue to fit the multimodal mediation model, and extract the overall -and mediation-specific indirect effects. +Next, we continue to fit the multimodal mediation model, and extract the overall and mediation-specific indirect effects. ```{r} #| label: mediation_analysis #| message: false -exper <- mediation_data( - df, - outcome = "dysbiosis", - treatment = "treatment", - mediators = colnames(mediators) +se_pathway <- tryCatch(as(tse_pathway, "SummarizedExperiment"), error = function(e) NULL) +if (is.null(rownames(se_pathway)) || any(rownames(se_pathway) == "")) { + rn <- rownames(tse_pathway) + stopifnot(length(rn) == nrow(se_pathway)) + rownames(se_pathway) <- rn + rownames(rowData(se_pathway)) <- rn # keep rowData aligned too +} + +raw <- rownames(se_pathway) +clean1 <- str_remove_all(raw, "\\[|\\]") +clean2 <- str_replace_all(clean1, "[: \\.,]", "_") +safe_names <- make.names(clean2, unique = TRUE) +rownames(se_pathway) <- safe_names + +medi_idx <- seq_len(nrow(se_pathway)) + +exper <- multimedia::mediation_data( + se_pathway, + outcomes = "dysbiosis", + treatments = "treatment", + mediators = medi_idx ) + mdl <- multimedia(exper) res <- estimate(mdl, exper) summary(res) print(indirect_overall(res, exper)) print(direct_effect(res, exper)) +effects_by_mediator <- extract_effects(res, treat_term = "treatmentT1", outcome = "dysbiosis") -treatment_to_mediator_coef <- sapply(res@mediation@estimates, function(m) coef(m)["treatmentT1"]) -names(treatment_to_mediator_coef) <- sub("\\.treatmentT1$", "", names(treatment_to_mediator_coef)) -mediator_to_outcome_coef <- coef(res@outcome@estimates$dysbiosis)[names(treatment_to_mediator_coef)] -indirect_effects <- treatment_to_mediator_coef * mediator_to_outcome_coef - -mediator_effects <- data.frame( - mediator = names(treatment_to_mediator_coef), - treatment_to_mediator_coef = treatment_to_mediator_coef, - mediator_to_outcome_coef = mediator_to_outcome_coef, - indirect_effect = indirect_effects -) - -top_mediators <- mediator_effects %>% +top_mediators <- effects_by_mediator %>% arrange(desc(abs(indirect_effect))) %>% head(10) print(top_mediators) ``` -The non-parametric bootstrap resampling is also used to compute **95% CIs** for -both the overall and mediator-specific pathway indirect effects. +The non-parametric bootstrap resampling is also used to compute **95% CIs** for both the overall and mediator-specific pathway indirect effects. ```{r} #| label: visualization_for_overall_indirect_effects @@ -553,8 +502,7 @@ ggplot(boot_overall$indirect) + ``` -We visualize the **mediator-specific indirect effects** using a forest plot for -pathway aubundance as well. with the point estimates and 95% CIs. +We visualize the **mediator-specific indirect effects** using a forest plot for pathway aubundance as well. with the point estimates and 95% CIs. ```{r} #| label: visualization_for_mediation_specific_indirect_effects @@ -600,11 +548,6 @@ pwy <- gsub("\\.$", "", pwy) # remove ending period # pwy <- trimws(pwy) # final cleanup rownames(summary_df) <- pwy - - - - - ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=significant)) + geom_point() + geom_errorbarh(aes(xmin=lower, xmax=upper), height=0.2) + @@ -620,5 +563,4 @@ ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=signific ``` -Based on the forest plot results, we can see that the tRNA charging pathway has -significant indirect mediation effects on the dysbiosis scores. +Based on the forest plot results, we can see that the tRNA charging pathway has significant indirect mediation effects on the dysbiosis scores. From d2be9147566822bbe24e998dd87135fed3ae6dac Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Fri, 21 Nov 2025 07:37:27 +0800 Subject: [PATCH 5/7] Update multimedia.qmd --- inst/pages/multimedia.qmd | 211 +++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 94 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index 09cd2dda..18ef63f1 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -18,7 +18,17 @@ chapterPreamble() ``` -Building upon the concepts and workflows presented in the previous chapter on mediation analysis, here we demonstrate how to perform **multimodal mediation** **analysis** to identify potential mediators across multiple omics layers. **Multimodal mediation analysis** examines whether and how an exposure (e.g., treatment, diet) affects an outcome (e.g., disease, phenotype) **through** **intermediate variables (mediators) across multiple omics layers** **simultaneously** (e.g., microbiome, metabolomics). It identifies which features across modalities act as mediators, quantifies indirect (mediated) effects while adjusting for covariates, and compares mediation strength across layers, providing mechanistic insights into **how exposures impact outcomes** **via molecular pathways**. +Building upon the concepts and workflows presented in the previous chapter on +mediation analysis, here we demonstrate how to perform **multimodal mediation** +**analysis** to identify potential mediators across multiple omics layers. +**Multimodal mediation analysis** examines whether and how an exposure (e.g., +treatment, diet) affects an outcome (e.g., disease, phenotype) **through** +**intermediate variables (mediators) across multiple omics layers** +**simultaneously** (e.g., microbiome, metabolomics). It identifies which +features across modalities act as mediators, quantifies indirect (mediated) +effects while adjusting for covariates, and compares mediation strength across +layers, providing mechanistic insights into **how exposures impact outcomes** +**via molecular pathways**. ```{r} #| label: fig_multimodal_mediation @@ -54,9 +64,22 @@ digraph multimodal_mediation { ") ``` -In this chapter, we demonstrate multimodal mediation analysis using microbiome data from the iHMP IBDMDB study from the R/Bioconductor package curatedMetagenomicData. [@Lloyd-Price2019]. Unlike conventional mediation analysis, which typically focuses on a single mediator or a small set of mediators, we performed multimodal mediation analysis with R/Bioconductor package \[multimedia\] [@Jiang2025] here, which can handle many potential mediators across multiple data modalities. For example, species-level taxonomic abundances, pathways abundances, and metabolomic profiles can all serve as potential mediators. This method allows us to estimate and rank each mediator’s contribution while accounting for correlations between mediators in a high-dimensional setting. - -In this example, time point (baseline vs. post-treatment) is used as the treatment, and microbiome-based dysbiosis scores are used as the outcome. We will first focus on the relative abundances of gut microbial species as mediators, followed by pathways abundances. +In this chapter, we demonstrate multimodal mediation analysis using microbiome +data from the iHMP IBDMDB study from the R/Bioconductor package +curatedMetagenomicData. [@Lloyd-Price2019]. Unlike conventional mediation +analysis, which typically focuses on a single mediator or a small set of +mediators, we performed multimodal mediation analysis with R/Bioconductor +package \[multimedia\] [@Jiang2025] here, which can handle many potential +mediators across multiple data modalities. For example, species-level taxonomic +abundances, pathways abundances, and metabolomic profiles can all serve as +potential mediators. This method allows us to estimate and rank each mediator’s +contribution while accounting for correlations between mediators in a high- +dimensional setting. + +In this example, time point (baseline vs. post-treatment) is used as the +treatment, and microbiome-based dysbiosis scores are used as the outcome. We +will first focus on the relative abundances of gut microbial species as +mediators, followed by pathways abundances. Generally, we will proceed through the following key steps: @@ -68,15 +91,21 @@ Generally, we will proceed through the following key steps: 4. Fit the multimodal mediation model using the R package multimedia. -5. Interpret both the overall indirect effects and the mediator-specific indirect effects. +5. Interpret both the overall indirect effects and the mediator-specific +indirect effects. -6. Visualize the results using forest plots, histograms, and rankings to prioritize findings. +6. Visualize the results using forest plots, histograms, and rankings to +prioritize findings. 7. Repeat the process for the iHMP microbial pathways. ### Performing multimodal mediation analysis for iHMP species relative abundance {#sec-relative-abundance} -We begin by loading the relative abundance data and calculating dysbiosis scores for each sample based on Bray–Curtis dissimilarity from a healthy reference set. This represents deviation from a healthy microbiome, where higher dysbiosis scores indicate greater microbial imbalance. We then subset to IBD subjects with samples collected at both baseline (visit 1) and post-treatment (visit 21). +We begin by loading the relative abundance data and calculating dysbiosis scores +for each sample based on Bray–Curtis dissimilarity from a healthy reference set. +This represents deviation from a healthy microbiome, where higher dysbiosis +scores indicate greater microbial imbalance. We then subset to IBD subjects with +samples collected at both baseline (visit 1) and post-treatment (visit 21). ```{r} #| label: iHMP_data_preprocessing @@ -85,8 +114,7 @@ We begin by loading the relative abundance data and calculating dysbiosis scores ################## # Load libraries # ################## -remove(list = ls()) -gc() + library(curatedMetagenomicData) library(SummarizedExperiment) library(dplyr) @@ -102,43 +130,31 @@ library(mia) ################## # Load data -tse_relative <- curatedMetagenomicData( +tse <- curatedMetagenomicData( "HMP_2019_ibdmdb.relative_abundance", + rownames = "short", dryrun = FALSE )[[1]] # Assign SampleID for matching -colData(tse_relative)$SampleID <- colnames(tse_relative) +tse[["SampleID"]] <- colnames(tse) # Convert relative_abundance assay to relabundance (which is in [0,1] interval) -tse_relative <- transformAssay(tse_relative, assay.type="relative_abundance", method="relabundance") - -# Optionally, remove the original assay to avoid confusion between the two relative abundance versions -assay(tse_relative, "relative_abundance") <- NULL - -# Remove samples with NA in the relabundance assay -tse_relative <- tse_relative[, colSums(is.na(assay(tse_relative))) == 0] - -# Change the rownames names for variables of interest -rownames(tse_relative) <- sub('.*s__', '', rownames(tse_relative)) -rownames(tse_relative) <- str_remove_all(rownames(tse_relative), "\\[|\\]") -rownames(tse_relative) <- str_replace_all(rownames(tse_relative), "[: \\.,]", "_") -safe_names <- make.names(rownames(tse_relative), unique = TRUE) -rownames(tse_relative) <- safe_names +tse <- transformAssay(tse, assay.type="relative_abundance", method="relabundance") ######################## # Reference nonIBD set # ######################## # Reference set: healthy -tse_relative$disease_binary <- tse_relative$disease == "healthy" +tse$disease_binary <- tse$disease == "healthy" ######################################## # Calculate Bray-Curtis dissimilarity # ######################################## # Bray-Curtis dissimilarity -diss <- as.matrix(getDissimilarity(tse_relative, method = "bray", na.rm=TRUE, assay.type = "relabundance")) +diss <- as.matrix(getDissimilarity(tse, method = "bray", na.rm=TRUE, assay.type = "relabundance")) ################################# # Calculate the dysbiosis score # @@ -146,12 +162,12 @@ diss <- as.matrix(getDissimilarity(tse_relative, method = "bray", na.rm=TRUE, as # Calculate dysbiosis score for each sample # For each sample i, we compute the median distance between sample i and all reference samples in `ref_set` -sample_ids <- colData(tse_relative)$SampleID +sample_ids <- tse[["SampleID"]] -colData(tse_relative)$dysbiosis <- sapply(seq_along(tse_relative$disease_binary), function(i) { +tse[["dysbiosis"]] <- sapply(seq_along(tse$disease_binary), function(i) { # Logical vector indicating all other reference samples (excluding i) - ref_others <- tse_relative$disease_binary & (sample_ids != sample_ids[i]) + ref_others <- tse$disease_binary & (sample_ids != sample_ids[i]) # Compute median distance between sample i and these reference samples median(diss[i, ref_others], na.rm = TRUE) @@ -163,32 +179,36 @@ colData(tse_relative)$dysbiosis <- sapply(seq_along(tse_relative$disease_binary) ############################################################# # Keep IBD only -tse_relative <- tse_relative[, colData(tse_relative)$disease != "healthy"] +tse <- tse[, tse[["disease"]] != "healthy"] # Visit 1 (baseline) or 25 (post) -tse_relative <- tse_relative[, colData(tse_relative)$visit_number %in% c(1, 21)] +tse <- tse[, tse[["visit_number"]] %in% c(1, 21)] # Keep subjects with both visits -keep_subjects <- names(which(table(colData(tse_relative)$subject_id) == 2)) -tse_relative <- tse_relative[, colData(tse_relative)$subject_id %in% keep_subjects] +keep_subjects <- names(which(table(tse[["subject_id"]]) == 2)) +tse <- tse[, tse[["subject_id"]] %in% keep_subjects] # Define time point label -colData(tse_relative)$Time_point <- ifelse( - colData(tse_relative)$visit_number == 1, "T0", "T1" +tse[["Time_point"]] <- ifelse( + tse[["visit_number"]] == 1, "T0", "T1" ) # Define treatment: Pre and Post (Baseline vs. Post-baseline) -colData(tse_relative)$treatment <- factor(colData(tse_relative)$Time_point, levels = c("T0", "T1")) +tse[["treatment"]] <- factor(tse[["Time_point"]], levels = c("T0", "T1")) ``` -Next, we will define the mediation analysis components. By using the processed data, we define: +Next, we will define the mediation analysis components. By using the processed +data, we define: - Treatment: time point (baseline vs. post-baseline) - Outcome: dysbiosis score - Mediators: scaled species abundances -These are bundled into a mediation data object (exper), which is passed into the multimedia function to estimate path models for the treatment-to-mediator (α path), mediator-to-outcome (β path), and the combined indirect effect (α×β path). +These are bundled into a mediation data object (exper), which is passed into the +multimedia function to estimate path models for the treatment-to-mediator (α +path), mediator-to-outcome (β path), and the combined indirect effect (α×β +path). ```{r} #| label: define_mediation_data_frame @@ -198,16 +218,14 @@ These are bundled into a mediation data object (exper), which is passed into the # Define mediation data set # ############################# -# Outcome: Dysbiosis Score -colData(tse_relative)$dysbiosis <- as.numeric(colData(tse_relative)$dysbiosis) - # Mediators (scaled, directly from SE): Species -tse_relative <- transformAssay(tse_relative, assay.type="relabundance", method="standardize", name="scaled") -assays(tse_relative) <- SimpleList(mediators = assay(tse_relative, "scaled")) +tse <- transformAssay(tse, assay.type="relabundance", method="standardize", name="scaled") ``` -We now fit the multimodal mediation model and inspect the overall indirect and direct effects to understand whether microbiome composition mediates the treatment's effect on dysbiosis. +We now fit the multimodal mediation model and inspect the overall indirect and +direct effects to understand whether microbiome composition mediates the +treatment's effect on dysbiosis. ```{r} #| label: overall_mediation_analysis @@ -218,19 +236,15 @@ We now fit the multimodal mediation model and inspect the overall indirect and d ############################# # Convert the TreeSummarizedExperiment to be SummarizedExperiment -se_relative <- tryCatch(as(tse_relative, "SummarizedExperiment"), error = function(e) NULL) -if (is.null(rownames(se_relative)) || any(rownames(se_relative) == "")) { - rn <- rownames(tse_relative) - stopifnot(length(rn) == nrow(se_relative)) - rownames(se_relative) <- rn - rownames(rowData(se_relative)) <- rn # keep rowData aligned too -} +se_relative <- as(tse, "SummarizedExperiment") +rownames(se_relative) <- rownames(tse) +rownames(rowData(se_relative)) <- rownames(tse) # keep rowData aligned too # Use indices for mediators medi_idx <- seq_len(nrow(se_relative)) # Create the Mediation Data object -exper <- multimedia::mediation_data( +exper <- mediation_data( se_relative, outcomes = "dysbiosis", treatments = "treatment", @@ -248,7 +262,6 @@ print(direct_effect(res, exper)) ``` - ```{r} #| label: specific_mediation #| message: false @@ -265,7 +278,7 @@ extract_effects <- function(res, treat_term = "treatmentT1", outcome = "dysbiosi # β: mediator -> outcome (single model with all mediators) beta <- unname(coef(res@outcome@estimates[[outcome]])[names(alpha)]) - tibble::tibble( + tibble( mediator = names(alpha), alpha = as.numeric(alpha), beta = as.numeric(beta), @@ -285,7 +298,9 @@ print(top_mediators) ``` -To quantify uncertainty around the indirect effects, we use non-parametric bootstrap resampling to compute **95% confidence intervals** for both the overall and mediator-specific indirect effects. +To quantify uncertainty around the indirect effects, we use non-parametric +bootstrap resampling to compute **95% confidence intervals** for both the +overall and mediator-specific indirect effects. ```{r} #| label: visualization_for_overall_indirect_effects @@ -312,7 +327,9 @@ ggplot(boot_overall$indirect) + ``` -Finally, we visualize the **mediator-specific indirect effects** using a forest plot, displaying point estimates and 95% CIs to highlight the most influential mediators. +Finally, we visualize the **mediator-specific indirect effects** using a forest +plot, displaying point estimates and 95% CIs to highlight the most influential +mediators. ```{r} #| label: visualization_for_mediation_specific_indirect_effects @@ -378,49 +395,56 @@ ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=signific ``` -Based on the forest plot results, we can see that the Agathobaculum butyriciproducens and Escherichia coli pathways has significant indirect mediation effects. +Based on the forest plot results, we can see that the Agathobaculum +butyriciproducens and Escherichia coli pathways has significant indirect +mediation effects. -Next we repeat the mediation analysis with the iHMP pathway abundances using the same setup as before, using the time point as the treatment, dysbiosis score as the outcome, but pathways abundances as the mediation variables instead of the relative species abundance. +Next we repeat the mediation analysis with the iHMP pathway abundances using the +same setup as before, using the time point as the treatment, dysbiosis score as +the outcome, but pathways abundances as the mediation variables instead of the +relative species abundance. ### Performing multimodal mediation analysis for pathways abundances {#sec-pathways-abundances} -Let's extract the pathway abundance data from the iHMP data first, as well as other data pre-processing. +Let's extract the pathway abundance data from the iHMP data first, as well as +other data pre-processing. ```{r} #| label: load_pkg_data #| message: false -tse_pathway <- curatedMetagenomicData( +tse <- curatedMetagenomicData( "HMP_2019_ibdmdb.pathway_abundance", + rownames = "short", dryrun = FALSE )[[1]] -colData(tse_pathway)$SampleID <- colnames(tse_pathway) -tse_pathway <- tse_pathway[, colSums(is.na(assay(tse_pathway))) == 0] +tse[["SampleID"]] <- colnames(tse) +tse <- tse[, colSums(is.na(assay(tse))) == 0] -rows_to_keep <- !grepl("\\|", rownames(tse_pathway)) +rows_to_keep <- !grepl("\\|", rownames(tse)) rows_to_keep[which(rows_to_keep)[1:2]] <- FALSE -tse_pathway <- tse_pathway[rows_to_keep, ] -assay(tse_pathway) <- assay(tse_pathway) / 100 -tse_pathway$disease_binary <- colData(tse_pathway)$disease == "healthy" - -diss <- as.matrix(vegdist(t(assay(tse_pathway)), method = "bray", na.rm = TRUE)) -colData(tse_pathway)$dysbiosis <- sapply(seq_along(tse_pathway$disease_binary), function(i) { - median(diss[i, tse_pathway$disease_binary & - (colData(tse_pathway)$SampleID != colData(tse_pathway)$SampleID[i])], +tse <- tse[rows_to_keep, ] +assay(tse) <- assay(tse) / 100 +tse$disease_binary <- tse[["disease"]] == "healthy" + +diss <- as.matrix(vegdist(t(assay(tse)), method = "bray", na.rm = TRUE)) +tse[["dysbiosis"]] <- sapply(seq_along(tse$disease_binary), function(i) { + median(diss[i, tse$disease_binary & + (tse[["SampleID"]] != tse[["SampleID"]][i])], na.rm = TRUE) }) -tse_pathway <- tse_pathway[, colData(tse_pathway)$disease != "healthy"] -tse_pathway <- tse_pathway[, colData(tse_pathway)$visit_number %in% c(1, 27)] -keep_subjects <- names(which(table(colData(tse_pathway)$subject_id) == 2)) -tse_pathway <- tse_pathway[, colData(tse_pathway)$subject_id %in% keep_subjects] +tse <- tse[, tse[["disease"]] != "healthy"] +tse <- tse[, tse[["visit_number"]] %in% c(1, 27)] +keep_subjects <- names(which(table(tse[["subject_id"]]) == 2)) +tse <- tse[, tse[["subject_id"]] %in% keep_subjects] -colData(tse_pathway)$Time_point <- ifelse( - colData(tse_pathway)$visit_number == 1, "T0", "T1" +tse[["Time_point"]] <- ifelse( + tse[["visit_number"]] == 1, "T0", "T1" ) -colData(tse_pathway)$treatment <- factor(colData(tse_pathway)$Time_point, levels = c("T0", "T1")) +tse[["treatment"]] <- factor(tse[["Time_point"]], levels = c("T0", "T1")) ``` @@ -430,25 +454,21 @@ We will define the mediation analysis data set then. #| label: define_mediation_data_frame #| message: false -colData(tse_pathway)$dysbiosis <- as.numeric(colData(tse_pathway)$dysbiosis) -tse_pathway <- transformAssay(tse_pathway, assay.type="pathway_abundance", method="standardize", name="scaled") -assays(tse_pathway) <- SimpleList(mediators = assay(tse_pathway, "scaled")) +tse <- transformAssay(tse, assay.type="pathway_abundance", method="standardize", name="scaled") ``` -Next, we continue to fit the multimodal mediation model, and extract the overall and mediation-specific indirect effects. +Next, we continue to fit the multimodal mediation model, and extract the overall +and mediation-specific indirect effects. ```{r} #| label: mediation_analysis #| message: false -se_pathway <- tryCatch(as(tse_pathway, "SummarizedExperiment"), error = function(e) NULL) -if (is.null(rownames(se_pathway)) || any(rownames(se_pathway) == "")) { - rn <- rownames(tse_pathway) - stopifnot(length(rn) == nrow(se_pathway)) - rownames(se_pathway) <- rn - rownames(rowData(se_pathway)) <- rn # keep rowData aligned too -} +se_pathway <- as(tse, "SummarizedExperiment") +rownames(se_pathway) <- rownames(tse) +rownames(rowData(se_pathway)) <- rownames(tse) # keep rowData aligned too + raw <- rownames(se_pathway) clean1 <- str_remove_all(raw, "\\[|\\]") @@ -458,7 +478,7 @@ rownames(se_pathway) <- safe_names medi_idx <- seq_len(nrow(se_pathway)) -exper <- multimedia::mediation_data( +exper <- mediation_data( se_pathway, outcomes = "dysbiosis", treatments = "treatment", @@ -480,7 +500,8 @@ print(top_mediators) ``` -The non-parametric bootstrap resampling is also used to compute **95% CIs** for both the overall and mediator-specific pathway indirect effects. +The non-parametric bootstrap resampling is also used to compute **95% CIs** for +both the overall and mediator-specific pathway indirect effects. ```{r} #| label: visualization_for_overall_indirect_effects @@ -502,7 +523,8 @@ ggplot(boot_overall$indirect) + ``` -We visualize the **mediator-specific indirect effects** using a forest plot for pathway aubundance as well. with the point estimates and 95% CIs. +We visualize the **mediator-specific indirect effects** using a forest plot for +pathway aubundance as well. with the point estimates and 95% CIs. ```{r} #| label: visualization_for_mediation_specific_indirect_effects @@ -563,4 +585,5 @@ ggplot(summary_df, aes(x=estimate, y=reorder(mediator, estimate), color=signific ``` -Based on the forest plot results, we can see that the tRNA charging pathway has significant indirect mediation effects on the dysbiosis scores. +Based on the forest plot results, we can see that the tRNA charging pathway has +significant indirect mediation effects on the dysbiosis scores. From 00fe5faf067f5bbca1ec3136a00e677190ee504e Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Mon, 29 Dec 2025 20:59:30 +0800 Subject: [PATCH 6/7] Update multimedia.qmd --- inst/pages/multimedia.qmd | 122 ++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index 18ef63f1..f5b9bd8f 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -137,7 +137,7 @@ tse <- curatedMetagenomicData( )[[1]] # Assign SampleID for matching -tse[["SampleID"]] <- colnames(tse) +tse$SampleID <- colnames(tse) # Convert relative_abundance assay to relabundance (which is in [0,1] interval) tse <- transformAssay(tse, assay.type="relative_abundance", method="relabundance") @@ -161,17 +161,15 @@ diss <- as.matrix(getDissimilarity(tse, method = "bray", na.rm=TRUE, assay.type ################################# # Calculate dysbiosis score for each sample -# For each sample i, we compute the median distance between sample i and all reference samples in `ref_set` -sample_ids <- tse[["SampleID"]] - -tse[["dysbiosis"]] <- sapply(seq_along(tse$disease_binary), function(i) { +# For each sample i, we compute the median distance between sample i and all reference samples +sample_ids <- tse$SampleID +tse$dysbiosis <- sapply(seq_along(tse$disease_binary), function(i) { # Logical vector indicating all other reference samples (excluding i) ref_others <- tse$disease_binary & (sample_ids != sample_ids[i]) # Compute median distance between sample i and these reference samples median(diss[i, ref_others], na.rm = TRUE) - }) ############################################################# @@ -179,22 +177,20 @@ tse[["dysbiosis"]] <- sapply(seq_along(tse$disease_binary), function(i) { ############################################################# # Keep IBD only -tse <- tse[, tse[["disease"]] != "healthy"] +tse <- tse[, tse$disease != "healthy"] -# Visit 1 (baseline) or 25 (post) -tse <- tse[, tse[["visit_number"]] %in% c(1, 21)] +# Visit 1 (baseline) or 21 (post) +tse <- tse[, tse$visit_number %in% c(1, 21)] # Keep subjects with both visits -keep_subjects <- names(which(table(tse[["subject_id"]]) == 2)) -tse <- tse[, tse[["subject_id"]] %in% keep_subjects] +keep_subjects <- names(which(table(tse$subject_id) == 2)) +tse <- tse[, tse$subject_id %in% keep_subjects] # Define time point label -tse[["Time_point"]] <- ifelse( - tse[["visit_number"]] == 1, "T0", "T1" -) +tse$Time_point <- ifelse(tse$visit_number == 1, "T0", "T1") # Define treatment: Pre and Post (Baseline vs. Post-baseline) -tse[["treatment"]] <- factor(tse[["Time_point"]], levels = c("T0", "T1")) +tse$treatment <- factor(tse$Time_point, levels = c("T0", "T1")) ``` @@ -235,17 +231,12 @@ treatment's effect on dysbiosis. # Run mediation (non-delta) # ############################# -# Convert the TreeSummarizedExperiment to be SummarizedExperiment -se_relative <- as(tse, "SummarizedExperiment") -rownames(se_relative) <- rownames(tse) -rownames(rowData(se_relative)) <- rownames(tse) # keep rowData aligned too - # Use indices for mediators -medi_idx <- seq_len(nrow(se_relative)) +medi_idx <- seq_len(nrow(tse)) # Create the Mediation Data object exper <- mediation_data( - se_relative, + tse, outcomes = "dysbiosis", treatments = "treatment", mediators = medi_idx @@ -270,29 +261,13 @@ print(direct_effect(res, exper)) # Mediator-specific effects ############################# -# Helper function to extract the effects -extract_effects <- function(res, treat_term = "treatmentT1", outcome = "dysbiosis") { - # α: treatment -> each mediator (one model per mediator) - alpha <- sapply(res@mediation@estimates, function(m) unname(coef(m)[treat_term])) - - # β: mediator -> outcome (single model with all mediators) - beta <- unname(coef(res@outcome@estimates[[outcome]])[names(alpha)]) - - tibble( - mediator = names(alpha), - alpha = as.numeric(alpha), - beta = as.numeric(beta), - indirect_effect = alpha * beta - ) -} - # Extract the treatment-to-mediator path coefficients -effects_by_mediator <- extract_effects(res, treat_term = "treatmentT1", outcome = "dysbiosis") +effects_by_mediator <- indirect_pathwise(res, exper) # Rank top_mediators <- effects_by_mediator %>% arrange(desc(abs(indirect_effect))) %>% - head(10) + head(20) print(top_mediators) @@ -404,6 +379,33 @@ same setup as before, using the time point as the treatment, dysbiosis score as the outcome, but pathways abundances as the mediation variables instead of the relative species abundance. +```{r} +#| label: default_visualization_plot_mediators_pathway +#| message: false + +# pick top mediators from the already-computed bootstrap summary (fast) +top_meds <- summary_df %>% + dplyr::arrange(dplyr::desc(abs(estimate))) %>% + dplyr::slice_head(n = 12) %>% + dplyr::pull(mediator) + +# construct a minimal effect table that plot_mediators() can use +ie_pw_fast <- tibble::tibble( + outcome = "dysbiosis", + mediator = top_meds, + direct_setting = levels(tse$treatment)[1], + contrast = paste(levels(tse$treatment)[1], "-", levels(tse$treatment)[2]), + indirect_effect = summary_df$estimate[match(top_meds, summary_df$mediator)] +) + +plot_mediators(ie_pw_fast, exper, n_panels = 12) + + +``` +Mediators are standardized (z-scored) for comparability across features; hence values may be negative. Besides, many microbial features are sparse; several mediators exhibit near-zero values for most samples, producing vertical bands. + + + ### Performing multimodal mediation analysis for pathways abundances {#sec-pathways-abundances} Let's extract the pathway abundance data from the iHMP data first, as well as @@ -419,32 +421,30 @@ tse <- curatedMetagenomicData( dryrun = FALSE )[[1]] -tse[["SampleID"]] <- colnames(tse) +tse$SampleID <- colnames(tse) tse <- tse[, colSums(is.na(assay(tse))) == 0] rows_to_keep <- !grepl("\\|", rownames(tse)) rows_to_keep[which(rows_to_keep)[1:2]] <- FALSE tse <- tse[rows_to_keep, ] assay(tse) <- assay(tse) / 100 -tse$disease_binary <- tse[["disease"]] == "healthy" +tse$disease_binary <- tse$disease == "healthy" diss <- as.matrix(vegdist(t(assay(tse)), method = "bray", na.rm = TRUE)) -tse[["dysbiosis"]] <- sapply(seq_along(tse$disease_binary), function(i) { +tse$dysbiosis <- sapply(seq_along(tse$disease_binary), function(i) { median(diss[i, tse$disease_binary & - (tse[["SampleID"]] != tse[["SampleID"]][i])], + (tse$SampleID != tse$SampleID[i])], na.rm = TRUE) }) -tse <- tse[, tse[["disease"]] != "healthy"] -tse <- tse[, tse[["visit_number"]] %in% c(1, 27)] -keep_subjects <- names(which(table(tse[["subject_id"]]) == 2)) -tse <- tse[, tse[["subject_id"]] %in% keep_subjects] +tse <- tse[, tse$disease != "healthy"] +tse <- tse[, tse$visit_number %in% c(1, 27)] +keep_subjects <- names(which(table(tse$subject_id) == 2)) +tse <- tse[, tse$subject_id %in% keep_subjects] -tse[["Time_point"]] <- ifelse( - tse[["visit_number"]] == 1, "T0", "T1" -) +tse$Time_point <- ifelse(tse$visit_number == 1, "T0", "T1") -tse[["treatment"]] <- factor(tse[["Time_point"]], levels = c("T0", "T1")) +tse$treatment <- factor(tse$Time_point, levels = c("T0", "T1")) ``` @@ -465,21 +465,16 @@ and mediation-specific indirect effects. #| label: mediation_analysis #| message: false -se_pathway <- as(tse, "SummarizedExperiment") -rownames(se_pathway) <- rownames(tse) -rownames(rowData(se_pathway)) <- rownames(tse) # keep rowData aligned too - - -raw <- rownames(se_pathway) +raw <- rownames(tse) clean1 <- str_remove_all(raw, "\\[|\\]") clean2 <- str_replace_all(clean1, "[: \\.,]", "_") safe_names <- make.names(clean2, unique = TRUE) -rownames(se_pathway) <- safe_names +rownames(tse) <- safe_names -medi_idx <- seq_len(nrow(se_pathway)) +medi_idx <- seq_len(nrow(tse)) exper <- mediation_data( - se_pathway, + tse, outcomes = "dysbiosis", treatments = "treatment", mediators = medi_idx @@ -491,11 +486,12 @@ summary(res) print(indirect_overall(res, exper)) print(direct_effect(res, exper)) -effects_by_mediator <- extract_effects(res, treat_term = "treatmentT1", outcome = "dysbiosis") +effects_by_mediator <- indirect_pathwise(res, exper) top_mediators <- effects_by_mediator %>% arrange(desc(abs(indirect_effect))) %>% - head(10) + head(20) + print(top_mediators) ``` From 2c1b06cb633d33988700a6539686eef01517532c Mon Sep 17 00:00:00 2001 From: YihanLiu4023 <149611140+YihanLiu4023@users.noreply.github.com> Date: Mon, 26 Jan 2026 07:22:25 +0800 Subject: [PATCH 7/7] Update multimedia.qmd --- inst/pages/multimedia.qmd | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/inst/pages/multimedia.qmd b/inst/pages/multimedia.qmd index f5b9bd8f..a16d5118 100644 --- a/inst/pages/multimedia.qmd +++ b/inst/pages/multimedia.qmd @@ -165,11 +165,13 @@ diss <- as.matrix(getDissimilarity(tse, method = "bray", na.rm=TRUE, assay.type sample_ids <- tse$SampleID tse$dysbiosis <- sapply(seq_along(tse$disease_binary), function(i) { + # Logical vector indicating all other reference samples (excluding i) ref_others <- tse$disease_binary & (sample_ids != sample_ids[i]) # Compute median distance between sample i and these reference samples median(diss[i, ref_others], na.rm = TRUE) + }) ############################################################# @@ -231,12 +233,16 @@ treatment's effect on dysbiosis. # Run mediation (non-delta) # ############################# +# Convert the TreeSummarizedExperiment to be SummarizedExperiment +se_relative <- as(tse, "SummarizedExperiment") +if (is.null(rownames(se_relative))) rownames(se_relative) <- rownames(rowData(se_relative)) <- make.names(rownames(tse), unique = TRUE) + # Use indices for mediators -medi_idx <- seq_len(nrow(tse)) +medi_idx <- seq_len(nrow(se_relative)) # Create the Mediation Data object exper <- mediation_data( - tse, + se_relative, outcomes = "dysbiosis", treatments = "treatment", mediators = medi_idx @@ -286,7 +292,7 @@ overall and mediator-specific indirect effects. ##################################### set.seed(12345) -boot_overall <- bootstrap(mdl, exper, c(indirect = indirect_overall), B = 100) +boot_overall <- bootstrap(mdl, exper, c(indirect = indirect_overall), B = 10) summary(boot_overall$indirect) quantile(boot_overall$indirect$indirect_effect, probs = c(0.025, 0.975))