up

microbiome · Jan 31, 2025 · 65ee36c · 65ee36c
1 parent b22836b
commit 65ee36c
Show file tree

Hide file tree

Showing 2 changed files with 208 additions and 27 deletions.
diff --git a/R/getStability.R b/R/getStability.R
@@ -4,18 +4,52 @@
 #' @export
 #'
 #' @title
+#' Calculate stability
 #'
 #' @description
+#' ADD.
 #'
 #' @details
+#' The stability is calculated by first defining reference point, \eqn{R_{f}},
+#' for each feature. This is defined by taking mean of range of values
+#'
+#' \deqn{R_{f} = \frac{x_{f, min} + x_{f, max}}{2}}
+#'
+#' where \eqn{f} denotes a single feature and \eqn{x} abundance.
+#'
+#' Then difference between consecutive time points, \eqn{\Delta_{f, x}},
+#'
+#' \deqn{\Delta_{f, x} = ∣x_{f, t} - x_{f, t-1}∣}
+#'
+#' and difference between the previous time point and reference,
+#' \eqn{\Delta_{f. R}}, are calculated:
+#'
+#' \deqn{\Delta_{f. R} = ∣x_{f, t-1} - R_{f}∣}
+#'
+#' where \eqn{t} denotes time point.
+#'
+#' Optionally, time difference \eqn{\Delta_{f, t}} is calculated
+#'
+#' \deqn{\Delta_{f, t} = t_{f, t-1} - t_{f, t-1}}
+#'
+#' The stability coefficient \eqn{s_{f}} is calculated with correlation
+#'
+#' \deqn{s_{f} = corr(\Delta_{f, x}, \Delta_{f, R})}
+#'
+#' or with linear model as follows
+#'
+#' \deqn{s_{f} = \frac{ \Delta_{f, x} - \beta_{0} - \beta_{2}*\Delta_{f, t} -
+#' \epsilon_{f} }{ \Delta_{f, R} }}
 #'
 #' @references
+#' Lahti L, et al. (2014) Tipping elements in the human intestinal ecosystem.
+#' Nat Commun. doi: 10.1038/ncomms5344
 #'
 #' @return
+#' Add here
 #'
 #' @inheritParams addBaselineDivergence
 #'
-#'
 #' @param ... additional arguments.
 #' \itemize{
 #'   \item \code{time.interval}: \code{Integer scalar}. Indicates the increment
@@ -31,6 +65,21 @@
 #' data(minimalgut)
 #' tse <- minimalgut
 #'
+#' # Apply clr transformation
+#' tse <- transformAssay(tse, method = "rclr")
+#'
+#' # Calculate stability single system
+#' tse_sub <- tse[, tse[["StudyIdentifier"]] == "Bioreactor A"]
+#' res <- getStability(tse_sub, assay.type = "rclr", time.col = "Time.hr")
+#' res |> head()
+#'
+#' # Calculate stability for each  system simultaneously by taking time
+#' # difference into account
+#' res <- getStability(
+#'     tse, assay.type = "rclr", time.col = "Time.hr",
+#'     group = "StudyIdentifier", mode = "lm")
+#' res |> head()
+#'
 #' @seealso
 #' \code{\link[miaTime:getBimodality]{getBimodality()}}
 NULL
@@ -53,26 +102,24 @@ setMethod("getStability", signature = c(x = "SummarizedExperiment"),
         ########################### Input check end ############################
         # Get data into long format
         df <- meltSE(x, assay.type, add.col = c(time.col, group))
-        # Calculate metrics
+        # Calculate metrics for stability calculation
         df <- .calculate_stability_metrics(
             df, assay.type, time.col, group, ...)
         # Calculate stability based on calculated metrics
         res <- .calculate_stability(df, group, ...)
-        # Sort the data
+        # Sort the data so that it matches with original order
         res <- res[rownames(x), , drop = FALSE]
         return(res)
     }
 )
 
 ################################ HELP FUNCTIONS ################################
 
-# This function calculates the growth metrics from the data.frame.
+# This function calculates the metrics that will be later used for calculating
+# of stability.
 #' @importFrom dplyr arrange group_by sym summarise mutate select
 .calculate_stability_metrics <- function(
         df, assay.type, time.col, group, time.interval = 1L, ...) {
-    # This following line is to suppress "no visible binding for" messages
-    # in cmdcheck
-    FeatureID <- .data <- value <- NULL
     #
     temp <- .check_input(time.interval, list("numeric scalar"))
     # If there are replicated samples, give warning that average is calculated
@@ -89,67 +136,99 @@ setMethod("getStability", signature = c(x = "SummarizedExperiment"),
     } else{
         df <- df |> group_by(FeatureID, !!sym(time.col))
     }
-    # Calculate difference between current and previous time point and
-    # difference to the reference point which is the mean value.
-    # We do not have to aggregate averages as they are naturally taken into
-    # account.
+    # Calculate metrics for stability
     df <- df |>
         # Summarize duplicated samples by taking an average
         summarise(value = mean(.data[[assay.type]], na.rm = TRUE)) |>
         # Calculate metrics
         mutate(
-            diff = value - lag(value, n = time.interval),
+            # Difference between current and previous time point
+            curr_vs_prev = value - lag(value, n = time.interval),
+            # Difference between current time point and feature's median
+            prev_vs_ref = lag(value, n = time.interval) -
+                mean(range(value, na.rm = TRUE)),
+            # Time difference between consecutive time points
             time_diff = .data[[time.col]] -
-                lag(.data[[time.col]], n = time.interval),
-            start = lag(value, n = time.interval),
-            reference = mean(range(value, na.rm = TRUE)),
-            reference_diff = start - reference
+                lag(.data[[time.col]], n = time.interval)
             ) |>
         ungroup()
     # Filter to remove NAs created by lag
-    df <- df |>
-        filter_at(vars(diff, time_diff, reference_diff), all_vars(!is.na(.)))
+    df <- df |> filter_at(
+        vars(curr_vs_prev, prev_vs_ref, time_diff), all_vars(!is.na(.)))
+    # Check that there are rows left. It might be that the lag is too large
+    # for the dataset. For example, if there is only one time point, we cannot
+    # calculate the difference between consecutive time points.
     if( nrow(df) == 0L ){
-        stop("No rows left after calculating differences between oncecutive time points.", call. = FALSE)
+        stop("Cannot calculate the difference between consecutive time points ",
+            "with 'ẗime.interval=", time.interval, "'.", call. = FALSE)
     }
     return(df)
 }
 
+# This function facilitates the actual calculation of stability based on the
+# previously calculated metrics.
 .calculate_stability <- function(df, group, ...){
+    # Group based on feature so that we calculate stability for each feature.
+    # If sample group was also specified, apply also it so that we calculate
+    # stability for4 each feature-patient/system pair.
     if( !is.null(group) ){
         df <- df |> group_by(!!sym(group), FeatureID)
     } else{
         df <- df |> group_by(FeatureID)
     }
-    #
+    # Calculate stability. We use help function to apply and control how the
+    # stability is calculated.
     df <- df |> summarise(
         stability = .calc_stability(.data, ...)
     )
-    #
+    # If we had sample groups, we put each sample group to own columns.
+    # Otherwise we have single column which contain stability values for each
+    # feature.
     if( !is.null(group) ){
         df <- df |>
             pivot_wider(names_from = group, values_from = stability)
     }
+    # Convert tibble to DF and wrangle rownames.
     df <- DataFrame(df, check.names = FALSE)
     rownames(df) <- df[["FeatureID"]]
     df[["FeatureID"]] <- NULL
     return(df)
 }
 
-.calc_stability <- function(.data, method = "correlation", ...){
-    values <- abs( .data[["diff"]] )
-    ref <- abs( .data[["reference_diff"]] )
+# This function is used to control how the stability is calculated. As an input
+# we get values for single system and feature.
+.calc_stability <- function(.data, mode = "correlation", ...){
+    .check_input(mode, list("character scalar"), list("correlation", "lm"))
+    # Get metrics
+    values <- abs( .data[["curr_vs_prev"]] )
+    ref <- abs( .data[["prev_vs_ref"]] )
     time <- .data[["time_diff"]]
 
     res <- NA
-    if( method == "correlation" ){
+    if( mode == "correlation" ){
+        # Apply simple correlation that do not take into account time
+        # difference. Get the correlation between "difference between
+        # consecutive time points" and "difference between previous time point
+        # and reference point".
         # If taxa is very rare, it might be that its standard deviation of
         # 'ref' is 0. Suppress the warning "the standard deviation is zero"
-        res <- cor(values, ref) |> suppressWarnings()
-    } else if( method == "lm" ){
+        res <- cor(values, ref, ...) |> suppressWarnings()
+    } else if( mode == "lm" ){
+        # Apply linear model that takes into account time difference between
+        # consecutive time points. Get how much the "difference between previous
+        # time point and reference point" explains the "difference between
+        # consecutive time points".
         model <- lm(values ~ ref + time, data = data.frame(values, ref, time))
         res <- coef(model)
         res <- res[["ref"]]
     }
+    # The result is a single value that tells how much the distance from
+    # median explains the difference between consecutive time points.
+    # If the value is positive, we can say that the system is stable:
+    # the abundance of taxon is predictable; with larger distance from
+    # reference the changes are larger and vice versa.
+    # If the value is negative, we can say that the system is unstable:
+    # the magnitude fluctuations are negatively linked with the abundance level;
+    # with lower abundance the changes are larger.
     return(res)
 }
diff --git a/man/getStability.Rd b/man/getStability.Rd