diff --git a/DESCRIPTION b/DESCRIPTION index 7c1f5aa..aa46d92 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: miaTime Type: Package Title: Microbiome Time Series Analysis -Version: 0.99.2 +Version: 0.99.3 Authors@R: c(person(given = "Leo", family = "Lahti", role = c("aut"), email = "leo.lahti@iki.fi", diff --git a/NAMESPACE b/NAMESPACE index 17b3e7d..af83c00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,3 +27,4 @@ importFrom(dplyr,mutate) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(tidyr,pivot_wider) +importFrom(tidyr,unnest) diff --git a/NEWS b/NEWS index f49a19e..4ba2d9f 100755 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +Changes in version 0.99.3 ++ Handle repeated samples in divergence functions by calculating average + Changes in version 0.99.1 + Added bimodality functions diff --git a/R/getBaselineDivergence.R b/R/getBaselineDivergence.R index f0b225c..a3c3fa6 100644 --- a/R/getBaselineDivergence.R +++ b/R/getBaselineDivergence.R @@ -135,8 +135,7 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"), ########################### INPUT CHECK END ############################ # Add baseline samples to colData args <- .add_reference_samples_to_coldata( - x, time.col, group, reference, assay.type, method, - reference.method = "baseline", ...) + x, time.col, group, reference, reference.method = "baseline", ...) # Create an argument list. Do not include altexp as it is already taken # into account. args <- c( @@ -146,12 +145,12 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"), ) # Calculate divergences res <- do.call(getDivergence, args) - # Add time difference - x <- args[["x"]] - reference <- args[["reference"]] - time_res <- .get_time_difference(x, time.col, reference) + # Get time difference + args[["time.col"]] <- time.col + time_res <- do.call(.get_time_difference, args) # Create a DF to return - res <- .convert_divergence_to_df(x, res, time_res, reference, ...) + args <- c(args, list(res = res, time_res = time_res)) + res <- do.call(.convert_divergence_to_df, args) return(res) } ) @@ -231,7 +230,13 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), if( is.null(reference) ){ ref <- .get_reference_samples( cd, time.col, time.interval, group, reference.method) - cd[[ref.name]] <- ref + # If the data includes repeated timepoints, the data has multiple + # reference samples for each sample. + ref <- ref[ rownames(cd) ] + if( all(lengths(ref) == 1L) ){ + ref <- unlist(ref) + } + cd[[ref.name]] <- unname(ref) reference <- ref.name } # If reference was specified, check that it is specifying samples @@ -271,7 +276,7 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), # Add modified colData back to TreeSE colData(x) <- cd # The returned value includes the TreeSE along with reference - # column name because it might be that we have modified it. + # column name res <- list(x = x, reference = reference) return(res) } @@ -280,6 +285,7 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), # Alternatively, it returns the previous ith sample for each sample in each # group. #' @importFrom dplyr group_by mutate arrange ungroup lag +#' @importFrom tidyr unnest .get_reference_samples <- function( df, time.col, time.interval, group, reference.method){ # This following line is to suppress "no visible binding for" messages @@ -288,8 +294,8 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), rowname_col <- "temporary_rownames_column" reference_col <- "temporary_reference_column" - # Store rownames and add rownames as a column - df[[rowname_col]] <- original_order <- rownames(df) + # Add rownames as a column + df[[rowname_col]] <- rownames(df) # Convert to data.frame and group data based on group df <- df |> as.data.frame() |> @@ -299,47 +305,90 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"), if( reference.method == "baseline" ){ # Find first timepoint within a group df <- df |> - mutate(!!reference_col := - .data[[rowname_col]][which.min(.data[[time.col]])]) + mutate(!!reference_col := .get_baseline_samples( + .data, time.col, rowname_col)) } else if( reference.method == "stepwise" ){ # For each sample, get the previous ith sample. # For each subject, get previous sample based on time. df <- df |> - mutate(!!reference_col := lag( - .data[[rowname_col]], n = time.interval, - order_by = .data[[time.col]])) + mutate(!!reference_col := .get_previous_samples( + .data, time.col, rowname_col, time.interval)) } - # Ungroup to revert to the original structure and convert to DataFrame - df <- df |> - ungroup() |> - DataFrame() - # Put the data into original order - rownames(df) <- df[[rowname_col]] - df <- df[original_order, ] - # Get only reference samples res <- df[[reference_col]] + names(res) <- df[[rowname_col]] return(res) } +# This function gets df as input. The data must be already grouped if grouping +# exists. For each sample, this function finds baseline timepoint. If there +# are multiple samples from baseline timepoint, all are returned. +.get_baseline_samples <- function(.data, time.col, rowname_col){ + # Get timepoints and corresponding baseline timepoints + time_points <- unique(sort(.data[[time.col]])) + baseline_time <- min(time_points, na.rm = TRUE) + baseline_time <- rep(baseline_time, length(.data[[time.col]])) + # Split sample names so that they are grouped into timepoints + timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]]) + # For each sample, assign baseline samples + baseline_samples <- timepoint_samples[ + match(baseline_time, names(timepoint_samples)) ] + return(baseline_samples) +} + +# This function gets df as input. The data must be already grouped if grouping +# exists. For each sample, this function finds previous timepoint. If there +# are multiple samples from previous timepoint, all are returned. +#' @importFrom dplyr lag +.get_previous_samples <- function(.data, time.col, rowname_col, time.interval){ + # Get timepoints and corresponding previous timepoints + current_time <- unique(sort(.data[[time.col]])) + prev_time <- lag(current_time, n = time.interval) + prev_time <- prev_time[match(.data[[time.col]], current_time)] + # Split sample names so that they are grouped into timepoints + timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]]) + # For each sample, assign previous samples + prev_samples <- timepoint_samples[ + match(prev_time, names(timepoint_samples)) ] + prev_samples[ lengths(prev_samples) == 0L ] <- NA_character_ + return(prev_samples) +} + # This function get time difference between a sample and its reference sample -.get_time_difference <- function(x, time.col, reference){ +#' @importFrom dplyr group_by mutate +.get_time_difference <- function(x, time.col, reference, ...){ # Get timepoints time_point <- x[[time.col]] # Get reference time points - ref <- colData(x)[x[[reference]], time.col] + df <- colData(x) |> as.data.frame() + df[["temp_sample"]] <- colnames(x) + ref <- df |> + group_by(temp_sample) |> + mutate(time := mean(df[unlist(.data[[reference]]), time.col])) # Get difference - res <- time_point - ref + res <- time_point - ref[["time"]] return(res) } # This function converts time divergence results to DF object +#' @importFrom dplyr summarize .convert_divergence_to_df <- function( x, res, time_res, reference, name = c("divergence", "time_diff", "ref_samples"), ...){ # Validate 'name' param temp <- .check_input(name, list("character vector"), length = 3L) # - df <- DataFrame(res, time_res, x[[reference]], row.names = colnames(x)) + df <- data.frame( + res, time_res, sample = colnames(x), reference = I(x[[reference]])) + # Wrangle the format + if( all(lengths(df[["reference"]]) == 1L) ){ + df[["reference"]] <- as.character(df[["reference"]]) + } else{ + df[["reference"]] <- as.list(df[["reference"]]) + } + df <- DataFrame(df) + # Wrangle names + rownames(df) <- df[["sample"]] + df[["sample"]] <- NULL colnames(df) <- name return(df) } diff --git a/R/getStepwiseDivergence.R b/R/getStepwiseDivergence.R index 6237fa9..d2bef69 100644 --- a/R/getStepwiseDivergence.R +++ b/R/getStepwiseDivergence.R @@ -97,12 +97,12 @@ setMethod("getStepwiseDivergence", signature = c(x = "ANY"), ) # Calculate divergences res <- do.call(getDivergence, args) - # Add time difference - x <- args[["x"]] - reference <- args[["reference"]] - time_res <- .get_time_difference(x, time.col, reference) + # Get time difference + args[["time.col"]] <- time.col + time_res <- do.call(.get_time_difference, args) # Create a DF to return - res <- .convert_divergence_to_df(x, res, time_res, reference, ...) + args <- c(args, list(res = res, time_res = time_res)) + res <- do.call(.convert_divergence_to_df, args) return(res) } ) diff --git a/man/addBaselineDivergence.Rd b/man/addBaselineDivergence.Rd index ed798f9..d8f487f 100644 --- a/man/addBaselineDivergence.Rd +++ b/man/addBaselineDivergence.Rd @@ -108,7 +108,7 @@ tse <- addBaselineDivergence( reference = "reference", group = "subject", time.col = "time", - name = c("divergence_from_baseline", + name = c("divergence_from_baseline", "time_from_baseline", "reference_samples"), assay.type = "relabundance", method = "bray") diff --git a/tests/testthat/test-getBaselineDivergence.R b/tests/testthat/test-getBaselineDivergence.R index 1cdeede..2286732 100644 --- a/tests/testthat/test-getBaselineDivergence.R +++ b/tests/testthat/test-getBaselineDivergence.R @@ -2,6 +2,7 @@ test_that("addBaselineDivergence output", { data(hitchip1006) tse <- hitchip1006 + tse <- tse[, !duplicated(colData(tse)[, c("time", "subject")])] tse2 <- addBaselineDivergence( tse, group = "subject", time.col = "time", name = c("divergence_from_baseline", "time_from_baseline", @@ -135,7 +136,7 @@ test_that(".get_reference_samples with different time intervals", { # Basic SummarizedExperiment for testing col_data <- DataFrame( - time = c(0, 1, 2, 1, 2, 0), + time = c(0, 1, 2, 4, 10, 3), group = c("A", "A", "A", "B", "B", "B"), row.names = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6")) @@ -199,7 +200,8 @@ test_that(".get_reference_samples baseline", { colData(se), time.col = "time", group = "group", reference.method = "stepwise", time.interval = 1) expect_equal(stepwise, c( - NA, "Sample1", "Sample2", "Sample6", "Sample4", NA)) + NA, "Sample1", "Sample2", "Sample6", "Sample4", NA), + check.attributes = FALSE) }) # Time difference calculation @@ -209,7 +211,7 @@ test_that(".get_time_difference calculates correct time diff", { colData(se2)[["ref"]] <- reference time_diffs <- .get_time_difference( se2, time.col = "time", reference = "ref") - expect_equal(time_diffs, c(-1, 1, 2, -1, NA, -1)) + expect_equal(time_diffs, c(-1, 1, 2, 2, NA, -1)) }) # Convert divergence to DataFrame @@ -225,9 +227,11 @@ test_that(".convert_divergence_to_df formats correctly", { assays = list(), colData = col_data ) + colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6)) reference <- "reference" df <- .convert_divergence_to_df( - se, divergence, time_diff, reference, + x_orig = se, x = se, res = divergence, time_res = time_diff, + reference = reference, orig.sample.names = "sam", name = c("test_div", "test_time_diff", "test_reference_samples")) expect_s4_class(df, "DataFrame") expect_equal(colnames(df), @@ -272,9 +276,46 @@ test_that(".convert_divergence_to_df with NA divergence values", { colData = col_data ) reference <- "reference" + colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6)) df <- .convert_divergence_to_df( se, divergence, time_diff, reference, name = c("test_div", "test_time_diff", "test_reference_samples")) expect_s4_class(df, "DataFrame") expect_true(all(is.na(df$test_div[is.na(divergence)]))) }) + +# Test that the function works correctly with replicated samples +test_that("getBaselineDivergence with replicated samples", { + tse <- makeTSE(nrow = 1000, ncol = 20) + assayNames(tse) <- "counts" + colData(tse)[["time"]] <- sample(c(1, 3, 6, 100), 20, replace = TRUE) + res <- getBaselineDivergence( + tse, time.col = "time", group = "group", method = "euclidean") |> + expect_warning() + res <- res[, c(1, 2)] + # For all samples, calculate divergence + sams <- colnames(tse) + ref <- lapply(sams, function(sam){ + # Get data on sample + sam_dat <- colData(tse[, sam]) + # Get its reference samples + group_dat <- colData(tse)[tse[["group"]] == sam_dat[["group"]], ] + ref_time <- sort(group_dat[["time"]])[[1]] + # Loop through each reference sample, calculate its distance to + # the sample, and take mean + ref_sams <- rownames(group_dat[ group_dat[["time"]] == ref_time, ]) + ref_vals <- vapply(ref_sams, function(ref_sam){ + dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ), + method = "euclidean") + }, numeric(1)) + ref_vals <- mean(ref_vals) + # Return divergence and time difference + temp_res <- c(ref_vals, sam_dat[["time"]] - ref_time) + return(temp_res) + }) + ref <- do.call(rbind, ref) + ref <- DataFrame(ref) + colnames(ref) <- colnames(res) + rownames(ref) <- rownames(res) + expect_equal(res, ref) +}) diff --git a/tests/testthat/test-getStepwiseDivergence.R b/tests/testthat/test-getStepwiseDivergence.R index c61a19f..b6c62cf 100644 --- a/tests/testthat/test-getStepwiseDivergence.R +++ b/tests/testthat/test-getStepwiseDivergence.R @@ -161,10 +161,10 @@ test_that(".get_reference_samples with different time intervals", { tse <- hitchip1006 interval_1 <- .get_reference_samples( colData(tse), time.col = "time", group = "subject", - reference.method = "stepwise", time.interval = 1) + reference.method = "stepwise", time.interval = 1) |> unlist() interval_2 <- .get_reference_samples( colData(tse), time.col = "time", group = "subject", - reference.method = "stepwise", time.interval = 2) + reference.method = "stepwise", time.interval = 2) |> unlist() expect_false(all(interval_1 == interval_2)) }) @@ -184,7 +184,7 @@ test_that(".get_reference_samples with different time intervals", { # Basic SummarizedExperiment for testing col_data <- DataFrame( - time = c(0, 1, 2, 1, 2, 0), + time = c(0, 1, 2, 4, 10, 3), group = c("A", "A", "A", "B", "B", "B"), row.names = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6")) @@ -250,7 +250,8 @@ test_that(".get_reference_samples stepwise", { colData(se), time.col = "time", group = "group", reference.method = "stepwise", time.interval = 1) expect_equal(stepwise, c( - NA, "Sample1", "Sample2", "Sample6", "Sample4", NA)) + NA, "Sample1", "Sample2", "Sample6", "Sample4", NA), + check.attributes = FALSE) }) @@ -275,8 +276,49 @@ test_that("addStepwiseDivergence with custom reference sample", { se, time.col = "time", group = "group") # se[["reference"]] <- c(NA, "Sample1", "Sample2", "Sample6", "Sample4", NA) - time_diff <- c(NA, 1, 1, 1, 1, NA) + time_diff <- c(NA, 1, 1, 1, 6, NA) ref <- getDivergence(se, reference = "reference") expect_equal(res[["divergence"]], ref) expect_equal(res[["time_diff"]], time_diff) }) + +# Test that the function works correctly with replicated samples +test_that("getStepwiseDivergence with replicated samples", { + tse <- makeTSE(nrow = 1000, ncol = 20) + assayNames(tse) <- "counts" + colData(tse)[["time"]] <- sample(c(1, 3, 6, 100), 20, replace = TRUE) + res <- getStepwiseDivergence( + tse, time.col = "time", group = "group", method = "euclidean") |> + expect_warning() + res <- res[, c(1, 2)] + # + sams <- colnames(tse) + ref <- lapply(sams, function(sam){ + # Get data on sample + sam_dat <- colData(tse[, sam]) + # Get its reference samples + group_dat <- colData(tse)[tse[["group"]] == sam_dat[["group"]], ] + time_points <- unique(sort(group_dat[["time"]])) + ref_time <- time_points[ which(time_points == sam_dat[["time"]])-1 ] + temp_res <- c(NA, NA) + # Calculate distance if reference samples were found + if( length(ref_time) > 0 ){ + # Loop through each reference sample, calculate its distance to + # the sample, and take mean + ref_sams <- rownames(group_dat[ group_dat[["time"]] == ref_time, ]) + ref_vals <- vapply(ref_sams, function(ref_sam){ + dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ), + method = "euclidean") + }, numeric(1)) + ref_vals <- mean(ref_vals) + # Return divergence and time difference + temp_res <- c(ref_vals, sam_dat[["time"]] - ref_time) + } + return(temp_res) + }) + ref <- do.call(rbind, ref) + ref <- DataFrame(ref) + colnames(ref) <- colnames(res) + rownames(ref) <- rownames(res) + expect_equal(res, ref) +})