Skip to content

Commit

Permalink
Replicated samples in divergence functions (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman authored Jan 23, 2025
1 parent af7a0e0 commit e4908a7
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 44 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: miaTime
Type: Package
Title: Microbiome Time Series Analysis
Version: 0.99.2
Version: 0.99.3
Authors@R:
c(person(given = "Leo", family = "Lahti", role = c("aut"),
email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ importFrom(dplyr,mutate)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unnest)
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Changes in version 0.99.3
+ Handle repeated samples in divergence functions by calculating average

Changes in version 0.99.1
+ Added bimodality functions

Expand Down
105 changes: 77 additions & 28 deletions R/getBaselineDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,7 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
########################### INPUT CHECK END ############################
# Add baseline samples to colData
args <- .add_reference_samples_to_coldata(
x, time.col, group, reference, assay.type, method,
reference.method = "baseline", ...)
x, time.col, group, reference, reference.method = "baseline", ...)
# Create an argument list. Do not include altexp as it is already taken
# into account.
args <- c(
Expand All @@ -146,12 +145,12 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
)
# Calculate divergences
res <- do.call(getDivergence, args)
# Add time difference
x <- args[["x"]]
reference <- args[["reference"]]
time_res <- .get_time_difference(x, time.col, reference)
# Get time difference
args[["time.col"]] <- time.col
time_res <- do.call(.get_time_difference, args)
# Create a DF to return
res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
args <- c(args, list(res = res, time_res = time_res))
res <- do.call(.convert_divergence_to_df, args)
return(res)
}
)
Expand Down Expand Up @@ -231,7 +230,13 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
if( is.null(reference) ){
ref <- .get_reference_samples(
cd, time.col, time.interval, group, reference.method)
cd[[ref.name]] <- ref
# If the data includes repeated timepoints, the data has multiple
# reference samples for each sample.
ref <- ref[ rownames(cd) ]
if( all(lengths(ref) == 1L) ){
ref <- unlist(ref)
}
cd[[ref.name]] <- unname(ref)
reference <- ref.name
}
# If reference was specified, check that it is specifying samples
Expand Down Expand Up @@ -271,7 +276,7 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
# Add modified colData back to TreeSE
colData(x) <- cd
# The returned value includes the TreeSE along with reference
# column name because it might be that we have modified it.
# column name
res <- list(x = x, reference = reference)
return(res)
}
Expand All @@ -280,6 +285,7 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
# Alternatively, it returns the previous ith sample for each sample in each
# group.
#' @importFrom dplyr group_by mutate arrange ungroup lag
#' @importFrom tidyr unnest
.get_reference_samples <- function(
df, time.col, time.interval, group, reference.method){
# This following line is to suppress "no visible binding for" messages
Expand All @@ -288,8 +294,8 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),

rowname_col <- "temporary_rownames_column"
reference_col <- "temporary_reference_column"
# Store rownames and add rownames as a column
df[[rowname_col]] <- original_order <- rownames(df)
# Add rownames as a column
df[[rowname_col]] <- rownames(df)
# Convert to data.frame and group data based on group
df <- df |>
as.data.frame() |>
Expand All @@ -299,47 +305,90 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
if( reference.method == "baseline" ){
# Find first timepoint within a group
df <- df |>
mutate(!!reference_col :=
.data[[rowname_col]][which.min(.data[[time.col]])])
mutate(!!reference_col := .get_baseline_samples(
.data, time.col, rowname_col))
} else if( reference.method == "stepwise" ){
# For each sample, get the previous ith sample.
# For each subject, get previous sample based on time.
df <- df |>
mutate(!!reference_col := lag(
.data[[rowname_col]], n = time.interval,
order_by = .data[[time.col]]))
mutate(!!reference_col := .get_previous_samples(
.data, time.col, rowname_col, time.interval))
}
# Ungroup to revert to the original structure and convert to DataFrame
df <- df |>
ungroup() |>
DataFrame()
# Put the data into original order
rownames(df) <- df[[rowname_col]]
df <- df[original_order, ]
# Get only reference samples
res <- df[[reference_col]]
names(res) <- df[[rowname_col]]
return(res)
}

# This function gets df as input. The data must be already grouped if grouping
# exists. For each sample, this function finds baseline timepoint. If there
# are multiple samples from baseline timepoint, all are returned.
.get_baseline_samples <- function(.data, time.col, rowname_col){
# Get timepoints and corresponding baseline timepoints
time_points <- unique(sort(.data[[time.col]]))
baseline_time <- min(time_points, na.rm = TRUE)
baseline_time <- rep(baseline_time, length(.data[[time.col]]))
# Split sample names so that they are grouped into timepoints
timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
# For each sample, assign baseline samples
baseline_samples <- timepoint_samples[
match(baseline_time, names(timepoint_samples)) ]
return(baseline_samples)
}

# This function gets df as input. The data must be already grouped if grouping
# exists. For each sample, this function finds previous timepoint. If there
# are multiple samples from previous timepoint, all are returned.
#' @importFrom dplyr lag
.get_previous_samples <- function(.data, time.col, rowname_col, time.interval){
# Get timepoints and corresponding previous timepoints
current_time <- unique(sort(.data[[time.col]]))
prev_time <- lag(current_time, n = time.interval)
prev_time <- prev_time[match(.data[[time.col]], current_time)]
# Split sample names so that they are grouped into timepoints
timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
# For each sample, assign previous samples
prev_samples <- timepoint_samples[
match(prev_time, names(timepoint_samples)) ]
prev_samples[ lengths(prev_samples) == 0L ] <- NA_character_
return(prev_samples)
}

# This function get time difference between a sample and its reference sample
.get_time_difference <- function(x, time.col, reference){
#' @importFrom dplyr group_by mutate
.get_time_difference <- function(x, time.col, reference, ...){
# Get timepoints
time_point <- x[[time.col]]
# Get reference time points
ref <- colData(x)[x[[reference]], time.col]
df <- colData(x) |> as.data.frame()
df[["temp_sample"]] <- colnames(x)
ref <- df |>
group_by(temp_sample) |>
mutate(time := mean(df[unlist(.data[[reference]]), time.col]))
# Get difference
res <- time_point - ref
res <- time_point - ref[["time"]]
return(res)
}

# This function converts time divergence results to DF object
#' @importFrom dplyr summarize
.convert_divergence_to_df <- function(
x, res, time_res, reference,
name = c("divergence", "time_diff", "ref_samples"), ...){
# Validate 'name' param
temp <- .check_input(name, list("character vector"), length = 3L)
#
df <- DataFrame(res, time_res, x[[reference]], row.names = colnames(x))
df <- data.frame(
res, time_res, sample = colnames(x), reference = I(x[[reference]]))
# Wrangle the format
if( all(lengths(df[["reference"]]) == 1L) ){
df[["reference"]] <- as.character(df[["reference"]])
} else{
df[["reference"]] <- as.list(df[["reference"]])
}
df <- DataFrame(df)
# Wrangle names
rownames(df) <- df[["sample"]]
df[["sample"]] <- NULL
colnames(df) <- name
return(df)
}
10 changes: 5 additions & 5 deletions R/getStepwiseDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ setMethod("getStepwiseDivergence", signature = c(x = "ANY"),
)
# Calculate divergences
res <- do.call(getDivergence, args)
# Add time difference
x <- args[["x"]]
reference <- args[["reference"]]
time_res <- .get_time_difference(x, time.col, reference)
# Get time difference
args[["time.col"]] <- time.col
time_res <- do.call(.get_time_difference, args)
# Create a DF to return
res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
args <- c(args, list(res = res, time_res = time_res))
res <- do.call(.convert_divergence_to_df, args)
return(res)
}
)
Expand Down
2 changes: 1 addition & 1 deletion man/addBaselineDivergence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 45 additions & 4 deletions tests/testthat/test-getBaselineDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
test_that("addBaselineDivergence output", {
data(hitchip1006)
tse <- hitchip1006
tse <- tse[, !duplicated(colData(tse)[, c("time", "subject")])]
tse2 <- addBaselineDivergence(
tse, group = "subject", time.col = "time",
name = c("divergence_from_baseline", "time_from_baseline",
Expand Down Expand Up @@ -135,7 +136,7 @@ test_that(".get_reference_samples with different time intervals", {

# Basic SummarizedExperiment for testing
col_data <- DataFrame(
time = c(0, 1, 2, 1, 2, 0),
time = c(0, 1, 2, 4, 10, 3),
group = c("A", "A", "A", "B", "B", "B"),
row.names = c("Sample1", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6"))
Expand Down Expand Up @@ -199,7 +200,8 @@ test_that(".get_reference_samples baseline", {
colData(se), time.col = "time", group = "group",
reference.method = "stepwise", time.interval = 1)
expect_equal(stepwise, c(
NA, "Sample1", "Sample2", "Sample6", "Sample4", NA))
NA, "Sample1", "Sample2", "Sample6", "Sample4", NA),
check.attributes = FALSE)
})

# Time difference calculation
Expand All @@ -209,7 +211,7 @@ test_that(".get_time_difference calculates correct time diff", {
colData(se2)[["ref"]] <- reference
time_diffs <- .get_time_difference(
se2, time.col = "time", reference = "ref")
expect_equal(time_diffs, c(-1, 1, 2, -1, NA, -1))
expect_equal(time_diffs, c(-1, 1, 2, 2, NA, -1))
})

# Convert divergence to DataFrame
Expand All @@ -225,9 +227,11 @@ test_that(".convert_divergence_to_df formats correctly", {
assays = list(),
colData = col_data
)
colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
reference <- "reference"
df <- .convert_divergence_to_df(
se, divergence, time_diff, reference,
x_orig = se, x = se, res = divergence, time_res = time_diff,
reference = reference, orig.sample.names = "sam",
name = c("test_div", "test_time_diff", "test_reference_samples"))
expect_s4_class(df, "DataFrame")
expect_equal(colnames(df),
Expand Down Expand Up @@ -272,9 +276,46 @@ test_that(".convert_divergence_to_df with NA divergence values", {
colData = col_data
)
reference <- "reference"
colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
df <- .convert_divergence_to_df(
se, divergence, time_diff, reference,
name = c("test_div", "test_time_diff", "test_reference_samples"))
expect_s4_class(df, "DataFrame")
expect_true(all(is.na(df$test_div[is.na(divergence)])))
})

# Test that the function works correctly with replicated samples
test_that("getBaselineDivergence with replicated samples", {
tse <- makeTSE(nrow = 1000, ncol = 20)
assayNames(tse) <- "counts"
colData(tse)[["time"]] <- sample(c(1, 3, 6, 100), 20, replace = TRUE)
res <- getBaselineDivergence(
tse, time.col = "time", group = "group", method = "euclidean") |>
expect_warning()
res <- res[, c(1, 2)]
# For all samples, calculate divergence
sams <- colnames(tse)
ref <- lapply(sams, function(sam){
# Get data on sample
sam_dat <- colData(tse[, sam])
# Get its reference samples
group_dat <- colData(tse)[tse[["group"]] == sam_dat[["group"]], ]
ref_time <- sort(group_dat[["time"]])[[1]]
# Loop through each reference sample, calculate its distance to
# the sample, and take mean
ref_sams <- rownames(group_dat[ group_dat[["time"]] == ref_time, ])
ref_vals <- vapply(ref_sams, function(ref_sam){
dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ),
method = "euclidean")
}, numeric(1))
ref_vals <- mean(ref_vals)
# Return divergence and time difference
temp_res <- c(ref_vals, sam_dat[["time"]] - ref_time)
return(temp_res)
})
ref <- do.call(rbind, ref)
ref <- DataFrame(ref)
colnames(ref) <- colnames(res)
rownames(ref) <- rownames(res)
expect_equal(res, ref)
})
Loading

0 comments on commit e4908a7

Please sign in to comment.