Merge pull request #128 from gdrplatform/GDR-2558.2

Gdr 2558.2
gdrplatform · Aug 20, 2024 · 9d7cbf2 · 9d7cbf2
2 parents 79ba8c0 + 65ddfe3
commit 9d7cbf2
Show file tree

Hide file tree

Showing 8 changed files with 105 additions and 20 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: gDRutils
 Type: Package
 Title: A package with helper functions for processing drug response data
-Version: 1.3.8
-Date: 2024-08-06
+Version: 1.3.9
+Date: 2024-08-14
 Authors@R: c(person("Bartosz", "Czech", role=c("aut"),
                    comment = c(ORCID = "0000-0002-9908-3007")),
              person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="[email protected]",
@@ -60,7 +60,7 @@ biocViews: Software, Infrastructure
 VignetteBuilder: knitr
 ByteCompile: TRUE
 Roxygen: list(markdown = TRUE) 
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 SwitchrLibrary: gDRutils
 DeploySubPath: gDRutils
 Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(aggregate_assay)
 export(apply_bumpy_function)
 export(assert_choices)
 export(average_biological_replicates_dt)
+export(calc_sd)
 export(capVals)
 export(cap_xc50)
 export(convert_colData_to_json)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+## gDRutils 1.3.9 - 2024-08-14
+* extend the logic of `average_biological_replicates_dt` to calculate standard deviation
+
 ## gDRutils 1.3.8 - 2024-08-06
 * add functions for setting unique identifiers in the colData and rowData of SE
 

diff --git a/R/packages.R b/R/packages.R
@@ -33,7 +33,8 @@ if (getRversion() >= "2.15.1") {
       "cId",
       "concs",
       "type",
-      "name"
+      "name",
+      "count"
     ),
     utils::packageName())
 }
diff --git a/R/utils.R b/R/utils.R
@@ -425,8 +425,6 @@ geometric_mean <- function(x, fixed = TRUE, maxlog10Concentration = 1) {
   exp(mean(log(x)))
 }
 
-#' Average biological replicates.
-#'
 #' Average biological replicates on the data table side. 
 #'
 #' @param dt data.table with Metric data
@@ -435,6 +433,7 @@ geometric_mean <- function(x, fixed = TRUE, maxlog10Concentration = 1) {
 #' @param fixed Flag indicating whether to add a fix for -Inf in the geometric mean.
 #' @param geometric_average_fields Character vector of column names in \code{dt} 
 #' to take the geometric average of.
+#' @param add_sd Flag indicating whether to add standard deviation and count columns.
 #' 
 #' @examples
 #' dt <- data.table::data.table(a = c(1:10, 1),
@@ -449,28 +448,39 @@ average_biological_replicates_dt <- function(
     var,
     prettified = FALSE,
     fixed = TRUE,
-    geometric_average_fields = get_header("metric_average_fields")$geometric_mean) {
+    geometric_average_fields = get_header("metric_average_fields")$geometric_mean,
+    add_sd = FALSE) {
+
   data <- data.table::copy(dt)
 
   if (prettified) {
     pidfs <- get_prettified_identifiers()
-    iso_cols <- prettify_flat_metrics(get_header("iso_position"),
-                                      human_readable = TRUE)
+    iso_cols <- prettify_flat_metrics(get_header("iso_position"), human_readable = TRUE)
     id_cols <- prettify_flat_metrics(get_header("id"), human_readable = TRUE)
-
   } else {
     pidfs <- get_env_identifiers()
     iso_cols <- get_header("iso_position")
     id_cols <- prettify_flat_metrics(get_header("id"))
   }
 
-  average_fields <- setdiff(names(Filter(is.numeric, data)), c(unlist(pidfs),
-                                                               var,
-                                                               iso_cols))
+  average_fields <- setdiff(names(Filter(is.numeric, data)), c(unlist(pidfs), var, iso_cols))
   geometric_average_fields <- intersect(geometric_average_fields, names(dt))
-  group_by <- setdiff(names(data),
-                      c(average_fields, var, id_cols))
-  group_by <- grep("Fit Type", group_by, invert = TRUE, value = TRUE)
+  group_by <- setdiff(names(data), c(average_fields, var, id_cols, "fit_type", "Fit Type"))
+
+  if (add_sd) {
+    # Calculate standard deviation for both average_fields and geometric_average_fields
+    sd_fields <- paste0(average_fields, "_sd")
+    geom_sd_fields <- paste0(geometric_average_fields, "_sd")
+
+    data <- data[, (sd_fields) := lapply(.SD, calc_sd),
+                 .SDcols = average_fields, by = group_by]
+    data <- data[, (geom_sd_fields) := lapply(.SD, calc_sd),
+                 .SDcols = geometric_average_fields, by = group_by]
+
+    # Calculate count and add as a single column
+    data <- data[, count := .N, by = group_by]
+  }
+
   data <- data[, (var) := NULL][, 
                                 (average_fields) := lapply(.SD, mean, na.rm = TRUE), 
                                 .SDcols = average_fields, 
@@ -480,9 +490,9 @@ average_biological_replicates_dt <- function(
                                                }), 
                                                .SDcols = geometric_average_fields, 
                                                by = group_by]
+
   unique(data, by = group_by)
 }
-
 #' Helper function to find duplicated rows
 #'
 #' @param x data frame
@@ -733,3 +743,25 @@ get_additional_variables <- function(dt_list,
   }
 }
 
+#' Calculate Standard Deviation or Return Zero
+#'
+#' This function calculates the standard deviation of a numeric vector.
+#' If the vector has a length of 1 and it is numeric, it returns 0.
+#'
+#' @param x A numeric vector.
+#' @return The standard deviation of the vector if its length is greater than 1 or it is not numeric, otherwise 0.
+#' @examples
+#' calc_sd(c(1, 2, 3, 4, 5)) # Should return the standard deviation
+#' calc_sd(c(1)) # Should return 0
+#' calc_sd(numeric(0)) # Should return NA
+#' calc_sd(c("a", "b", "c")) # Should return NA
+#' @keywords package_utils
+#' @export
+calc_sd <- function(x) {
+  if (length(x) == 1 && is.numeric(x) && !is.na(x)) {
+    return(0)
+  } else {
+    return(stats::sd(x, na.rm = TRUE))
+  }
+}
+
diff --git a/man/average_biological_replicates_dt.Rd b/man/average_biological_replicates_dt.Rd
diff --git a/man/calc_sd.Rd b/man/calc_sd.Rd
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
@@ -214,6 +214,15 @@ test_that("average_biological_replicates_dt works as expected", {
   expect_equal(dim(metrics_data), c(60, 29))
   expect_equal(dim(avg_metrics_data), c(40, 28))
   expect_true(!"Ligand" %in% names(avg_metrics_data))
+
+  avg_metrics_data2 <- average_biological_replicates_dt(dt = metrics_data,
+                                                        var = "Ligand",
+                                                        prettified = TRUE,
+                                                        add_sd = TRUE)
+
+  expect_equal(dim(avg_metrics_data2), c(40, 44))
+  expect_equal(sum(grepl("_sd", names(avg_metrics_data2))), 15)
+  expect_true("count" %in% names(avg_metrics_data2))
 })
 
 test_that("get_duplicated_rows works as expected", {
@@ -557,4 +566,15 @@ test_that("capVals works as expected", {
   expect_equal(dt1c, dt2)
 
   expect_error(capVals(as.list(dt1)), "Must be a data.table")
-})
+})
+
+test_that("calc_sd works as expected", {
+  expect_equal(calc_sd(c(1, 2, 3, 4, 5)), sd(c(1, 2, 3, 4, 5), na.rm = TRUE))
+  expect_equal(calc_sd(c(10, 20, 30)), sd(c(10, 20, 30), na.rm = TRUE))
+  expect_equal(calc_sd(c(1)), 0)
+  expect_true(is.na(calc_sd("2")))
+  expect_true(is.na(calc_sd(TRUE)))
+  expect_true(is.na(calc_sd(numeric(0))))
+  expect_equal(calc_sd(c(1, 2, NA, 4, 5)), sd(c(1, 2, NA, 4, 5), na.rm = TRUE))
+  expect_true(is.na(calc_sd(c(NA, NA, NA))))
+})
-Original file line number
+Diff line change
@@ Expand Up / @@ -33,7 +33,8 @@ if (getRversion() >= "2.15.1") { @@
           "cId",
           "concs",
           "type",
-          "name"
+          "name",
+          "count"
         ),
         utils::packageName())
     }