Merge pull request #141 from gdrplatform/GDR-2801

feat: make split_SE_components working correctly for sa assay data, m…
gdrplatform · Dec 17, 2024 · 10a0dcc · 10a0dcc
2 parents 3650f71 + d735bc0
commit 10a0dcc
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 13 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: gDRutils
 Type: Package
 Title: A package with helper functions for processing drug response data
-Version: 1.5.4
-Date: 2024-12-09
+Version: 1.5.5
+Date: 2024-12-10
 Authors@R: c(person("Bartosz", "Czech", role=c("aut"),
                    comment = c(ORCID = "0000-0002-9908-3007")),
              person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="[email protected]",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+## gDRutils 1.5.5 - 2024-12-10
+* make `split_SE_components` work correctly for sa assay data modified with `average_biological_replicates`
+
 ## gDRutils 1.5.4 - 2024-12-09
 * minor improvement in the logic of `average_biological_replicates` (new blacklisted column)
 

diff --git a/R/fit_curves.R b/R/fit_curves.R
@@ -78,7 +78,7 @@ fit_curves <- function(df_,
   if (length(setdiff(opt_fields, colnames(df_))) > 0L) {
     df_[, setdiff(opt_fields, colnames(df_))] <- NA
   }
-  df_metrics <- .apllyLogisticFit(df_, normalization_type, series_identifiers, e_0, GR_0, range_conc, force_fit, 
+  df_metrics <- .applyLogisticFit(df_, normalization_type, series_identifiers, e_0, GR_0, range_conc, force_fit, 
                                   pcutoff, cap, n_point_cutoff)
 
   is_unique_normalization_type_and_fit_source <- 
@@ -94,7 +94,7 @@ fit_curves <- function(df_,
 }
 
 #' @keywords internal
-.apllyLogisticFit <- function(df_, normalization_type, series_identifiers, e_0, GR_0, range_conc, force_fit, 
+.applyLogisticFit <- function(df_, normalization_type, series_identifiers, e_0, GR_0, range_conc, force_fit, 
                               pcutoff, cap, n_point_cutoff) {
 
   df_metrics <- NULL
@@ -487,7 +487,12 @@ logistic_metrics <- function(c, x_metrics) {
 #' @keywords fit_curves
 #' @export
 .setup_metric_output <- function() {
-  resp_metric_cols <- c(get_header("response_metrics"), "maxlog10Concentration", "N_conc")
+  resp_metric_all_cols <- c(get_header("response_metrics"), "maxlog10Concentration", "N_conc")
+  # Drop columns whose names end with "_sd":
+  # they are absent from the primary assays and appear
+  # only in assays produced by averaging biological replicates.
+  resp_metric_cols <- resp_metric_all_cols[!endsWith(resp_metric_all_cols, "_sd")]
+
   out <- as.list(rep(NA, length(resp_metric_cols)))
   names(out) <- resp_metric_cols
   out

diff --git a/R/flatten.R b/R/flatten.R
@@ -56,7 +56,9 @@ flatten <- function(tbl, groups, wide_cols, sep = "_") {
   uniquifying <- unique(uniquifying)
 
   out <- split(subset(tbl, select = -idx), subset(tbl, select = idx), sep = sep)
-  missing <- setdiff(wide_cols, colnames(tbl))
+
+  # in original assays there are no columns with SD-related data (with names ending with "_sd")
+  missing <- setdiff(wide_cols[!grepl("_sd$", wide_cols)], colnames(tbl))
   if (length(missing) != 0L) {
     warning(sprintf("missing listed wide_cols columns: '%s'", paste0(missing, collapse = ", ")))
   }

diff --git a/R/headers_list.R b/R/headers_list.R
@@ -21,9 +21,12 @@
 
   HEADERS_LIST[["metrics_results"]] <- c(
     "maxlog10Concentration",
+    "maxlog10Concentration_sd",
     "N_conc",
+    "N_conc_sd", 
     "cotrt_value",
     "source",
+    "count",
     HEADERS_LIST[["response_metrics"]],
     as.character(HEADERS_LIST[["metrics_names"]])
   )
@@ -112,7 +115,11 @@
     "x",
     "x_std",
     "std_RelativeViability",
-    "std_GRvalue"
+    "std_GRvalue",
+    # after averaging for biological replicates
+    "count",
+    "x_sd",
+    "x_std_sd"
   )
 }
 
@@ -132,7 +139,18 @@
     "p_value",
     "rss",
     "x_sd_avg",
-    "fit_type"
+    "fit_type",
+    "x_mean_sd",
+    "x_AOC_sd",
+    "x_AOC_range_sd",
+    "xc50_sd",
+    "x_max_sd",
+    "ec50_sd",
+    "x_inf_sd",
+    "x_0_sd",
+    "h_sd",
+    "r2_sd",
+    "x_sd_avg_sd"
   )
 }
 
@@ -153,7 +171,18 @@
       "RV_p_value",
       "RV_rss",
       "RV_sd_avg",
-      "fit_type_RV"
+      "fit_type_RV",
+      "RV_mean_sd",
+      "RV_AOC_sd",
+      "RV_AOC_range_sd",
+      "IC50_sd",
+      "E_max_sd",
+      "EC50_sd",
+      "E_inf_sd",
+      "E_0_sd",
+      "h_RV_sd",
+      "RV_r2_sd",
+      "RV_sd_avg_sd"
     ),
     GR = c(
       "GR_mean",
@@ -169,7 +198,18 @@
       "GR_p_value",
       "GR_rss",
       "GR_sd_avg",
-      "fit_type_GR"
+      "fit_type_GR",
+      "GR_mean_sd",
+      "GR_AOC_sd",
+      "GR_AOC_range_sd",
+      "GR50_sd",
+      "GR_max_sd",
+      "GEC50_sd",
+      "GR_inf_sd",
+      "GR_0_sd",
+      "h_GR_sd",
+      "GR_r2_sd",
+      "GR_sd_avg_sd"
     )
   )
 }

diff --git a/R/split_SE_components.R b/R/split_SE_components.R
@@ -56,12 +56,11 @@ split_SE_components <- function(df_, nested_keys = NULL, combine_on = 1L) {
   df_ <- S4Vectors::DataFrame(df_, check.names = FALSE)
   all_cols <- colnames(df_)
   # Identify known data fields.
-  data_fields <- c(get_header("raw_data"), get_header("normalized_results"),
+  data_fields <- unique(c(get_header("raw_data"), get_header("normalized_results"),
                    get_header("averaged_results"),
     get_header("metrics_results"), get_env_identifiers("concentration", simplify = TRUE),
     identifiers_md$well_position, identifiers_md$template, nested_keys,
-    get_header("scores"), get_header("excess"), get_header("isobolograms"))
-  data_fields <- unique(data_fields)
+    get_header("scores"), get_header("excess"), get_header("isobolograms")))
   data_cols <- data_fields[data_fields %in% all_cols]
   md_cols <- setdiff(all_cols, data_cols) 
   md <- unique(df_[, md_cols])