diff --git a/DESCRIPTION b/DESCRIPTION index da2068b5..c236715e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: gDRutils Type: Package Title: A package with helper functions for processing drug response data -Version: 1.3.14 -Date: 2024-10-03 +Version: 1.3.15 +Date: 2024-10-07 Authors@R: c(person("Bartosz", "Czech", role=c("aut"), comment = c(ORCID = "0000-0002-9908-3007")), person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="gladki.arkadiusz@gmail.com", diff --git a/NAMESPACE b/NAMESPACE index 21026b4d..a9f9de03 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,7 +43,9 @@ export(get_SE_identifiers) export(get_SE_keys) export(get_SE_processing_metadata) export(get_additional_variables) +export(get_assay_dt_duplicated_rows) export(get_assay_names) +export(get_assay_req_uniq_cols) export(get_combo_assay_names) export(get_combo_base_assay_names) export(get_combo_excess_field_names) @@ -68,6 +70,8 @@ export(get_synthetic_data) export(get_testdata) export(get_testdata_codilution) export(get_testdata_combo) +export(has_assay_dt_duplicated_rows) +export(has_dt_duplicated_rows) export(has_single_codrug_data) export(has_valid_codrug_data) export(identify_unique_se_metadata_fields) @@ -111,6 +115,7 @@ export(shorten_normalization_type_name) export(split_SE_components) export(standardize_mae) export(standardize_se) +export(throw_msg_if_duplicates) export(update_env_idfs_from_mae) export(update_idfs_synonyms) export(validate_MAE) diff --git a/NEWS.md b/NEWS.md index d6c0d80e..149fe90b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +## gDRutils 1.3.15 - 2024-10-07 +* refactor the logic for dealing with duplicates in assay data + ## gDRutils 1.3.14 - 2024-10-03 * fixed issue in average_biological_replicated (fit_type) diff --git a/R/duplicates.R b/R/duplicates.R new file mode 100644 index 00000000..95173e8f --- /dev/null +++ b/R/duplicates.R @@ -0,0 +1,187 @@ +#' check if data.table contains duplicated data +#' +#' An auxiliary function that checks for duplicates in the data.table (or its subset) +#' +#' @param dt data.table +#' @param col_names charvec with columns to be used for subsetting +#' @examples +#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2)) +#' has_dt_duplicated_rows(dt, "b") +#' @return logical flag indicating if a dt contains duplicated rows or not +#' @keywords duplicates +#' +#' @export +#' +has_dt_duplicated_rows <- function(dt, col_names = NULL) { + checkmate::assert_data_table(dt) + checkmate::assert_character(col_names, null.ok = TRUE) + + if (is.null(col_names)) { + anyDuplicated(dt) != 0 + } else { + checkmate::assert_subset(col_names, colnames(dt)) + anyDuplicated(dt, by = col_names) != 0 + } + +} + +#' get columns in the assay data required to have unique data +#' +#' get columns in the assay data required to have unique (non-duplicated) data +#' +#' @param dt data.table with assay data +#' @examples +#' sdata <- get_synthetic_data("finalMAE_small") +#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +#' get_assay_req_uniq_cols(smetrics_data) +#' @return charvec with columns required to have unique data +#' @keywords duplicates +#' @export +#' +get_assay_req_uniq_cols <- function(dt) { + + checkmate::assert_data_table(dt) + col_ids <- get_settings_from_json( + "assay_dt_req_uniq_col_ids", + system.file(package = "gDRutils", "settings.json") + ) + + # check with both pretiffied and unprettified version of ids + col_names_p <- unlist(get_prettified_identifiers(col_ids, simplify = FALSE)) + col_names_up <- as.character(get_env_identifiers(col_ids, simplify = FALSE)) + col_names <- unique(c(col_names_p, col_names_up)) + + intersect(col_names, names(dt)) +} + +#' check if assay data contains duplicated data +#' +#' An auxiliary function that checks for duplicates in the assay data +#' +#' @param dt data.table with assay data +#' +#' @return logical flag indicating if a dt contains duplicated rows or not +#' @keywords duplicates +#' @examples +#' sdata <- get_synthetic_data("finalMAE_small") +#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +#' has_assay_dt_duplicated_rows(smetrics_data) +#' @export +#' +has_assay_dt_duplicated_rows <- function(dt) { + + checkmate::assert_data_table(dt) + + col_names <- get_assay_req_uniq_cols(dt) + has_dt_duplicated_rows(dt, col_names) + +} + + +#' Helper function to find duplicated rows +#' +#' @param x DataFrame or data.table +#' @param col_names character vector, columns in which duplication are searched for +#' @param output string with the output format to be returned - +#' one of "index" (index of duplicates) or "data" (subset of input data with duplicates) +#' @examples +#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2)) +#' get_duplicated_rows(dt, "b") +#' get_duplicated_rows(dt, "b", output = "data") +#' @return integer vector or data.table with duplicated rows +#' @keywords duplicates +#' @export +get_duplicated_rows <- function(x, + col_names = NULL, + output = "index") { + + checkmate::assertMultiClass(x, c("data.table", "DataFrame")) + checkmate::assert_true(all(col_names %in% colnames(x))) + checkmate::assert_choice(output, c("index", "data")) + + + if (!is.null(col_names)) { + sub_x <- subset(x, select = col_names) + } else { + sub_x <- x + } + idx <- which(duplicated(sub_x) | duplicated(sub_x, fromLast = TRUE)) + + out <- if (output == "index") { + idx + } else { + if (length(idx)) { + x[idx, ] + } else { + x[0, ] + } + } + out +} + +#' Helper function to find duplicated rows in assay data +#' +#' @param dt data.table +#' @param output string with the output format to be returned +#' @return integer vector or data.table with duplicated rows +#' @examples +#' sdata <- get_synthetic_data("finalMAE_small") +#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +#' get_assay_dt_duplicated_rows(smetrics_data, output = "data") +#' get_assay_dt_duplicated_rows(smetrics_data) +#' @keywords duplicates +#' @export +get_assay_dt_duplicated_rows <- function(dt, output = "index") { + + checkmate::assert_data_table(dt) + + col_names <- get_assay_req_uniq_cols(dt) + + get_duplicated_rows(dt, col_names, output = output) +} + + +#' throw message if assay data.table contains duplicated rows +#' +#' An auxiliary function that checks for duplicated rows in assay data.table, +#' In case of duplicates it throws a message. The messsage function is by default `stop()` +#' The message function can be customized with `msg_f` parameter +#' +#' @param dt data.table with assay data +#' @param assay_name string with the name of the assay +#' @param msg_f function to be used to throw the message +#' @param preview_max_numb number of rows to preview if duplicates found +#' @examples +#' sdata <- get_synthetic_data("finalMAE_small") +#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +#' throw_msg_if_duplicates(smetrics_data, assay_name = "Metrics", msg_f = futile.logger::flog.info) +#' @return NULL +#' @keywords duplicates +#' +#' @export +#' +throw_msg_if_duplicates <- function(dt, assay_name = "unknown", msg_f = stop, preview_max_numb = 4) { + + checkmate::assert_data_table(dt) + checkmate::assert_string(assay_name) + checkmate::assert_function(msg_f) + checkmate::assert_number(preview_max_numb) + + if (has_assay_dt_duplicated_rows(dt)) { + + dup_dt <- get_assay_dt_duplicated_rows(dt, output = "data") + preview_numb <- min(c(preview_max_numb, NROW(dup_dt))) + + msg <- sprintf( + "The %i ouf of %i rows are duplicated in the assay '%s'", + NROW(dup_dt), + NROW(dt), + assay_name) + msg2 <- sprintf(" when checking uniquness with the following set of columns: '%s'. ", + toString(get_assay_req_uniq_cols(dt))) + msg3 <- sprintf("Here is the preview of the first %i duplicated rows in JSON format: '%s'", + preview_numb, + jsonlite::toJSON(dup_dt[seq(preview_numb), ])) + msg_f(paste0(msg, msg2, msg3)) + } +} diff --git a/R/utils.R b/R/utils.R index 85895572..e96d68c2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -504,25 +504,6 @@ average_biological_replicates_dt <- function( unique(data, by = group_by) } -#' Helper function to find duplicated rows -#' -#' @param x data frame -#' @param col_names character vector, columns in which duplication are searched for -#' @return integer vector -#' @examples -#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2)) -#' get_duplicated_rows(dt, "b") -#' @keywords package_utils -#' @export -get_duplicated_rows <- function(x, col_names = NULL) { - checkmate::assertMultiClass(x, c("data.table", "DataFrame")) - checkmate::assert_true(all(col_names %in% colnames(x))) - - if (!is.null(col_names)) { - x <- subset(x, select = col_names) - } - which(duplicated(x) | duplicated(x, fromLast = TRUE)) -} #' Checks if \code{se} is combo dataset. #' diff --git a/_pkgdown.yml b/_pkgdown.yml index 7fbe03fb..37788252 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -39,6 +39,9 @@ reference: - title: Standardize MAE - contents: - has_keyword("standardize_MAE") +- title: Duplicates +- contents: + - has_keyword("duplicates") - title: Utils - contents: - has_keyword("package_utils") diff --git a/inst/settings.json b/inst/settings.json index 5f9a461c..1ed6d4c5 100644 --- a/inst/settings.json +++ b/inst/settings.json @@ -22,5 +22,13 @@ "pos_y_ref": "pos_y_ref", "log10_ratio": "log10_ratio_conc", "log2_CI": "log2_CI" - } -} \ No newline at end of file + }, + "assay_dt_req_uniq_col_ids": [ + "drug_name", + "drug_name2", + "cellline_name", + "concentration2", + "duration", + "data_source" + ] +} diff --git a/man/get_assay_dt_duplicated_rows.Rd b/man/get_assay_dt_duplicated_rows.Rd new file mode 100644 index 00000000..43111dc0 --- /dev/null +++ b/man/get_assay_dt_duplicated_rows.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicates.R +\name{get_assay_dt_duplicated_rows} +\alias{get_assay_dt_duplicated_rows} +\title{Helper function to find duplicated rows in assay data} +\usage{ +get_assay_dt_duplicated_rows(dt, output = "index") +} +\arguments{ +\item{dt}{data.table} + +\item{output}{string with the output format to be returned} +} +\value{ +integer vector or data.table with duplicated rows +} +\description{ +Helper function to find duplicated rows in assay data +} +\examples{ +sdata <- get_synthetic_data("finalMAE_small") +smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +get_assay_dt_duplicated_rows(smetrics_data, output = "data") +get_assay_dt_duplicated_rows(smetrics_data) +} +\keyword{duplicates} diff --git a/man/get_assay_req_uniq_cols.Rd b/man/get_assay_req_uniq_cols.Rd new file mode 100644 index 00000000..91ea6ff0 --- /dev/null +++ b/man/get_assay_req_uniq_cols.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicates.R +\name{get_assay_req_uniq_cols} +\alias{get_assay_req_uniq_cols} +\title{get columns in the assay data required to have unique data} +\usage{ +get_assay_req_uniq_cols(dt) +} +\arguments{ +\item{dt}{data.table with assay data} +} +\value{ +charvec with columns required to have unique data +} +\description{ +get columns in the assay data required to have unique (non-duplicated) data +} +\examples{ +sdata <- get_synthetic_data("finalMAE_small") +smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +get_assay_req_uniq_cols(smetrics_data) +} +\keyword{duplicates} diff --git a/man/get_duplicated_rows.Rd b/man/get_duplicated_rows.Rd index 767f0902..f93f8800 100644 --- a/man/get_duplicated_rows.Rd +++ b/man/get_duplicated_rows.Rd @@ -1,18 +1,21 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/duplicates.R \name{get_duplicated_rows} \alias{get_duplicated_rows} \title{Helper function to find duplicated rows} \usage{ -get_duplicated_rows(x, col_names = NULL) +get_duplicated_rows(x, col_names = NULL, output = "index") } \arguments{ -\item{x}{data frame} +\item{x}{DataFrame or data.table} \item{col_names}{character vector, columns in which duplication are searched for} + +\item{output}{string with the output format to be returned - +one of "index" (index of duplicates) or "data" (subset of input data with duplicates)} } \value{ -integer vector +integer vector or data.table with duplicated rows } \description{ Helper function to find duplicated rows @@ -20,5 +23,6 @@ Helper function to find duplicated rows \examples{ dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2)) get_duplicated_rows(dt, "b") +get_duplicated_rows(dt, "b", output = "data") } -\keyword{package_utils} +\keyword{duplicates} diff --git a/man/has_assay_dt_duplicated_rows.Rd b/man/has_assay_dt_duplicated_rows.Rd new file mode 100644 index 00000000..0c67d6ab --- /dev/null +++ b/man/has_assay_dt_duplicated_rows.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicates.R +\name{has_assay_dt_duplicated_rows} +\alias{has_assay_dt_duplicated_rows} +\title{check if assay data contains duplicated data} +\usage{ +has_assay_dt_duplicated_rows(dt) +} +\arguments{ +\item{dt}{data.table with assay data} +} +\value{ +logical flag indicating if a dt contains duplicated rows or not +} +\description{ +An auxiliary function that checks for duplicates in the assay data +} +\examples{ +sdata <- get_synthetic_data("finalMAE_small") +smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +has_assay_dt_duplicated_rows(smetrics_data) +} +\keyword{duplicates} diff --git a/man/has_dt_duplicated_rows.Rd b/man/has_dt_duplicated_rows.Rd new file mode 100644 index 00000000..7e7daa0f --- /dev/null +++ b/man/has_dt_duplicated_rows.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicates.R +\name{has_dt_duplicated_rows} +\alias{has_dt_duplicated_rows} +\title{check if data.table contains duplicated data} +\usage{ +has_dt_duplicated_rows(dt, col_names = NULL) +} +\arguments{ +\item{dt}{data.table} + +\item{col_names}{charvec with columns to be used for subsetting} +} +\value{ +logical flag indicating if a dt contains duplicated rows or not +} +\description{ +An auxiliary function that checks for duplicates in the data.table (or its subset) +} +\examples{ +dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2)) +has_dt_duplicated_rows(dt, "b") +} +\keyword{duplicates} diff --git a/man/throw_msg_if_duplicates.Rd b/man/throw_msg_if_duplicates.Rd new file mode 100644 index 00000000..ad80b3c5 --- /dev/null +++ b/man/throw_msg_if_duplicates.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicates.R +\name{throw_msg_if_duplicates} +\alias{throw_msg_if_duplicates} +\title{throw message if assay data.table contains duplicated rows} +\usage{ +throw_msg_if_duplicates( + dt, + assay_name = "unknown", + msg_f = stop, + preview_max_numb = 4 +) +} +\arguments{ +\item{dt}{data.table with assay data} + +\item{assay_name}{string with the name of the assay} + +\item{msg_f}{function to be used to throw the message} + +\item{preview_max_numb}{number of rows to preview if duplicates found} +} +\description{ +An auxiliary function that checks for duplicated rows in assay data.table, +In case of duplicates it throws a message. The messsage function is by default \code{stop()} +The message function can be customized with \code{msg_f} parameter +} +\examples{ +sdata <- get_synthetic_data("finalMAE_small") +smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") +throw_msg_if_duplicates(smetrics_data, assay_name = "Metrics", msg_f = futile.logger::flog.info) +} +\keyword{duplicates} diff --git a/tests/testthat/test-duplicates.R b/tests/testthat/test-duplicates.R new file mode 100644 index 00000000..37f870d6 --- /dev/null +++ b/tests/testthat/test-duplicates.R @@ -0,0 +1,88 @@ +test_that("has_dt_duplicated_rows works as expected", { + + dt_iris <- data.table::data.table(iris) + expect_true(has_dt_duplicated_rows(dt_iris)) + expect_false(has_dt_duplicated_rows(dt_iris[1:100, ])) + + expect_true(has_dt_duplicated_rows(dt_iris[1:10, ], col_names = c("Sepal.Length", "Species"))) + expect_error(has_dt_duplicated_rows(iris), "Assertion on 'dt' failed") + expect_error( + has_dt_duplicated_rows(dt_iris, col_names = "invalid_value"), + "Assertion on 'col_names' failed" + ) + +}) + +test_that("get_duplicated_rows works as expected", { + DF1co <- S4Vectors::DataFrame("Gnumber" = c("G0123456.1-1", "G0123456.2-2", "G1234567.1-1"), + "DrugName" = c("drug_name1", "drug_name1", "drug_name2"), + "Gnumber_2" = c("G9876543.1-1", "G9876543.1-1", "G9876543.1-1"), + "DrugName_2" = c("codrug_name1", "codrug_name1", "codrug_name1"), + "Concentration_2" = c("untreated", "untreated", "untreated")) + + + # single column + expect_equal( + get_duplicated_rows(DF1co, col_names = "DrugName"), + c(1, 2) + ) + # single column with only duplicates + expect_equal( + get_duplicated_rows(DF1co, col_names = "DrugName_2"), + c(1, 2, 3) + ) + # single column without duplicates + expect_equal( + get_duplicated_rows(DF1co, col_names = c("Gnumber")), + integer() + ) + # multiple columns + expect_equal( + get_duplicated_rows(DF1co, col_names = c("DrugName_2", "DrugName")), + c(1, 2) + ) + # single column with non-default output + expect_equal( + get_duplicated_rows(DF1co, col_names = "DrugName", output = "data"), + DF1co[1:2, ] + ) + + expect_error(get_duplicated_rows(DF1co, c("DrugName", "Fake Column")), + "Assertion on 'all(col_names %in% colnames(x))' failed: Must be TRUE.", fixed = TRUE) +}) + +test_that("[has|get]_assay_dt_duplicated_rows works as expected", { + + sdata <- get_synthetic_data("finalMAE_small") + smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") + smetrics_data_f <- gDRutils::flatten( + smetrics_data, + groups = c("normalization_type", "fit_source"), + wide_cols = gDRutils::get_header("response_metrics") + ) + expect_true(has_assay_dt_duplicated_rows(smetrics_data)) + expect_false(has_assay_dt_duplicated_rows(smetrics_data_f)) + + expect_equal(get_assay_dt_duplicated_rows(smetrics_data), 1:200) + expect_equal(dim(smetrics_data), dim(get_assay_dt_duplicated_rows(smetrics_data, output = "data"))) + expect_equal(get_assay_dt_duplicated_rows(smetrics_data_f), integer(0)) + empty_dt <- get_assay_dt_duplicated_rows(smetrics_data_f, output = "data") + expect_true(nrow(empty_dt) == 0) + expect_is(empty_dt, "data.table") +}) + +test_that("throw_msg_if_duplicates works as expected", { + + sdata <- get_synthetic_data("finalMAE_small") + smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics") + smetrics_data_f <- gDRutils::flatten( + smetrics_data, + groups = c("normalization_type", "fit_source"), + wide_cols = gDRutils::get_header("response_metrics") + ) + + + exp_msg <- "rows are duplicated" + expect_error(throw_msg_if_duplicates(smetrics_data, "Metrics"), exp_msg) + expect_warning(throw_msg_if_duplicates(smetrics_data, "Metrics", msg_f = warning), exp_msg) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 143fa9f4..4c61b0a5 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -255,39 +255,6 @@ test_that("average_biological_replicates_dt works as expected", { }) -test_that("get_duplicated_rows works as expected", { - DF1co <- S4Vectors::DataFrame("Gnumber" = c("G0123456.1-1", "G0123456.2-2", "G1234567.1-1"), - "DrugName" = c("drug_name1", "drug_name1", "drug_name2"), - "Gnumber_2" = c("G9876543.1-1", "G9876543.1-1", "G9876543.1-1"), - "DrugName_2" = c("codrug_name1", "codrug_name1", "codrug_name1"), - "Concentration_2" = c("untreated", "untreated", "untreated")) - - - # single column - expect_equal( - get_duplicated_rows(DF1co, col_names = "DrugName"), - c(1, 2) - ) - # single column with only duplicates - expect_equal( - get_duplicated_rows(DF1co, col_names = "DrugName_2"), - c(1, 2, 3) - ) - # single column without duplicates - expect_equal( - get_duplicated_rows(DF1co, col_names = c("Gnumber")), - integer() - ) - # multiple columns - expect_equal( - get_duplicated_rows(DF1co, col_names = c("DrugName_2", "DrugName")), - c(1, 2) - ) - - expect_error(get_duplicated_rows(DF1co, c("DrugName", "Fake Column")), - "Assertion on 'all(col_names %in% colnames(x))' failed: Must be TRUE.", fixed = TRUE) -}) - test_that("has_single_codrug_data works as expected", { expect_false(has_single_codrug_data("un_col")) expect_true(has_single_codrug_data(get_prettified_identifiers(c(