Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdr 2591 #134

Merged
merged 24 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: gDRutils
Type: Package
Title: A package with helper functions for processing drug response data
Version: 1.3.14
Date: 2024-10-03
Version: 1.3.15
Date: 2024-10-07
Authors@R: c(person("Bartosz", "Czech", role=c("aut"),
comment = c(ORCID = "0000-0002-9908-3007")),
person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="[email protected]",
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ export(get_SE_identifiers)
export(get_SE_keys)
export(get_SE_processing_metadata)
export(get_additional_variables)
export(get_assay_dt_duplicated_rows)
export(get_assay_names)
export(get_assay_req_uniq_cols)
export(get_combo_assay_names)
export(get_combo_base_assay_names)
export(get_combo_excess_field_names)
Expand All @@ -68,6 +70,8 @@ export(get_synthetic_data)
export(get_testdata)
export(get_testdata_codilution)
export(get_testdata_combo)
export(has_assay_dt_duplicated_rows)
export(has_dt_duplicated_rows)
export(has_single_codrug_data)
export(has_valid_codrug_data)
export(identify_unique_se_metadata_fields)
Expand Down Expand Up @@ -111,6 +115,7 @@ export(shorten_normalization_type_name)
export(split_SE_components)
export(standardize_mae)
export(standardize_se)
export(throw_msg_if_duplicates)
export(update_env_idfs_from_mae)
export(update_idfs_synonyms)
export(validate_MAE)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## gDRutils 1.3.15 - 2024-10-07
* refactor the logic for dealing with duplicates in assay data

## gDRutils 1.3.14 - 2024-10-03
* fixed issue in average_biological_replicated (fit_type)

Expand Down
181 changes: 181 additions & 0 deletions R/duplicates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#' Check whether a data.table contains duplicated rows
#'
#' Auxiliary predicate that looks for duplicated rows in a data.table,
#' comparing either complete rows or only a chosen subset of columns.
#'
#' @param dt data.table
#' @param col_names character vector with the columns used to detect
#'   duplicates; if `NULL` (default) no column subset is applied
#' @examples
#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2))
#' has_dt_duplicated_rows(dt, "b")
#' @return logical flag, `TRUE` if `dt` contains duplicated rows
#' @keywords duplicates
#'
#' @export
#'
has_dt_duplicated_rows <- function(dt, col_names = NULL) {
  checkmate::assert_data_table(dt)
  checkmate::assert_character(col_names, null.ok = TRUE)

  # No column subset requested: check the table as a whole.
  if (is.null(col_names)) {
    return(anyDuplicated(dt) != 0)
  }

  # Requested columns must exist before they can be used as the `by` key.
  checkmate::assert_subset(col_names, colnames(dt))
  first_dup_idx <- anyDuplicated(dt, by = col_names)
  first_dup_idx != 0
}

#' Get the assay data columns that are required to be unique
#'
#' Reads the identifiers listed under `assay_dt_req_uniq_col_ids` in the
#' package `settings.json` and returns those of the corresponding column
#' names that are present in `dt`.
#'
#' @param dt data.table with assay data
#' @examples
#' sdata <- get_synthetic_data("finalMAE_small")
#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics")
#' get_assay_req_uniq_cols(smetrics_data)
#' @return character vector with the columns of `dt` required to hold
#'   unique (non-duplicated) data
#' @keywords duplicates
#' @export
#'
get_assay_req_uniq_cols <- function(dt) {
  checkmate::assert_data_table(dt)

  settings_path <- system.file(package = "gDRutils", "settings.json")
  uniq_col_ids <- get_settings_from_json(
    "assay_dt_req_uniq_col_ids",
    settings_path
  )

  # the data may use either the prettified or the unprettified version of
  # the identifiers, so both variants are collected as candidates
  prettified <- unlist(
    get_prettified_identifiers(uniq_col_ids, simplify = FALSE)
  )
  unprettified <- as.character(
    get_env_identifiers(uniq_col_ids, simplify = FALSE)
  )
  candidates <- unique(c(prettified, unprettified))

  intersect(candidates, names(dt))
}

#' Check whether assay data contains duplicated rows
#'
#' Auxiliary predicate that checks for duplicates in the assay data, using
#' the columns returned by `get_assay_req_uniq_cols()` for the comparison.
#'
#' @param dt data.table with assay data
#'
#' @return logical flag, `TRUE` if `dt` contains duplicated rows
#' @keywords duplicates
#' @examples
#' sdata <- get_synthetic_data("finalMAE_small")
#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics")
#' has_assay_dt_duplicated_rows(smetrics_data)
#' @export
#'
has_assay_dt_duplicated_rows <- function(dt) {
  checkmate::assert_data_table(dt)

  # duplicates are judged only on the columns required to be unique
  has_dt_duplicated_rows(dt, get_assay_req_uniq_cols(dt))
}


#' Helper function to find duplicated rows
#'
#' Finds all rows that belong to a group of duplicates (the first occurrence
#' as well as its repeats) and returns either their indices or the rows
#' themselves.
#'
#' @param x DataFrame or data.table
#' @param col_names character vector, columns in which duplication are
#'   searched for; if `NULL` (default) all columns of `x` are used
#' @param output string with the output format to be returned - one of
#'   "index" (index of duplicates) or "data" (subset of input data with
#'   duplicates)
#' @examples
#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2))
#' get_duplicated_rows(dt, "b")
#' get_duplicated_rows(dt, "b", output = "data")
#' get_duplicated_rows(dt)
#' @return integer vector with the indices of duplicated rows
#'   (`output = "index"`) or the subset of `x` with duplicated rows
#'   (`output = "data"`, zero rows if there are no duplicates)
#' @keywords duplicates
#' @export
get_duplicated_rows <- function(x,
                                col_names = NULL,
                                output = "index") {

  checkmate::assertMultiClass(x, c("data.table", "DataFrame"))
  checkmate::assert_true(all(col_names %in% colnames(x)))
  checkmate::assert_choice(output, c("index", "data"))

  # Fall back to the whole table when no column subset is requested;
  # without this the default `col_names = NULL` left `sub_x` undefined.
  sub_x <- if (is.null(col_names)) {
    x
  } else {
    subset(x, select = col_names)
  }

  # Scanning from both ends flags every member of a duplicated group,
  # not only the second and later occurrences.
  idx <- which(duplicated(sub_x) | duplicated(sub_x, fromLast = TRUE))

  if (output == "index") {
    idx
  } else {
    # an empty `idx` yields a zero-row subset with the columns of `x`
    x[idx, ]
  }
}

#' Helper function to find duplicated rows in assay data
#'
#' Looks for duplicates using the columns returned by
#' `get_assay_req_uniq_cols()` and delegates to `get_duplicated_rows()`.
#'
#' @param dt data.table
#' @param output string with the output format to be returned, passed on
#'   to `get_duplicated_rows()`
#' @return integer vector or data.table with duplicated rows
#' @examples
#' sdata <- get_synthetic_data("finalMAE_small")
#' smetrics_data <- convert_se_assay_to_dt(sdata[[1]], "Metrics")
#' get_assay_dt_duplicated_rows(smetrics_data, output = "data")
#' get_assay_dt_duplicated_rows(smetrics_data)
#' @keywords duplicates
#' @export
get_assay_dt_duplicated_rows <- function(dt, output = "index") {
  checkmate::assert_data_table(dt)

  # only the columns required to be unique take part in the comparison
  uniq_cols <- get_assay_req_uniq_cols(dt)
  get_duplicated_rows(dt, uniq_cols, output = output)
}


#' Throw a message if the assay data.table contains duplicated rows
#'
#' An auxiliary function that checks for duplicated rows in assay data.table.
#' In case of duplicates it throws a message. The message function is by
#' default `stop()`. The message function can be customized with the `msg_f`
#' parameter.
#'
#' @param dt data.table with assay data
#' @param assay_name string with the name of the assay
#' @param msg_f function to be used to throw the message
#' @param preview_max_numb number of rows to preview if duplicates found
#'
#' @return the value returned by `msg_f` if duplicates are found,
#'   otherwise `NULL` (invisibly)
#' @keywords duplicates
#'
#' @export
#'
throw_msg_if_duplicates <- function(dt,
                                    assay_name = "unknown",
                                    msg_f = stop,
                                    preview_max_numb = 4) {

  checkmate::assert_data_table(dt)
  checkmate::assert_string(assay_name)
  checkmate::assert_function(msg_f)
  checkmate::assert_number(preview_max_numb)

  if (has_assay_dt_duplicated_rows(dt)) {

    dup_dt <- get_assay_dt_duplicated_rows(dt, output = "data")
    # never preview more rows than there are duplicates
    preview_numb <- min(c(preview_max_numb, NROW(dup_dt)))

    msg <- sprintf(
      "The %i ouf of %i rows are duplicated in the assay '%s'",
      NROW(dup_dt),
      NROW(dt),
      assay_name)
    msg2 <- sprintf(" when checking uniquness with the following set of columns: '%s'. ",
                    toString(get_assay_req_uniq_cols(dt)))
    # seq_len() gives an empty index for a preview size of 0, whereas
    # seq(0) would be c(1, 0) and still select the first row
    msg3 <- sprintf("Here is the preview of the first %i duplicated rows in JSON format: '%s'",
                    preview_numb,
                    jsonlite::toJSON(dup_dt[seq_len(preview_numb), ]))
    msg_f(paste0(msg, msg2, msg3))
  }
}
19 changes: 0 additions & 19 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -504,25 +504,6 @@ average_biological_replicates_dt <- function(

unique(data, by = group_by)
}
#' Helper function to find duplicated rows
#'
#' Returns the indices of all rows that belong to a group of duplicates
#' (the first occurrence as well as its repeats).
#'
#' @param x DataFrame or data.table
#' @param col_names character vector, columns in which duplication are
#'   searched for; if `NULL` (default) all columns of `x` are used
#' @return integer vector
#' @examples
#' dt <- data.table::data.table(a = c(1, 2, 3), b = c(3, 2, 2))
#' get_duplicated_rows(dt, "b")
#' @keywords package_utils
#' @export
get_duplicated_rows <- function(x, col_names = NULL) {
  checkmate::assertMultiClass(x, c("data.table", "DataFrame"))
  checkmate::assert_true(all(col_names %in% colnames(x)))

  # restrict the comparison to the requested columns, if any
  target <- if (is.null(col_names)) {
    x
  } else {
    subset(x, select = col_names)
  }

  dup_forward <- duplicated(target)
  dup_backward <- duplicated(target, fromLast = TRUE)
  which(dup_forward | dup_backward)
}

#' Checks if \code{se} is combo dataset.
#'
Expand Down
3 changes: 3 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ reference:
- title: Standardize MAE
- contents:
- has_keyword("standardize_MAE")
- title: Duplicates
- contents:
- has_keyword("duplicates")
gladkia marked this conversation as resolved.
Show resolved Hide resolved
- title: Utils
- contents:
- has_keyword("package_utils")
Expand Down
12 changes: 10 additions & 2 deletions inst/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,13 @@
"pos_y_ref": "pos_y_ref",
"log10_ratio": "log10_ratio_conc",
"log2_CI": "log2_CI"
}
}
},
"assay_dt_req_uniq_col_ids": [
"drug_name",
gladkia marked this conversation as resolved.
Show resolved Hide resolved
"drug_name2",
"cellline_name",
"concentration2",
"duration",
"data_source"
]
}
26 changes: 26 additions & 0 deletions man/get_assay_dt_duplicated_rows.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/get_assay_req_uniq_cols.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 8 additions & 5 deletions man/get_duplicated_rows.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/has_assay_dt_duplicated_rows.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading