From a82ecb9de5a11246d2b4c959d76a3a3504a70276 Mon Sep 17 00:00:00 2001 From: Jamie Gilbert Date: Tue, 14 Nov 2023 11:51:07 +0000 Subject: [PATCH] Added code to run on sampled cohorts --- DESCRIPTION | 2 +- NEWS.md | 5 +++- R/Private.R | 2 ++ R/RunDiagnostics.R | 38 +++++++++++++++++++++++- man/executeDiagnostics.Rd | 24 ++++++++++++++- tests/testthat/test-1-ResultsDataModel.R | 6 ++-- tests/testthat/test-2-againstCdm.R | 9 ++++-- tests/testthat/test-other.R | 32 ++++++++++++++++++++ 8 files changed, 109 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d3d60360d..4a7bb31bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -59,7 +59,7 @@ Remotes: ohdsi/ResultModelManager, ohdsi/ROhdsiWebApi, ohdsi/CirceR, - ohdsi/CohortGenerator, + ohdsi/CohortGenerator@random_sample, ohdsi/OhdsiShinyModules License: Apache License VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index 5c4e13ea3..4dfa6ca23 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,10 @@ Changes: 1. Removed need -2. Added function to make deployment to posit connect servers easier see `` +2. Added function to make deployment to posit connect servers easier + +3. Added ability to use CohortGenerator sample functionality to executeDiagnostics which speeds up execution for very +large cohort definitions Bug fix: diff --git a/R/Private.R b/R/Private.R index c8381f1a9..f29c5aa37 100644 --- a/R/Private.R +++ b/R/Private.R @@ -71,6 +71,8 @@ enforceMinCellValue <- " because value below minimum" ) } + + browser() if (length(minValues) == 1) { data[toCensor, columnName] <- -minValues } else { diff --git a/R/RunDiagnostics.R b/R/RunDiagnostics.R index 754c83739..eb747a36f 100644 --- a/R/RunDiagnostics.R +++ b/R/RunDiagnostics.R @@ -136,7 +136,23 @@ getDefaultCovariateSettings <- function() { #' @param incremental Create only cohort diagnostics that haven't been created before? #' @param incrementalFolder If \code{incremental = TRUE}, specify a folder where records are kept #' of which cohort diagnostics has been executed. +#' @param runOnSample Logical. If TRUE, the function will operate on a sample of the data. +#' Default is FALSE, meaning the function will operate on the full data set. #' +#' @param sampleN Integer. The number of records to include in the sample if runOnSample is TRUE. +#' Default is 1000. Ignored if runOnSample is FALSE. +#' +#' @param seed Integer. The seed for the random number generator used to create the sample. +#' This ensures that the same sample can be drawn again in future runs. Default is 64374. +#' +#' @param seedArgs List. Additional arguments to pass to the sampling function. +#' This can be used to control aspects of the sampling process beyond the seed and sample size. +#' +#' @param sampleIdentifierExpression Character. An expression that generates unique identifiers for each sample. +#' This expression can use the variables 'cohortId' and 'seed'. +#' Default is "cohortId * 1000 + seed", which ensures unique identifiers +#' as long as there are fewer than 1000 cohorts. + #' @examples #' \dontrun{ #' # Load cohorts (assumes that they have already been instantiated) @@ -211,7 +227,12 @@ executeDiagnostics <- function(cohortDefinitionSet, minCharacterizationMean = 0.01, irWashoutPeriod = 0, incremental = FALSE, - incrementalFolder = file.path(exportFolder, "incremental")) { + incrementalFolder = file.path(exportFolder, "incremental"), + runOnSample = FALSE, + sampleN = 1000, + seed = 64374, + seedArgs = NULL, + sampleIdentifierExpression = "cohortId * 1000 + seed") { # collect arguments that were passed to cohort diagnostics at initiation callingArgs <- formals(executeDiagnostics) callingArgsJson <- @@ -528,6 +549,21 @@ executeDiagnostics <- function(cohortDefinitionSet, } } + if (runOnSample & !isTRUE(attr(cohortDefinitionSet, "isSampledCohortDefinition"))) { + cohortDefinitionSet <- + CohortGenerator::sampleCohortDefinitionSet(connection = connection, + cohortDefinitionSet = cohortDefinitionSet, + tempEmulationSchema = tempEmulationSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTableNames = cohortTableNames, + n = sampleN, + seed = seed, + seedArgs = seedArgs, + identifierExpression = sampleIdentifierExpression, + incremental = incremental, + incrementalFolder = incrementalFolder) + } + ## CDM source information---- timeExecution( exportFolder, diff --git a/man/executeDiagnostics.Rd b/man/executeDiagnostics.Rd index 836c50549..9884c5824 100644 --- a/man/executeDiagnostics.Rd +++ b/man/executeDiagnostics.Rd @@ -34,7 +34,12 @@ executeDiagnostics( minCharacterizationMean = 0.01, irWashoutPeriod = 0, incremental = FALSE, - incrementalFolder = file.path(exportFolder, "incremental") + incrementalFolder = file.path(exportFolder, "incremental"), + runOnSample = FALSE, + sampleN = 1000, + seed = 64374, + seedArgs = NULL, + sampleIdentifierExpression = "cohortId * 1000 + seed" ) } \arguments{ @@ -121,6 +126,23 @@ on covariates that have very low values. The default is 0.001 (i.e. 0.1 percent) \item{incrementalFolder}{If \code{incremental = TRUE}, specify a folder where records are kept of which cohort diagnostics has been executed.} + +\item{runOnSample}{Logical. If TRUE, the function will operate on a sample of the data. +Default is FALSE, meaning the function will operate on the full data set.} + +\item{sampleN}{Integer. The number of records to include in the sample if runOnSample is TRUE. +Default is 1000. Ignored if runOnSample is FALSE.} + +\item{seed}{Integer. The seed for the random number generator used to create the sample. +This ensures that the same sample can be drawn again in future runs. Default is 64374.} + +\item{seedArgs}{List. Additional arguments to pass to the sampling function. +This can be used to control aspects of the sampling process beyond the seed and sample size.} + +\item{sampleIdentifierExpression}{Character. An expression that generates unique identifiers for each sample. +This expression can use the variables 'cohortId' and 'seed'. +Default is "cohortId * 1000 + seed", which ensures unique identifiers +as long as there are fewer than 1000 cohorts.} } \description{ Runs the cohort diagnostics on all (or a subset of) the cohorts instantiated using the diff --git a/tests/testthat/test-1-ResultsDataModel.R b/tests/testthat/test-1-ResultsDataModel.R index 01b448192..9610dde61 100644 --- a/tests/testthat/test-1-ResultsDataModel.R +++ b/tests/testthat/test-1-ResultsDataModel.R @@ -122,7 +122,8 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient runOrphanConcepts = TRUE, incremental = TRUE, incrementalFolder = file.path(folder, "incremental"), - temporalCovariateSettings = temporalCovariateSettings + temporalCovariateSettings = temporalCovariateSettings, + runOnSample = TRUE ) }, "CDM Source table has more than one record while only one is expected." @@ -147,7 +148,8 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient runOrphanConcepts = TRUE, incremental = TRUE, incrementalFolder = file.path(folder, "incremental"), - temporalCovariateSettings = temporalCovariateSettings + temporalCovariateSettings = temporalCovariateSettings, + runOnSample = TRUE ) } diff --git a/tests/testthat/test-2-againstCdm.R b/tests/testthat/test-2-againstCdm.R index 54a2107ba..fa2ffeb15 100644 --- a/tests/testthat/test-2-againstCdm.R +++ b/tests/testthat/test-2-againstCdm.R @@ -44,7 +44,8 @@ test_that("Cohort diagnostics in incremental mode", { minCellCount = minCellCountValue, incremental = TRUE, incrementalFolder = file.path(folder, "incremental"), - temporalCovariateSettings = temporalCovariateSettings + temporalCovariateSettings = temporalCovariateSettings, + runOnSample = TRUE ) ) @@ -74,7 +75,8 @@ test_that("Cohort diagnostics in incremental mode", { minCellCount = minCellCountValue, incremental = TRUE, incrementalFolder = file.path(folder, "incremental"), - temporalCovariateSettings = temporalCovariateSettings + temporalCovariateSettings = temporalCovariateSettings, + runOnSample = TRUE ) ) # generate sqlite file @@ -120,7 +122,8 @@ test_that("Cohort diagnostics in incremental mode", { minCellCount = minCellCountValue, incremental = FALSE, incrementalFolder = file.path(folder, "incremental"), - temporalCovariateSettings = temporalCovariateSettings + temporalCovariateSettings = temporalCovariateSettings, + runOnSample = TRUE ) }) diff --git a/tests/testthat/test-other.R b/tests/testthat/test-other.R index cf57588c3..475f17b18 100644 --- a/tests/testthat/test-other.R +++ b/tests/testthat/test-other.R @@ -124,3 +124,35 @@ test_that("timeExecutions function", { checkmate::expect_data_frame(result, nrows = 5, ncols = 5) expect_false(all(is.na(result$startTime))) }) + +test_that("enforceMinCellValue replaces values below minimum with negative of minimum", { + data <- data.frame(a = c(1, 2, 3, 4, 5)) + minValues <- 3 + result <- enforceMinCellValue(data, "a", minValues, silent = TRUE) + + expect_equal(result$a, c(-3, -3, 3, 4, 5)) +}) + +test_that("enforceMinCellValue does not replace NA values", { + data <- data.frame(a = c(1, 2, NA, 4, 5)) + minValues <- 3 + result <- enforceMinCellValue(data, "a", minValues, silent = TRUE) + + expect_equal(result$a, c(-3, -3, NA, 4, 5)) +}) + +test_that("enforceMinCellValue does not replace zero values", { + data <- data.frame(a = c(0, 2, 3, 4, 5)) + minValues <- 3 + result <- enforceMinCellValue(data, "a", minValues, silent = TRUE) + + expect_equal(result$a, c(0, -3, 3, 4, 5)) +}) + +test_that("enforceMinCellValue works with vector of minimum values", { + data <- data.frame(a = c(1, 2, 3, 4, 5)) + minValues <- c(1, 2, 3, 4, 5) + result <- enforceMinCellValue(data, "a", minValues, silent = TRUE) + + expect_equal(result$a, c(1, 2, 3, 4, 5)) +})