From a82ecb9de5a11246d2b4c959d76a3a3504a70276 Mon Sep 17 00:00:00 2001
From: Jamie Gilbert <JGilber2@its.jnj.com>
Date: Tue, 14 Nov 2023 11:51:07 +0000
Subject: [PATCH] Added code to run on sampled cohorts

---
 DESCRIPTION                              |  2 +-
 NEWS.md                                  |  5 +++-
 R/Private.R                              |  2 ++
 R/RunDiagnostics.R                       | 38 +++++++++++++++++++++++-
 man/executeDiagnostics.Rd                | 24 ++++++++++++++-
 tests/testthat/test-1-ResultsDataModel.R |  6 ++--
 tests/testthat/test-2-againstCdm.R       |  9 ++++--
 tests/testthat/test-other.R              | 32 ++++++++++++++++++++
 8 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d3d60360d..4a7bb31bb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -59,7 +59,7 @@ Remotes:
   ohdsi/ResultModelManager,
   ohdsi/ROhdsiWebApi,
   ohdsi/CirceR,
-  ohdsi/CohortGenerator,
+  ohdsi/CohortGenerator@random_sample,
   ohdsi/OhdsiShinyModules
 License: Apache License
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
index 5c4e13ea3..4dfa6ca23 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,7 +5,10 @@ Changes:
 
 1. Removed need 
 
-2. Added function to make deployment to posit connect servers easier see ``
+2. Added function to make deployment to posit connect servers easier
+
+3. Added ability to use CohortGenerator sample functionality to executeDiagnostics which speeds up execution for very
+large cohort definitions
 
 Bug fix:
 
diff --git a/R/Private.R b/R/Private.R
index c8381f1a9..f29c5aa37 100644
--- a/R/Private.R
+++ b/R/Private.R
@@ -71,6 +71,8 @@ enforceMinCellValue <-
         " because value below minimum"
       )
     }
+
+    browser()
     if (length(minValues) == 1) {
       data[toCensor, columnName] <- -minValues
     } else {
diff --git a/R/RunDiagnostics.R b/R/RunDiagnostics.R
index 754c83739..eb747a36f 100644
--- a/R/RunDiagnostics.R
+++ b/R/RunDiagnostics.R
@@ -136,7 +136,23 @@ getDefaultCovariateSettings <- function() {
 #' @param incremental                 Create only cohort diagnostics that haven't been created before?
 #' @param incrementalFolder           If \code{incremental = TRUE}, specify a folder where records are kept
 #'                                    of which cohort diagnostics has been executed.
+#' @param runOnSample                 Logical. If TRUE, the function will operate on a sample of the data.
+#'                                    Default is FALSE, meaning the function will operate on the full data set.
 #'
+#' @param sampleN                     Integer. The number of records to include in the sample if runOnSample is TRUE.
+#'                                    Default is 1000. Ignored if runOnSample is FALSE.
+#'
+#' @param seed                        Integer. The seed for the random number generator used to create the sample.
+#'                                    This ensures that the same sample can be drawn again in future runs. Default is 64374.
+#'
+#' @param seedArgs                    List. Additional arguments to pass to the sampling function.
+#'                                    This can be used to control aspects of the sampling process beyond the seed and sample size.
+#'
+#' @param sampleIdentifierExpression Character. An expression that generates unique identifiers for each sample.
+#'                                   This expression can use the variables 'cohortId' and 'seed'.
+#'                                   Default is "cohortId * 1000 + seed", which ensures unique identifiers
+#'                                   as long as there are fewer than 1000 cohorts.
+
 #' @examples
 #' \dontrun{
 #' # Load cohorts (assumes that they have already been instantiated)
@@ -211,7 +227,12 @@ executeDiagnostics <- function(cohortDefinitionSet,
                                minCharacterizationMean = 0.01,
                                irWashoutPeriod = 0,
                                incremental = FALSE,
-                               incrementalFolder = file.path(exportFolder, "incremental")) {
+                               incrementalFolder = file.path(exportFolder, "incremental"),
+                               runOnSample = FALSE,
+                               sampleN = 1000,
+                               seed = 64374,
+                               seedArgs = NULL,
+                               sampleIdentifierExpression = "cohortId * 1000 + seed") {
   # collect arguments that were passed to cohort diagnostics at initiation
   callingArgs <- formals(executeDiagnostics)
   callingArgsJson <-
@@ -528,6 +549,21 @@ executeDiagnostics <- function(cohortDefinitionSet,
     }
   }
 
+  if (runOnSample & !isTRUE(attr(cohortDefinitionSet, "isSampledCohortDefinition"))) {
+    cohortDefinitionSet <-
+      CohortGenerator::sampleCohortDefinitionSet(connection = connection,
+                                                 cohortDefinitionSet = cohortDefinitionSet,
+                                                 tempEmulationSchema = tempEmulationSchema,
+                                                 cohortDatabaseSchema = cohortDatabaseSchema,
+                                                 cohortTableNames = cohortTableNames,
+                                                 n = sampleN,
+                                                 seed = seed,
+                                                 seedArgs = seedArgs,
+                                                 identifierExpression = sampleIdentifierExpression,
+                                                 incremental = incremental,
+                                                 incrementalFolder = incrementalFolder)
+  }
+
   ## CDM source information----
   timeExecution(
     exportFolder,
diff --git a/man/executeDiagnostics.Rd b/man/executeDiagnostics.Rd
index 836c50549..9884c5824 100644
--- a/man/executeDiagnostics.Rd
+++ b/man/executeDiagnostics.Rd
@@ -34,7 +34,12 @@ executeDiagnostics(
   minCharacterizationMean = 0.01,
   irWashoutPeriod = 0,
   incremental = FALSE,
-  incrementalFolder = file.path(exportFolder, "incremental")
+  incrementalFolder = file.path(exportFolder, "incremental"),
+  runOnSample = FALSE,
+  sampleN = 1000,
+  seed = 64374,
+  seedArgs = NULL,
+  sampleIdentifierExpression = "cohortId * 1000 + seed"
 )
 }
 \arguments{
@@ -121,6 +126,23 @@ on covariates that have very low values. The default is 0.001 (i.e. 0.1 percent)
 
 \item{incrementalFolder}{If \code{incremental = TRUE}, specify a folder where records are kept
 of which cohort diagnostics has been executed.}
+
+\item{runOnSample}{Logical. If TRUE, the function will operate on a sample of the data.
+Default is FALSE, meaning the function will operate on the full data set.}
+
+\item{sampleN}{Integer. The number of records to include in the sample if runOnSample is TRUE.
+Default is 1000. Ignored if runOnSample is FALSE.}
+
+\item{seed}{Integer. The seed for the random number generator used to create the sample.
+This ensures that the same sample can be drawn again in future runs. Default is 64374.}
+
+\item{seedArgs}{List. Additional arguments to pass to the sampling function.
+This can be used to control aspects of the sampling process beyond the seed and sample size.}
+
+\item{sampleIdentifierExpression}{Character. An expression that generates unique identifiers for each sample.
+This expression can use the variables 'cohortId' and 'seed'.
+Default is "cohortId * 1000 + seed", which ensures unique identifiers
+as long as there are fewer than 1000 cohorts.}
 }
 \description{
 Runs the cohort diagnostics on all (or a subset of) the cohorts instantiated using the
diff --git a/tests/testthat/test-1-ResultsDataModel.R b/tests/testthat/test-1-ResultsDataModel.R
index 01b448192..9610dde61 100644
--- a/tests/testthat/test-1-ResultsDataModel.R
+++ b/tests/testthat/test-1-ResultsDataModel.R
@@ -122,7 +122,8 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient
           runOrphanConcepts = TRUE,
           incremental = TRUE,
           incrementalFolder = file.path(folder, "incremental"),
-          temporalCovariateSettings = temporalCovariateSettings
+          temporalCovariateSettings = temporalCovariateSettings,
+          runOnSample = TRUE
         )
       },
       "CDM Source table has more than one record while only one is expected."
@@ -147,7 +148,8 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient
       runOrphanConcepts = TRUE,
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
-      temporalCovariateSettings = temporalCovariateSettings
+      temporalCovariateSettings = temporalCovariateSettings,
+      runOnSample = TRUE
     )
   }
 
diff --git a/tests/testthat/test-2-againstCdm.R b/tests/testthat/test-2-againstCdm.R
index 54a2107ba..fa2ffeb15 100644
--- a/tests/testthat/test-2-againstCdm.R
+++ b/tests/testthat/test-2-againstCdm.R
@@ -44,7 +44,8 @@ test_that("Cohort diagnostics in incremental mode", {
       minCellCount = minCellCountValue,
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
-      temporalCovariateSettings = temporalCovariateSettings
+      temporalCovariateSettings = temporalCovariateSettings,
+      runOnSample = TRUE
     )
   )
 
@@ -74,7 +75,8 @@ test_that("Cohort diagnostics in incremental mode", {
       minCellCount = minCellCountValue,
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
-      temporalCovariateSettings = temporalCovariateSettings
+      temporalCovariateSettings = temporalCovariateSettings,
+      runOnSample = TRUE
     )
   )
   # generate sqlite file
@@ -120,7 +122,8 @@ test_that("Cohort diagnostics in incremental mode", {
         minCellCount = minCellCountValue,
         incremental = FALSE,
         incrementalFolder = file.path(folder, "incremental"),
-        temporalCovariateSettings = temporalCovariateSettings
+        temporalCovariateSettings = temporalCovariateSettings,
+        runOnSample = TRUE
       )
     })
 
diff --git a/tests/testthat/test-other.R b/tests/testthat/test-other.R
index cf57588c3..475f17b18 100644
--- a/tests/testthat/test-other.R
+++ b/tests/testthat/test-other.R
@@ -124,3 +124,35 @@ test_that("timeExecutions function", {
   checkmate::expect_data_frame(result, nrows = 5, ncols = 5)
   expect_false(all(is.na(result$startTime)))
 })
+
+test_that("enforceMinCellValue replaces values below minimum with negative of minimum", {
+  data <- data.frame(a = c(1, 2, 3, 4, 5))
+  minValues <- 3
+  result <- enforceMinCellValue(data, "a", minValues, silent = TRUE)
+
+  expect_equal(result$a, c(-3, -3, 3, 4, 5))
+})
+
+test_that("enforceMinCellValue does not replace NA values", {
+  data <- data.frame(a = c(1, 2, NA, 4, 5))
+  minValues <- 3
+  result <- enforceMinCellValue(data, "a", minValues, silent = TRUE)
+
+  expect_equal(result$a, c(-3, -3, NA, 4, 5))
+})
+
+test_that("enforceMinCellValue does not replace zero values", {
+  data <- data.frame(a = c(0, 2, 3, 4, 5))
+  minValues <- 3
+  result <- enforceMinCellValue(data, "a", minValues, silent = TRUE)
+
+  expect_equal(result$a, c(0, -3, 3, 4, 5))
+})
+
+test_that("enforceMinCellValue works with vector of minimum values", {
+  data <- data.frame(a = c(1, 2, 3, 4, 5))
+  minValues <- c(1, 2, 3, 4, 5)
+  result <- enforceMinCellValue(data, "a", minValues, silent = TRUE)
+
+  expect_equal(result$a, c(1, 2, 3, 4, 5))
+})