Merge pull request #225 from kgoldfeld/216-nonrandom-distribution-ret…

…urns-a-single-value-when-repeated-values-are-expected 216 nonrandom distribution returns a single value when repeated values are expected
kgoldfeld · May 13, 2024 · 1e16802 · 1e16802
2 parents 93e72f8 + aa853cc
commit 1e16802
Show file tree

Hide file tree

Showing 11 changed files with 254 additions and 20 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,7 +1,7 @@
-# -----------------------------------------------------------
-# CITATION file created with {cffr} R package, v1.0.0
+# --------------------------------------------
+# CITATION file created with {cffr} R package
 # See also: https://docs.ropensci.org/cffr/
-# -----------------------------------------------------------
+# --------------------------------------------
 
 cff-version: 1.2.0
 message: 'To cite package "simstudy" in publications use:'
@@ -49,7 +49,7 @@ preferred-citation:
 repository: https://CRAN.R-project.org/package=simstudy
 repository-code: https://github.com/kgoldfeld/simstudy
 url: https://kgoldfeld.github.io/simstudy/
-date-released: '2023-11-22'
+date-released: '2024-05-10'
 contact:
 - family-names: Goldfeld
   given-names: Keith
@@ -568,3 +568,4 @@ references:
 identifiers:
 - type: url
   value: https://kgoldfeld.github.io/simstudy/dev/
+
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Type: Package
 Package: simstudy
 Title: Simulation of Study Data
 Version: 0.7.1.9000
-Date: 2023-11-22
+Date: 2024-05-10
 Authors@R: 
     c(person(given = "Keith",
              family = "Goldfeld",

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,8 @@
 # simstudy (development version)
 
 ## New features
+* added the option to specify a customized distribution in `defData` and `defDataAdd` by
+specifying `dist = "custom"`.
 *`addPeriods` now includes a new argument `periodVec` that allows users to designate 
 specific measurement time periods using vector.
 

diff --git a/R/define_data.R b/R/define_data.R
@@ -727,6 +727,7 @@ defSurv <- function(dtDefs = NULL,
         .isValidArithmeticFormula(newform, defVars)
         .isValidArithmeticFormula(variance, defVars)
       },
+      custom = {},
       stop("Unknown distribution.")
     )
 

diff --git a/R/generate_dist.R b/R/generate_dist.R
@@ -12,6 +12,7 @@
 #' @return A data.frame with the updated simulated data
 #' @noRd
 .generate <- function(args, n, dfSim, idname, envir = parent.frame()) {
+
   newColumn <- switch(args$dist,
     beta = .genbeta(
       n = n,
@@ -54,6 +55,13 @@
       variance = args$variance,
       envir = envir
     ),
+    custom = .gencustom(
+      n = n,
+      fn = args$formula,
+      args = args$variance,
+      dtSim = copy(dfSim),
+      envir = envir
+    ),
     exponential = .genexp(
       n = n,
       formula = args$formula,
@@ -327,6 +335,7 @@
 # @return A data.frame column  with the updated simulated data
 
 .gendeterm <- function(n, formula, link, dtSim, envir) {
+
   new <- .evalWith(formula, .parseDotVars(formula, envir), dtSim, n)
 
   if (link == "log") {
@@ -336,6 +345,32 @@
   new
 }
 
+# Internal function called by .generate - returns non-random data
+#
+# @param n The number of observations required in the data set
+# @param fn String that specifies the custom function
+# @param args String of comma delimited arguments that will be passed to fn
+# @param dtSim Incomplete simulated data.table
+# @return A data.frame column  with the updated simulated data
+
+.gencustom <- function(n, fn, args, dtSim, envir) {
+
+  args <- gsub("\\s+", "", args) # remove any spaces
+  arg_l <- as.list(unlist(strsplit(args, ",")))
+  arg_l <- lapply(arg_l, function(a) as.list(unlist(strsplit(a, "="))))
+
+  var_vec <- unlist(lapply(arg_l, function(a) a[[1]]))
+  arg_list <- lapply(arg_l, 
+    function(a) .evalWith(a[[2]], .parseDotVars( a[[2]], envir ), dtSim, n)
+  )
+  names(arg_list) <- var_vec
+  assertNotInVector("n", names(arg_list))
+  arg_list <- c(n = n, arg_list)
+  new <- do.call(fn, arg_list, envir = envir)
+
+  new
+}
+
 # Internal function called by .generate - returns exp data
 #
 # @param n The number of observations required in the data set

diff --git a/R/internal_utility.R b/R/internal_utility.R
@@ -51,6 +51,7 @@
                       extVars,
                       dtSim = data.table(),
                       n = nrow(dtSim)) {
+
   if (missing(dtSim) && missing(n)) {
     n <- 1
   }
@@ -198,7 +199,8 @@
       "exponential",
       "mixture",
       "trtAssign",
-      "clusterSize"
+      "clusterSize",
+      "custom"
     )
   }
 

diff --git a/man/defData.Rd b/man/defData.Rd
diff --git a/tests/testthat/test-generate_dist.R b/tests/testthat/test-generate_dist.R
@@ -170,3 +170,32 @@ test_that("clusterSize data is generated as expected.", {
   expect_true(dt2[, var(test)] > dt1[, var(test)])
 
 })
+
+
+# .gencustom ----
+test_that("custom data is generated as expected.", {
+  skip_on_cran()
+
+  trunc_norm <- function(n, lower, upper, mu = 0, s = 1.5) {
+
+    F.a <- pnorm(lower, mean = mu, sd = s)
+    F.b <- pnorm(upper, mean = mu, sd = s)
+
+    u <- runif(n, min = F.a, max = F.b)
+    qnorm(u, mean = mu, sd = s)
+
+  }
+
+  def <-
+    defData(varname = "x", formula = 5, dist = "poisson") |>
+    defData(varname = "y", formula = "trunc_norm",
+            variance = "s = 100, lower = x - 1, upper = x + 1",
+            dist = "custom"
+    )
+
+  dd <- genData(10000, def)
+
+  expect_true( dd[, min(y)] > dd[, min(x-1)])
+  expect_true( dd[, max(y)] < dd[, max(x+1)])
+
+})
diff --git a/tests/testthat/test-internal_utility.R b/tests/testthat/test-internal_utility.R
@@ -80,7 +80,7 @@ test_that("probabilities (matrix) are adjusted as documented.", {
 # .getDists ----
 test_that("number of Dists is up to date.", {
   skip_on_cran()
-  expect_length(.getDists(), 16)
+  expect_length(.getDists(), 17)
 })
 
 # .isFormulaScalar ----

diff --git a/vignettes/customdist.Rmd b/vignettes/customdist.Rmd
@@ -0,0 +1,159 @@
+---
+title: "Customized Distributions"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Customized Distributions}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+
+```{r chunkname, echo=-1}
+data.table::setDTthreads(2)
+```
+
+```{r, echo = FALSE, message = FALSE}
+library(simstudy)
+library(ggplot2)
+library(scales)
+library(grid)
+library(gridExtra)
+library(survival)
+library(gee)
+library(data.table)
+library(ordinal)
+
+odds <- function (p)  p/(1 - p) # TODO temporary remove when added to package
+plotcolors <- c("#B84226", "#1B8445", "#1C5974")
+
+cbbPalette <- c("#B84226","#B88F26", "#A5B435", "#1B8446",
+                "#B87326","#B8A526", "#6CA723", "#1C5974") 
+
+ggtheme <- function(panelback = "white") {
+  
+  ggplot2::theme(
+    panel.background = element_rect(fill = panelback),
+    panel.grid = element_blank(),
+    axis.ticks =  element_line(colour = "black"),
+    panel.spacing =unit(0.25, "lines"),  # requires package grid
+    panel.border = element_rect(fill = NA, colour="gray90"), 
+    plot.title = element_text(size = 8,vjust=.5,hjust=0),
+    axis.text = element_text(size=8),
+    axis.title = element_text(size = 8)
+  )  
+  
+}
+
+```
+
+Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, you provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**".
+
+Here, the *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. You can use values or formulas for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`.
+
+### Example 1
+
+Here is an example where we would like to generate data from a zero-inflated beta distribution. In this case, there is a user-defined function `zeroBeta` that takes on shape parameters $a$ and $b$, as well as $p_0$, the proportion of the sample that is zero. Note that the function also takes an argument $n$ that will not to be be specified in the data definition; $n$ will represent the number of observations being generated:
+
+```{r}
+zeroBeta <- function(n, a, b, p0) {
+  betas <- rbeta(n, a, b)
+  is.zero <- rbinom(n, 1, p0)
+  betas*!(is.zero)
+}
+```
+
+The data definition specifies a new variable $zb$ that sets $a$ and $b$ to 0.75, and $p_0 = 0.02$:
+
+```{r}
+def <- defData(
+  varname = "zb", 
+  formula = "zeroBeta", 
+  variance = "a = 0.75, b = 0.75, p0 = 0.02", 
+  dist = "custom"
+)
+```
+
+The data are generated:
+
+```{r}
+set.seed(1234)
+dd <- genData(100000, def)
+```
+
+```{r, echo = FALSE}
+dd
+```
+
+A plot of the data reveals dis-proportion of zero's:
+
+```{r, fig.width = 6, fig.height = 3, echo = FALSE}
+ggplot(data = dd, aes(x = zb)) +
+  geom_histogram(binwidth = 0.01, boundary = 0, fill = "grey60") +
+  theme(panel.grid = element_blank()) 
+```
+
+### Example 2
+
+In this second example, we are generating sets of truncated Gaussian distributions with means ranging from $-1$ to $1$. The limits of the truncation vary across three different groups. `rnormt` is a customized (user-defined) function that generates the truncated Gaussiandata. The function requires four arguments (the left truncation value, the right truncation value, the distribution average and the standard deviation).
+
+```{r}
+rnormt <- function(n, min, max, mu, s) {
+  
+  F.a <- pnorm(min, mean = mu, sd = s)
+  F.b <- pnorm(max, mean = mu, sd = s)
+  
+  u <- runif(n, min = F.a, max = F.b)
+  qnorm(u, mean = mu, sd = s)
+  
+}
+```
+
+
+In this example, truncation limits vary based on group membership. Initially, three groups are created, followed by the generation of truncated values. For Group 1, truncation occurs within the range of $-1$ to $1$, for Group 2, it's $-2$ to $2$ and for Group 3, it's $-3$ to $3$. We'll generate three data sets, each with a distinct mean denoted by M, using the double-dot notation to implement these different means.
+
+```{r}
+def <-
+  defData(
+    varname = "limit", 
+    formula = "1/4;1/2;1/4",
+    dist = "categorical"
+  ) |>
+  defData(
+    varname = "tn", 
+    formula = "rnormt", 
+    variance = "min = -limit, max = limit, mu = ..M, s = 1.5",
+    dist = "custom"
+  )
+```
+
+The data generation requires three calls to `genData`. The output is a list of three data sets:
+
+```{r}
+mus <- c(-1, 0, 1)
+dd <-lapply(mus, function(M) genData(100000, def))
+```
+
+Here are the first six observations from each of the three data sets:
+
+```{r, echo=FALSE}
+lapply(dd, function(D) head(D))
+```
+
+A plot highlights the group differences.
+
+```{r, fig.width = 8, fig.height = 6, echo = FALSE}
+pfunc <- function(dx, i) {
+  ggplot(data = dx, aes(x = tn)) +
+    geom_histogram(aes(fill = factor(limit)), binwidth = 0.05, boundary = 0, alpha = .8) +
+    facet_grid( ~ limit) +
+    theme(panel.grid = element_blank(),
+          legend.position = "none") +
+    scale_fill_manual(values = plotcolors) +
+    scale_x_continuous(breaks = seq(-3, 3, by =1)) +
+    scale_y_continuous(limits = c(0, 1000)) +
+    ggtitle(paste("mu =", mus[i]))
+}
+
+plist <- lapply(seq_along(dd), function(a) pfunc(dd[[a]], a))
+grid.arrange(grobs = plist, nrow = 3)
+```
+
diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd
@@ -190,17 +190,18 @@ d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X")
 d[[3]] <- data.table("binomial", "probability", "both", "-", "# of trials", "X", "-", "X")
 d[[4]] <- data.table("categorical", "probability", "string", "p_1;p_2;...;p_n", "a;b;c", "X", "-", "-")
 d[[5]] <- data.table("clusterSize", "total N", "both", "-", "dispersion", "X", "-", "-")
-d[[6]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-")
-d[[7]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-")
-d[[8]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-")
-d[[9]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-")
-d[[10]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-")
-d[[11]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-")
-d[[12]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-")
-d[[13]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-")
-d[[14]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-")
-d[[15]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-")
-d[[16]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-")
+d[[6]] <- data.table("custom", "function", "string", "-", "arguments", "X", "-", "-")
+d[[7]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-")
+d[[8]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-")
+d[[9]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-")
+d[[10]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-")
+d[[11]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-")
+d[[12]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-")
+d[[13]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-")
+d[[14]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-")
+d[[15]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-")
+d[[16]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-")
+d[[17]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-")
 
 
 d <- rbindlist(d)
@@ -226,7 +227,11 @@ A *categorical* distribution is a discrete data distribution taking on values fr
 
 #### clusterSize
 
-The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion, the variability of cluster sizes across clusters increases. 
+The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion increases, the variability of cluster sizes across clusters increases. 
+
+#### custom
+
+Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**". The *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. Values or formulas can be used for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`.
 
 #### exponential