diff --git a/CITATION.cff b/CITATION.cff index bd14b009..51c6847a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,7 +1,7 @@ -# ----------------------------------------------------------- -# CITATION file created with {cffr} R package, v1.0.0 +# -------------------------------------------- +# CITATION file created with {cffr} R package # See also: https://docs.ropensci.org/cffr/ -# ----------------------------------------------------------- +# -------------------------------------------- cff-version: 1.2.0 message: 'To cite package "simstudy" in publications use:' @@ -49,7 +49,7 @@ preferred-citation: repository: https://CRAN.R-project.org/package=simstudy repository-code: https://github.com/kgoldfeld/simstudy url: https://kgoldfeld.github.io/simstudy/ -date-released: '2023-11-22' +date-released: '2024-05-10' contact: - family-names: Goldfeld given-names: Keith @@ -568,3 +568,4 @@ references: identifiers: - type: url value: https://kgoldfeld.github.io/simstudy/dev/ + diff --git a/DESCRIPTION b/DESCRIPTION index feaf50d7..ea9cb559 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Type: Package Package: simstudy Title: Simulation of Study Data Version: 0.7.1.9000 -Date: 2023-11-22 +Date: 2024-05-10 Authors@R: c(person(given = "Keith", family = "Goldfeld", diff --git a/NEWS.md b/NEWS.md index 76a9ec0f..a40e494d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # simstudy (development version) ## New features +* added the option to specify a customized distribution in `defData` and `defDataAdd` by +specifying `dist = "custom"`. *`addPeriods` now includes a new argument `periodVec` that allows users to designate specific measurement time periods using vector. diff --git a/R/define_data.R b/R/define_data.R index ff65aa94..9912fe80 100644 --- a/R/define_data.R +++ b/R/define_data.R @@ -727,6 +727,7 @@ defSurv <- function(dtDefs = NULL, .isValidArithmeticFormula(newform, defVars) .isValidArithmeticFormula(variance, defVars) }, + custom = {}, stop("Unknown distribution.") ) diff --git a/R/generate_dist.R b/R/generate_dist.R index 5d671063..6a177c9b 100644 --- a/R/generate_dist.R +++ b/R/generate_dist.R @@ -12,6 +12,7 @@ #' @return A data.frame with the updated simulated data #' @noRd .generate <- function(args, n, dfSim, idname, envir = parent.frame()) { + newColumn <- switch(args$dist, beta = .genbeta( n = n, @@ -54,6 +55,13 @@ variance = args$variance, envir = envir ), + custom = .gencustom( + n = n, + fn = args$formula, + args = args$variance, + dtSim = copy(dfSim), + envir = envir + ), exponential = .genexp( n = n, formula = args$formula, @@ -327,6 +335,7 @@ # @return A data.frame column with the updated simulated data .gendeterm <- function(n, formula, link, dtSim, envir) { + new <- .evalWith(formula, .parseDotVars(formula, envir), dtSim, n) if (link == "log") { @@ -336,6 +345,32 @@ new } +# Internal function called by .generate - returns non-random data +# +# @param n The number of observations required in the data set +# @param fn String that specifies the custom function +# @param args String of comma delimited arguments that will be passed to fn +# @param dtSim Incomplete simulated data.table +# @return A data.frame column with the updated simulated data + +.gencustom <- function(n, fn, args, dtSim, envir) { + + args <- gsub("\\s+", "", args) # remove any spaces + arg_l <- as.list(unlist(strsplit(args, ","))) + arg_l <- lapply(arg_l, function(a) as.list(unlist(strsplit(a, "=")))) + + var_vec <- unlist(lapply(arg_l, function(a) a[[1]])) + arg_list <- lapply(arg_l, + function(a) .evalWith(a[[2]], .parseDotVars( a[[2]], envir ), dtSim, n) + ) + names(arg_list) <- var_vec + assertNotInVector("n", names(arg_list)) + arg_list <- c(n = n, arg_list) + new <- do.call(fn, arg_list, envir = envir) + + new +} + # Internal function called by .generate - returns exp data # # @param n The number of observations required in the data set diff --git a/R/internal_utility.R b/R/internal_utility.R index 1dbf3898..e85e271c 100644 --- a/R/internal_utility.R +++ b/R/internal_utility.R @@ -51,6 +51,7 @@ extVars, dtSim = data.table(), n = nrow(dtSim)) { + if (missing(dtSim) && missing(n)) { n <- 1 } @@ -198,7 +199,8 @@ "exponential", "mixture", "trtAssign", - "clusterSize" + "clusterSize", + "custom" ) } diff --git a/man/defData.Rd b/man/defData.Rd index a064491e..ee2453d4 100644 --- a/man/defData.Rd +++ b/man/defData.Rd @@ -36,7 +36,7 @@ A data.table named dtName that is an updated data definitions table Add single row to definitions table } \details{ -The possible data distributions are: normal, binary, binomial, poisson, noZeroPoisson, uniform, categorical, gamma, beta, nonrandom, uniformInt, negBinomial, exponential, mixture, trtAssign, clusterSize. +The possible data distributions are: normal, binary, binomial, poisson, noZeroPoisson, uniform, categorical, gamma, beta, nonrandom, uniformInt, negBinomial, exponential, mixture, trtAssign, clusterSize, custom. } \examples{ extVar <- 2.3 diff --git a/tests/testthat/test-generate_dist.R b/tests/testthat/test-generate_dist.R index da06a088..3a5465b8 100644 --- a/tests/testthat/test-generate_dist.R +++ b/tests/testthat/test-generate_dist.R @@ -170,3 +170,32 @@ test_that("clusterSize data is generated as expected.", { expect_true(dt2[, var(test)] > dt1[, var(test)]) }) + + +# .gencustom ---- +test_that("custom data is generated as expected.", { + skip_on_cran() + + trunc_norm <- function(n, lower, upper, mu = 0, s = 1.5) { + + F.a <- pnorm(lower, mean = mu, sd = s) + F.b <- pnorm(upper, mean = mu, sd = s) + + u <- runif(n, min = F.a, max = F.b) + qnorm(u, mean = mu, sd = s) + + } + + def <- + defData(varname = "x", formula = 5, dist = "poisson") |> + defData(varname = "y", formula = "trunc_norm", + variance = "s = 100, lower = x - 1, upper = x + 1", + dist = "custom" + ) + + dd <- genData(10000, def) + + expect_true( dd[, min(y)] > dd[, min(x-1)]) + expect_true( dd[, max(y)] < dd[, max(x+1)]) + +}) diff --git a/tests/testthat/test-internal_utility.R b/tests/testthat/test-internal_utility.R index b7e27473..15c188c4 100644 --- a/tests/testthat/test-internal_utility.R +++ b/tests/testthat/test-internal_utility.R @@ -80,7 +80,7 @@ test_that("probabilities (matrix) are adjusted as documented.", { # .getDists ---- test_that("number of Dists is up to date.", { skip_on_cran() - expect_length(.getDists(), 16) + expect_length(.getDists(), 17) }) # .isFormulaScalar ---- diff --git a/vignettes/customdist.Rmd b/vignettes/customdist.Rmd new file mode 100644 index 00000000..b3db952e --- /dev/null +++ b/vignettes/customdist.Rmd @@ -0,0 +1,159 @@ +--- +title: "Customized Distributions" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Customized Distributions} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r chunkname, echo=-1} +data.table::setDTthreads(2) +``` + +```{r, echo = FALSE, message = FALSE} +library(simstudy) +library(ggplot2) +library(scales) +library(grid) +library(gridExtra) +library(survival) +library(gee) +library(data.table) +library(ordinal) + +odds <- function (p) p/(1 - p) # TODO temporary remove when added to package +plotcolors <- c("#B84226", "#1B8445", "#1C5974") + +cbbPalette <- c("#B84226","#B88F26", "#A5B435", "#1B8446", + "#B87326","#B8A526", "#6CA723", "#1C5974") + +ggtheme <- function(panelback = "white") { + + ggplot2::theme( + panel.background = element_rect(fill = panelback), + panel.grid = element_blank(), + axis.ticks = element_line(colour = "black"), + panel.spacing =unit(0.25, "lines"), # requires package grid + panel.border = element_rect(fill = NA, colour="gray90"), + plot.title = element_text(size = 8,vjust=.5,hjust=0), + axis.text = element_text(size=8), + axis.title = element_text(size = 8) + ) + +} + +``` + +Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, you provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**". + +Here, the *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. You can use values or formulas for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`. + +### Example 1 + +Here is an example where we would like to generate data from a zero-inflated beta distribution. In this case, there is a user-defined function `zeroBeta` that takes on shape parameters $a$ and $b$, as well as $p_0$, the proportion of the sample that is zero. Note that the function also takes an argument $n$ that will not to be be specified in the data definition; $n$ will represent the number of observations being generated: + +```{r} +zeroBeta <- function(n, a, b, p0) { + betas <- rbeta(n, a, b) + is.zero <- rbinom(n, 1, p0) + betas*!(is.zero) +} +``` + +The data definition specifies a new variable $zb$ that sets $a$ and $b$ to 0.75, and $p_0 = 0.02$: + +```{r} +def <- defData( + varname = "zb", + formula = "zeroBeta", + variance = "a = 0.75, b = 0.75, p0 = 0.02", + dist = "custom" +) +``` + +The data are generated: + +```{r} +set.seed(1234) +dd <- genData(100000, def) +``` + +```{r, echo = FALSE} +dd +``` + +A plot of the data reveals dis-proportion of zero's: + +```{r, fig.width = 6, fig.height = 3, echo = FALSE} +ggplot(data = dd, aes(x = zb)) + + geom_histogram(binwidth = 0.01, boundary = 0, fill = "grey60") + + theme(panel.grid = element_blank()) +``` + +### Example 2 + +In this second example, we are generating sets of truncated Gaussian distributions with means ranging from $-1$ to $1$. The limits of the truncation vary across three different groups. `rnormt` is a customized (user-defined) function that generates the truncated Gaussiandata. The function requires four arguments (the left truncation value, the right truncation value, the distribution average and the standard deviation). + +```{r} +rnormt <- function(n, min, max, mu, s) { + + F.a <- pnorm(min, mean = mu, sd = s) + F.b <- pnorm(max, mean = mu, sd = s) + + u <- runif(n, min = F.a, max = F.b) + qnorm(u, mean = mu, sd = s) + +} +``` + + +In this example, truncation limits vary based on group membership. Initially, three groups are created, followed by the generation of truncated values. For Group 1, truncation occurs within the range of $-1$ to $1$, for Group 2, it's $-2$ to $2$ and for Group 3, it's $-3$ to $3$. We'll generate three data sets, each with a distinct mean denoted by M, using the double-dot notation to implement these different means. + +```{r} +def <- + defData( + varname = "limit", + formula = "1/4;1/2;1/4", + dist = "categorical" + ) |> + defData( + varname = "tn", + formula = "rnormt", + variance = "min = -limit, max = limit, mu = ..M, s = 1.5", + dist = "custom" + ) +``` + +The data generation requires three calls to `genData`. The output is a list of three data sets: + +```{r} +mus <- c(-1, 0, 1) +dd <-lapply(mus, function(M) genData(100000, def)) +``` + +Here are the first six observations from each of the three data sets: + +```{r, echo=FALSE} +lapply(dd, function(D) head(D)) +``` + +A plot highlights the group differences. + +```{r, fig.width = 8, fig.height = 6, echo = FALSE} +pfunc <- function(dx, i) { + ggplot(data = dx, aes(x = tn)) + + geom_histogram(aes(fill = factor(limit)), binwidth = 0.05, boundary = 0, alpha = .8) + + facet_grid( ~ limit) + + theme(panel.grid = element_blank(), + legend.position = "none") + + scale_fill_manual(values = plotcolors) + + scale_x_continuous(breaks = seq(-3, 3, by =1)) + + scale_y_continuous(limits = c(0, 1000)) + + ggtitle(paste("mu =", mus[i])) +} + +plist <- lapply(seq_along(dd), function(a) pfunc(dd[[a]], a)) +grid.arrange(grobs = plist, nrow = 3) +``` + diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd index a97541b5..6b56cb75 100644 --- a/vignettes/simstudy.Rmd +++ b/vignettes/simstudy.Rmd @@ -190,17 +190,18 @@ d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X") d[[3]] <- data.table("binomial", "probability", "both", "-", "# of trials", "X", "-", "X") d[[4]] <- data.table("categorical", "probability", "string", "p_1;p_2;...;p_n", "a;b;c", "X", "-", "-") d[[5]] <- data.table("clusterSize", "total N", "both", "-", "dispersion", "X", "-", "-") -d[[6]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-") -d[[7]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-") -d[[8]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-") -d[[9]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-") -d[[10]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-") -d[[11]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-") -d[[12]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-") -d[[13]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-") -d[[14]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-") -d[[15]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-") -d[[16]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-") +d[[6]] <- data.table("custom", "function", "string", "-", "arguments", "X", "-", "-") +d[[7]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-") +d[[8]] <- data.table("gamma", "mean", "both", "-", "dispersion", "X", "X", "-") +d[[9]] <- data.table("mixture", "formula", "string", "x_1 | p_1 + ... + x_n | p_n", "-", "X", "-", "-") +d[[10]] <- data.table("negBinomial", "mean", "both", "-", "dispersion", "X", "X", "-") +d[[11]] <- data.table("nonrandom", "formula", "both", "-", "-", "X", "-", "-") +d[[12]] <- data.table("normal", "mean", "both", "-", "variance", "X", "-", "-") +d[[13]] <- data.table("noZeroPoisson", "mean", "both", "-", "-", "X", "X", "-") +d[[14]] <- data.table("poisson", "mean", "both", "-", "-", "X", "X", "-") +d[[15]] <- data.table("trtAssign", "ratio", "string", "r_1;r_2;...;r_n", "stratification", "X", "X", "-") +d[[16]] <- data.table("uniform", "range", "string", "from ; to", "-", "X", "-", "-") +d[[17]] <- data.table("uniformInt", "range", "string", "from ; to", "-", "X", "-", "-") d <- rbindlist(d) @@ -226,7 +227,11 @@ A *categorical* distribution is a discrete data distribution taking on values fr #### clusterSize -The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion, the variability of cluster sizes across clusters increases. +The *clusterSize* distribution allocates a total sample size *N* (specified in the *formula* argument) across the *k* rows of the data.table such that the sum of the rows equals *N*. If the *dispersion* argument is set to 0, the allocation to each row is *N/k*, with some rows getting an increment of 1 to ensure that the sum is N. It is possible to relax the assumption of balanced cluster sizes by setting *dispersion > 0*. As the dispersion increases, the variability of cluster sizes across clusters increases. + +#### custom + +Custom distributions can be specified in `defData` and `defDataAdd` by setting the argument *dist* to "custom". When defining a custom distribution, provide the name of the user-defined function as a string in the *formula* argument. The arguments of the custom function are listed in the *variance* argument, separated by commas and formatted as "**arg_1 = val_form_1, arg_2 = val_form_2, $\dots$, arg_K = val_form_K**". The *arg_k's* represent the names of the arguments passed to the customized function, where $k$ ranges from $1$ to $K$. Values or formulas can be used for each *val_form_k*. If formulas are used, ensure that the variables have been previously generated. Double dot notation is available in specifying *value_formula_k*. One important requirement of the custom function is that the parameter list used to define the function must include an argument"**n = n**", but do not include $n$ in the definition as part of `defData` or `defDataAdd`. #### exponential