diff --git a/R/parallelization-vignette.R b/R/parallelization-vignette.R new file mode 100644 index 0000000..be3c291 --- /dev/null +++ b/R/parallelization-vignette.R @@ -0,0 +1,91 @@ +## code to prepare `parallelization-vignette` dataset goes here + +# Setup ------------------------------------------------------------------- + +library(flipr) +library(future) +library(parallel) +library(progressr) +library(tictoc) + +ngrid_in <- 50L +nperms <- 5000 +n1 <- 10 +set.seed(1234) +x <- rnorm(n1, mean = 1, sd = 1) +y <- rnorm(n1, mean = 4, sd = 1) + +null_spec <- function(y, parameters) { + purrr::map(y, ~ .x - parameters[1]) +} + +stat_functions <- list(stat_t) + +stat_assignments <- list(delta = 1) + +# Inference on the mean without parallelization -------------------------------- + +pf <- PlausibilityFunction$new( + null_spec = null_spec, + stat_functions = stat_functions, + stat_assignments = stat_assignments, + x, y +) + +pf$set_point_estimate(mean(y) - mean(x)) + +pf$set_nperms(nperms) +pf$set_parameter_bounds( + point_estimate = pf$point_estimate, + conf_level = pf$max_conf_level +) + +pf$set_grid( + parameters = pf$parameters, + npoints = ngrid_in +) + +tic() +pf$evaluate_grid(grid = pf$grid) +time_without_parallelization <- toc()$callback_msg + +# Inference on the mean with parallelization ----------------------------------- + +ncores <- 4 +plan(multisession, workers = ncores) +cl <- makeCluster(ncores) +setDefaultCluster(cl) +progressr::handlers(global = TRUE) + +pf <- PlausibilityFunction$new( + null_spec = null_spec, + stat_functions = stat_functions, + stat_assignments = stat_assignments, + x, y +) + +pf$set_point_estimate(mean(y) - mean(x)) + +pf$set_nperms(nperms) +pf$set_parameter_bounds( + point_estimate = pf$point_estimate, + conf_level = pf$max_conf_level +) + +pf$set_grid( + parameters = pf$parameters, + npoints = ngrid_in +) + +tic() +pf$evaluate_grid(grid = pf$grid) +time_with_parallelization <- toc()$callback_msg + +df_parallelization <- list( + delta = pf$grid$delta, + time_par = time_with_parallelization, + time_without_par = time_without_parallelization +) + +saveRDS(df_parallelization, "data-raw/df_parallelization.rds") + diff --git a/R/sysdata.rda b/R/sysdata.rda index b998cc9..2270f22 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data-raw/build-sysdata.R b/data-raw/build-sysdata.R index b54d5a6..16f9918 100644 --- a/data-raw/build-sysdata.R +++ b/data-raw/build-sysdata.R @@ -6,11 +6,13 @@ df_mean <- readRDS("data-raw/df_mean.rds") df_sd <- readRDS("data-raw/df_sd.rds") df_fisher <- readRDS("data-raw/df_fisher.rds") df_tippett <- readRDS("data-raw/df_tippett.rds") +df_parallelization <- readRDS("data-raw/df_parallelization.rds") usethis::use_data( alpha_estimates, pfa, pfb, pfc, df_mean, df_sd, df_fisher, df_tippett, + df_parallelization, overwrite = TRUE, internal = TRUE ) diff --git a/data-raw/df_parallelization.rds b/data-raw/df_parallelization.rds new file mode 100644 index 0000000..0c46cc2 Binary files /dev/null and b/data-raw/df_parallelization.rds differ diff --git a/vignettes/parallelization.Rmd b/vignettes/parallelization.Rmd index 69b6a74..7ae24ed 100644 --- a/vignettes/parallelization.Rmd +++ b/vignettes/parallelization.Rmd @@ -12,18 +12,64 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) +load("../R/sysdata.rda") +time_without_parallelization <- df_parallelization$time_without_par +time_with_parallelization <- df_parallelization$time_par ``` The [**flipr**](https://lmjl-alea.github.io/flipr/) package uses functions contained in the [**furrr**](https://future.futureverse.org/index.html) package for parallel processing. The setting of parallelization has to be done on the user side. We illustrate here how to achieve asynchronous evaluation. We use the [**future**](https://future.futureverse.org/index.html) package to set the plan, the **parallel** package to define a default cluster, and the [**progressr**](https://progressr.futureverse.org/index.html) package to report progress updates. -By setting the desired number of cores, we define the number of background R sessions that will be used to evaluate expressions in parallel. This number is used to set the multisession plan with the function `future::plan()` and to define a default cluster with `parallel::setDefaultCluster()`. Then, to enable the visualization of evaluation progress, we can put the code in the `progressr::with_progress()` function, or more simply set it for all the following code with the `progressr::handlers()` function. After these settings, [**flipr**](https://lmjl-alea.github.io/flipr/) functions can be used, as shown in this example. - ```{r setup} library(flipr) ``` +## Computation without parallel processing + +To show the benefit of parallel processing, we compare here the processing times necessary to evaluate a grid with a plausibility function. First, here is the computation without parallelization. + +```{r, eval=FALSE} +set.seed(1234) +x <- rnorm(10, 1, 1) +y <- rnorm(10, 4, 1) + +null_spec <- function(y, parameters) { + purrr::map(y, ~ .x - parameters[1]) +} +stat_functions <- list(stat_t) +stat_assignments <- list(delta = 1) + +pf <- PlausibilityFunction$new( + null_spec = null_spec, + stat_functions = stat_functions, + stat_assignments = stat_assignments, + x, y +) + +pf$set_point_estimate(mean(y) - mean(x), overwrite = TRUE) +pf$set_parameter_bounds( + point_estimate = pf$point_estimate, + conf_level = pf$max_conf_level +) +pf$set_grid( + parameters = pf$parameters, + npoints = 50L +) + +tictoc::tic() +pf$evaluate_grid(grid = pf$grid) +time_without_parallelization <- tictoc::toc() +``` + +```{r} +time_without_parallelization +``` + +## Computation with parallel processing + +By setting the desired number of cores, we define the number of background R sessions that will be used to evaluate expressions in parallel. This number is used to set the multisession plan with the function `future::plan()` and to define a default cluster with `parallel::setDefaultCluster()`. Then, to enable the visualization of evaluation progress, we can put the code in the `progressr::with_progress()` function, or more simply set it for all the following code with the `progressr::handlers()` function. After these settings, [**flipr**](https://lmjl-alea.github.io/flipr/) functions can be used, as shown in this example. + ```{r, eval=FALSE} -ncores <- 3 +ncores <- 4 future::plan(multisession, workers = ncores) cl <- parallel::makeCluster(ncores) parallel::setDefaultCluster(cl) @@ -55,13 +101,22 @@ pf$set_grid( parameters = pf$parameters, npoints = 50L ) + +tictoc::tic() pf$evaluate_grid(grid = pf$grid) +time_with_parallelization <- tictoc::toc() parallel::stopCluster(cl) ``` It is good practice to shut down the workers with the `parallel::stopCluster()` function at the end of the code. +```{r} +time_with_parallelization +``` + +This experiment proves that we can save a lot of computation time when using parallel processing, as we gained approximately 33 seconds in this example to evaluate the plausibility function. + Finally, to return to a sequential plan with no progress updates, the following code can be used. ```{r, eval=FALSE}