Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Corrections in parallelization vignette #22

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions R/point-estimation.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,25 @@ compute_point_estimate <- function(pf,
nparams <- pf$nparams

if (!is.null(guess)) {
# uses user default cluster
opt <- optimParallel::optimParallel(
par = guess,
fn = pf$get_value,
control = list(fnscale = -1)
)
# if user did not set a default cluster we compute in sequential
if (is.null(parallel::getDefaultCluster())) {
cli::cli_alert_warning("Default cluster has not been set. You may consider setting one to run computation in parallel.")
opt <- stats::optim(
par = guess,
fn = pf$get_value,
method = "L-BFGS-B",
control = list(fnscale = -1)
)
}
else {
# uses user default cluster to compute in parallel
opt <- optimParallel::optimParallel(
par = guess,
fn = pf$get_value,
control = list(fnscale = -1),
parallel = list(cl = NULL, forward = FALSE, loginfo = FALSE)
)
}
x0 <- opt$par
fval <- opt$value
} else {
Expand All @@ -21,6 +34,7 @@ compute_point_estimate <- function(pf,
if (length(upper_bound) != nparams)
abort("The number of provided upper bounds does not match the number of parameters.")

# if user default cluser is not set, will run in sequential, else run in parallel
opt <- rgenoud::genoud(
fn = pf$get_value,
nvars = nparams,
Expand All @@ -31,7 +45,7 @@ compute_point_estimate <- function(pf,
wait.generations = 2 * nparams + 1,
BFGSburnin = 2 * nparams + 1,
print.level = 0,
cluster = parallel::getDefaultCluster(),
cluster = if (!is.null(parallel::getDefaultCluster())) parallel::getDefaultCluster() else FALSE,
balance = nparams > 2
)
opt <- compute_point_estimate(
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
Binary file modified data-raw/df_parallelization.rds
Binary file not shown.
34 changes: 23 additions & 11 deletions data-raw/parallelization-vignette.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ library(progressr)
library(tictoc)
library(flipr)

ngrid_in <- 50L
nperms <- 5000
ngrid_in <- 100L
nperms <- 2000
n1 <- 10
set.seed(1234)
x <- rnorm(n1, mean = 1, sd = 1)
Expand All @@ -25,16 +25,22 @@ stat_assignments <- list(delta = 1)

# Inference on the mean without parallelization --------------------------------

plan(sequential)
setDefaultCluster(NULL)
progressr::handlers(global = FALSE)

pf <- PlausibilityFunction$new(
null_spec = null_spec,
stat_functions = stat_functions,
stat_assignments = stat_assignments,
x, y
)
pf$set_nperms(nperms)

pf$set_point_estimate(mean(y) - mean(x))
tic()
pf$set_point_estimate()
time_without_parallel <- toc()$callback_msg

pf$set_nperms(nperms)
pf$set_parameter_bounds(
point_estimate = pf$point_estimate,
conf_level = pf$max_conf_level
Expand All @@ -47,11 +53,11 @@ pf$set_grid(

tic()
pf$evaluate_grid(grid = pf$grid)
time_without_parallelization <- toc()$callback_msg
time_without_future <- toc()$callback_msg

# Inference on the mean with parallelization -----------------------------------

ncores <- 4
ncores <- 3
plan(multisession, workers = ncores)
cl <- makeCluster(ncores)
setDefaultCluster(cl)
Expand All @@ -63,10 +69,12 @@ pf <- PlausibilityFunction$new(
stat_assignments = stat_assignments,
x, y
)
pf$set_nperms(nperms)

pf$set_point_estimate(mean(y) - mean(x))
tic()
pf$set_point_estimate()
time_with_parallel <- toc()$callback_msg

pf$set_nperms(nperms)
pf$set_parameter_bounds(
point_estimate = pf$point_estimate,
conf_level = pf$max_conf_level
Expand All @@ -79,12 +87,16 @@ pf$set_grid(

tic()
pf$evaluate_grid(grid = pf$grid)
time_with_parallelization <- toc()$callback_msg
time_with_future <- toc()$callback_msg

stopCluster(cl)

df_parallelization <- list(
delta = pf$grid$delta,
time_par = time_with_parallelization,
time_without_par = time_without_parallelization
time_without_parallel = time_without_parallel,
time_without_future = time_without_future,
time_with_parallel = time_with_parallel,
time_with_future = time_with_future
)

saveRDS(df_parallelization, "data-raw/df_parallelization.rds")
Expand Down
79 changes: 51 additions & 28 deletions vignettes/parallelization.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ knitr::opts_chunk$set(
)
library(flipr)
load("../R/sysdata.rda")
time_without_parallelization <- df_parallelization$time_without_par
time_with_parallelization <- df_parallelization$time_par
time_without_parallel <- df_parallelization$time_without_parallel
time_without_future <- df_parallelization$time_without_future
time_with_parallel <- df_parallelization$time_with_parallel
time_with_future <- df_parallelization$time_with_future
```

The [**flipr**](https://permaverse.github.io/flipr/) package uses functions
Expand All @@ -25,21 +27,21 @@ user side. We illustrate here how to achieve asynchronous evaluation. We use the
[**future**](https://future.futureverse.org/index.html) package to set the plan,
the **parallel** package to define a default cluster, and the
[**progressr**](https://progressr.futureverse.org/index.html) package to report
progress updates.
progress updates.

By setting the desired number of cores, we define the number of background R
sessions that will be used to evaluate expressions in parallel. This number is
used to set the multisession plan with the function `future::plan()` and to
define a default cluster with `parallel::setDefaultCluster()`. Then, to enable
the visualization of evaluation progress, we can put the code in the
`progressr::with_progress()` function, or more simply set it for all the
following code with the `progressr::handlers()` function. After these settings,
[**flipr**](https://permaverse.github.io/flipr/) functions can be used, as shown
in this example.
More precisely, setting a default cluster with **parallel** is useful to allow
parallel computation of the point estimate if an estimation is needed. On the
other side, setting the plan with
[**future**](https://future.futureverse.org/index.html) allows parallel
computation when evaluating the plausibility function. A comparison of
computation time with sequential and parallel computation for those two cases is
done in the following.

## Computation without parallel processing

To show the benefit of parallel processing, we compare here the processing times
necessary to evaluate a grid with a plausibility function. First, here is the
computation without parallelization.
necessary to compute a point estimation and to evaluate the grid for a
plausibility function. First, here is the computation without parallelization.

```{r, eval=FALSE}
set.seed(1234)
Expand All @@ -58,24 +60,34 @@ pf <- PlausibilityFunction$new(
stat_assignments = stat_assignments,
x, y
)
pf$set_nperms(2000)

tic()
pf$set_point_estimate()
time_without_parallel <- toc()$callback_msg
```

```{r}
time_without_parallel
```

pf$set_point_estimate(mean(y) - mean(x), overwrite = TRUE)
```{r, eval=FALSE}
pf$set_parameter_bounds(
point_estimate = pf$point_estimate,
conf_level = pf$max_conf_level
)
pf$set_grid(
parameters = pf$parameters,
npoints = 50L
npoints = 100L
)

tictoc::tic()
pf$evaluate_grid(grid = pf$grid)
time_without_parallelization <- tictoc::toc()
time_without_future <- tictoc::toc()$callback_msg
```

```{r}
time_without_parallelization
time_without_future
```

## Computation with parallel processing
Expand All @@ -91,8 +103,8 @@ following code with the `progressr::handlers()` function. After these settings,
in this example.

```{r, eval=FALSE}
ncores <- 4
future::plan(multisession, workers = ncores)
ncores <- 3
future::plan(future::multisession, workers = ncores)
cl <- parallel::makeCluster(ncores)
parallel::setDefaultCluster(cl)
progressr::handlers(global = TRUE)
Expand All @@ -113,20 +125,30 @@ pf <- PlausibilityFunction$new(
stat_assignments = stat_assignments,
x, y
)
pf$set_nperms(2000)

pf$set_point_estimate(mean(y) - mean(x), overwrite = TRUE)
tic()
pf$set_point_estimate()
time_with_parallel <- toc()$callback_msg
```

```{r}
time_with_parallel
```

```{r, eval=FALSE}
pf$set_parameter_bounds(
point_estimate = pf$point_estimate,
conf_level = pf$max_conf_level
)
pf$set_grid(
parameters = pf$parameters,
npoints = 50L
npoints = 100L
)

tictoc::tic()
pf$evaluate_grid(grid = pf$grid)
time_with_parallelization <- tictoc::toc()
time_with_future <- tictoc::toc()$callback_msg

parallel::stopCluster(cl)
```
Expand All @@ -135,18 +157,19 @@ It is good practice to shut down the workers with the `parallel::stopCluster()`
function at the end of the code.

```{r}
time_with_parallelization
time_with_future
```

This experiment proves that we can save a lot of computation time when using
parallel processing, as we gained approximately 33 seconds in this example to
evaluate the plausibility function.
parallel processing. Indeed, by setting 3 cores for each of the parallel
processing tools, we reduced approximately by 3 times the computation time for
both computing a point estimation and evaluating the plausibility function.

Finally, to return to a sequential plan with no progress updates, the following
code can be used.
code can be used. It also allows to shut down the workers used in [**future**](https://future.futureverse.org/index.html).

```{r, eval=FALSE}
future::plan(sequential)
future::plan(future::sequential)
parallel::setDefaultCluster(NULL)
progressr::handlers(global = FALSE)
```
Loading