Merge pull request #8 from geco-bern/activate-compression-of-rds
Activate compression of rds
fabern authored Sep 17, 2024
2 parents e1a91f0 + cd6ead9 commit 7af1031
Showing 8 changed files with 68 additions and 85 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: map2tidy
Title: Map Spatio-temporal NetCDF Stacks Along Time
Version: 2.1.0.9000
Version: 2.1.1
Authors@R: c(
person(
given = "Benjamin",
@@ -29,7 +29,6 @@ Imports:
tidync,
CFtime,
readr,
lubridate,
parallel,
multidplyr,
utils,
@@ -42,6 +41,7 @@ Suggests:
testthat,
knitr,
ggplot2,
lubridate,
here
License: AGPL-3
Encoding: UTF-8
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,3 +1,7 @@
# map2tidy v2.1.1

* activated RDS compression

# map2tidy v2.1

* Changed the naming of the files (prepending 0's to have always 6 digits: `LON_-000.250`)
12 changes: 0 additions & 12 deletions R/add_loess.R

This file was deleted.

34 changes: 0 additions & 34 deletions R/add_loess_byilon.R

This file was deleted.

2 changes: 1 addition & 1 deletion R/helpers.R
@@ -157,7 +157,7 @@ nclist_to_df_byilon <- function(
if (!is.na(outdir)){
if (nrow(df) > 0){
# message(paste("Writing file", outpath, "..."))
readr::write_rds(df, file = outpath)
readr::write_rds(df, file = outpath, compress = "xz") # xz seems most efficient
return(df |> dplyr::select(lon) |> dplyr::distinct() |> dplyr::mutate(
data = paste0("Written data by worker with jobid: ", Sys.getpid(), " into file: ", outpath)))
} else {
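The one-line change above switches `readr::write_rds()` from uncompressed output to xz compression. A quick sketch for comparing the available `compress` options on a sample data frame (the data frame and sizes are illustrative, not part of the package; real savings depend on the data):

```r
library(readr)

# Synthetic data frame with one highly compressible column (constant lon)
# and one nearly incompressible column (random et).
df <- data.frame(lon = rep(0.125, 1e4), et = rnorm(1e4))

sizes <- sapply(c("none", "gz", "bz2", "xz"), function(comp) {
  f <- tempfile(fileext = ".rds")
  write_rds(df, file = f, compress = comp)
  file.size(f)
})
print(sizes)  # xz is typically the smallest, at the cost of write time
```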
19 changes: 19 additions & 0 deletions man/check_list_of_ncfiles.Rd


1 change: 1 addition & 0 deletions vignettes/map2tidy_example.Rmd
@@ -54,6 +54,7 @@ head(df)
The complete time series are now nested data frames in column `data`. We can plot them.
```{r}
library(ggplot2)
library(lubridate)
df |>
dplyr::ungroup() |> dplyr::slice(1:6) |> tidyr::unnest(cols = data) |>
dplyr::mutate(datetime = lubridate::ymd(datetime),
77 changes: 41 additions & 36 deletions vignettes/parallel_computation.Rmd
@@ -4,7 +4,7 @@ author: "Beni Stocker"
date: "2023-11-03"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{map2tidy functionality}
%\VignetteIndexEntry{Parallel computation on large data}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
@@ -29,16 +29,22 @@ This is demonstrated below.

Convert geospatial raster data (e.g., saved as NetCDF, potentially as a list of NetCDF files for multiple time slices) into a tidy data frame. To avoid the memory limitation, we write tidy data into multiple files, separated by longitudinal bands.

This is demonstrated and explained in the vignette *map2tidy functionality*. The demo data are raster files of 30 x 30 in latitude and longitude.
This is demonstrated and explained in the vignette *map2tidy example*. The demo data are raster files of 30 x 30 in latitude and longitude.
```{r eval = FALSE}
map2tidy(
library(dplyr)
library(map2tidy)
library(lubridate)
path <- file.path(system.file(package = "map2tidy"),"extdata")
files <- list.files(path, pattern = "demo_data_2017_month", full.names = TRUE)
outdir <- tempdir()
res_tidy <- map2tidy(
nclist = files,
varnam = "et",
lonnam = "lon",
latnam = "lat",
timenam = "time",
do_chunks = TRUE,
outdir = tempdir(),
outdir = outdir,
fileprefix = "demo_data_2017",
ncores = 3,
overwrite = TRUE
@@ -53,28 +59,28 @@ This can be anything. Here, we perform a LOESS spline on the time series of the
```{r}
add_loess <- function(df){
df <- df |>
dplyr::mutate(year_dec = lubridate::decimal_date(time)) %>%
mutate(year_dec = lubridate::decimal_date(lubridate::ymd(datetime))) %>%
dplyr::mutate(loess = stats::loess( et ~ year_dec,
data = .,
span = 0.05 )$fitted)
return(df)
}
# # test it:
# df <- readRDS(file.path(outdir, "demo_data_2017_LON_+000.125.rds")) |>
# slice(1) |> unnest(data) |> select(datetime, et)
# add_loess(df)
```
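As a standalone illustration of the `stats::loess()` call used in `add_loess()` above (the series here is synthetic; note that a span as small as 0.05 requires a long time series, so a larger span is used in this sketch):

```r
set.seed(42)
# Synthetic one-year series at roughly weekly resolution.
year_dec <- seq(2017, 2018, length.out = 52)
et <- sin(2 * pi * year_dec) + rnorm(52, sd = 0.1)

# Fit a local regression; $fitted returns one smoothed value per observation.
fit <- stats::loess(et ~ year_dec, span = 0.5)
smoothed <- fit$fitted
length(smoothed)  # 52
```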


## By longitudinal band

Wrap that function into another function `add_loess_byilon()` that takes the longitude index as its first argument and loops over gridcells within that longitudinal band. The data frame `df` is already nested. Write modified data
for this longitudinal band to file again. Write this function into a file `R/add_loess_byilon.R`
Wrap that function into another function `add_loess_byLON()` that takes a single longitude string (e.g. `"LON_+000.125"`) as its argument and loops over gridcells (i.e. latitude values) within that longitudinal band. The data frame `df` is already nested. Write the modified data for this longitudinal band to file again. Save this function in a file `R/add_loess_byLON.R`
```{r}
add_loess_byilon <- function(ilon){
source(paste0(here::here(), "/R/add_loess.R"))
add_loess_byLON <- function(LON_str){
# read from file that contains tidy data for a single longitudinal band
filnam <- list.files(tempdir(),
pattern = paste0("_", as.character(ilon), ".rds"),
full.names = TRUE)
filnam <- file.path(outdir, paste0("demo_data_2017_", LON_str, ".rds"))
df <- readr::read_rds(filnam)
@@ -92,16 +98,19 @@ add_loess_byilon <- function(ilon){
# write (complemented) data to file
readr::write_rds(df,
paste0(tempdir(),
"/demo_data_2017_ilon_",
ilon,
"_COMPLEMENTED.rds"))
file.path(outdir, paste0("demo_data_2017_", LON_str, "_COMPLEMENTED_xz.rds")),
compress = "xz") # xz seems most efficient
}
# # test it:
# LON <- "LON_+000.125"
# add_loess_byLON(LON)
# readRDS(file.path(outdir, "demo_data_2017_LON_+000.125_COMPLEMENTED_xz.rds")) |>
# slice(1) |> unnest(data)
```

## Distribute jobs

Write an R script that distributes jobs for parallel computation, where each job processes data of one chunk and makes a number of calls to `add_loess_byilon()`. For example, if we have 30 longitudinal bands and 3 chunks, each chunk makes 10 calls. Write this script so that it can be run from the shell and that it can deal with three arguments:
Write an R script that distributes jobs for parallel computation, where each job processes data of one chunk and makes a number of calls to `add_loess_byLON()`. For example, if we have 30 longitudinal bands and 3 chunks, each chunk makes 10 calls. Write this script so that it can be run from the shell and that it can deal with three arguments:

- an index for the chunk
- the total number of chunks
@@ -113,13 +122,9 @@ Such an R script would look like this:
args = commandArgs(trailingOnly=TRUE) # to receive arguments from the shell
library(map2tidy)
source(paste0(here::here(), "/R/add_loess_byilon.R"))
print("getting data for longitude indices:")
vec_index <- map2tidy::get_index_by_chunk(as.integer(args[1]),
as.integer(args[2]),
as.integer(args[3]))
# list_of_LON_str = c("LON_+046.250", "LON_+046.750")
list_of_LON_str <- gsub(".*(LON_[0-9.+-]*).*.rds","\\1", res_tidy$data) # extract LONGITUDES from res_tidy
# get all available cores
ncores <- 3 # parallel::detectCores()
@@ -132,24 +137,24 @@ cl <- multidplyr::new_cluster(ncores) %>%
"tidyr",
"readr",
"here")) %>%
multidplyr::cluster_assign(add_loess_byilon = add_loess_byilon)
multidplyr::cluster_assign(add_loess_byLON = add_loess_byLON)
# distribute computation across the cores, calculating for all longitudinal
# indices of this chunk
out <- tibble(ilon = vec_index) %>%
multidplyr::partition(cl) %>%
dplyr::mutate(out = purrr::map( ilon,
~add_loess_byilon(.)))
out <- tibble(LON_str = list_of_LON_str) %>%
# multidplyr::partition(cl) %>%
dplyr::mutate(out = purrr::map( LON_str,
~add_loess_byLON(.)))
```
Save the script as `analysis/add_loess_bychunk.R`.
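The `gsub()` call in the script above extracts the `LON_...` token from the worker messages collected in `res_tidy$data`. A minimal sketch with made-up messages (job ids and paths are illustrative):

```r
# Messages in the format produced by the map2tidy workers.
msgs <- c(
  "Written data by worker with jobid: 101 into file: /tmp/demo_data_2017_LON_+000.125.rds",
  "Written data by worker with jobid: 102 into file: /tmp/demo_data_2017_LON_-000.250.rds"
)

# Keep only the captured LON_ token from each message.
lon_str <- gsub(".*(LON_[0-9.+-]*).*.rds", "\\1", msgs)
print(lon_str)  # "LON_+000.125" "LON_-000.250"
```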

An example shell script to send four jobs on HPC (here: ETH Euler) looks like this:
```{sh eval = FALSE}
#!/bin/bash
njobs=4
nlon=30
for ((n=1;n<=${njobs};n++)); do
echo "Submitting chunk number $n ..."
bsub -W 72:00 -u bestocke -J "job_name $n" -R "rusage[mem=10000]" "Rscript vanilla analysis/add_loess_bychunk.R $n $njobs $nlon"
done
## #!/bin/bash
## njobs=4
## nlon=30
## for ((n=1;n<=${njobs};n++)); do
## echo "Submitting chunk number $n ..."
## bsub -W 72:00 -u bestocke -J "job_name $n" -R "rusage[mem=10000]" "Rscript vanilla analysis/add_loess_bychunk.R $n $njobs $nlon"
## done
```
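The loop above submits one job per chunk, passing the chunk index and totals as arguments. A sketch of how a chunk index can be mapped to a subset of the 30 longitudinal bands (the helper name and band values are illustrative, not part of map2tidy):

```r
# Split the bands into n_chunks roughly equal groups and return one of them.
chunkify <- function(items, chunk_id, n_chunks) {
  split(items, cut(seq_along(items), n_chunks, labels = FALSE))[[chunk_id]]
}

# 30 bands named with the zero-padded convention, e.g. "LON_+000.375".
bands <- sprintf("LON_%+08.3f", seq(-7.125, 7.375, by = 0.5))
chunk1 <- chunkify(bands, 1, 3)
length(chunk1)  # 10
```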
