# Updated vignettes and README for v3.0 #76

**Merged** · 3 commits · Mar 26, 2024
13 changes: 9 additions & 4 deletions DESCRIPTION
@@ -1,13 +1,19 @@
Package: FluxDataKit
Title: Flux Data Kit
Version: 0.9
Version: 3.0
Authors@R: c(
person(
family = "Hufkens",
given = "Koen",
email = "[email protected]",
role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-5070-8109"))
comment = c(ORCID = "0000-0002-5070-8109")),
person(
family = "Benjamin",
given = "Stocker",
email = "[email protected]",
comment = c(ORCID = "0000-0003-2697-9096"),
role = c("ctb"))
)
Description: A processing workflow for aggregated flux and remote sensing data.
Returns harmonized and gap-filled data as both Land Surface Model (NetCDF) and CSV based output.
@@ -29,8 +35,7 @@ Imports:
lubridate,
recipes,
readr,
here,
cowplot
here
Suggests:
knitr,
rmarkdown,
123 changes: 83 additions & 40 deletions R/fdk_get_sequence.R
@@ -28,45 +28,36 @@ fdk_get_sequence <- function(

df <- df |>
mutate(good_gpp = ifelse(NEE_VUT_REF_QC > qc_threshold, TRUE, FALSE),
good_le = ifelse(LE_F_MDS_QC > qc_threshold, TRUE, FALSE))

# determine sequences of consecutive TRUE and merge if gap between them is short
instances_merged <- get_consecutive(
df$good_gpp,
merge_threshold = leng_threshold,
do_merge = TRUE
)

df_sequences_merged <- tibble(
start = lubridate::as_date(df$TIMESTAMP[instances_merged$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[instances_merged$idx_start + instances_merged$len - 1])
)

# determine longest sequence of good quality data
longest_sequence <- instances_merged |>
filter(len == max(instances_merged$len))

out <- tibble(
sitename = site,
start = lubridate::as_date(df$TIMESTAMP[longest_sequence$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[longest_sequence$idx_start + longest_sequence$len - 1])) |>

# truncate to entire years (1. Jan - 31. Dec)
mutate(
year_start_fullyearsequence = ifelse(
lubridate::yday(start) == 1,
lubridate::year(start),
lubridate::year(start) + 1),
year_end_fullyearsequence = ifelse(
lubridate::yday(end) >= 365,
lubridate::year(end),
lubridate::year(end) - 1
)) |>
mutate(
nyears = year_end_fullyearsequence - year_start_fullyearsequence + 1
good_le = ifelse(LE_F_MDS_QC > qc_threshold, TRUE, FALSE),
good_lecorr = ifelse(LE_F_MDS_QC > qc_threshold & !is.na(LE_CORR), TRUE, FALSE)
)

out <- get_sequence_byvar(site, df, df$good_gpp, leng_threshold, TRUE) |>
rename(start_gpp = start,
end_gpp = end,
year_start_gpp = year_start,
year_end_gpp = year_end,
nyears_gpp = nyears,
drop_gpp = drop) |>
left_join(
get_sequence_byvar(site, df, df$good_le, leng_threshold, TRUE) |>
rename(start_le = start,
end_le = end,
year_start_le = year_start,
year_end_le = year_end,
nyears_le = nyears,
drop_le = drop),
by = join_by(sitename)
) |>
mutate(
drop = ifelse(nyears < 1, TRUE, FALSE)
left_join(
get_sequence_byvar(site, df, df$good_lecorr, leng_threshold, TRUE) |>
rename(start_lecorr = start,
end_lecorr = end,
year_start_lecorr = year_start,
year_end_lecorr = year_end,
nyears_lecorr = nyears,
drop_lecorr = drop),
by = join_by(sitename)
)

if (do_plot){
@@ -128,8 +119,8 @@ fdk_get_sequence <- function(
ggplot2::geom_rect(
data = out,
ggplot2::aes(
xmin = lubridate::ymd(paste0(year_start_fullyearsequence, "-01-01")),
xmax = lubridate::ymd(paste0(year_end_fullyearsequence, "-12-31")),
xmin = lubridate::ymd(paste0(year_start_gpp, "-01-01")),
xmax = lubridate::ymd(paste0(year_end_gpp, "-12-31")),
ymin = min(df$GPP_NT_VUT_REF, na.rm = TRUE),
ymax = max(df$GPP_NT_VUT_REF, na.rm = TRUE)
),
@@ -169,6 +160,58 @@ fdk_get_sequence <- function(
return(out)
}

get_sequence_byvar <- function(site, df, good, leng_threshold, do_merge){

if (any(good)){
# determine sequences of consecutive TRUE and merge if gap between them is short
inst_merged <- get_consecutive(
good,
merge_threshold = leng_threshold,
do_merge = do_merge
)

# determine longest sequence of good quality data
longest_seq <- inst_merged |>
filter(len == max(inst_merged$len))

# get start and end date of longest sequences
out <- tibble(
sitename = site,
start = lubridate::as_date(df$TIMESTAMP[longest_seq$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[longest_seq$idx_start + longest_seq$len - 1])) |>

# truncate to entire years (1. Jan - 31. Dec)
mutate(
year_start = ifelse(
lubridate::yday(start) == 1,
lubridate::year(start),
lubridate::year(start) + 1),
year_end = ifelse(
lubridate::yday(end) >= 365,
lubridate::year(end),
lubridate::year(end) - 1
)) |>
mutate(
nyears = year_end - year_start + 1
) |>
mutate(
drop = ifelse(nyears < 1, TRUE, FALSE)
)
} else {
out <- tibble(
sitename = site,
start = NA,
end = NA,
year_start = NA,
year_end = NA,
nyears = 0,
drop = TRUE
)
  }

  return(out)
}
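# Illustrative example (assuming the merging behaviour described in the
# comments above): two runs of TRUE separated by a 2-step gap, with a merge
# threshold of 5, should collapse into a single instance:
#   get_consecutive(c(T, T, T, F, F, T, T), merge_threshold = 5, do_merge = TRUE)
#   #> tibble(idx_start = 1, len = 7)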

get_consecutive <- function(
good,
merge_threshold = 5,
40 changes: 20 additions & 20 deletions README.md
@@ -1,4 +1,4 @@
# Fluxnet aggregation project
# Multi-network ecosystem flux data compilation

This project is the framework used to create the LEMONTREE "flux data kit", a dataset with consistent model data for use and re-use. In the interest of consistency across the community, we re-use the PLUMBER-2 framework, with a few exceptions. The PLUMBER-2 framework generated consistent gap-filled data for land surface modelling. We use the same methods (from the underlying FluxnetLSM package) to provide an expanded dataset covering more sites and site years.

@@ -8,12 +8,12 @@ The data is generated using [set workflow]() and new releases generated using th

## Ecosystem flux data sources

We sourced data from openly available ecosystem flux networks or products, mainly ICOS, OneFlux processed data, the FLUXNET2015 dataset and PLUMBER-2 (which includes various data sources in its own right, see Ukkola et al. 2022). Data was sourced from these locations:
We sourced data from openly available ecosystem flux data products:

- ICOS data was provided through the ICOS carbon portal, this is a pre-release currently *not publicly available*
- FLUXNET2015 data can be retrieved from the [FLUXNET data portal](https://fluxnet.org/data/fluxnet2015-dataset/)
- OneFlux data can be retrieved from the [Ameriflux data portal](https://ameriflux.lbl.gov/data/download-data/)
- PLUMBER data can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- PLUMBER-2: https://dx.doi.org/10.25914/5fdb0902607e1. Can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- The latest Ameriflux release, downloaded on 14 Oct 2023 from https://ameriflux.lbl.gov/.
- ICOS Drought2018 release from https://doi.org/10.18160/YVR0-4898.
- ICOS WarmWinter2020 release from https://doi.org/10.18160/2G60-ZHAK.
- MODIS LAI/FPAR data is downloaded by an included script

@@ -22,21 +22,17 @@
Data should be structured in the following directory structure and referred
to as such in the data generation workflow:
```
data/
├─ modis/
├─ cloud_cover/
├─ flux_data/
├─ fluxnet2015/
├─ icos/
├─ oneflux/
├─ plumber/
├─ icos_warmwinter2020/
├─ icos_drought2018/
├─ ameriflux/
```

## Ecosystem flux data selection

Given the various datasets, and at times overlap between the datasets a priority in processing is given to more recent (hopefully) and more complete datasets. In order of processing this means that OneFlux has priority over FLUXNET2015, and Plumber2. ICOS data has priority over FLUXNET2015 for European sites. Overall, Plumber2 mostly fills in the remaining sites in Asia and Australia. The final picking order is thus:

- ICOS
- OneFlux
- FLUXNET2015
- PLUMBER-2
The flux data source (PLUMBER-2, Ameriflux, ICOS WarmWinter2020, or ICOS Drought2018) is determined for each site based on which source provides the longest data time series. Site meta information is drawn from multiple sources to maximise the available information. This is done in scripts `data-raw/01_collect_meta-data.R` and `data-raw/02_compile_final_site_list.R` (a sketch of the per-site selection is shown below).
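
As an illustration, the per-site choice can be sketched as below (hypothetical `site_overview` table with illustrative values; not the exact logic of the compilation scripts):

```r
library(dplyr)

# One row per site x product; values are illustrative only
site_overview <- tibble::tribble(
  ~sitename, ~product,             ~year_start, ~year_end,
  "CH-Dav",  "plumber2",            1997,        2018,
  "CH-Dav",  "icos_warmwinter2020", 1997,        2020
)

# Keep, per site, the product covering the longest time series
selected <- site_overview |>
  mutate(nyears = year_end - year_start + 1) |>
  group_by(sitename) |>
  slice_max(nyears, n = 1, with_ties = FALSE) |>
  ungroup()
```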

## Data products

@@ -46,21 +42,25 @@ We deliver gap filled ecosystem flux data in line with the PLUMBER dataset. We r

#### Exceptions and processing differences

Contrary to the original PLUMBER data we report both data for a closed energy balance, and the raw data inputs (on request of some data users). Furthermore, we report both MODIS based leaf area index (LAI) and fraction of absorbed photosynthetic active radiation (FAPAR). Processing of the MODIS data was also altered and now follows a workflow similar to the one integrated in the {phenocamr} package. Data is smoothed using a LOESS based curve fitting with a BIC optimized smoothing kernel, instead of multiple cubic splines.
Contrary to the original PLUMBER data, we report both data for a closed energy balance and the raw data inputs (on request of some data users). Furthermore, we report both MODIS-based leaf area index (LAI) and the fraction of absorbed photosynthetically active radiation (FPAR). Processing of the MODIS data was also altered and now follows a workflow similar to the one integrated in the {phenocamr} package. Data are smoothed using LOESS-based curve fitting with a BIC-optimized smoothing kernel, instead of multiple cubic splines.
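
As a rough illustration, span selection by BIC for a LOESS fit could look like the sketch below (the `optim_loess` helper, the span grid, and the BIC form are assumptions for illustration; the packaged workflow follows {phenocamr} and may differ in detail):

```r
# Choose a LOESS smoothing span by minimising BIC, using the trace of the
# smoother matrix as the effective number of parameters
optim_loess <- function(y, x = seq_along(y), spans = seq(0.1, 1, by = 0.05)) {
  bic <- sapply(spans, function(s) {
    fit <- try(suppressWarnings(loess(y ~ x, span = s)), silent = TRUE)
    if (inherits(fit, "try-error")) return(Inf)
    n <- length(fit$residuals)
    n * log(mean(fit$residuals^2)) + log(n) * fit$trace.hat
  })
  loess(y ~ x, span = spans[which.min(bic)])
}

# e.g. smoothing a noisy LAI-like seasonal signal
y <- sin(seq(0, 4 * pi, length.out = 200)) + rnorm(200, sd = 0.2)
smoothed <- predict(optim_loess(y))
```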

### Half-hourly and daily FLUXNET data output (CSV)

To provide easily readable data as requested by some data users we convert the netCDF data to a human-readable CSV file adhering to FLUXNET column- and file-naming conventions. These half-hourly files are further downsampled to a daily time step for modelling efforts which require daily data. The daily data should be easily merged on a day by day basis with remote sensing data as provided by the FluxnetEO data product (Walther et al. 2022).
To provide easily readable data as requested by some data users, we convert the NetCDF data to human-readable CSV files adhering to FLUXNET column- and file-naming conventions. These half-hourly files are further downsampled to a daily time step for modelling efforts that require daily data. The daily data should be easily merged on a day-by-day basis with remote sensing data as provided by the FluxnetEO data product (Walther et al. 2022).

> Downsampled daily data is an aggregation of the half-hourly data and not, as would be the case when downloading daily data from an ecosystem flux processing chain, a completely separate product. Some discrepancies therefore exist between the downsampled data and the equivalent daily ecosystem flux product.
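
As an illustration, such a daily aggregation might look like the sketch below (hypothetical file name; `TIMESTAMP_START` follows the FLUXNET half-hourly naming convention; this is not the package's exact downsampling code):

```r
library(dplyr)
library(lubridate)
library(readr)

# Hypothetical half-hourly FLUXNET-formatted CSV file
hh <- read_csv("FLX_CH-Dav_FLUXDATAKIT_FULLSET_HH_1997_2020.csv")

# Aggregate all numeric columns to daily means
daily <- hh |>
  mutate(date = as_date(ymd_hm(as.character(TIMESTAMP_START)))) |>
  group_by(date) |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
```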

### p-model drivers (structured R data)
### rsofun drivers (structured R data)

A final data product derived from the initial gap-filled LSM data is the driver data for the [`rsofun`](https://github.com/geco-bern/rsofun) package. In the current setup, *in-situ* measured model forcing data is combined with GPP and LE values (including their quality-control information) as target data for model calibration.
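
A brief sketch of inspecting these drivers, assuming they are shipped as an RDS file holding a nested tibble (file name and structure are assumptions for illustration):

```r
# Hypothetical file name for the rsofun driver data
drivers <- readRDS("rsofun_driver_data_v3.rds")

str(drivers, max.level = 1)  # one row per site; list-columns hold nested data
drivers$forcing[[1]]         # forcing (and target) time series of site 1
```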

### Additional data cleaning

A final data product derived from the initial gap-filled LSM data are p-model driver data for the [`rsofun`](https://github.com/geco-bern/rsofun) package. In the current setup *in-situ* environmental forcing will be combined with GPP values as target data for model calibration.
Information about the longest sequence of full years (365 days) of good-quality gap-filled daily GPP, LE, and LE_CORR data for each site is provided by the package data `fdk_site_fullyearsequence`, created by `analysis/03_screen_rsofun_data.R`. It records the start and end dates and the full years for which these sequences are available.
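
For example, the package data can be used to subset sites with a minimum number of full years of good-quality GPP (column names follow the renaming in `R/fdk_get_sequence.R`):

```r
library(dplyr)
library(FluxDataKit)

fdk_site_fullyearsequence |>
  filter(!drop_gpp, nyears_gpp >= 3) |>
  select(sitename, year_start_gpp, year_end_gpp, nyears_gpp)
```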

### Ancillary remote sensing data

For machine learning or other modelling purposes we provide ancillary MODIS based remote sensing data as described in the FluxnetEO dataset. We refer to the original publication and our [FluxnetEO](https://bg.copernicus.org/articles/19/2805/2022/) package for easy reading and processing of the data.
For machine learning or other modelling purposes, we provide ancillary MODIS based remote sensing data as described in the FluxnetEO dataset. We refer to the original publication and our [FluxnetEO](https://bg.copernicus.org/articles/19/2805/2022/) package for easy reading and processing of the data.

## Data and code availability

2 changes: 1 addition & 1 deletion analysis/03_screen_rsofun_data.R
@@ -2,7 +2,7 @@
library(tidyverse)
library(FluxDataKit)

path <- "~/data/FluxDataKit/v3" # "/data/scratch/beta-v4"
path <- "~/data/FluxDataKit/v3"

sites <- FluxDataKit::fdk_site_info |>
filter(!(sitename %in% c("MX-Tes", "US-KS3")))
48 changes: 24 additions & 24 deletions analysis/04_create_zenodo_upload.R
@@ -11,33 +11,33 @@
# the Zenodo repository:
# https://zenodo.org/record/7258291

input_path <- "/data/scratch/beta-v4/"
tmp_path <- "/data/scratch/upload"
input_path <- "~/data/FluxDataKit/v3/"
tmp_path <- "~/data/FluxDataKit/v3/zenodo_upload/"

#---- purge old data -----

# remove temporary path
system(sprintf("rm -rf %s", tmp_path))

# recreate temporary path
dir.create(tmp_path)

#---- copy new data over ----
system(
sprintf(
"cp -R %s/lsm %s/lsm",
input_path,
tmp_path
)
)

system(
sprintf(
"cp -R %s/fluxnet %s/fluxnet",
input_path,
tmp_path
)
)
# # remove temporary path
# system(sprintf("rm -rf %s", tmp_path))
#
# # recreate temporary path
# dir.create(tmp_path)
#
# #---- copy new data over ----
# system(
# sprintf(
# "cp -R %s/lsm %s/lsm",
# input_path,
# tmp_path
# )
# )
#
# system(
# sprintf(
# "cp -R %s/fluxnet %s/fluxnet",
# input_path,
# tmp_path
# )
# )

#---- rename all files in place ----

8 changes: 4 additions & 4 deletions data-raw/README.md
@@ -21,10 +21,10 @@ for them to function.

Data was sourced from different locations:

- ICOS data was provided through the ICOS carbon portal, this is a pre-release currently *not publicly available*
- FLUXNET2015 data can be retrieved from the [FLUXNET data portal](https://fluxnet.org/data/fluxnet2015-dataset/)
- OneFlux data can be retrieved from the [Ameriflux data portal](https://ameriflux.lbl.gov/data/download-data/)
- PLUMBER data can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- PLUMBER-2: https://dx.doi.org/10.25914/5fdb0902607e1. Can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- The latest Ameriflux release, downloaded on 14 Oct 2023 from https://ameriflux.lbl.gov/.
- ICOS Drought2018 release from https://doi.org/10.18160/YVR0-4898.
- ICOS WarmWinter2020 release from https://doi.org/10.18160/2G60-ZHAK.
- MODIS LAI/FPAR data is downloaded by an included script

## Data structure
Expand Down
Binary file modified data/fdk_site_fullyearsequence.rda