# Updated vignettes and README for v3.0 #76

**Merged** · 3 commits · Mar 26, 2024
13 changes: 9 additions & 4 deletions DESCRIPTION
@@ -1,13 +1,19 @@
Package: FluxDataKit
Title: Flux Data Kit
Version: 0.9
Version: 3.0
Authors@R: c(
person(
family = "Hufkens",
given = "Koen",
email = "[email protected]",
role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-5070-8109"))
comment = c(ORCID = "0000-0002-5070-8109")),
person(
family = "Benjamin",
given = "Stocker",
email = "[email protected]",
comment = c(ORCID = "0000-0003-2697-9096"),
role = c("ctb"))
)
Description: A processing workflow for aggregated flux and remote sensing data.
Returns harmonized and gap-filled data as both Land Surface Model (NetCDF) and CSV based output.
@@ -29,8 +35,7 @@ Imports:
lubridate,
recipes,
readr,
here,
cowplot
here
Suggests:
knitr,
rmarkdown,
123 changes: 83 additions & 40 deletions R/fdk_get_sequence.R
@@ -28,45 +28,36 @@ fdk_get_sequence <- function(

df <- df |>
mutate(good_gpp = ifelse(NEE_VUT_REF_QC > qc_threshold, TRUE, FALSE),
good_le = ifelse(LE_F_MDS_QC > qc_threshold, TRUE, FALSE))

# determine sequences of consecutive TRUE and merge if gap between them is short
instances_merged <- get_consecutive(
df$good_gpp,
merge_threshold = leng_threshold,
do_merge = TRUE
)

df_sequences_merged <- tibble(
start = lubridate::as_date(df$TIMESTAMP[instances_merged$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[instances_merged$idx_start + instances_merged$len - 1])
)

# determine longest sequence of good quality data
longest_sequence <- instances_merged |>
filter(len == max(instances_merged$len))

out <- tibble(
sitename = site,
start = lubridate::as_date(df$TIMESTAMP[longest_sequence$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[longest_sequence$idx_start + longest_sequence$len - 1])) |>

# truncate to entire years (1. Jan - 31. Dec)
mutate(
year_start_fullyearsequence = ifelse(
lubridate::yday(start) == 1,
lubridate::year(start),
lubridate::year(start) + 1),
year_end_fullyearsequence = ifelse(
lubridate::yday(end) >= 365,
lubridate::year(end),
lubridate::year(end) - 1
)) |>
mutate(
nyears = year_end_fullyearsequence - year_start_fullyearsequence + 1
good_le = ifelse(LE_F_MDS_QC > qc_threshold, TRUE, FALSE),
good_lecorr = ifelse(LE_F_MDS_QC > qc_threshold & !is.na(LE_CORR), TRUE, FALSE)
)

out <- get_sequence_byvar(site, df, df$good_gpp, leng_threshold, TRUE) |>
rename(start_gpp = start,
end_gpp = end,
year_start_gpp = year_start,
year_end_gpp = year_end,
nyears_gpp = nyears,
drop_gpp = drop) |>
left_join(
get_sequence_byvar(site, df, df$good_le, leng_threshold, TRUE) |>
rename(start_le = start,
end_le = end,
year_start_le = year_start,
year_end_le = year_end,
nyears_le = nyears,
drop_le = drop),
by = join_by(sitename)
) |>
mutate(
drop = ifelse(nyears < 1, TRUE, FALSE)
left_join(
get_sequence_byvar(site, df, df$good_lecorr, leng_threshold, TRUE) |>
rename(start_lecorr = start,
end_lecorr = end,
year_start_lecorr = year_start,
year_end_lecorr = year_end,
nyears_lecorr = nyears,
drop_lecorr = drop),
by = join_by(sitename)
)

if (do_plot){
@@ -128,8 +119,8 @@ fdk_get_sequence <- function(
ggplot2::geom_rect(
data = out,
ggplot2::aes(
xmin = lubridate::ymd(paste0(year_start_fullyearsequence, "-01-01")),
xmax = lubridate::ymd(paste0(year_end_fullyearsequence, "-12-31")),
xmin = lubridate::ymd(paste0(year_start_gpp, "-01-01")),
xmax = lubridate::ymd(paste0(year_end_gpp, "-12-31")),
ymin = min(df$GPP_NT_VUT_REF, na.rm = TRUE),
ymax = max(df$GPP_NT_VUT_REF, na.rm = TRUE)
),
@@ -169,6 +160,58 @@ fdk_get_sequence <- function(
return(out)
}

get_sequence_byvar <- function(site, df, good, leng_threshold, do_merge){

if (any(good)){
# determine sequences of consecutive TRUE and merge if gap between them is short
inst_merged <- get_consecutive(
good,
merge_threshold = leng_threshold,
do_merge = do_merge
)

# determine longest sequence of good quality data
longest_seq <- inst_merged |>
filter(len == max(inst_merged$len))

# get start and end date of longest sequences
out <- tibble(
sitename = site,
start = lubridate::as_date(df$TIMESTAMP[longest_seq$idx_start]),
end = lubridate::as_date(df$TIMESTAMP[longest_seq$idx_start + longest_seq$len - 1])) |>

# truncate to entire years (1. Jan - 31. Dec)
mutate(
year_start = ifelse(
lubridate::yday(start) == 1,
lubridate::year(start),
lubridate::year(start) + 1),
year_end = ifelse(
lubridate::yday(end) >= 365,
lubridate::year(end),
lubridate::year(end) - 1
)) |>
mutate(
nyears = year_end - year_start + 1
) |>
mutate(
drop = ifelse(nyears < 1, TRUE, FALSE)
)
} else {
out <- tibble(
sitename = site,
start = NA,
end = NA,
year_start = NA,
year_end = NA,
nyears = 0,
drop = TRUE
)
  }

  return(out)
}
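# Illustrative example (assuming the merging behaviour described in the
# comments above): two runs of TRUE separated by a 2-step gap, with a merge
# threshold of 5, should collapse into a single instance:
#   get_consecutive(c(T, T, T, F, F, T, T), merge_threshold = 5, do_merge = TRUE)
#   #> tibble(idx_start = 1, len = 7)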

get_consecutive <- function(
good,
merge_threshold = 5,
40 changes: 20 additions & 20 deletions README.md
@@ -1,4 +1,4 @@
# Fluxnet aggregation project
# Multi-network ecosystem flux data compilation

This project is the framework used to create the LEMONTREE "flux data kit", a dataset with consistent model data for use and re-use. In the interest of consistency across the community, we re-use the PLUMBER-2 framework, with a few exceptions. The PLUMBER-2 framework generated consistent gap-filled data for land surface modelling. We use the same methods (from the underlying FluxnetLSM package) to provide an expanded dataset covering more sites and site years.

@@ -8,12 +8,12 @@ The data is generated using [set workflow]() and new releases generated using th

## Ecosystem flux data sources

We sourced data from openly available ecosystem flux networks or products, mainly ICOS, OneFlux processed data, the FLUXNET2015 dataset and PLUMBER-2 (which includes various data sources in its own right, see Ukkola et al. 2022). Data was sourced from these locations:
We sourced data from openly available ecosystem flux data products:

- ICOS data was provided through the ICOS carbon portal, this is a pre-release currently *not publicly available*
- FLUXNET2015 data can be retrieved from the [FLUXNET data portal](https://fluxnet.org/data/fluxnet2015-dataset/)
- OneFlux data can be retrieved from the [Ameriflux data portal](https://ameriflux.lbl.gov/data/download-data/)
- PLUMBER data can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- PLUMBER-2: https://dx.doi.org/10.25914/5fdb0902607e1. Can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- The latest Ameriflux release, downloaded on 14 Oct 2023 from https://ameriflux.lbl.gov/.
- ICOS Drought2018 release from https://doi.org/10.18160/YVR0-4898.
- ICOS WarmWinter2020 release from https://doi.org/10.18160/2G60-ZHAK.
- MODIS LAI/FPAR data is downloaded by an included script

@@ -22,21 +22,17 @@
Data should be structured in the following directory structure and referred
to as such in the data generation workflow:
```
data/
├─ modis/
├─ cloud_cover/
├─ flux_data/
├─ fluxnet2015/
├─ icos/
├─ oneflux/
├─ plumber/
├─ icos_warmwinter2020/
├─ icos_drought2018/
├─ ameriflux/
```

## Ecosystem flux data selection

Given the various datasets, and at times overlap between the datasets a priority in processing is given to more recent (hopefully) and more complete datasets. In order of processing this means that OneFlux has priority over FLUXNET2015, and Plumber2. ICOS data has priority over FLUXNET2015 for European sites. Overall, Plumber2 mostly fills in the remaining sites in Asia and Australia. The final picking order is thus:

- ICOS
- OneFlux
- FLUXNET2015
- PLUMBER-2
The flux data source (PLUMBER-2, Ameriflux, ICOS WarmWinter2020, or ICOS Drought2018) is determined for each site based on which source provides the longest data time series. Site meta information is drawn from multiple sources to maximise the available information. This is done in scripts `data-raw/01_collect_meta-data.R` and `data-raw/02_compile_final_site_list.R` (a sketch of the per-site selection is shown below).
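
As an illustration, the per-site choice can be sketched as below (hypothetical `site_overview` table with illustrative values; not the exact logic of the compilation scripts):

```r
library(dplyr)

# One row per site x product; values are illustrative only
site_overview <- tibble::tribble(
  ~sitename, ~product,             ~year_start, ~year_end,
  "CH-Dav",  "plumber2",            1997,        2018,
  "CH-Dav",  "icos_warmwinter2020", 1997,        2020
)

# Keep, per site, the product covering the longest time series
selected <- site_overview |>
  mutate(nyears = year_end - year_start + 1) |>
  group_by(sitename) |>
  slice_max(nyears, n = 1, with_ties = FALSE) |>
  ungroup()
```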

## Data products

@@ -46,21 +42,25 @@ We deliver gap filled ecosystem flux data in line with the PLUMBER dataset. We r

#### Exceptions and processing differences

Contrary to the original PLUMBER data we report both data for a closed energy balance, and the raw data inputs (on request of some data users). Furthermore, we report both MODIS based leaf area index (LAI) and fraction of absorbed photosynthetic active radiation (FAPAR). Processing of the MODIS data was also altered and now follows a workflow similar to the one integrated in the {phenocamr} package. Data is smoothed using a LOESS based curve fitting with a BIC optimized smoothing kernel, instead of multiple cubic splines.
Contrary to the original PLUMBER data, we report both data for a closed energy balance and the raw data inputs (on request of some data users). Furthermore, we report both MODIS-based leaf area index (LAI) and the fraction of absorbed photosynthetically active radiation (FPAR). Processing of the MODIS data was also altered and now follows a workflow similar to the one integrated in the {phenocamr} package. Data are smoothed using LOESS-based curve fitting with a BIC-optimized smoothing kernel, instead of multiple cubic splines.
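
As a rough illustration, span selection by BIC for a LOESS fit could look like the sketch below (the `optim_loess` helper, the span grid, and the BIC form are assumptions for illustration; the packaged workflow follows {phenocamr} and may differ in detail):

```r
# Choose a LOESS smoothing span by minimising BIC, using the trace of the
# smoother matrix as the effective number of parameters
optim_loess <- function(y, x = seq_along(y), spans = seq(0.1, 1, by = 0.05)) {
  bic <- sapply(spans, function(s) {
    fit <- try(suppressWarnings(loess(y ~ x, span = s)), silent = TRUE)
    if (inherits(fit, "try-error")) return(Inf)
    n <- length(fit$residuals)
    n * log(mean(fit$residuals^2)) + log(n) * fit$trace.hat
  })
  loess(y ~ x, span = spans[which.min(bic)])
}

# e.g. smoothing a noisy LAI-like seasonal signal
y <- sin(seq(0, 4 * pi, length.out = 200)) + rnorm(200, sd = 0.2)
smoothed <- predict(optim_loess(y))
```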

### Half-hourly and daily FLUXNET data output (CSV)

To provide easily readable data as requested by some data users we convert the netCDF data to a human-readable CSV file adhering to FLUXNET column- and file-naming conventions. These half-hourly files are further downsampled to a daily time step for modelling efforts which require daily data. The daily data should be easily merged on a day by day basis with remote sensing data as provided by the FluxnetEO data product (Walther et al. 2022).
To provide easily readable data as requested by some data users, we convert the NetCDF data to human-readable CSV files adhering to FLUXNET column- and file-naming conventions. These half-hourly files are further downsampled to a daily time step for modelling efforts that require daily data. The daily data should be easily merged on a day-by-day basis with remote sensing data as provided by the FluxnetEO data product (Walther et al. 2022).

> Downsampled daily data is an aggregation of the half-hourly data and not, as would be the case when downloading daily data from an ecosystem flux processing chain, a completely separate product. Some discrepancies therefore exist between the downsampled data and the equivalent daily ecosystem flux product.
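
As an illustration, such a daily aggregation might look like the sketch below (hypothetical file name; `TIMESTAMP_START` follows the FLUXNET half-hourly naming convention; this is not the package's exact downsampling code):

```r
library(dplyr)
library(lubridate)
library(readr)

# Hypothetical half-hourly FLUXNET-formatted CSV file
hh <- read_csv("FLX_CH-Dav_FLUXDATAKIT_FULLSET_HH_1997_2020.csv")

# Aggregate all numeric columns to daily means
daily <- hh |>
  mutate(date = as_date(ymd_hm(as.character(TIMESTAMP_START)))) |>
  group_by(date) |>
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
```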

### p-model drivers (structured R data)
### rsofun drivers (structured R data)

A final data product derived from the initial gap-filled LSM data is the driver data for the [`rsofun`](https://github.com/geco-bern/rsofun) package. In the current setup, *in-situ* measured model forcing data is combined with GPP and LE values (including their quality-control information) as target data for model calibration.
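
A brief sketch of inspecting these drivers, assuming they are shipped as an RDS file holding a nested tibble (file name and structure are assumptions for illustration):

```r
# Hypothetical file name for the rsofun driver data
drivers <- readRDS("rsofun_driver_data_v3.rds")

str(drivers, max.level = 1)  # one row per site; list-columns hold nested data
drivers$forcing[[1]]         # forcing (and target) time series of site 1
```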

### Additional data cleaning

A final data product derived from the initial gap-filled LSM data are p-model driver data for the [`rsofun`](https://github.com/geco-bern/rsofun) package. In the current setup *in-situ* environmental forcing will be combined with GPP values as target data for model calibration.
Information about the longest sequence of full years (365 days) of good-quality gap-filled daily GPP, LE, and LE_CORR data for each site is provided by the package data `fdk_site_fullyearsequence`, created by `analysis/03_screen_rsofun_data.R`. It records the start and end dates and the full years for which these sequences are available.
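
For example, the package data can be used to subset sites with a minimum number of full years of good-quality GPP (column names follow the renaming in `R/fdk_get_sequence.R`):

```r
library(dplyr)
library(FluxDataKit)

fdk_site_fullyearsequence |>
  filter(!drop_gpp, nyears_gpp >= 3) |>
  select(sitename, year_start_gpp, year_end_gpp, nyears_gpp)
```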

### Ancillary remote sensing data

For machine learning or other modelling purposes we provide ancillary MODIS based remote sensing data as described in the FluxnetEO dataset. We refer to the original publication and our [FluxnetEO](https://bg.copernicus.org/articles/19/2805/2022/) package for easy reading and processing of the data.
For machine learning or other modelling purposes, we provide ancillary MODIS based remote sensing data as described in the FluxnetEO dataset. We refer to the original publication and our [FluxnetEO](https://bg.copernicus.org/articles/19/2805/2022/) package for easy reading and processing of the data.

## Data and code availability

2 changes: 1 addition & 1 deletion analysis/03_screen_rsofun_data.R
@@ -2,7 +2,7 @@
library(tidyverse)
library(FluxDataKit)

path <- "~/data/FluxDataKit/v3" # "/data/scratch/beta-v4"
path <- "~/data/FluxDataKit/v3"

sites <- FluxDataKit::fdk_site_info |>
filter(!(sitename %in% c("MX-Tes", "US-KS3")))
48 changes: 24 additions & 24 deletions analysis/04_create_zenodo_upload.R
@@ -11,33 +11,33 @@
# the Zenodo repository:
# https://zenodo.org/record/7258291

input_path <- "/data/scratch/beta-v4/"
tmp_path <- "/data/scratch/upload"
input_path <- "~/data/FluxDataKit/v3/"
tmp_path <- "~/data/FluxDataKit/v3/zenodo_upload/"

#---- purge old data -----

# remove temporary path
system(sprintf("rm -rf %s", tmp_path))

# recreate temporary path
dir.create(tmp_path)

#---- copy new data over ----
system(
sprintf(
"cp -R %s/lsm %s/lsm",
input_path,
tmp_path
)
)

system(
sprintf(
"cp -R %s/fluxnet %s/fluxnet",
input_path,
tmp_path
)
)
# # remove temporary path
# system(sprintf("rm -rf %s", tmp_path))
#
# # recreate temporary path
# dir.create(tmp_path)
#
# #---- copy new data over ----
# system(
# sprintf(
# "cp -R %s/lsm %s/lsm",
# input_path,
# tmp_path
# )
# )
#
# system(
# sprintf(
# "cp -R %s/fluxnet %s/fluxnet",
# input_path,
# tmp_path
# )
# )

#---- rename all files in place ----

8 changes: 4 additions & 4 deletions data-raw/README.md
@@ -21,10 +21,10 @@ for them to function.

Data was sourced from different locations:

- ICOS data was provided through the ICOS carbon portal, this is a pre-release currently *not publicly available*
- FLUXNET2015 data can be retrieved from the [FLUXNET data portal](https://fluxnet.org/data/fluxnet2015-dataset/)
- OneFlux data can be retrieved from the [Ameriflux data portal](https://ameriflux.lbl.gov/data/download-data/)
- PLUMBER data can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- PLUMBER-2: https://dx.doi.org/10.25914/5fdb0902607e1. Can be downloaded using [an included script](https://github.com/geco-bern/FluxDataKit/blob/main/data-raw/00_download_plumber_data.R)
- The latest Ameriflux release, downloaded on 14 Oct 2023 from https://ameriflux.lbl.gov/.
- ICOS Drought2018 release from https://doi.org/10.18160/YVR0-4898.
- ICOS WarmWinter2020 release from https://doi.org/10.18160/2G60-ZHAK.
- MODIS LAI/FPAR data is downloaded by an included script

## Data structure
Expand Down
Binary file modified data/fdk_site_fullyearsequence.rda