mct_sj02_simsuite.Rmd

---
title: "Rooting depth by site (Schenk & Jackson, 2002)"
author: "Beni Stocker"
date: "4/30/2019"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float: true
---
  
```{r setup, include=FALSE}
library(dplyr)
library(rbeni)
library(tidyr)
library(purrr)
library(ncdf4)
library(lubridate)
library(extRemes)
library(R.utils)
library(readr)
library(ggplot2)
library(ingestr)
library(ggridges)

source("R/mct2.R")
source("R/get_plantwhc_mct_bysite.R")
source("R/get_plantwhc_mct_simsuite.R")
source("R/calc_soilparams.R")
source("R/calc_zroot.R")
source("R/extract_return_level.R")
source("R/convert_et.R")
source("R/ingest_wtd_fan.R")
source("R/test_et_tseries.R")
source("R/simulate_snow2.R")
source("R/add_biome_ncrs.R")
#source("R/extract_pointdata_allsites.R")

## Local data directories (machine-specific).
## NOTE(review): `dir_climate` is an absolute path into one user's home directory whereas
## `dir_fapar` is given relative to `~`; neither is referenced in the visible part of this
## document - confirm they are still needed.
dir_climate <- "/Users/benjaminstocker/sofun/output_nc_global_sofun"
dir_fapar <- "~/sofun/input_sj02zroot_sofun/sitedata/fapar/"
```


## Load data

Load ALEXI-ET and WATCH-WFDEI precipitation data for each site. This is prepared by `rscript_get_data_sj02sites.R`. This loads an object called `df_alexi`.
```{r}
## restores `df_alexi` into the workspace; the chunks below access its columns `idx`, `lon`, `lat`
## and the list column `df` holding the per-site time series
load("data/df_alexi_sj02.Rdata")
```

## Clean and complement data

### Remove sites with missing ET

Some ET data seems to be missing. Retain only sites with more than 3000 ET dates.
```{r}
## Number of entries in column `et` of a site-level data frame that are not missing.
n_avl_et <- function(df){
  has_et <- !is.na(df$et)
  length(which(has_et))
}

## count non-missing ET values per site
df_alexi <- df_alexi %>% 
  mutate(avl_et = purrr::map_int(df, n_avl_et))

## where are they? They are along coasts. Ok, valid to remove them.
## The map shows exactly the sites dropped below, i.e. the complement of `avl_et > 3000`
## (sites with exactly 3000 values are removed as well and are therefore shown here too).
plot_map_simpl() +
  geom_point(
    data = dplyr::filter(df_alexi, avl_et <= 3000),
    aes(lon, lat),
    color = "red")

## retain only sites with more than 3000 available ET values
df_alexi <- df_alexi %>% 
  dplyr::filter(avl_et > 3000)
```

More sites have missing data for column `et_mm` but have data in column `et`. This is because elevation data was missing when downloading the data and conversion to mm failed.
```{r}
## Number of entries in column `et_mm` of a site-level data frame that are not missing.
n_avl_et_mm <- function(df){
  has_et_mm <- !is.na(df$et_mm)
  length(which(has_et_mm))
}

## count available converted ET values (column `et_mm`) per site
df_alexi <- mutate(df_alexi, avl_et_mm = purrr::map_int(df, n_avl_et_mm))

## list the sites that have no converted ET at all
df_alexi %>% 
  dplyr::filter(avl_et_mm == 0) %>% 
  dplyr::select(lon, lat, idx, avl_et, avl_et_mm)
```

### Re-calculate ET in mm

Read elevation data from ETOPO1 and complement missing ET data, converted to mm.

#### Get ETOPO elevation data
Ingest elevation data for sites from ETOPO1 where it's missing in the original data.
```{r}
## site table (sitename, lon, lat) is piped into ingest() as its first argument;
## the returned nested `data` column is expanded and `elv` renamed to keep it apart from the original elevation
df_etopo <- df_alexi %>% 
  dplyr::select(sitename = idx, lon, lat) %>% 
  ingest(source = "etopo1", dir = "~/data/etopo/") %>% 
  tidyr::unnest(data) %>% 
  rename(idx = sitename, elv_etopo = elv)
```

Add ETOPO1 elevation to df_alexi and compare with data for which elevation is given.
```{r}
## re-nest the time series by site (lon, lat, elv, idx), then attach the ETOPO1 elevation by site id
df_alexi <- df_alexi %>% 
  tidyr::unnest(df) %>% 
  dplyr::group_by(lon, lat, elv, idx) %>% 
  tidyr::nest() %>% 
  dplyr::rename(df = data) %>% 
  left_join(df_etopo, by = "idx")

## compare given elevation against ETOPO1 elevation
analyse_modobs2(df_alexi, "elv", "elv_etopo")
```

Looks ok. Let's fill gaps with this, wrap it inside the data column and re-calculate ET conversion.

#### Re-calculate conversion

```{r}
## Build `df_alexi_repaired`: fill elevation gaps, move elevation into the nested time series,
## and re-run the ET unit conversion for every site.
df_alexi_repaired <- df_alexi %>%
  ungroup() %>% 
  ## use the ETOPO1 elevation only where the original elevation is missing
  dplyr::mutate(elv = ifelse(is.na(elv), elv_etopo, elv)) %>% 
  dplyr::select(-elv_etopo) %>% 

  ## add elevation to the nested dataframes (repeating same value for each time step):
  ## `elv` is not a grouping variable here, so it ends up inside the nested `df`
  tidyr::unnest(df) %>% 
  dplyr::group_by(lon, lat, idx) %>% 
  tidyr::nest() %>% 
  dplyr::rename(df = data) %>% 
  
  ## convert units: get ET in mm d-1
  ## total ET
  ## (convert_et() receives the columns `et`, `temp` and `elv` of each nested data frame)
  dplyr::mutate(et_mm = purrr::map(df, ~convert_et(.$et, .$temp, .$elv))) %>% 
  ## wrap the converted vector into a one-column tibble so it can be column-bound below
  dplyr::mutate(et_mm = purrr::map(et_mm, ~tibble(et_mm = .))) %>% 
  ## keep the previously stored conversion as `et_mm_old` to avoid a name clash
  dplyr::mutate(df    = purrr::map(df, ~dplyr::rename(., et_mm_old = et_mm))) %>% 
  dplyr::mutate(df    = purrr::map2(df, et_mm, ~bind_cols(.x, .y))) %>% 
  ## the temporary outer list column is no longer needed
  dplyr::select(-et_mm) %>% 
  dplyr::mutate(df    = purrr::map(df, ~drop_na(., date))) # Problem: The first row in `df_alexi$df` is always NA. This is a bug. Drop the row.

## store the repaired data set on disk
save(df_alexi_repaired, file = "data/df_alexi_repaired_sj02.Rdata")
```

Check if all is correct for a site where elevation data was given already before and `et_mm` calculated and now re-calculated - should yield identical results.
```{r}
## one example
df_alexi_repaired %>% 
  dplyr::filter(idx == "BF03a") %>% 
  pull(df)

## time series of one site before (df1) and after (df2) re-calculating the conversion
df1 <- unnest(dplyr::filter(df_alexi, idx == "AC01a"), df)
df2 <- unnest(dplyr::filter(df_alexi_repaired, idx == "AC01a"), df)

## ok! original vs. repaired `et_mm`, matched by site and date
left_join(
  df1 %>% 
    ungroup() %>% 
    dplyr::select(idx, date, et_mm_orig = et_mm),
  df2 %>% 
    ungroup() %>% 
    dplyr::select(idx, date, et_mm_repaired = et_mm),
  by = c("idx", "date")
) %>% 
  ggplot(aes(x = et_mm_orig, y = et_mm_repaired)) +
  geom_point()
```

Count missing data again.
```{r}
## recompute both availability counters on the repaired data
df_alexi_repaired <- df_alexi_repaired %>% 
  mutate(
    avl_et_mm = purrr::map_int(df, n_avl_et_mm),
    avl_et    = purrr::map_int(df, n_avl_et)
  )

## sites that still have no converted ET (expected: none)
df_alexi_repaired %>% 
  dplyr::filter(avl_et_mm == 0) %>% 
  dplyr::select(lon, lat, idx, avl_et, avl_et_mm)
```
Ok. Done.

Plot an example ET time series.
```{r}
## one test_et_tseries() result per site, stored in list column `gg`
df_gg_et <- df_alexi_repaired %>% 
  ungroup() %>% 
  mutate(gg = purrr::map(df, test_et_tseries)) %>% 
  dplyr::select(idx, lon, lat, gg)

## show the result for a single site
gg <- pull(dplyr::filter(df_gg_et, idx == "TD10c"), gg)
gg
```


Some sites have exact same locations. Number of rows could be halved if only distinct lon and lat were used:
```{r}
## number of distinct (idx, lon, lat) combinations
df_alexi_repaired %>% 
  ungroup() %>% 
  distinct(idx, lon, lat) %>% 
  dim()

## number of distinct locations (lon, lat) only
df_alexi_repaired %>% 
  ungroup() %>% 
  distinct(lon, lat) %>% 
  dim()
```

Do some otherwise identical sites differ by elevation? Yes. Some do. -> Might be necessary to use higher resolution precipitation data...
```{r}
## Number of distinct (idx, lon, lat, elv) combinations. `elv` lives inside the nested `df`,
## hence the time series is expanded first and collapsed again by grouping and nesting.
df_alexi_repaired %>% 
  dplyr::select(lon, lat, idx, df) %>% 
  ungroup() %>% 
  unnest(df) %>% 
  dplyr::select(idx, lon, lat, elv) %>% 
  dplyr::group_by(idx, lon, lat, elv) %>% 
  nest() %>% 
  dplyr::select(idx, lon, lat, elv) %>% 
  distinct() %>% 
  dim()

## Same, ignoring the site id: distinct (lon, lat, elv) combinations, kept as an object.
## NOTE(review): the object name spells "evl" (not "elv"); kept as is since it may be used elsewhere.
df_unique_lon_lat_evl <- df_alexi_repaired %>% 
  dplyr::select(lon, lat, idx, df) %>% 
  ungroup() %>% 
  unnest(df) %>% 
  dplyr::select(lon, lat, elv) %>% 
  dplyr::group_by(lon, lat, elv) %>% 
  nest() %>% 
  dplyr::select(lon, lat, elv) %>% 
  distinct()
df_unique_lon_lat_evl %>% 
  dim()
```

Identify duplicated rows (that have identical lon and lat; elevation is not compared here).
```{r}
## Add a logical column `dupl` that is TRUE for every row whose (lon, lat) pair already occurred
## in an earlier row. Only `lon` and `lat` enter duplicated(); elevation is not part of the comparison.
df_alexi_repaired <- df_alexi_repaired %>% 
  ungroup() %>% 
  dplyr::select(lon, lat) %>% 
  mutate(dupl = duplicated(.)) %>% 
  dplyr::select(dupl) %>% 
  ## column-bind the flag back onto the full data frame (row order is unchanged above)
  bind_cols(
    df_alexi_repaired,
    .
  )
```

Look at some duplicated rows (that have identical lon, lat, and elv): Data should be identical. Yes, for this one it is (didn't check other duplicates).
```{r}
## nested time series of two sites that are presumed duplicates of each other
df1 <- pull(dplyr::filter(df_alexi_repaired, idx == "AC01b"), df)
df2 <- pull(dplyr::filter(df_alexi_repaired, idx == "AC01c"), df)

## TRUE if both are equal, otherwise a description of the differences
all.equal(df1, df2)
```

## Simulate snow water storage and melt

Test it for one site in the high north.
```{r}
## run the snow model for one site, after dropping the first row of its time series
## NOTE(review): rows with missing `date` were already removed when building `df_alexi_repaired`;
## confirm that dropping one more row here is intended.
df_test <- df_alexi_repaired %>% 
  arrange(desc(lat)) %>% 
  dplyr::filter(idx == "AC01a") %>%
  mutate(df = purrr::map(df, function(d) simulate_snow(slice(d, -1))))

## location of the test site
plot_map_simpl() +
  geom_point(aes(lon, lat), data = df_test, color = "red")
  
## water balance test: total input (prec + snow) vs. liquid water to the soil,
## corrected for the difference between last and first snow pool value
df_test$df[[1]] %>% 
  summarise(prec_snow = sum(prec + snow), lts = sum(liquid_to_soil)) %>% 
  mutate(lts = lts + tail(df_test$df[[1]]$snow_pool, 1) - head(df_test$df[[1]]$snow_pool, 1))

## time series of snow pool, liquid water to soil and precipitation
# (optionally restrict to one year: dplyr::filter(lubridate::year(date) == 2005))
ggplot(df_test$df[[1]]) + 
  geom_line(aes(date, snow_pool), col = "royalblue") +
  geom_line(aes(date, liquid_to_soil), col = "springgreen4") +
  geom_line(aes(date, prec), col = "tomato")
```

Apply snow model at all sites.
```{r}
## per site: drop the first row of the time series, then run the snow model on the remainder
df_alexi_snow <- df_alexi_repaired %>%
  mutate(df = purrr::map(df, function(d) simulate_snow(slice(d, -1))))

## keep the result on disk
save(df_alexi_snow, file = "data/df_alexi_snow_sj02.RData")
```


## Apply the MCT function

Calculate the daily water balance and apply the MCT function to get CWD events in different configurations.
```{r}
source("R/get_bal.R")

filn <- "data/df_mct_sj02_alexi_reOLD.RData"

## TRUE if the column named `varnam_et` of `df` holds at least one non-missing value.
## Defined outside the cache branch: it is also called by a later chunk and must therefore
## exist when the results are merely loaded from file.
avl_et <- function(df, varnam_et){
  any(!is.na(df[varnam_et]))
}

if (!file.exists(filn)){
  
  ## no cached result: compute the water balance and the MCT output for every site
  df_mct_sj02_alexi <- df_alexi_snow %>%
    ## drop the count columns; `avl_et` is re-created as a logical flag below
    dplyr::select(-avl_et_mm, -avl_et) %>% 
    dplyr::mutate( avl_et = purrr::map_lgl(df, ~avl_et(., varnam_et = "et_mm"))) %>%
    dplyr::filter( avl_et ) %>% 
    ## daily balance `bal` from liquid water reaching the soil and ET in mm
    dplyr::mutate( df = purrr::map(df, ~get_bal(., varnam_bal = "bal", varnam_prec = "liquid_to_soil", varnam_et = "et_mm"))) %>% 
    dplyr::mutate(
      out_mct_00_80 = purrr::map(
        df,
        ~get_plantwhc_mct_bysite(
          .,
          varname_wbal = "bal",
          thresh_terminate = 0.0,
          thresh_drop = 0.8,
          fittype = "Gumbel"))
      )
    
  save(df_mct_sj02_alexi, file = filn)
  
} else {
  ## cached result available: restore `df_mct_sj02_alexi`
  load(filn)
}
```


Test with slower snow melt just for alpine sites.
```{r}
## `df_whc` is restored from file. `df_modobs` (including `zroot_wtd`) is only created further
## below in this document, so fall back to its saved copy when it is not yet in the workspace.
load("data/df_whc_sj02_reOLD.RData")
if (!exists("df_modobs")){
  load("data/df_modobs_sj02_reOLD.RData")
}
df_alpine <- df_modobs %>%
  dplyr::filter(Vegetation == "alpine herbaceous") %>%
  dplyr::select(sitename, zroot_wtd, D95_extrapolated)

sites_alpine <- df_alpine %>% pull(sitename)

df_test_alpine <- df_alexi_repaired %>%

  ## filter only alpine sites
  dplyr::select(-avl_et_mm, -avl_et) %>%
  dplyr::filter(idx %in% sites_alpine) %>%
  dplyr::filter(idx == "AC01a") %>%
  # mutate(df = purrr::map(df, ~slice(., -1))) %>%

  ## simulate snow
  mutate(df = purrr::map(df, ~simulate_snow(.))) %>%

  ## get balance
  ## (availability check written inline so that this chunk does not depend on a helper
  ## that is only defined when the cached MCT results are absent)
  dplyr::mutate( avl_et = purrr::map_lgl(df, ~any(!is.na(.["et_mm"])))) %>%
  dplyr::filter( avl_et ) %>%
  dplyr::mutate( df = purrr::map(df, ~get_bal(., varnam_bal = "bal", varnam_prec = "liquid_to_soil", varnam_et = "et_mm"))) %>%

  ## get cwdx
  dplyr::mutate(
    out_mct_00_80 = purrr::map(
      df,
      ~get_plantwhc_mct_bysite(
        .,
        varname_wbal = "bal",
        thresh_terminate = 0.0,
        thresh_drop = 0.8,
        fittype = "Gumbel"))
    ) %>%
  dplyr::mutate(cwd20 = purrr::map_dbl(out_mct_00_80, ~extract_return_level(., 20))) %>% 
  
  ## get zroot: attach subsoil and topsoil WHC (plus `roots`, `imperm`) by site name
  rename(sitename = idx) %>% 
  left_join(
    df_whc %>% 
      unnest(data_subsoil) %>%
      dplyr::select(sitename, whc_s = whc),
    by = "sitename"
  ) %>% 
  left_join(
    df_whc %>% 
      unnest(data_topsoil) %>%
      dplyr::select(sitename, whc_t = whc, roots, imperm),
    by = "sitename"
  ) %>% 
  ## calc_zroot() is applied row by row
  rowwise() %>% 
  dplyr::mutate(zroot = calc_zroot(cwd20, whc_t, whc_s, roots, imperm))

```


### Test output

<!-- Look at cumulative water deficit time series. Weird: at some sites ET is very small and doesn't change over time. -->
<!-- ```{r} -->
<!-- ## plot CWD time series for sites with zero SD in ET -->
<!-- source("R/test_cwd_tseries.R") -->
<!-- df_test_mct_gg <- df_test_alpine %>%  -->
<!--   mutate(mct = purrr::map(out_mct_00_80, "mct")) %>%  -->
<!--   mutate(gg_cwd_tseries = purrr::map(mct, ~try(test_cwd_tseries(.)))) -->

<!-- df_test_mct_gg$gg_cwd_tseries[[1]] -->
<!-- ``` -->


### Overview of WHC*

The following figure shows the distribution of WHC* values:
```{r}
# load("data/ddf_mct_simsuite.Rdata") xxx ??? don't know what this is for
## Histogram of the return level extracted with argument 20 from the MCT output that was
## computed with thresh_drop = 0.8. The subtitle is aligned with these settings
## (assumes the second argument of extract_return_level() is the return period in years - TODO confirm).
df_mct_sj02_alexi %>% 
  dplyr::select(idx, out_mct_00_80) %>% 
  dplyr::mutate(whc20 = purrr::map_dbl(out_mct_00_80, ~extract_return_level(., 20))) %>% 
  ggplot(aes(x = whc20, y = ..count..)) +
  geom_histogram(color = "black", alpha = 0.3, position="identity") +
  labs(title = "Plant rooting zone WHC*", subtitle = "20 y return period, 80% reduction of CWD", x = "WHC* (mm)")
```


## Extract soil texture information

Using the very nice ingestr package.
```{r}
## HWSD extraction is cached on disk: load it if present, otherwise ingest and store it
filn <- "data/df_hwsd_sj02.RData"
if (file.exists(filn)){
  load(filn)
} else {
  df_hwsd <- df_mct_sj02_alexi %>% 
    ungroup() %>% 
    dplyr::select(sitename = idx, lon, lat) %>% 
    ingest(
      source = "hwsd",
      settings = list(fil = "~/data/hwsd/HWSD_RASTER/hwsd.bil")
    )
  save(df_hwsd, file = filn)
}
```

Calculate FC, PWP, and WHC from texture data.
```{r}
## Split the soil data of each site (first row of `data` only) into a topsoil (T_*) and a
## subsoil (S_*) table with common column names, then derive soil parameters for both.
df_whc <- df_hwsd %>% 
  mutate(data = purrr::map(data, slice, 1)) %>% 
  mutate(
    data_topsoil = purrr::map(
      data,
      function(d) dplyr::select(
        d, fclay = T_CLAY, fgravel = T_GRAVEL, forg = T_OC, fsand = T_SAND, roots = ROOTS, imperm = IL)
      ),
    data_subsoil = purrr::map(
      data,
      function(d) dplyr::select(
        d, fclay = S_CLAY, fgravel = S_GRAVEL, forg = S_OC, fsand = S_SAND, roots = ROOTS, imperm = IL)
      )
    ) %>% 
  dplyr::select(-data) %>% 
  mutate(
    data_topsoil = purrr::map(data_topsoil, calc_soilparams, method = "balland"),
    data_subsoil = purrr::map(data_subsoil, calc_soilparams, method = "balland")
    )

save(df_whc, file = "data/df_whc_sj02_reOLD.RData")
```

Plot the distribution of values.
```{r}
## histogram of `whc` for the topsoil table
ggplot(unnest(df_whc, data_topsoil), aes(x = whc, y = ..count..)) +
  geom_histogram(color = "black", alpha = 0.3, position="identity") +
  labs(
    title = "Top soil WHC",
    subtitle = " based on HWSD soil texture data",
    x = "WHC (m3/m3)"
    )

## histogram of `whc` for the subsoil table
ggplot(unnest(df_whc, data_subsoil), aes(x = whc, y = ..count..)) +
  geom_histogram(color = "black", alpha = 0.3, position="identity") +
  labs(
    title = "Sub soil WHC",
    subtitle = " based on HWSD soil texture data",
    x = "WHC (m3/m3)"
    )
```

## Calculate rooting depth.

```{r}
## Rooting depth per site: return level (argument 20) from the MCT output, combined with
## subsoil/topsoil WHC and the `roots` and `imperm` columns passed on to calc_zroot().
df_zroot <- df_mct_sj02_alexi %>%
  ungroup() %>% 
  dplyr::transmute(
    sitename = idx,
    whcXX = purrr::map_dbl(out_mct_00_80, ~extract_return_level(., 20))
    ) %>% 
  ## subsoil WHC
  left_join(
    dplyr::select(unnest(df_whc, data_subsoil), sitename, whc_s = whc),
    by = "sitename"
  ) %>% 
  ## topsoil WHC plus root/impermeability information
  left_join(
    dplyr::select(unnest(df_whc, data_topsoil), sitename, whc_t = whc, roots, imperm),
    by = "sitename"
  ) %>% 
  ## evaluate calc_zroot() one site at a time
  rowwise() %>% 
  dplyr::mutate(zroot = calc_zroot(whcXX, whc_t, whc_s, roots, imperm))

save(df_zroot, file = "data/df_zroot_sj02_reOLD.RData")
```

The following figure shows the distribution of zroot* values:
```{r}
## Histogram of modelled rooting depth. `zroot` derives from the return level extracted with
## argument 20 and an MCT run with thresh_drop = 0.8; the subtitle is aligned with these
## settings (assumes 20 is the return period in years - TODO confirm).
df_zroot %>% 
  ggplot(aes(x = zroot, y = ..count..)) +
  geom_histogram(color = "black", alpha = 0.3, position="identity") +
  labs(title = "Plant rooting depth", subtitle = "20 y return period, 80% reduction of CWD", x = "zroot* (mm)")
```

## Comparison to observations

Combine data frames.
```{r}
## Observations: keep non-wetland, non-anthropogenic profiles flagged as Schenk & Jackson (2002),
## attach modelled `zroot` by site name and scale the depth columns by 1000
## (presumably m to mm, given the mm axis labels used below - TODO confirm).
df_modobs <- read_csv("~/data/rootingdepth/root_profiles_schenkjackson02/data/root_profiles_D50D95.csv") %>% 
  dplyr::filter(
    Wetland == "N",
    Anthropogenic == "N",
    Schenk_Jackson_2002 == "YES"
    ) %>% 
  dplyr::rename(sitename = ID) %>% 
  dplyr::left_join(dplyr::select(df_zroot, zroot, sitename), by = "sitename") %>% 
  dplyr::mutate(
    D50 = 1000 * D50,
    D95 = 1000 * D95,
    D50_extrapolated = 1000 * D50_extrapolated,
    D95_extrapolated = 1000 * D95_extrapolated
    )

save(df_modobs, file = "data/df_modobs_reOLD.RData")
```

### Distribution of values
```{r}
## long table with one row per site and source ("mod"/"obs"), drawn as overlaid histograms
df_modobs %>% 
  dplyr::select(sitename, Vegetation, obs = D95_extrapolated, mod = zroot) %>% 
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>% 
  ggplot(aes(x = zroot, y = ..count.., fill = source)) +
  geom_histogram(color = "black", alpha = 0.3, position="identity") +
  scale_fill_manual(name = "", values = c("black", "red")) +
  labs(
    title = "Distribution of rooting depth (mm), v3",
    x = "Rooting depth (mm)"
    )

## writes the plot displayed last
ggsave("fig/hist_zroot_modobs_reOLD.pdf", height = 4, width = 6)
```

### Comparison by vegetation type.
```{r}
## Boxplots of observed (D95_extrapolated) and modelled (zroot) rooting depth by vegetation type.
## The subtitle reflects the settings used to derive `zroot` in this document: return level
## extracted with argument 20 and thresh_drop = 0.8
## (assumes 20 is the return period in years - TODO confirm).
df_modobs %>% 
  dplyr::select(sitename, Vegetation, obs = D95_extrapolated, mod = zroot) %>% 
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>% 
  ggplot() +
  ## depth is plotted with negative sign so that deeper values point downwards
  geom_boxplot(aes(x = Vegetation, y = -zroot, fill = source)) +
  theme(axis.text.x = element_text(angle=90, hjust=1)) +
  labs(
    title = "Observed and modelled by vegetation type", 
    subtitle = "Obs.: 95% quantile  Mod.: 20-yr return period, 80% deficit reduction threshold", 
    y = "Rooting depth (mm)")

ggsave("fig/modobs_boxplot_zroot_reOLD.pdf", width = 9, height = 6)
```

### Comparison site by site.
```{r}
## Site-by-site comparison of modelled `zroot` against observed `D95_extrapolated`.
out <- df_modobs %>% 
  rbeni::analyse_modobs2(
    mod = "zroot", 
    obs = "D95_extrapolated"
    )
## Title aligned with the settings used to derive `zroot` in this document (return level
## extracted with argument 20, thresh_drop = 0.8; assumes 20 is the return period in years - TODO confirm).
out$gg +  
  labs(
    title = "Obs.: 95% quantile  Mod.: 20-yr return period, 80% deficit reduction threshold", 
    x = "Modelled rooting depth (mm)", 
    y = "Observed rooting depth (mm)"
    )
```

## Check if water table depth is a useful constraint

```{r}
## Attach water table depth (ingest_wtd_fan(), scaled by 1000) to the model-observation table
## and cap modelled rooting depth at that depth.
df_modobs <- df_alexi_repaired %>% 
  dplyr::select(idx, lon, lat) %>% 
  ingest_wtd_fan() %>% 
  rename(sitename = idx, wtd_fan13 = wtd) %>% 
  mutate(wtd_fan13 = wtd_fan13 * 1000) %>% 
  right_join(df_modobs, by = "sitename") %>% 
  mutate(
    ## missing water table depth is replaced by a large value so that it never limits zroot
    wtd_fan13 = ifelse(is.na(wtd_fan13), 99999, wtd_fan13),
    zroot_wtd = ifelse(zroot > wtd_fan13, wtd_fan13, zroot)
    )

save(df_modobs, file = "data/df_modobs_sj02_reOLD.RData")
```

Updated boxplot by vegetation type
```{r}
## may check here: https://community.rstudio.com/t/how-to-reorder-boxplot-by-only-one-level-of-a-variable/37775
## Boxplots of observed rooting depth and modelled rooting depth capped by water table depth
## (zroot_wtd). The subtitle reflects the settings used in this document: return level extracted
## with argument 20 and thresh_drop = 0.8 (assumes 20 is the return period in years - TODO confirm).
df_modobs %>% 
  dplyr::select(sitename, Vegetation, obs = D95_extrapolated, mod = zroot_wtd) %>% 
  # mutate(Vegetation = forcats::fct_reorder(Vegetation, obs, .fun = 'median')) %>%
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>%
  ggplot() +
  # geom_boxplot(aes(x = reorder(Vegetation, zroot), y = -zroot, fill = source)) +
  geom_boxplot(aes(x = Vegetation, y = -zroot, fill = source)) +
  theme(axis.text.x = element_text(angle=90, hjust=1)) +
  labs(
    title = "Observed and modelled by vegetation type", 
    subtitle = "Obs.: 95% quantile  Mod.: 20-yr return period, 80% deficit reduction threshold", 
    y = "Rooting depth (mm)")

ggsave("fig/modobs_boxplot_zroot_wtd_reOLD.pdf", width = 9, height = 6)
```

Updated mod. vs. obs.
```{r}
## Site-by-site comparison of water-table-capped modelled rooting depth against observations.
out <- df_modobs %>% 
  rbeni::analyse_modobs2(
    mod = "zroot_wtd", 
    obs = "D95_extrapolated"
    )
## Title aligned with the settings used in this document (return level extracted with
## argument 20, thresh_drop = 0.8; assumes 20 is the return period in years - TODO confirm).
out$gg +  
  labs(
    title = "Obs.: 95% quantile  Mod.: 20-yr return period, 80% deficit reduction threshold", 
    x = "Modelled rooting depth (mm)", 
    y = "Observed rooting depth (mm)"
    )
```

Ridges.
```{r}
## Ridge-line densities of observed (D95_extrapolated) and modelled (zroot_wtd) rooting depth,
## one ridge pair per vegetation type, with the individual values drawn as "|" marks.
df_modobs %>% 
  dplyr::select(sitename, Vegetation, obs = D95_extrapolated, mod = zroot_wtd) %>% 
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>% 

  ggplot(aes(x = zroot, y = Vegetation, color = source, point_color = source, fill = source)) +
  geom_density_ridges(
    jittered_points = TRUE, scale = .95, rel_min_height = .01,
    point_shape = "|", point_size = 1.5, size = 0.25,
    position = position_points_jitter(height = 0)
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0), name = "rooting depth (mm)") +
  ## the same two colours are used for fill (with alpha suffix), outline and point marks;
  ## only the fill scale contributes a legend
  scale_fill_manual(values = c("#D55E0050", "#0072B250"), labels = c("modelled", "observed")) +
  scale_color_manual(values = c("#D55E00", "#0072B2"), guide = "none") +
  scale_discrete_manual("point_color", values = c("#D55E00", "#0072B2"), guide = "none") +
  coord_cartesian(clip = "off") +
  ## legend keys: different alpha suffix on the fill colours, no outline and no point marks
  guides(fill = guide_legend(
    override.aes = list(
      fill = c("#D55E00A0", "#0072B2A0"),
      color = NA, point_color = NA)
    )
  ) +
  ggtitle("Rooting depth by biomes, SJ02 sites") +
  theme_ridges(center = TRUE)

ggsave("fig/modobs_ridges_zroot_biome_sj02_reOLD2.pdf", width = 15, height = 10)
```


## Focus on model-data mismatch

There seem to be many sites with an observed rooting depth < 2500 mm where modelled values are way too low.
```{r}
## sites where modelled rooting depth is less than half of the observed one
df_wrongsites <- dplyr::filter(
  mutate(df_modobs, relbias = zroot/D95_extrapolated),
  relbias < 0.5
)

## highlight them on the most recent model-vs-observation plot
## NOTE(review): `out` was last created with mod = "zroot_wtd" while the points added here use
## `zroot` - confirm that mixing the two is intended.
out$gg +
  geom_point(aes(x = zroot, y = D95_extrapolated), data = df_wrongsites, color = "red")
```

Where are they located?
```{r}
library(rbeni)

## map of the sites with strongly underestimated rooting depth
plot_map_simpl() +
  geom_point(aes(x = lon, y = lat), data = df_wrongsites, color = "red")
```

Look at cumulative water deficit time series. Weird: at some sites ET is very small and doesn't change over time.
```{r}
## test_cwd_tseries() is not among the files sourced in the setup chunk; load it here
source("R/test_cwd_tseries.R")

## one CWD time series plot (year 2010) per site with strongly underestimated rooting depth
df_test_gg <- df_mct_sj02_alexi %>% 
  dplyr::filter(idx %in% df_wrongsites$sitename) %>% 
  ## element "mct" of the MCT output list of each site
  mutate(mct = purrr::map(out_mct_00_80, "mct")) %>% 
  mutate(gg_cwd_tseries = purrr::map(mct, ~test_cwd_tseries(., filter_years = c(2010))))

## show the plot of one site
df_test_gg %>% 
  dplyr::filter(idx == "TD10c") %>% 
  pull(gg_cwd_tseries)
```

Identify weird sites (where ET is constant)
```{r}
## standard deviation and mean of `et_mm` per site, taken from element "df" of the "mct" output,
## for the sites with strongly underestimated rooting depth
df_test_sj02 <- df_mct_sj02_alexi %>% 
  dplyr::select(-avl_et) %>%
  dplyr::filter(idx %in% df_wrongsites$sitename) %>% 
  mutate(
    mct    = purrr::map(out_mct_00_80, "mct"),
    mct_df = purrr::map(mct, "df")
    ) %>% 
  tidyr::unnest(mct_df) %>% 
  group_by(idx, lon, lat) %>% 
  summarise(sd_et = sd(et_mm), mean_et = mean(et_mm))

## problematic sites with zero SD:
df_zeroSD <- dplyr::filter(df_test_sj02, sd_et == 0)

## their site ids
shitsites <- df_zeroSD$idx

## and their locations
plot_map_simpl() +
  geom_point(aes(lon, lat), data = df_zeroSD, col = "red")
```

## Alternative biome classification

### WWF Ecoregions

Try an alternative biome classification based on the WWF Ecoregions (implemented in ingestr). Ingest data into separate dataframe.
```{r}
## site table (sitename, lon, lat) is piped into ingest() as its first argument
df_wwf <- df_modobs %>% 
  dplyr::select(sitename, lon, lat) %>% 
  ingest(
    source = "wwf",
    dir = "~/data/biomes/wwf_ecoregions/official/",
    settings = list(layer = "wwf_terr_ecos")
  )

save(df_wwf, file = "./data/df_wwf_sj02.RData")
```

Combine dataframes.
```{r}
## keep the first row of each site's nested data, expand it, and add the biome/ecoregion
## columns to the model-observation table (all rows of `df_modobs` are retained)
df_modobs <- df_wwf %>% 
  mutate(data = purrr::map(data, slice, 1)) %>% 
  unnest(data) %>% 
  dplyr::select(
    sitename,
    BIOME,
    biome_wwf = BIOME_NAME,
    ECO_NAME,
    ECO_NUM
    ) %>% 
  right_join(df_modobs, by = "sitename")

save(df_modobs, file = "data/df_modobs_sj02.RData")
```

Plot.
```{r}
## Boxplots of observed rooting depth and modelled rooting depth capped by water table depth,
## grouped by WWF biome (`biome_wwf`). Title and subtitle reflect the grouping variable and the
## settings used in this document: return level extracted with argument 20 and
## thresh_drop = 0.8 (assumes 20 is the return period in years - TODO confirm).
df_modobs %>% 
  dplyr::select(sitename, biome_wwf, obs = D95_extrapolated, mod = zroot_wtd) %>% 
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>% 
  ggplot() +
  geom_boxplot(aes(x = biome_wwf, y = -zroot, fill = source)) +
  theme(axis.text.x = element_text(angle=90, hjust=1)) +
  labs(
    title = "Observed and modelled by WWF biome", 
    subtitle = "Obs.: 95% quantile  Mod.: 20-yr return period, 80% deficit reduction threshold", 
    y = "Rooting depth (mm)")

ggsave("fig/modobs_boxplot_zroot_biome_wwf_reOLD.pdf", width = 9, height = 10)
```

Ridges.
```{r}
## Ridge-line densities of observed (D95_extrapolated) and modelled (zroot_wtd) rooting depth,
## one ridge pair per WWF biome; rows without a biome are excluded.
df_modobs %>% 
  dplyr::select(sitename, biome_wwf, obs = D95_extrapolated, mod = zroot_wtd) %>% 
  tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>% 
  dplyr::filter(!is.na(biome_wwf)) %>% 

  ggplot(aes(x = zroot, y = biome_wwf, color = source, point_color = source, fill = source)) +
  geom_density_ridges(
    jittered_points = TRUE, scale = .95, rel_min_height = .01,
    point_shape = "|", point_size = 1.5, size = 0.25,
    position = position_points_jitter(height = 0)
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0), name = "rooting depth (mm)") +
  ## the same two colours are used for fill (with alpha suffix), outline and point marks;
  ## only the fill scale contributes a legend
  scale_fill_manual(values = c("#D55E0050", "#0072B250"), labels = c("modelled", "observed")) +
  scale_color_manual(values = c("#D55E00", "#0072B2"), guide = "none") +
  scale_discrete_manual("point_color", values = c("#D55E00", "#0072B2"), guide = "none") +
  coord_cartesian(clip = "off") +
  ## legend keys: different alpha suffix on the fill colours, no outline and no point marks
  guides(fill = guide_legend(
    override.aes = list(
      fill = c("#D55E00A0", "#0072B2A0"),
      color = NA, point_color = NA)
    )
  ) +
  ggtitle("Rooting depth by WWF biomes, SJ02 sites") +
  theme_ridges(center = TRUE)

ggsave("fig/modobs_ridges_zroot_biome_wwf_reOLD.pdf", width = 15, height = 10)
```


<!-- ### NCRS Biomes -->

<!-- ```{r} -->
<!-- df_ncrs <- add_biome_ncrs( -->
<!--   dplyr::select(df_modobs, sitename, lon, lat),  -->
<!--   path = "~/data/biomes/NCRS/global_biomes_geotiff/biomes.tif" -->
<!--   ) -->
<!-- ``` -->

<!-- Combine dataframes. -->
<!-- ```{r} -->
<!-- df_modobs <- df_ncrs %>%  -->
<!--   dplyr::select(sitename, biome_ncrs, biome_ncrs_name = biome_ncrs_chr) %>%  -->
<!--   right_join(df_modobs, by = "sitename") -->
<!-- ``` -->

<!-- Boxplot. -->
<!-- ```{r} -->
<!-- df_modobs %>%  -->
<!--   dplyr::select(sitename, biome_ncrs_name, obs = D95_extrapolated, mod = zroot_wtd) %>%  -->
<!--   tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>%  -->
<!--   ggplot() + -->
<!--   geom_boxplot(aes(x = biome_ncrs_name, y = -zroot, fill = source)) + -->
<!--   theme(axis.text.x = element_text(angle=90, hjust=1)) + -->
<!--   labs( -->
<!--     title = "Observed and modelled by vegetation type",  -->
<!--     subtitle = "Obs.: 95% quantile  Mod.: 5-yr return period, 5% deficit reduction threshold",  -->
<!--     y = "Rooting depth (mm)") -->

<!-- ggsave("fig/modobs_boxplot_zroot_biome_ncrs_reOLD.pdf", width = 9, height = 6) -->
<!-- ``` -->

<!-- Ridges. -->
<!-- ```{r} -->
<!-- library(ggridges) -->
<!-- df_modobs %>%  -->
<!--   dplyr::select(sitename, biome_ncrs_name, obs = D95_extrapolated, mod = zroot_wtd) %>%  -->
<!--   tidyr::pivot_longer(cols = c(mod, obs), names_to = "source", values_to = "zroot") %>%  -->
<!--   dplyr::filter(!is.na(biome_ncrs_name)) %>%  -->

<!--   ggplot(aes(x = zroot, y = biome_ncrs_name, color = source, point_color = source, fill = source)) + -->
<!--   geom_density_ridges( -->
<!--     jittered_points = TRUE, scale = .95, rel_min_height = .01, -->
<!--     point_shape = "|", point_size = 1.5, size = 0.25, -->
<!--     position = position_points_jitter(height = 0) -->
<!--   ) + -->
<!--   scale_y_discrete(expand = c(0, 0)) + -->
<!--   scale_x_continuous(expand = c(0, 0), name = "rooting depth (mm)") + -->
<!--   scale_fill_manual(values = c("#D55E0050", "#0072B250"), labels = c("modelled", "observed")) + -->
<!--   scale_color_manual(values = c("#D55E00", "#0072B2"), guide = "none") + -->
<!--   scale_discrete_manual("point_color", values = c("#D55E00", "#0072B2"), guide = "none") + -->
<!--   coord_cartesian(clip = "off") + -->
<!--   guides(fill = guide_legend( -->
<!--     override.aes = list( -->
<!--       fill = c("#D55E00A0", "#0072B2A0"), -->
<!--       color = NA, point_color = NA) -->
<!--     ) -->
<!--   ) + -->
<!--   ggtitle("Rooting depth by NCRS biomes, SJ02 sites") + -->
<!--   theme_ridges(center = TRUE) -->

<!-- ggsave("fig/modobs_ridges_zroot_biome_ncrs_reOLD.pdf", width = 9, height = 6) -->
<!-- ``` -->

Quantile regression
```{r}
## medians of observed and modelled rooting depth per vegetation type, compared against each other
out <- df_modobs %>% 
  group_by(Vegetation) %>% 
  summarise(
    obs = median(D95_extrapolated, na.rm = TRUE),
    mod = median(zroot_wtd, na.rm = TRUE)
    ) %>% 
  analyse_modobs2(mod = "mod", obs = "obs")
out$gg

# out <- df_modobs %>% 
#   group_by(biome_ncrs_name) %>% 
#   summarise(obs = median(D95_extrapolated, na.rm = TRUE), mod = median(zroot_wtd, na.rm = TRUE)) %>% 
#   analyse_modobs2("mod", "obs")
# out$gg

## same comparison with medians per WWF biome
out <- df_modobs %>% 
  group_by(biome_wwf) %>% 
  summarise(
    obs = median(D95_extrapolated, na.rm = TRUE),
    mod = median(zroot_wtd, na.rm = TRUE)
    ) %>% 
  analyse_modobs2(mod = "mod", obs = "obs")
out$gg
```

Save.
```{r}
## attach the return level (`whcXX`, stored as `cwd20`) and reduce to the columns to be saved
## NOTE(review): this overwrites `df_modobs` with the reduced table.
df_modobs <- df_modobs %>% 
  left_join(
    dplyr::select(df_zroot, sitename, cwd20 = whcXX),
    by = "sitename"
    ) %>% 
  dplyr::select(sitename, cwd20, zroot, zroot_wtd)

## `df_modobs` already holds exactly these four columns, so it is copied as is
df_modobs_reOLD <- df_modobs
save( df_modobs_reOLD, file = "data/df_modobs_reOLD.Rdata")
```