Merge pull request #30 from inbo/review_metadata_fysicochemisch

Review metadata fysicochemisch
inbo · Aug 30, 2024 · 4a7c643 · 4a7c643
2 parents d0a2989 + d7fa346
commit 4a7c643
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 90 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,7 @@
 *.shx
 *.xls*
 *_files
+*_cache
 docs
 libs
 output

diff --git a/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd b/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd
@@ -12,18 +12,29 @@ output:
 
 ```{r setup, include=FALSE}
 library(knitr)
-opts_knit$set(root.dir = here::here())
-opts_chunk$set(echo = TRUE)
+opts_knit$set(root.dir = here::here("source/rmarkdown"))
+opts_chunk$set(
+  echo = TRUE,
+  dev = "png",
+  dpi = 300,
+  fig.width = 7, #inches
+  fig.height = 5)
 library(car)
 library(corrplot)
+library(ggplot2)
 library(dplyr)
 library(moments)
-library(magick)
 conflicted::conflicts_prefer(dplyr::select, dplyr::filter)
+```
 
+```{r cache=TRUE, include=FALSE}
+1+1
+# een trukje om te zorgen dat de folder met figuren, die aangemaakt wordt als je
+# knit, bewaard blijft in /metadata_bodemstalen_fysicochemisch_files/figure-html
 ```
 
 
+
 # Inlezen data
 
 ```{r inlezen}
@@ -38,49 +49,118 @@ file_name <- "MBAG_stratfile_v2_cleaned_12.csv"
 file_path <- file.path(full_directory_path, file_name)
 
 df <- read.csv(file_path, stringsAsFactors = FALSE)
-
 ```
 
-Select numeric columns and remove columns with zero variance
+# Verkenning
+
+## Samenvattende statistieken en statistische verdeling
 
 ```{r}
+glimpse(df)
+
+df <- df %>%
+  mutate(
+    Bermen = factor(
+      Bermen,
+      levels = c(0, 1),
+      labels = c("Geen berm", "Berm")),
+    Cmon_GRTS_Rank = factor(Cmon_GRTS_Rank),
+    CmonLU19 = factor(CmonLU19),
+    MBAG_eDNA_staal = factor(MBAG_eDNA_staal),
+    MBAG_Nematoden_staal = factor(MBAG_Nematoden_staal))
+```
 
+```{r}
 numeric_cols <- sapply(df, is.numeric)
 df_numeric <- df[, numeric_cols]
 
 df_numeric_clean <- df_numeric[, apply(df_numeric, 2, var, na.rm = TRUE) != 0]
 
+# de Lambert coordinaten laten we ook buiten beschouwing
+df_numeric_clean <- df_numeric_clean %>%
+  select(-c_LB72X, -c_LB72Y)
 ```
 
-Pairwise correlations
 
-```{r pairwise-correlations}
-cor_matrix <- cor(df_numeric_clean, use = "pairwise.complete.obs")
-
-png("correlation_plot.png", width = 1200, height = 1000)
-corrplot(cor_matrix, method = "color", type = "upper", order = "hclust", 
-         tl.col = "black", tl.srt = 45, tl.cex = 0.7)
-dev.off()
+```{r summary-stats}
+summary_stats <- df_numeric_clean %>%
+  tidyr::pivot_longer(cols = everything()) %>%
+  summarise(
+    mean = mean(value, na.rm = TRUE),
+    sd = sd(value, na.rm = TRUE),
+    min = min(value, na.rm = TRUE),
+    max = max(value, na.rm = TRUE),
+    skewness = moments::skewness(value, na.rm = TRUE),
+    kurtosis = moments::kurtosis(value, na.rm = TRUE),
+    .by = c(name)
+  )
+kable(summary_stats, digits = 2)
 ```
 
+Histogrammen en qq-plots kunnen helpen om in te schatten hoe de data verdeeld zijn en of een transformatie van de data nuttig kan zijn.
+Normaliteit is echter geen vereiste om verklarende variabelen te gebruiken in een regressiemodel.
+Enkel de residuen van een (veralgemeend lineair) regressiemodel moeten normaal verdeeld zijn.
 
-```{r dispay-corrplot}
-# Display correlation plot
-plot(1:10, type = "n", xlab = "", ylab = "", main = "Correlation Plot")
-img <- png::readPNG("correlation_plot.png")
-grid::grid.raster(img)
+
+```{r histogrammen}
+df_numeric_clean %>%
+  tidyr::pivot_longer(cols = everything()) %>%
+  ggplot() +
+  geom_histogram(
+    aes(x = value)
+  ) +
+  facet_wrap(~ name, scales = "free")
 ```
 
-Multicollinearity test
+```{r qqplots}
+df_numeric_clean %>%
+  tidyr::pivot_longer(cols = everything()) %>%
+  ggplot(aes(sample = value)) +
+  stat_qq_line() +
+  stat_qq() +
+  facet_wrap(~ name, scales = "free")
+```
 
-```{r multicollinearity}
-constant_response <- rep(1, nrow(df_numeric_clean))
-vif_model <- lm(constant_response ~ ., data = df_numeric_clean)
-vif_results <- vif(vif_model)
+De textuurvariabelen zijn een voorbeeld van compositionele data (net zoals eDNA reads) en kunnen we beter voorstellen in een textuurdriehoek:
+
+```{r ternaryplot}
+ggtern::ggtern(
+  data = df,
+  ggtern::aes(Textuur_Zandfractie, Textuur_kleifractie, Textuur_leemfractie)) + 
+   ggtern::geom_mask() +
+   geom_point(
+     ggtern::aes(
+       colour = Landgebruik_MBAG,
+       shape = Landgebruik_MBAG)) + 
+   ggtern::theme_ggtern() +
+   ggtern::theme_showarrows() +
+   ggtern::theme_clockwise()
 ```
 
-```{r display-vif-results}
-print(vif_results)
+## Multicollineariteit
+
+We checken multicollineariteit tussen covariaten.
+Significante paarsgewijze correlaties met absolute waarden groter dan 0.7 kunnen op problemen wijzen.
+Variance inflation factoren groter dan 5 zijn mogelijk problematisch en groter dan 10 zijn zeker problematisch.
+
+
+Paarsgewijze correlaties (Spearman's rank correlaties):
+
+```{r pairwise-correlations}
+cor_matrix <- cor(
+  df_numeric_clean,
+  use = "pairwise.complete.obs",
+  method = "spearman")
+
+corrplot(
+  cor_matrix,
+  method = "color",
+  type = "upper",
+  order = "hclust",
+  insig = "blank",
+  tl.col = "black",
+  tl.srt = 45,
+  tl.cex = 0.7)
 ```
 
 ```{r Display-highly-correlated-pairs}
@@ -92,75 +172,18 @@ high_cor_pairs <- data.frame(
 )
 high_cor_pairs <- high_cor_pairs[!duplicated(t(apply(high_cor_pairs[,1:2], 1, sort))),]
 high_cor_pairs <- high_cor_pairs[order(-abs(high_cor_pairs$Correlation)),]
-print(high_cor_pairs)
-```
-
-
-```{r summary-stats}
-summary_stats <- df_numeric_clean %>%
-  summarise(across(everything(), list(
-    mean = ~mean(., na.rm = TRUE),
-    sd = ~sd(., na.rm = TRUE),
-    min = ~min(., na.rm = TRUE),
-    max = ~max(., na.rm = TRUE)
-  )))
-print(summary_stats)
-
-```
-
-
-Calculate skewness and kurtosis
-
-```{r}
-skewness_values <- apply(df_numeric, 2, function(x) ifelse(all(is.na(x)), NA, moments::skewness(x, na.rm = TRUE)))
-kurtosis_values <- apply(df_numeric, 2, function(x) ifelse(all(is.na(x)), NA, moments::kurtosis(x, na.rm = TRUE)))
-
-skew_kurt <- data.frame(
-  variable = names(df_numeric),
-  skewness = skewness_values,
-  kurtosis = kurtosis_values
-)
-```
-
-
-Print skewness and kurtosis
-
-```{r}
-print(skew_kurt)
-
+kable(high_cor_pairs, digits = 2)
 ```
 
+Variance inflation factors:
 
-Create histograms and Q-Q plots for each numeric variable
-
-```{r}
-for (col in names(df_numeric)) {
-  png(paste0(col, "_diagnostic_plots.png"), width = 800, height = 400)
-  par(mfrow = c(1, 2))
-  
-  # Histogram
-  hist(df_numeric[[col]], main = paste("Histogram of", col), xlab = col)
-  
-  # Q-Q plot
-  qqnorm(df_numeric[[col]], main = paste("Q-Q Plot of", col))
-  qqline(df_numeric[[col]], col = "red")
-  
-  dev.off()
-}
-
+```{r multicollinearity}
+constant_response <- rep(1, nrow(df_numeric_clean))
+vif_model <- lm(constant_response ~ ., data = df_numeric_clean)
+vif_results <- vif(vif_model)
 ```
 
-
-Print the generated plot files
-
-```{r}
-# List the plot files
-plot_files <- list.files(pattern = "*_diagnostic_plots.png")
-
-# Loop through each plot file and display it
-for (plot_file in plot_files) {
-  img <- image_read(plot_file)  # Read the image file
-  print(img)                    # Display the image in the plot window
-}
+```{r display-vif-results}
+kable(vif_results, digits = 2)
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,6 +16,7 @@ @@
     *.shx
     *.xls*
     *_files
+    *_cache
     docs
     libs
     output
@@ Expand Down @@