From ad8f721381cce72ce39ada359b03b891a9584a7c Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Thu, 29 Aug 2024 09:07:35 +0200 Subject: [PATCH 1/2] ignore cache folders --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c915506..658f9f5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ *.shx *.xls* *_files +*_cache docs libs output From d7fa34640c8d6023a173519bfe0b94c16113970c Mon Sep 17 00:00:00 2001 From: hansvancalster Date: Thu, 29 Aug 2024 09:08:57 +0200 Subject: [PATCH 2/2] rework: - restructure - select only relevant numeric vars - don't use png(), figs are saved via knitr - simplify code - add ternary plot --- .../metadata_bodemstalen_fysicochemisch.Rmd | 203 ++++++++++-------- 1 file changed, 113 insertions(+), 90 deletions(-) diff --git a/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd b/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd index 5789438..89ad2a0 100644 --- a/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd +++ b/source/rmarkdown/metadata_bodemstalen_fysicochemisch.Rmd @@ -12,18 +12,29 @@ output: ```{r setup, include=FALSE} library(knitr) -opts_knit$set(root.dir = here::here()) -opts_chunk$set(echo = TRUE) +opts_knit$set(root.dir = here::here("source/rmarkdown")) +opts_chunk$set( + echo = TRUE, + dev = "png", + dpi = 300, + fig.width = 7, #inches + fig.height = 5) library(car) library(corrplot) +library(ggplot2) library(dplyr) library(moments) -library(magick) conflicted::conflicts_prefer(dplyr::select, dplyr::filter) +``` +```{r cache=TRUE, include=FALSE} +1+1 +# een trukje om te zorgen dat de folder met figuren, die aangemaakt wordt als je +# knit, bewaard blijft in /metadata_bodemstalen_fysicochemisch_files/figure-html ``` + # Inlezen data ```{r inlezen} @@ -38,49 +49,118 @@ file_name <- "MBAG_stratfile_v2_cleaned_12.csv" file_path <- file.path(full_directory_path, file_name) df <- read.csv(file_path, stringsAsFactors = FALSE) - ``` -Select numeric columns and remove columns with zero variance +# Verkenning + +## Samenvattende statistieken en statistische verdeling ```{r} +glimpse(df) + +df <- df %>% + mutate( + Bermen = factor( + Bermen, + levels = c(0, 1), + labels = c("Geen berm", "Berm")), + Cmon_GRTS_Rank = factor(Cmon_GRTS_Rank), + CmonLU19 = factor(CmonLU19), + MBAG_eDNA_staal = factor(MBAG_eDNA_staal), + MBAG_Nematoden_staal = factor(MBAG_Nematoden_staal)) +``` +```{r} numeric_cols <- sapply(df, is.numeric) df_numeric <- df[, numeric_cols] df_numeric_clean <- df_numeric[, apply(df_numeric, 2, var, na.rm = TRUE) != 0] +# de Lambert coordinaten laten we ook buiten beschouwing +df_numeric_clean <- df_numeric_clean %>% + select(-c_LB72X, -c_LB72Y) ``` -Pairwise correlations -```{r pairwise-correlations} -cor_matrix <- cor(df_numeric_clean, use = "pairwise.complete.obs") - -png("correlation_plot.png", width = 1200, height = 1000) -corrplot(cor_matrix, method = "color", type = "upper", order = "hclust", - tl.col = "black", tl.srt = 45, tl.cex = 0.7) -dev.off() +```{r summary-stats} +summary_stats <- df_numeric_clean %>% + tidyr::pivot_longer(cols = everything()) %>% + summarise( + mean = mean(value, na.rm = TRUE), + sd = sd(value, na.rm = TRUE), + min = min(value, na.rm = TRUE), + max = max(value, na.rm = TRUE), + skewness = moments::skewness(value, na.rm = TRUE), + kurtosis = moments::kurtosis(value, na.rm = TRUE), + .by = c(name) + ) +kable(summary_stats, digits = 2) ``` +Histogrammen en qq-plots kunnen helpen om in te schatten hoe de data verdeeld zijn en of een transformatie van de data nuttig kan zijn. +Normaliteit is echter geen vereiste om verklarende variabelen te gebruiken in een regressiemodel. +Enkel de residuen van een (veralgemeend lineair) regressiemodel moeten normaal verdeeld zijn. -```{r dispay-corrplot} -# Display correlation plot -plot(1:10, type = "n", xlab = "", ylab = "", main = "Correlation Plot") -img <- png::readPNG("correlation_plot.png") -grid::grid.raster(img) + +```{r histogrammen} +df_numeric_clean %>% + tidyr::pivot_longer(cols = everything()) %>% + ggplot() + + geom_histogram( + aes(x = value) + ) + + facet_wrap(~ name, scales = "free") ``` -Multicollinearity test +```{r qqplots} +df_numeric_clean %>% + tidyr::pivot_longer(cols = everything()) %>% + ggplot(aes(sample = value)) + + stat_qq_line() + + stat_qq() + + facet_wrap(~ name, scales = "free") +``` -```{r multicollinearity} -constant_response <- rep(1, nrow(df_numeric_clean)) -vif_model <- lm(constant_response ~ ., data = df_numeric_clean) -vif_results <- vif(vif_model) +De textuurvariabelen zijn een voorbeeld van compositionele data (net zoals eDNA reads) en kunnen we beter voorstellen in een textuurdriehoek: + +```{r ternaryplot} +ggtern::ggtern( + data = df, + ggtern::aes(Textuur_Zandfractie, Textuur_kleifractie, Textuur_leemfractie)) + + ggtern::geom_mask() + + geom_point( + ggtern::aes( + colour = Landgebruik_MBAG, + shape = Landgebruik_MBAG)) + + ggtern::theme_ggtern() + + ggtern::theme_showarrows() + + ggtern::theme_clockwise() ``` -```{r display-vif-results} -print(vif_results) +## Multicollineariteit + +We checken multicollineariteit tussen covariaten. +Significante paarsgewijze correlaties met absolute waarden groter dan 0.7 kunnen op problemen wijzen. +Variance inflation factoren groter dan 5 zijn mogelijk problematisch en groter dan 10 zijn zeker problematisch. + + +Paarsgewijze correlaties (Spearman's rank correlaties): + +```{r pairwise-correlations} +cor_matrix <- cor( + df_numeric_clean, + use = "pairwise.complete.obs", + method = "spearman") + +corrplot( + cor_matrix, + method = "color", + type = "upper", + order = "hclust", + insig = "blank", + tl.col = "black", + tl.srt = 45, + tl.cex = 0.7) ``` ```{r Display-highly-correlated-pairs} @@ -92,75 +172,18 @@ high_cor_pairs <- data.frame( ) high_cor_pairs <- high_cor_pairs[!duplicated(t(apply(high_cor_pairs[,1:2], 1, sort))),] high_cor_pairs <- high_cor_pairs[order(-abs(high_cor_pairs$Correlation)),] -print(high_cor_pairs) -``` - - -```{r summary-stats} -summary_stats <- df_numeric_clean %>% - summarise(across(everything(), list( - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE) - ))) -print(summary_stats) - -``` - - -Calculate skewness and kurtosis - -```{r} -skewness_values <- apply(df_numeric, 2, function(x) ifelse(all(is.na(x)), NA, moments::skewness(x, na.rm = TRUE))) -kurtosis_values <- apply(df_numeric, 2, function(x) ifelse(all(is.na(x)), NA, moments::kurtosis(x, na.rm = TRUE))) - -skew_kurt <- data.frame( - variable = names(df_numeric), - skewness = skewness_values, - kurtosis = kurtosis_values -) -``` - - -Print skewness and kurtosis - -```{r} -print(skew_kurt) - +kable(high_cor_pairs, digits = 2) ``` +Variance inflation factors: -Create histograms and Q-Q plots for each numeric variable - -```{r} -for (col in names(df_numeric)) { - png(paste0(col, "_diagnostic_plots.png"), width = 800, height = 400) - par(mfrow = c(1, 2)) - - # Histogram - hist(df_numeric[[col]], main = paste("Histogram of", col), xlab = col) - - # Q-Q plot - qqnorm(df_numeric[[col]], main = paste("Q-Q Plot of", col)) - qqline(df_numeric[[col]], col = "red") - - dev.off() -} - +```{r multicollinearity} +constant_response <- rep(1, nrow(df_numeric_clean)) +vif_model <- lm(constant_response ~ ., data = df_numeric_clean) +vif_results <- vif(vif_model) ``` - -Print the generated plot files - -```{r} -# List the plot files -plot_files <- list.files(pattern = "*_diagnostic_plots.png") - -# Loop through each plot file and display it -for (plot_file in plot_files) { - img <- image_read(plot_file) # Read the image file - print(img) # Display the image in the plot window -} +```{r display-vif-results} +kable(vif_results, digits = 2) ```