diff --git a/NEWS.md b/NEWS.md index 8c188f22..7d6d4185 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,7 @@ - Add functions `get_funguild_db()` and `funguild_assign()` from the [FUNGuildR](https://github.com/brendanf/FUNGuildR/) package into MiscMetabar to decrease package dependencies - Remove all dependencies from packages not available on CRAN or Bioconductor. Improve code using `goodpractice::gp`() and `devtools::check()` function - Add messages in various cases (NA in samples data, low number of sequences in samples, low number of sequences by taxa) when using `verify_pq()` with args `verbose=TRUE` +- Fix a bug in `mutitax_bar_pq()` when using `nb_seq = FALSE` # MiscMetabar 0.52 diff --git a/R/plot_functions.R b/R/plot_functions.R index cbc88169..e7a42b66 100644 --- a/R/plot_functions.R +++ b/R/plot_functions.R @@ -948,6 +948,9 @@ venn_pq <- #' #' Note that you can use ggplot2 function to customize the plot #' for ex. `+ scale_fill_distiller(palette = "BuPu", direction = 1)` +#' and `+ scale_x_continuous(expand = expansion(mult = 0.5))`. See +#' examples. +#' #' @inheritParams clean_pq #' @param fact (required): Name of the factor to cluster samples by modalities. #' Need to be in \code{physeq@sam_data}. @@ -991,9 +994,10 @@ venn_pq <- #' data_fungi@sam_data$Height %in% c("Low", "High")) #' ggvenn_pq(data_fungi2, fact = "Height") #' -#' ggvenn_pq(data_fungi, fact = "Height", add_nb_sequences = TRUE, set_size = 4) +#' ggvenn_pq(data_fungi, fact = "Height", add_nb_sequences = TRUE, set_size = 4) #' ggvenn_pq(data_fungi, fact = "Height", rarefy_before_merging = TRUE) -#' ggvenn_pq(data_fungi, fact = "Height", rarefy_after_merging = TRUE) +#' ggvenn_pq(data_fungi, fact = "Height", rarefy_after_merging = TRUE) + +#' scale_x_continuous(expand = expansion(mult = 0.5)) #' #' @export #' @author Adrien Taudière @@ -2352,12 +2356,13 @@ plot_tax_pq <- ################################################################################ -#' Plot taxonomic distribution across 3 levels +#' Plot taxonomic distribution across 3 taxonomic levels and optionally +#' one sample factor #' #' @description #' `r lifecycle::badge("experimental")` #' -#' Note that lvl3need to be nested in lvl2 which need to be nested +#' Note that lvl3 need to be nested in lvl2 which need to be nested #' in lvl1 #' #' @inheritParams clean_pq @@ -2384,33 +2389,44 @@ plot_tax_pq <- #' multitax_bar_pq(data_fungi_sp_known, "Phylum", "Class", "Order", #' nb_seq = FALSE, log10trans = FALSE #' ) -multitax_bar_pq <- function(physeq, - lvl1, - lvl2, - lvl3, - fact = NULL, - nb_seq = TRUE, - log10trans = TRUE) { - if (!nb_seq) { - physeq <- as_binary_otu_table(physeq) - } - psm <- psmelt(physeq) %>% filter(!is.na(.data[[lvl1]])) - psm <- psm %>% +multitax_bar_pq <- function( + physeq, + lvl1, + lvl2, + lvl3, + fact = NULL, + nb_seq = TRUE, + log10trans = TRUE) { + psm_1 <- psmelt(physeq) %>% + filter(Abundance > 0) %>% + filter(!is.na(.data[[lvl1]])) %>% filter(!is.na(.data[[lvl3]])) %>% filter(!is.na(.data[[lvl2]])) if (is.null(fact)) { + psm_2 <- psm_1 %>% + group_by(OTU) %>% + summarise(Abundance = sum(Abundance)) + + psm <- inner_join(psm_2, psm_1[,c("OTU", lvl1, lvl2, lvl3)], + by = join_by("OTU" == "OTU"), multiple = + "first") + + if (!nb_seq) { + psm$Abundance = 1 + } + data_gg <- tibble( "Abundance" = tapply(psm$Abundance, psm[[lvl3]], sum), "LVL1" = tapply(psm[[lvl1]], psm[[lvl3]], unique), "LVL2" = tapply(psm[[lvl2]], psm[[lvl3]], unique), "LVL3" = tapply(psm[[lvl3]], psm[[lvl3]], unique) ) - + if (log10trans) { data_gg$Abundance <- log10(data_gg$Abundance) } - + p <- ggplot(data_gg, aes( x = Abundance, fill = LVL1, @@ -2421,6 +2437,20 @@ multitax_bar_pq <- function(physeq, theme(strip.text.y.right = element_text(angle = 0)) + theme(legend.position = "none") } else { + + psm_2 <- psm %>% + group_by(OTU,.data[[fact]]) %>% + summarise(Abundance = sum(Abundance)) %>% + filter(Abundance > 0) + + psm <- inner_join(psm_2, psm_1[,c("OTU", lvl1, lvl2, lvl3, fact)], + by = join_by("OTU" == "OTU"), multiple = + "first") + + if (!nb_seq) { + psm$Abundance = 1 + } + data_gg <- tibble( "Abundance" = tapply(psm$Abundance, paste(psm[[fact]], psm[[lvl3]]), sum), "FACT" = tapply(psm[[fact]], paste(psm[[fact]], psm[[lvl3]]), unique), @@ -2428,11 +2458,11 @@ multitax_bar_pq <- function(physeq, "LVL2" = tapply(psm[[lvl2]], paste(psm[[fact]], psm[[lvl3]]), unique), "LVL3" = tapply(psm[[lvl3]], paste(psm[[fact]], psm[[lvl3]]), unique) ) - + if (log10trans) { data_gg$Abundance <- log10(data_gg$Abundance) } - + p <- ggplot(data_gg, aes( x = Abundance, fill = LVL1, @@ -2667,7 +2697,9 @@ SRS_curve_pq <- function(physeq, clean_pq = FALSE, ...) { #' @author Adrien Taudière #' #' -iNEXT_pq <- function(physeq, merge_sample_by = NULL, ...) { +iNEXT_pq <- function(physeq, + merge_sample_by = NULL, + ...) { if (!is.null(merge_sample_by)) { physeq <- merge_samples2(physeq, merge_sample_by) physeq <- clean_pq(physeq, force_taxa_as_columns = TRUE) @@ -2675,6 +2707,7 @@ iNEXT_pq <- function(physeq, merge_sample_by = NULL, ...) { df <- data.frame(t(as.matrix(unclass(physeq@otu_table)))) res_iNEXT <- iNEXT::iNEXT(df, ...) + return(res_iNEXT) } ################################################################################ @@ -2937,7 +2970,8 @@ upset_test_pq <- psm2 <- data.frame(lapply(psm, function(col) { tapply(col, paste0(psm$OTU), function(vec) { - diff_fct_diff_class(vec, numeric_fonction = numeric_fonction, na.rm = TRUE) + diff_fct_diff_class(vec, numeric_fonction = numeric_fonction, + na.rm = TRUE) }) })) %>% arrange(., desc(Abundance)) diff --git a/vignettes/Reclustering.Rmd b/vignettes/Reclustering.Rmd index a49f5258..4a3193a5 100644 --- a/vignettes/Reclustering.Rmd +++ b/vignettes/Reclustering.Rmd @@ -23,11 +23,11 @@ library(MiscMetabar) # Re-clustering ASVs -**ASV** (stands for *Amplicon Sequence Variant*; also called **ESV** for Exact Amplicon Variant) is a DNA sequence obtained from high-throughput analysis of marker genes. **OTU* are a group of closely related individuals created by clustering sequences based on a threshold of similarity. An ASV is a special case of an OTU with a similarity threshold of 100%. +**ASV** (stands for *Amplicon Sequence Variant*; also called **ESV** for Exact Amplicon Variant) is a DNA sequence obtained from high-throughput analysis of marker genes. **OTU** are a group of closely related individuals created by clustering sequences based on a threshold of similarity. An ASV is a special case of an OTU with a similarity threshold of 100%. A third concept is the zero-radius OTU **zOTU** [@edgar2016] which is the same concept than ASV but compute with other softwares than [dada](https://github.com/benjjneb/dada2) (e.g. [vsearch](https://github.com/torognes/vsearch)). -The choice between ASV and OTU is important because they lead to different results [@joos2020], Box 2 in [@tedersoo2022]). Most articles recommend making a choice depending on the question. For example, ASV may be better than OTU for describing a group of very closely related species. In addition, ASV are comparable across different datasets (obtained using identical marker genes). On the other hand, [@tedersoo2022] report that ASV approaches overestimate the richness of common fungal species (due to haplotype variation), but underestimate the richness of rare species. They therefore recommend the use of OTUs in metabarcoding analyses of fungal communities. Finally, [@kauserud2023] argues that the ASV term falls within the original OTU term and recommends adopting only the OTU terms, but with a concise and clear report on how the OTUs were generated. +The choice between ASV and OTU is important because they lead to different results (@joos2020, Box 2 in @tedersoo2022, @chiarello2022). Most articles recommend making a choice depending on the question. For example, ASV may be better than OTU for describing a group of very closely related species. In addition, ASV are comparable across different datasets (obtained using identical marker genes). On the other hand, [@tedersoo2022] report that ASV approaches overestimate the richness of common fungal species (due to haplotype variation), but underestimate the richness of rare species. They therefore recommend the use of OTUs in metabarcoding analyses of fungal communities. Finally, [@kauserud2023] argues that the ASV term falls within the original OTU term and recommends adopting only the OTU terms, but with a concise and clear report on how the OTUs were generated. -Recent articles ([@forster2019; @antich2021]) propose to use both approach together. They recommend (i) using ASV to denoise the dataset and (ii) for some questions, clustering the ASV sequences into OTUs. This is the goal of the function `asv2otu()`, using either the `DECIPHER::Clusterize` function from R or the [vsearch](https://github.com/torognes/vsearch) software. +Recent articles [@forster2019; @antich2021] propose to use both approach together. They recommend (i) using ASV to denoise the dataset and (ii) for some questions, clustering the ASV sequences into OTUs. This is the goal of the function `asv2otu()`, using either the `DECIPHER::Clusterize` function from R or the [vsearch](https://github.com/torognes/vsearch) software. ## Using decipher or Vsearch algorithm ```{r} diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib index c7c74f37..8ab7ce49 100644 --- a/vignettes/bibliography.bib +++ b/vignettes/bibliography.bib @@ -221,6 +221,39 @@ @article{McCauley2023 url = {https://app.dimensions.ai/details/publication/pub.1151239188} } +@article{chiarello2022, + title={Ranking the biases: The choice of OTUs vs. ASVs in 16S rRNA amplicon data analysis has stronger effects on diversity measures than rarefaction and OTU identity threshold}, + author={Chiarello, Marl{\`e}ne and McCauley, Mark and Vill{\'e}ger, S{\'e}bastien and Jackson, Colin R}, + journal={PLoS One}, + volume={17}, + number={2}, + pages={e0264443}, + year={2022}, + publisher={Public Library of Science San Francisco, CA USA}, + doi={10.1371/journal.pone.0264443} +} + +@article{edgar2016, + title={UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing}, + author={Edgar, Robert C}, + journal={BioRxiv}, + pages={081257}, + year={2016}, + publisher={Cold Spring Harbor Laboratory} +} + +@article{segata2011, + title={Metagenomic biomarker discovery and explanation}, + author={Segata, Nicola and Izard, Jacques and Waldron, Levi and Gevers, Dirk and Miropolsky, Larisa and Garrett, Wendy S and Huttenhower, Curtis}, + journal={Genome biology}, + volume={12}, + pages={1--18}, + year={2011}, + publisher={Springer}, + doi={10.1186/gb-2011-12-6-r60} +} + + @software{mclaren2020, author = {Michael McLaren}, title = {mikemc/speedyseq: speedyseq v0.2.0},