diff --git a/CHANGELOG.md b/CHANGELOG.md index 01c7118f..8b8d0d21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#798](https://github.com/nf-core/ampliseq/pull/798) - Added SILVA version 138.2 of DADA2 taxonomy database: `silva=138.2` or `silva` as parameter to `--dada2_ref_taxonomy` - [#804](https://github.com/nf-core/ampliseq/pull/804) - Added version 10 of Unite as parameter for `--dada_ref_taxonomy` (issue [#768](https://github.com/nf-core/ampliseq/issues/768)) +- [#803](https://github.com/nf-core/ampliseq/pull/803) - New parameters introduced related to `--mergepairs_strategy`. These parameters would only be effective if `--mergepairs_strategy consensus` is set. - [#807](https://github.com/nf-core/ampliseq/pull/807) - Export of TreeSummarizedExperiment R object by default, can be omitted with `--skip_tse`, also added ability to skip phyloseq R object generation with `--skip_phyloseq` +| **Parameter** | **Description** | **Default Value** | +| ------------------------------------------ | ----------------------------------------------------------------------------------------- | ----------------- | +| **mergepairs_consensus_match** | The score assigned for each matching base pair during sequence alignment. | 1 | +| **mergepairs_consensus_mismatch** | The penalty score assigned for each mismatched base pair during sequence alignment. | -2 | +| **mergepairs_consensus_gap** | The penalty score assigned for each gap introduced during sequence alignment. | -4 | +| **mergepairs_consensus_minoverlap** | The minimum number of overlapping base pairs required to merge forward and reverse reads. | 12 | +| **mergepairs_consensus_maxmismatch** | The maximum number of mismatches allowed within the overlapping region for merging reads. | 0 | +| **mergepairs_consensus_percentile_cutoff** | The percentile cutoff determining the minimum observed overlap in the dataset. | 0.001 | + ### `Changed` +- [#803](https://github.com/nf-core/ampliseq/pull/803) - Changed DADA2_DENOISING : `--concatenate_reads` renaming to `--mergepairs_strategy` ; support new method named "consensus" by setting `--mergepairs_strategy consensus` ; changed options of `--mergepairs_strategy` from TRUE/FALSE (boolean) to ["merge", "concatenate", "consensus"]. - [#818](https://github.com/nf-core/ampliseq/pull/818) - Provide users the ability to not bump stack size in vsearch clustering. ### `Fixed` diff --git a/conf/modules.config b/conf/modules.config index cbbf66f0..bc0907a8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -232,9 +232,12 @@ process { ].join(',').replaceAll('(,)*$', "") // setting from https://rdrr.io/bioc/dada2/man/mergePairs.html & https://rdrr.io/bioc/dada2/man/nwalign.html & match = getDadaOpt("MATCH"), mismatch = getDadaOpt("MISMATCH"), gap = getDadaOpt("GAP_PENALTY"), missing from the list below is: 'band = -1' ext.args2 = [ - 'minOverlap = 12, maxMismatch = 0, returnRejects = FALSE, propagateCol = character(0), trimOverhang = FALSE, match = 1, mismatch = -64, gap = -64, homo_gap = NULL, endsfree = TRUE, vec = FALSE', - params.concatenate_reads ? "justConcatenate = TRUE" : "justConcatenate = FALSE" + "homo_gap = NULL, endsfree = TRUE, vec = FALSE, propagateCol = character(0), trimOverhang = FALSE", + params.mergepairs_strategy == "consensus" ? + "returnRejects = TRUE, match = ${params.mergepairs_consensus_match}, mismatch = ${params.mergepairs_consensus_mismatch}, minOverlap = ${params.mergepairs_consensus_minoverlap}, maxMismatch = ${params.mergepairs_consensus_maxmismatch}, gap = ${params.mergepairs_consensus_gap}" : + "justConcatenate = ${params.mergepairs_strategy == 'concatenate' ? 'TRUE' : 'FALSE'}, returnRejects = FALSE, match = 1, mismatch = -64, gap = -64, minOverlap = 12, maxMismatch = 0" ].join(',').replaceAll('(,)*$', "") + ext.quantile = "${params.mergepairs_consensus_percentile_cutoff}" publishDir = [ [ path: { "${params.outdir}/dada2/args" }, diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf index 637bd898..4e77238b 100644 --- a/modules/local/dada2_denoising.nf +++ b/modules/local/dada2_denoising.nf @@ -26,13 +26,14 @@ process DADA2_DENOISING { def prefix = task.ext.prefix ?: "prefix" def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' + def quantile = task.ext.quantile ?: 0.001 if (!meta.single_end) { """ #!/usr/bin/env Rscript suppressPackageStartupMessages(library(dada2)) - errF = readRDS("${errormodel[0]}") - errR = readRDS("${errormodel[1]}") + errF <- readRDS("${errormodel[0]}") + errR <- readRDS("${errormodel[1]}") filtFs <- sort(list.files("./filtered/", pattern = "_1.filt.fastq.gz", full.names = TRUE), method = "radix") filtRs <- sort(list.files("./filtered/", pattern = "_2.filt.fastq.gz", full.names = TRUE), method = "radix") @@ -45,9 +46,50 @@ process DADA2_DENOISING { saveRDS(dadaRs, "${prefix}_2.dada.rds") sink(file = NULL) - #make table - mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, $args2, verbose=TRUE) - saveRDS(mergers, "${prefix}.mergers.rds") + # merge + if ("${params.mergepairs_strategy}" == "consensus") { + mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, $args2, justConcatenate = FALSE, verbose=TRUE) + concats <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, $args2, justConcatenate = TRUE, verbose=TRUE) + + # in case there is only one sample in the entire run + if (is.data.frame(mergers)) { + mergers <- list(sample = mergers) + concats <- list(sample = concats) + } + + # define the overlap threshold to decide if concatenation or not + min_overlap_obs <- lapply(mergers, function(X) { + mergers_accepted <- X[["accept"]] + if (sum(mergers_accepted) > 0) { + min_overlap_obs <- X[["nmatch"]][mergers_accepted] + X[["nmismatch"]][mergers_accepted] + rep(min_overlap_obs, X[["abundance"]][mergers_accepted]) + } else { + NA + } + }) + + min_overlap_obs <- Reduce(c, min_overlap_obs) + min_overlap_obs <- min_overlap_obs[!is.na(min_overlap_obs)] + min_overlap_obs <- quantile(min_overlap_obs, $quantile) + + for (x in names(mergers)) { + to_concat <- !mergers[[x]][["accept"]] & (mergers[[x]][["nmismatch"]] + mergers[[x]][["nmatch"]]) < min_overlap_obs + + if (sum(to_concat) > 0) { + mergers[[x]][to_concat, ] <- concats[[x]][to_concat, ] + # filter out unaccepted non concatenated sequences + mergers[[x]] <- mergers[[x]][mergers[[x]][["accept"]], ] + } + + } + + } else { + mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, $args2, verbose=TRUE) + } + + saveRDS(mergers, "${meta.run}.mergers.rds") + + # make table seqtab <- makeSequenceTable(mergers) saveRDS(seqtab, "${prefix}.seqtab.rds") diff --git a/nextflow.config b/nextflow.config index 7a9e3f0f..50126233 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,66 +23,72 @@ params { metadata = null // Other options - save_intermediates = false - trunc_qmin = 25 - trunc_rmin = 0.75 - trunclenf = null - trunclenr = null - max_ee = 2 - max_len = null - ignore_failed_filtering = false - min_len = 50 - metadata_category = null - metadata_category_barplot = null - double_primer = false - retain_untrimmed = false - cutadapt_min_overlap = 3 - cutadapt_max_error_rate = 0.1 - exclude_taxa = "mitochondria,chloroplast" - min_frequency = 1 - min_samples = 1 - multiple_sequencing_runs = false - single_end = false - sample_inference = "independent" - illumina_novaseq = false - illumina_pe_its = false - concatenate_reads = false - cut_its = "none" - its_partial = 0 - picrust = false - sbdiexport = false - addsh = false - tax_agglom_min = 2 - tax_agglom_max = 6 - min_read_counts = 1 - ignore_failed_trimming = false - ignore_empty_input_files = false - qiime_adonis_formula = null - seed = 100 - filter_ssu = null - min_len_asv = null - max_len_asv = null - filter_codons = null - orf_start = 1 - orf_end = null - stop_codons = "TAA,TAG" - pplace_tree = null - pplace_aln = null - pplace_model = null - pplace_alnmethod = 'hmmer' - pplace_taxonomy = null - pplace_name = null - diversity_rarefaction_depth= 500 - ancom_sample_min_count = 1 - vsearch_cluster = null - vsearch_cluster_id = 0.97 - ancom = false - ancombc = false - ancombc_effect_size = 1 - ancombc_significance = 0.05 - ancombc_formula = null - ancombc_formula_reflvl = null - raise_filter_stacksize = true + save_intermediates = false + trunc_qmin = 25 + trunc_rmin = 0.75 + trunclenf = null + trunclenr = null + max_ee = 2 + max_len = null + ignore_failed_filtering = false + min_len = 50 + metadata_category = null + metadata_category_barplot = null + double_primer = false + retain_untrimmed = false + cutadapt_min_overlap = 3 + cutadapt_max_error_rate = 0.1 + exclude_taxa = "mitochondria,chloroplast" + min_frequency = 1 + min_samples = 1 + multiple_sequencing_runs = false + single_end = false + sample_inference = "independent" + illumina_novaseq = false + illumina_pe_its = false + mergepairs_strategy = "merge" + mergepairs_consensus_minoverlap = 12 + mergepairs_consensus_maxmismatch = 0 + mergepairs_consensus_match = 1 + mergepairs_consensus_mismatch = -2 + mergepairs_consensus_gap = -4 + mergepairs_consensus_percentile_cutoff = 0.001 + cut_its = "none" + its_partial = 0 + picrust = false + sbdiexport = false + addsh = false + tax_agglom_min = 2 + tax_agglom_max = 6 + min_read_counts = 1 + ignore_failed_trimming = false + ignore_empty_input_files = false + qiime_adonis_formula = null + seed = 100 + filter_ssu = null + min_len_asv = null + max_len_asv = null + filter_codons = null + orf_start = 1 + orf_end = null + stop_codons = "TAA,TAG" + pplace_tree = null + pplace_aln = null + pplace_model = null + pplace_alnmethod = 'hmmer' + pplace_taxonomy = null + pplace_name = null + diversity_rarefaction_depth = 500 + ancom_sample_min_count = 1 + vsearch_cluster = null + vsearch_cluster_id = 0.97 + raise_filter_stacksize = true + ancom = false + ancombc = false + ancombc_effect_size = 1 + ancombc_significance = 0.05 + ancombc_formula = null + ancombc_formula_reflvl = null // Report options report_template = "${projectDir}/assets/report_template.Rmd" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1db9be80..65d31c68 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -261,10 +261,47 @@ "description": "Mode of sample inference: \"independent\", \"pooled\" or \"pseudo\"", "enum": ["independent", "pooled", "pseudo"] }, - "concatenate_reads": { - "type": "boolean", - "description": "Not recommended: When paired end reads are not sufficiently overlapping for merging.", - "help_text": "This parameters specifies that paired-end reads are not merged after denoising but concatenated (separated by 10 N's). This is of advantage when an amplicon was sequenced that is too long for merging (i.e. bad experimental design). This is an alternative to only analyzing the forward or reverse read in case of non-overlapping paired-end sequencing data.\n\n**This parameter is not recommended! Only if all other options fail.**" + "mergepairs_strategy": { + "type": "string", + "default": "merge", + "description": "Strategy to merge paired end reads. When paired end reads are not sufficiently overlapping for merging, you can use \"concatenate\" (not recommended). When you have a mix of overlapping and non overlapping reads use \"consensus\"", + "help_text": "This parameters specifies how paired-end reads are merged after denoising. By default, read pairs will be merged by overlap. Concatenating read pairs (separated by 10 N's) is an alternative to only analyzing the forward or reverse read in case of non-overlapping paired-end sequencing data, this is not recommended, only if all other options fail. The consensus strategy is merging read pairs by overlap if possible and concatenates non-overlapping read pairs, based on `--mergepairs_consensus_*` parameters.", + "enum": ["merge", "concatenate", "consensus"] + }, + "mergepairs_consensus_match": { + "type": "integer", + "default": 1, + "description": "The score assigned for each matching base pair during sequence alignment.", + "help_text": "This parameter specifies the numerical value added to the alignment score for every pair of bases that match between the forward and reverse reads. A higher value increases the preference for alignments with more matching bases." + }, + "mergepairs_consensus_mismatch": { + "type": "integer", + "default": -2, + "description": "The penalty score assigned for each mismatched base pair during sequence alignment.", + "help_text": "This parameter defines the numerical penalty subtracted from the alignment score for each base pair mismatch between the forward and reverse reads. A higher penalty reduces the likelihood of accepting alignments with mismatches." + }, + "mergepairs_consensus_gap": { + "type": "integer", + "default": -4, + "description": "The penalty score assigned for each gap introduced during sequence alignment.", + "help_text": "This parameter sets the numerical penalty subtracted from the alignment score for each gap (insertion or deletion) introduced to align the forward and reverse reads. A higher penalty discourages alignments that require gaps." + }, + "mergepairs_consensus_minoverlap": { + "type": "integer", + "default": 12, + "description": "The minimum number of overlapping base pairs required to merge forward and reverse reads.", + "help_text": "This parameter specifies the smallest number of consecutive base pairs that must overlap between the forward and reverse reads for them to be merged. Ensuring sufficient overlap is crucial for accurate merging." + }, + "mergepairs_consensus_maxmismatch": { + "type": "integer", + "default": 0, + "description": "The maximum number of mismatches allowed within the overlapping region for merging reads.", + "help_text": "This parameter defines the highest number of mismatched base pairs permitted in the overlap region between forward and reverse reads for a successful merge. Setting this value helps control the stringency of read merging, balancing between sensitivity and accuracy." + }, + "mergepairs_consensus_percentile_cutoff": { + "type": "number", + "default": 0.001, + "description": "The percentile used to determine a stringent cutoff which will correspond to the minimum observed overlap in the dataset. This ensures that only read pairs with high overlap are merged into consensus sequences. Those with insufficient overlap are concatenated." } }, "fa_icon": "fas fa-braille"