diff --git a/CHANGELOG.md b/CHANGELOG.md index a8fc4eae..60f0403b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` +- [#657](https://github.com/nf-core/ampliseq/pull/657) - Improved parameter descriptions and sequence + ### `Fixed` - [#655](https://github.com/nf-core/ampliseq/pull/655) - Added `NUMBA_CACHE_DIR` to fix downstream analysis with QIIME2 that failed on some systems - [#656](https://github.com/nf-core/ampliseq/pull/656) - Moved conda-check to script-section and replaced `exit 1` with `error()` +- [#657](https://github.com/nf-core/ampliseq/pull/657) - Corrected inaccurate reporting of QIIME2 taxonomic classifications and ASV length filtering ### `Dependencies` diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd index f0b4073e..8c8fc21e 100644 --- a/assets/report_template.Rmd +++ b/assets/report_template.Rmd @@ -370,7 +370,7 @@ cat(paste0(" Overall read quality profiles are displayed as heat map of the frequency of each quality score at each base position. The mean quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least -that position. Original plots can be found [folder dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`. +that position. Original plots can be found in folder [dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`. ")) ``` @@ -426,8 +426,8 @@ cat(paste0(" Estimated error rates are displayed for each possible transition. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. 
The estimated error rates (black line) should be a good fit to the observed rates -(points), and the error rates should drop with increased quality. Original plots can be found in -[folder dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`. +(points), and the error rates should drop with increased quality. Original plots can be found in folder +[dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`. ")) ``` @@ -724,9 +724,10 @@ if ( params$max_len_asv != 0 ) { } # replace 1 with 1.5 to display on log scale -filter_len_profile$Counts[filter_len_profile$Counts == 1] <- 1.5 +filter_len_profile_replaced <- filter_len_profile +filter_len_profile_replaced$Counts[filter_len_profile_replaced$Counts == 1] <- 1.5 -plot_filter_len_profile <- ggplot(filter_len_profile, +plot_filter_len_profile <- ggplot(filter_len_profile_replaced, aes(x = Length, y = Counts)) + geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) + ylab("Number of ASVs") + @@ -989,17 +990,18 @@ asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t") asv_tax <- subset(asv_tax, select = Taxon) # Remove greengenes85 ".__" placeholders -df = as.data.frame(lapply(asv_tax, function(x) gsub(".__", "", x))) -# remove all last, empty ; -df = as.data.frame(lapply(df, function(x) gsub(" ;","",x))) +df = as.data.frame(lapply(asv_tax, function(x) gsub(" .__", "", x))) +# remove all empty ; +df = as.data.frame(lapply(df, function(x) gsub(";;","",x))) # remove last remaining, empty ; -df = as.data.frame(lapply(df, function(x) gsub("; $","",x))) +df = as.data.frame(lapply(df, function(x) gsub(";$","",x))) # get maximum amount of taxa levels per ASV -max_taxa <- lengths(regmatches(df$Taxon, gregexpr("; ", df$Taxon)))+1 +max_taxa <- lengths(regmatches(df$Taxon, gregexpr(";", df$Taxon)))+1 -# Currently, all QIIME2 databases seem to have the same levels! +# Currently, all QIIME2 databases seem to have the same levels! 
But for compatibility, restrict number of levels to max_taxa level <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species") +level <- head(level, n = max(max_taxa) ) # Calculate the classified numbers/percent of asv n_asv_tax = nrow(asv_tax) @@ -1811,7 +1813,7 @@ if ( !isFALSE(params$dada2_ref_tax_title) ) { "- citation: `", params$dada2_ref_tax_citation, "`\n\n", sep = "") } else if (!isFALSE(params$dada2_taxonomy)) { cat("Taxonomic classification by DADA2:\n\n", - "- database: unknown - user provided\n\n", sep = "") + "- database: user provided file(s)\n\n", sep = "") } if ( !isFALSE(params$sintax_ref_tax_title) ) { @@ -1821,7 +1823,7 @@ if ( !isFALSE(params$sintax_ref_tax_title) ) { "- citation: `", params$sintax_ref_tax_citation, "`\n\n", sep = "") } else if (!isFALSE(params$sintax_taxonomy)) { cat("Taxonomic classification by SINTAX:\n\n", - "- database: unknown - user provided\n\n", sep = "") + "- database: user provided file\n\n", sep = "") } if ( !isFALSE(params$kraken2_ref_tax_title) ) { @@ -1831,7 +1833,7 @@ if ( !isFALSE(params$kraken2_ref_tax_title) ) { "- citation: `", params$kraken2_ref_tax_citation, "`\n\n", sep = "") } else if (!isFALSE(params$kraken2_taxonomy)) { cat("Taxonomic classification by Kraken2:\n\n", - "- database: unknown - user provided\n\n", sep = "") + "- database: user provided files\n\n", sep = "") } if ( !isFALSE(params$qiime2_ref_tax_title) ) { @@ -1841,7 +1843,7 @@ if ( !isFALSE(params$qiime2_ref_tax_title) ) { "- citation: `", params$qiime2_ref_tax_citation, "`\n\n", sep = "") } else if (!isFALSE(params$qiime2_taxonomy)) { cat("Taxonomic classification by QIIME2:\n\n", - "- database: unknown - user provided\n\n", sep = "") + "- database: user provided file\n\n", sep = "") } ``` diff --git a/nextflow_schema.json b/nextflow_schema.json index 9652e68b..1d3098da 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -76,37 +76,43 @@ "properties": { "illumina_novaseq": { "type": "boolean", - "description": "If 
data has binned quality scores such as Illumina NovaSeq" + "description": "If data has binned quality scores such as Illumina NovaSeq", + "fa_icon": "fas fa-align-justify" }, "pacbio": { "type": "boolean", - "description": "If data is single-ended PacBio reads instead of Illumina" + "description": "If data is single-ended PacBio reads instead of Illumina", + "fa_icon": "fas fa-align-justify" }, "iontorrent": { "type": "boolean", - "description": "If data is single-ended IonTorrent reads instead of Illumina" + "description": "If data is single-ended IonTorrent reads instead of Illumina", + "fa_icon": "fas fa-align-justify" }, "single_end": { "type": "boolean", "description": "If data is single-ended Illumina reads instead of paired-end", - "help_text": "When using a sample sheet with `--input` containing forward and reverse reads, specifying `--single_end` will only extract forward reads and treat the data as single ended instead of extracting forward and reverse reads." + "help_text": "When using a sample sheet with `--input` containing forward and reverse reads, specifying `--single_end` will only extract forward reads and treat the data as single ended instead of extracting forward and reverse reads.", + "fa_icon": "fas fa-align-left" }, "illumina_pe_its": { "type": "boolean", "description": "If analysing ITS amplicons or any other region with large length variability with Illumina paired end reads", - "help_text": "This will cause the pipeline to\n- not truncate input reads if not `--trunclenf` and `--trunclenr` are overwriting defaults\n- remove reverse complement primers from the end of reads in case the read length exceeds the amplicon length" + "help_text": "This will cause the pipeline to\n- not truncate input reads if not `--trunclenf` and `--trunclenr` are overwriting defaults\n- remove reverse complement primers from the end of reads in case the read length exceeds the amplicon length", + "fa_icon": "fas fa-align-justify" }, "multiple_sequencing_runs": { 
"type": "boolean", - "description": "If samples were sequenced in multiple sequencing runs", - "help_text": "Expects one sub-folder per sequencing run in the folder specified by `--input` containing sequencing data of the specific run.\nSample identifiers are taken from sequencing files, specifically the string before the first underscore will be the sample ID. Sample IDs across all sequencing runs (all sequencing files) have to be unique. If this is not the case, please use a sample sheet as input instead.\n\nExample for input data organization:\n\n```bash\ndata\n |-run1\n | |-sample1_1_L001_R1_001.fastq.gz\n | |-sample1_1_L001_R2_001.fastq.gz\n | |-sample2_1_L001_R1_001.fastq.gz\n | |-sample2_1_L001_R2_001.fastq.gz\n |\n |-run2\n |-sample3_1_L001_R1_001.fastq.gz\n |-sample3_1_L001_R2_001.fastq.gz\n |-sample4_1_L001_R1_001.fastq.gz\n |-sample4_1_L001_R2_001.fastq.gz\n```\n\nExample command to analyze this data in one pipeline run:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input \"data\" \\\n --FW_primer \"GTGYCAGCMGCCGCGGTAA\" \\\n --RV_primer \"GGACTACNVGGGTWTCTAAT\" \\\n --metadata \"data/Metadata.tsv\" \\\n --multiple_sequencing_runs\n```", + "description": "If using `--input_folder`: samples were sequenced in multiple sequencing runs", + "help_text": "Expects one sub-folder per sequencing run in the folder specified by `--input_folder` containing sequencing data of the specific run.\nSample identifiers are taken from sequencing files, specifically the string before the first underscore will be the sample ID. Sample IDs across all sequencing runs (all sequencing files) have to be unique. 
If this is not the case, please use a sample sheet as input instead.\n\nExample for input data organization:\n\n```bash\ndata\n |-run1\n | |-sample1_1_L001_R1_001.fastq.gz\n | |-sample1_1_L001_R2_001.fastq.gz\n | |-sample2_1_L001_R1_001.fastq.gz\n | |-sample2_1_L001_R2_001.fastq.gz\n |\n |-run2\n |-sample3_1_L001_R1_001.fastq.gz\n |-sample3_1_L001_R2_001.fastq.gz\n |-sample4_1_L001_R1_001.fastq.gz\n |-sample4_1_L001_R2_001.fastq.gz\n```\n\nExample command to analyze this data in one pipeline run:\n\n```bash\nnextflow run nf-core/ampliseq \\\n -profile singularity \\\n --input_folder \"data\" \\\n --FW_primer \"GTGYCAGCMGCCGCGGTAA\" \\\n --RV_primer \"GGACTACNVGGGTWTCTAAT\" \\\n --metadata \"data/Metadata.tsv\" \\\n --multiple_sequencing_runs\n```", "fa_icon": "fas fa-running" }, "extension": { "type": "string", "default": "/*_R{1,2}_001.fastq.gz", - "description": "Naming of sequencing files", - "help_text": "Indicates the naming of sequencing files (default: `\"/*_R{1,2}_001.fastq.gz\"`).\n\nPlease note:\n\n1. The prepended slash (`/`) is required\n2. The star (`*`) is the required wildcard for sample names\n3. The curly brackets (`{}`) enclose the orientation for paired end reads, separated by a comma (`,`).\n4. The pattern must be enclosed in quotes\n\nFor example for one sample (name: `1`) with forward (file: `1_a.fastq.gz`) and reverse (file: `1_b.fastq.gz`) reads in folder `data`:\n\n```bash\n--input \"data\" --extension \"/*_{a,b}.fastq.gz\"\n```" + "description": "If using `--input_folder`: naming of sequencing files", + "help_text": "Indicates the naming of sequencing files (default: `\"/*_R{1,2}_001.fastq.gz\"`).\n\nPlease note:\n\n1. The prepended slash (`/`) is required\n2. The star (`*`) is the required wildcard for sample names\n3. The curly brackets (`{}`) enclose the orientation for paired end reads, separated by a comma (`,`).\n4. 
The pattern must be enclosed in quotes\n\nFor example for one sample (name: `1`) with forward (file: `1_a.fastq.gz`) and reverse (file: `1_b.fastq.gz`) reads in folder `data`:\n\n```bash\n--input_folder \"data\" --extension \"/*_{a,b}.fastq.gz\"\n```", + "fa_icon": "fab fa-amilia" }, "min_read_counts": { "type": "integer", @@ -118,14 +124,15 @@ "ignore_empty_input_files": { "type": "boolean", "description": "Ignore input files with too few reads.", - "help_text": "Ignore input files with less reads than specified by `--min_read_counts` and continue the pipeline without those samples." + "help_text": "Ignore input files with less reads than specified by `--min_read_counts` and continue the pipeline without those samples.", + "fa_icon": "fas fa-arrow-right" } - } + }, + "fa_icon": "fas fa-align-justify" }, "primer_removal": { "title": "Primer removal", "type": "object", - "description": "", "default": "", "properties": { "retain_untrimmed": { @@ -137,12 +144,14 @@ "cutadapt_min_overlap": { "type": "integer", "default": 3, - "description": "Sets the minimum overlap for valid matches of primer sequences with reads for cutadapt (-O)." + "description": "Sets the minimum overlap for valid matches of primer sequences with reads for cutadapt (-O).", + "fa_icon": "fas fa-align-left" }, "cutadapt_max_error_rate": { "type": "number", "default": 0.1, - "description": "Sets the maximum error rate for valid matches of primer sequences with reads for cutadapt (-e)." + "description": "Sets the maximum error rate for valid matches of primer sequences with reads for cutadapt (-e).", + "fa_icon": "fas fa-exclamation-circle" }, "double_primer": { "type": "boolean", @@ -153,9 +162,11 @@ "ignore_failed_trimming": { "type": "boolean", "description": "Ignore files with too few reads after trimming.", - "help_text": "Ignore files with less reads than specified by `--min_read_counts` after trimming and continue the pipeline without those samples." 
+ "help_text": "Ignore files with less reads than specified by `--min_read_counts` after trimming and continue the pipeline without those samples.", + "fa_icon": "fas fa-arrow-right" } - } + }, + "fa_icon": "fas fa-align-left" }, "read_trimming_and_quality_filtering": { "title": "Read trimming and quality filtering", @@ -188,7 +199,8 @@ "description": "Assures that values chosen with --trunc_qmin will retain a fraction of reads.", "help_text": "Value can range from 0 to 1. 0 means no reads need to be retained and 1 means all reads need to be retained. The minimum lengths of --trunc_qmin and --trunc_rmin are chosen as DADA2 cutoffs.", "minimum": 0, - "maximum": 1 + "maximum": 1, + "fa_icon": "fas fa-greater-than-equal" }, "max_ee": { "type": "integer", @@ -213,14 +225,15 @@ "ignore_failed_filtering": { "type": "boolean", "description": "Ignore files with too few reads after quality filtering.", - "help_text": "Ignore files with fewer reads than specified by `--min_read_counts` after trimming and continue the pipeline without those samples. Please review all quality trimming and filtering options before using this parameter. For example, one sample with shorter sequences than other samples might loose all sequences due to minimum length requirements by read truncation (see --trunclenf)." + "help_text": "Ignore files with fewer reads than specified by `--min_read_counts` after trimming and continue the pipeline without those samples. Please review all quality trimming and filtering options before using this parameter. 
For example, one sample with shorter sequences than other samples might lose all sequences due to minimum length requirements by read truncation (see --trunclenf).", + "fa_icon": "fas fa-arrow-right" } - } + }, + "fa_icon": "fas fa-ban" }, "amplicon_sequence_variants_asv_calculation": { "title": "Amplicon Sequence Variants (ASV) calculation", "type": "object", - "description": "", "default": "", "properties": { "sample_inference": { @@ -235,12 +248,85 @@ "description": "Not recommended: When paired end reads are not sufficiently overlapping for merging.", "help_text": "This parameters specifies that paired-end reads are not merged after denoising but concatenated (separated by 10 N's). This is of advantage when an amplicon was sequenced that is too long for merging (i.e. bad experimental design). This is an alternative to only analyzing the forward or reverse read in case of non-overlapping paired-end sequencing data.\n\n**This parameter is not recommended! Only if all other options fail.**" } - } + }, + "fa_icon": "fas fa-braille" + }, + "asv_post_processing": { + "title": "ASV post processing", + "type": "object", + "default": "", + "properties": { + "vsearch_cluster": { + "type": "boolean", + "description": "Post-cluster ASVs with VSEARCH", + "help_text": "ASVs will be clustered with VSEARCH using the id value found in `--vsearch_cluster_id`." + }, + "vsearch_cluster_id": { + "type": "number", + "default": 0.97, + "minimum": 0, + "maximum": 1, + "description": "Pairwise Identity value used when post-clustering ASVs if `--vsearch_cluster` option is used (default: 0.97).", + "help_text": "Lowering or increasing this value can change the number of ASVs left over after clustering." + }, + "filter_ssu": { + "type": "string", + "description": "Enable SSU filtering. Comma separated list of kingdoms (domains) in Barrnap, a combination (or one) of \"bac\", \"arc\", \"mito\", and \"euk\". 
ASVs that have their lowest evalue in those kingdoms are kept.", + "enum": [ + "bac,arc,mito,euk", + "bac", + "arc", + "mito", + "euk", + "bac,arc", + "bac,mito", + "bac,euk", + "arc,mito", + "arc,euk", + "mito,euk", + "bac,arc,mito", + "bac,mito,euk", + "arc,mito,euk" + ] + }, + "min_len_asv": { + "type": "integer", + "description": "Minimal ASV length", + "help_text": "Remove ASVs that are below the minimum length threshold (default: filter is disabled, otherwise 1). Increasing the threshold might reduce false positive ASVs (e.g. PCR off-targets)." + }, + "max_len_asv": { + "type": "integer", + "description": "Maximum ASV length", + "help_text": "Remove ASVs that are above the maximum length threshold (default: filter is disabled, otherwise 1000000). Lowering the threshold might reduce false positive ASVs (e.g. PCR off-targets)." + }, + "filter_codons": { + "type": "boolean", + "description": "Filter ASVs based on codon usage", + "help_text": "ASVs will be filtered to contain no stop codon in their coding sequence and that their length is a multiple of 3." + }, + "orf_start": { + "type": "integer", + "default": 1, + "description": "Starting position of codon triplets", + "help_text": "By default, when `--filter_codons` is set, the codons start from the first position of the ASV sequences. The start of the codons can be changed to any position." + }, + "orf_end": { + "type": "integer", + "description": "Ending position of codon triplets", + "help_text": "By default, when `--filter_codons` is set, the codons are checked until the end of the ASV sequences. If you would like to change this setting, you can specify until which position of the ASV sequences the codon triplets are checked.\n\nPlease note that the length of the ASV from the beginning or from the `--orf_start` until this position must be a multiple of 3."
+ }, + "stop_codons": { + "type": "string", + "default": "TAA,TAG", + "description": "Define stop codons", + "help_text": "By default, when `--filter_codons` is set, the codons `TAA,TAG` are set as stop codons. Here you can specify any comma-separated list of codons to be used as stop codons, e.g. `--stop_codons \"TAA,TAG,TGA\"`" + } + }, + "fa_icon": "fas fa-filter" }, "taxonomic_database": { "title": "Taxonomic database", "type": "object", - "description": "", "default": "", "properties": { "dada_ref_taxonomy": { @@ -379,7 +465,7 @@ }, "kraken2_confidence": { "type": "number", - "default": 0, + "default": 0.0, "help_text": "Increasing the threshold will require more k-mers to match at a taxonomic levels and reduce the taxonomic levels shown until the threshold is met.", "description": "Confidence score threshold for taxonomic classification.", "minimum": 0, @@ -425,74 +511,9 @@ "asv_filtering": { "title": "ASV filtering", "type": "object", - "description": "", "default": "", + "fa_icon": "fas fa-filter", "properties": { - "vsearch_cluster": { - "type": "boolean", - "description": "Post-cluster ASVs with VSEARCH", - "help_text": "ASVs will be clustered with VSEARCH using the id value found in `--vsearch_cluster_id`." - }, - "vsearch_cluster_id": { - "type": "number", - "default": 0.97, - "minimum": 0.0, - "maximum": 1.0, - "description": "Pairwise Identity value used when post-clustering ASVs if `--vsearch_cluster` option is used (default: 0.97).", - "help_text": "Lowering or increasing this value can change the number ASVs left over after clustering." - }, - "filter_ssu": { - "type": "string", - "description": "Enable SSU filtering. Comma separated list of kingdoms (domains) in Barrnap, a combination (or one) of \"bac\", \"arc\", \"mito\", and \"euk\". 
ASVs that have their lowest evalue in that kingdoms are kept.", - "enum": [ - "bac,arc,mito,euk", - "bac", - "arc", - "mito", - "euk", - "bac,arc", - "bac,mito", - "bac,euk", - "arc,mito", - "arc,euk", - "mito,euk", - "bac,arc,mito", - "bac,mito,euk", - "arc,mito,euk" - ] - }, - "min_len_asv": { - "type": "integer", - "description": "Minimal ASV length", - "help_text": "Remove ASV that are below the minimum length threshold (default: filter is disabled, otherwise 1). Increasing the threshold might reduce false positive ASVs (e.g. PCR off-targets)." - }, - "max_len_asv": { - "type": "integer", - "description": "Maximum ASV length", - "help_text": "Remove ASV that are above the maximum length threshold (default: filter is disabled, otherwise 1000000). Lowering the threshold might reduce false positive ASVs (e.g. PCR off-targets)." - }, - "filter_codons": { - "type": "boolean", - "description": "Filter ASVs based on codon usage", - "help_text": "ASVs will be filtered to contain no stop codon in their coding sequence and that their length is a multiple of 3." - }, - "orf_start": { - "type": "integer", - "default": 1, - "description": "Starting position of codon tripletts", - "help_text": "By default, when `--filter_codons` is set, the codons start from the first position of the ASV sequences. The start of the codons can be changed to any position." - }, - "orf_end": { - "type": "integer", - "description": "Ending position of codon tripletts", - "help_text": "By default, when `--filter_codons` is set, the codons are checked until the end of the ASV sequences. If you would like to change this setting, you can specify until which position of the ASV sequences the codon triplets are checked.\n\nPlease note that the length of the ASV from the beginning or from the `--orf_start` until this position must be a multiple of 3." 
- }, - "stop_codons": { - "type": "string", - "default": "TAA,TAG", - "description": "Define stop codons", - "help_text": "By default, when `--filter_codons` is set, the codons `TAA,TAG` are set as stop codons. Here you can specify any comma-separated list of codons to be used as stop codons, e.g. `--stop_codons \"TAA,TAG,TGA\"`" - }, "exclude_taxa": { "type": "string", "default": "mitochondria,chloroplast", @@ -511,15 +532,14 @@ "description": "Prevalence filtering", "help_text": "Filtering low prevalent features from the feature table, e.g. keeping only features that are present in at least two samples can be achived by choosing a value of 2 (default: 1, meaning filter is disabled). Typically only used when having replicates for all samples.\n\nFor example to retain features that are present in at least two sample:\n\n```bash\n--min_samples 2\n```\n\nPlease note this is independent of abundance." } - }, - "fa_icon": "fas fa-filter" + } }, "downstream_analysis": { "title": "Downstream analysis", "type": "object", "description": "", "default": "", - "fa_icon": "fas fa-filter", + "fa_icon": "fas fa-bacteria", "properties": { "metadata_category": { "type": "string", @@ -600,10 +620,10 @@ }, "report_abstract": { "type": "string", - "default": null, "description": "Path to Markdown file (md) that replaces the 'Abstract' section" } - } + }, + "fa_icon": "fas fa-book-open" }, "skipping_specific_steps": { "title": "Skipping specific steps", @@ -671,7 +691,8 @@ "type": "boolean", "description": "Skip Markdown summary report" } - } + }, + "fa_icon": "fas fa-hand-paper" }, "generic_options": { "title": "Generic options", @@ -892,6 +913,9 @@ { "$ref": "#/definitions/amplicon_sequence_variants_asv_calculation" }, + { + "$ref": "#/definitions/asv_post_processing" + }, { "$ref": "#/definitions/taxonomic_database" },