Merge pull request #567 from d4straub/use-phylogenetic-placement-outp…

…ut-in-qiime use phylogenetic placement output in qiime
nf-core · Apr 5, 2023 · ebbce9e · ebbce9e
2 parents 7c57170 + e112d39
commit ebbce9e
Show file tree

Hide file tree

Showing 14 changed files with 199 additions and 58 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,11 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
-- [#564](https://github.com/nf-core/ampliseq/pull/564) - Added phylogenetic placement
+- [#564](https://github.com/nf-core/ampliseq/pull/564),[#567](https://github.com/nf-core/ampliseq/pull/567) - Added phylogenetic placement
 
 ### `Changed`
 
 - [#563](https://github.com/nf-core/ampliseq/pull/563) - Renamed DADA2 taxonomic classification files to include the chosen reference taxonomy abbreviation.
+- [#567](https://github.com/nf-core/ampliseq/pull/567) - Renamed `--dada_tax_agglom_min` and `--qiime_tax_agglom_min` to `--tax_agglom_min` and `--dada_tax_agglom_max` and `--qiime_tax_agglom_max` to `--tax_agglom_max`
 
 ### `Fixed`
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -69,7 +69,9 @@
 
 ### Phylogenetic placement
 
-- nf-core/phyloplace (https://github.com/nf-core/phyloplace, https://nf-co.re/phyloplace) was originally written by Daniel Lundin.
+- [nf-core/phyloplace](https://nf-co.re/phyloplace)
+
+  > Daniel Lundin. (2023). nf-core/phyloplace: First release (1.0.0). Zenodo. https://doi.org/10.5281/zenodo.7643948
 
 - [HMMER](https://pubmed.ncbi.nlm.nih.gov/22039361/)
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -573,14 +573,22 @@ process {
         ]
     }
 
-    withName: 'QIIME2_INASV|QIIME2_INSEQ|QIIME2_INTAX' {
+    withName: 'QIIME2_INASV|QIIME2_INSEQ|QIIME2_INTAX|QIIME2_INTREE' {
         publishDir = [
             path: { "${params.outdir}/qiime2/input" },
             mode: params.publish_dir_mode,
             pattern: "*.qza"
         ]
     }
 
+    withName: FORMAT_PPLACETAX {
+        publishDir = [
+            path: { "${params.outdir}/pplace" },
+            mode: params.publish_dir_mode,
+            pattern: "*.per_query_unique.tsv"
+        ]
+    }
+
     withName: 'QIIME2_INASV_BPAVG' {
         publishDir = [
             path: { "${params.outdir}/qiime2/input" },

diff --git a/conf/test.config b/conf/test.config
@@ -38,8 +38,7 @@ params {
     metadata_category_barplot = "treatment1,badpairwise10"
 
     //restrict ANCOM analysis to higher taxonomic levels
-    dada_tax_agglom_max = 4
-    qiime_tax_agglom_max = 4
+    tax_agglom_max = 4
 
     sbdiexport = true
 

diff --git a/conf/test_pplace.config b/conf/test_pplace.config
@@ -28,19 +28,23 @@ params {
     qiime_ref_taxonomy = "greengenes85"
     filter_ssu = "bac"
 
-    //this is to remove low abundance ASVs to reduce runtime of downstream processes
+    // this is to remove low abundance ASVs to reduce runtime of downstream processes
     min_samples = 2
     min_frequency = 10
 
-    //pplace
+    // pplace
     pplace_tree = "https://github.com/nf-core/test-datasets/raw/phyloplace/testdata/cyanos_16s.newick"
     pplace_aln = "https://github.com/nf-core/test-datasets/raw/phyloplace/testdata/cyanos_16s.alnfna"
     pplace_model = "GTR+F+I+I+R3"
     pplace_taxonomy = "https://github.com/nf-core/test-datasets/raw/phyloplace/testdata/cyanos_16s.taxonomy.tsv"
     pplace_name = "test_pplace"
 
+    // Adjust taxonomic levels
+    tax_agglom_min = 1
+    tax_agglom_max = 3
 
-    //Skip some steps to reduce runtime
+    // Skip some steps to reduce runtime
     skip_alpha_rarefaction = true
     skip_fastqc = true
+    skip_ancom = true
 }
diff --git a/docs/output.md b/docs/output.md
@@ -242,6 +242,7 @@ Phylogenetic placement grafts sequences onto a phylogenetic reference tree and o
 - `pplace/`
   - `*.graft.*.epa_result.newick`: Full phylogeny with query sequences grafted on to the reference phylogeny, in newick format.
   - `*.taxonomy.per_query.tsv`: Tab separated file with taxonomy information per query from classification by `gappa examine examinassign`
+  - `*.per_query_unique.tsv`: Tab separated file with taxonomy information as above, but one row per query, by filtering for lowest LWR (likelihood weight ratio)
   - `*.heattree.tree.svg`: Heattree in SVG format from calling `gappa examine heattree`, see [Gappa documentation](https://github.com/Pbdas/epa-ng/blob/master/README.md) for details.
   - `pplace/hmmer/`: Contains intermediatary files if HMMER is used
   - `pplace/mafft/`: Contains intermediatary files if MAFFT is used
@@ -270,7 +271,7 @@ Intermediate data imported to QIIME2 is saved as QIIME2 fragments, that can be c
 
 #### Taxonomic classification
 
-Taxonomic classification with QIIME2 is typically similar to DADA2 classifications. However, both options are available. When taxonomic classification with DADA2 and QIIME2 is performed, DADA2 classification takes precedence over QIIME2 classifications for all downstream analysis.
+Taxonomic classification with QIIME2 is typically similar to DADA2 classifications. However, both options are available. When taxonomic classification with DADA2 and QIIME2 is performed, DADA2 classification takes precedence over QIIME2 classifications for all downstream analysis. Taxonomic classification by phylogenetic placement superseeds DADA2 and QIIME2 classification.
 
 <details markdown="1">
 <summary>Output files</summary>
@@ -284,7 +285,7 @@ Taxonomic classification with QIIME2 is typically similar to DADA2 classificatio
 
 #### Abundance tables
 
-The abundance tables are the final data for further downstream analysis and visualisations. The tables are based on the computed ASVs and taxonomic classification (DADA2 classification takes precedence over QIIME2 classifications), but after removal of unwanted taxa. Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA (can be specified by `--exclude_taxa`), by default mitrochondria and chloroplast sequences are removed because these are frequent unwanted non-bacteria PCR products.
+The abundance tables are the final data for further downstream analysis and visualisations. The tables are based on the computed ASVs and taxonomic classification (in the following priotity: phylogenetic placement [EPA-NG, Gappa], DADA2, QIIME2), but after removal of unwanted taxa. Unwanted taxa are often off-targets generated in PCR with primers that are not perfectly specific for the target DNA (can be specified by `--exclude_taxa`), by default mitrochondria and chloroplast sequences are removed because these are frequent unwanted non-bacteria PCR products.
 
 All following analysis is based on these filtered tables.
 
@@ -317,6 +318,7 @@ Absolute abundance tables produced by the previous steps contain count data, but
   - `rel-table-ASV.tsv`: Tab-separated relative abundance table for all ASVs.
   - `rel-table-ASV_with-DADA2-tax.tsv`: Tab-separated table for all ASVs with DADA2 taxonomic classification, sequence and relative abundance.
   - `rel-table-ASV_with-QIIME2-tax.tsv`: Tab-separated table for all ASVs with QIIME2 taxonomic classification, sequence and relative abundance.
+  - `rel-table-ASV_with-PPLACE-tax.tsv`: Tab-separated table for all ASVs with EPA-NG - Gappa taxonomic classification, sequence and relative abundance.
 
 </details>
 

diff --git a/lib/WorkflowAmpliseq.groovy b/lib/WorkflowAmpliseq.groovy
@@ -43,13 +43,8 @@ class WorkflowAmpliseq {
             System.exit(1)
         }
 
-        if (params.dada_tax_agglom_min > params.dada_tax_agglom_max) {
-            log.error "Incompatible parameters: `--dada_tax_agglom_min` may not be greater than `--dada_tax_agglom_max`."
-            System.exit(1)
-        }
-
-        if (params.qiime_tax_agglom_min > params.qiime_tax_agglom_max) {
-            log.error "Incompatible parameters: `--qiime_tax_agglom_min` may not be greater than `--qiime_tax_agglom_max`."
+        if (params.tax_agglom_min > params.tax_agglom_max) {
+            log.error "Incompatible parameters: `--tax_agglom_min` may not be greater than `--tax_agglom_max`."
             System.exit(1)
         }
 

diff --git a/modules/local/format_pplacetax.nf b/modules/local/format_pplacetax.nf
@@ -0,0 +1,96 @@
+process FORMAT_PPLACETAX {
+    tag "${tax.baseName}"
+    label 'process_high'
+
+    conda "bioconda::bioconductor-dada2=1.22.0 conda-forge::r-digest=0.6.30"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bioconductor-dada2:1.22.0--r41h399db7b_0' :
+        'quay.io/biocontainers/bioconductor-dada2:1.22.0--r41h399db7b_0' }"
+
+    input:
+    tuple val(meta), path(tax)
+
+    output:
+    path("*.per_query_unique.tsv"), emit: unique
+    path("*.taxonomy.tsv")        , emit: tsv
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    #!/usr/bin/env Rscript
+
+    tax = read.table("$tax", header = TRUE, sep = "\t", stringsAsFactors = FALSE, comment.char = '', quote = '')
+
+    df <- data.frame(
+        name=character(),
+        LWR=character(),
+        fract=character(),
+        aLWR=character(),
+        afract=character(),
+        taxopath=character(),
+        stringsAsFactors=FALSE)
+
+    for (asvid in unique(tax\$name)) {
+        # subset per ASV ID
+        temp = subset(tax, tax\$name == asvid)
+        maxLWR = max(temp\$LWR)
+        temp = subset(temp, temp\$LWR == maxLWR)
+
+        if ( nrow(temp) == 1 ) {
+            # STEP 1: if there is just 1 maximum, choose this row
+            print( paste ( asvid,"was added in STEP 1" ) )
+            temp = temp[which.max(temp\$LWR),]
+            df <- rbind(df, temp)
+        } else {
+            # STEP2: be conservative, choose the taxonomy with less entries
+            # count number of entries in "taxopath", simplified: number of semicolons
+            temp\$taxlevels = nchar( temp\$taxopath ) - nchar(gsub(';', '', temp\$taxopath )) + 1
+            minTaxlevels = min(temp\$taxlevels)
+            temp = subset(temp, temp\$taxlevels == minTaxlevels)
+            temp\$taxlevels = NULL
+            if ( nrow(temp) == 1 ) {
+                df <- rbind(df, temp)
+                print( paste ( asvid,"was added in STEP 2" ) )
+            } else {
+                # STEP 3: if number of entries is same, remove last entry
+                # then, check if reduced entries are identical > if yes, choose any row, if no, repeat
+                # at that step the taxonomies have same length
+                print( paste ( asvid,"enters STEP 3" ) )
+                list_taxonpath <- str_split( temp\$taxopath, ";")
+                df_taxonpath <- as.data.frame(do.call(rbind, list_taxonpath))
+                for (i in ncol(df_taxonpath):0) {
+                    # choose first column and change taxon to reduced overlap
+                    if ( length(unique(df_taxonpath[,i])) == 1 ) {
+                        if (i>1) {
+                            temp\$taxopath <- apply(df_taxonpath[1,1:i], 1, paste, collapse=";")[[1]]
+                        } else if (i==1) {
+                            temp\$taxopath <- df_taxonpath[1,1]
+                        }
+                        df <- rbind(df, temp[1,])
+                        print( paste ( asvid,"was added with",temp\$taxopath[[1]],"in STEP 3a" ) )
+                        break
+                    } else if (i==0) {
+                        # if all fails, i.e. there is no consensus on any taxonomic level, use NA
+                        temp\$taxopath <- "NA"
+                        df <- rbind(df, temp[1,])
+                        print( paste ( asvid,"was added with",temp\$taxopath[[1]],"in STEP 3b" ) )
+                    }
+                }
+            }
+        }
+    }
+
+    # output the cleaned, unified, unique taxonomic classification per ASV
+    write.table(df, file = "${meta.id}.taxonomy.per_query_unique.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, na = '', sep = "\\t")
+
+    # output ASV taxonomy table for QIIME2
+    df_clean <- subset(df, select = c("name","taxopath"))
+    colnames(df_clean) <- c("ASV_ID","taxonomy")
+    write.table(df_clean, file = "${meta.id}.taxonomy.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, na = '', sep = "\\t")
+
+    writeLines(c("\\"${task.process}\\":", paste0("    R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0("    dada2: ", packageVersion("dada2")) ), "versions.yml")
+    """
+}
diff --git a/modules/local/qiime2_intree.nf b/modules/local/qiime2_intree.nf
@@ -0,0 +1,34 @@
+process QIIME2_INTREE {
+    tag "${meta.id}:${meta.model}"
+    label 'process_low'
+
+    container "quay.io/qiime2/core:2022.11"
+
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        exit 1, "QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+
+    input:
+    tuple val(meta), path(tree)
+
+    output:
+    path("tree.qza")   , emit: qza
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    qiime tools import \\
+        --type 'Phylogeny[Rooted]' \\
+        --input-path $tree \\
+        --output-path tree.qza
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        qiime2: \$( qiime --version | sed '1!d;s/.* //' )
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -47,10 +47,8 @@ params {
     picrust           = false
     sbdiexport        = false
     addsh             = false
-    dada_tax_agglom_min = 2
-    dada_tax_agglom_max = 6
-    qiime_tax_agglom_min = 2
-    qiime_tax_agglom_max = 6
+    tax_agglom_min = 2
+    tax_agglom_max = 6
     ignore_failed_trimming = false
     ignore_empty_input_files = false
     qiime_adonis_formula = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -280,7 +280,7 @@
                 },
                 "pplace_taxonomy": {
                     "type": "string",
-                    "help_text": "Headerless, tab-separated, first column with tree leaves, second column with taxonomy ranks separated by semicolon `;`.",
+                    "help_text": "Headerless, tab-separated, first column with tree leaves, second column with taxonomy ranks separated by semicolon `;`. The results take precedence over DADA2 and QIIME2 classifications.",
                     "description": "Tab-separated file with taxonomy assignments of reference sequences."
                 },
                 "pplace_name": {
@@ -426,33 +426,19 @@
                     "description": "Minimum sample counts to retain a sample for ANCOM analysis. Any sample below that threshold will be removed.",
                     "fa_icon": "fas fa-greater-than-equal"
                 },
-                "dada_tax_agglom_min": {
+                "tax_agglom_min": {
                     "type": "integer",
                     "default": 2,
-                    "description": "Minimum taxonomy agglomeration level for DADA2 classification",
+                    "description": "Minimum taxonomy agglomeration level for taxonomic classifications",
                     "fa_icon": "fas fa-greater-than-equal",
                     "help_text": "Depends on the reference taxonomy database used."
                 },
-                "dada_tax_agglom_max": {
+                "tax_agglom_max": {
                     "type": "integer",
                     "default": 6,
-                    "description": "Maximum taxonomy agglomeration level for DADA2 classification",
+                    "description": "Maximum taxonomy agglomeration level for taxonomic classifications",
                     "fa_icon": "fas fa-greater-than-equal",
                     "help_text": "Depends on the reference taxonomy database used. Most default databases have genus level at 6."
-                },
-                "qiime_tax_agglom_min": {
-                    "type": "integer",
-                    "default": 2,
-                    "description": "Minimum taxonomy agglomeration level for QIIME2 classification",
-                    "fa_icon": "fas fa-greater-than-equal",
-                    "help_text": "Depends on the reference taxonomy database used."
-                },
-                "qiime_tax_agglom_max": {
-                    "type": "integer",
-                    "default": 6,
-                    "description": "Maximum taxonomy agglomeration level for QIIME2 classification",
-                    "fa_icon": "fas fa-greater-than-equal",
-                    "help_text": "Depends on the reference taxonomy database used. Default databases should have genus level at 6."
                 }
             }
         },

diff --git a/subworkflows/local/qiime2_diversity.nf b/subworkflows/local/qiime2_diversity.nf
@@ -15,6 +15,7 @@ workflow QIIME2_DIVERSITY {
     ch_metadata
     ch_asv
     ch_seq
+    ch_tree
     ch_stats //QIIME2_FILTERTAXA.out.tsv
     ch_metacolumn_pairwise //METADATA_PAIRWISE.out
     ch_metacolumn_all //METADATA_ALL.out
@@ -24,17 +25,20 @@ workflow QIIME2_DIVERSITY {
 
     main:
     //Phylogenetic tree for beta & alpha diversities
-    QIIME2_TREE ( ch_seq )
+    if (!ch_tree) {
+        QIIME2_TREE ( ch_seq )
+        ch_tree = QIIME2_TREE.out.qza
+    }
 
     //Alpha-rarefaction
     if (!skip_alpha_rarefaction) {
-        QIIME2_ALPHARAREFACTION ( ch_metadata, ch_asv, QIIME2_TREE.out.qza, ch_stats )
+        QIIME2_ALPHARAREFACTION ( ch_metadata, ch_asv, ch_tree, ch_stats )
     }
 
     //Calculate diversity indices
     if (!skip_diversity_indices) {
 
-        QIIME2_DIVERSITY_CORE ( ch_metadata, ch_asv, QIIME2_TREE.out.qza, ch_stats, diversity_rarefaction_depth )
+        QIIME2_DIVERSITY_CORE ( ch_metadata, ch_asv, ch_tree, ch_stats, diversity_rarefaction_depth )
         //Print warning if rarefaction depth is <10000
         QIIME2_DIVERSITY_CORE.out.depth.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn it.baseName.toString().replace("WARNING ","QIIME2_DIVERSITY_CORE: ") }
 

diff --git a/subworkflows/local/qiime2_export.nf b/subworkflows/local/qiime2_export.nf
@@ -7,6 +7,7 @@ include { QIIME2_EXPORT_RELASV                  } from '../../modules/local/qiim
 include { QIIME2_EXPORT_RELTAX                  } from '../../modules/local/qiime2_export_reltax'
 include { COMBINE_TABLE as COMBINE_TABLE_QIIME2 } from '../../modules/local/combine_table'
 include { COMBINE_TABLE as COMBINE_TABLE_DADA2  } from '../../modules/local/combine_table'
+include { COMBINE_TABLE as COMBINE_TABLE_PPLACE } from '../../modules/local/combine_table'
 
 workflow QIIME2_EXPORT {
     take:
@@ -15,6 +16,7 @@ workflow QIIME2_EXPORT {
     ch_tax
     ch_QIIME2_tax_tsv
     ch_DADA2_tax_tsv
+    ch_pplace_tax_tsv
     tax_agglom_min
     tax_agglom_max
 
@@ -34,6 +36,9 @@ workflow QIIME2_EXPORT {
     //combine_table.r (optional), similar to DADA2_table.tsv but with additionally taxonomy merged
     COMBINE_TABLE_DADA2 ( QIIME2_EXPORT_RELASV.out.tsv, QIIME2_EXPORT_ABSOLUTE.out.fasta, ch_DADA2_tax_tsv, 'rel-table-ASV_with-DADA2-tax.tsv' )
 
+    //combine_table.r (optional), similar to DADA2_table.tsv but with additionally taxonomy merged
+    COMBINE_TABLE_PPLACE ( QIIME2_EXPORT_RELASV.out.tsv, QIIME2_EXPORT_ABSOLUTE.out.fasta, ch_pplace_tax_tsv, 'rel-table-ASV_with-PPLACE-tax.tsv' )
+
     emit:
     abs_fasta      = QIIME2_EXPORT_ABSOLUTE.out.fasta
     abs_tsv        = QIIME2_EXPORT_ABSOLUTE.out.tsv