Merge branch 'master' into gene-order-workflow

* master: feat: Add recombination entry (#175) docs: Add evolCCM usage documentation feat: Add evolCCM entry (#174) docs: Update rSPR outputs (#173) fix: Stop ppanggolin msa from altering input file (#172) feat: Add rspr entry (#171) refactor: Ignore time reached error in rspr_exact (#170)
beiko-lab · Nov 10, 2023 · bc0cd3c · bc0cd3c
2 parents b5fdfd7 + 710ba25
commit bc0cd3c
Show file tree

Hide file tree

Showing 12 changed files with 238 additions and 18 deletions.
diff --git a/bin/rspr_approx.py b/bin/rspr_approx.py
@@ -36,6 +36,7 @@ def parse_args(args=None):
         "--annotation",
         dest="ANNOTATION",
         help="Annotation table from BAKTA/PROKKA",
+        nargs="?",
     )
     parser.add_argument(
         "-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
@@ -295,7 +296,8 @@ def main(args=None):
     make_heatmap(results, fig_path)
 
     results.reset_index("file_name", inplace=True)
-    results = join_annotation_data(results, args.ANNOTATION)
+    if args.ANNOTATION:
+        results = join_annotation_data(results, args.ANNOTATION)
     res_path = os.path.join(args.OUTPUT_DIR, "output.tsv")
     df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
     df_with_groups.to_csv(res_path, sep="\t", index=False)

diff --git a/conf/modules.config b/conf/modules.config
@@ -440,6 +440,7 @@ process {
     }
 
     withName: RSPR_EXACT {
+        errorStrategy = { task.exitStatus in [140] ? 'ignore' : 'retry' }
         publishDir = [
             path: { "${params.outdir}/dynamics/rSPR/exact" },
             mode: params.publish_dir_mode,

diff --git a/docs/output.md b/docs/output.md
@@ -296,6 +296,8 @@ See [the PPanGGoLiN documentation](https://github.com/labgem/PPanGGOLiN/wiki/Out
 
 #### _rSPR_
 
+The outputs are approximate and exact Subtree Prune and Regraft (rSPR) distances between pairs of rooted phylogenetic trees. Each CSV file contains these distances and the tree sizes. The PNG files are heatmaps of these distances and their respective tree sizes.
+
 - `dynamics/rSPR/`
 
       - `approx` - Approximate rSPR distances

diff --git a/docs/params.md b/docs/params.md
@@ -93,8 +93,11 @@ Parameters for the recombination subworkflow
 | `run_rspr` | Run rSPR | `boolean` |  |  |  |
 | `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 |  |  |
 | `min_branch_length` | Minimum rSPR branch length | `integer` | 0 |  |  |
-| `max_support_threshold` | Maximum rSPR support threshold | `integer` | 0 |  |  |
+| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 |  |  |
 | `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 |  |  |
+| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR and evolCCM entries. | `string` |  |  |  |
+| `concatenated_annotation` | TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE. | `string` |  |  |  |
+| `feature_profile` | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry. | `string` |  |  |  |
 
 ## Institutional config options
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -149,6 +149,77 @@ To execute phylogenomic and pangenomics analysis on pre-existing assemblies:
 nextflow run beiko-lab/ARETE -entry phylogenomics --input_sample_table samplesheet.csv -profile docker
 ```
 
+### rSPR Entry
+
+To execute the rSPR analysis on pre-existing trees:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry rspr \
+  --input_sample_table samplesheet.csv \
+  --core_gene_tree core_gene_alignment.tre \
+  --concatenated_annotation BAKTA.txt \
+  -profile docker
+```
+
+The parameters being:
+
+- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by panaroo in ARETE.
+- `--concatenated_annotation` - The tabular annotation results (TSV) for all genomes, like the ones generated at the end of Prokka or Bakta in ARETE. Although useful, it's not necessary to execute the rSPR entry.
+- `--input_sample_table` - A samplesheet containing all individual gene trees in the following format:
+
+`gene_tree,path
+CDS_0000,/path/to/CDS_0000.tre
+CDS_0001,/path/to/CDS_0001.tre
+CDS_0002,/path/to/CDS_0002.tre
+CDS_0003,/path/to/CDS_0003.tre
+CDS_0004,/path/to/CDS_0004.tre
+`
+
+### evolCCM Entry
+
+To execute the evolCCM analysis on a pre-existing reference tree and feature profile:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry evolccm \
+  --core_gene_tree core_gene_alignment.tre \
+  --feature_profile feature_profile.tsv.gz \
+  -profile docker
+```
+
+The parameters being:
+
+- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by panaroo in ARETE.
+- `--feature_profile` - A presence/absence TSV matrix of features
+  in genomes. Genome names should be the same in the core tree and
+  should be contained to a 'genome_id' column, with all other columns represent features absent (0) or present (1) in each genome. I.e.:
+
+```
+genome_id	plasmid_AA155	plasmid_AA161
+ED010	0	0
+ED017	0	1
+ED040	0	0
+ED073	0	1
+ED075	1	1
+ED082	0	1
+ED142	0	1
+ED178	0	1
+ED180	0	0
+```
+
+### Recombination Entry
+
+To execute the recombination analysis on pre-existing assemblies (PopPUNK model can be either bgmm, dbscan, refine, threshold or lineage):
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry recombination \
+  --input_sample_table samplesheet.csv \
+  --poppunk_model dbscan \
+  -profile docker
+```
+
 ## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

diff --git a/lib/WorkflowArete.groovy b/lib/WorkflowArete.groovy
@@ -66,11 +66,6 @@ class WorkflowArete {
         // Check the hostnames against configured profiles
         NfcoreTemplate.hostName(workflow, params, log)
 
-        // Check input has been provided
-        if (!params.input_sample_table) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input_sample_table samplesheet.csv'"
-            System.exit(1)
-        }
     }
 
     //

diff --git a/main.nf b/main.nf
@@ -37,7 +37,9 @@ include { ANNOTATION } from './workflows/arete'
 include { PHYLO } from './workflows/arete'
 include { QUALITYCHECK } from './workflows/arete'
 include { POPPUNK } from './workflows/arete'
-
+include { RUN_RSPR } from './workflows/arete'
+include { RUN_EVOLCCM } from './workflows/arete'
+include { RUN_RECOMBINATION } from './workflows/arete'
 
 //
 // WORKFLOW: Run main nf-core/arete analysis pipeline
@@ -68,6 +70,17 @@ workflow poppunk {
     POPPUNK()
 }
 
+workflow rspr {
+    RUN_RSPR()
+}
+
+workflow evolccm {
+    RUN_EVOLCCM()
+}
+
+workflow recombination {
+    RUN_RECOMBINATION()
+}
 /*
 ========================================================================================
     RUN ALL WORKFLOWS

diff --git a/modules/local/ppanggolin/msa/main.nf b/modules/local/ppanggolin/msa/main.nf
@@ -12,8 +12,7 @@ process PPANGGOLIN_MSA {
     tuple val(meta), path(pangenome)
 
     output:
-    tuple val(meta), path("${prefix}_msa")    , emit: results
-    path "${prefix}_msa/msa_all_protein/*.aln", emit: alignments
+    path "ppanggolin_msa/msa_all_protein/*.aln", emit: alignments
     path "versions.yml"                       , emit: versions
 
     when:
@@ -23,12 +22,14 @@ process PPANGGOLIN_MSA {
     def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${meta.id}"
     """
+    cp $pangenome copied_pangenome.h5
+
     ppanggolin \\
         msa \\
         $args \\
         --cpu $task.cpus \\
-        --pangenome $pangenome \\
-        --output "${prefix}"_msa \\
+        --pangenome copied_pangenome.h5 \\
+        --output ppanggolin_msa \\
         --partition all
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/nextflow.config b/nextflow.config
@@ -79,6 +79,11 @@ params {
     max_support_threshold      = 0.7
     max_approx_rspr            = -1
 
+    // rSPR/evolCCM entries
+    core_gene_tree             = null
+    concatenated_annotation    = null
+    feature_profile            = null
+
     // MultiQC options
     multiqc_config             = null
     multiqc_title              = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -10,7 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input_sample_table"],
+            "required": [],
             "properties": {
                 "input_sample_table": {
                     "type": "string",
@@ -217,7 +217,7 @@
                 },
                 "accessory_similarity": {
                     "type": "number",
-                    "default": 99,
+                    "default": 99.0,
                     "fa_icon": "far fa-clone",
                     "description": "Similarity threshold for accessory genes"
                 }
@@ -331,6 +331,18 @@
                     "type": "integer",
                     "default": -1,
                     "description": "Maximum approximate rSPR distance for filtering"
+                },
+                "core_gene_tree": {
+                    "type": "string",
+                    "description": "Core (or reference) genome tree. Used in the rSPR and evolCCM entries."
+                },
+                "concatenated_annotation": {
+                    "type": "string",
+                    "description": "TSV table of annotations for all genomes. Such as the ones generated by Bakta or Prokka in ARETE."
+                },
+                "feature_profile": {
+                    "type": "string",
+                    "description": "Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry."
                 }
             },
             "fa_icon": "fas fa-bezier-curve"

diff --git a/subworkflows/local/rspr_input_check.nf b/subworkflows/local/rspr_input_check.nf
@@ -0,0 +1,27 @@
+workflow RSPR_INPUT_CHECK {
+    take:
+    samplesheet
+
+    main:
+    samplesheet
+        .splitCsv(header: true)
+        .map { it -> get_sample_info_rspr(it.path) }
+        .set { trees }
+
+    emit:
+    trees
+}
+
+def get_sample_info_rspr(row) {
+
+    def array = []
+    if (!file(row).exists()) {
+        print("***")
+        print(row)
+        print("***")
+        exit 1, "ERROR: Please check input samplesheet -> Tree file does not exist!\n${row.path}"
+    }
+    array = [ file(row)]
+
+    return array
+}
diff --git a/workflows/arete.nf b/workflows/arete.nf
@@ -11,10 +11,6 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 def checkPathParamList = [ params.input_sample_table, params.multiqc_config, params.reference_genome ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-// Check mandatory parameters
-if (params.input_sample_table) { ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
-
-
 /*
 ========================================================================================
     CONFIG FILES
@@ -38,6 +34,7 @@ ch_multiqc_logo            = params.multiqc_logo   ? Channel.fromPath( params.mu
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
 include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
+include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
 include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
 include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
 include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
@@ -595,6 +592,97 @@ workflow POPPUNK {
     )
 }
 
+
+workflow RUN_RSPR {
+    if (params.input_sample_table) { ch_input = Channel.of(file(params.input_sample_table)) } else { exit 1, 'Input samplesheet not specified!' }
+    if (params.core_gene_tree) { ch_core = file(params.core_gene_tree) } else { exit 1, 'Core tree not specified!' }
+    ch_annotation_data = params.concatenated_annotation ? file(params.concatenated_annotation) : []
+
+    RSPR_INPUT_CHECK (
+        ch_input
+    )
+
+    RSPR (
+        ch_core,
+        RSPR_INPUT_CHECK.out.trees,
+        ch_annotation_data
+    )
+}
+
+workflow RUN_EVOLCCM {
+    if (params.core_gene_tree) { ch_core = file(params.core_gene_tree) } else { exit 1, 'Core tree not specified!' }
+    if (params.feature_profile) { ch_input = file(params.feature_profile) } else { exit 1, 'Input feature profile not specified!' }
+
+    EVOLCCM (
+        ch_core,
+        ch_input
+    )
+}
+
+workflow RUN_RECOMBINATION {
+    if (params.input_sample_table){ ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
+    if (params.reference_genome) {
+        ch_reference_genome = file(params.reference_genome)
+        use_reference_genome = true
+        }
+    else {
+        ch_reference_genome = []
+        use_reference_genome = false
+    }
+    if (params.poppunk_model == null) { exit 1, 'A model must be specified with --poppunk_model in order to run PopPunk' }
+    ch_software_versions = Channel.empty()
+
+    ANNOTATION_INPUT_CHECK(ch_input)
+    ANNOTATION_INPUT_CHECK.out.genomes.set { assemblies }
+
+    CHECK_ASSEMBLIES(
+        assemblies,
+        ch_reference_genome,
+        use_reference_genome
+    )
+    ch_software_versions = ch_software_versions.mix(CHECK_ASSEMBLIES.out.assemblyqc_software)
+
+    if (params.apply_filtering) {
+        CHECK_ASSEMBLIES.out.assemblies
+            .set { assemblies }
+    }
+
+    RUN_POPPUNK(assemblies)
+    ch_software_versions = ch_software_versions.mix(RUN_POPPUNK.out.poppunk_version)
+
+
+    RECOMBINATION (
+        assemblies,
+        RUN_POPPUNK.out.clusters,
+        CHECK_ASSEMBLIES.out.quast_report
+    )
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_software_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
+
+    /*
+     * MODULE: MultiQC
+     */
+    workflow_summary    = WorkflowArete.paramsSummaryMultiqc(workflow, summary_params)
+    ch_workflow_summary = Channel.value(workflow_summary)
+
+    //Mix QUAST results into one report file
+    ch_multiqc_files = Channel.empty()
+    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
+    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
+    ch_multiqc_files = ch_multiqc_files.mix(CHECK_ASSEMBLIES.out.multiqc)
+
+    MULTIQC(
+        ch_multiqc_files.collect(),
+        ch_multiqc_config.collect().ifEmpty([]),
+        ch_multiqc_custom_config.collect().ifEmpty([]),
+        ch_multiqc_logo.collect().ifEmpty([])
+    )
+    multiqc_report       = MULTIQC.out.report.toList()
+    ch_software_versions = ch_software_versions.mix(MULTIQC.out.versions.ifEmpty(null))
+
+}
 /*
 ========================================================================================
     COMPLETION EMAIL AND SUMMARY