diff --git a/bin/rspr_approx.py b/bin/rspr_approx.py
index 89c2db71..40856800 100755
--- a/bin/rspr_approx.py
+++ b/bin/rspr_approx.py
@@ -36,6 +36,7 @@ def parse_args(args=None):
         "--annotation",
         dest="ANNOTATION",
         help="Annotation table from BAKTA/PROKKA",
+        nargs="?",
     )
     parser.add_argument(
         "-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
@@ -295,7 +296,8 @@ def main(args=None):
     make_heatmap(results, fig_path)
     results.reset_index("file_name", inplace=True)
 
-    results = join_annotation_data(results, args.ANNOTATION)
+    if args.ANNOTATION:
+        results = join_annotation_data(results, args.ANNOTATION)
     res_path = os.path.join(args.OUTPUT_DIR, "output.tsv")
     df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
     df_with_groups.to_csv(res_path, sep="\t", index=False)
diff --git a/conf/modules.config b/conf/modules.config
index 6100928a..43234de7 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -440,6 +440,7 @@ process {
     }
 
     withName: RSPR_EXACT {
+        errorStrategy = { task.exitStatus in [140] ? 'ignore' : 'retry' }
         publishDir = [
             path: { "${params.outdir}/dynamics/rSPR/exact" },
             mode: params.publish_dir_mode,
diff --git a/docs/output.md b/docs/output.md
index af62faf8..20d0ba29 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -296,6 +296,8 @@ See [the PPanGGoLiN documentation](https://github.com/labgem/PPanGGOLiN/wiki/Out
 
 #### _rSPR_
 
+The outputs are approximate and exact rooted subtree prune-and-regraft (rSPR) distances between pairs of phylogenetic trees. Each CSV file contains these distances and the corresponding tree sizes, and the PNG files are heatmaps of these distances and the respective tree sizes.
+
 - `dynamics/rSPR/`
   - `approx`
     - Approximate rSPR distances
diff --git a/docs/params.md b/docs/params.md
index 9d6274e1..88ea41e5 100644
--- a/docs/params.md
+++ b/docs/params.md
@@ -93,8 +93,11 @@ Parameters for the recombination subworkflow
 | `run_rspr` | Run rSPR | `boolean` | | | |
 | `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 | | |
 | `min_branch_length` | Minimum rSPR branch length | `integer` | 0 | | |
-| `max_support_threshold` | Maximum rSPR support threshold | `integer` | 0 | | |
+| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 | | |
 | `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 | | |
+| `core_gene_tree` | Core (or reference) genome tree, used in the rSPR and evolCCM entries. | `string` | | | |
+| `concatenated_annotation` | TSV table of annotations for all genomes, such as the ones generated by Bakta or Prokka in ARETE. | `string` | | | |
+| `feature_profile` | Feature profile TSV (a presence/absence matrix), used in the evolCCM entry. | `string` | | | |
 
 ## Institutional config options
 
diff --git a/docs/usage.md b/docs/usage.md
index 8a55f30e..7cdca107 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -149,6 +149,109 @@ To execute phylogenomic and pangenomics analysis on pre-existing assemblies:
 nextflow run beiko-lab/ARETE -entry phylogenomics --input_sample_table samplesheet.csv -profile docker
 ```
 
+### rSPR Entry
+
+To execute the rSPR analysis on pre-existing trees:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry rspr \
+  --input_sample_table samplesheet.csv \
+  --core_gene_tree core_gene_alignment.tre \
+  --concatenated_annotation BAKTA.txt \
+  -profile docker
+```
+
+The parameters are:
+
+- `--core_gene_tree` - The reference tree, built from a core genome alignment, such as the one generated by Panaroo in ARETE.
+- `--concatenated_annotation` - The tabular annotation results (TSV) for all genomes, such as the ones generated at the end of Prokka or Bakta in ARETE. Although useful, it is not required to run the rSPR entry.
+- `--input_sample_table` - A samplesheet containing all individual gene trees in the following format:
+
+```
+gene_tree,path
+CDS_0000,/path/to/CDS_0000.tre
+CDS_0001,/path/to/CDS_0001.tre
+CDS_0002,/path/to/CDS_0002.tre
+CDS_0003,/path/to/CDS_0003.tre
+CDS_0004,/path/to/CDS_0004.tre
+```
+
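+A samplesheet in this format can be generated from a directory of gene trees with a few lines of Python, for example (the `gene_trees/` directory and the `.tre` extension below are only placeholders; adjust them to your data):
+
+```python
+from pathlib import Path
+
+# Placeholder location for the gene tree files; adjust to your data
+tree_dir = Path("gene_trees")
+
+with open("samplesheet.csv", "w") as out:
+    out.write("gene_tree,path\n")
+    for tree in sorted(tree_dir.glob("*.tre")):
+        # Use the file name without its extension as the gene tree name
+        out.write(f"{tree.stem},{tree.resolve()}\n")
+```
+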
+### evolCCM Entry
+
+To execute the evolCCM analysis on a pre-existing reference tree and feature profile:
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry evolccm \
+  --core_gene_tree core_gene_alignment.tre \
+  --feature_profile feature_profile.tsv.gz \
+  -profile docker
+```
+
+The parameters are:
+
+- `--core_gene_tree` - The reference tree, built from a core genome alignment, such as the one generated by Panaroo in ARETE.
+- `--feature_profile` - A presence/absence TSV matrix of features
+  in genomes. Genome names should match those in the core tree and
+  should be contained in a 'genome_id' column, with all other columns representing features absent (0) or present (1) in each genome. E.g.:
+
+```
+genome_id plasmid_AA155 plasmid_AA161
+ED010 0 0
+ED017 0 1
+ED040 0 0
+ED073 0 1
+ED075 1 1
+ED082 0 1
+ED142 0 1
+ED178 0 1
+ED180 0 0
+```
+
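+If your feature data is in long format (one row per genome/feature pair), a matrix like this can be assembled with pandas, for example (the `feature_hits.tsv` file and its column names below are only placeholders):
+
+```python
+import pandas as pd
+
+# Placeholder long-format table with one row per (genome, feature) pair
+hits = pd.read_csv("feature_hits.tsv", sep="\t")  # columns: genome_id, feature
+
+profile = (
+    pd.crosstab(hits["genome_id"], hits["feature"])
+    .clip(upper=1)  # turn counts into presence (1) / absence (0)
+    .reset_index()
+)
+profile.to_csv("feature_profile.tsv.gz", sep="\t", index=False)
+```
+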
+### Recombination Entry
+
+To execute the recombination analysis on pre-existing assemblies (the PopPUNK model can be bgmm, dbscan, refine, threshold or lineage):
+
+```bash
+nextflow run beiko-lab/ARETE \
+  -entry recombination \
+  --input_sample_table samplesheet.csv \
+  --poppunk_model dbscan \
+  -profile docker
+```
+
 ## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
diff --git a/lib/WorkflowArete.groovy b/lib/WorkflowArete.groovy
index 1dc644ec..37d21de8 100755
--- a/lib/WorkflowArete.groovy
+++ b/lib/WorkflowArete.groovy
@@ -66,11 +66,6 @@ class WorkflowArete {
         // Check the hostnames against configured profiles
         NfcoreTemplate.hostName(workflow, params, log)
 
-        // Check input has been provided
-        if (!params.input_sample_table) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input_sample_table samplesheet.csv'"
-            System.exit(1)
-        }
     }
 
     //
diff --git a/main.nf b/main.nf
index 73f783ad..7c479b8e 100755
--- a/main.nf
+++ b/main.nf
@@ -37,7 +37,9 @@ include { ANNOTATION } from './workflows/arete'
 include { PHYLO } from './workflows/arete'
 include { QUALITYCHECK } from './workflows/arete'
 include { POPPUNK } from './workflows/arete'
-
+include { RUN_RSPR } from './workflows/arete'
+include { RUN_EVOLCCM } from './workflows/arete'
+include { RUN_RECOMBINATION } from './workflows/arete'
 
 //
 // WORKFLOW: Run main nf-core/arete analysis pipeline
@@ -68,6 +70,17 @@ workflow poppunk {
     POPPUNK()
 }
 
+workflow rspr {
+    RUN_RSPR()
+}
+
+workflow evolccm {
+    RUN_EVOLCCM()
+}
+
+workflow recombination {
+    RUN_RECOMBINATION()
+}
 /*
 ========================================================================================
     RUN ALL WORKFLOWS
diff --git a/modules/local/ppanggolin/msa/main.nf b/modules/local/ppanggolin/msa/main.nf
index ecd8c0dd..d9736b09 100644
--- a/modules/local/ppanggolin/msa/main.nf
+++ b/modules/local/ppanggolin/msa/main.nf
@@ -12,8 +12,7 @@ process PPANGGOLIN_MSA {
     tuple val(meta), path(pangenome)
 
     output:
-    tuple val(meta), path("${prefix}_msa")    , emit: results
-    path "${prefix}_msa/msa_all_protein/*.aln", emit: alignments
+    path "ppanggolin_msa/msa_all_protein/*.aln", emit: alignments
     path "versions.yml"                       , emit: versions
 
     when:
@@ -23,12 +22,14 @@
     def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${meta.id}"
     """
+    cp $pangenome copied_pangenome.h5
+
     ppanggolin \\
         msa \\
         $args \\
         --cpu $task.cpus \\
-        --pangenome $pangenome \\
-        --output "${prefix}"_msa \\
+        --pangenome copied_pangenome.h5 \\
+        --output ppanggolin_msa \\
         --partition all
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/nextflow.config b/nextflow.config
index 17d3b01a..89729f58 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -79,6 +79,11 @@ params {
     max_support_threshold = 0.7
     max_approx_rspr = -1
 
+    // rSPR/evolCCM entries
+    core_gene_tree = null
+    concatenated_annotation = null
+    feature_profile = null
+
     // MultiQC options
     multiqc_config = null
     multiqc_title = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d016e19a..1e4e35bc 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input_sample_table"],
+            "required": [],
             "properties": {
                 "input_sample_table": {
                     "type": "string",
@@ -217,7 +217,7 @@
                 },
                 "accessory_similarity": {
                     "type": "number",
-                    "default": 99,
+                    "default": 99.0,
                    "fa_icon": "far fa-clone",
                     "description": "Similarity threshold for accessory genes"
                 }
@@ -331,6 +331,18 @@
                     "type": "integer",
                     "default": -1,
                     "description": "Maximum approximate rSPR distance for filtering"
+                },
+                "core_gene_tree": {
+                    "type": "string",
+                    "description": "Core (or reference) genome tree, used in the rSPR and evolCCM entries."
+                },
+                "concatenated_annotation": {
+                    "type": "string",
+                    "description": "TSV table of annotations for all genomes, such as the ones generated by Bakta or Prokka in ARETE."
+                },
+                "feature_profile": {
+                    "type": "string",
+                    "description": "Feature profile TSV (a presence/absence matrix), used in the evolCCM entry."
                 }
             },
             "fa_icon": "fas fa-bezier-curve"
diff --git a/subworkflows/local/rspr_input_check.nf b/subworkflows/local/rspr_input_check.nf
new file mode 100644
index 00000000..a6eb3fdd
--- /dev/null
+++ b/subworkflows/local/rspr_input_check.nf
@@ -0,0 +1,24 @@
+workflow RSPR_INPUT_CHECK {
+    take:
+    samplesheet
+
+    main:
+    samplesheet
+        .splitCsv(header: true)
+        .map { it -> get_sample_info_rspr(it.path) }
+        .set { trees }
+
+    emit:
+    trees
+}
+
+def get_sample_info_rspr(row) {
+
+    def array = []
+    if (!file(row).exists()) {
+        exit 1, "ERROR: Please check input samplesheet -> Tree file does not exist!\n${row}"
+    }
+    array = [ file(row) ]
+
+    return array
+}
diff --git a/workflows/arete.nf b/workflows/arete.nf
index 07380b74..bf72fb1b 100755
--- a/workflows/arete.nf
+++ b/workflows/arete.nf
@@ -11,10 +11,6 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 def checkPathParamList = [ params.input_sample_table, params.multiqc_config, params.reference_genome ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-// Check mandatory parameters
-if (params.input_sample_table) { ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
-
-
 /*
 ========================================================================================
     CONFIG FILES
@@ -38,6 +34,7 @@ ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.mu
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
 include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
+include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
 include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
 include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
 include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
@@ -595,6 +592,97 @@
     )
 }
 
+
+workflow RUN_RSPR {
+    if (params.input_sample_table) { ch_input = Channel.of(file(params.input_sample_table)) } else { exit 1, 'Input samplesheet not specified!' }
+    if (params.core_gene_tree) { ch_core = file(params.core_gene_tree) } else { exit 1, 'Core tree not specified!' }
+    ch_annotation_data = params.concatenated_annotation ? file(params.concatenated_annotation) : []
+
+    RSPR_INPUT_CHECK (
+        ch_input
+    )
+
+    RSPR (
+        ch_core,
+        RSPR_INPUT_CHECK.out.trees,
+        ch_annotation_data
+    )
+}
+
+workflow RUN_EVOLCCM {
+    if (params.core_gene_tree) { ch_core = file(params.core_gene_tree) } else { exit 1, 'Core tree not specified!' }
+    if (params.feature_profile) { ch_input = file(params.feature_profile) } else { exit 1, 'Input feature profile not specified!' }
+
+    EVOLCCM (
+        ch_core,
+        ch_input
+    )
+}
+
+workflow RUN_RECOMBINATION {
+    if (params.input_sample_table){ ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
+    if (params.reference_genome) {
+        ch_reference_genome = file(params.reference_genome)
+        use_reference_genome = true
+    }
+    else {
+        ch_reference_genome = []
+        use_reference_genome = false
+    }
+    if (params.poppunk_model == null) { exit 1, 'A model must be specified with --poppunk_model in order to run PopPunk' }
+    ch_software_versions = Channel.empty()
+
+    ANNOTATION_INPUT_CHECK(ch_input)
+    ANNOTATION_INPUT_CHECK.out.genomes.set { assemblies }
+
+    CHECK_ASSEMBLIES(
+        assemblies,
+        ch_reference_genome,
+        use_reference_genome
+    )
+    ch_software_versions = ch_software_versions.mix(CHECK_ASSEMBLIES.out.assemblyqc_software)
+
+    if (params.apply_filtering) {
+        CHECK_ASSEMBLIES.out.assemblies
+            .set { assemblies }
+    }
+
+    RUN_POPPUNK(assemblies)
+    ch_software_versions = ch_software_versions.mix(RUN_POPPUNK.out.poppunk_version)
+
+
+    RECOMBINATION (
+        assemblies,
+        RUN_POPPUNK.out.clusters,
+        CHECK_ASSEMBLIES.out.quast_report
+    )
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_software_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
+
+    /*
+     * MODULE: MultiQC
+     */
+    workflow_summary = WorkflowArete.paramsSummaryMultiqc(workflow, summary_params)
+    ch_workflow_summary = Channel.value(workflow_summary)
+
+    //Mix QUAST results into one report file
+    ch_multiqc_files = Channel.empty()
+    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
+    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
+    ch_multiqc_files = ch_multiqc_files.mix(CHECK_ASSEMBLIES.out.multiqc)
+
+    MULTIQC(
+        ch_multiqc_files.collect(),
+        ch_multiqc_config.collect().ifEmpty([]),
+        ch_multiqc_custom_config.collect().ifEmpty([]),
+        ch_multiqc_logo.collect().ifEmpty([])
+    )
+    multiqc_report = MULTIQC.out.report.toList()
+    ch_software_versions = ch_software_versions.mix(MULTIQC.out.versions.ifEmpty(null))
+
+}
 /*
 ========================================================================================
     COMPLETION EMAIL AND SUMMARY