Skip to content

Commit

Permalink
Merge branch 'master' into gene-order-workflow
Browse files Browse the repository at this point in the history
* master:
  feat: Add recombination entry (#175)
  docs: Add evolCCM usage documentation
  feat: Add evolCCM entry (#174)
  docs: Update rSPR outputs (#173)
  fix: Stop ppanggolin msa from altering input file (#172)
  feat: Add rspr entry (#171)
  refactor: Ignore time reached error in rspr_exact (#170)
  • Loading branch information
jvfe committed Nov 10, 2023
2 parents b5fdfd7 + 710ba25 commit bc0cd3c
Show file tree
Hide file tree
Showing 12 changed files with 238 additions and 18 deletions.
4 changes: 3 additions & 1 deletion bin/rspr_approx.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def parse_args(args=None):
"--annotation",
dest="ANNOTATION",
help="Annotation table from BAKTA/PROKKA",
nargs="?",
)
parser.add_argument(
"-o", "--output", dest="OUTPUT_DIR", default="approx", help="Gene tree list"
Expand Down Expand Up @@ -295,7 +296,8 @@ def main(args=None):
make_heatmap(results, fig_path)

results.reset_index("file_name", inplace=True)
results = join_annotation_data(results, args.ANNOTATION)
if args.ANNOTATION:
results = join_annotation_data(results, args.ANNOTATION)
res_path = os.path.join(args.OUTPUT_DIR, "output.tsv")
df_with_groups = make_groups_from_csv(results, args.MIN_RSPR_DISTANCE)
df_with_groups.to_csv(res_path, sep="\t", index=False)
Expand Down
1 change: 1 addition & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ process {
}

withName: RSPR_EXACT {
errorStrategy = { task.exitStatus in [140] ? 'ignore' : 'retry' }
publishDir = [
path: { "${params.outdir}/dynamics/rSPR/exact" },
mode: params.publish_dir_mode,
Expand Down
2 changes: 2 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,8 @@ See [the PPanGGoLiN documentation](https://github.com/labgem/PPanGGOLiN/wiki/Out

#### _rSPR_

The outputs are approximate and exact rooted Subtree Prune and Regraft (rSPR) distances between pairs of rooted phylogenetic trees. Each CSV file contains these distances and the tree sizes. The PNG files are heatmaps of these distances and their respective tree sizes.

- `dynamics/rSPR/`

- `approx` - Approximate rSPR distances
Expand Down
5 changes: 4 additions & 1 deletion docs/params.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,11 @@ Parameters for the recombination subworkflow
| `run_rspr` | Run rSPR | `boolean` | | | |
| `min_rspr_distance` | Minimum rSPR distance used to define processing groups | `integer` | 10 | | |
| `min_branch_length` | Minimum rSPR branch length | `integer` | 0 | | |
| `max_support_threshold` | Maximum rSPR support threshold | `integer` | 0 | | |
| `max_support_threshold` | Maximum rSPR support threshold | `number` | 0.7 | | |
| `max_approx_rspr` | Maximum approximate rSPR distance for filtering | `integer` | -1 | | |
| `core_gene_tree` | Core (or reference) genome tree. Used in the rSPR and evolCCM entries. | `string` | | | |
| `concatenated_annotation` | TSV table of annotations for all genomes, such as the ones generated by Bakta or Prokka in ARETE. | `string` | | | |
| `feature_profile` | Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry. | `string` | | | |

## Institutional config options

Expand Down
71 changes: 71 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,77 @@ To execute phylogenomic and pangenomics analysis on pre-existing assemblies:
nextflow run beiko-lab/ARETE -entry phylogenomics --input_sample_table samplesheet.csv -profile docker
```

### rSPR Entry

To execute the rSPR analysis on pre-existing trees:

```bash
nextflow run beiko-lab/ARETE \
-entry rspr \
--input_sample_table samplesheet.csv \
--core_gene_tree core_gene_alignment.tre \
--concatenated_annotation BAKTA.txt \
-profile docker
```

The parameters being:

- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by panaroo in ARETE.
- `--concatenated_annotation` - The tabular annotation results (TSV) for all genomes, like the ones generated at the end of Prokka or Bakta in ARETE. Although useful, it's not necessary to execute the rSPR entry.
- `--input_sample_table` - A samplesheet containing all individual gene trees in the following format:

```csv
gene_tree,path
CDS_0000,/path/to/CDS_0000.tre
CDS_0001,/path/to/CDS_0001.tre
CDS_0002,/path/to/CDS_0002.tre
CDS_0003,/path/to/CDS_0003.tre
CDS_0004,/path/to/CDS_0004.tre
```

### evolCCM Entry

To execute the evolCCM analysis on a pre-existing reference tree and feature profile:

```bash
nextflow run beiko-lab/ARETE \
-entry evolccm \
--core_gene_tree core_gene_alignment.tre \
--feature_profile feature_profile.tsv.gz \
-profile docker
```

The parameters being:

- `--core_gene_tree` - The reference tree, coming from a core genome alignment, like the one generated by panaroo in ARETE.
- `--feature_profile` - A presence/absence TSV matrix of features
in genomes. Genome names should be the same as in the core tree and
should be contained in a 'genome_id' column, with all other columns representing features absent (0) or present (1) in each genome, e.g.:

```
genome_id plasmid_AA155 plasmid_AA161
ED010 0 0
ED017 0 1
ED040 0 0
ED073 0 1
ED075 1 1
ED082 0 1
ED142 0 1
ED178 0 1
ED180 0 0
```

### Recombination Entry

To execute the recombination analysis on pre-existing assemblies (PopPUNK model can be either bgmm, dbscan, refine, threshold or lineage):

```bash
nextflow run beiko-lab/ARETE \
-entry recombination \
--input_sample_table samplesheet.csv \
--poppunk_model dbscan \
-profile docker
```

## Updating the pipeline

When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
Expand Down
5 changes: 0 additions & 5 deletions lib/WorkflowArete.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,6 @@ class WorkflowArete {
// Check the hostnames against configured profiles
NfcoreTemplate.hostName(workflow, params, log)

// Check input has been provided
if (!params.input_sample_table) {
log.error "Please provide an input samplesheet to the pipeline e.g. '--input_sample_table samplesheet.csv'"
System.exit(1)
}
}

//
Expand Down
15 changes: 14 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ include { ANNOTATION } from './workflows/arete'
include { PHYLO } from './workflows/arete'
include { QUALITYCHECK } from './workflows/arete'
include { POPPUNK } from './workflows/arete'

include { RUN_RSPR } from './workflows/arete'
include { RUN_EVOLCCM } from './workflows/arete'
include { RUN_RECOMBINATION } from './workflows/arete'

//
// WORKFLOW: Run main nf-core/arete analysis pipeline
Expand Down Expand Up @@ -68,6 +70,17 @@ workflow poppunk {
POPPUNK()
}

//
// WORKFLOW: Entry point selected with `-entry rspr`; delegates to RUN_RSPR
//
workflow rspr {
RUN_RSPR()
}

//
// WORKFLOW: Entry point selected with `-entry evolccm`; delegates to RUN_EVOLCCM
//
workflow evolccm {
RUN_EVOLCCM()
}

//
// WORKFLOW: Entry point selected with `-entry recombination`; delegates to RUN_RECOMBINATION
//
workflow recombination {
RUN_RECOMBINATION()
}
/*
========================================================================================
RUN ALL WORKFLOWS
Expand Down
9 changes: 5 additions & 4 deletions modules/local/ppanggolin/msa/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ process PPANGGOLIN_MSA {
tuple val(meta), path(pangenome)

output:
tuple val(meta), path("${prefix}_msa") , emit: results
path "${prefix}_msa/msa_all_protein/*.aln", emit: alignments
path "ppanggolin_msa/msa_all_protein/*.aln", emit: alignments
path "versions.yml" , emit: versions

when:
Expand All @@ -23,12 +22,14 @@ process PPANGGOLIN_MSA {
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: "${meta.id}"
"""
cp $pangenome copied_pangenome.h5
ppanggolin \\
msa \\
$args \\
--cpu $task.cpus \\
--pangenome $pangenome \\
--output "${prefix}"_msa \\
--pangenome copied_pangenome.h5 \\
--output ppanggolin_msa \\
--partition all
cat <<-END_VERSIONS > versions.yml
Expand Down
5 changes: 5 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ params {
max_support_threshold = 0.7
max_approx_rspr = -1

// rSPR/evolCCM entries
core_gene_tree = null
concatenated_annotation = null
feature_profile = null

// MultiQC options
multiqc_config = null
multiqc_title = null
Expand Down
16 changes: 14 additions & 2 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"required": ["input_sample_table"],
"required": [],
"properties": {
"input_sample_table": {
"type": "string",
Expand Down Expand Up @@ -217,7 +217,7 @@
},
"accessory_similarity": {
"type": "number",
"default": 99,
"default": 99.0,
"fa_icon": "far fa-clone",
"description": "Similarity threshold for accessory genes"
}
Expand Down Expand Up @@ -331,6 +331,18 @@
"type": "integer",
"default": -1,
"description": "Maximum approximate rSPR distance for filtering"
},
"core_gene_tree": {
"type": "string",
"description": "Core (or reference) genome tree. Used in the rSPR and evolCCM entries."
},
"concatenated_annotation": {
"type": "string",
"description": "TSV table of annotations for all genomes, such as the ones generated by Bakta or Prokka in ARETE."
},
"feature_profile": {
"type": "string",
"description": "Feature profile TSV (A presence-absence matrix). Used in the evolCCM entry."
}
},
"fa_icon": "fas fa-bezier-curve"
Expand Down
27 changes: 27 additions & 0 deletions subworkflows/local/rspr_input_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//
// Parse the gene-tree samplesheet and emit one checked tree file per row
//
workflow RSPR_INPUT_CHECK {
    take:
    samplesheet // samplesheet CSV with a header line and a 'path' column

    main:
    // Each CSV row contributes its 'path' value, validated by get_sample_info_rspr
    trees = samplesheet
        .splitCsv(header: true)
        .map { row -> get_sample_info_rspr(row.path) }

    emit:
    trees // channel: [ tree file ]
}

//
// Check that a gene tree listed in the samplesheet exists and wrap it in a list.
//
// row: the value of the samplesheet's 'path' column (the caller passes `it.path`),
//      i.e. a path string rather than the whole CSV row.
// Returns a single-element list holding the tree file.
// Aborts the run with exit status 1 when the file does not exist.
//
def get_sample_info_rspr(row) {
    def tree = file(row)

    if (!tree.exists()) {
        // `row` is already the path string; interpolating `row.path` here would
        // fail with a missing-property error instead of reporting the bad path.
        exit 1, "ERROR: Please check input samplesheet -> Tree file does not exist!\n${row}"
    }

    return [ tree ]
}
96 changes: 92 additions & 4 deletions workflows/arete.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
def checkPathParamList = [ params.input_sample_table, params.multiqc_config, params.reference_genome ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

// Check mandatory parameters
if (params.input_sample_table) { ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }


/*
========================================================================================
CONFIG FILES
Expand All @@ -38,6 +34,7 @@ ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.mu
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { PHYLO_INPUT_CHECK } from '../subworkflows/local/phylo_input_check'
include { ANNOTATION_INPUT_CHECK } from '../subworkflows/local/annotation_input_check'
include { RSPR_INPUT_CHECK } from '../subworkflows/local/rspr_input_check'
include { ASSEMBLE_SHORTREADS } from '../subworkflows/local/assembly'
include { ANNOTATE_ASSEMBLIES } from '../subworkflows/local/annotation'
include { CHECK_ASSEMBLIES } from '../subworkflows/local/assemblyqc'
Expand Down Expand Up @@ -595,6 +592,97 @@ workflow POPPUNK {
)
}


//
// Run rSPR on pre-existing gene trees against a core (reference) tree
//
workflow RUN_RSPR {
    // Mandatory inputs: the gene-tree samplesheet and the core tree
    if (!params.input_sample_table) {
        exit 1, 'Input samplesheet not specified!'
    }
    ch_input = Channel.of(file(params.input_sample_table))

    if (!params.core_gene_tree) {
        exit 1, 'Core tree not specified!'
    }
    ch_core = file(params.core_gene_tree)

    // The annotation table is optional; an empty list is passed when it is absent
    ch_annotation_data = []
    if (params.concatenated_annotation) {
        ch_annotation_data = file(params.concatenated_annotation)
    }

    RSPR_INPUT_CHECK(ch_input)

    RSPR(
        ch_core,
        RSPR_INPUT_CHECK.out.trees,
        ch_annotation_data
    )
}

//
// Run evolCCM on a pre-existing core (reference) tree and feature profile
//
workflow RUN_EVOLCCM {
    // Both inputs are mandatory; abort with a clear message when one is missing
    if (!params.core_gene_tree) {
        exit 1, 'Core tree not specified!'
    }
    ch_core = file(params.core_gene_tree)

    if (!params.feature_profile) {
        exit 1, 'Input feature profile not specified!'
    }
    ch_input = file(params.feature_profile)

    EVOLCCM(ch_core, ch_input)
}

//
// Recombination entry: checks the input assemblies, clusters them with PopPUNK,
// runs the RECOMBINATION subworkflow, then collates software versions and
// builds a MultiQC report.
//
workflow RUN_RECOMBINATION {
// The samplesheet is mandatory for this entry
if (params.input_sample_table){ ch_input = file(params.input_sample_table) } else { exit 1, 'Input samplesheet not specified!' }
// The reference genome is optional; the boolean tells CHECK_ASSEMBLIES whether one was given
if (params.reference_genome) {
ch_reference_genome = file(params.reference_genome)
use_reference_genome = true
}
else {
ch_reference_genome = []
use_reference_genome = false
}
// A PopPUNK model must be chosen explicitly
if (params.poppunk_model == null) { exit 1, 'A model must be specified with --poppunk_model in order to run PopPunk' }
// Accumulates the version outputs of the steps below
ch_software_versions = Channel.empty()

// Parse the samplesheet; the emitted genomes are used as the assemblies
ANNOTATION_INPUT_CHECK(ch_input)
ANNOTATION_INPUT_CHECK.out.genomes.set { assemblies }

CHECK_ASSEMBLIES(
assemblies,
ch_reference_genome,
use_reference_genome
)
ch_software_versions = ch_software_versions.mix(CHECK_ASSEMBLIES.out.assemblyqc_software)

// With --apply_filtering, continue with the assemblies emitted by CHECK_ASSEMBLIES
// instead of the unfiltered input
if (params.apply_filtering) {
CHECK_ASSEMBLIES.out.assemblies
.set { assemblies }
}

RUN_POPPUNK(assemblies)
ch_software_versions = ch_software_versions.mix(RUN_POPPUNK.out.poppunk_version)


// Recombination analysis uses the assemblies, the PopPUNK clusters and the QUAST report
RECOMBINATION (
assemblies,
RUN_POPPUNK.out.clusters,
CHECK_ASSEMBLIES.out.quast_report
)

// Write the unique collected versions to a single 'collated_versions.yml'
CUSTOM_DUMPSOFTWAREVERSIONS (
ch_software_versions.unique().collectFile(name: 'collated_versions.yml')
)

/*
* MODULE: MultiQC
*/
workflow_summary = WorkflowArete.paramsSummaryMultiqc(workflow, summary_params)
ch_workflow_summary = Channel.value(workflow_summary)

//Mix QUAST results into one report file
ch_multiqc_files = Channel.empty()
ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
ch_multiqc_files = ch_multiqc_files.mix(CHECK_ASSEMBLIES.out.multiqc)

// Optional config/logo channels fall back to an empty list when nothing was provided
MULTIQC(
ch_multiqc_files.collect(),
ch_multiqc_config.collect().ifEmpty([]),
ch_multiqc_custom_config.collect().ifEmpty([]),
ch_multiqc_logo.collect().ifEmpty([])
)
multiqc_report = MULTIQC.out.report.toList()
// NOTE(review): this mix happens after CUSTOM_DUMPSOFTWAREVERSIONS was invoked, so the
// MultiQC version does not appear to reach 'collated_versions.yml' - confirm this is intended
ch_software_versions = ch_software_versions.mix(MULTIQC.out.versions.ifEmpty(null))

}
/*
========================================================================================
COMPLETION EMAIL AND SUMMARY
Expand Down

0 comments on commit bc0cd3c

Please sign in to comment.