From b52f0e172654413a1f64ecb9150fcf1176af2a7d Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 5 Dec 2024 13:32:07 -0800 Subject: [PATCH 1/3] Remove "run_pangolin" config option Removes scripts, dependencies, docs, and rules for running pangolin as part of this workflow. --- docs/src/reference/change_log.md | 2 + docs/src/reference/workflow-config-file.rst | 9 ---- scripts/make_pangolin_node_data.py | 32 ------------- workflow/envs/nextstrain.yaml | 2 - workflow/snakemake_rules/main_workflow.smk | 52 --------------------- 5 files changed, 2 insertions(+), 95 deletions(-) delete mode 100644 scripts/make_pangolin_node_data.py diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 8d5f61194..5a61b23cf 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update +- Remove `run_pangolin` configuration option from the workflow, since the pangoLEARN tool that this enabled has been deprecated + ## v14 (23 October 2024) - 23 October 2024: Update workflow to use Nextclade v3. This includes the removal of unused mutation summary script and rules that expected Nextclade v2 outputs. Dropping the mutation summary rules removed the need for the full alignment rule `align` to produce the insertions and translations outputs, so they have been removed. The `build_align` rule no longer produces a separate `insertions.tsv` since insertions are now included in the `nextclade_qc.tsv`. [PR 1160](https://github.com/nextstrain/ncov/pull/1160) diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst index 60a54d823..bc8e0fcc7 100644 --- a/docs/src/reference/workflow-config-file.rst +++ b/docs/src/reference/workflow-config-file.rst @@ -728,15 +728,6 @@ crowding_penalty .. _title-1: -run_pangolin ------------- - -- type: boolean -- description: Enable annotation of Pangolin lineages for a given build's subsampled sequences. -- default: ``false`` - -.. _workflow-config-mask: - mask ---- diff --git a/scripts/make_pangolin_node_data.py b/scripts/make_pangolin_node_data.py deleted file mode 100644 index dcbf5b2db..000000000 --- a/scripts/make_pangolin_node_data.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Translate pangolineages from CSV -> JSON for node_data -Note: this should arguably live instead as part of `combine_metadata`, -but this gets particularly complex given the new multiple-inputs logic. -So, for now, following the initial suggestion in the issue. -""" - -import argparse -import pandas as pd -import csv -import json -from augur.utils import write_json - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Create node data for assigned pangolin lineages", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv") - parser.add_argument("--node_data_outfile", type=str, help="pangolineages.json") - parser.add_argument("--attribute_name", default="pango_lineage_local", help="attribute name for pangolin lineage annotations in the output JSON") - args = parser.parse_args() - - pangolineages = pd.read_csv(args.pangolineages) - - node_data = { - "nodes": { - row['taxon']: {args.attribute_name: row['lineage']} for idx, row in pangolineages.iterrows() - } - } - - write_json(node_data, args.node_data_outfile) diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml index bf59f7072..02187c107 100644 --- a/workflow/envs/nextstrain.yaml +++ b/workflow/envs/nextstrain.yaml @@ -8,6 +8,4 @@ dependencies: - epiweeks=2.1.2 - iqtree=2.2.0.3 - nextclade=3.9.0 - - pangolin=3.1.20 - - pangolearn=2022.01.20 - python>=3.8* diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 4e9ac00a4..8834897bf 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -702,55 +702,6 @@ rule filter: --output-log {output.filter_log} 2>&1 | tee {log}; """ -if "run_pangolin" in config and config["run_pangolin"]: - rule run_pangolin: - message: - """ - Running pangolin to assign lineage labels to samples. Includes putative lineage definitions by default. - Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications. - """ - input: - alignment = "results/{build_name}/aligned.fasta", - output: - lineages = "results/{build_name}/pangolineages.csv", - params: - outdir = "results/{build_name}", - csv_outfile = "pangolineages.csv", - node_data_outfile = "pangolineages.json" - log: - "logs/pangolin_{build_name}.txt" - conda: config["conda_environment"] - threads: 1 - resources: - mem_mb=3000 - benchmark: - "benchmarks/pangolineages_{build_name}.txt" - shell: ## once pangolin fully supports threads, add `--threads {threads}` to the below (existing pango cli param) - r""" - pangolin {input.alignment}\ - --outdir {params.outdir} \ - --outfile {params.csv_outfile} 2>&1 | tee {log}\ - """ - - rule make_pangolin_node_data: - input: - lineages = rules.run_pangolin.output.lineages - output: - node_data = "results/{build_name}/pangolineages.json" - log: - "logs/pangolin_export_{build_name}.txt" - conda: config["conda_environment"] - resources: - mem_mb=3000 - benchmark: - "benchmarks/make_pangolin_node_data_{build_name}.txt" - shell: - r""" - python3 scripts/make_pangolin_node_data.py \ - --pangolineages {input.lineages} \ - --node_data_outfile {output.node_data} 2>&1 | tee {log}\ - """ - # TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA". rule adjust_metadata_regions: message: @@ -1308,9 +1259,6 @@ def _get_node_data_by_wildcards(wildcards): rules.calculate_epiweeks.output.node_data, ] - if "run_pangolin" in config and config["run_pangolin"]: - inputs.append(rules.make_pangolin_node_data.output.node_data) - # Convert input files from wildcard strings to real file names. inputs = [input_file.format(**wildcards_dict) for input_file in inputs] From f5fc7c5c511ab2e8168204288e2e212c206b44b5 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 5 Dec 2024 13:34:53 -0800 Subject: [PATCH 2/3] Note removing pango logic is a breaking change --- docs/src/reference/change_log.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 5a61b23cf..da8018f49 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -5,7 +5,9 @@ We also use this change log to document new features that maintain backward comp ## New features since last version update -- Remove `run_pangolin` configuration option from the workflow, since the pangoLEARN tool that this enabled has been deprecated +## v15 (5 December 2024) + +- Remove `run_pangolin` configuration option from the workflow, since the pangoLEARN tool that this enabled has been deprecated. [PR 1164](https://github.com/nextstrain/ncov/pull/1164) ## v14 (23 October 2024) From bd4d994ae62f4f713554aa5bf960b656bd726084 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 5 Dec 2024 13:59:21 -0800 Subject: [PATCH 3/3] Fix docs typo --- docs/src/reference/workflow-config-file.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst index bc8e0fcc7..63575290b 100644 --- a/docs/src/reference/workflow-config-file.rst +++ b/docs/src/reference/workflow-config-file.rst @@ -728,6 +728,8 @@ crowding_penalty .. _title-1: +.. _workflow-config-mask: + mask ----