From 7ee96b5fc7b1ab08e88c32897a71d636240cbc9b Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 25 Mar 2024 16:56:58 -0700 Subject: [PATCH 1/7] Prototype GitHub Action for Nextclade annotations Adds rules, config, and GitHub Action file to support running Nextclade on all available sequences. Not yet tested. --- .github/workflows/run-nextclade.yaml | 32 +++++++ Snakefile | 1 + profiles/nextclade.yaml | 20 +++++ profiles/nextclade/run-nextclade.smk | 86 +++++++++++++++++++ workflow/snakemake_rules/download_from_s3.smk | 25 ++++++ 5 files changed, 164 insertions(+) create mode 100644 .github/workflows/run-nextclade.yaml create mode 100644 profiles/nextclade.yaml create mode 100644 profiles/nextclade/run-nextclade.smk diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml new file mode 100644 index 00000000..530bec71 --- /dev/null +++ b/.github/workflows/run-nextclade.yaml @@ -0,0 +1,32 @@ +name: Run Nextclade on all sequences + +on: + workflow_dispatch: + inputs: + dockerImage: + description: "Specific container image to use for build (will override the default of `nextstrain build`)" + required: false + type: string + +jobs: + run-build: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + runtime: aws-batch + env: | + NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.dockerImage }} + run: | + nextstrain build \ + --detach \ + --cpus 36 \ + --memory 72gib \ + --env AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY \ + . \ + upload_all_nextclade_files \ + -p \ + --configfile profiles/nextclade.yaml \ + --set-threads run_nextclade=16 diff --git a/Snakefile b/Snakefile index 67f975eb..ce810f49 100644 --- a/Snakefile +++ b/Snakefile @@ -4,6 +4,7 @@ from treetime.utils import numeric_date wildcard_constraints: + lineage = r'h1n1pdm|h3n2|vic|yam', segment = r'pb2|pb1|pa|ha|np|na|mp|ns', center = r'who|cdc|crick|niid|crick|vidrl', passage = r'cell|egg', diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml new file mode 100644 index 00000000..e6ddabf9 --- /dev/null +++ b/profiles/nextclade.yaml @@ -0,0 +1,20 @@ +custom_rules: + - workflow/snakemake_rules/download_from_s3.smk + - profiles/nextclade/run-nextclade.smk + +s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu" + +lat-longs: "config/lat_longs.tsv" + +segments: + - ha + +submission_date_field: date_submitted +recency: + date_bins: [7, 30, 90] + date_bin_labels: ["last week", "last month", "last quarter"] + upper_bin_label: older + +builds: + h1n1pdm: + lineage: h1n1pdm diff --git a/profiles/nextclade/run-nextclade.smk b/profiles/nextclade/run-nextclade.smk new file mode 100644 index 00000000..49ec81fa --- /dev/null +++ b/profiles/nextclade/run-nextclade.smk @@ -0,0 +1,86 @@ +nextclade_dataset_by_lineage_and_segment = { + "h1n1pdm": { + "ha": "nextstrain/flu/h1n1pdm/ha/california-7-2009", + }, + "h3n2": { + "ha": "nextstrain/flu/h3n2/ha/wisconsin-67-2005", + }, + "vic": { + "ha": "nextstrain/flu/vic/ha/brisbane-60-2008", + }, +} + +rule upload_all_nextclade_files: + input: + files=lambda wildcards: [ + "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=lineage, segment=segment) + for filetype in ("alignment", "nextclade") + for lineage in nextclade_dataset_by_lineage_and_segment.keys() + for segment in nextclade_dataset_by_lineage_and_segment[lineage].keys() + ] + +rule get_nextclade_dataset_for_lineage_and_segment: + output: + nextclade_dir=directory("nextclade_dataset/{lineage}_{segment}/"), + params: + dataset_name=lambda wildcards: nextclade_dataset_by_lineage_and_segment.get(wildcards.lineage, {}).get(wildcards.segment), + shell: + """ + nextclade3 dataset get \ + -n {params.dataset_name:q} \ + --output-dir {output.nextclade_dir} + """ + +rule run_nextclade: + input: + nextclade_dir="nextclade_dataset/{lineage}_{segment}/", + sequences="data/{lineage}/{segment}.fasta", + output: + alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta", + annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv", + log: + "logs/run_nextclade_{lineage}_{segment}.txt" + threads: 8 + shell: + """ + nextclade3 run \ + -j {threads} \ + -D {input.nextclade_dir} \ + --output-fasta {output.alignment} \ + --output-tsv {output.annotations} \ + {input.sequences} + """ + +rule upload_alignment: + input: + alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta", + output: + flag="data/upload/s3/alignment_{lineage}_{segment}.done", + params: + s3_dst=config["s3_dst"], + log: + "logs/upload_alignment_{lineage}_{segment}.txt" + shell: + """ + ./scripts/upload-to-s3 \ + --quiet \ + {input.alignment:q} \ + {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/aligned.fasta.xz 2>&1 | tee {output.flag} + """ + +rule upload_nextclade_annotations: + input: + annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv", + output: + flag="data/upload/s3/nextclade_{lineage}_{segment}.done", + params: + s3_dst=config["s3_dst"], + log: + "logs/upload_nextclade_annotations_{lineage}_{segment}.txt" + shell: + """ + ./scripts/upload-to-s3 \ + --quiet \ + {input.annotations:q} \ + {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/nextclade.tsv.xz 2>&1 | tee {output.flag} + """ diff --git a/workflow/snakemake_rules/download_from_s3.smk b/workflow/snakemake_rules/download_from_s3.smk index 05733145..19d06145 100644 --- a/workflow/snakemake_rules/download_from_s3.smk +++ b/workflow/snakemake_rules/download_from_s3.smk @@ -1,3 +1,6 @@ +ruleorder: download_parsed_sequences > parse +ruleorder: download_parsed_metadata > annotate_metadata_with_reference_strains + rule download_sequences: output: sequences="data/{lineage}/raw_{segment}.fasta" @@ -19,3 +22,25 @@ rule download_titers: """ aws s3 cp {params.s3_path} - | gzip -c -d > {output.titers} """ + +rule download_parsed_sequences: + output: + sequences="data/{lineage}/{segment}.fasta" + params: + s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/sequences.fasta.xz" + conda: "../../workflow/envs/nextstrain.yaml" + shell: + """ + aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences} + """ + +rule download_parsed_metadata: + output: + metadata="data/{lineage}/metadata.tsv", + params: + s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/metadata.tsv.xz" + conda: "../../workflow/envs/nextstrain.yaml" + shell: + """ + aws s3 cp {params.s3_path} - | xz -c -d > {output.metadata} + """ From d74823442877af0c1a7799f8e872017bf2f821fb Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 26 Mar 2024 09:32:55 -0700 Subject: [PATCH 2/7] Simplify Nextclade build config Remove unnecessary configuration parameters from the Nextclade build config and update the workflow to allow these parameters to be missing. Since Snakemake evaluates the Python code in each rule's inputs, outputs, and params, rules that we don't plan to run in the workflow can produce key errors when their config parameters are not defined. --- profiles/nextclade.yaml | 8 -------- workflow/snakemake_rules/core.smk | 8 ++++---- workflow/snakemake_rules/export.smk | 2 +- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml index e6ddabf9..c04cd2de 100644 --- a/profiles/nextclade.yaml +++ b/profiles/nextclade.yaml @@ -4,17 +4,9 @@ custom_rules: s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu" -lat-longs: "config/lat_longs.tsv" - segments: - ha -submission_date_field: date_submitted -recency: - date_bins: [7, 30, 90] - date_bin_labels: ["last week", "last month", "last quarter"] - upper_bin_label: older - builds: h1n1pdm: lineage: h1n1pdm diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 9087ddc8..e82a78dd 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -488,10 +488,10 @@ rule annotate_recency_of_submissions: output: node_data = "builds/{build_name}/recency.json", params: - submission_date_field=config["submission_date_field"], - date_bins=config["recency"]["date_bins"], - date_bin_labels=config["recency"]["date_bin_labels"], - upper_bin_label=config["recency"]["upper_bin_label"], + submission_date_field=config.get("submission_date_field"), + date_bins=config.get("recency", {}).get("date_bins"), + date_bin_labels=config.get("recency", {}).get("date_bin_labels"), + upper_bin_label=config.get("recency", {}).get("upper_bin_label"), conda: "../envs/nextstrain.yaml" benchmark: "benchmarks/recency_{build_name}.txt" diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 3898f067..84b96d3b 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -59,7 +59,7 @@ rule export: metadata = build_dir + "/{build_name}/metadata.tsv", node_data = _get_node_data_by_wildcards, auspice_config = lambda w: config['builds'][w.build_name]['auspice_config'], - lat_longs = config['lat-longs'] + lat_longs = config.get('lat-longs', "config/lat_longs.tsv"), output: auspice_json = "auspice/{build_name}_{segment}.json", root_sequence_json = "auspice/{build_name}_{segment}_root-sequence.json", From d18e7f22435d78128d1da79682b3a180a763afa3 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 26 Mar 2024 09:35:04 -0700 Subject: [PATCH 3/7] Simplify Nextclade dataset logic Simplifies the logic to get Nextclade datasets by following the same pattern as the flu_frequencies workflow [1] where we grab the default dataset for a given lineage and segment instead of specifying a reference name. The "broad" and more recent references for H3N2 HA, for example, are not too different from each other, but the Nextclade annotations for the former are far more verbose than for the latter. We also want the files produced by this workflow to plug directly into the flu_frequencies workflow logic, so it is best to use the same approach here. [1] https://github.com/nextstrain/flu_frequencies/blob/6e4298fac3361f4a6751d85bcb963064dbb9eee1/Snakefile#L95 --- profiles/nextclade.yaml | 4 ++++ profiles/nextclade/run-nextclade.smk | 22 ++++------------------ 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml index c04cd2de..a01d04b5 100644 --- a/profiles/nextclade.yaml +++ b/profiles/nextclade.yaml @@ -10,3 +10,7 @@ segments: builds: h1n1pdm: lineage: h1n1pdm + h3n2: + lineage: h3n2 + vic: + lineage: vic diff --git a/profiles/nextclade/run-nextclade.smk b/profiles/nextclade/run-nextclade.smk index 49ec81fa..d4a8c41e 100644 --- a/profiles/nextclade/run-nextclade.smk +++ b/profiles/nextclade/run-nextclade.smk @@ -1,33 +1,19 @@ -nextclade_dataset_by_lineage_and_segment = { - "h1n1pdm": { - "ha": "nextstrain/flu/h1n1pdm/ha/california-7-2009", - }, - "h3n2": { - "ha": "nextstrain/flu/h3n2/ha/wisconsin-67-2005", - }, - "vic": { - "ha": "nextstrain/flu/vic/ha/brisbane-60-2008", - }, -} - rule upload_all_nextclade_files: input: files=lambda wildcards: [ - "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=lineage, segment=segment) + "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=build["lineage"], segment=segment) for filetype in ("alignment", "nextclade") - for lineage in nextclade_dataset_by_lineage_and_segment.keys() - for segment in nextclade_dataset_by_lineage_and_segment[lineage].keys() + for build in config["builds"].values() + for segment in config["segments"] ] rule get_nextclade_dataset_for_lineage_and_segment: output: nextclade_dir=directory("nextclade_dataset/{lineage}_{segment}/"), - params: - dataset_name=lambda wildcards: nextclade_dataset_by_lineage_and_segment.get(wildcards.lineage, {}).get(wildcards.segment), shell: """ nextclade3 dataset get \ - -n {params.dataset_name:q} \ + -n flu_{wildcards.lineage}_{wildcards.segment} \ --output-dir {output.nextclade_dir} """ From 3ba4c131815feebda7d4899bf00480eda3527d10 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 26 Mar 2024 09:37:32 -0700 Subject: [PATCH 4/7] Run Nextclade for NA Adds NA to list of segments, since we want to know the subclade annotations for NA as well as HA and use these data to estimate frequencies. --- profiles/nextclade.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml index a01d04b5..44346bc3 100644 --- a/profiles/nextclade.yaml +++ b/profiles/nextclade.yaml @@ -6,6 +6,7 @@ s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu" segments: - ha + - na builds: h1n1pdm: From b4464b0fbd152220e47fad6b9a24203d64852520 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 26 Mar 2024 14:25:46 -0700 Subject: [PATCH 5/7] Trigger Nextclade on PR Add temporary trigger for Nextclade workflow on PR event. This should trigger the workflow when I push the update to the PR. If it works, I should drop this commit again. --- .github/workflows/run-nextclade.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml index 530bec71..4f22a410 100644 --- a/.github/workflows/run-nextclade.yaml +++ b/.github/workflows/run-nextclade.yaml @@ -1,6 +1,7 @@ name: Run Nextclade on all sequences on: + pull_request: workflow_dispatch: inputs: dockerImage: From 4adf7316b1aca10dbd26fe6ac2698c03d533261b Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 26 Mar 2024 14:51:13 -0700 Subject: [PATCH 6/7] Remove pull request trigger The workflow ran successfully, so removing this trigger. --- .github/workflows/run-nextclade.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml index 4f22a410..530bec71 100644 --- a/.github/workflows/run-nextclade.yaml +++ b/.github/workflows/run-nextclade.yaml @@ -1,7 +1,6 @@ name: Run Nextclade on all sequences on: - pull_request: workflow_dispatch: inputs: dockerImage: From 94cdfe8f7b1309cdc4d9b37868959370f0948bca Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 29 Mar 2024 15:42:15 -0700 Subject: [PATCH 7/7] Set Nextclade threads to a factor of 36 Reduce threads requested for Nextclade runs from 16 to 12 so we can run 3 Nextclade jobs at once (one per lineage) on a 36-core instance of AWS Batch. --- .github/workflows/run-nextclade.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml index 530bec71..da95a1ef 100644 --- a/.github/workflows/run-nextclade.yaml +++ b/.github/workflows/run-nextclade.yaml @@ -29,4 +29,4 @@ jobs: upload_all_nextclade_files \ -p \ --configfile profiles/nextclade.yaml \ - --set-threads run_nextclade=16 + --set-threads run_nextclade=12