From 7ee96b5fc7b1ab08e88c32897a71d636240cbc9b Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 25 Mar 2024 16:56:58 -0700
Subject: [PATCH 1/7] Prototype GitHub Action for Nextclade annotations

Adds rules, config, and GitHub Action file to support running Nextclade
on all available sequences. Not yet tested.
---
 .github/workflows/run-nextclade.yaml          | 32 +++++++
 Snakefile                                     |  1 +
 profiles/nextclade.yaml                       | 20 +++++
 profiles/nextclade/run-nextclade.smk          | 86 +++++++++++++++++++
 workflow/snakemake_rules/download_from_s3.smk | 25 ++++++
 5 files changed, 164 insertions(+)
 create mode 100644 .github/workflows/run-nextclade.yaml
 create mode 100644 profiles/nextclade.yaml
 create mode 100644 profiles/nextclade/run-nextclade.smk

diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml
new file mode 100644
index 00000000..530bec71
--- /dev/null
+++ b/.github/workflows/run-nextclade.yaml
@@ -0,0 +1,32 @@
+name: Run Nextclade on all sequences
+
+on:
+  workflow_dispatch:
+    inputs:
+      dockerImage:
+        description: "Specific container image to use for build (will override the default of `nextstrain build`)"
+        required: false
+        type: string
+
+jobs:
+  run-build:
+    permissions:
+      id-token: write  # lets the job request an OIDC token; presumably needed by the reusable workflow for cloud auth (confirm)
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master  # shared reusable workflow, tracking its master branch
+    secrets: inherit  # pass this repository's secrets through to the reusable workflow
+    with:
+      runtime: aws-batch
+      env: |
+        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.dockerImage }}
+      run: |  # detached `nextstrain build` of the upload_all_nextclade_files target using profiles/nextclade.yaml
+        nextstrain build \
+          --detach \
+          --cpus 36 \
+          --memory 72gib \
+          --env AWS_ACCESS_KEY_ID \
+          --env AWS_SECRET_ACCESS_KEY \
+          . \
+          upload_all_nextclade_files \
+          -p \
+          --configfile profiles/nextclade.yaml \
+          --set-threads run_nextclade=16
diff --git a/Snakefile b/Snakefile
index 67f975eb..ce810f49 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,6 +4,7 @@ from treetime.utils import numeric_date
 
 
 wildcard_constraints:
+    lineage = r'h1n1pdm|h3n2|vic|yam',
     segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
     center = r'who|cdc|crick|niid|crick|vidrl',
     passage = r'cell|egg',
diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml
new file mode 100644
index 00000000..e6ddabf9
--- /dev/null
+++ b/profiles/nextclade.yaml
@@ -0,0 +1,20 @@
+custom_rules:  # extra Snakemake rule files used by this profile
+  - workflow/snakemake_rules/download_from_s3.smk  # rules that download inputs from S3
+  - profiles/nextclade/run-nextclade.smk  # Nextclade dataset, run, and upload rules
+
+s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu"
+
+lat-longs: "config/lat_longs.tsv"
+
+segments:
+  - ha
+
+submission_date_field: date_submitted
+recency:
+  date_bins: [7, 30, 90]
+  date_bin_labels: ["last week", "last month", "last quarter"]
+  upper_bin_label: older
+
+builds:
+  h1n1pdm:
+    lineage: h1n1pdm
diff --git a/profiles/nextclade/run-nextclade.smk b/profiles/nextclade/run-nextclade.smk
new file mode 100644
index 00000000..49ec81fa
--- /dev/null
+++ b/profiles/nextclade/run-nextclade.smk
@@ -0,0 +1,86 @@
+nextclade_dataset_by_lineage_and_segment = {
+    "h1n1pdm": {
+        "ha": "nextstrain/flu/h1n1pdm/ha/california-7-2009",
+    },
+    "h3n2": {
+        "ha": "nextstrain/flu/h3n2/ha/wisconsin-67-2005",
+    },
+    "vic": {
+        "ha": "nextstrain/flu/vic/ha/brisbane-60-2008",
+    },
+}
+
+rule upload_all_nextclade_files:
+    input:
+        files=lambda wildcards: [
+            "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=lineage, segment=segment)
+            for filetype in ("alignment", "nextclade")
+            for lineage in nextclade_dataset_by_lineage_and_segment.keys()
+            for segment in nextclade_dataset_by_lineage_and_segment[lineage].keys()
+        ]
+
+rule get_nextclade_dataset_for_lineage_and_segment:
+    output:
+        nextclade_dir=directory("nextclade_dataset/{lineage}_{segment}/"),
+    params:
+        dataset_name=lambda wildcards: nextclade_dataset_by_lineage_and_segment.get(wildcards.lineage, {}).get(wildcards.segment),
+    shell:
+        """
+        nextclade3 dataset get \
+            -n {params.dataset_name:q} \
+            --output-dir {output.nextclade_dir}
+        """
+
+rule run_nextclade:
+    input:
+        nextclade_dir="nextclade_dataset/{lineage}_{segment}/",  # dataset directory fetched by get_nextclade_dataset_for_lineage_and_segment
+        sequences="data/{lineage}/{segment}.fasta",
+    output:
+        alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta",
+        annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv",
+    log:
+        "logs/run_nextclade_{lineage}_{segment}.txt"  # receives Nextclade's combined stdout/stderr via tee below
+    threads: 8  # default only; the GitHub Action overrides it with --set-threads run_nextclade=N
+    shell:
+        """
+        nextclade3 run \
+            -j {threads} \
+            -D {input.nextclade_dir:q} \
+            --output-fasta {output.alignment:q} \
+            --output-tsv {output.annotations:q} \
+            {input.sequences:q} 2>&1 | tee {log:q}
+        """
+
+rule upload_alignment:
+    input:
+        alignment="data/upload/s3/{lineage}/{segment}/aligned.fasta",
+    output:
+        flag="data/upload/s3/alignment_{lineage}_{segment}.done",  # marker file; holds the upload script's output
+    params:
+        s3_dst=config["s3_dst"],  # S3 prefix; lineage/segment/filename are appended in the shell command
+    log:
+        "logs/upload_alignment_{lineage}_{segment}.txt"  # tee writes here too, so the declared log is actually produced
+    shell:
+        """
+        ./scripts/upload-to-s3 \
+            --quiet \
+            {input.alignment:q} \
+            {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/aligned.fasta.xz 2>&1 | tee {log:q} {output.flag:q}
+        """
+
+rule upload_nextclade_annotations:
+    input:
+        annotations="data/upload/s3/{lineage}/{segment}/nextclade.tsv",
+    output:
+        flag="data/upload/s3/nextclade_{lineage}_{segment}.done",  # marker file; holds the upload script's output
+    params:
+        s3_dst=config["s3_dst"],  # S3 prefix; lineage/segment/filename are appended in the shell command
+    log:
+        "logs/upload_nextclade_annotations_{lineage}_{segment}.txt"  # tee writes here too, so the declared log is actually produced
+    shell:
+        """
+        ./scripts/upload-to-s3 \
+            --quiet \
+            {input.annotations:q} \
+            {params.s3_dst:q}/{wildcards.lineage}/{wildcards.segment}/nextclade.tsv.xz 2>&1 | tee {log:q} {output.flag:q}
+        """
diff --git a/workflow/snakemake_rules/download_from_s3.smk b/workflow/snakemake_rules/download_from_s3.smk
index 05733145..19d06145 100644
--- a/workflow/snakemake_rules/download_from_s3.smk
+++ b/workflow/snakemake_rules/download_from_s3.smk
@@ -1,3 +1,6 @@
+ruleorder: download_parsed_sequences > parse
+ruleorder: download_parsed_metadata > annotate_metadata_with_reference_strains
+
 rule download_sequences:
     output:
         sequences="data/{lineage}/raw_{segment}.fasta"
@@ -19,3 +22,25 @@ rule download_titers:
         """
         aws s3 cp {params.s3_path} - | gzip -c -d > {output.titers}
         """
+
+rule download_parsed_sequences:
+    output:
+        sequences="data/{lineage}/{segment}.fasta"  # same path run_nextclade reads as its sequences input
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/sequences.fasta.xz"  # xz-compressed object, decompressed on the fly in the shell command
+    conda: "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences}
+        """
+
+rule download_parsed_metadata:
+    output:
+        metadata="data/{lineage}/metadata.tsv",  # one metadata table per lineage (no segment wildcard)
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/metadata.tsv.xz"  # xz-compressed object, decompressed on the fly in the shell command
+    conda: "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.metadata}
+        """

From d74823442877af0c1a7799f8e872017bf2f821fb Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 26 Mar 2024 09:32:55 -0700
Subject: [PATCH 2/7] Simplify Nextclade build config

Remove unnecessary configuration parameters from the Nextclade build
config and update the workflow to allow these parameters to be missing.
Since Snakemake evaluates the Python code in each rule's inputs,
outputs, and params, rules that we don't plan to run in the workflow can
produce key errors when their config parameters are not defined.
---
 profiles/nextclade.yaml             | 8 --------
 workflow/snakemake_rules/core.smk   | 8 ++++----
 workflow/snakemake_rules/export.smk | 2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml
index e6ddabf9..c04cd2de 100644
--- a/profiles/nextclade.yaml
+++ b/profiles/nextclade.yaml
@@ -4,17 +4,9 @@ custom_rules:
 
 s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu"
 
-lat-longs: "config/lat_longs.tsv"
-
 segments:
   - ha
 
-submission_date_field: date_submitted
-recency:
-  date_bins: [7, 30, 90]
-  date_bin_labels: ["last week", "last month", "last quarter"]
-  upper_bin_label: older
-
 builds:
   h1n1pdm:
     lineage: h1n1pdm
diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 9087ddc8..e82a78dd 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -488,10 +488,10 @@ rule annotate_recency_of_submissions:
     output:
         node_data = "builds/{build_name}/recency.json",
     params:
-        submission_date_field=config["submission_date_field"],
-        date_bins=config["recency"]["date_bins"],
-        date_bin_labels=config["recency"]["date_bin_labels"],
-        upper_bin_label=config["recency"]["upper_bin_label"],
+        submission_date_field=config.get("submission_date_field"),
+        date_bins=config.get("recency", {}).get("date_bins"),
+        date_bin_labels=config.get("recency", {}).get("date_bin_labels"),
+        upper_bin_label=config.get("recency", {}).get("upper_bin_label"),
     conda: "../envs/nextstrain.yaml"
     benchmark:
         "benchmarks/recency_{build_name}.txt"
diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk
index 3898f067..84b96d3b 100644
--- a/workflow/snakemake_rules/export.smk
+++ b/workflow/snakemake_rules/export.smk
@@ -59,7 +59,7 @@ rule export:
         metadata = build_dir + "/{build_name}/metadata.tsv",
         node_data = _get_node_data_by_wildcards,
         auspice_config = lambda w: config['builds'][w.build_name]['auspice_config'],
-        lat_longs = config['lat-longs']
+        lat_longs = config.get('lat-longs', "config/lat_longs.tsv"),
     output:
         auspice_json = "auspice/{build_name}_{segment}.json",
         root_sequence_json = "auspice/{build_name}_{segment}_root-sequence.json",

From d18e7f22435d78128d1da79682b3a180a763afa3 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 26 Mar 2024 09:35:04 -0700
Subject: [PATCH 3/7] Simplify Nextclade dataset logic

Simplifies the logic to get Nextclade datasets by following the same
pattern as the flu_frequencies workflow [1] where we grab the default
dataset for a given lineage and segment instead of specifying a
reference name. The "broad" and more recent references for H3N2 HA, for
example, are not too different from each other, but the Nextclade
annotations for the former are far more verbose than for the latter. We
also want the files produced by this workflow to plug directly into the
flu_frequencies workflow logic, so it is best to use the same approach
here.

[1] https://github.com/nextstrain/flu_frequencies/blob/6e4298fac3361f4a6751d85bcb963064dbb9eee1/Snakefile#L95
---
 profiles/nextclade.yaml              |  4 ++++
 profiles/nextclade/run-nextclade.smk | 22 ++++------------------
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml
index c04cd2de..a01d04b5 100644
--- a/profiles/nextclade.yaml
+++ b/profiles/nextclade.yaml
@@ -10,3 +10,7 @@ segments:
 builds:
   h1n1pdm:
     lineage: h1n1pdm
+  h3n2:
+    lineage: h3n2
+  vic:
+    lineage: vic
diff --git a/profiles/nextclade/run-nextclade.smk b/profiles/nextclade/run-nextclade.smk
index 49ec81fa..d4a8c41e 100644
--- a/profiles/nextclade/run-nextclade.smk
+++ b/profiles/nextclade/run-nextclade.smk
@@ -1,33 +1,19 @@
-nextclade_dataset_by_lineage_and_segment = {
-    "h1n1pdm": {
-        "ha": "nextstrain/flu/h1n1pdm/ha/california-7-2009",
-    },
-    "h3n2": {
-        "ha": "nextstrain/flu/h3n2/ha/wisconsin-67-2005",
-    },
-    "vic": {
-        "ha": "nextstrain/flu/vic/ha/brisbane-60-2008",
-    },
-}
-
 rule upload_all_nextclade_files:
     input:
         files=lambda wildcards: [
-            "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=lineage, segment=segment)
+            "data/upload/s3/{filetype}_{lineage}_{segment}.done".format(filetype=filetype, lineage=build["lineage"], segment=segment)
             for filetype in ("alignment", "nextclade")
-            for lineage in nextclade_dataset_by_lineage_and_segment.keys()
-            for segment in nextclade_dataset_by_lineage_and_segment[lineage].keys()
+            for build in config["builds"].values()
+            for segment in config["segments"]
         ]
 
 rule get_nextclade_dataset_for_lineage_and_segment:
     output:
         nextclade_dir=directory("nextclade_dataset/{lineage}_{segment}/"),
-    params:
-        dataset_name=lambda wildcards: nextclade_dataset_by_lineage_and_segment.get(wildcards.lineage, {}).get(wildcards.segment),
     shell:
         """
         nextclade3 dataset get \
-            -n {params.dataset_name:q} \
+            -n flu_{wildcards.lineage}_{wildcards.segment} \
             --output-dir {output.nextclade_dir}
         """
 

From 3ba4c131815feebda7d4899bf00480eda3527d10 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 26 Mar 2024 09:37:32 -0700
Subject: [PATCH 4/7] Run Nextclade for NA

Adds NA to list of segments, since we want to know the subclade
annotations for NA as well as HA and use these data to estimate
frequencies.
---
 profiles/nextclade.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/profiles/nextclade.yaml b/profiles/nextclade.yaml
index a01d04b5..44346bc3 100644
--- a/profiles/nextclade.yaml
+++ b/profiles/nextclade.yaml
@@ -6,6 +6,7 @@ s3_dst: "s3://nextstrain-data-private/files/workflows/seasonal-flu"
 
 segments:
   - ha
+  - na
 
 builds:
   h1n1pdm:

From b4464b0fbd152220e47fad6b9a24203d64852520 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 26 Mar 2024 14:25:46 -0700
Subject: [PATCH 5/7] Trigger Nextclade on PR

Add a temporary trigger for the Nextclade workflow on pull request
events, so the workflow runs when I push this update to the PR. If it
works, I will drop this commit.
---
 .github/workflows/run-nextclade.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml
index 530bec71..4f22a410 100644
--- a/.github/workflows/run-nextclade.yaml
+++ b/.github/workflows/run-nextclade.yaml
@@ -1,6 +1,7 @@
 name: Run Nextclade on all sequences
 
 on:
+  pull_request:
   workflow_dispatch:
     inputs:
       dockerImage:

From 4adf7316b1aca10dbd26fe6ac2698c03d533261b Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 26 Mar 2024 14:51:13 -0700
Subject: [PATCH 6/7] Remove pull request trigger

The workflow ran successfully, so removing this trigger.
---
 .github/workflows/run-nextclade.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml
index 4f22a410..530bec71 100644
--- a/.github/workflows/run-nextclade.yaml
+++ b/.github/workflows/run-nextclade.yaml
@@ -1,7 +1,6 @@
 name: Run Nextclade on all sequences
 
 on:
-  pull_request:
   workflow_dispatch:
     inputs:
       dockerImage:

From 94cdfe8f7b1309cdc4d9b37868959370f0948bca Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 29 Mar 2024 15:42:15 -0700
Subject: [PATCH 7/7] Set Nextclade threads to a factor of 36

Reduce threads requested for Nextclade runs from 16 to 12 so we can run
3 Nextclade jobs at once (one per lineage) on a 36-core AWS Batch
instance.
---
 .github/workflows/run-nextclade.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-nextclade.yaml b/.github/workflows/run-nextclade.yaml
index 530bec71..da95a1ef 100644
--- a/.github/workflows/run-nextclade.yaml
+++ b/.github/workflows/run-nextclade.yaml
@@ -29,4 +29,4 @@ jobs:
           upload_all_nextclade_files \
           -p \
           --configfile profiles/nextclade.yaml \
-          --set-threads run_nextclade=16
+          --set-threads run_nextclade=12