[WIP] whole genome builds for pre-defined strains

Main TODO is to join the genbank files & translate
nextstrain · Apr 25, 2024 · 6268449 · 6268449
1 parent 64b98ca
commit 6268449
Show file tree

Hide file tree

Showing 3 changed files with 543 additions and 0 deletions.
diff --git a/Snakefile.genome b/Snakefile.genome
@@ -0,0 +1,260 @@
+import json
+import glob
+
+SEGMENTS = ["ha", "np", "pb2", "pb1", "pa", "np", "mp", "ns"] # order important!
+SUBTYPES = ['h5n1']
+
+
+rule all:
+    input: expand("results/genome/aligned_{subtype}_{segment}.fasta", subtype=SUBTYPES, segment=SEGMENTS)
+
+
+rule files:
+    params:
+        # dropped_strains = "config/dropped_strains_{subtype}.txt",
+        # include_strains = "config/include_strains_{subtype}_{time}.txt",
+        reference = "config/reference_{subtype}_{segment}.gb",
+        include = "config/include_genome-strains_{subtype}.txt",
+        colors = "config/colors_{subtype}.tsv",
+        lat_longs = "config/lat_longs_{subtype}.tsv",
+        auspice_config = "config/auspice_config_{subtype}.json",
+        # clades_file = "clade-labeling/{subtype}-clades.tsv",
+        description = "config/description.md"
+
+files = rules.files.params
+
+rule filter:
+    input:
+        sequences = "results/sequences_{subtype}_{segment}.fasta",
+        metadata = "results/metadata_{subtype}_{segment}.tsv",
+        include = files.include,
+    output:
+        sequences = "results/genome/sequences_{subtype}_{segment}.fasta",
+        metadata = "results/genome/metadata_{subtype}_{segment}.tsv",
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --exclude-all --include {input.include} \
+            --output-sequences {output.sequences} \
+            --output-metadata {output.metadata}
+        """
+
+rule align:
+    input:
+        sequences ="results/genome/sequences_{subtype}_{segment}.fasta",
+        reference = files.reference,
+    output:
+        alignment = "results/genome/aligned_{subtype}_{segment}.fasta",
+    shell:
+        """
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --remove-reference \
+            --nthreads 1
+        """
+
+rule join_sequences:
+    input:
+        alignment = expand("results/genome/aligned_{{subtype}}_{segment}.fasta", segment=SEGMENTS),
+    output:
+        alignment = "results/genome/genome_{subtype}.fasta",
+    shell:
+        """
+        python scripts/join-segments.py \
+            --segments {input.alignment} \
+            --output {output.alignment} 
+        """
+
+# NOTE - I'm not joining metadata, I'm relying on everything being there in the starting HA metadata...
+
+# NOTE - from now on the rules mirror the main snakefile...
+rule tree:
+    message: "Building tree"
+    input:
+        alignment = "results/genome/genome_{subtype}.fasta",
+    output:
+        tree = "results/genome/tree-raw_{subtype}.nwk",
+    params:
+        method = "iqtree"
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree} \
+            --method {params.method} \
+            --nthreads 1
+        """
+
+def clock_rate(w):
+    """
+    MVP!!!!!!!!!
+    """
+    clock_rates_h5n1 = {
+        'pb2': 0.00287,
+        'pb1': 0.00264,
+        'pa': 0.00248,
+        'ha': 0.00455,
+        'np': 0.00252,
+        'na': 0.00349,
+        'mp': 0.00191,
+        'ns': 0.00249
+    }
+    lengths = {
+        'pb2': 2341,
+        'pb1': 2341,
+        'pa': 2233,
+        'ha': 1760,
+        'np': 1565,
+        'na': 1458,
+        'mp': 1027,
+        'ns': 865 
+    }
+    mean = sum([cr * lengths[seg] for seg,cr in clock_rates_h5n1.items()])/sum(lengths.values())
+    stdev = mean/2
+    return f"--clock-rate {mean} --clock-std-dev {stdev}"
+
+rule refine:
+    message:
+        """
+        Refining tree
+          - estimate timetree
+          - use {params.coalescent} coalescent timescale
+          - estimate {params.date_inference} node dates
+        """
+    input:
+        tree = "results/genome/tree-raw_{subtype}.nwk",
+        alignment = "results/genome/genome_{subtype}.fasta",
+        metadata = "results/metadata_{subtype}_ha.tsv", # assuming all salient info is in HA
+    output:
+        tree = "results/genome/tree_{subtype}.nwk",
+        node_data = "results/branch-lengths_{subtype}.json"
+    params:
+        coalescent = "const",
+        date_inference = "marginal",
+        clock_rate = clock_rate,
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --timetree \
+            --coalescent {params.coalescent} \
+            --date-confidence \
+            --date-inference {params.date_inference} \
+            {params.clock_rate} \
+        """
+
+rule ancestral:
+    input:
+        tree = "results/genome/tree_{subtype}.nwk",
+        alignment = "results/genome/genome_{subtype}.fasta",
+    output:
+        node_data = "results/genome/nt-muts_{subtype}.json"
+    params:
+        inference = "joint"
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}\
+            --keep-ambiguous
+        """
+
+# TODO XXX translate needs joined genbank
+
+# rule translate:
+#     message: "Translating amino acid sequences"
+#     input:
+#         tree = "results/genome/tree_{subtype}.nwk",
+#         node_data = "results/genome/nt-muts_{subtype}.json"
+#         reference = files.reference
+#     output:
+#         node_data = "results/aa-muts_{subtype}_{segment}_{time}.json"
+#     shell:
+#         """
+#         augur translate \
+#             --tree {input.tree} \
+#             --ancestral-sequences {input.node_data} \
+#             --reference-sequence {input.reference} \
+#             --output {output.node_data}
+#         """
+
+
+def traits_columns(w):
+    traits = {'h5nx':'region','h5n1': 'region country', 'h7n9': 'country division', 'h9n2': 'region country'}
+    return traits[w.subtype]
+
+rule traits:
+    message: "Inferring ancestral traits for {params.columns!s}"
+    input:
+        tree = "results/genome/tree_{subtype}.nwk",
+        metadata = "results/metadata_{subtype}_ha.tsv",
+    output:
+        node_data = "results/genome/traits_{subtype}.json",
+    params:
+        columns = traits_columns,
+    shell:
+        """
+        augur traits \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --output {output.node_data} \
+            --columns {params.columns} \
+            --confidence
+        """
+
+rule cleavage_site:
+    input:
+        alignment = "results/genome/aligned_{subtype}_ha.fasta",
+    output:
+        cleavage_site_annotations = "results/genome/cleavage-site_{subtype}_ha.json",
+        cleavage_site_sequences = "results/genome/cleavage-site-sequences_{subtype}_ha.json"
+    shell:
+        """
+        python scripts/annotate-ha-cleavage-site.py \
+            --alignment {input.alignment} \
+            --furin_site_motif {output.cleavage_site_annotations} \
+            --cleavage_site_sequence {output.cleavage_site_sequences}
+        """
+
+rule export:
+    message: "Exporting data files for for auspice"
+    input:
+        tree = "results/genome/tree_{subtype}.nwk",
+        metadata = "results/metadata_{subtype}_ha.tsv",
+        node_data = [
+            rules.refine.output.node_data,
+            rules.traits.output.node_data,
+            rules.ancestral.output.node_data,
+            # rules.translate.output.node_data,
+            rules.cleavage_site.output.cleavage_site_annotations,
+            rules.cleavage_site.output.cleavage_site_sequences,
+        ],
+        colors = files.colors,
+        lat_longs = files.lat_longs,
+        auspice_config = files.auspice_config,
+        description = files.description,
+    output:
+        auspice_json = "auspice/avian-flu_{subtype}_genome.json"
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --node-data {input.node_data} \
+            --colors {input.colors} \
+            --lat-longs {input.lat_longs} \
+            --auspice-config {input.auspice_config} \
+            --description {input.description} \
+            --include-root-sequence-inline \
+            --output {output.auspice_json}
+        """