Skip to content

Commit

Permalink
[WIP] whole genome builds for pre-defined strains
Browse files Browse the repository at this point in the history
Main TODO is to join the genbank files & translate
  • Loading branch information
jameshadfield committed Apr 25, 2024
1 parent 64b98ca commit 6268449
Show file tree
Hide file tree
Showing 3 changed files with 543 additions and 0 deletions.
260 changes: 260 additions & 0 deletions Snakefile.genome
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
import json
import glob

SEGMENTS = ["ha", "np", "pb2", "pb1", "pa", "np", "mp", "ns"] # order important!
SUBTYPES = ['h5n1']


rule all:
input: expand("results/genome/aligned_{subtype}_{segment}.fasta", subtype=SUBTYPES, segment=SEGMENTS)


rule files:
params:
# dropped_strains = "config/dropped_strains_{subtype}.txt",
# include_strains = "config/include_strains_{subtype}_{time}.txt",
reference = "config/reference_{subtype}_{segment}.gb",
include = "config/include_genome-strains_{subtype}.txt",
colors = "config/colors_{subtype}.tsv",
lat_longs = "config/lat_longs_{subtype}.tsv",
auspice_config = "config/auspice_config_{subtype}.json",
# clades_file = "clade-labeling/{subtype}-clades.tsv",
description = "config/description.md"

files = rules.files.params

rule filter:
input:
sequences = "results/sequences_{subtype}_{segment}.fasta",
metadata = "results/metadata_{subtype}_{segment}.tsv",
include = files.include,
output:
sequences = "results/genome/sequences_{subtype}_{segment}.fasta",
metadata = "results/genome/metadata_{subtype}_{segment}.tsv",
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--exclude-all --include {input.include} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata}
"""

rule align:
input:
sequences ="results/genome/sequences_{subtype}_{segment}.fasta",
reference = files.reference,
output:
alignment = "results/genome/aligned_{subtype}_{segment}.fasta",
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--remove-reference \
--nthreads 1
"""

rule join_sequences:
input:
alignment = expand("results/genome/aligned_{{subtype}}_{segment}.fasta", segment=SEGMENTS),
output:
alignment = "results/genome/genome_{subtype}.fasta",
shell:
"""
python scripts/join-segments.py \
--segments {input.alignment} \
--output {output.alignment}
"""

# NOTE - I'm not joining metadata, I'm relying on everything being there in the starting HA metadata...

# NOTE - from now on the rules mirror the main snakefile...
rule tree:
message: "Building tree"
input:
alignment = "results/genome/genome_{subtype}.fasta",
output:
tree = "results/genome/tree-raw_{subtype}.nwk",
params:
method = "iqtree"
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree} \
--method {params.method} \
--nthreads 1
"""

def clock_rate(w):
"""
MVP!!!!!!!!!
"""
clock_rates_h5n1 = {
'pb2': 0.00287,
'pb1': 0.00264,
'pa': 0.00248,
'ha': 0.00455,
'np': 0.00252,
'na': 0.00349,
'mp': 0.00191,
'ns': 0.00249
}
lengths = {
'pb2': 2341,
'pb1': 2341,
'pa': 2233,
'ha': 1760,
'np': 1565,
'na': 1458,
'mp': 1027,
'ns': 865
}
mean = sum([cr * lengths[seg] for seg,cr in clock_rates_h5n1.items()])/sum(lengths.values())
stdev = mean/2
return f"--clock-rate {mean} --clock-std-dev {stdev}"

rule refine:
message:
"""
Refining tree
- estimate timetree
- use {params.coalescent} coalescent timescale
- estimate {params.date_inference} node dates
"""
input:
tree = "results/genome/tree-raw_{subtype}.nwk",
alignment = "results/genome/genome_{subtype}.fasta",
metadata = "results/metadata_{subtype}_ha.tsv", # assuming all salient info is in HA
output:
tree = "results/genome/tree_{subtype}.nwk",
node_data = "results/branch-lengths_{subtype}.json"
params:
coalescent = "const",
date_inference = "marginal",
clock_rate = clock_rate,
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
{params.clock_rate} \
"""

rule ancestral:
input:
tree = "results/genome/tree_{subtype}.nwk",
alignment = "results/genome/genome_{subtype}.fasta",
output:
node_data = "results/genome/nt-muts_{subtype}.json"
params:
inference = "joint"
shell:
"""
augur ancestral \
--tree {input.tree} \
--alignment {input.alignment} \
--output-node-data {output.node_data} \
--inference {params.inference}\
--keep-ambiguous
"""

# TODO XXX translate needs joined genbank

# rule translate:
# message: "Translating amino acid sequences"
# input:
# tree = "results/genome/tree_{subtype}.nwk",
# node_data = "results/genome/nt-muts_{subtype}.json"
# reference = files.reference
# output:
# node_data = "results/aa-muts_{subtype}_{segment}_{time}.json"
# shell:
# """
# augur translate \
# --tree {input.tree} \
# --ancestral-sequences {input.node_data} \
# --reference-sequence {input.reference} \
# --output {output.node_data}
# """


def traits_columns(w):
traits = {'h5nx':'region','h5n1': 'region country', 'h7n9': 'country division', 'h9n2': 'region country'}
return traits[w.subtype]

rule traits:
message: "Inferring ancestral traits for {params.columns!s}"
input:
tree = "results/genome/tree_{subtype}.nwk",
metadata = "results/metadata_{subtype}_ha.tsv",
output:
node_data = "results/genome/traits_{subtype}.json",
params:
columns = traits_columns,
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--output {output.node_data} \
--columns {params.columns} \
--confidence
"""

rule cleavage_site:
input:
alignment = "results/genome/aligned_{subtype}_ha.fasta",
output:
cleavage_site_annotations = "results/genome/cleavage-site_{subtype}_ha.json",
cleavage_site_sequences = "results/genome/cleavage-site-sequences_{subtype}_ha.json"
shell:
"""
python scripts/annotate-ha-cleavage-site.py \
--alignment {input.alignment} \
--furin_site_motif {output.cleavage_site_annotations} \
--cleavage_site_sequence {output.cleavage_site_sequences}
"""

rule export:
message: "Exporting data files for for auspice"
input:
tree = "results/genome/tree_{subtype}.nwk",
metadata = "results/metadata_{subtype}_ha.tsv",
node_data = [
rules.refine.output.node_data,
rules.traits.output.node_data,
rules.ancestral.output.node_data,
# rules.translate.output.node_data,
rules.cleavage_site.output.cleavage_site_annotations,
rules.cleavage_site.output.cleavage_site_sequences,
],
colors = files.colors,
lat_longs = files.lat_longs,
auspice_config = files.auspice_config,
description = files.description,
output:
auspice_json = "auspice/avian-flu_{subtype}_genome.json"
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.node_data} \
--colors {input.colors} \
--lat-longs {input.lat_longs} \
--auspice-config {input.auspice_config} \
--description {input.description} \
--include-root-sequence-inline \
--output {output.auspice_json}
"""
Loading

0 comments on commit 6268449

Please sign in to comment.