Skip to content

Commit

Permalink
Use ISO duration format for clade recency
Browse files Browse the repository at this point in the history
Keeps consistent with other existing usage of this format.
  • Loading branch information
victorlin committed Oct 3, 2024
1 parent f2418ce commit 0bd8251
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 37 deletions.
2 changes: 1 addition & 1 deletion defaults/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ ancestral:

colors:
default:
# Months back to color clades, if "all" then all clades are colored
# Amount of time back to color clades, as an ISO 8601 duration (e.g. "6M"); if "all" then all clades are colored
# Can be specified per build in builds.yaml
clade_recency: "all"

Expand Down
9 changes: 5 additions & 4 deletions docs/src/reference/workflow-config-file.rst
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ colors
clade_recency: "all"
global-6m:
# Override clade recency colors for "global-6m" build
clade_recency: 6
clade_recency: "6M"
Each named traits configuration (``default`` or build-named) supports the following attributes:

Expand All @@ -965,9 +965,10 @@ Each named traits configuration (``default`` or build-named) supports the follow
clade_recency
~~~~~~~~~~~~~

- type: integer
- description: if integer value is provided, restrict to clades found in tree within X months of present
- default: ``all``
- type: string
- format: `ISO 8601 <https://en.wikipedia.org/wiki/ISO_8601#Durations>`__ duration with optional ``P`` prefix (e.g. ``2M``, ``18M``, ``1Y6M``)
- description: restrict to clades found in tree within this duration from present
- default: ``all`` (no restriction)

traits
------
Expand Down
42 changes: 21 additions & 21 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -332,47 +332,47 @@ colors:
default:
clade_recency: "all"
global_1m:
clade_recency: 1
clade_recency: "1M"
global_2m:
clade_recency: 2
clade_recency: "2M"
global_6m:
clade_recency: 6
clade_recency: "6M"
africa_1m:
clade_recency: 1
clade_recency: "1M"
africa_2m:
clade_recency: 2
clade_recency: "2M"
africa_6m:
clade_recency: 6
clade_recency: "6M"
asia_1m:
clade_recency: 1
clade_recency: "1M"
asia_2m:
clade_recency: 2
clade_recency: "2M"
asia_6m:
clade_recency: 6
clade_recency: "6M"
europe_1m:
clade_recency: 1
clade_recency: "1M"
europe_2m:
clade_recency: 2
clade_recency: "2M"
europe_6m:
clade_recency: 6
clade_recency: "6M"
north-america_1m:
clade_recency: 1
clade_recency: "1M"
north-america_2m:
clade_recency: 2
clade_recency: "2M"
north-america_6m:
clade_recency: 6
clade_recency: "6M"
oceania_1m:
clade_recency: 1
clade_recency: "1M"
oceania_2m:
clade_recency: 2
clade_recency: "2M"
oceania_6m:
clade_recency: 6
clade_recency: "6M"
south-america_1m:
clade_recency: 1
clade_recency: "1M"
south-america_2m:
clade_recency: 2
clade_recency: "2M"
south-america_6m:
clade_recency: 6
clade_recency: "6M"

# if different traits should be reconstructed for some builds, specify here
# otherwise the default trait config in defaults/parameters.yaml will be used
Expand Down
31 changes: 24 additions & 7 deletions scripts/assign-colors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import datetime
import isodate
import pandas as pd
from datetime import datetime, timedelta

# Forced colours MUST NOT appear in the ordering TSV
forced_colors = {
Expand All @@ -10,11 +11,27 @@ def date_within_last_n_months(date_str, cutoff_date):
if 'XX' in date_str:
return False # Ignore uncertain dates
try:
date = datetime.strptime(date_str, "%Y-%m-%d")
date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
return date >= cutoff_date
except ValueError:
return False


def relative_date(duration: str):
    """
    Return the date obtained by going back `duration` from today.

    `duration` is a backwards-looking ISO 8601 duration string. The leading
    'P' designator is optional, so '1W' and 'P1W' are treated the same.
    """
    # Hand the parser the full ISO 8601 form: prepend 'P' only when it is missing.
    iso_duration = duration if duration.startswith('P') else 'P' + duration
    today = datetime.date.today()
    return today - isodate.parse_duration(iso_duration)


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Assign colors based on ordering",
Expand All @@ -25,7 +42,10 @@ def date_within_last_n_months(date_str, cutoff_date):
parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree")
parser.add_argument('--clade-recency', type=int, help="if provided, restrict to clades found in tree within X months of present")
parser.add_argument('--clade-recency', type=relative_date, metavar='DURATION',
help="""if provided, restrict to clades found in tree within this time
frame. Format: ISO 8601 duration with optional P prefix (e.g. '1W',
'P1W')""")
parser.add_argument('--output', type=str, required=True, help="output colors tsv")
args = parser.parse_args()

Expand Down Expand Up @@ -62,16 +82,13 @@ def date_within_last_n_months(date_str, cutoff_date):
clades = json.load(fh)['nodes']

if args.clade_recency is not None and args.metadata:
# Calculate the cutoff date based on clade_recency (number of months ago from today)
cutoff_date = datetime.today() - timedelta(days=args.clade_recency * 30) # approximate months as 30 days

# Generate a set of present values within the specified recency
subset_present = set()
metadata = pd.read_csv(args.metadata, delimiter='\t')
for strain, info in clades.items():
if strain in metadata['strain'].values:
date_str = metadata.loc[metadata['strain'] == strain, 'date'].values[0]
if date_within_last_n_months(date_str, cutoff_date):
if date_within_last_n_months(date_str, args.clade_recency):
subset_present.add(info["clade_membership"])

# Restrict to only those present while maintaining order
Expand Down
27 changes: 23 additions & 4 deletions workflow/snakemake_rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,13 +1,31 @@
"""Small, shared functions used to generate inputs and parameters.
"""
import datetime
import isodate
from itertools import product
from shlex import (
quote as shquote, # shquote() is used in this file and also other workflow files
split as shsplitwords,
)
from urllib.parse import urlsplit

# TODO: deduplicate this with the same function in scripts/assign-colors.py.
# There is no easy way to share functions between the workflow and that file at
# the moment. One approach would be to surface it via Augur's Python API.
def relative_date(duration: str):
    """
    Turn a backwards-looking ISO 8601 duration into an absolute date.

    The duration is subtracted from the current date. The 'P' prefix may be
    omitted by the caller (e.g. both '1W' and 'P1W' are accepted).
    """
    today = datetime.date.today()
    if duration.startswith('P'):
        # Already in full ISO 8601 form; parse as given.
        return today - isodate.parse_duration(duration)
    # Shorthand without the designator: add 'P' before parsing.
    return today - isodate.parse_duration('P' + duration)

def shquotewords(s: str) -> str:
"""
Split string *s* into (POSIX) shell words, quote each word, and join them
Expand Down Expand Up @@ -183,10 +201,11 @@ def _get_clade_recency_argument(wildcards):
clade_recency_setting = _get_clade_recency_for_wildcards(wildcards)
if clade_recency_setting == "all":
return ""
elif isinstance(clade_recency_setting, int):
return "--clade-recency " + shquote(str(clade_recency_setting))
else:
raise Exception(f'clade_recency must be "all" or an integer number of months. Got: {clade_recency_setting!r}')
try:
relative_date(clade_recency_setting)
return "--clade-recency " + shquote(clade_recency_setting)
except:
raise Exception(f'clade_recency must be "all" or a duration string (e.g. "6M", "1Y"). Got: {clade_recency_setting!r}')

def _get_trait_columns_by_wildcards(wildcards):
if wildcards.build_name in config["traits"]:
Expand Down

0 comments on commit 0bd8251

Please sign in to comment.