diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml index 4e693219d..0e16b8bd9 100644 --- a/defaults/parameters.yaml +++ b/defaults/parameters.yaml @@ -136,7 +136,7 @@ ancestral: colors: default: - # Months back to color clades, if "all" then all clades are colored + # Amount of time back to color clades, if "all" then all clades are colored # Can be specified per build in builds.yaml clade_recency: "all" diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst index 90ef9396d..e317ab36b 100644 --- a/docs/src/reference/workflow-config-file.rst +++ b/docs/src/reference/workflow-config-file.rst @@ -955,7 +955,7 @@ colors clade_recency: "all" global-6m: # Override clade recency colors for "global-6m" build - clade_recency: 6 + clade_recency: "6M" Each named traits configuration (``default`` or build-named) supports the following attributes: @@ -965,9 +965,10 @@ Each named traits configuration (``default`` or build-named) supports the follow clade_recency ~~~~~~~~~~~~~ -- type: integer -- description: if integer value is provided, restrict to clades found in tree within X months of present -- default: ``all`` +- type: string +- format: `ISO 8601 <https://en.wikipedia.org/wiki/ISO_8601#Durations>`__ duration with optional ``P`` prefix (e.g. 
``2M``, ``18M``, ``1Y6M``) +- description: restrict to clades found in tree within this duration from present +- default: ``all`` (no restriction) traits ------ diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 5979e3ce5..d80b7f521 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -332,47 +332,47 @@ colors: default: clade_recency: "all" global_1m: - clade_recency: 1 + clade_recency: "1M" global_2m: - clade_recency: 2 + clade_recency: "2M" global_6m: - clade_recency: 6 + clade_recency: "6M" africa_1m: - clade_recency: 1 + clade_recency: "1M" africa_2m: - clade_recency: 2 + clade_recency: "2M" africa_6m: - clade_recency: 6 + clade_recency: "6M" asia_1m: - clade_recency: 1 + clade_recency: "1M" asia_2m: - clade_recency: 2 + clade_recency: "2M" asia_6m: - clade_recency: 6 + clade_recency: "6M" europe_1m: - clade_recency: 1 + clade_recency: "1M" europe_2m: - clade_recency: 2 + clade_recency: "2M" europe_6m: - clade_recency: 6 + clade_recency: "6M" north-america_1m: - clade_recency: 1 + clade_recency: "1M" north-america_2m: - clade_recency: 2 + clade_recency: "2M" north-america_6m: - clade_recency: 6 + clade_recency: "6M" oceania_1m: - clade_recency: 1 + clade_recency: "1M" oceania_2m: - clade_recency: 2 + clade_recency: "2M" oceania_6m: - clade_recency: 6 + clade_recency: "6M" south-america_1m: - clade_recency: 1 + clade_recency: "1M" south-america_2m: - clade_recency: 2 + clade_recency: "2M" south-america_6m: - clade_recency: 6 + clade_recency: "6M" # if different traits should be reconstructed for some builds, specify here # otherwise the default trait config in defaults/parameters.yaml will used diff --git a/scripts/assign-colors.py b/scripts/assign-colors.py index b0624340a..a73a55584 100644 --- a/scripts/assign-colors.py +++ b/scripts/assign-colors.py @@ -1,6 +1,7 @@ import argparse +import datetime +import isodate import 
pandas as pd -from datetime import datetime, timedelta # Forced colours MUST NOT appear in the ordering TSV forced_colors = { @@ -10,11 +11,27 @@ def date_within_last_n_months(date_str, cutoff_date): if 'XX' in date_str: return False # Ignore uncertain dates try: - date = datetime.strptime(date_str, "%Y-%m-%d") + date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date() return date >= cutoff_date except ValueError: return False + +def relative_date(duration: str): + """ + Convert an ISO 8601 duration to an absolute date by subtracting it from the + current date. + + `duration` should be a backwards-looking relative date in ISO 8601 duration + format with optional P prefix (e.g. '1W', 'P1W'). + """ + # isodate only parses durations that carry the leading 'P' designator, + # so add it when the caller used the shorthand form (e.g. '1W'). + if not duration.startswith('P'): + duration = 'P' + duration + return datetime.date.today() - isodate.parse_duration(duration) + + if __name__ == '__main__': parser = argparse.ArgumentParser( description="Assign colors based on ordering", @@ -25,7 +42,10 @@ def date_within_last_n_months(date_str, cutoff_date): parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file") parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata") parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree") - parser.add_argument('--clade-recency', type=int, help="if provided, restrict to clades found in tree within X months of present") + parser.add_argument('--clade-recency', type=relative_date, metavar='DURATION', + help="""if provided, restrict to clades found in tree within this time + frame. Format: ISO 8601 duration with optional P prefix (e.g. 
'1W', + 'P1W')""") parser.add_argument('--output', type=str, required=True, help="output colors tsv") args = parser.parse_args() @@ -62,16 +82,13 @@ def date_within_last_n_months(date_str, cutoff_date): clades = json.load(fh)['nodes'] if args.clade_recency is not None and args.metadata: - # Calculate the cutoff date based on clade_recency (number of months ago from today) - cutoff_date = datetime.today() - timedelta(days=args.clade_recency * 30) # approximate months as 30 days - # Generate a set of present values within the specified recency subset_present = set() metadata = pd.read_csv(args.metadata, delimiter='\t') for strain, info in clades.items(): if strain in metadata['strain'].values: date_str = metadata.loc[metadata['strain'] == strain, 'date'].values[0] - if date_within_last_n_months(date_str, cutoff_date): + if date_within_last_n_months(date_str, args.clade_recency): subset_present.add(info["clade_membership"]) # Restrict to only those present while maintaining order diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 9348efb55..744f47dd0 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -1,6 +1,7 @@ """Small, shared functions used to generate inputs and parameters. """ import datetime +import isodate from itertools import product from shlex import ( quote as shquote, # shquote() is used in this file and also other workflow files @@ -8,6 +9,23 @@ from shlex import ( ) from urllib.parse import urlsplit +# TODO: deduplicate this with the same function in scripts/assign-colors.py. +# There is no easy way to share functions between the workflow and that file at +# the moment. One approach would be to surface it via Augur's Python API. +def relative_date(duration: str): + """ + Convert an ISO 8601 duration to an absolute date by subtracting it from the + current date. 
+ + `duration` should be a backwards-looking relative date in ISO 8601 duration + format with optional P prefix (e.g. '1W', 'P1W'). + """ + # isodate only parses durations that carry the leading 'P' designator, + # so add it when the caller used the shorthand form (e.g. '1W'). + if not duration.startswith('P'): + duration = 'P' + duration + return datetime.date.today() - isodate.parse_duration(duration) + def shquotewords(s: str) -> str: """ Split string *s* into (POSIX) shell words, quote each word, and join them @@ -183,10 +201,11 @@ def _get_clade_recency_argument(wildcards): clade_recency_setting = _get_clade_recency_for_wildcards(wildcards) if clade_recency_setting == "all": return "" - elif isinstance(clade_recency_setting, int): - return "--clade-recency " + shquote(str(clade_recency_setting)) - else: - raise Exception(f'clade_recency must be "all" or an integer number of months. Got: {clade_recency_setting!r}') + try: + relative_date(clade_recency_setting) + except Exception as err: + raise Exception(f'clade_recency must be "all" or a duration string (e.g. "6M", "1Y"). Got: {clade_recency_setting!r}') from err + return "--clade-recency " + shquote(clade_recency_setting) def _get_trait_columns_by_wildcards(wildcards): if wildcards.build_name in config["traits"]: