diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 170963e80..ae9c22f65 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -800,47 +800,47 @@ colors:
   default:
     clade_recency: "all"
   global_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   global_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   global_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   africa_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   africa_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   africa_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   asia_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   asia_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   asia_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   europe_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   europe_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   europe_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   north-america_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   north-america_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   north-america_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   oceania_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   oceania_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   oceania_6m:
-    clade_recency: 6
+    clade_recency: "6M"
   south-america_1m:
-    clade_recency: 1
+    clade_recency: "1M"
   south-america_2m:
-    clade_recency: 2
+    clade_recency: "2M"
   south-america_6m:
-    clade_recency: 6
+    clade_recency: "6M"
 
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
diff --git a/scripts/assign-colors.py b/scripts/assign-colors.py
index b0624340a..679a77d3f 100644
--- a/scripts/assign-colors.py
+++ b/scripts/assign-colors.py
@@ -1,6 +1,7 @@
 import argparse
+import datetime
+import isodate
 import pandas as pd
-from datetime import datetime, timedelta
 
 # Forced colours MUST NOT appear in the ordering TSV
 forced_colors = {
@@ -10,11 +11,27 @@ def date_within_last_n_months(date_str, cutoff_date):
     if 'XX' in date_str:
         return False  # Ignore uncertain dates
     try:
-        date = datetime.strptime(date_str, "%Y-%m-%d")
+        date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()  # cutoff_date is a datetime.date; datetime vs date ordering raises TypeError
         return date >= cutoff_date
     except ValueError:
         return False
 
+
+def relative_date(duration: str):
+    """
+    Convert an ISO 8601 duration to an absolute date by subtracting it from the
+    current date.
+
+    `duration` should be a backwards-looking relative date in ISO 8601 duration
+    format with optional P prefix (e.g. '1W', 'P1W').
+    """
+    if duration.startswith('P'):
+        duration = duration
+    else:
+        duration = 'P' + duration
+    return datetime.date.today() - isodate.parse_duration(duration)
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description="Assign colors based on ordering",
@@ -25,7 +42,10 @@ def date_within_last_n_months(date_str, cutoff_date):
     parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
     parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
     parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree")
-    parser.add_argument('--clade-recency', type=int, help="if provided, restrict to clades found in tree within X months of present")
+    parser.add_argument('--clade-recency', type=relative_date, metavar='DURATION',
+                        help="""if provided, restrict to clades found in tree within this time
+                                frame. Format: ISO 8601 duration with optional P prefix (e.g. '1W',
+                                'P1W')""")
     parser.add_argument('--output', type=str, required=True, help="output colors tsv")
     args = parser.parse_args()
 
@@ -62,16 +82,13 @@ def date_within_last_n_months(date_str, cutoff_date):
             clades = json.load(fh)['nodes']
 
     if args.clade_recency is not None and args.metadata:
-        # Calculate the cutoff date based on clade_recency (number of months ago from today)
-        cutoff_date = datetime.today() - timedelta(days=args.clade_recency * 30)  # approximate months as 30 days
-
         # Generate a set of present values within the specified recency
         subset_present = set()
         metadata = pd.read_csv(args.metadata, delimiter='\t')
         for strain, info in clades.items():
             if strain in metadata['strain'].values:
                 date_str = metadata.loc[metadata['strain'] == strain, 'date'].values[0]
-                if date_within_last_n_months(date_str, cutoff_date):
+                if date_within_last_n_months(date_str, args.clade_recency):
                     subset_present.add(info["clade_membership"])
 
         # Restrict to only those present while maintaining order