Skip to content

Commit

Permalink
Use ISO duration format for clade recency
Browse files Browse the repository at this point in the history
Keeps consistent with other existing usage of this format.
  • Loading branch information
victorlin committed Sep 30, 2024
1 parent 82ca8d4 commit 2bf39d3
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 28 deletions.
42 changes: 21 additions & 21 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -800,47 +800,47 @@ colors:
default:
clade_recency: "all"
global_1m:
clade_recency: 1
clade_recency: "1M"
global_2m:
clade_recency: 2
clade_recency: "2M"
global_6m:
clade_recency: 6
clade_recency: "6M"
africa_1m:
clade_recency: 1
clade_recency: "1M"
africa_2m:
clade_recency: 2
clade_recency: "2M"
africa_6m:
clade_recency: 6
clade_recency: "6M"
asia_1m:
clade_recency: 1
clade_recency: "1M"
asia_2m:
clade_recency: 2
clade_recency: "2M"
asia_6m:
clade_recency: 6
clade_recency: "6M"
europe_1m:
clade_recency: 1
clade_recency: "1M"
europe_2m:
clade_recency: 2
clade_recency: "2M"
europe_6m:
clade_recency: 6
clade_recency: "6M"
north-america_1m:
clade_recency: 1
clade_recency: "1M"
north-america_2m:
clade_recency: 2
clade_recency: "2M"
north-america_6m:
clade_recency: 6
clade_recency: "6M"
oceania_1m:
clade_recency: 1
clade_recency: "1M"
oceania_2m:
clade_recency: 2
clade_recency: "2M"
oceania_6m:
clade_recency: 6
clade_recency: "6M"
south-america_1m:
clade_recency: 1
clade_recency: "1M"
south-america_2m:
clade_recency: 2
clade_recency: "2M"
south-america_6m:
clade_recency: 6
clade_recency: "6M"

# if different traits should be reconstructed for some builds, specify here
# otherwise the default trait config in defaults/parameters.yaml will be used
Expand Down
31 changes: 24 additions & 7 deletions scripts/assign-colors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import datetime
import isodate
import pandas as pd
from datetime import datetime, timedelta

# Forced colours MUST NOT appear in the ordering TSV
forced_colors = {
Expand All @@ -10,11 +11,27 @@ def date_within_last_n_months(date_str, cutoff_date):
if 'XX' in date_str:
return False # Ignore uncertain dates
try:
date = datetime.strptime(date_str, "%Y-%m-%d")
date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
return date >= cutoff_date
except ValueError:
return False


def relative_date(duration: str) -> datetime.datetime:
    """
    Convert an ISO 8601 duration to an absolute cutoff by subtracting it from
    the start of the current day.

    `duration` should be a backwards-looking relative date in ISO 8601 duration
    format with optional P prefix (e.g. '1W', 'P1W').

    Returns a `datetime.datetime` at midnight of the current local day rather
    than a plain `datetime.date`. The cutoff is compared against values
    produced by `datetime.datetime.strptime`, and Python raises TypeError when
    a `datetime` is ordered against a bare `date`. `datetime.datetime` is a
    subclass of `datetime.date`, so the result is still usable as a date.
    """
    # ISO 8601 durations start with 'P'; add it when the caller omitted it.
    if not duration.startswith('P'):
        duration = 'P' + duration

    # Anchor at midnight so the cutoff stays date-granular: a sample dated
    # exactly on the cutoff day still counts as within the time frame.
    start_of_today = datetime.datetime.combine(
        datetime.date.today(), datetime.time.min
    )
    return start_of_today - isodate.parse_duration(duration)


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Assign colors based on ordering",
Expand All @@ -25,7 +42,10 @@ def date_within_last_n_months(date_str, cutoff_date):
parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree")
parser.add_argument('--clade-recency', type=int, help="if provided, restrict to clades found in tree within X months of present")
parser.add_argument('--clade-recency', type=relative_date, metavar='DURATION',
help="""if provided, restrict to clades found in tree within this time
frame. Format: ISO 8601 duration with optional P prefix (e.g. '1W',
'P1W')""")
parser.add_argument('--output', type=str, required=True, help="output colors tsv")
args = parser.parse_args()

Expand Down Expand Up @@ -62,16 +82,13 @@ def date_within_last_n_months(date_str, cutoff_date):
clades = json.load(fh)['nodes']

if args.clade_recency is not None and args.metadata:
# Calculate the cutoff date based on clade_recency (number of months ago from today)
cutoff_date = datetime.today() - timedelta(days=args.clade_recency * 30) # approximate months as 30 days

# Generate a set of present values within the specified recency
subset_present = set()
metadata = pd.read_csv(args.metadata, delimiter='\t')
for strain, info in clades.items():
if strain in metadata['strain'].values:
date_str = metadata.loc[metadata['strain'] == strain, 'date'].values[0]
if date_within_last_n_months(date_str, cutoff_date):
if date_within_last_n_months(date_str, args.clade_recency):
subset_present.add(info["clade_membership"])

# Restrict to only those present while maintaining order
Expand Down

0 comments on commit 2bf39d3

Please sign in to comment.