From 9515a0458b01457c1d552d997cc8751e20b64c78 Mon Sep 17 00:00:00 2001
From: Becky Sweger <rsweger@umass.edu>
Date: Tue, 7 Jan 2025 10:49:19 -0500
Subject: [PATCH 1/3] change collection_min default to tree_as_of -  90 days

---
 src/get_target_data.py | 53 ++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/src/get_target_data.py b/src/get_target_data.py
index 87ad2570..bd4927db 100644
--- a/src/get_target_data.py
+++ b/src/get_target_data.py
@@ -1,5 +1,4 @@
 """
-Create time series and oracle output target data for a specific modeling round.
 
 This script wraps the cladetime package, which generates clade counts using the
 GenBank-based Sars-CoV-2 sequence metadata from Nextstrain.
@@ -7,6 +6,7 @@
 
 The script is scheduled to run every Wednesday evening (US Eastern)
 
+Create time series and oracle output target data for a specific modeling round.
 To run the script manually:
 1. Install uv on your machine: https://docs.astral.sh/uv/getting-started/installation/
 2. From the root of this repo:
@@ -123,15 +123,6 @@ def set_sequence_as_of(ctx, param, value):
     return value
 
 
-def set_collection_min_date(ctx, param, value):
-    """Set the collection_min_date default value to nowcast date minus 31 days."""
-    if value is None:
-        nowcast_date = ctx.params.get("nowcast_date")
-        value = nowcast_date + timedelta(days=-31)
-    value = value.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc)
-    return value
-
-
 def set_collection_max_date(ctx, param, value):
     """Set the collection_max_date default value to nowcast date plus 10 days."""
     if value is None:
@@ -169,7 +160,7 @@ def set_collection_max_date(ctx, param, value):
     type=click.DateTime(formats=["%Y-%m-%d"]),
     required=False,
     default=None,
-    callback=set_collection_min_date,
+    callback=normalize_date,
     help="Assign clades to sequences collected on or after this UTC date (YYYY-MM-DD). Default is the nowcast date minus 31 days.",
 )
 @click.option(
@@ -195,8 +186,6 @@ def main(
     collection_max_date: datetime,
     target_data_dir: Path,
 ) -> tuple[Path, Path]:
-    """Get clade counts and save to S3 bucket."""
-
     # Date for retrieving sequences cannot be in the future
     if sequence_as_of > datetime.now(tz=timezone.utc):
         logger.info(
@@ -232,6 +221,9 @@ def main(
         else:
             tree_as_of = datetime.fromisoformat(modeled_clades["meta"]["created_at"])
 
+    if collection_min_date is None:
+        collection_min_date = tree_as_of - timedelta(days=90)
+
     assignments = assign_clades(
         nowcast_date,
         sequence_as_of,
@@ -417,10 +409,10 @@ def mock_command():
     result = normalize_date(ctx, param, datetime(2024, 11, 11, 11, 11, 11))
     assert result == datetime(2024, 11, 11, 23, 59, 59, tzinfo=timezone.utc)
 
-    # default collection_min_date is 31 days before the nowcast date
+    # if collection_min_date is provided, it should be set to end of day UTC
     param = Option(["--collection-min-date"])
-    result = set_collection_min_date(ctx, param, None)
-    assert result == datetime(2024, 9, 1, 23, 59, 59, tzinfo=timezone.utc)
+    result = normalize_date(ctx, param, datetime(2024, 11, 11, 11, 11, 11))
+    assert result == datetime(2024, 11, 11, 23, 59, 59, tzinfo=timezone.utc)
 
     # default collection_max_date is 10 days after the nowcast date
     param = Option(["--collection-max-date"])
@@ -491,7 +483,7 @@ def test_target_data():
     time_series, oracle = create_target_data(
         test_assignments,
         test_clade_list,
-        "2024-9-11",
+        "2024-09-11",
         "2024-12-17",
         test_min_date,
         test_max_date,
@@ -518,7 +510,7 @@ def test_target_data():
     assert ts.get_column("target_date").min() == date(2024, 11, 30)
     assert ts.get_column("target_date").max() == date(2024, 12, 4)
     assert ts.get_column("observation").sum() == 20
-    assert ts.get_column("nowcast_date").unique().to_list() == ["2024-9-11"]
+    assert ts.get_column("nowcast_date").unique().to_list() == ["2024-09-11"]
     assert ts.get_column("sequence_as_of").unique().to_list() == ["2024-12-17"]
     assert ts.get_column("tree_as_of").unique().to_list() == ["2024-08-01"]
 
@@ -541,7 +533,8 @@ def test_target_data():
 def test_target_data_integration(caplog, tmp_path):
     """
     If the modeled-clades file doesn't have meta.created_at, tree_as_of should default to
-    nowcast_date - two days.
+    nowcast_date - two days. Additionally, when collection_min_date isn't provided,
+    it should default to tree_as_of - 90 days.
     """
     caplog.set_level(logging.INFO)
 
@@ -563,15 +556,25 @@ def test_target_data_integration(caplog, tmp_path):
     ts = pl.read_parquet(result.return_value[0])
 
     # sequence date should default to nowcast_date + 90 days
-    assert "sequence_as_of=2024-12-10" in caplog.text
+    assert "sequence_as_of=2024-12-10" in caplog.text.lower()
     # tasks.json for 2024-09-11 doesn't have a meta.created_at field, so tree_as_of = nowcast_date - 2 days
-    assert "tree_as_of=2024-09-09" in caplog.text
+    assert "tree_as_of=2024-09-09" in caplog.text.lower()
 
-    # time series target dates should be limited to dates that match collection date min/max options
-    # (2024-08-11 to 2024-09-21 is 43 days, but both have 11:59:59 timestamps, so we'd expect
-    # 42 days in the time series)
+    # number of unique dates in the time series target should be the number of
+    # days between collection_min_date (tree_as_of = 90 days) and the
+    # collection_max_date (nowcast_date + 10 days), inclusive
+    nowcast_datetime = datetime.fromisoformat(nowcast_date).replace(
+        hour=11, minute=59, second=59, tzinfo=timezone.utc
+    )
+    tree_as_of_datetime = datetime.fromisoformat("2024-09-09").replace(
+        hour=11, minute=59, second=59, tzinfo=timezone.utc
+    )
+    expected_num_days = (
+        (nowcast_datetime + timedelta(days=10))
+        - (tree_as_of_datetime - timedelta(days=90))
+    ).days + 1
     target_dates = ts["target_date"].unique().to_list()
-    assert len(target_dates) == 42
+    assert len(target_dates) == expected_num_days
 
     modeled_clades_path = Path("auxiliary-data/modeled-clades") / f"{nowcast_date}.json"
     modeled_clades_json = json.loads(modeled_clades_path.read_text(encoding="utf-8"))

From 50feb952ecffa7d7c2829d17b4e977306f39ad8a Mon Sep 17 00:00:00 2001
From: Becky Sweger <rsweger@umass.edu>
Date: Tue, 7 Jan 2025 12:20:08 -0500
Subject: [PATCH 2/3] change collect_min default in oracle output

---
 src/get_target_data.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/get_target_data.py b/src/get_target_data.py
index bd4927db..d3f9946a 100644
--- a/src/get_target_data.py
+++ b/src/get_target_data.py
@@ -1,4 +1,5 @@
 """
+Create time series and oracle output target data for a specific modeling round.
 
 This script wraps the cladetime package, which generates clade counts using the
 GenBank-based Sars-CoV-2 sequence metadata from Nextstrain.
@@ -6,7 +7,6 @@
 
 The script is scheduled to run every Wednesday evening (US Eastern)
 
-Create time series and oracle output target data for a specific modeling round.
 To run the script manually:
 1. Install uv on your machine: https://docs.astral.sh/uv/getting-started/installation/
 2. From the root of this repo:
@@ -335,6 +335,12 @@ def create_target_data(
 
     oracle_output = (
         time_series_all.select(["location", "target_date", "clade", "observation"])
+        # for oracle output, include only sequence collection dates that are >=
+        # nowcast_date - 31 days
+        .filter(
+            pl.col("target_date")
+            >= datetime.fromisoformat(nowcast_string) - timedelta(days=31)
+        )
         .with_columns(pl.lit(nowcast_string).alias("nowcast_date"))
         .rename({"observation": "oracle_value"})
     )
@@ -472,11 +478,11 @@ def test_target_data():
         "clade_nextstrain": ["AA", "BB", "CC", "DD", "BB"],
         "count": [2, 3, 4, 5, 6],
     }
-    test_assignments: Clade = Clade(
+    test_assignments = Clade(
         {"tree_as_of": datetime(2024, 8, 1, 14, 30, 40)},
         pl.LazyFrame(),
         pl.LazyFrame(test_summary),
-    )  # type: ignore
+    )
     test_clade_list = ["AA", "BB", "other"]
     test_min_date = datetime(2024, 11, 30, tzinfo=timezone.utc)
     test_max_date = datetime(2024, 12, 4, tzinfo=timezone.utc)
@@ -599,7 +605,21 @@ def test_target_data_integration(caplog, tmp_path):
     len(target_dates) * len(state_list) * len(modeled_clades) == ts.height
 
     oracle = pl.read_parquet(result.return_value[1])
-    assert oracle.height == ts.height
+
+    oracle_min_date = oracle["target_date"].min()
+    oracle_max_date = oracle["target_date"].max()
+    assert (
+        oracle_min_date
+        == (datetime.strptime(nowcast_date, "%Y-%m-%d") - timedelta(days=31)).date()
+    )
+    assert (
+        oracle_max_date
+        == (datetime.strptime(nowcast_date, "%Y-%m-%d") + timedelta(days=10)).date()
+    )
+
+    # oracle series rows should = number of oracle target dates * total locations * total clades
+    expected_num_days = (oracle_max_date - oracle_min_date).days + 1
+    assert oracle.height == expected_num_days * len(state_list) * len(modeled_clades)
 
     oracle_clades = oracle["clade"].unique().to_list()
     assert len(modeled_clades) == len(oracle_clades)

From 929e737d43281b9a9db2be38c50e0484c2afc962 Mon Sep 17 00:00:00 2001
From: Becky Sweger <bsweger@gmail.com>
Date: Wed, 8 Jan 2025 11:42:38 -0500
Subject: [PATCH 3/3] Update src/get_target_data.py

Co-authored-by: Evan Ray <elray1@users.noreply.github.com>
---
 src/get_target_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/get_target_data.py b/src/get_target_data.py
index d3f9946a..df4cb96d 100644
--- a/src/get_target_data.py
+++ b/src/get_target_data.py
@@ -567,7 +567,7 @@ def test_target_data_integration(caplog, tmp_path):
     assert "tree_as_of=2024-09-09" in caplog.text.lower()
 
     # number of unique dates in the time series target should be the number of
-    # days between collection_min_date (tree_as_of = 90 days) and the
+    # days between collection_min_date (tree_as_of - 90 days) and the
     # collection_max_date (nowcast_date + 10 days), inclusive
     nowcast_datetime = datetime.fromisoformat(nowcast_date).replace(
         hour=11, minute=59, second=59, tzinfo=timezone.utc