Only include more recent context sequences

For "global" subsampling, stop treating the early (contextual) samples as everything from the origin of the pandemic to the beginning of the focal window (e.g., for the 6m analysis, from 2020 to 6 months ago). Instead, use a consistent 1 year of additional context. So, for 6m, the context window is 18 months ago to 6 months ago and the focal window is 6 months ago to present. Additionally, reduce the number of contextual sequences included, going from a 4:1 ratio of focal to context to a 10:1 ratio of focal to context.
nextstrain · Jul 24, 2024 · 125686d · 125686d
1 parent b6efca7
commit 125686d
Showing 1 changed file with 64 additions and 34 deletions.
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -580,48 +580,58 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 1m, n = 4120
+  # early is 1m to 13m, n = 412
+  # regions are proportional to population size
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -666,48 +676,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 2m, n = 4120
+  # early is 2m to 14m, n = 412
+  # regions are proportional to population size
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -752,48 +772,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 6m, n = 4120
+  # early is 6m to 18m, n = 412
+  # regions are proportional to population size
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -839,7 +869,7 @@ subsampling:
 
   # Custom subsampling logic for global region over all-time
   # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
+  # regions are proportional to population size
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"