From 37c68b8729e291e821e5fc1eeebd3767bc408215 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:03:44 -0700
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Read=20from=20subsampling=20conf?=
 =?UTF-8?q?ig=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 nextstrain_profiles/100k/config-gisaid.yaml   |   3 +-
 nextstrain_profiles/100k/config-open.yaml     |  12 +-
 nextstrain_profiles/nextstrain-ci/builds.yaml |  22 +-
 .../nextstrain-country/builds.yaml            | 123 +--
 .../nextstrain-gisaid-21L/builds.yaml         | 802 +-----------------
 .../nextstrain-gisaid/builds.yaml             | 802 +-----------------
 .../nextstrain-open/builds.yaml               | 778 +----------------
 workflow/snakemake_rules/common.smk           |   2 +-
 workflow/snakemake_rules/main_workflow.smk    |  30 +-
 9 files changed, 115 insertions(+), 2459 deletions(-)

diff --git a/nextstrain_profiles/100k/config-gisaid.yaml b/nextstrain_profiles/100k/config-gisaid.yaml
index 27a84c882..ca091f23a 100644
--- a/nextstrain_profiles/100k/config-gisaid.yaml
+++ b/nextstrain_profiles/100k/config-gisaid.yaml
@@ -15,7 +15,7 @@ inputs:
 
 builds:
   100k:
-    subsampling_scheme: 100k_scheme
+    subsampling_scheme: subsampling/100k.yaml
 
 # mapping of remote: local files to be uploaded under S3_DST_BUCKET
 upload:
@@ -26,6 +26,7 @@ upload:
 filter:
   exclude_where: "division='USA'"
 
+# FIXME: move the comment below somewhere else, then delete the entire entry.
 # We wish to subsample 50k in the previous 12 months and 50k prior to that.
 # Note 1: both --max-date and --min-date are inclusive of the boundary date,
 #         so sequences from that date will be available to both sub-samples
diff --git a/nextstrain_profiles/100k/config-open.yaml b/nextstrain_profiles/100k/config-open.yaml
index 0702c92ee..9f390e4dd 100644
--- a/nextstrain_profiles/100k/config-open.yaml
+++ b/nextstrain_profiles/100k/config-open.yaml
@@ -12,19 +12,9 @@ inputs:
     skip_sanitize_metadata: true
 builds:
   100k:
-    subsampling_scheme: 100k_scheme
+    subsampling_scheme: subsampling/100k.yaml
 upload:
   metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz
   sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz
 filter:
   exclude_where: "division='USA'"
-subsampling:
-  100k_scheme:
-    50k_early:
-      group_by: "year month country"
-      max_sequences: 50000
-      max_date: "--max-date 1Y"
-    50k_late:
-      group_by: "year month country"
-      max_sequences: 50000
-      min_date: "--min-date 1Y"
diff --git a/nextstrain_profiles/nextstrain-ci/builds.yaml b/nextstrain_profiles/nextstrain-ci/builds.yaml
index b5352214c..3ad13191e 100644
--- a/nextstrain_profiles/nextstrain-ci/builds.yaml
+++ b/nextstrain_profiles/nextstrain-ci/builds.yaml
@@ -11,27 +11,7 @@ builds:
   # Override the default Nextstrain European build's subsampling scheme for more
   # stable subsampling of a fixed dataset in continuous integration tests.
   europe:
-    subsampling_scheme: nextstrain_ci_sampling
-    region: Europe
-
-subsampling:
-  # Custom subsampling logic for CI tests.
-  nextstrain_ci_sampling:
-    # Focal samples for region
-    region:
-      group_by: "division year month"
-      max_sequences: 20
-      sampling_scheme: "--no-probabilistic-sampling"
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples for region from the rest of the world
-    global:
-      group_by: "year month"
-      max_sequences: 10
-      sampling_scheme: "--no-probabilistic-sampling"
-      exclude: "--exclude-where 'region={region}'"
-      priorities:
-        type: "proximity"
-        focus: "region"
+    subsampling_scheme: subsampling/nextstrain_ci_sampling.yaml
 
 # Override default frequency settings, so we can estimate frequencies from older
 # data with a fixed time range.
diff --git a/nextstrain_profiles/nextstrain-country/builds.yaml b/nextstrain_profiles/nextstrain-country/builds.yaml
index 529f201f3..6041b6794 100644
--- a/nextstrain_profiles/nextstrain-country/builds.yaml
+++ b/nextstrain_profiles/nextstrain-country/builds.yaml
@@ -39,137 +39,22 @@ inputs:
 # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
 builds:
   nextstrain_country_1m:
-    subsampling_scheme: nextstrain_country_1m
+    subsampling_scheme: subsampling/nextstrain_country_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past month
-    country: India
   nextstrain_country_2m:
-    subsampling_scheme: nextstrain_country_2m
+    subsampling_scheme: subsampling/nextstrain_country_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 2 months
-    country: India
   nextstrain_country_6m:
-    subsampling_scheme: nextstrain_country_6m
+    subsampling_scheme: subsampling/nextstrain_country_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 6 months
-    country: India
   nextstrain_country_all-time:
-    subsampling_scheme: nextstrain_country_all-time
+    subsampling_scheme: subsampling/nextstrain_country_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling since pandemic start
-    country: India
 
 # remove sequences without division label in US
 filter:
   exclude_where: "division='USA'"
 
-subsampling:
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_country_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country={country}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country={country}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_country_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country={country}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country={country}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_country_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country={country}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!={country}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country={country}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_country_all-time:
-    # Focal samples for country
-    focal:
-      group_by: "division year month"
-      max_sequences: 640
-      exclude: "--exclude-where 'country!={country}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 160
-      exclude: "--exclude-where 'country={country}'"
-
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
index 24519a426..1a291333e 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -46,115 +46,91 @@ inputs:
 # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
 builds:
   reference:
-    subsampling_scheme: nextstrain_reference
+    subsampling_scheme: subsampling/reference.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with clade-focused subsampling
   global_1m:
-    subsampling_scheme: nextstrain_global_1m
+    subsampling_scheme: subsampling/global_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past month
   global_2m:
-    subsampling_scheme: nextstrain_global_2m
+    subsampling_scheme: subsampling/global_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 2 months
   global_6m:
-    subsampling_scheme: nextstrain_global_6m
+    subsampling_scheme: subsampling/global_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 6 months
   global_all-time:
-    subsampling_scheme: nextstrain_global_all_time
+    subsampling_scheme: subsampling/global_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally since pandemic start
   africa_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: Africa
+    subsampling_scheme: subsampling/africa_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past month
   africa_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: Africa
+    subsampling_scheme: subsampling/africa_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 2 months
   africa_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: Africa
+    subsampling_scheme: subsampling/africa_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 6 months
   africa_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: Africa
+    subsampling_scheme: subsampling/africa_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start
   asia_1m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
-    region: Asia
+    subsampling_scheme: subsampling/asia_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month
   asia_2m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
-    region: Asia
+    subsampling_scheme: subsampling/asia_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months
   asia_6m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
-    region: Asia
+    subsampling_scheme: subsampling/asia_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months
   asia_all-time:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
-    region: Asia
+    subsampling_scheme: subsampling/asia_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start
   europe_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: Europe
+    subsampling_scheme: subsampling/europe_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past month
   europe_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: Europe
+    subsampling_scheme: subsampling/europe_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 2 months
   europe_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: Europe
+    subsampling_scheme: subsampling/europe_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 6 months
   europe_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: Europe
+    subsampling_scheme: subsampling/europe_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe since pandemic start
   north-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
-    region: North America
+    subsampling_scheme: subsampling/north-america_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past month
   north-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
-    region: North America
+    subsampling_scheme: subsampling/north-america_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 2 months
   north-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
-    region: North America
+    subsampling_scheme: subsampling/north-america_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 6 months
   north-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
-    region: North America
+    subsampling_scheme: subsampling/north-america_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America since pandemic start
   oceania_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past month
   oceania_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 2 months
   oceania_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 6 months
   oceania_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania since pandemic start
   south-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: South America
+    subsampling_scheme: subsampling/south-america_1m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past month
   south-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: South America
+    subsampling_scheme: subsampling/south-america_2m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 2 months
   south-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: South America
+    subsampling_scheme: subsampling/south-america_6m.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 6 months
   south-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: South America
+    subsampling_scheme: subsampling/south-america_all-time.yaml
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America since pandemic start
 
 # remove sequences without division label in US and sequences from prior to clade 21L
@@ -162,726 +138,6 @@ filter:
   exclude_where: "division='USA'"
   min_date: "2022-01-01"
 
-subsampling:
-
-  # Custom subsampling logic for group by clade
-  nextstrain_reference:
-    clades:
-      group_by: "Nextstrain_clade"
-      max_sequences: 300
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "division year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_1m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_2m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_6m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_all_time:
-    # Focal samples for Asia
-    asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "country year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_1m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_2m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_6m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country year month"
-      max_sequences: 600
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country year month"
-      max_sequences: 800
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country year month"
-      max_sequences: 500
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division year month"
-      max_sequences: 400
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country year month"
-      max_sequences: 360
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division year month"
-      max_sequences: 60
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over all-time
-  # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_all_time:
-    africa:
-      group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Africa'"
-    asia:
-      group_by: "country year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=China'"
-    europe:
-      group_by: "country year month"
-      max_sequences: 625
-      exclude: "--exclude-where 'region!=Europe'"
-    india:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=India'"
-    north_america:
-      group_by: "division year month"
-      max_sequences: 500
-      exclude: "--exclude-where 'region!=North America'"
-    south_america:
-      group_by: "country year month"
-      max_sequences: 450
-      exclude: "--exclude-where 'region!=South America'"
-    oceania:
-      group_by: "division year month"
-      max_sequences: 75
-      exclude: "--exclude-where 'region!=Oceania'"
-
 # Root to clade 21L
 refine:
   root: "21L"
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index dfd006a28..6397ced86 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -39,841 +39,97 @@ inputs:
 # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
 builds:
   reference:
-    subsampling_scheme: nextstrain_reference
+    subsampling_scheme: subsampling/reference.yaml
     title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling
   global_1m:
-    subsampling_scheme: nextstrain_global_1m
+    subsampling_scheme: subsampling/global_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past month
   global_2m:
-    subsampling_scheme: nextstrain_global_2m
+    subsampling_scheme: subsampling/global_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 2 months
   global_6m:
-    subsampling_scheme: nextstrain_global_6m
+    subsampling_scheme: subsampling/global_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months
   global_all-time:
-    subsampling_scheme: nextstrain_global_all_time
+    subsampling_scheme: subsampling/global_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start
   africa_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: Africa
+    subsampling_scheme: subsampling/africa_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past month
   africa_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: Africa
+    subsampling_scheme: subsampling/africa_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 2 months
   africa_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: Africa
+    subsampling_scheme: subsampling/africa_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months
   africa_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: Africa
+    subsampling_scheme: subsampling/africa_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start
   asia_1m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
-    region: Asia
+    subsampling_scheme: subsampling/asia_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month
   asia_2m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
-    region: Asia
+    subsampling_scheme: subsampling/asia_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months
   asia_6m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
-    region: Asia
+    subsampling_scheme: subsampling/asia_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months
   asia_all-time:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
-    region: Asia
+    subsampling_scheme: subsampling/asia_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start
   europe_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: Europe
+    subsampling_scheme: subsampling/europe_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past month
   europe_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: Europe
+    subsampling_scheme: subsampling/europe_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 2 months
   europe_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: Europe
+    subsampling_scheme: subsampling/europe_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months
   europe_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: Europe
+    subsampling_scheme: subsampling/europe_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start
   north-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
-    region: North America
+    subsampling_scheme: subsampling/north-america_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past month
   north-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
-    region: North America
+    subsampling_scheme: subsampling/north-america_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 2 months
   north-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
-    region: North America
+    subsampling_scheme: subsampling/north-america_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months
   north-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
-    region: North America
+    subsampling_scheme: subsampling/north-america_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start
   oceania_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past month
   oceania_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 2 months
   oceania_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months
   oceania_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
-    region: Oceania
+    subsampling_scheme: subsampling/oceania_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start
   south-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
-    region: South America
+    subsampling_scheme: subsampling/south-america_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past month
   south-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
-    region: South America
+    subsampling_scheme: subsampling/south-america_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 2 months
   south-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
-    region: South America
+    subsampling_scheme: subsampling/south-america_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months
   south-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
-    region: South America
+    subsampling_scheme: subsampling/south-america_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start
 
 # remove sequences without division label in US
 filter:
   exclude_where: "division='USA'"
 
-subsampling:
-
-  # Custom subsampling logic for group by clade
-  nextstrain_reference:
-    clades:
-      group_by: "Nextstrain_clade"
-      max_sequences: 300
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "division year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_1m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_2m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_6m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_all_time:
-    # Focal samples for Asia
-    asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "country year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_1m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_2m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_6m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country year month"
-      max_sequences: 600
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country year month"
-      max_sequences: 800
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country year month"
-      max_sequences: 500
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division year month"
-      max_sequences: 400
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country year month"
-      max_sequences: 360
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division year month"
-      max_sequences: 60
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over all-time
-  # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_all_time:
-    africa:
-      group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Africa'"
-    asia:
-      group_by: "country year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=China'"
-    europe:
-      group_by: "country year month"
-      max_sequences: 625
-      exclude: "--exclude-where 'region!=Europe'"
-    india:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=India'"
-    north_america:
-      group_by: "division year month"
-      max_sequences: 500
-      exclude: "--exclude-where 'region!=North America'"
-    south_america:
-      group_by: "country year month"
-      max_sequences: 450
-      exclude: "--exclude-where 'region!=South America'"
-    oceania:
-      group_by: "division year month"
-      max_sequences: 75
-      exclude: "--exclude-where 'region!=Oceania'"
-
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
index aa26e0101..95999bf8a 100644
--- a/nextstrain_profiles/nextstrain-open/builds.yaml
+++ b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -39,114 +39,114 @@ inputs:
 # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
 builds:
   reference:
-    subsampling_scheme: nextstrain_reference
+    subsampling_scheme: subsampling/reference.yaml
     title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling
   global_1m:
-    subsampling_scheme: nextstrain_global_1m
+    subsampling_scheme: subsampling/global_1m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past month
   global_2m:
-    subsampling_scheme: nextstrain_global_2m
+    subsampling_scheme: subsampling/global_2m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 2 months
   global_6m:
-    subsampling_scheme: nextstrain_global_6m
+    subsampling_scheme: subsampling/global_6m.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months
   global_all-time:
-    subsampling_scheme: nextstrain_global_all_time
+    subsampling_scheme: subsampling/global_all-time.yaml
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start
   africa_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    subsampling_scheme: subsampling/africa_1m.yaml
     region: Africa
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past month
   africa_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    subsampling_scheme: subsampling/africa_2m.yaml
     region: Africa
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 2 months
   africa_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    subsampling_scheme: subsampling/africa_6m.yaml
     region: Africa
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months
   africa_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    subsampling_scheme: subsampling/africa_all-time.yaml
     region: Africa
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start
   asia_1m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
+    subsampling_scheme: subsampling/asia_1m.yaml
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month
   asia_2m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
+    subsampling_scheme: subsampling/asia_2m.yaml
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months
   asia_6m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
+    subsampling_scheme: subsampling/asia_6m.yaml
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months
   asia_all-time:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
+    subsampling_scheme: subsampling/asia_all-time.yaml
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start
   europe_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    subsampling_scheme: subsampling/europe_1m.yaml
     region: Europe
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past month
   europe_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    subsampling_scheme: subsampling/europe_2m.yaml
     region: Europe
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 2 months
   europe_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    subsampling_scheme: subsampling/europe_6m.yaml
     region: Europe
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months
   europe_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    subsampling_scheme: subsampling/europe_all-time.yaml
     region: Europe
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start
   north-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
+    subsampling_scheme: subsampling/north-america_1m.yaml
     region: North America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past month
   north-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
+    subsampling_scheme: subsampling/north-america_2m.yaml
     region: North America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 2 months
   north-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
+    subsampling_scheme: subsampling/north-america_6m.yaml
     region: North America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months
   north-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
+    subsampling_scheme: subsampling/north-america_all-time.yaml
     region: North America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start
   oceania_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_1m
+    subsampling_scheme: subsampling/oceania_1m.yaml
     region: Oceania
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past month
   oceania_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_2m
+    subsampling_scheme: subsampling/oceania_2m.yaml
     region: Oceania
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 2 months
   oceania_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_division_6m
+    subsampling_scheme: subsampling/oceania_6m.yaml
     region: Oceania
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months
   oceania_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_division_all_time
+    subsampling_scheme: subsampling/oceania_all-time.yaml
     region: Oceania
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start
   south-america_1m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_1m
+    subsampling_scheme: subsampling/south-america_1m.yaml
     region: South America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past month
   south-america_2m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_2m
+    subsampling_scheme: subsampling/south-america_2m.yaml
     region: South America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 2 months
   south-america_6m:
-    subsampling_scheme: nextstrain_region_grouped_by_country_6m
+    subsampling_scheme: subsampling/south-america_6m.yaml
     region: South America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months
   south-america_all-time:
-    subsampling_scheme: nextstrain_region_grouped_by_country_all_time
+    subsampling_scheme: subsampling/south-america_all-time.yaml
     region: South America
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start
 
@@ -154,726 +154,6 @@ builds:
 filter:
   exclude_where: "division='USA'"
 
-subsampling:
-
-  # Custom subsampling logic for group by clade
-  nextstrain_reference:
-    clades:
-      group_by: "Nextstrain_clade"
-      max_sequences: 300
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "division year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "division year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by division for North America and Oceania
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_division_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "division year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_1m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_2m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_6m:
-    # Early focal samples for Asia
-    asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-    # Recent focal samples for Asia
-    asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region=Asia'"
-
-  # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
-  # 4375 total
-  # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_all_time:
-    # Focal samples for Asia
-    asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 1m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_1m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 2m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_2m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country week"
-      max_sequences: 2560
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country week"
-      max_sequences: 640
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over 6m
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of recent to early
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_6m:
-    # Early focal samples for region
-    focal_early:
-      group_by: "country year month"
-      max_sequences: 640
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_early:
-      group_by: "country year month"
-      max_sequences: 160
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-    # Recent focal samples for region
-    focal_recent:
-      group_by: "country year month"
-      max_sequences: 2560
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!={region}'"
-    # Early contextual samples from the rest of the world
-    context_recent:
-      group_by: "country year month"
-      max_sequences: 640
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for regions over all-time
-  # Grouping by country for Africa, Asia, Europe and South America
-  # 4000 total
-  # 4:1 ratio of focal to context
-  nextstrain_region_grouped_by_country_all_time:
-    # Focal samples for region
-    focal:
-      group_by: "country year month"
-      max_sequences: 3200
-      exclude: "--exclude-where 'region!={region}'"
-    # Contextual samples from the rest of the world
-    context:
-      group_by: "country year month"
-      max_sequences: 800
-      exclude: "--exclude-where 'region={region}'"
-
-  # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_1m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_2m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country week"
-      max_sequences: 600
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country week"
-      max_sequences: 800
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country week"
-      max_sequences: 500
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division week"
-      max_sequences: 700
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division week"
-      max_sequences: 400
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country week"
-      max_sequences: 360
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division week"
-      max_sequences: 60
-      min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_6m:
-    africa_early:
-      group_by: "country year month"
-      max_sequences: 150
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_early:
-      group_by: "country year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_early:
-      group_by: "country year month"
-      max_sequences: 125
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_early:
-      group_by: "division year month"
-      max_sequences: 175
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_early:
-      group_by: "division year month"
-      max_sequences: 100
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_early:
-      group_by: "country year month"
-      max_sequences: 90
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_early:
-      group_by: "division year month"
-      max_sequences: 15
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-    africa_recent:
-      group_by: "country year month"
-      max_sequences: 600
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Africa'"
-    asia_recent:
-      group_by: "country year month"
-      max_sequences: 800
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    europe_recent:
-      group_by: "country year month"
-      max_sequences: 500
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Europe'"
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 700
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
-    north_america_recent:
-      group_by: "division year month"
-      max_sequences: 400
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=North America'"
-    south_america_recent:
-      group_by: "country year month"
-      max_sequences: 360
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=South America'"
-    oceania_recent:
-      group_by: "division year month"
-      max_sequences: 60
-      min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Oceania'"
-
-  # Custom subsampling logic for global region over all-time
-  # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
-  nextstrain_global_all_time:
-    africa:
-      group_by: "country year month"
-      max_sequences: 750
-      exclude: "--exclude-where 'region!=Africa'"
-    asia:
-      group_by: "country year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    china:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=China'"
-    europe:
-      group_by: "country year month"
-      max_sequences: 625
-      exclude: "--exclude-where 'region!=Europe'"
-    india:
-      group_by: "division year month"
-      max_sequences: 875
-      exclude: "--exclude-where 'country!=India'"
-    north_america:
-      group_by: "division year month"
-      max_sequences: 500
-      exclude: "--exclude-where 'region!=North America'"
-    south_america:
-      group_by: "country year month"
-      max_sequences: 450
-      exclude: "--exclude-where 'region!=South America'"
-    oceania:
-      group_by: "division year month"
-      max_sequences: 75
-      exclude: "--exclude-where 'region!=Oceania'"
-
 # GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build
 # as Wuhan/Hu-1/2019 is not in the data.
 refine:
diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk
index 0c2713950..c567dcc10 100644
--- a/workflow/snakemake_rules/common.smk
+++ b/workflow/snakemake_rules/common.smk
@@ -42,7 +42,7 @@ def numeric_date(dt=None):
 
     return res
 
-def _get_subsampling_scheme_by_build_name(build_name):
+def _get_subsampling_config_by_build_name(build_name):
     return config["builds"].get(build_name, {}).get("subsampling_scheme", build_name)
 
 def _get_skipped_inputs_for_diagnostic(wildcards):
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 82922157e..e12dd77bd 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -103,23 +103,31 @@ rule align:
         xz -2 -T {threads} {params.output_translations_toxz}
         """
 
+
+import ruamel.yaml
+
+yaml = ruamel.yaml.YAML()  # parser used by _get_subsampling_settings to load subsampling config files
+
 def _get_subsampling_settings(wildcards):
     # Allow users to override default subsampling with their own settings keyed
     # by location type and name. For example, "region_europe" or
     # "country_iceland". Otherwise, default to settings for the location type.
-    subsampling_scheme = _get_subsampling_scheme_by_build_name(wildcards.build_name)
+
+    subsampling_config_file = _get_subsampling_config_by_build_name(wildcards.build_name)
+    with open(subsampling_config_file) as f:
+        subsampling_config = yaml.load(f)
 
     # When there is no well-defined subsampling scheme, default to using all
     # available samples.
-    if subsampling_scheme not in config["subsampling"]:
-        print(
-            f"WARNING: No valid subsampling scheme is defined for build '{wildcards.build_name}'.",
-            "Skipping subsampling and using all available samples.",
-            file=sys.stderr
-        )
-        subsampling_scheme = "all"
+    # FIXME: handle a missing subsampling YAML (open() above currently raises); restore the fallback below.
+        # print(
+        #     f"WARNING: No valid subsampling scheme is defined for build '{wildcards.build_name}'.",
+        #     "Skipping subsampling and using all available samples.",
+        #     file=sys.stderr
+        # )
+        # subsampling_scheme = "all"
 
-    subsampling_settings = config["subsampling"][subsampling_scheme]
+    subsampling_settings = subsampling_config["samples"]
 
     if hasattr(wildcards, "subsample"):
         subsampling_settings = subsampling_settings[wildcards.subsample]
@@ -418,8 +426,8 @@ def _get_subsampled_files(wildcards):
     subsampling_settings = _get_subsampling_settings(wildcards)
 
     return [
-        f"results/{wildcards.build_name}/sample-{subsample}.txt"
-        for subsample in subsampling_settings
+        f"results/{wildcards.build_name}/sample-{sample}.txt"
+        for sample in subsampling_settings
     ]
 
 rule combine_samples: