From 37c68b8729e291e821e5fc1eeebd3767bc408215 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 20 Mar 2024 17:03:44 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20Read=20from=20subsampling=20conf?= =?UTF-8?q?ig=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nextstrain_profiles/100k/config-gisaid.yaml | 3 +- nextstrain_profiles/100k/config-open.yaml | 12 +- nextstrain_profiles/nextstrain-ci/builds.yaml | 22 +- .../nextstrain-country/builds.yaml | 123 +-- .../nextstrain-gisaid-21L/builds.yaml | 802 +----------------- .../nextstrain-gisaid/builds.yaml | 802 +----------------- .../nextstrain-open/builds.yaml | 778 +---------------- workflow/snakemake_rules/common.smk | 2 +- workflow/snakemake_rules/main_workflow.smk | 30 +- 9 files changed, 115 insertions(+), 2459 deletions(-) diff --git a/nextstrain_profiles/100k/config-gisaid.yaml b/nextstrain_profiles/100k/config-gisaid.yaml index 27a84c882..ca091f23a 100644 --- a/nextstrain_profiles/100k/config-gisaid.yaml +++ b/nextstrain_profiles/100k/config-gisaid.yaml @@ -15,7 +15,7 @@ inputs: builds: 100k: - subsampling_scheme: 100k_scheme + subsampling_scheme: subsampling/100k.yaml # mapping of remote: local files to be uploaded under S3_DST_BUCKET upload: @@ -26,6 +26,7 @@ upload: filter: exclude_where: "division='USA'" +# FIXME: move this comment to somewhere else then delete the entire entry. # We wish to subsample 50k in the previous 12 months and 50k prior to that. # Note 1: both --max-date and --min-date are inclusive of the boundary date, # so sequences from that date will be available to both sub-samples diff --git a/nextstrain_profiles/100k/config-open.yaml b/nextstrain_profiles/100k/config-open.yaml index 0702c92ee..9f390e4dd 100644 --- a/nextstrain_profiles/100k/config-open.yaml +++ b/nextstrain_profiles/100k/config-open.yaml @@ -12,19 +12,9 @@ inputs: skip_sanitize_metadata: true builds: 100k: - subsampling_scheme: 100k_scheme + subsampling_scheme: subsampling/100k.yaml upload: metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz filter: exclude_where: "division='USA'" -subsampling: - 100k_scheme: - 50k_early: - group_by: "year month country" - max_sequences: 50000 - max_date: "--max-date 1Y" - 50k_late: - group_by: "year month country" - max_sequences: 50000 - min_date: "--min-date 1Y" diff --git a/nextstrain_profiles/nextstrain-ci/builds.yaml b/nextstrain_profiles/nextstrain-ci/builds.yaml index b5352214c..3ad13191e 100644 --- a/nextstrain_profiles/nextstrain-ci/builds.yaml +++ b/nextstrain_profiles/nextstrain-ci/builds.yaml @@ -11,27 +11,7 @@ builds: # Override the default Nextstrain European build's subsampling scheme for more # stable subsampling of a fixed dataset in continuous integration tests. europe: - subsampling_scheme: nextstrain_ci_sampling - region: Europe - -subsampling: - # Custom subsampling logic for CI tests. - nextstrain_ci_sampling: - # Focal samples for region - region: - group_by: "division year month" - max_sequences: 20 - sampling_scheme: "--no-probabilistic-sampling" - exclude: "--exclude-where 'region!={region}'" - # Contextual samples for region from the rest of the world - global: - group_by: "year month" - max_sequences: 10 - sampling_scheme: "--no-probabilistic-sampling" - exclude: "--exclude-where 'region={region}'" - priorities: - type: "proximity" - focus: "region" + subsampling_scheme: subsampling/nextstrain_ci_sampling.yaml # Override default frequency settings, so we can estimate frequencies from older # data with a fixed time range. diff --git a/nextstrain_profiles/nextstrain-country/builds.yaml b/nextstrain_profiles/nextstrain-country/builds.yaml index 529f201f3..6041b6794 100644 --- a/nextstrain_profiles/nextstrain-country/builds.yaml +++ b/nextstrain_profiles/nextstrain-country/builds.yaml @@ -39,137 +39,22 @@ inputs: # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk builds: nextstrain_country_1m: - subsampling_scheme: nextstrain_country_1m + subsampling_scheme: subsampling/nextstrain_country_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past month - country: India nextstrain_country_2m: - subsampling_scheme: nextstrain_country_2m + subsampling_scheme: subsampling/nextstrain_country_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 2 months - country: India nextstrain_country_6m: - subsampling_scheme: nextstrain_country_6m + subsampling_scheme: subsampling/nextstrain_country_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 6 months - country: India nextstrain_country_all-time: - subsampling_scheme: nextstrain_country_all-time + subsampling_scheme: subsampling/nextstrain_country_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling since pandemic start - country: India # remove sequences without division label in US filter: exclude_where: "division='USA'" -subsampling: - - # Custom subsampling logic for regions over 1m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_country_1m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country={country}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country={country}'" - - # Custom subsampling logic for regions over 2m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_country_2m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country={country}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country={country}'" - - # Custom subsampling logic for regions over 6m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_country_6m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country={country}'" - # Recent focal samples for region - focal_recent: - group_by: "division year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!={country}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country={country}'" - - # Custom subsampling logic for regions over all-time - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of focal to context - nextstrain_country_all-time: - # Focal samples for country - focal: - group_by: "division year month" - max_sequences: 640 - exclude: "--exclude-where 'country!={country}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 160 - exclude: "--exclude-where 'country={country}'" - # if different traits should be reconstructed for some builds, specify here # otherwise the default trait config in defaults/parameters.yaml will used traits: diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml index 24519a426..1a291333e 100644 --- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml @@ -46,115 +46,91 @@ inputs: # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk builds: reference: - subsampling_scheme: nextstrain_reference + subsampling_scheme: subsampling/reference.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with clade-focused subsampling global_1m: - subsampling_scheme: nextstrain_global_1m + subsampling_scheme: subsampling/global_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past month global_2m: - subsampling_scheme: nextstrain_global_2m + subsampling_scheme: subsampling/global_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 2 months global_6m: - subsampling_scheme: nextstrain_global_6m + subsampling_scheme: subsampling/global_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally over the past 6 months global_all-time: - subsampling_scheme: nextstrain_global_all_time + subsampling_scheme: subsampling/global_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused globally since pandemic start africa_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: Africa + subsampling_scheme: subsampling/africa_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past month africa_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: Africa + subsampling_scheme: subsampling/africa_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 2 months africa_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: Africa + subsampling_scheme: subsampling/africa_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa over the past 6 months africa_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: Africa + subsampling_scheme: subsampling/africa_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m - region: Asia + subsampling_scheme: subsampling/asia_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m - region: Asia + subsampling_scheme: subsampling/asia_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m - region: Asia + subsampling_scheme: subsampling/asia_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time - region: Asia + subsampling_scheme: subsampling/asia_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start europe_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: Europe + subsampling_scheme: subsampling/europe_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past month europe_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: Europe + subsampling_scheme: subsampling/europe_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 2 months europe_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: Europe + subsampling_scheme: subsampling/europe_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe over the past 6 months europe_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: Europe + subsampling_scheme: subsampling/europe_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Europe since pandemic start north-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m - region: North America + subsampling_scheme: subsampling/north-america_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past month north-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m - region: North America + subsampling_scheme: subsampling/north-america_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 2 months north-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m - region: North America + subsampling_scheme: subsampling/north-america_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America over the past 6 months north-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time - region: North America + subsampling_scheme: subsampling/north-america_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on North America since pandemic start oceania_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m - region: Oceania + subsampling_scheme: subsampling/oceania_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past month oceania_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m - region: Oceania + subsampling_scheme: subsampling/oceania_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 2 months oceania_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m - region: Oceania + subsampling_scheme: subsampling/oceania_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania over the past 6 months oceania_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time - region: Oceania + subsampling_scheme: subsampling/oceania_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Oceania since pandemic start south-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: South America + subsampling_scheme: subsampling/south-america_1m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past month south-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: South America + subsampling_scheme: subsampling/south-america_2m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 2 months south-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: South America + subsampling_scheme: subsampling/south-america_6m.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America over the past 6 months south-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: South America + subsampling_scheme: subsampling/south-america_all-time.yaml title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on South America since pandemic start # remove sequences without division label in US and sequences from prior to clade 21L @@ -162,726 +138,6 @@ filter: exclude_where: "division='USA'" min_date: "2022-01-01" -subsampling: - - # Custom subsampling logic for group by clade - nextstrain_reference: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 - - # Custom subsampling logic for regions over 1m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_1m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_2m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_6m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_all_time: - # Focal samples for region - focal: - group_by: "division year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for region Asia over 1m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_1m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 2m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_2m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 6m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_6m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over all-time - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_all_time: - # Focal samples for Asia - asia: - group_by: "division year month" - max_sequences: 1500 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Focal samples for China - china: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=China'" - # Focal samples for India - india: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=India'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 875 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 1m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_1m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_2m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_6m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_all_time: - # Focal samples for region - focal: - group_by: "country year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for global region over 1m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_1m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 2m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_2m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 6m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_6m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country year month" - max_sequences: 600 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country year month" - max_sequences: 800 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country year month" - max_sequences: 500 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division year month" - max_sequences: 400 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country year month" - max_sequences: 360 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division year month" - max_sequences: 60 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over all-time - # 4320 total (expect ~3200) - # all eight regions equal except Oceania at 20% - nextstrain_global_all_time: - africa: - group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Africa'" - asia: - group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=China'" - europe: - group_by: "country year month" - max_sequences: 625 - exclude: "--exclude-where 'region!=Europe'" - india: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=India'" - north_america: - group_by: "division year month" - max_sequences: 500 - exclude: "--exclude-where 'region!=North America'" - south_america: - group_by: "country year month" - max_sequences: 450 - exclude: "--exclude-where 'region!=South America'" - oceania: - group_by: "division year month" - max_sequences: 75 - exclude: "--exclude-where 'region!=Oceania'" - # Root to clade 21L refine: root: "21L" diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index dfd006a28..6397ced86 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -39,841 +39,97 @@ inputs: # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk builds: reference: - subsampling_scheme: nextstrain_reference + subsampling_scheme: subsampling/reference.yaml title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling global_1m: - subsampling_scheme: nextstrain_global_1m + subsampling_scheme: subsampling/global_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past month global_2m: - subsampling_scheme: nextstrain_global_2m + subsampling_scheme: subsampling/global_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 2 months global_6m: - subsampling_scheme: nextstrain_global_6m + subsampling_scheme: subsampling/global_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months global_all-time: - subsampling_scheme: nextstrain_global_all_time + subsampling_scheme: subsampling/global_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start africa_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: Africa + subsampling_scheme: subsampling/africa_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past month africa_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: Africa + subsampling_scheme: subsampling/africa_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 2 months africa_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: Africa + subsampling_scheme: subsampling/africa_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months africa_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: Africa + subsampling_scheme: subsampling/africa_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m - region: Asia + subsampling_scheme: subsampling/asia_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m - region: Asia + subsampling_scheme: subsampling/asia_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m - region: Asia + subsampling_scheme: subsampling/asia_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time - region: Asia + subsampling_scheme: subsampling/asia_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start europe_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: Europe + subsampling_scheme: subsampling/europe_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past month europe_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: Europe + subsampling_scheme: subsampling/europe_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 2 months europe_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: Europe + subsampling_scheme: subsampling/europe_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months europe_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: Europe + subsampling_scheme: subsampling/europe_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start north-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m - region: North America + subsampling_scheme: subsampling/north-america_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past month north-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m - region: North America + subsampling_scheme: subsampling/north-america_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 2 months north-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m - region: North America + subsampling_scheme: subsampling/north-america_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months north-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time - region: North America + subsampling_scheme: subsampling/north-america_all-t.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start oceania_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m - region: Oceania + subsampling_scheme: subsampling/oceania_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past month oceania_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m - region: Oceania + subsampling_scheme: subsampling/oceania_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 2 months oceania_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m - region: Oceania + subsampling_scheme: subsampling/oceania_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months oceania_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time - region: Oceania + subsampling_scheme: subsampling/oceania_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start south-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m - region: South America + subsampling_scheme: subsampling/south-america_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past month south-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m - region: South America + subsampling_scheme: subsampling/south-america_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 2 months south-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m - region: South America + subsampling_scheme: subsampling/south-america_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months south-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time - region: South America + subsampling_scheme: subsampling/south-america_all-t.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start # remove sequences without division label in US filter: exclude_where: "division='USA'" -subsampling: - - # Custom subsampling logic for group by clade - nextstrain_reference: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 - - # Custom subsampling logic for regions over 1m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_1m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_2m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_6m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_all_time: - # Focal samples for region - focal: - group_by: "division year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for region Asia over 1m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_1m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 2m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_2m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 6m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_6m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over all-time - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_all_time: - # Focal samples for Asia - asia: - group_by: "division year month" - max_sequences: 1500 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Focal samples for China - china: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=China'" - # Focal samples for India - india: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=India'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 875 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 1m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_1m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_2m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_6m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_all_time: - # Focal samples for region - focal: - group_by: "country year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for global region over 1m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_1m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 2m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_2m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 6m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_6m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country year month" - max_sequences: 600 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country year month" - max_sequences: 800 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country year month" - max_sequences: 500 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division year month" - max_sequences: 400 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country year month" - max_sequences: 360 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division year month" - max_sequences: 60 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over all-time - # 4320 total (expect ~3200) - # all eight regions equal except Oceania at 20% - nextstrain_global_all_time: - africa: - group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Africa'" - asia: - group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=China'" - europe: - group_by: "country year month" - max_sequences: 625 - exclude: "--exclude-where 'region!=Europe'" - india: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=India'" - north_america: - group_by: "division year month" - max_sequences: 500 - exclude: "--exclude-where 'region!=North America'" - south_america: - group_by: "country year month" - max_sequences: 450 - exclude: "--exclude-where 'region!=South America'" - oceania: - group_by: "division year month" - max_sequences: 75 - exclude: "--exclude-where 'region!=Oceania'" - # if different traits should be reconstructed for some builds, specify here # otherwise the default trait config in defaults/parameters.yaml will used traits: diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index aa26e0101..95999bf8a 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -39,114 +39,114 @@ inputs: # Auspice config is specified in rule auspice_config in export_for_nextstrain.smk builds: reference: - subsampling_scheme: nextstrain_reference + subsampling_scheme: subsampling/reference.yaml title: Genomic epidemiology of SARS-CoV-2 with clade-focused subsampling global_1m: - subsampling_scheme: nextstrain_global_1m + subsampling_scheme: subsampling/global_1m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past month global_2m: - subsampling_scheme: nextstrain_global_2m + subsampling_scheme: subsampling/global_2m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 2 months global_6m: - subsampling_scheme: nextstrain_global_6m + subsampling_scheme: subsampling/global_6m.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally over the past 6 months global_all-time: - subsampling_scheme: nextstrain_global_all_time + subsampling_scheme: subsampling/global_all-time.yaml title: Genomic epidemiology of SARS-CoV-2 with subsampling focused globally since pandemic start africa_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m + subsampling_scheme: subsampling/africa_1m.yaml region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past month africa_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m + subsampling_scheme: subsampling/africa_2m.yaml region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 2 months africa_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m + subsampling_scheme: subsampling/africa_6m.yaml region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa over the past 6 months africa_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time + subsampling_scheme: subsampling/africa_all-time.yaml region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m + subsampling_scheme: subsampling/asia_1m.yaml region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m + subsampling_scheme: subsampling/asia_2m.yaml region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m + subsampling_scheme: subsampling/asia_6m.yaml region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time + subsampling_scheme: subsampling/asia_all-time.yaml region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start europe_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m + subsampling_scheme: subsampling/europe_1m.yaml region: Europe title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past month europe_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m + subsampling_scheme: subsampling/europe_2m.yaml region: Europe title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 2 months europe_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m + subsampling_scheme: subsampling/europe_6m.yaml region: Europe title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe over the past 6 months europe_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time + subsampling_scheme: subsampling/europe_all-time.yaml region: Europe title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Europe since pandemic start north-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m + subsampling_scheme: subsampling/north-america_1m.yaml region: North America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past month north-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m + subsampling_scheme: subsampling/north-america_2m.yaml region: North America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 2 months north-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m + subsampling_scheme: subsampling/north-america_6m.yaml region: North America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America over the past 6 months north-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time + subsampling_scheme: subsampling/north-america_all-time.yaml region: North America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on North America since pandemic start oceania_1m: - subsampling_scheme: nextstrain_region_grouped_by_division_1m + subsampling_scheme: subsampling/oceania_1m.yaml region: Oceania title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past month oceania_2m: - subsampling_scheme: nextstrain_region_grouped_by_division_2m + subsampling_scheme: subsampling/oceania_2m.yaml region: Oceania title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 2 months oceania_6m: - subsampling_scheme: nextstrain_region_grouped_by_division_6m + subsampling_scheme: subsampling/oceania_6m.yaml region: Oceania title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania over the past 6 months oceania_all-time: - subsampling_scheme: nextstrain_region_grouped_by_division_all_time + subsampling_scheme: subsampling/oceania_all-time.yaml region: Oceania title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Oceania since pandemic start south-america_1m: - subsampling_scheme: nextstrain_region_grouped_by_country_1m + subsampling_scheme: subsampling/south-america_1m.yaml region: South America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past month south-america_2m: - subsampling_scheme: nextstrain_region_grouped_by_country_2m + subsampling_scheme: subsampling/south-america_2m.yaml region: South America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 2 months south-america_6m: - subsampling_scheme: nextstrain_region_grouped_by_country_6m + subsampling_scheme: subsampling/south-america_6m.yaml region: South America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America over the past 6 months south-america_all-time: - subsampling_scheme: nextstrain_region_grouped_by_country_all_time + subsampling_scheme: subsampling/south-america_all-time.yaml region: South America title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on South America since pandemic start @@ -154,726 +154,6 @@ builds: filter: exclude_where: "division='USA'" -subsampling: - - # Custom subsampling logic for group by clade - nextstrain_reference: - clades: - group_by: "Nextstrain_clade" - max_sequences: 300 - - # Custom subsampling logic for regions over 1m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_1m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_2m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_6m: - # Early focal samples for region - focal_early: - group_by: "division year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "division year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by division for North America and Oceania - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_division_all_time: - # Focal samples for region - focal: - group_by: "division year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for region Asia over 1m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_1m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 2m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_2m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over 6m - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_6m: - # Early focal samples for Asia - asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region=Asia'" - # Recent focal samples for Asia - asia_recent: - group_by: "division year month" - max_sequences: 1200 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region=Asia'" - - # Custom subsampling logic for region Asia over all-time - # Grouping by division - # Separating three buckets for China, India and elsewhere - # 4375 total - # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_all_time: - # Focal samples for Asia - asia: - group_by: "division year month" - max_sequences: 1500 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Focal samples for China - china: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=China'" - # Focal samples for India - india: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=India'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 875 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 1m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_1m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 2m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_2m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country week" - max_sequences: 2560 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country week" - max_sequences: 640 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over 6m - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of recent to early - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_6m: - # Early focal samples for region - focal_early: - group_by: "country year month" - max_sequences: 640 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_early: - group_by: "country year month" - max_sequences: 160 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region={region}'" - # Recent focal samples for region - focal_recent: - group_by: "country year month" - max_sequences: 2560 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!={region}'" - # Early contextual samples from the rest of the world - context_recent: - group_by: "country year month" - max_sequences: 640 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for regions over all-time - # Grouping by country for Africa, Asia, Europe and South America - # 4000 total - # 4:1 ratio of focal to context - nextstrain_region_grouped_by_country_all_time: - # Focal samples for region - focal: - group_by: "country year month" - max_sequences: 3200 - exclude: "--exclude-where 'region!={region}'" - # Contextual samples from the rest of the world - context: - group_by: "country year month" - max_sequences: 800 - exclude: "--exclude-where 'region={region}'" - - # Custom subsampling logic for global region over 1m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_1m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 2m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_2m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country week" - max_sequences: 600 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country week" - max_sequences: 800 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country week" - max_sequences: 500 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division week" - max_sequences: 700 - min_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division week" - max_sequences: 400 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country week" - max_sequences: 360 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division week" - max_sequences: 60 - min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over 6m - # 5125 total (expect ~3400) - # 4:1 ratio of recent to early - # all eight regions equal except Oceania at 20% - nextstrain_global_6m: - africa_early: - group_by: "country year month" - max_sequences: 150 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_early: - group_by: "country year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_early: - group_by: "country year month" - max_sequences: 125 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_early: - group_by: "division year month" - max_sequences: 175 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_early: - group_by: "division year month" - max_sequences: 100 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_early: - group_by: "country year month" - max_sequences: 90 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_early: - group_by: "division year month" - max_sequences: 15 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - africa_recent: - group_by: "country year month" - max_sequences: 600 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Africa'" - asia_recent: - group_by: "country year month" - max_sequences: 800 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - europe_recent: - group_by: "country year month" - max_sequences: 500 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Europe'" - india_recent: - group_by: "division year month" - max_sequences: 700 - min_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" - north_america_recent: - group_by: "division year month" - max_sequences: 400 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=North America'" - south_america_recent: - group_by: "country year month" - max_sequences: 360 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=South America'" - oceania_recent: - group_by: "division year month" - max_sequences: 60 - min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Oceania'" - - # Custom subsampling logic for global region over all-time - # 4320 total (expect ~3200) - # all eight regions equal except Oceania at 20% - nextstrain_global_all_time: - africa: - group_by: "country year month" - max_sequences: 750 - exclude: "--exclude-where 'region!=Africa'" - asia: - group_by: "country year month" - max_sequences: 1000 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - china: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=China'" - europe: - group_by: "country year month" - max_sequences: 625 - exclude: "--exclude-where 'region!=Europe'" - india: - group_by: "division year month" - max_sequences: 875 - exclude: "--exclude-where 'country!=India'" - north_america: - group_by: "division year month" - max_sequences: 500 - exclude: "--exclude-where 'region!=North America'" - south_america: - group_by: "country year month" - max_sequences: 450 - exclude: "--exclude-where 'region!=South America'" - oceania: - group_by: "division year month" - max_sequences: 75 - exclude: "--exclude-where 'region!=Oceania'" - # GenBank data includes "Wuhan-Hu-1/2019" which we use as the root for this build # as Wuhan/Hu-1/2019 is not in the data. refine: diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 0c2713950..c567dcc10 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -42,7 +42,7 @@ def numeric_date(dt=None): return res -def _get_subsampling_scheme_by_build_name(build_name): +def _get_subsampling_config_by_build_name(build_name): return config["builds"].get(build_name, {}).get("subsampling_scheme", build_name) def _get_skipped_inputs_for_diagnostic(wildcards): diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 82922157e..e12dd77bd 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -103,23 +103,31 @@ rule align: xz -2 -T {threads} {params.output_translations_toxz} """ + +import ruamel.yaml + +yaml=ruamel.yaml.YAML() + def _get_subsampling_settings(wildcards): # Allow users to override default subsampling with their own settings keyed # by location type and name. For example, "region_europe" or # "country_iceland". Otherwise, default to settings for the location type. - subsampling_scheme = _get_subsampling_scheme_by_build_name(wildcards.build_name) + + subsampling_config_file = _get_subsampling_config_by_build_name(wildcards.build_name) + with open(subsampling_config_file) as f: + subsampling_config = yaml.load(f) # When there is no well-defined subsampling scheme, default to using all # available samples. - if subsampling_scheme not in config["subsampling"]: - print( - f"WARNING: No valid subsampling scheme is defined for build '{wildcards.build_name}'.", - "Skipping subsampling and using all available samples.", - file=sys.stderr - ) - subsampling_scheme = "all" + # FIXME: handle case where no subsampling YAML is available + # print( + # f"WARNING: No valid subsampling scheme is defined for build '{wildcards.build_name}'.", + # "Skipping subsampling and using all available samples.", + # file=sys.stderr + # ) + # subsampling_scheme = "all" - subsampling_settings = config["subsampling"][subsampling_scheme] + subsampling_settings = subsampling_config["samples"] if hasattr(wildcards, "subsample"): subsampling_settings = subsampling_settings[wildcards.subsample] @@ -418,8 +426,8 @@ def _get_subsampled_files(wildcards): subsampling_settings = _get_subsampling_settings(wildcards) return [ - f"results/{wildcards.build_name}/sample-{subsample}.txt" - for subsample in subsampling_settings + f"results/{wildcards.build_name}/sample-{sample}.txt" + for sample in subsampling_settings ] rule combine_samples: