Skip to content

Commit

Permalink
🚧 Read from subsampling config files
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Mar 28, 2024
1 parent 0fd6861 commit 37c68b8
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 2,459 deletions.
3 changes: 2 additions & 1 deletion nextstrain_profiles/100k/config-gisaid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ inputs:

builds:
100k:
subsampling_scheme: 100k_scheme
subsampling_scheme: subsampling/100k.yaml

# mapping of remote: local files to be uploaded under S3_DST_BUCKET
upload:
Expand All @@ -26,6 +26,7 @@ upload:
filter:
exclude_where: "division='USA'"

# FIXME: move this comment somewhere else, then delete the entire entry.
# We wish to subsample 50k in the previous 12 months and 50k prior to that.
# Note 1: both --max-date and --min-date are inclusive of the boundary date,
# so sequences from that date will be available to both sub-samples
Expand Down
12 changes: 1 addition & 11 deletions nextstrain_profiles/100k/config-open.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,9 @@ inputs:
skip_sanitize_metadata: true
builds:
100k:
subsampling_scheme: 100k_scheme
subsampling_scheme: subsampling/100k.yaml
upload:
metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz
sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz
filter:
exclude_where: "division='USA'"
subsampling:
100k_scheme:
50k_early:
group_by: "year month country"
max_sequences: 50000
max_date: "--max-date 1Y"
50k_late:
group_by: "year month country"
max_sequences: 50000
min_date: "--min-date 1Y"
22 changes: 1 addition & 21 deletions nextstrain_profiles/nextstrain-ci/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,7 @@ builds:
# Override the default Nextstrain European build's subsampling scheme for more
# stable subsampling of a fixed dataset in continuous integration tests.
europe:
subsampling_scheme: nextstrain_ci_sampling
region: Europe

subsampling:
# Custom subsampling logic for CI tests.
nextstrain_ci_sampling:
# Focal samples for region
region:
group_by: "division year month"
max_sequences: 20
sampling_scheme: "--no-probabilistic-sampling"
exclude: "--exclude-where 'region!={region}'"
# Contextual samples for region from the rest of the world
global:
group_by: "year month"
max_sequences: 10
sampling_scheme: "--no-probabilistic-sampling"
exclude: "--exclude-where 'region={region}'"
priorities:
type: "proximity"
focus: "region"
subsampling_scheme: subsampling/nextstrain_ci_sampling.yaml

# Override default frequency settings, so we can estimate frequencies from older
# data with a fixed time range.
Expand Down
123 changes: 4 additions & 119 deletions nextstrain_profiles/nextstrain-country/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,137 +39,22 @@ inputs:
# Auspice config is specified in rule auspice_config in export_for_nextstrain.smk
builds:
nextstrain_country_1m:
subsampling_scheme: nextstrain_country_1m
subsampling_scheme: subsampling/nextstrain_country_1m.yaml
title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past month
country: India
nextstrain_country_2m:
subsampling_scheme: nextstrain_country_2m
subsampling_scheme: subsampling/nextstrain_country_2m.yaml
title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 2 months
country: India
nextstrain_country_6m:
subsampling_scheme: nextstrain_country_6m
subsampling_scheme: subsampling/nextstrain_country_6m.yaml
title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling over the past 6 months
country: India
nextstrain_country_all-time:
subsampling_scheme: nextstrain_country_all-time
subsampling_scheme: subsampling/nextstrain_country_all-time.yaml
title: Genomic epidemiology of SARS-CoV-2 with country-focused subsampling since pandemic start
country: India

# remove sequences without division label in US
filter:
exclude_where: "division='USA'"

subsampling:

# Custom subsampling logic for regions over 1m
# Grouping by division for North America and Oceania
# 4000 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_country_1m:
# Early focal samples for region
focal_early:
group_by: "division year month"
max_sequences: 640
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!={country}'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
max_sequences: 160
max_date: "--max-date 1M"
exclude: "--exclude-where 'country={country}'"
# Recent focal samples for region
focal_recent:
group_by: "division week"
max_sequences: 2560
min_date: "--min-date 1M"
exclude: "--exclude-where 'country!={country}'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
max_sequences: 640
min_date: "--min-date 1M"
exclude: "--exclude-where 'country={country}'"

# Custom subsampling logic for regions over 2m
# Grouping by division for North America and Oceania
# 4000 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_country_2m:
# Early focal samples for region
focal_early:
group_by: "division year month"
max_sequences: 640
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!={country}'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
max_sequences: 160
max_date: "--max-date 2M"
exclude: "--exclude-where 'country={country}'"
# Recent focal samples for region
focal_recent:
group_by: "division week"
max_sequences: 2560
min_date: "--min-date 2M"
exclude: "--exclude-where 'country!={country}'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
max_sequences: 640
min_date: "--min-date 2M"
exclude: "--exclude-where 'country={country}'"

# Custom subsampling logic for regions over 6m
# Grouping by division for North America and Oceania
# 4000 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_country_6m:
# Early focal samples for region
focal_early:
group_by: "division year month"
max_sequences: 640
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!={country}'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
max_sequences: 160
max_date: "--max-date 6M"
exclude: "--exclude-where 'country={country}'"
# Recent focal samples for region
focal_recent:
group_by: "division year month"
max_sequences: 2560
min_date: "--min-date 6M"
exclude: "--exclude-where 'country!={country}'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country year month"
max_sequences: 640
min_date: "--min-date 6M"
exclude: "--exclude-where 'country={country}'"

# Custom subsampling logic for regions over all-time
# Grouping by division for North America and Oceania
# 4000 total
# 4:1 ratio of focal to context
nextstrain_country_all-time:
# Focal samples for country
focal:
group_by: "division year month"
max_sequences: 640
exclude: "--exclude-where 'country!={country}'"
# Contextual samples from the rest of the world
context:
group_by: "country year month"
max_sequences: 160
exclude: "--exclude-where 'country={country}'"

# if different traits should be reconstructed for some builds, specify here
# otherwise the default trait config in defaults/parameters.yaml will be used
traits:
Expand Down
Loading

0 comments on commit 37c68b8

Please sign in to comment.