Skip to content

Commit

Permalink
Only include more recent context sequences
Browse files Browse the repository at this point in the history
For "global" subsampling, stop treating the early contextual samples as spanning from the origin of the pandemic to the beginning of the focal window (e.g., for the 6m analysis, from 2020 to 6 months ago). Instead, use a consistent 1 year of additional context: for 6m, the context runs from 18 months ago to 6 months ago, and the focal window runs from 6 months ago to the present. Additionally, reduce the number of contextual sequences included, from a 4:1 ratio of focal to context to a 10:1 ratio of focal to context.
  • Loading branch information
trvrb committed Jul 24, 2024
1 parent b6efca7 commit 125686d
Showing 1 changed file with 64 additions and 34 deletions.
98 changes: 64 additions & 34 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -580,48 +580,58 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for global region over 1m
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
# ~4500 total (expect ~3400)
# 10:1 ratio of recent to early
# recent is from 1m ago to present, n = 4120
# early is from 13m ago to 1m ago, n = 412
# regions are proportional to population size
nextstrain_global_1m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 60
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_sequences: 80
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 125
max_sequences: 50
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_sequences: 40
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 90
max_sequences: 36
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
max_sequences: 6
min_date: "--min-date 13M"
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
Expand Down Expand Up @@ -666,48 +676,58 @@ subsampling:
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for global region over 2m
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
# ~4500 total (expect ~3400)
# 10:1 ratio of recent to early
# recent is from 2m ago to present, n = 4120
# early is from 14m ago to 2m ago, n = 412
# regions are proportional to population size
nextstrain_global_2m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 60
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_sequences: 80
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 125
max_sequences: 50
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_sequences: 40
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 90
max_sequences: 36
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
max_sequences: 6
min_date: "--min-date 14M"
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
Expand Down Expand Up @@ -752,48 +772,58 @@ subsampling:
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for global region over 6m
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
# ~4500 total (expect ~3400)
# 10:1 ratio of recent to early
# recent is from 6m ago to present, n = 4120
# early is from 18m ago to 6m ago, n = 412
# regions are proportional to population size
nextstrain_global_6m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_sequences: 60
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_sequences: 80
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 125
max_sequences: 50
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_sequences: 70
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_sequences: 40
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 90
max_sequences: 36
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
max_sequences: 6
min_date: "--min-date 18M"
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
Expand Down Expand Up @@ -839,7 +869,7 @@ subsampling:

# Custom subsampling logic for global region over all-time
# 4320 total (expect ~3200)
# all eight regions equal except Oceania at 20%
# regions are proportional to population size
nextstrain_global_all_time:
africa:
group_by: "country year month"
Expand Down

0 comments on commit 125686d

Please sign in to comment.