Skip to content

Commit

Permalink
Merge pull request #1150 from nextstrain/victorlin/fix-asia-weighted-…
Browse files Browse the repository at this point in the history
…sampling

Fix Asia weighted sampling
  • Loading branch information
trvrb authored Sep 26, 2024
2 parents 41e2e8f + 8a002b4 commit 85e0310
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 149 deletions.
163 changes: 40 additions & 123 deletions nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,19 @@ builds:
region: Africa
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start
asia_1m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
subsampling_scheme: nextstrain_region_asia_1m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month
asia_2m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
subsampling_scheme: nextstrain_region_asia_2m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months
asia_6m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
subsampling_scheme: nextstrain_region_asia_6m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months
asia_all-time:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
subsampling_scheme: nextstrain_region_asia_all_time
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start
europe_1m:
Expand Down Expand Up @@ -280,31 +280,18 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_1m:
nextstrain_region_asia_1m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -313,22 +300,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -337,31 +313,18 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_2m:
nextstrain_region_asia_2m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -370,22 +333,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -394,31 +346,18 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_6m:
nextstrain_region_asia_6m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -427,22 +366,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division year month"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country year month"
Expand All @@ -451,27 +379,16 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over all-time
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_all_time:
nextstrain_region_asia_all_time:
# Focal samples for Asia
asia:
group_by: "division year month"
max_sequences: 1500
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Focal samples for China
china:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=China'"
# Focal samples for India
india:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=India'"
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 3500
exclude: "--exclude-where 'region!=Asia'"
# Contextual samples from the rest of the world
context:
group_by: "country year month"
Expand Down
23 changes: 10 additions & 13 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,19 @@ builds:
region: Africa
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start
asia_1m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
subsampling_scheme: nextstrain_region_asia_1m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month
asia_2m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
subsampling_scheme: nextstrain_region_asia_2m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months
asia_6m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
subsampling_scheme: nextstrain_region_asia_6m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months
asia_all-time:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
subsampling_scheme: nextstrain_region_asia_all_time
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start
europe_1m:
Expand Down Expand Up @@ -272,12 +272,11 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_1m:
nextstrain_region_asia_1m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand All @@ -293,7 +292,7 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "country year month"
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
Expand All @@ -306,12 +305,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_2m:
nextstrain_region_asia_2m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand All @@ -327,7 +325,7 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "country year month"
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
Expand All @@ -340,12 +338,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_6m:
nextstrain_region_asia_6m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand Down Expand Up @@ -377,7 +374,7 @@ subsampling:
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_all_time:
nextstrain_region_asia_all_time:
# Focal samples for Asia
asia:
group_by: "country year month"
Expand Down
Loading

0 comments on commit 85e0310

Please sign in to comment.