From cca31b406cc940d64cc468b416434fe76c789268 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:32:38 -0700 Subject: [PATCH 1/4] Fix Asia 1m/2m temporal grouping The temporal grouping attribute was changed from 'week' to 'year month' inadvertently in "Use population-based weighted sampling for Asia builds" (bc3f69ee). Revert it back to 'week'. --- nextstrain_profiles/nextstrain-gisaid/builds.yaml | 4 ++-- nextstrain_profiles/nextstrain-open/builds.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index f13692a33..d7e160758 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -293,7 +293,7 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "country year month" + group_by: "country week" group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 1M" @@ -327,7 +327,7 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "country year month" + group_by: "country week" group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 2M" diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index d7709518b..a4bc237df 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -293,7 +293,7 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "country year month" + group_by: "country week" group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 1M" @@ -327,7 +327,7 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "country year month" + group_by: "country week" group_by_weights: "defaults/population_weights.tsv" max_sequences: 2800 min_date: "--min-date 2M" From ad963cf6e4c8bac9422fbeb71e5253b25d6d89b6 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:39:49 -0700 Subject: [PATCH 2/4] Remove outdated comment on division grouping This has been superseded by the following comment on country grouping with population size weights. --- nextstrain_profiles/nextstrain-gisaid/builds.yaml | 3 --- nextstrain_profiles/nextstrain-open/builds.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index d7e160758..0b698f569 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -272,7 +272,6 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for region Asia over 1m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early @@ -306,7 +305,6 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 2m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early @@ -340,7 +338,6 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 6m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index a4bc237df..b428464b7 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -272,7 +272,6 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for region Asia over 1m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early @@ -306,7 +305,6 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 2m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early @@ -340,7 +338,6 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 6m - # Grouping by division # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early From 0599b189d70df6728c9cd455d09bc29a17a917ba Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:46:42 -0700 Subject: [PATCH 3/4] Simplify Asia subsampling scheme names The grouping information is not necessary here and became inaccurate with the switch to country-based weighted sampling. It is only necessary for other regions to distinguish `nextstrain_region_grouped_by_country_*` from `nextstrain_region_grouped_by_division_*`. --- .../nextstrain-gisaid-21L/builds.yaml | 16 ++++++++-------- .../nextstrain-gisaid/builds.yaml | 16 ++++++++-------- nextstrain_profiles/nextstrain-open/builds.yaml | 16 ++++++++-------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml index bc7e8b5be..d5f0015f0 100644 --- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml @@ -77,19 +77,19 @@ builds: region: Africa title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m + subsampling_scheme: nextstrain_region_asia_1m region: Asia title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m + subsampling_scheme: nextstrain_region_asia_2m region: Asia title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m + subsampling_scheme: nextstrain_region_asia_6m region: Asia title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time + subsampling_scheme: nextstrain_region_asia_all_time region: Asia title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start europe_1m: @@ -286,7 +286,7 @@ subsampling: # 4:1 ratio of recent to early # 4:1 ratio of focal to context # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_1m: + nextstrain_region_asia_1m: # Early focal samples for Asia asia_early: group_by: "division year month" @@ -343,7 +343,7 @@ subsampling: # 4:1 ratio of recent to early # 4:1 ratio of focal to context # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_2m: + nextstrain_region_asia_2m: # Early focal samples for Asia asia_early: group_by: "division year month" @@ -400,7 +400,7 @@ subsampling: # 4:1 ratio of recent to early # 4:1 ratio of focal to context # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_6m: + nextstrain_region_asia_6m: # Early focal samples for Asia asia_early: group_by: "division year month" @@ -456,7 +456,7 @@ subsampling: # 4375 total # 4:1 ratio of focal to context # 3:2:2 proportions of Asia, China, India - nextstrain_region_asia_grouped_by_division_all_time: + nextstrain_region_asia_all_time: # Focal samples for Asia asia: group_by: "division year month" diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index 0b698f569..ab07dcc63 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -70,19 +70,19 @@ builds: region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m + subsampling_scheme: nextstrain_region_asia_1m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m + subsampling_scheme: nextstrain_region_asia_2m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m + subsampling_scheme: nextstrain_region_asia_6m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time + subsampling_scheme: nextstrain_region_asia_all_time region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start europe_1m: @@ -276,7 +276,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_1m: + nextstrain_region_asia_1m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -309,7 +309,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_2m: + nextstrain_region_asia_2m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -342,7 +342,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_6m: + nextstrain_region_asia_6m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -374,7 +374,7 @@ subsampling: # Grouping by country weighted by population size # 4375 total # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_all_time: + nextstrain_region_asia_all_time: # Focal samples for Asia asia: group_by: "country year month" diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index b428464b7..e39f59da7 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -70,19 +70,19 @@ builds: region: Africa title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start asia_1m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m + subsampling_scheme: nextstrain_region_asia_1m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month asia_2m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m + subsampling_scheme: nextstrain_region_asia_2m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months asia_6m: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m + subsampling_scheme: nextstrain_region_asia_6m region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months asia_all-time: - subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time + subsampling_scheme: nextstrain_region_asia_all_time region: Asia title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start europe_1m: @@ -276,7 +276,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_1m: + nextstrain_region_asia_1m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -309,7 +309,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_2m: + nextstrain_region_asia_2m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -342,7 +342,7 @@ subsampling: # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_6m: + nextstrain_region_asia_6m: # Early focal samples for Asia asia_early: group_by: "country year month" @@ -374,7 +374,7 @@ subsampling: # Grouping by country weighted by population size # 4375 total # 4:1 ratio of focal to context - nextstrain_region_asia_grouped_by_division_all_time: + nextstrain_region_asia_all_time: # Focal samples for Asia asia: group_by: "country year month" From 8a002b4db7e3c835a2596065a4fccb6e0b72a453 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:58:11 -0700 Subject: [PATCH 4/4] Use population-based weighted sampling for 21L Asia builds Follow-up to "Use population-based weighted sampling for Asia builds" (bc3f69ee) and subsequent fixes. Those were only applied to the open and gisaid profiles, inadvertently skipping the gisaid-21L profile. --- .../nextstrain-gisaid-21L/builds.yaml | 147 ++++-------------- 1 file changed, 32 insertions(+), 115 deletions(-) diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml index d5f0015f0..41363741a 100644 --- a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml @@ -280,31 +280,18 @@ subsampling: exclude: "--exclude-where 'region={region}'" # Custom subsampling logic for region Asia over 1m - # Grouping by division - # Separating three buckets for China, India and elsewhere + # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India nextstrain_region_asia_1m: # Early focal samples for Asia asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 + group_by: "country year month" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 700 max_date: "--max-date 1M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_early: group_by: "country year month" @@ -313,22 +300,11 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "division week" - max_sequences: 1200 + group_by: "country week" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 2800 min_date: "--min-date 1M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division week" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division week" - max_sequences: 800 - max_date: "--min-date 1M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_recent: group_by: "country week" @@ -337,31 +313,18 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 2m - # Grouping by division - # Separating three buckets for China, India and elsewhere + # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India nextstrain_region_asia_2m: # Early focal samples for Asia asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 + group_by: "country year month" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 700 max_date: "--max-date 2M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_early: group_by: "country year month" @@ -370,22 +333,11 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "division week" - max_sequences: 1200 + group_by: "country week" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 2800 min_date: "--min-date 2M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division week" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division week" - max_sequences: 800 - max_date: "--min-date 2M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_recent: group_by: "country week" @@ -394,31 +346,18 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over 6m - # Grouping by division - # Separating three buckets for China, India and elsewhere + # Grouping by country weighted by population size # 4375 total # 4:1 ratio of recent to early # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India nextstrain_region_asia_6m: # Early focal samples for Asia asia_early: - group_by: "division year month" - max_sequences: 300 - max_date: "--max-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Early focal samples for China - china_early: - group_by: "division year month" - max_sequences: 200 - max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=China'" - # Early focal samples for India - india_early: - group_by: "division year month" - max_sequences: 200 + group_by: "country year month" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 700 max_date: "--max-date 6M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_early: group_by: "country year month" @@ -427,22 +366,11 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Recent focal samples for Asia asia_recent: - group_by: "division year month" - max_sequences: 1200 + group_by: "country year month" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 2800 min_date: "--min-date 6M" - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Recent focal samples for China - china_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=China'" - # Recent focal samples for India - india_recent: - group_by: "division year month" - max_sequences: 800 - max_date: "--min-date 6M" - exclude: "--exclude-where 'country!=India'" + exclude: "--exclude-where 'region!=Asia'" # Early contextual samples from the rest of the world context_recent: group_by: "country year month" @@ -451,27 +379,16 @@ subsampling: exclude: "--exclude-where 'region=Asia'" # Custom subsampling logic for region Asia over all-time - # Grouping by division - # Separating three buckets for China, India and elsewhere + # Grouping by country weighted by population size # 4375 total # 4:1 ratio of focal to context - # 3:2:2 proportions of Asia, China, India nextstrain_region_asia_all_time: # Focal samples for Asia asia: - group_by: "division year month" - max_sequences: 1500 - exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'" - # Focal samples for China - china: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=China'" - # Focal samples for India - india: - group_by: "division year month" - max_sequences: 1000 - exclude: "--exclude-where 'country!=India'" + group_by: "country year month" + group_by_weights: "defaults/population_weights.tsv" + max_sequences: 3500 + exclude: "--exclude-where 'region!=Asia'" # Contextual samples from the rest of the world context: group_by: "country year month"