Merge pull request #1150 from nextstrain/victorlin/fix-asia-weighted-sampling

Fix Asia weighted sampling
nextstrain · Sep 26, 2024 · 85e0310 · 85e0310
2 parents 41e2e8f + 8a002b4
commit 85e0310
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 149 deletions.
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -77,19 +77,19 @@ builds:
     region: Africa
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start
   asia_1m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
+    subsampling_scheme: nextstrain_region_asia_1m
     region: Asia
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month
   asia_2m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
+    subsampling_scheme: nextstrain_region_asia_2m
     region: Asia
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months
   asia_6m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
+    subsampling_scheme: nextstrain_region_asia_6m
     region: Asia
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months
   asia_all-time:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
+    subsampling_scheme: nextstrain_region_asia_all_time
     region: Asia
     title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start
   europe_1m:
@@ -280,31 +280,18 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_1m:
+  nextstrain_region_asia_1m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -313,22 +300,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country week"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 1M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 1M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Recent contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -337,31 +313,18 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_2m:
+  nextstrain_region_asia_2m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -370,22 +333,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division week"
-      max_sequences: 1200
+      group_by: "country week"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 2M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division week"
-      max_sequences: 800
-      max_date: "--min-date 2M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Recent contextual samples from the rest of the world
     context_recent:
       group_by: "country week"
@@ -394,31 +346,18 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_6m:
+  nextstrain_region_asia_6m:
     # Early focal samples for Asia
     asia_early:
-      group_by: "division year month"
-      max_sequences: 300
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Early focal samples for China
-    china_early:
-      group_by: "division year month"
-      max_sequences: 200
-      max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Early focal samples for India
-    india_early:
-      group_by: "division year month"
-      max_sequences: 200
+      group_by: "country year month"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 700
       max_date: "--max-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
@@ -427,22 +366,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "division year month"
-      max_sequences: 1200
+      group_by: "country year month"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 2800
       min_date: "--min-date 6M"
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Recent focal samples for China
-    china_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=China'"
-    # Recent focal samples for India
-    india_recent:
-      group_by: "division year month"
-      max_sequences: 800
-      max_date: "--min-date 6M"
-      exclude: "--exclude-where 'country!=India'"
+      exclude: "--exclude-where 'region!=Asia'"
     # Recent contextual samples from the rest of the world
     context_recent:
       group_by: "country year month"
@@ -451,27 +379,16 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over all-time
-  # Grouping by division
-  # Separating three buckets for China, India and elsewhere
+  # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of focal to context
-  # 3:2:2 proportions of Asia, China, India
-  nextstrain_region_asia_grouped_by_division_all_time:
+  nextstrain_region_asia_all_time:
     # Focal samples for Asia
     asia:
-      group_by: "division year month"
-      max_sequences: 1500
-      exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
-    # Focal samples for China
-    china:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=China'"
-    # Focal samples for India
-    india:
-      group_by: "division year month"
-      max_sequences: 1000
-      exclude: "--exclude-where 'country!=India'"
+      group_by: "country year month"
+      group_by_weights: "defaults/population_weights.tsv"
+      max_sequences: 3500
+      exclude: "--exclude-where 'region!=Asia'"
     # Contextual samples from the rest of the world
     context:
       group_by: "country year month"

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -70,19 +70,19 @@ builds:
     region: Africa
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start
   asia_1m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
+    subsampling_scheme: nextstrain_region_asia_1m
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month
   asia_2m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
+    subsampling_scheme: nextstrain_region_asia_2m
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months
   asia_6m:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
+    subsampling_scheme: nextstrain_region_asia_6m
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months
   asia_all-time:
-    subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
+    subsampling_scheme: nextstrain_region_asia_all_time
     region: Asia
     title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start
   europe_1m:
@@ -272,12 +272,11 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for region Asia over 1m
-  # Grouping by division
   # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  nextstrain_region_asia_grouped_by_division_1m:
+  nextstrain_region_asia_1m:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
@@ -293,7 +292,7 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "country year month"
+      group_by: "country week"
       group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 1M"
@@ -306,12 +305,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over 2m
-  # Grouping by division
   # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  nextstrain_region_asia_grouped_by_division_2m:
+  nextstrain_region_asia_2m:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
@@ -327,7 +325,7 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
     asia_recent:
-      group_by: "country year month"
+      group_by: "country week"
       group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 2800
       min_date: "--min-date 2M"
@@ -340,12 +338,11 @@ subsampling:
       exclude: "--exclude-where 'region=Asia'"
 
   # Custom subsampling logic for region Asia over 6m
-  # Grouping by division
   # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of recent to early
   # 4:1 ratio of focal to context
-  nextstrain_region_asia_grouped_by_division_6m:
+  nextstrain_region_asia_6m:
     # Early focal samples for Asia
     asia_early:
       group_by: "country year month"
@@ -377,7 +374,7 @@ subsampling:
   # Grouping by country weighted by population size
   # 4375 total
   # 4:1 ratio of focal to context
-  nextstrain_region_asia_grouped_by_division_all_time:
+  nextstrain_region_asia_all_time:
     # Focal samples for Asia
     asia:
       group_by: "country year month"