From db6684c8291274a3c1c0e48a05166f834b168df3 Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Tue, 23 Jul 2024 17:19:13 -0700
Subject: [PATCH 1/3] Only include more recent context sequences

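When subsampling in the Nextstrain GISAID profile, rather than drawing early contextual samples from the origin of the pandemic up to the beginning of the focal window (e.g. from 2020 to 6m ago for the 6m analysis), use a consistent 24m of additional context. So, for 6m, the context window is 30m ago to 6m ago and the focal window is 6m ago to present. Additionally, reduce the number of contextual sequences included from a 4:1 ratio of focal to context to a 10:1 ratio of focal to context; here "focal" and "context" mean the recent and early time windows, matching the "ratio of recent to early" comments updated in builds.yaml. Also store the `--min-date` filter of the China and India recent samples in the Asia builds under the `min_date` key instead of `max_date`.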
---
 .../nextstrain-gisaid/builds.yaml             | 200 +++++++++++-------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 4034e4730..d56530772 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -165,19 +165,21 @@ subsampling:
   # Custom subsampling logic for regions over 1m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_1m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -196,19 +198,21 @@ subsampling:
   # Custom subsampling logic for regions over 2m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_2m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -227,19 +231,21 @@ subsampling:
   # Custom subsampling logic for regions over 6m
   # Grouping by division for North America and Oceania
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_division_6m:
     # Early focal samples for region
     focal_early:
       group_by: "division year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -275,32 +281,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_1m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -313,13 +323,13 @@ subsampling:
     china_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 1M"
+      min_date: "--min-date 1M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 1M"
+      min_date: "--min-date 1M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -332,32 +342,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_2m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -370,13 +384,13 @@ subsampling:
     china_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 2M"
+      min_date: "--min-date 2M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division week"
       max_sequences: 800
-      max_date: "--min-date 2M"
+      min_date: "--min-date 2M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -389,32 +403,36 @@ subsampling:
   # Grouping by division
   # Separating three buckets for China, India and elsewhere
   # 4375 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   # 3:2:2 proportions of Asia, China, India
   nextstrain_region_asia_grouped_by_division_6m:
     # Early focal samples for Asia
     asia_early:
       group_by: "division year month"
-      max_sequences: 300
+      max_sequences: 120
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -427,13 +445,13 @@ subsampling:
     china_recent:
       group_by: "division year month"
       max_sequences: 800
-      max_date: "--min-date 6M"
+      min_date: "--min-date 6M"
       exclude: "--exclude-where 'country!=China'"
     # Recent focal samples for India
     india_recent:
       group_by: "division year month"
       max_sequences: 800
-      max_date: "--min-date 6M"
+      min_date: "--min-date 6M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_recent:
@@ -473,19 +491,21 @@ subsampling:
   # Custom subsampling logic for regions over 1m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_1m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -504,19 +524,21 @@ subsampling:
   # Custom subsampling logic for regions over 2m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_2m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -535,19 +557,21 @@ subsampling:
   # Custom subsampling logic for regions over 6m
   # Grouping by country for Africa, Asia, Europe and South America
   # 4000 total
-  # 4:1 ratio of recent to early
+  # 10:1 ratio of recent to early
   # 4:1 ratio of focal to context
   nextstrain_region_grouped_by_country_6m:
     # Early focal samples for region
     focal_early:
       group_by: "country year month"
-      max_sequences: 640
+      max_sequences: 256
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
-      max_sequences: 160
+      max_sequences: 64
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -580,48 +604,58 @@ subsampling:
       exclude: "--exclude-where 'region={region}'"
 
   # Custom subsampling logic for global region over 1m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 1m, n = 4120
+  # early is 1m to 25m, n = 412
+  # regions are proportional to population size
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 25M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -666,48 +700,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 2m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 2m, n = 4120
+  # early is 2m to 26m, n = 412
+  # regions are proportional to population size
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 26M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -752,48 +796,58 @@ subsampling:
       exclude: "--exclude-where 'region!=Oceania'"
 
   # Custom subsampling logic for global region over 6m
-  # 5125 total (expect ~3400)
-  # 4:1 ratio of recent to early
-  # all eight regions equal except Oceania at 20%
+  # ~4500 total (expect ~3400)
+  # 10:1 ratio of recent to early
+  # recent is present to 6m, n = 4120
+  # early is 6m to 30m, n = 412
+  # regions are proportional to population size
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
-      max_sequences: 150
+      max_sequences: 60
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
-      max_sequences: 200
+      max_sequences: 80
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
-      max_sequences: 125
+      max_sequences: 50
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
-      max_sequences: 175
+      max_sequences: 70
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
-      max_sequences: 100
+      max_sequences: 40
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
-      max_sequences: 90
+      max_sequences: 36
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
-      max_sequences: 15
+      max_sequences: 6
+      min_date: "--min-date 30M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -839,7 +893,7 @@ subsampling:
 
   # Custom subsampling logic for global region over all-time
   # 4320 total (expect ~3200)
-  # all eight regions equal except Oceania at 20%
+  # regions are proportional to population size
   nextstrain_global_all_time:
     africa:
       group_by: "country year month"

From 4dd3f2aaa5056af569925df200cd759a7ec225e6 Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Wed, 24 Jul 2024 10:59:43 -0700
Subject: [PATCH 2/3] Don't include Wuhan root in Nextstrain profile

Drop forced inclusion of Wuhan/1 root in the Nextstrain GISAID profile and swap rooting to use "best", ie temporally optimal rooting. This allows the root to be the common ancestor of the subsampled sequences. This makes it so that with the changes to time-based subsampling in the previous commit, the "6m" analysis includes samples from the previous 30m and the TMRCA is in ~2021.

This setup should be significantly more future-proof than continually creating new clade-specific (e.g. /21L/) roots as selective sweeps occur.
---
 nextstrain_profiles/nextstrain-gisaid/builds.yaml | 6 ++++++
 nextstrain_profiles/nextstrain-gisaid/include.txt | 0
 2 files changed, 6 insertions(+)
 create mode 100644 nextstrain_profiles/nextstrain-gisaid/include.txt

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index d56530772..892253853 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -20,7 +20,9 @@ genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF
 use_nextalign: true
 include_hcov19_prefix: True
 
+# use custom include file that doesn't specify Wuhan/1
 files:
+  include: "nextstrain_profiles/nextstrain-gisaid/include.txt"
   description: "nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md"
 
 inputs:
@@ -928,6 +930,10 @@ subsampling:
       max_sequences: 75
       exclude: "--exclude-where 'region!=Oceania'"
 
+# root via temporal fit rather than explicit outgroup
+refine:
+  root: "best"
+
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:
diff --git a/nextstrain_profiles/nextstrain-gisaid/include.txt b/nextstrain_profiles/nextstrain-gisaid/include.txt
new file mode 100644
index 000000000..e69de29bb

From 9e59c586a091562d384d1d08c8821fb3030585da Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Wed, 24 Jul 2024 18:06:59 -0700
Subject: [PATCH 3/3] Reduce from 2y context to 1y context

---
 .../nextstrain-gisaid/builds.yaml             | 102 +++++++++---------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 892253853..ea05798c2 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -174,14 +174,14 @@ subsampling:
     focal_early:
       group_by: "division year month"
       max_sequences: 256
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -207,14 +207,14 @@ subsampling:
     focal_early:
       group_by: "division year month"
       max_sequences: 256
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -240,14 +240,14 @@ subsampling:
     focal_early:
       group_by: "division year month"
       max_sequences: 256
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -291,28 +291,28 @@ subsampling:
     asia_early:
       group_by: "division year month"
       max_sequences: 120
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 70
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -352,28 +352,28 @@ subsampling:
     asia_early:
       group_by: "division year month"
       max_sequences: 120
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 70
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -413,28 +413,28 @@ subsampling:
     asia_early:
       group_by: "division year month"
       max_sequences: 120
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     # Early focal samples for China
     china_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     # Early focal samples for India
     india_early:
       group_by: "division year month"
       max_sequences: 80
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 70
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region=Asia'"
     # Recent focal samples for Asia
@@ -500,14 +500,14 @@ subsampling:
     focal_early:
       group_by: "country year month"
       max_sequences: 256
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -533,14 +533,14 @@ subsampling:
     focal_early:
       group_by: "country year month"
       max_sequences: 256
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -566,14 +566,14 @@ subsampling:
     focal_early:
       group_by: "country year month"
       max_sequences: 256
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!={region}'"
     # Early contextual samples from the rest of the world
     context_early:
       group_by: "country year month"
       max_sequences: 64
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region={region}'"
     # Recent focal samples for region
@@ -609,55 +609,55 @@ subsampling:
   # ~4500 total (expect ~3400)
   # 10:1 ratio of recent to early
   # recent is present to 1m, n = 4120
-  # early is 1m to 25m, n = 412
+  # early is 1m to 13m, n = 412
   # regions are proportional to population size
   nextstrain_global_1m:
     africa_early:
       group_by: "country year month"
       max_sequences: 60
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
       max_sequences: 80
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
       max_sequences: 50
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
       max_sequences: 40
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
       max_sequences: 36
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
       max_sequences: 6
-      min_date: "--min-date 25M"
+      min_date: "--min-date 13M"
       max_date: "--max-date 1M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -705,55 +705,55 @@ subsampling:
   # ~4500 total (expect ~3400)
   # 10:1 ratio of recent to early
   # recent is present to 2m, n = 4120
-  # early is 2m to 26m, n = 412
+  # early is 2m to 14m, n = 412
   # regions are proportional to population size
   nextstrain_global_2m:
     africa_early:
       group_by: "country year month"
       max_sequences: 60
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
       max_sequences: 80
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
       max_sequences: 50
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
       max_sequences: 40
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
       max_sequences: 36
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
       max_sequences: 6
-      min_date: "--min-date 26M"
+      min_date: "--min-date 14M"
       max_date: "--max-date 2M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent:
@@ -801,55 +801,55 @@ subsampling:
   # ~4500 total (expect ~3400)
   # 10:1 ratio of recent to early
   # recent is present to 6m, n = 4120
-  # early is 6m to 30m, n = 412
+  # early is 6m to 18m, n = 412
   # regions are proportional to population size
   nextstrain_global_6m:
     africa_early:
       group_by: "country year month"
       max_sequences: 60
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Africa'"
     asia_early:
       group_by: "country year month"
       max_sequences: 80
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
     china_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=China'"
     europe_early:
       group_by: "country year month"
       max_sequences: 50
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Europe'"
     india_early:
       group_by: "division year month"
       max_sequences: 70
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'country!=India'"
     north_america_early:
       group_by: "division year month"
       max_sequences: 40
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=North America'"
     south_america_early:
       group_by: "country year month"
       max_sequences: 36
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=South America'"
     oceania_early:
       group_by: "division year month"
       max_sequences: 6
-      min_date: "--min-date 30M"
+      min_date: "--min-date 18M"
       max_date: "--max-date 6M"
       exclude: "--exclude-where 'region!=Oceania'"
     africa_recent: