Merge pull request #26 from nextstrain/include-target-seqs

Include WHO reference strains, vaccine strains, NCBI genotypes in trees
nextstrain · Apr 25, 2024 · 15957d4 · 15957d4
2 parents 6bed278 + b519f2f
commit 15957d4
Show file tree

Hide file tree

Showing 17 changed files with 348 additions and 62 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,5 @@
 # CHANGELOG
+* 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26)
 * 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22)
 * 2 April 2024: Add nextstrain-automation build-configs for deploying the final Auspice dataset of the phylogenetic workflow [PR #21](https://github.com/nextstrain/measles/pull/21)
 * 1 April 2024: Create a "N450" tree using the 450 nucleotides encoding the carboxyl-terminal 150 amino acids of the nucleoprotein, which is highly represented on NCBI for measles. [PR #20](https://github.com/nextstrain/measles/pull/20)

diff --git a/ingest/bin/parse-measles-genotype-names.py b/ingest/bin/parse-measles-genotype-names.py
@@ -0,0 +1,60 @@
+#! /usr/bin/env python3
+"""
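+From stdin, parses the measles genotype out of each NDJSON record's virus name (GenBank's 'virus-name', read from the 'virus_name' field unless --genotype-field says otherwise) and stores it in 'genotype_ncbi'.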
+
+Outputs the modified record to stdout.
+"""
+
+import argparse
+import json
+from sys import stdin, stdout, stderr
+
+import re
+
+# Genotype labels accepted as a valid 'genotype_ncbi' value, one family per line.
+EXPECTED_GENOTYPES = [
+    'A',
+    'B1', 'B2', 'B3',
+    'C1', 'C2',
+    'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11',
+    'E',
+    'F',
+    'G1', 'G2', 'G3',
+    'H1', 'H2',
+]
+
+def parse_args():
+    """Parse the command-line arguments of this script.
+
+    Returns the argparse namespace. Its ``genotype_field`` attribute names the
+    record field whose text is parsed into a genotype (default 'virus_name').
+    """
+    cli = argparse.ArgumentParser(
+        description="Modify measles virus-name attribute to extract genotypes to 'genotype_ncbi'.",
+    )
+    cli.add_argument(
+        "--genotype-field",
+        default='virus_name',
+        help="Field from the records to use as the genotype to be parsed.",
+    )
+    return cli.parse_args()
+
+
+# Ordered clean-up rules applied to the raw virus name. Order matters: the
+# bracket rule and the specific "Measles virus ..." prefixes must run before
+# the catch-all 'Measles virus.*$' rule blanks whatever is left.
+_GENOTYPE_CLEANUP_RULES = tuple(
+    (re.compile(pattern), replacement)
+    for pattern, replacement in (
+        (r'Measles virus genotype ', r''),
+        (r'Measles morbillivirus.*$', r''),
+        # If square brackets are present at the end of the string, keep only the text inside them
+        (r'.*?\[(.*)\]$', r'\1'),
+        (r'Measles virus MVs.*$', r''),
+        (r'Measles virus MVi.*$', r''),
+        (r'Measles virus strain MVi.*$', r''),
+        (r'Measles virus strain ', r''),
+        (r'Measles virus.*$', r''),
+        (r'A-vaccine.*$', r'A'),
+        # Collapse sub-genotype labels onto their parent genotype. The dot is
+        # escaped so that only a literal "B3.1"/"B3.2" is rewritten.
+        (r'B3\.1', r'B3'),
+        (r'B3\.2', r'B3'),
+        (r'D4a', r'D4'),
+        (r'D4b', r'D4'),
+        (r'H1a', r'H1'),
+        (r'H1b', r'H1'),
+    )
+)
+
+def _set_genotype_name(record):
+    """Return the genotype label parsed from ``record['genotype_ncbi']``.
+
+    The value is expected to hold the raw virus name. Known prefixes and
+    strain names are removed, bracketed text at the end of the name is kept,
+    and sub-genotype labels (B3.1, D4a, H1b, ...) are collapsed to their
+    parent genotype. Surrounding whitespace is stripped from the result.
+
+    A missing or null value yields an empty string. The result is not
+    validated here; it may be any leftover text.
+    """
+    genotype_name = record.get("genotype_ncbi") or ""
+
+    for pattern, replacement in _GENOTYPE_CLEANUP_RULES:
+        genotype_name = pattern.sub(replacement, genotype_name)
+
+    # Stray whitespace would otherwise make a valid label look unexpected.
+    return genotype_name.strip()
+
+
+def main():
+    """Add a cleaned 'genotype_ncbi' value to every NDJSON record on stdin.
+
+    Each input line is decoded as JSON, the field named by --genotype-field
+    is parsed into a genotype label, and the record is written to stdout as
+    one JSON line. Labels not listed in EXPECTED_GENOTYPES are reported on
+    stderr and replaced with an empty string.
+    """
+    args = parse_args()
+
+    for index, line in enumerate(stdin):
+        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
+        if not line.strip():
+            continue
+        record = json.loads(line)
+        # A missing or null source field means "no genotype", not a crash.
+        record['genotype_ncbi'] = record.get(args.genotype_field) or ''
+        record['genotype_ncbi'] = _set_genotype_name(record)
+        if record['genotype_ncbi'] not in EXPECTED_GENOTYPES:
+            print(f"WARNING: unexpected NCBI genotype {record['genotype_ncbi']} parsed from record {index} will be excluded.", file=stderr)
+            record['genotype_ncbi'] = ''
+        stdout.write(json.dumps(record) + "\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
@@ -8,19 +8,141 @@
 # Vaccine strain information from Parks et al. Comparison of predicted amino acid
 # sequences of measles virus strains in the Edmonston vaccine lineage
 # https://doi.org/10.1128/jvi.75.2.910-920.2001
-AF266288.2	strain	Measles strain Edmonston WT
-AF266288.2	date	1954
-AF266288.2	region	North America
-AF266288.2	country	USA
-AF266288.2	division	Massachusetts
-AF266288.2	location	Boston
-AF266287.1	strain	Measles vaccine strain Moraten
-AF266287.1	date	1954
-AF266290.1	strain	Measles vaccine strain Zagreb
-AF266290.1	date	1954
-AF266289.1	strain	Measles vaccine strain Rubeovax
-AF266289.1	date	1954
-AF266291.1	strain	Measles vaccine strain Schwarz
-AF266291.1	date	1954
-AF266286.1	strain	Measles vaccine strain AIK-C
-AF266286.1	date	1954
+AF266288	strain	Measles strain Edmonston WT
+AF266288	date	1954
+AF266288	region	North America
+AF266288	country	USA
+AF266288	division	Massachusetts
+AF266288	location	Boston
+AF266288	genotype_ncbi	A
+AF266287	strain	Measles vaccine strain Moraten
+AF266287	date	1954
+AF266287	genotype_ncbi	A
+AF266290	strain	Measles vaccine strain Zagreb
+AF266290	date	1954
+AF266290	genotype_ncbi	A
+AF266289	strain	Measles vaccine strain Rubeovax
+AF266289	date	1954
+AF266289	genotype_ncbi	A
+AF266291	strain	Measles vaccine strain Schwarz
+AF266291	date	1954
+AF266291	genotype_ncbi	A
+AF266286	strain	Measles vaccine strain AIK-C
+AF266286	date	1954
+AF266286	genotype_ncbi	A
+#
+# WHO genotype reference strains
+# Information from https://www.who.int/publications/i/item/WER8709
+# Dates are derived from the epi-week and year encoded in each strain name
+# (e.g. "49.97" = week 49 of 1997); strains with week 0 have a year-only date
+AF045212	is_reference	TRUE
+AF045217	is_reference	TRUE
+AF079555	is_reference	TRUE
+AF171232	is_reference	TRUE
+AF243450	is_reference	TRUE
+AF280803	is_reference	TRUE
+AF481485	is_reference	TRUE
+AJ232203	is_reference	TRUE
+AY037020	is_reference	TRUE
+AY043459	is_reference	TRUE
+AY184217	is_reference	TRUE
+AY923185	is_reference	TRUE
+D01005	is_reference	TRUE
+GU440571	is_reference	TRUE
+L46750	is_reference	TRUE
+L46753	is_reference	TRUE
+L46758	is_reference	TRUE
+M89921	is_reference	TRUE
+U01974	is_reference	TRUE
+U01976	is_reference	TRUE
+U01977	is_reference	TRUE
+U01987	is_reference	TRUE
+U01994	is_reference	TRUE
+U01998	is_reference	TRUE
+U64582	is_reference	TRUE
+X84865	is_reference	TRUE
+X84872	is_reference	TRUE
+X84879	is_reference	TRUE
+AF045212	genotype_ncbi	H1
+AF045217	genotype_ncbi	H2
+AF079555	genotype_ncbi	D5
+AF171232	genotype_ncbi	G2
+AF243450	genotype_ncbi	D7
+AF280803	genotype_ncbi	D8
+AF481485	genotype_ncbi	D9
+AJ232203	genotype_ncbi	B3
+AY037020	genotype_ncbi	D7
+AY043459	genotype_ncbi	C1
+AY184217	genotype_ncbi	G3
+AY923185	genotype_ncbi	D10
+D01005	genotype_ncbi	D1
+GU440571	genotype_ncbi	D11
+L46750	genotype_ncbi	D6
+L46753	genotype_ncbi	B3
+L46758	genotype_ncbi	D5
+M89921	genotype_ncbi	C2
+U01974	genotype_ncbi	G1
+U01976	genotype_ncbi	D4
+U01977	genotype_ncbi	D3
+U01987	genotype_ncbi	A
+U01994	genotype_ncbi	B2
+U01998	genotype_ncbi	B1
+U64582	genotype_ncbi	D2
+X84865	genotype_ncbi	F
+X84872	genotype_ncbi	C2
+X84879	genotype_ncbi	E
+AF045212	strain	MVi/Hunan.CHN/0.93/7
+AF045217	strain	MVi/Beijing.CHN/0.94/1
+AF079555	strain	MVi/Bangkok.THA/0.93/1
+AF171232	strain	MVi/Amsterdam.NLD/49.97
+AF243450	strain	MVi/Victoria.AUS/16.85
+AF280803	strain	MVi/Manchester.GBR/30.94
+AF481485	strain	MVi/Victoria.AUS/12.99
+AJ232203	strain	MVi/Ibadan.NGA/0.97/1
+AY037020	strain	MVi/Illinois.USA/50.99
+AY043459	strain	MVi/Tokyo.JPN/0.84
+AY184217	strain	MVi/Gresik.IDN/17.02
+AY923185	strain	MVi/Kampala.UGA/51.01/1
+D01005	strain	MVi/Bristol.GBR/0.74
+GU440571	strain	MVi/Menglian.Yunnan.CHN/47.09
+L46750	strain	MVi/NewJersey.USA/0.94/1
+L46753	strain	MVi/NewYork.USA/0.94
+L46758	strain	MVi/Palau/0.93
+M89921	strain	MVi/Maryland.USA/0.77
+U01974	strain	MVi/Berkeley.USA/0.83
+U01976	strain	MVi/Montreal.CAN/0.89
+U01977	strain	MVi/Illinois.USA/0.89/1
+U01987	strain	MVi/Maryland.USA/0.54
+U01994	strain	MVi/Libreville.GAB/0.84
+U01998	strain	MVi/Yaounde.CMR/12.83
+U64582	strain	MVi/Johannesburg.ZAF/0.88/1
+X84865	strain	MVs/Madrid.ESP/0.94(SSPE)
+X84872	strain	MVi/Erlangen.DEU/0.90
+X84879	strain	MVi/Goettingen.DEU/0.71
+AF045212	date	1993
+AF045217	date	1994
+AF079555	date	1993
+AF171232	date	1997-12-01
+AF243450	date	1985-04-15
+AF280803	date	1994-07-25
+AF481485	date	1999-03-22
+AJ232203	date	1997
+AY037020	date	1999-12-13
+AY043459	date	1984
+AY184217	date	2002-04-22
+AY923185	date	2001-12-17
+D01005	date	1974
+GU440571	date	2009-11-16
+L46750	date	1994
+L46753	date	1994
+L46758	date	1993
+M89921	date	1977
+U01974	date	1983
+U01976	date	1989
+U01977	date	1989
+U01987	date	1954
+U01994	date	1984
+U01998	date	1983-03-21
+U64582	date	1988
+X84865	date	1994
+X84872	date	1990
+X84879	date	1971
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -45,6 +45,7 @@ curate:
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
   field_map:
     accession: accession
+    accession_version: accession_version
     sourcedb: database
     sra-accs: sra_accessions
     isolate-lineage: strain
@@ -61,6 +62,7 @@ curate:
     submitter-affiliation: institution
     submitter-country: submitter_country
     virus-name: virus_name
+    is_reference: is_reference
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names across pathogens
   strain_regex: '^.+$'
@@ -100,6 +102,7 @@ curate:
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: [
     'accession',
+    'accession_version',
     'strain',
     'virus_name',
     'date',
@@ -115,5 +118,7 @@ curate:
     'authors',
     'abbr_authors',
     'institution',
+    'genotype_ncbi',
+    'is_reference'
   ]
-
+  genotype_field: "virus_name"
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -82,6 +82,7 @@ rule curate:
         annotations_id=config["curate"]["annotations_id"],
         id_field=config["curate"]["output_id_field"],
         sequence_field=config["curate"]["output_sequence_field"],
+        genotype_field=config["curate"]["genotype_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
@@ -105,6 +106,7 @@ rule curate:
                 --abbr-authors-field {params.abbr_authors_field} \
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
+            | ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
             | ./vendored/merge-user-metadata \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
@@ -75,6 +75,9 @@ rule format_ncbi_dataset_report:
             --fields {params.ncbi_datasets_fields:q} \
             --elide-header \
             | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
+            | csvtk rename -t -f accession -n accession_version \
+            | csvtk -tl mutate -f accession_version -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -98,7 +101,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession \
+            --seq-id-column accession_version \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \

diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
@@ -25,6 +25,11 @@
       "key": "region",
       "title": "Region",
       "type": "categorical"
+    },
+    {
+      "key": "genotype_ncbi",
+      "title": "Genotype (NCBI)",
+      "type": "categorical"
     }
   ],
   "geo_resolutions": [

diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json
@@ -25,6 +25,16 @@
       "key": "region",
       "title": "Region",
       "type": "categorical"
+    },
+    {
+      "key": "genotype_ncbi",
+      "title": "Genotype (NCBI)",
+      "type": "categorical"
+    },
+    {
+      "key": "is_reference",
+      "title": "WHO reference",
+      "type": "categorical"
     }
   ],
   "geo_resolutions": [

diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv
@@ -4,3 +4,28 @@ region	Africa	#8ABB6A
 region	Europe	#BEBB48
 region	South America	#E29E39
 region	North America	#E2562B
+
+genotype_ncbi	A	#5E1D9D
+genotype_ncbi	B1	#4B26B1
+genotype_ncbi	B2	#4138C3
+genotype_ncbi	B3	#3F4FCC
+genotype_ncbi	C1	#4065CF
+genotype_ncbi	C2	#447ACD
+genotype_ncbi	D1	#4A8BC3
+genotype_ncbi	D2	#529AB6
+genotype_ncbi	D3	#5BA6A6
+genotype_ncbi	D4	#66AE95
+genotype_ncbi	D5	#73B583
+genotype_ncbi	D6	#81B973
+genotype_ncbi	D7	#91BC64
+genotype_ncbi	D8	#A1BE58
+genotype_ncbi	D9	#B1BD4E
+genotype_ncbi	D10	#C0BA47
+genotype_ncbi	D11	#CEB541
+genotype_ncbi	E	#DAAD3D
+genotype_ncbi	F	#E19F3A
+genotype_ncbi	G1	#E68E36
+genotype_ncbi	G2	#E67832
+genotype_ncbi	G3	#E35F2D
+genotype_ncbi	H1	#DF4328
+genotype_ncbi	H2	#DB2823
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -1,6 +1,8 @@
 strain_id_field: "accession"
 files:
     exclude: "defaults/dropped_strains.txt"
+    include_genome: "defaults/include_strains_genome.txt"
+    include_N450: "defaults/include_strains_N450.txt"
     reference: "defaults/measles_reference.gb"
     reference_N450: "defaults/measles_reference_N450.gb"
     reference_N450_fasta: "defaults/measles_reference_N450.fasta"

diff --git a/phylogenetic/defaults/dropped_strains.txt b/phylogenetic/defaults/dropped_strains.txt
@@ -1,2 +1,2 @@
-HM562901.1 # temara.MOR/24.03
-HM562900.1 # Mvs/Toulon.FRA/08.07
+HM562901 # temara.MOR/24.03
+HM562900 # Mvs/Toulon.FRA/08.07