Skip to content

Commit

Permalink
Merge pull request #26 from nextstrain/include-target-seqs
Browse files Browse the repository at this point in the history
Include WHO reference strains, vaccine strains, NCBI genotypes in trees
  • Loading branch information
kimandrews authored Apr 25, 2024
2 parents 6bed278 + b519f2f commit 15957d4
Show file tree
Hide file tree
Showing 17 changed files with 348 additions and 62 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# CHANGELOG
* 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26)
* 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22)
* 2 April 2024: Add nextstrain-automation build-configs for deploying the final Auspice dataset of the phylogenetic workflow [PR #21](https://github.com/nextstrain/measles/pull/21)
* 1 April 2024: Create a "N450" tree using the 450 nucleotides encoding the carboxyl-terminal 150 amino acids of the nucleoprotein, which is highly represented on NCBI for measles. [PR #20](https://github.com/nextstrain/measles/pull/20)
Expand Down
60 changes: 60 additions & 0 deletions ingest/bin/parse-measles-genotype-names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#! /usr/bin/env python3
"""
From stdin, parses genotypes from GenBank's 'virus-name' field of the NDJSON record to 'genotype_ncbi'
Outputs the modified record to stdout.
"""

import argparse
import json
from sys import stdin, stdout, stderr

import re

# Genotype labels accepted as a valid 'genotype_ncbi' value; main() blanks out
# any parsed value that is not in this list. Grouped by clade letter.
EXPECTED_GENOTYPES = [
    'A',
    'B1', 'B2', 'B3',
    'C1', 'C2',
    'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11',
    'E',
    'F',
    'G1', 'G2', 'G3',
    'H1', 'H2',
]

def parse_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace with a single attribute, ``genotype_field``: the
        name of the record field whose value is parsed into a genotype
        (``--genotype-field``, default ``'virus_name'``).
    """
    description = "Modify measles virus-name attribute to extract genotypes to 'genotype_ncbi'."
    field_help = "Field from the records to use as the genotype to be parsed."

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("--genotype-field", default='virus_name', help=field_help)

    options = cli.parse_args()
    return options

def _set_genotype_name(record):
    """Return the genotype label parsed from ``record['genotype_ncbi']``.

    The value is expected to hold a free-text GenBank virus name. Known
    boilerplate ("Measles virus genotype ", strain names, ...) is stripped and
    sub-genotype labels are collapsed onto their parent genotype (B3.1 -> B3,
    D4a -> D4, H1b -> H1, "A-vaccine..." -> A).

    Args:
        record: dict with a ``'genotype_ncbi'`` key holding a string (or None).

    Returns:
        The cleaned-up string. It is NOT validated here; it may be empty or
        contain leftover text that is not a genotype.

    Raises:
        KeyError: if ``record`` has no ``'genotype_ncbi'`` key.
    """
    # A JSON null arrives as None; treat it as "no name" instead of crashing
    # on the string operations below.
    genotype_name = record["genotype_ncbi"] or ''

    # Applied strictly in this order: later rules rely on what earlier ones
    # left behind (e.g. 'Measles virus strain MVi...' must be dropped before
    # the bare 'Measles virus strain ' prefix is removed).
    cleanup_rules = (
        (r'Measles virus genotype ', ''),
        (r'Measles morbillivirus.*$', ''),
        # If square brackets are present at the end of the string, keep only
        # the text inside the brackets.
        (r'.*?\[(.*)\]$', r'\1'),
        (r'Measles virus MVs.*$', ''),
        (r'Measles virus MVi.*$', ''),
        (r'Measles virus strain MVi.*$', ''),
        (r'Measles virus strain ', ''),
        (r'Measles virus.*$', ''),
        (r'A-vaccine.*$', 'A'),
        # The dot is escaped so that only the literal sub-genotypes "B3.1" and
        # "B3.2" collapse to B3, not any "B3<char>1" / "B3<char>2".
        (r'B3\.1', 'B3'),
        (r'B3\.2', 'B3'),
        (r'D4a', 'D4'),
        (r'D4b', 'D4'),
        (r'H1a', 'H1'),
        (r'H1b', 'H1'),
    )
    for pattern, replacement in cleanup_rules:
        genotype_name = re.sub(pattern, replacement, genotype_name)

    return genotype_name

def main():
    """Stream NDJSON records from stdin to stdout, adding 'genotype_ncbi'.

    For every input line the configured genotype field is copied into
    'genotype_ncbi' and cleaned up with _set_genotype_name(). A value that is
    not one of EXPECTED_GENOTYPES is reported on stderr and replaced with an
    empty string. Each record is written back as one JSON line.
    """
    options = parse_args()

    for index, line in enumerate(stdin):
        record = json.loads(line)

        # Seed the output field with the raw name, then parse it in place.
        record['genotype_ncbi'] = record[options.genotype_field]
        record['genotype_ncbi'] = _set_genotype_name(record)

        is_expected = record['genotype_ncbi'] in EXPECTED_GENOTYPES
        if not is_expected:
            print(f"WARNING: unexpected NCBI genotype {record['genotype_ncbi']} parsed from record {index} will be excluded.", file=stderr)
            record['genotype_ncbi'] = ''

        print(json.dumps(record), file=stdout)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
154 changes: 138 additions & 16 deletions ingest/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,141 @@
# Vaccine strain information from Parks et al. Comparison of predicted amino acid
# sequences of measles virus strains in the Edmonston vaccine lineage
# https://doi.org/10.1128/jvi.75.2.910-920.2001
AF266288.2 strain Measles strain Edmonston WT
AF266288.2 date 1954
AF266288.2 region North America
AF266288.2 country USA
AF266288.2 division Massachusetts
AF266288.2 location Boston
AF266287.1 strain Measles vaccine strain Moraten
AF266287.1 date 1954
AF266290.1 strain Measles vaccine strain Zagreb
AF266290.1 date 1954
AF266289.1 strain Measles vaccine strain Rubeovax
AF266289.1 date 1954
AF266291.1 strain Measles vaccine strain Schwarz
AF266291.1 date 1954
AF266286.1 strain Measles vaccine strain AIK-C
AF266286.1 date 1954
AF266288 strain Measles strain Edmonston WT
AF266288 date 1954
AF266288 region North America
AF266288 country USA
AF266288 division Massachusetts
AF266288 location Boston
AF266288 genotype_ncbi A
AF266287 strain Measles vaccine strain Moraten
AF266287 date 1954
AF266287 genotype_ncbi A
AF266290 strain Measles vaccine strain Zagreb
AF266290 date 1954
AF266290 genotype_ncbi A
AF266289 strain Measles vaccine strain Rubeovax
AF266289 date 1954
AF266289 genotype_ncbi A
AF266291 strain Measles vaccine strain Schwarz
AF266291 date 1954
AF266291 genotype_ncbi A
AF266286 strain Measles vaccine strain AIK-C
AF266286 date 1954
AF266286 genotype_ncbi A
#
# WHO genotype reference strains
# Information from https://www.who.int/publications/i/item/WER8709
# Dates are retrieved from epi-weeks reported within strain names
AF045212 is_reference TRUE
AF045217 is_reference TRUE
AF079555 is_reference TRUE
AF171232 is_reference TRUE
AF243450 is_reference TRUE
AF280803 is_reference TRUE
AF481485 is_reference TRUE
AJ232203 is_reference TRUE
AY037020 is_reference TRUE
AY043459 is_reference TRUE
AY184217 is_reference TRUE
AY923185 is_reference TRUE
D01005 is_reference TRUE
GU440571 is_reference TRUE
L46750 is_reference TRUE
L46753 is_reference TRUE
L46758 is_reference TRUE
M89921 is_reference TRUE
U01974 is_reference TRUE
U01976 is_reference TRUE
U01977 is_reference TRUE
U01987 is_reference TRUE
U01994 is_reference TRUE
U01998 is_reference TRUE
U64582 is_reference TRUE
X84865 is_reference TRUE
X84872 is_reference TRUE
X84879 is_reference TRUE
AF045212 genotype_ncbi H1
AF045217 genotype_ncbi H2
AF079555 genotype_ncbi D5
AF171232 genotype_ncbi G2
AF243450 genotype_ncbi D7
AF280803 genotype_ncbi D8
AF481485 genotype_ncbi D9
AJ232203 genotype_ncbi B3
AY037020 genotype_ncbi D7
AY043459 genotype_ncbi C1
AY184217 genotype_ncbi G3
AY923185 genotype_ncbi D10
D01005 genotype_ncbi D1
GU440571 genotype_ncbi D11
L46750 genotype_ncbi D6
L46753 genotype_ncbi B3
L46758 genotype_ncbi D5
M89921 genotype_ncbi C2
U01974 genotype_ncbi G1
U01976 genotype_ncbi D4
U01977 genotype_ncbi D3
U01987 genotype_ncbi A
U01994 genotype_ncbi B2
U01998 genotype_ncbi B1
U64582 genotype_ncbi D2
X84865 genotype_ncbi F
X84872 genotype_ncbi C2
X84879 genotype_ncbi E
AF045212 strain MVi/Hunan.CHN/0.93/7
AF045217 strain MVi/Beijing.CHN/0.94/1
AF079555 strain MVi/Bangkok.THA/0.93/1
AF171232 strain MVi/Amsterdam.NLD/49.97
AF243450 strain MVi/Victoria.AUS/16.85
AF280803 strain MVi/Manchester.GBR/30.94
AF481485 strain MVi/Victoria.AUS/12.99
AJ232203 strain MVi/Ibadan.NGA/0.97/1
AY037020 strain MVi/Illinois.USA/50.99
AY043459 strain MVi/Tokyo.JPN/0.84
AY184217 strain MVi/Gresik.IDN/17.02
AY923185 strain MVi/Kampala.UGA/51.01/1
D01005 strain MVi/Bristol.GBR/0.74
GU440571 strain MVi/Menglian.Yunnan.CHN/47.09
L46750 strain MVi/NewJersey.USA/0.94/1
L46753 strain MVi/NewYork.USA/0.94
L46758 strain MVi/Palau/0.93
M89921 strain MVi/Maryland.USA/0.77
U01974 strain MVi/Berkeley.USA/0.83
U01976 strain MVi/Montreal.CAN/0.89
U01977 strain MVi/Illinois.USA/0.89/1
U01987 strain MVi/Maryland.USA/0.54
U01994 strain MVi/Libreville.GAB/0.84
U01998 strain MVi/Yaounde.CMR/12.83
U64582 strain MVi/Johannesburg.ZAF/0.88/1
X84865 strain MVs/Madrid.ESP/0.94(SSPE)
X84872 strain MVi/Erlangen.DEU/0.90
X84879 strain MVi/Goettingen.DEU/0.71
AF045212 date 1993
AF045217 date 1994
AF079555 date 1993
AF171232 date 1997-12-01
AF243450 date 1985-04-15
AF280803 date 1994-07-25
AF481485 date 1999-03-22
AJ232203 date 1997
AY037020 date 1999-12-13
AY043459 date 1984
AY184217 date 2002-04-22
AY923185 date 2001-12-17
D01005 date 1974
GU440571 date 2009-11-16
L46750 date 1994
L46753 date 1994
L46758 date 1993
M89921 date 1977
U01974 date 1983
U01976 date 1989
U01977 date 1989
U01987 date 1954
U01994 date 1984
U01998 date 1983-03-21
U64582 date 1988
X84865 date 1994
X84872 date 1990
X84879 date 1971
7 changes: 6 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ curate:
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: accession
accession_version: accession_version
sourcedb: database
sra-accs: sra_accessions
isolate-lineage: strain
Expand All @@ -61,6 +62,7 @@ curate:
submitter-affiliation: institution
submitter-country: submitter_country
virus-name: virus_name
is_reference: is_reference
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
Expand Down Expand Up @@ -100,6 +102,7 @@ curate:
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'accession',
'accession_version',
'strain',
'virus_name',
'date',
Expand All @@ -115,5 +118,7 @@ curate:
'authors',
'abbr_authors',
'institution',
'genotype_ncbi',
'is_reference'
]

genotype_field: "virus_name"
2 changes: 2 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ rule curate:
annotations_id=config["curate"]["annotations_id"],
id_field=config["curate"]["output_id_field"],
sequence_field=config["curate"]["output_sequence_field"],
genotype_field=config["curate"]["genotype_field"],
shell:
"""
(cat {input.sequences_ndjson} \
Expand All @@ -105,6 +106,7 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
5 changes: 4 additions & 1 deletion ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ rule format_ncbi_dataset_report:
--fields {params.ncbi_datasets_fields:q} \
--elide-header \
| csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
| csvtk rename -t -f accession -n accession_version \
| csvtk -tl mutate -f accession_version -n accession -p "^(.+?)\." \
| tsv-select -H -f accession --rest last \
> {output.ncbi_dataset_tsv}
"""

Expand All @@ -98,7 +101,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession \
--seq-id-column accession_version \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
5 changes: 5 additions & 0 deletions phylogenetic/defaults/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "genotype_ncbi",
"title": "Genotype (NCBI)",
"type": "categorical"
}
],
"geo_resolutions": [
Expand Down
10 changes: 10 additions & 0 deletions phylogenetic/defaults/auspice_config_N450.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "genotype_ncbi",
"title": "Genotype (NCBI)",
"type": "categorical"
},
{
"key": "is_reference",
"title": "WHO reference",
"type": "categorical"
}
],
"geo_resolutions": [
Expand Down
25 changes: 25 additions & 0 deletions phylogenetic/defaults/colors.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,28 @@ region Africa #8ABB6A
region Europe #BEBB48
region South America #E29E39
region North America #E2562B

genotype_ncbi A #5E1D9D
genotype_ncbi B1 #4B26B1
genotype_ncbi B2 #4138C3
genotype_ncbi B3 #3F4FCC
genotype_ncbi C1 #4065CF
genotype_ncbi C2 #447ACD
genotype_ncbi D1 #4A8BC3
genotype_ncbi D2 #529AB6
genotype_ncbi D3 #5BA6A6
genotype_ncbi D4 #66AE95
genotype_ncbi D5 #73B583
genotype_ncbi D6 #81B973
genotype_ncbi D7 #91BC64
genotype_ncbi D8 #A1BE58
genotype_ncbi D9 #B1BD4E
genotype_ncbi D10 #C0BA47
genotype_ncbi D11 #CEB541
genotype_ncbi E #DAAD3D
genotype_ncbi F #E19F3A
genotype_ncbi G1 #E68E36
genotype_ncbi G2 #E67832
genotype_ncbi G3 #E35F2D
genotype_ncbi H1 #DF4328
genotype_ncbi H2 #DB2823
2 changes: 2 additions & 0 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
strain_id_field: "accession"
files:
exclude: "defaults/dropped_strains.txt"
include_genome: "defaults/include_strains_genome.txt"
include_N450: "defaults/include_strains_N450.txt"
reference: "defaults/measles_reference.gb"
reference_N450: "defaults/measles_reference_N450.gb"
reference_N450_fasta: "defaults/measles_reference_N450.fasta"
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/defaults/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
HM562901.1 # temara.MOR/24.03
HM562900.1 # Mvs/Toulon.FRA/08.07
HM562901 # temara.MOR/24.03
HM562900 # Mvs/Toulon.FRA/08.07
Loading

0 comments on commit 15957d4

Please sign in to comment.