Skip to content

Commit

Permalink
add abbreviations
Browse files (browse the repository at this point in the history)
  • Loading branch information
anna-parker committed Oct 18, 2024
1 parent 8632ca8 commit eca9109
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 1 deletion.
1 change: 0 additions & 1 deletion ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ log_level: DEBUG
compound_country_field: ncbiGeoLocation
fasta_id_field: genbankAccession
keep:
- division
- geoLocAdmin1
- geoLocAdmin2
- submissionId
Expand Down
18 changes: 18 additions & 0 deletions ingest/scripts/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import hashlib
import json
import logging
import re
from dataclasses import dataclass
from pathlib import Path

Expand Down Expand Up @@ -53,6 +54,23 @@ def get_geoloc(input_string: str, config: Config) -> tuple[str, str, str]:
geo_loc_admin1 = option
geo_loc_admin2 = "" if option.lower() == division.lower() else division
return country, geo_loc_admin1, geo_loc_admin2
try:
geolocadmin1_abbreviations = {
division.code: division.name
for division in pycountry.subdivisions.get(country_code=country_code)
}
geolocadmin1_abbreviations = {
abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items()
}
except Exception as e:
logger.error(f"Error getting subdivisions codes for {country}: {e}")
return country, division, ""
for option, name in geolocadmin1_abbreviations.items():
division_words = re.split(r"[,\s]+", division)
if option in division_words:
geo_loc_admin1 = name
geo_loc_admin2 = "" if option == division else division
return country, geo_loc_admin1, geo_loc_admin2
return country, "", division
return country, division, ""

Expand Down

0 comments on commit eca9109

Please sign in to comment.