diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index cf1203010..4c265ae20 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -6,7 +6,6 @@ log_level: DEBUG compound_country_field: ncbiGeoLocation fasta_id_field: genbankAccession keep: - - division - geoLocAdmin1 - geoLocAdmin2 - submissionId diff --git a/ingest/scripts/prepare_metadata.py b/ingest/scripts/prepare_metadata.py index 34b74cd8b..b02c9f42d 100644 --- a/ingest/scripts/prepare_metadata.py +++ b/ingest/scripts/prepare_metadata.py @@ -8,6 +8,7 @@ import hashlib import json import logging +import re from dataclasses import dataclass from pathlib import Path @@ -53,6 +54,23 @@ def get_geoloc(input_string: str, config: Config) -> tuple[str, str, str]: geo_loc_admin1 = option geo_loc_admin2 = "" if option.lower() == division.lower() else division return country, geo_loc_admin1, geo_loc_admin2 + try: + geolocadmin1_abbreviations = { + division.code: division.name + for division in pycountry.subdivisions.get(country_code=country_code) + } + geolocadmin1_abbreviations = { + abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items() + } + except Exception as e: + logger.error(f"Error getting subdivisions codes for {country}: {e}") + return country, division, "" + for option, name in geolocadmin1_abbreviations.items(): + division_words = re.split(r"[,\s]+", division) + if option in division_words: + geo_loc_admin1 = name + geo_loc_admin2 = "" if option == division else division + return country, geo_loc_admin1, geo_loc_admin2 return country, "", division return country, division, ""