Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest BioPortal PURLs from prefixmaps #1099

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions src/bioregistry/curation/ingest_prefixmaps_bioportal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Intest manually curated BioPortal PURLs from :mod:`prefixmaps`, curated by Harry Caufield."""

import requests
import yaml
import bioregistry
from bioregistry.external.bioportal import get_bioportal

#: The raw GitHub URL of the curated BioPortal YAML file (``bioportal.curated.yaml``)
#: on the ``main`` branch of the linkml/prefixmaps repository
URL = "https://raw.githubusercontent.com/linkml/prefixmaps/main/src/prefixmaps/data/bioportal.curated.yaml"
#: A mapping from BioPortal prefixes to lists of URI prefixes to skip
BLACKLIST = {"BFO": ["http://www.ifomis.org/bfo/1.1/snap#"]}


def main():
    """Add curated BioPortal URI prefixes from :mod:`prefixmaps` as Bioregistry providers.

    Downloads the curated YAML file at :data:`URL`, then for each BioPortal
    prefix that (1) exists in BioPortal and (2) is mapped to a Bioregistry
    prefix, compares the curated URI prefixes against the ones the Bioregistry
    resource already knows. Every URI prefix that is new is printed; up to
    ``max_count`` of them are appended to the resource as providers. The
    registry is written back to disk at the end.

    :raises requests.HTTPError: if the curated file cannot be downloaded
    """
    count = 0
    # Upper bound on the number of providers appended in a single run,
    # so that the results can be curated in manageable batches
    max_count = 10
    bioportal = get_bioportal(force_download=False)

    bioportal_to_bioregistry = bioregistry.get_registry_invmap("bioportal")

    # Use a timeout so the script can't hang forever, and fail loudly on a
    # bad HTTP status instead of trying to parse an error page as YAML
    res = requests.get(URL, timeout=30)
    res.raise_for_status()
    data = yaml.safe_load(res.text)["prefixes"]
    for bioportal_prefix, uri_prefixes in data.items():
        if bioportal_prefix not in bioportal:
            # these are nonsense
            continue

        bioregistry_prefix = bioportal_to_bioregistry.get(bioportal_prefix)
        if bioregistry_prefix is None:
            # these might be relevant, but are not currently in the Bioregistry.
            # note that there's no quality filter on BioPortal content, and it's not
            # clear if there's a quality filter on the curation here, so we skip them
            continue

        resource = bioregistry.get_resource(bioregistry_prefix)
        bioregistry_uri_prefixes = resource.get_uri_prefixes()
        # The curated file may give either a single string or a list of strings
        if isinstance(uri_prefixes, str):
            uri_prefixes = [uri_prefixes]
        for uri_prefix in uri_prefixes:
            if uri_prefix in BLACKLIST.get(bioportal_prefix, []):
                continue
            # Expand the "OBO:" shorthand into a full OBO PURL prefix
            if uri_prefix.startswith("OBO:"):
                uri_prefix = "http://purl.obolibrary.org/obo/" + uri_prefix[len("OBO:") :]
            if uri_prefix in bioregistry_uri_prefixes:
                continue
            print(bioregistry_prefix, uri_prefix)

            # Keep printing the remaining candidates, but stop adding providers
            # once exactly ``max_count`` have been appended
            if count >= max_count:
                continue
            # NOTE(review): the metadata is left blank as a placeholder for manual
            # curation; tests/test_data.py rejects empty code/name/homepage/description,
            # so these need to be filled in before the registry changes are committed
            p = bioregistry.Provider(
                code="",
                name="",
                homepage="",
                description="",
                uri_format=uri_prefix + "$1",
            )
            if resource.providers is None:
                resource.providers = []
            resource.providers.append(p)
            count += 1

    bioregistry.manager.write_registry()


# Run the ingestion only when this file is executed as a script
if __name__ == "__main__":
    main()
72 changes: 62 additions & 10 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,15 @@
"prefix": "ADO"
},
"pattern": "^\\d{7}$",
"providers": [
{
"code": "legacy",
"description": "Legacy URI used before switching to OBO PURLs",
"homepage": "http://scai.fraunhofer.de",
"name": "Legacy",
"uri_format": "http://scai.fraunhofer.de/AlzheimerOntology#$1"
}
],
"publications": [
{
"doi": "10.1016/j.jalz.2013.02.009",
Expand Down Expand Up @@ -2717,10 +2726,10 @@
"pattern": "^ACTRN\\d+$",
"providers": [
{
"code": "",
"description": "",
"homepage": "",
"name": "",
"code": "redirect",
"description": "Australian New Zealand Clinical Trials Registry (redirect)",
"homepage": "https://anzctr.org.au",
"name": "Australian New Zealand Clinical Trials Registry (redirect)",
"uri_format": "https://anzctr.org.au/ACTRN$1.aspx"
}
],
Expand Down Expand Up @@ -4799,6 +4808,15 @@
],
"uri_format": "http://www.whocc.no/atc_ddd_index/?code=$1"
},
"providers": [
{
"code": "bioportal.purl",
"description": "BioPortal assigned additional PURLs for ATC codes",
"homepage": "http://purl.bioontology.org/ontology/ATC",
"name": "BioPortal",
"uri_format": "http://purl.bioontology.org/ontology/ATC/$1"
}
],
"publications": [
{
"pubmed": "7368387",
Expand Down Expand Up @@ -9539,6 +9557,15 @@
},
"name": "Biomedical Informatics Research Network Lexicon",
"pattern": "^\\d+$",
"providers": [
{
"code": "bioportal.purl",
"description": "Pseudo-BioPortal PURL assigned to BirnLex",
"homepage": "http://bioontology.org/projects/ontologies/birnlex",
"name": "BioPortal",
"uri_format": "http://bioontology.org/projects/ontologies/birnlex#$1"
}
],
"uri_format": "http://uri.neuinfo.org/nif/nifstd/birnlex_$1"
},
"biro": {
Expand Down Expand Up @@ -26859,10 +26886,17 @@
"providers": [
{
"code": "purl",
"description": "Legacy PURLs found in OAE",
"description": "Legacy PURLs for DOID, including the redundant DOID_",
"homepage": "http://purl.org/obo/owl/",
"name": "Legacy PURL",
"uri_format": "http://purl.org/obo/owl/DOID#DOID_$1"
},
{
"code": "purl2",
"description": "Legacy PURLs for DOID, not including the redundant DOID_",
"homepage": "http://purl.org/obo/owl/",
"name": "Legacy PURL",
"uri_format": "http://purl.org/obo/owl/DOID#$1"
}
],
"publications": [
Expand Down Expand Up @@ -34809,6 +34843,15 @@
"prefix": "EPIO"
},
"pattern": "^\\d{7}$",
"providers": [
{
"code": "legacy",
"description": "Legacy internal URL before switching to OBO PURL",
"homepage": "https://bio.scai.fraunhofer.de/ontolog",
"name": "Legacy",
"uri_format": "https://bio.scai.fraunhofer.de/ontology/epilepsy#$1"
}
],
"uri_format": "http://purl.obolibrary.org/obo/EPIO_$1"
},
"epo": {
Expand Down Expand Up @@ -39673,7 +39716,7 @@
"description": "Access funder data through a DOI for crossref funders.",
"homepage": "https://doi.org",
"name": "DOI",
"uri_format": "https://dx.doi.org/10.13039/501100000995"
"uri_format": "https://dx.doi.org/10.13039/$1"
}
],
"synonyms": [
Expand Down Expand Up @@ -42715,6 +42758,15 @@
"fairsharing": "FAIRsharing.175hsz"
},
"name": "General Formal Ontology",
"providers": [
{
"code": "alt1",
"description": "Alternate identifier using gfo-basic.owl instead of gfo.owl",
"homepage": "http://www.onto-med.de/ontologies",
"name": "Alternate 1",
"uri_format": "http://www.onto-med.de/ontologies/gfo-basic.owl#$1"
}
],
"uri_format": "http://www.onto-med.de/ontologies/gfo.owl#$1"
},
"ghr": {
Expand Down Expand Up @@ -57210,10 +57262,10 @@
"pattern": "^jRCT\\w?\\d+$",
"providers": [
{
"code": "",
"description": "",
"homepage": "",
"name": "",
"code": "detail",
"description": "Japan Registry of Clinical Trials (Details)",
"homepage": "https://jrct.niph.go.jp",
"name": "Japan Registry of Clinical Trials (Details)",
"uri_format": "https://jrct.niph.go.jp/latest-detail/$1"
}
],
Expand Down
6 changes: 5 additions & 1 deletion tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,10 @@ def test_providers(self):
for provider in resource.providers:
with self.subTest(prefix=prefix, code=provider.code):
self.assertNotEqual(provider.code, prefix)
self.assertNotEqual(provider.code, "", msg="code should not be an empty string")
self.assertNotEqual(provider.homepage, "", msg="homepage should not be an empty string")
self.assertNotEqual(provider.description, "", msg="desc. should not be an empty string")
self.assertNotEqual(provider.name, "", msg="name should not be an empty string")
self.assertNotIn(
provider.code,
set(self.metaregistry),
Expand All @@ -683,7 +687,7 @@ def test_providers(self):
provider.code,
msg="Provider codes must be lowercase. Ideally, they should be simple and memorable",
)
# self.assertIn("$1", provider.uri_format)
self.assertIn("$1", provider.uri_format)
self.assertNotIn(
"$2",
provider.uri_format,
Expand Down
Loading