Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add anatomy index #2

Merged
merged 4 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lexica/anatomy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Anatomy, Tissues, and Organ Systems
76 changes: 76 additions & 0 deletions lexica/anatomy/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from pathlib import Path

import semra

import biolexica

HERE = Path(__file__).parent.resolve()
TERMS_PATH = HERE.joinpath("terms.tsv.gz")

# Resource prefixes for the anatomy lexicon, in order. SEMRA_CONFIG below
# receives this same list as both its ``priority`` and ``keep_prefixes``.
PRIORITY = [
    "uberon",
    "mesh",
    "bto",
    "caro",
    "ncit",
    # "umls",  # TODO find appropriate subset
]
# Term sources passed as ``inputs`` to biolexica.assemble_terms() in _main().
# Every source uses the "pyobo" processor; MeSH and NCIT are restricted to the
# listed ancestor terms, the others are taken without an ancestor filter.
BIOLEXICA_CONFIG = [
    biolexica.Input(processor="pyobo", source="uberon"),
    biolexica.Input(
        processor="pyobo",
        source="mesh",
        # MeSH category "A", leaving out A11 because that branch is cells
        ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
    ),
    biolexica.Input(
        processor="pyobo",
        source="ncit",
        # NCIT:C12219 is "Anatomic Structure, System, or Substance"
        ancestors=["NCIT:C12219"],
    ),
    biolexica.Input(processor="pyobo", source="bto"),
    biolexica.Input(processor="pyobo", source="caro"),
]

# SeMRA configuration for the anatomy mappings; _main() calls its
# get_mappings() and hands the result to biolexica.assemble_terms().
SEMRA_CONFIG = semra.Configuration(
    name="Anatomy mappings",
    inputs=[
        semra.Input(source="biomappings"),
        semra.Input(source="gilda"),
        # One PyOBO-sourced input per prefix, all with the same confidence.
        # TODO: add "umls" here as well
        *(
            semra.Input(prefix=prefix, source="pyobo", confidence=0.99)
            for prefix in ("uberon", "bto", "caro", "mesh", "ncit")
        ),
    ],
    add_labels=False,
    priority=PRIORITY,
    keep_prefixes=PRIORITY,
    remove_imprecise=False,
    mutations=[
        semra.Mutation(source=source, confidence=confidence)
        for source, confidence in (
            ("uberon", 0.8),
            ("bto", 0.65),
            ("caro", 0.8),
            ("ncit", 0.7),
            # TODO: ("umls", 0.7)
        )
    ],
    # The three pickle files live next to this script
    raw_pickle_path=HERE / "mappings_raw.pkl.gz",
    processed_pickle_path=HERE / "mappings_processed.pkl.gz",
    priority_pickle_path=HERE / "mappings_prioritized.pkl",
)


def _main() -> None:
    """Get the mappings from ``SEMRA_CONFIG`` and assemble the anatomy terms.

    The terms are assembled from ``BIOLEXICA_CONFIG`` with ``TERMS_PATH``
    given as the processed path.
    """
    biolexica.assemble_terms(
        inputs=BIOLEXICA_CONFIG,
        mappings=SEMRA_CONFIG.get_mappings(),
        processed_path=TERMS_PATH,
    )


# Only run the generation when this file is executed as a script, not on import
if __name__ == "__main__":
    _main()
Binary file added lexica/anatomy/mappings_prioritized.pkl
Binary file not shown.
Binary file added lexica/anatomy/mappings_processed.pkl.gz
Binary file not shown.
Binary file added lexica/anatomy/mappings_raw.pkl.gz
Binary file not shown.
Binary file added lexica/anatomy/terms.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion lexica/cell/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

PRIORITY = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
BIOLEXICA_CONFIG = [
biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]), # cells
biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]), # cells (A11)
biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
biolexica.Input(source="cellosaurus", processor="pyobo"),
# biolexica.Input(source="ccle", processor="pyobo"),
Expand Down
2 changes: 1 addition & 1 deletion lexica/phenotype/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Cell and Cell Line Lexicon
# Disease, Phenotype, and Condition Lexicon

This directory contains a coherent, merged lexical index for the following resources:

Expand Down
12 changes: 9 additions & 3 deletions lexica/phenotype/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
# TODO get subsets of MeSH (C for diseases, F for Psychiatry/Psychology,
# and maybe others. See https://meshb.nlm.nih.gov/treeView)
biolexica.Input(source="mesh", processor="pyobo"),
biolexica.Input(
source="mesh",
processor="pyobo",
ancestors=[
*biolexica.get_mesh_category_curies("C"),
*biolexica.get_mesh_category_curies("F"),
# TODO should there be others?
],
),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
Expand Down
Binary file modified lexica/phenotype/terms.tsv.gz
Binary file not shown.
26 changes: 25 additions & 1 deletion src/biolexica/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"assemble_terms",
"iter_terms_by_prefix",
"load_grounder",
"get_mesh_category_curies",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -139,7 +140,7 @@ def _get_pyobo_subset_terms(source: str, ancestors: Union[str, List[str]]) -> It
subset = {
descendant
for parent_curie in _ensure_list(ancestors)
for descendant in pyobo.get_descendants(*parent_curie.split(":"))
for descendant in pyobo.get_descendants(*parent_curie.split(":")) or []
}
for term in get_gilda_terms(source):
if bioregistry.curie_to_str(term.db, term.id) in subset:
Expand Down Expand Up @@ -196,3 +197,26 @@ def _get_bioontologies_subset_terms(
status="synonym",
source=source,
)


def get_mesh_category_curies(
    letter: str, skip: Union[str, List[str], None] = None
) -> List[str]:
    """Get the MeSH CURIEs for the top-level branches of a category, by letter (e.g., "A").

    Top-level tree numbers are built as the category letter followed by a
    two-digit number (``A01`` through ``A99``) and looked up in the tree number
    to MeSH identifier mapping for the current MeSH version.

    :param letter: The letter of the MeSH category, e.g., ``"A"``
    :param skip: Optional top-level tree number(s) to leave out, e.g.,
        ``["A11"]``. A single string is treated as one tree number.
    :returns: A list of ``mesh:``-prefixed CURIEs, in tree number order
    :raises ValueError: If the current MeSH version can't be looked up

    .. seealso:: https://meshb.nlm.nih.gov/treeView
    """
    import bioversions
    from pyobo.sources.mesh import get_tree_to_mesh_id

    mesh_version = bioversions.get_version("mesh")
    if mesh_version is None:
        raise ValueError("could not look up the current version of MeSH")
    tree_to_mesh = get_tree_to_mesh_id(mesh_version)

    # Normalize ``skip`` once so that membership tests are exact matches and
    # a one-shot iterable isn't consumed by the first check
    if skip is None:
        skipped = set()
    elif isinstance(skip, str):
        skipped = {skip}
    else:
        skipped = set(skip)

    rv = []
    for i in range(1, 100):
        key = f"{letter}{i:02}"
        if key in skipped:
            continue
        mesh_id = tree_to_mesh.get(key)
        if mesh_id is None:
            # Top-level tree numbers aren't guaranteed to be contiguous, so a
            # missing number must not end the scan: stopping here would
            # silently drop every branch after the first gap
            continue
        rv.append(f"mesh:{mesh_id}")
    return rv
Loading