biopragmatics · cthoyt · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/lexica/anatomy/README.md b/lexica/anatomy/README.md
@@ -0,0 +1 @@
+# Anatomy, Tissue, and Organ System Lexicon
diff --git a/lexica/anatomy/generate.py b/lexica/anatomy/generate.py
@@ -0,0 +1,88 @@
+"""Generate the anatomy, tissue, and organ system lexicon.
+
+Running this module as a script fetches the mappings described by
+``SEMRA_CONFIG`` and hands them, together with ``BIOLEXICA_CONFIG``, to
+:func:`biolexica.assemble_terms` with ``terms.tsv.gz`` as the ``processed_path``.
+"""
+
+from pathlib import Path
+
+import semra
+
+import biolexica
+
+HERE = Path(__file__).parent.resolve()
+TERMS_PATH = HERE / "terms.tsv.gz"
+
+# Prefixes given to SeMRA both as the priority order and as the prefixes to keep
+PRIORITY = [
+    "uberon",
+    "mesh",
+    "bto",
+    "caro",
+    "ncit",
+    # "umls", # TODO find appropriate subset
+]
+
+# Term sources passed to biolexica; every one is read with the "pyobo" processor
+BIOLEXICA_CONFIG = [
+    biolexica.Input(source="uberon", processor="pyobo"),
+    biolexica.Input(
+        source="mesh",
+        processor="pyobo",
+        # skip A11 since it's cells
+        ancestors=biolexica.get_mesh_category_curies("A", skip=["A11"]),
+    ),
+    biolexica.Input(
+        source="ncit",
+        processor="pyobo",
+        # Anatomic Structure, System, or Substance
+        ancestors=["NCIT:C12219"],
+    ),
+    biolexica.Input(source="bto", processor="pyobo"),
+    biolexica.Input(source="caro", processor="pyobo"),
+]
+
+SEMRA_CONFIG = semra.Configuration(
+    name="Anatomy mappings",
+    inputs=[
+        semra.Input(source="biomappings"),
+        semra.Input(source="gilda"),
+        *(
+            semra.Input(prefix=prefix, source="pyobo", confidence=0.99)
+            # "umls" is left out until an appropriate subset is found
+            for prefix in ("uberon", "bto", "caro", "mesh", "ncit")
+        ),
+    ],
+    add_labels=False,
+    priority=PRIORITY,
+    keep_prefixes=PRIORITY,
+    remove_imprecise=False,
+    mutations=[
+        semra.Mutation(source=source, confidence=confidence)
+        for source, confidence in (
+            ("uberon", 0.8),
+            ("bto", 0.65),
+            ("caro", 0.8),
+            ("ncit", 0.7),
+            # ("umls", 0.7),
+        )
+    ],
+    raw_pickle_path=HERE / "mappings_raw.pkl.gz",
+    processed_pickle_path=HERE / "mappings_processed.pkl.gz",
+    priority_pickle_path=HERE / "mappings_prioritized.pkl",
+)
+
+
+def _main() -> None:
+    """Fetch the configured mappings, then assemble the anatomy terms."""
+    anatomy_mappings = SEMRA_CONFIG.get_mappings()
+    biolexica.assemble_terms(
+        inputs=BIOLEXICA_CONFIG,
+        mappings=anatomy_mappings,
+        processed_path=TERMS_PATH,
+    )
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/lexica/anatomy/mappings_prioritized.pkl b/lexica/anatomy/mappings_prioritized.pkl
diff --git a/lexica/anatomy/mappings_processed.pkl.gz b/lexica/anatomy/mappings_processed.pkl.gz
diff --git a/lexica/anatomy/mappings_raw.pkl.gz b/lexica/anatomy/mappings_raw.pkl.gz
diff --git a/lexica/anatomy/terms.tsv.gz b/lexica/anatomy/terms.tsv.gz
diff --git a/lexica/cell/generate.py b/lexica/cell/generate.py
@@ -9,7 +9,7 @@
 
 PRIORITY = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
 BIOLEXICA_CONFIG = [
-    biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells
+    biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells (A11)
     biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
     biolexica.Input(source="cellosaurus", processor="pyobo"),
     # biolexica.Input(source="ccle", processor="pyobo"),

diff --git a/lexica/phenotype/README.md b/lexica/phenotype/README.md
@@ -1,4 +1,4 @@
-# Cell and Cell Line Lexicon
+# Disease, Phenotype, and Condition Lexicon
 
 This directory contains a coherent, merged lexical index for the following resources:
 

diff --git a/lexica/phenotype/generate.py b/lexica/phenotype/generate.py
@@ -20,9 +20,15 @@
     biolexica.Input(source="mondo", processor="pyobo"),
     biolexica.Input(source="hp", processor="pyobo"),
     biolexica.Input(source="symp", processor="pyobo"),
-    # TODO get subsets of MeSH (C for diseases, F for Psychiatry/Psychology,
-    #  and maybe others. See https://meshb.nlm.nih.gov/treeView)
-    biolexica.Input(source="mesh", processor="pyobo"),
+    biolexica.Input(
+        source="mesh",
+        processor="pyobo",
+        ancestors=[
+            *biolexica.get_mesh_category_curies("C"),
+            *biolexica.get_mesh_category_curies("F"),
+            # TODO should there be others?
+        ],
+    ),
     biolexica.Input(source="efo", processor="pyobo"),  # TODO find subset of EFO
     # biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
     # biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT

diff --git a/lexica/phenotype/terms.tsv.gz b/lexica/phenotype/terms.tsv.gz
diff --git a/src/biolexica/api.py b/src/biolexica/api.py
@@ -23,6 +23,7 @@
     "assemble_terms",
     "iter_terms_by_prefix",
     "load_grounder",
+    "get_mesh_category_curies",
 ]
 
 logger = logging.getLogger(__name__)
@@ -139,7 +140,7 @@ def _get_pyobo_subset_terms(source: str, ancestors: Union[str, List[str]]) -> It
     subset = {
         descendant
         for parent_curie in _ensure_list(ancestors)
-        for descendant in pyobo.get_descendants(*parent_curie.split(":"))
+        for descendant in pyobo.get_descendants(*parent_curie.split(":")) or []
     }
     for term in get_gilda_terms(source):
         if bioregistry.curie_to_str(term.db, term.id) in subset:
@@ -196,3 +197,36 @@ def _get_bioontologies_subset_terms(
                 status="synonym",
                 source=source,
             )
+
+
+def get_mesh_category_curies(letter: str, skip: Union[List[str], None] = None) -> List[str]:
+    """Get CURIEs for the top-level MeSH headings in a category, by letter (e.g., "A").
+
+    :param letter: The letter of a MeSH tree category, such as ``"A"`` or ``"C"``.
+    :param skip: Two-digit tree numbers in that category to leave out, such as ``["A11"]``.
+    :returns: CURIEs of the form ``mesh:<identifier>``, one for each tree number from
+        ``<letter>01`` to ``<letter>99`` that is present in the MeSH version reported by
+        :mod:`bioversions` and not skipped, in ascending tree number order.
+    :raises ValueError: If the current MeSH version can't be looked up.
+    """
+    # see https://meshb.nlm.nih.gov/treeView
+
+    import bioversions
+    from pyobo.sources.mesh import get_tree_to_mesh_id
+
+    mesh_version = bioversions.get_version("mesh")
+    if mesh_version is None:
+        raise ValueError("could not look up the current version of MeSH")
+    tree_to_mesh = get_tree_to_mesh_id(mesh_version)
+    rv = []
+    for i in range(1, 100):
+        tree_number = f"{letter}{i:02}"
+        if skip and tree_number in skip:
+            continue
+        mesh_id = tree_to_mesh.get(tree_number)
+        if mesh_id is None:
+            # Tree numbers are not contiguous (e.g., C01 is followed by C04 in recent
+            # releases), so a missing number must not cut the scan short.
+            continue
+        rv.append(f"mesh:{mesh_id}")
+    return rv