Skip to content

Commit

Permalink
Pass custom IC map to Semsimian through CLI (#759)
Browse files Browse the repository at this point in the history
* Pass custom IC map to Semsimian through CLI

* Add test

* Add comment

* Stricter test for equivalence

* Nano-linting

---------

Co-authored-by: Chris Mungall <[email protected]>
  • Loading branch information
caufieldjh and cmungall authored May 16, 2024
1 parent 377f5ac commit 55c2709
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 4 deletions.
16 changes: 14 additions & 2 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
from oaklib.implementations.obograph.obograph_implementation import (
OboGraphImplementation,
)
from oaklib.implementations.semsimian.semsimian_implementation import SemSimianImplementation
from oaklib.implementations.sqldb.sql_implementation import SqlImplementation
from oaklib.interfaces import (
BasicOntologyInterface,
Expand Down Expand Up @@ -2956,7 +2957,10 @@ def similarity(
if not isinstance(impl, SemanticSimilarityInterface):
raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
if information_content_file:
impl.cached_information_content_map = load_information_content_map(information_content_file)
if isinstance(impl, SemSimianImplementation):
impl.custom_ic_map_path = information_content_file
else:
impl.cached_information_content_map = load_information_content_map(information_content_file)
set1it = None
set2it = None
if not (set1_file or set2_file):
Expand Down Expand Up @@ -3037,8 +3041,16 @@ def termset_similarity(
writer.output = output
if not isinstance(impl, SemanticSimilarityInterface):
raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")

# TODO: @cmungall - one possibility in the future is to relieve the client of the need for
# out-of-band knowledge about implementation details. The generic SemSim interface could
# have a load_ic_map method, with the generic implementation loading the map directly and
# the semsimian implementation passing the path through.
if information_content_file:
impl.cached_information_content_map = load_information_content_map(information_content_file)
if isinstance(impl, SemSimianImplementation):
impl.custom_ic_map_path = information_content_file
else:
impl.cached_information_content_map = load_information_content_map(information_content_file)
terms = list(terms)
ix = terms.index("@")
set1 = list(query_terms_iterator(terms[0:ix], impl))
Expand Down
16 changes: 14 additions & 2 deletions src/oaklib/implementations/semsimian/semsimian_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class SemSimianImplementation(
AssociationProviderInterface.add_associations,
]

custom_ic_map_path: str = None

semsimian_object_cache: Dict[Tuple[PRED_CURIE], Optional["Semsimian"]] = field(default_factory=dict) # type: ignore # noqa

def __post_init__(self):
Expand Down Expand Up @@ -84,6 +86,7 @@ def _get_semsimian_object(
predicates: List[PRED_CURIE] = None,
attributes: List[str] = None,
resource_path: str = None,
custom_ic_map_path: str = None,
) -> "Semsimian": # type: ignore # noqa
"""
Get Semsimian object from "semsimian_object_cache" or add a new one.
Expand All @@ -94,6 +97,10 @@ def _get_semsimian_object(
from semsimian import Semsimian

predicates = tuple(sorted(predicates))

if custom_ic_map_path is not None:
logging.info(f"Using custom IC map with Semsimian: {custom_ic_map_path}")

if predicates not in self.semsimian_object_cache:
# spo = [
# r
Expand All @@ -111,6 +118,7 @@ def _get_semsimian_object(
predicates=predicates,
pairwise_similarity_attributes=attributes,
resource_path=self.resource_path,
custom_ic_map_path=self.custom_ic_map_path,
)

return self.semsimian_object_cache[predicates]
Expand Down Expand Up @@ -139,7 +147,9 @@ def pairwise_similarity(
"""
logging.debug(f"Calculating pairwise similarity for {subject} x {object} over {predicates}")
semsimian = self._get_semsimian_object(
predicates=predicates, attributes=self.term_pairwise_similarity_attributes
predicates=predicates,
attributes=self.term_pairwise_similarity_attributes,
custom_ic_map_path=self.custom_ic_map_path
)

jaccard_val = semsimian.jaccard_similarity(subject, object)
Expand Down Expand Up @@ -194,7 +204,9 @@ def all_by_all_pairwise_similarity(
objects = list(objects)
logging.info(f"Calculating all-by-all pairwise similarity for {len(objects)} objects")
semsimian = self._get_semsimian_object(
predicates=predicates, attributes=self.term_pairwise_similarity_attributes
predicates=predicates,
attributes=self.term_pairwise_similarity_attributes,
custom_ic_map_path=self.custom_ic_map_path
)
all_results = semsimian.all_by_all_pairwise_similarity(
subject_terms=set(subjects),
Expand Down
2 changes: 2 additions & 0 deletions tests/input/test_ic.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GO:0005773 5.5
GO:0012505 6.0
25 changes: 25 additions & 0 deletions tests/test_implementations/test_semsimian_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

DB = INPUT_DIR / "go-nucleus.db"

TEST_IC_MAP = INPUT_DIR / "test_ic.tsv"

EXPECTED_ICS = {
"CARO:0000000": 21.05,
"BFO:0000002": 0.7069,
Expand Down Expand Up @@ -134,6 +136,29 @@ def test_all_by_all_pairwise_similarity(self):
sem_similarity_object.phenodigm_score, sql_similarity_object.phenodigm_score, places=2
)

def test_similarity_with_custom_ic_map(self):
    """
    Check that pairwise similarity reports IC values taken from a custom IC map.

    The adapter under test (``self.oi``) is pointed at ``TEST_IC_MAP`` through its
    ``custom_ic_map_path`` attribute, and every ordered pair drawn from
    ``VACUOLE`` and ``ENDOMEMBRANE_SYSTEM`` is compared. The expected
    ``ancestor_information_content`` values are 5.5 for VACUOLE against itself,
    6.0 for ENDOMEMBRANE_SYSTEM against itself, and 0 for VACUOLE against
    ENDOMEMBRANE_SYSTEM.

    NOTE(review): presumably VACUOLE and ENDOMEMBRANE_SYSTEM are the two term IDs
    listed in the test IC file (with values 5.5 and 6.0) -- confirm against the
    constants' definitions.
    """
    adapter = self.oi

    # The path must be set before the first similarity call so that it is in
    # place when the adapter is first asked for a result.
    adapter.custom_ic_map_path = TEST_IC_MAP.as_posix()

    self.assertIsInstance(
        adapter,
        SemanticSimilarityInterface,
        "SemanticSimilarityInterface not implemented",
    )

    entities = [VACUOLE, ENDOMEMBRANE_SYSTEM]
    # Only these ordered pairs are pinned to a value; the remaining pair
    # (ENDOMEMBRANE_SYSTEM, VACUOLE) is only required to yield a result.
    expected_ic = {
        (VACUOLE, VACUOLE): 5.5,
        (ENDOMEMBRANE_SYSTEM, ENDOMEMBRANE_SYSTEM): 6.0,
        (VACUOLE, ENDOMEMBRANE_SYSTEM): 0,
    }

    for s in entities:
        for o in entities:
            sim = adapter.pairwise_similarity(s, o, predicates=self.predicates)
            # Report a missing result as a test failure rather than an error.
            self.assertIsNotNone(sim, f"Did not get similarity for {s} and {o}")
            if (s, o) in expected_ic:
                self.assertEqual(sim.ancestor_information_content, expected_ic[(s, o)])

def test_semsimian_object_cache(self):
start_time = timeit.default_timer()
_ = list(
Expand Down

0 comments on commit 55c2709

Please sign in to comment.