From 7ada3978c45f1fd43b6eb8e5296267c67e7060a1 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 8 Feb 2024 15:21:56 -0500
Subject: [PATCH 01/18] Add subsumption command (this should be moved to some
 module I guess)

---
 src/curate_gpt/cli.py | 62 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 43bbbae..f66b089 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -5,6 +5,7 @@
 import logging
 import sys
 from pathlib import Path
+import random
 from typing import Any, Dict, List, Union
 
 import click
@@ -17,6 +18,7 @@
 from llm.cli import load_conversation
 from oaklib import get_adapter
 from pydantic import BaseModel
+from tqdm import tqdm
 
 from curate_gpt import ChromaDBAdapter, __version__
 from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse
@@ -37,6 +39,8 @@
 from curate_gpt.wrappers.literature.pubmed_wrapper import PubmedWrapper
 from curate_gpt.wrappers.ontology import OntologyWrapper
 
+from oaklib.datamodels.vocabulary import IS_A, PART_OF
+
 __all__ = [
     "main",
 ]
@@ -1595,6 +1599,64 @@ def _text_lookup(obj: Dict):
     db.update_collection_metadata(collection, object_type="OntologyClass")
 
 
+@ontology.command(name="subsumption")
+@path_option
+@collection_option
+@model_option
+@click.option("--prefix", required=False, default=None, help="Prefix of terms to use, e.g. 'HP:'")
+@click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)')
+@click.option("--seed", required=False, default=42, help="Seed for random number generator")
+@click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare')
+@click.argument("ont")
+def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, model, **kwargs):
+    """
+    Compare pairs of ontology terms where one subsumes the other, or one does NOT
+    subsume the other, to determine whether LLM embeddings reflect subsumption
+    relationships.
+
+    Example:
+    -------
+        curategpt subsumption  -c obo_hp $db/hp.db
+
+    """
+    if not predicates:
+        predicates = [IS_A, PART_OF]
+
+    oak_adapter = get_adapter(ont)
+    view = OntologyWrapper(oak_adapter=oak_adapter)
+    db = ChromaDBAdapter(path, **kwargs)
+    db.text_lookup = view.text_field
+
+    c = db.client.get_collection(collection)
+
+    # get all terms
+    terms = list(view.oak_adapter.all_entity_curies())
+    if prefix is not None:
+        terms = [t for t in terms if t.startswith(prefix)]
+        if not terms:
+            raise ValueError(f"No terms found with prefix {prefix}")
+
+    # choose 1000 pseudo-random terms, get ancestor info, choose a random subsuming
+    # and non-subsuming term, calculate fraction of ancestors in common while we are
+    # at it
+    random.seed(seed)
+    ancs = []
+    random_pairs = []
+    for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
+        anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False))
+        ancs.append((term, anc))
+
+        # choose random term to pair with
+        random_other_term = random.choice(terms)
+        random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
+                                                           predicates=predicates,
+                                                           reflexive=False))
+        pair_shared_anc = len(set(anc).intersection(
+            set(random_term_ancs))) / len(anc)  # fraction of ancestors in common
+        random_pairs.append((term, random_other_term, pair_shared_anc))
+
+    pass
+
 @main.group()
 def view():
     "Virtual store/wrapper"

From 528793d6c1641ccb12f2763376cdf00cca5b52ba Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 9 Feb 2024 10:57:21 -0500
Subject: [PATCH 02/18] Add better comment/warning

---
 src/curate_gpt/cli.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index f66b089..293fab8 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -4,6 +4,7 @@
 import json
 import logging
 import sys
+import warnings
 from pathlib import Path
 import random
 from typing import Any, Dict, List, Union
@@ -1627,6 +1628,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     db = ChromaDBAdapter(path, **kwargs)
     db.text_lookup = view.text_field
 
+    if model is None:
+        warnings.warn("No model specified, using default model. Note that if you must"
+                      "the same model that was used to build the collection.")
+
     c = db.client.get_collection(collection)
 
     # get all terms
@@ -1636,9 +1641,9 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         if not terms:
             raise ValueError(f"No terms found with prefix {prefix}")
 
-    # choose 1000 pseudo-random terms, get ancestor info, choose a random subsuming
-    # and non-subsuming term, calculate fraction of ancestors in common while we are
-    # at it
+    # choose 1000 pseudo-random terms, for each, choose another random term, then
+    # calculate fraction of ancestors in common while we are at it. we'll compare with
+    # cosine similarity of embeddings later
     random.seed(seed)
     ancs = []
     random_pairs = []

From 61888a62d31ecd3aa3a610c0d5f6da6fd07a47a9 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Tue, 13 Feb 2024 17:31:09 -0500
Subject: [PATCH 03/18] Finish first pass of subsumption command

---
 src/curate_gpt/cli.py | 87 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 74 insertions(+), 13 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 293fab8..4469b9f 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -20,6 +20,7 @@
 from oaklib import get_adapter
 from pydantic import BaseModel
 from tqdm import tqdm
+import numpy as np
 
 from curate_gpt import ChromaDBAdapter, __version__
 from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse
@@ -1608,12 +1609,13 @@ def _text_lookup(obj: Dict):
 @click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)')
 @click.option("--seed", required=False, default=42, help="Seed for random number generator")
 @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare')
+@click.option('--choose_subsuming_terms', required=False, default=False, help='Whether to choose subsuming terms or just random terms')
 @click.argument("ont")
-def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, model, **kwargs):
+def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms,
+                        choose_subsuming_terms, model, **kwargs):
     """
-    Compare pairs of ontology terms where one subsumes the other, or one does NOT
-    subsume the other, to determine whether LLM embeddings reflect subsumption
-    relationships.
+    Compare pairs of ontology terms (optionally where one subsums the other) to
+    determine whether similarity of LLM embeddings reflect subsumption relationships.
 
     Example:
     -------
@@ -1641,26 +1643,85 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         if not terms:
             raise ValueError(f"No terms found with prefix {prefix}")
 
-    # choose 1000 pseudo-random terms, for each, choose another random term, then
+    c = db.client.get_collection(collection,
+                                 embedding_function=db._embedding_function(model))
+
+    # build CURIE to object map
+    curie2obj_id = {}
+    for o in tqdm(list(view.objects())):
+        curie2obj_id[o['original_id']] = o
+
+    # get embeddings to manually do cosine similarity
+    d = c.get(include=['embeddings'])
+    ids = d['ids']
+    emb = d['embeddings']
+    # make id2emb map
+    id2emb = {}
+    for i, id in tqdm(enumerate(ids), desc="Building id2emb map"):
+        id2emb[id] = emb[i]
+
+    # choose num_terms pseudo-random terms, for each, choose another random term, then
     # calculate fraction of ancestors in common while we are at it. we'll compare with
     # cosine similarity of embeddings later
     random.seed(seed)
-    ancs = []
-    random_pairs = []
+    results = []
     for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
         anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False))
-        ancs.append((term, anc))
 
         # choose random term to pair with
-        random_other_term = random.choice(terms)
+        if choose_subsuming_terms:
+            random_other_term = random.choice(anc)
+        else:
+            random_other_term = random.choice(terms)
         random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
                                                            predicates=predicates,
                                                            reflexive=False))
-        pair_shared_anc = len(set(anc).intersection(
-            set(random_term_ancs))) / len(anc)  # fraction of ancestors in common
-        random_pairs.append((term, random_other_term, pair_shared_anc))
+        # fraction of ancestors in common
+        pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc)
+
+        id1 = curie2obj_id[term]['id']
+        id2 = curie2obj_id[random_other_term]['id']
+
+        # calculate cosine sim
+        cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
+
+        # if debugging
+        if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
+                (cosine_sim > 0.85 or cosine_sim < 0.2)):
+            print(f"\nterm: {term} {(curie2obj_id[term]['label'])},"
+                  f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])},"
+                  f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}")
+
+        results.append((term, random_other_term, pair_shared_anc, cosine_sim))
+        pass
+
+    # plot cosine similarity vs fraction of ancestors in common
+    # in matplotlib
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    import pandas as pd
+    df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim'])
+    sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim')
+
+    # least squares fit
+    m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1)
+    # plot line
+    plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red')
+    # calculate r-squared value
+    r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2
+    plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center',
+             va='center', transform=plt.gca().transAxes)
+
+    plt.xlabel('Fraction of ancestors in common')
+    plt.ylabel('Cosine similarity')
+    # title = ontology name
+    plt.title(f'{ont}')
+
+
+    plt.show()
+
+    # write results to file
 
-    pass
 
 @main.group()
 def view():

From 4b4319a76ea7426806628e0ebfd03f84d721a44a Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Wed, 14 Feb 2024 14:33:08 -0500
Subject: [PATCH 04/18] Add arg to select root term (to avoid selecting
 modifier terms that aren't of interest)

---
 src/curate_gpt/cli.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 4469b9f..375eff3 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1609,10 +1609,11 @@ def _text_lookup(obj: Dict):
 @click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)')
 @click.option("--seed", required=False, default=42, help="Seed for random number generator")
 @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare')
-@click.option('--choose_subsuming_terms', required=False, default=False, help='Whether to choose subsuming terms or just random terms')
+@click.option('--choose_subsuming_terms', required=False, default=True, help='Whether to choose subsuming terms or just random terms')
+@click.option("--root_term", required=False, default=None, help="Root term to use for selecting terms to sample")
 @click.argument("ont")
 def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms,
-                        choose_subsuming_terms, model, **kwargs):
+                        choose_subsuming_terms, root_term, model, **kwargs):
     """
     Compare pairs of ontology terms (optionally where one subsums the other) to
     determine whether similarity of LLM embeddings reflect subsumption relationships.
@@ -1637,7 +1638,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     c = db.client.get_collection(collection)
 
     # get all terms
-    terms = list(view.oak_adapter.all_entity_curies())
+    if root_term is not None:
+        pass
+    else:
+        terms = list(view.oak_adapter.all_entity_curies())
     if prefix is not None:
         terms = [t for t in terms if t.startswith(prefix)]
         if not terms:
@@ -1666,7 +1670,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     random.seed(seed)
     results = []
     for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
-        anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False))
+        anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True))
 
         # choose random term to pair with
         if choose_subsuming_terms:
@@ -1675,7 +1679,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
             random_other_term = random.choice(terms)
         random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
                                                            predicates=predicates,
-                                                           reflexive=False))
+                                                           reflexive=True))
         # fraction of ancestors in common
         pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc)
 
@@ -1683,7 +1687,11 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         id2 = curie2obj_id[random_other_term]['id']
 
         # calculate cosine sim
-        cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
+        try:
+            cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
+        except KeyError as e:
+            print(f"KeyError: {e}")
+            continue
 
         # if debugging
         if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
@@ -1716,12 +1724,8 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     plt.ylabel('Cosine similarity')
     # title = ontology name
     plt.title(f'{ont}')
-
-
     plt.show()
 
-    # write results to file
-
 
 @main.group()
 def view():

From 9680d98fb04557e920394b12255b15f40ede2515 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Wed, 14 Feb 2024 14:36:24 -0500
Subject: [PATCH 05/18] Implement root term selection (to avoid selecting
 modifier terms that aren't of interest)

---
 src/curate_gpt/cli.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 375eff3..186a193 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1635,11 +1635,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         warnings.warn("No model specified, using default model. Note that if you must"
                       "the same model that was used to build the collection.")
 
-    c = db.client.get_collection(collection)
-
     # get all terms
     if root_term is not None:
-        pass
+        print(f"Using root term: {root_term} to select terms to compare.")
+        terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True))
     else:
         terms = list(view.oak_adapter.all_entity_curies())
     if prefix is not None:

From e87013a9b274bfdd3c811bd46e39c63b540193bf Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Wed, 14 Feb 2024 16:43:14 -0500
Subject: [PATCH 06/18] Fix comment; skip pairs with cosine similarity 1.0 but
 not all ancestors shared

---
 src/curate_gpt/cli.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 186a193..0bfd7c3 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1664,8 +1664,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         id2emb[id] = emb[i]
 
     # choose num_terms pseudo-random terms, for each, choose another random term, then
-    # calculate fraction of ancestors in common while we are at it. we'll compare with
-    # cosine similarity of embeddings later
+    # calculate fraction of ancestors in common, then calculate cosine similarity
     random.seed(seed)
     results = []
     for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
@@ -1692,6 +1691,13 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
             print(f"KeyError: {e}")
             continue
 
+        # this seems to have a few times in HPO
+        if cosine_sim == 1.0 and pair_shared_anc < 1.0:
+            print(f"term: {term} {(curie2obj_id[term]['label'])},"
+                  f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])},"
+                  f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}")
+            continue
+
         # if debugging
         if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
                 (cosine_sim > 0.85 or cosine_sim < 0.2)):

From e3c6e42fb7c381928e17991eaaccb882e70550c9 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 15 Feb 2024 13:18:43 -0500
Subject: [PATCH 07/18] Add some alpha to plot, fix some comments

---
 src/curate_gpt/cli.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 0bfd7c3..5f4a193 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1615,7 +1615,7 @@ def _text_lookup(obj: Dict):
 def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms,
                         choose_subsuming_terms, root_term, model, **kwargs):
     """
-    Compare pairs of ontology terms (optionally where one subsums the other) to
+    Compare pairs of ontology terms (optionally where one subsumes the other) to
     determine whether similarity of LLM embeddings reflect subsumption relationships.
 
     Example:
@@ -1672,14 +1672,16 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
 
         # choose random term to pair with
         if choose_subsuming_terms:
-            random_other_term = random.choice(anc)
+            # do not choose term itself (remove term from list of ancestors)
+            random_other_term = random.choice(list(set(anc) - set([term])))
         else:
             random_other_term = random.choice(terms)
         random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
                                                            predicates=predicates,
                                                            reflexive=True))
         # fraction of ancestors in common
-        pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc)
+        pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) /
+                           len(list(set(anc))))
 
         id1 = curie2obj_id[term]['id']
         id2 = curie2obj_id[random_other_term]['id']
@@ -1691,13 +1693,6 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
             print(f"KeyError: {e}")
             continue
 
-        # this seems to have a few times in HPO
-        if cosine_sim == 1.0 and pair_shared_anc < 1.0:
-            print(f"term: {term} {(curie2obj_id[term]['label'])},"
-                  f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])},"
-                  f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}")
-            continue
-
         # if debugging
         if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
                 (cosine_sim > 0.85 or cosine_sim < 0.2)):
@@ -1714,7 +1709,9 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     import seaborn as sns
     import pandas as pd
     df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim'])
-    sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim')
+
+    # plot with some alpha
+    sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5)
 
     # least squares fit
     m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1)
@@ -1730,6 +1727,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
     # title = ontology name
     plt.title(f'{ont}')
     plt.show()
+    pass
 
 
 @main.group()

From beea4b84fc91c925c789a3b748bcd0baa3b319b7 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 15 Feb 2024 15:38:16 -0500
Subject: [PATCH 08/18] Move subsumption stuff to an agent to declutter cli.py

---
 .../agents/subsumption_eval_agent.py          | 144 ++++++++++++++++++
 src/curate_gpt/cli.py                         | 116 +++-----------
 2 files changed, 164 insertions(+), 96 deletions(-)
 create mode 100644 src/curate_gpt/agents/subsumption_eval_agent.py

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
new file mode 100644
index 0000000..60f8bee
--- /dev/null
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -0,0 +1,144 @@
+import logging
+import random
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+import pandas as pd
+
+from dataclasses import dataclass
+
+from oaklib import get_adapter
+from tqdm import tqdm
+
+from curate_gpt import ChromaDBAdapter
+from curate_gpt.agents.base_agent import BaseAgent
+from curate_gpt.wrappers.ontology import OntologyWrapper
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SubsumptionEvalAgent(BaseAgent):
+
+    """
+    An agent to evaluate subsumption relations between entities: compare
+    cosine similarity between two entities and fraction of shared ancestors
+
+    """
+
+    def compare_cosine_sim_to_shared_ancestors(
+        self,
+        view: OntologyWrapper,
+        db: ChromaDBAdapter,
+        collection: str,
+        ont: str,
+        num_terms: int,
+        choose_subsuming_terms: bool,
+        model: str,
+        prefix: str = None,
+        predicates: list = None,
+        root_term: str = None,
+        seed: int = 42,
+        **kwargs):
+        """
+        Summarize a list of objects.
+
+        Example:
+        -------
+
+        >>> print(response)
+        """
+
+        db = self.knowledge_source
+
+        # get all terms
+        if root_term is not None:
+            print(f"Using root term: {root_term} to select terms to compare.")
+            terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True))
+        else:
+            terms = list(view.oak_adapter.all_entity_curies())
+        if prefix is not None:
+            terms = [t for t in terms if t.startswith(prefix)]
+            if not terms:
+                raise ValueError(f"No terms found with prefix {prefix}")
+
+        c = db.client.get_collection(collection,
+                                     embedding_function=db._embedding_function(model))
+
+        # build CURIE to object map
+        curie2obj_id = {}
+        for o in tqdm(list(view.objects())):
+            curie2obj_id[o['original_id']] = o
+
+        # get embeddings to manually do cosine similarity
+        d = c.get(include=['embeddings'])
+        ids = d['ids']
+        emb = d['embeddings']
+        # make id2emb map
+        id2emb = {}
+        for i, id in tqdm(enumerate(ids), desc="Building id2emb map"):
+            id2emb[id] = emb[i]
+
+        # choose num_terms pseudo-random terms, for each, choose another random term, then
+        # calculate fraction of ancestors in common, then calculate cosine similarity
+        random.seed(seed)
+        results = []
+        for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
+            anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True))
+
+            # choose random term to pair with
+            if choose_subsuming_terms:
+                # do not choose term itself (remove term from list of ancestors)
+                random_other_term = random.choice(list(set(anc) - set([term])))
+            else:
+                random_other_term = random.choice(terms)
+            random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
+                                                               predicates=predicates,
+                                                               reflexive=True))
+            # fraction of ancestors in common
+            pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) /
+                               len(list(set(anc))))
+
+            id1 = curie2obj_id[term]['id']
+            id2 = curie2obj_id[random_other_term]['id']
+
+            # calculate cosine sim
+            try:
+                cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
+            except KeyError as e:
+                print(f"KeyError: {e}")
+                continue
+
+            # if debugging
+            if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
+                    (cosine_sim > 0.85 or cosine_sim < 0.2)):
+                print(f"\nterm: {term} {(curie2obj_id[term]['label'])},"
+                      f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])},"
+                      f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}")
+
+            results.append((term, random_other_term, pair_shared_anc, cosine_sim))
+            pass
+
+        # plot cosine similarity vs fraction of ancestors in common
+        # in matplotlib
+        df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim'])
+
+        # plot with some alpha
+        sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5)
+
+        # least squares fit
+        m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1)
+        # plot line
+        plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red')
+        # calculate r-squared value
+        r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2
+        plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center',
+                 va='center', transform=plt.gca().transAxes)
+
+        plt.xlabel('Fraction of ancestors in common')
+        plt.ylabel('Cosine similarity')
+        # title = ontology name
+        plt.title(f'{ont}')
+        plt.show()
+        return {"rsquared": r2}
diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 5f4a193..a375715 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -28,6 +28,7 @@
 from curate_gpt.agents.dase_agent import DatabaseAugmentedStructuredExtraction
 from curate_gpt.agents.dragon_agent import DragonAgent
 from curate_gpt.agents.evidence_agent import EvidenceAgent
+from curate_gpt.agents.subsumption_eval_agent import SubsumptionEvalAgent
 from curate_gpt.agents.summarization_agent import SummarizationAgent
 from curate_gpt.evaluation.dae_evaluator import DatabaseAugmentedCompletionEvaluator
 from curate_gpt.evaluation.evaluation_datamodel import StratifiedCollection, Task
@@ -1623,111 +1624,34 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
         curategpt subsumption  -c obo_hp $db/hp.db
 
     """
-    if not predicates:
-        predicates = [IS_A, PART_OF]
-
     oak_adapter = get_adapter(ont)
     view = OntologyWrapper(oak_adapter=oak_adapter)
     db = ChromaDBAdapter(path, **kwargs)
     db.text_lookup = view.text_field
 
+    if not predicates:
+        predicates = [IS_A, PART_OF]
+
     if model is None:
         warnings.warn("No model specified, using default model. Note that if you must"
                       "the same model that was used to build the collection.")
 
-    # get all terms
-    if root_term is not None:
-        print(f"Using root term: {root_term} to select terms to compare.")
-        terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True))
-    else:
-        terms = list(view.oak_adapter.all_entity_curies())
-    if prefix is not None:
-        terms = [t for t in terms if t.startswith(prefix)]
-        if not terms:
-            raise ValueError(f"No terms found with prefix {prefix}")
-
-    c = db.client.get_collection(collection,
-                                 embedding_function=db._embedding_function(model))
-
-    # build CURIE to object map
-    curie2obj_id = {}
-    for o in tqdm(list(view.objects())):
-        curie2obj_id[o['original_id']] = o
-
-    # get embeddings to manually do cosine similarity
-    d = c.get(include=['embeddings'])
-    ids = d['ids']
-    emb = d['embeddings']
-    # make id2emb map
-    id2emb = {}
-    for i, id in tqdm(enumerate(ids), desc="Building id2emb map"):
-        id2emb[id] = emb[i]
-
-    # choose num_terms pseudo-random terms, for each, choose another random term, then
-    # calculate fraction of ancestors in common, then calculate cosine similarity
-    random.seed(seed)
-    results = []
-    for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
-        anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True))
-
-        # choose random term to pair with
-        if choose_subsuming_terms:
-            # do not choose term itself (remove term from list of ancestors)
-            random_other_term = random.choice(list(set(anc) - set([term])))
-        else:
-            random_other_term = random.choice(terms)
-        random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
-                                                           predicates=predicates,
-                                                           reflexive=True))
-        # fraction of ancestors in common
-        pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) /
-                           len(list(set(anc))))
-
-        id1 = curie2obj_id[term]['id']
-        id2 = curie2obj_id[random_other_term]['id']
-
-        # calculate cosine sim
-        try:
-            cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
-        except KeyError as e:
-            print(f"KeyError: {e}")
-            continue
-
-        # if debugging
-        if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
-                (cosine_sim > 0.85 or cosine_sim < 0.2)):
-            print(f"\nterm: {term} {(curie2obj_id[term]['label'])},"
-                  f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])},"
-                  f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}")
-
-        results.append((term, random_other_term, pair_shared_anc, cosine_sim))
-        pass
-
-    # plot cosine similarity vs fraction of ancestors in common
-    # in matplotlib
-    import matplotlib.pyplot as plt
-    import seaborn as sns
-    import pandas as pd
-    df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim'])
-
-    # plot with some alpha
-    sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5)
-
-    # least squares fit
-    m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1)
-    # plot line
-    plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red')
-    # calculate r-squared value
-    r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2
-    plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center',
-             va='center', transform=plt.gca().transAxes)
-
-    plt.xlabel('Fraction of ancestors in common')
-    plt.ylabel('Cosine similarity')
-    # title = ontology name
-    plt.title(f'{ont}')
-    plt.show()
-    pass
+    agent = SubsumptionEvalAgent(knowledge_source=db,
+                                 knowledge_source_collection=collection)
+    response = (
+        agent.compare_cosine_sim_to_shared_ancestors(view=view,
+                                                     db=db,
+                                                     collection=collection,
+                                                     ont=ont,
+                                                     num_terms=num_terms,
+                                                     choose_subsuming_terms=choose_subsuming_terms,
+                                                     model=model,
+                                                     prefix=prefix,
+                                                     predicates=predicates,
+                                                     root_term=root_term,
+                                                     seed=seed,
+                                                     **kwargs))
+    click.echo(f"r-squared: {response.get('rsquared')}")
 
 
 @main.group()

From 9f6bba5ea742871c7e7fccb9ec6666864f1d53de Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 15 Feb 2024 15:41:28 -0500
Subject: [PATCH 09/18] Refactor

---
 src/curate_gpt/agents/subsumption_eval_agent.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
index 60f8bee..81e6efc 100644
--- a/src/curate_gpt/agents/subsumption_eval_agent.py
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -30,7 +30,6 @@ class SubsumptionEvalAgent(BaseAgent):
     def compare_cosine_sim_to_shared_ancestors(
         self,
         view: OntologyWrapper,
-        db: ChromaDBAdapter,
         collection: str,
         ont: str,
         num_terms: int,
@@ -50,7 +49,6 @@ def compare_cosine_sim_to_shared_ancestors(
         >>> print(response)
         """
 
-        db = self.knowledge_source
 
         # get all terms
         if root_term is not None:
@@ -63,8 +61,8 @@ def compare_cosine_sim_to_shared_ancestors(
             if not terms:
                 raise ValueError(f"No terms found with prefix {prefix}")
 
-        c = db.client.get_collection(collection,
-                                     embedding_function=db._embedding_function(model))
+        c = self.knowledge_source.client.get_collection(self.knowledge_source_collection,
+                                                        embedding_function=self.knowledge_source._embedding_function(model))
 
         # build CURIE to object map
         curie2obj_id = {}

From f308558fb63001f34eaaa24d78f9c6147a2a21e6 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 15 Feb 2024 15:42:01 -0500
Subject: [PATCH 10/18] Refactor

---
 src/curate_gpt/agents/subsumption_eval_agent.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
index 81e6efc..863b1b7 100644
--- a/src/curate_gpt/agents/subsumption_eval_agent.py
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -30,7 +30,6 @@ class SubsumptionEvalAgent(BaseAgent):
     def compare_cosine_sim_to_shared_ancestors(
         self,
         view: OntologyWrapper,
-        collection: str,
         ont: str,
         num_terms: int,
         choose_subsuming_terms: bool,

From b657322db99a2229ff6a87f089a9ee58b0dfcbd0 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Thu, 15 Feb 2024 16:15:48 -0500
Subject: [PATCH 11/18] Tidy up, add documentation

---
 .../agents/subsumption_eval_agent.py          | 56 ++++++++++++-------
 src/curate_gpt/cli.py                         | 12 ++--
 2 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
index 863b1b7..f5021ec 100644
--- a/src/curate_gpt/agents/subsumption_eval_agent.py
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -7,11 +7,8 @@
 import pandas as pd
 
 from dataclasses import dataclass
-
-from oaklib import get_adapter
 from tqdm import tqdm
 
-from curate_gpt import ChromaDBAdapter
 from curate_gpt.agents.base_agent import BaseAgent
 from curate_gpt.wrappers.ontology import OntologyWrapper
 
@@ -26,46 +23,67 @@ class SubsumptionEvalAgent(BaseAgent):
     cosine similarity between two entities and fraction of shared ancestors
 
     """
+    view: OntologyWrapper = None
+    model: str = None
+    ont: str = None
 
     def compare_cosine_sim_to_shared_ancestors(
         self,
-        view: OntologyWrapper,
-        ont: str,
         num_terms: int,
         choose_subsuming_terms: bool,
-        model: str,
         prefix: str = None,
         predicates: list = None,
         root_term: str = None,
         seed: int = 42,
         **kwargs):
         """
-        Summarize a list of objects.
+        compare cosine similarity between two entities and fraction of shared ancestors
 
         Example:
-        -------
 
-        >>> print(response)
+        from oaklib.datamodels.vocabulary import IS_A, PART_OF
+        from oaklib import get_adapter
+        from curate_gpt import ChromaDBAdapter
+        from curate_gpt.agents.subsumption_eval_agent import SubsumptionEvalAgent
+
+        oak_adapter = get_adapter("hp")
+        view = OntologyWrapper(oak_adapter=oak_adapter)
+        db = ChromaDBAdapter(path, **kwargs)
+        db.text_lookup = view.text_field
+        predicates = [IS_A, PART_OF]
+        model = "openai:" # use the same model as was used in db for embeddings
+        agent = SubsumptionEvalAgent(knowledge_source=db,
+                                     knowledge_source_collection=collection,
+                                     view=view,
+                                     model=model,
+                                     ont="hp")
+        response = (agent.compare_cosine_sim_to_shared_ancestors(num_terms=num_terms,
+                                             choose_subsuming_terms=True,
+                                             prefix="HP:",
+                                             predicates=predicates,
+                                             root_term="HP:0000118"))
+        print(response)
         """
 
-
         # get all terms
         if root_term is not None:
             print(f"Using root term: {root_term} to select terms to compare.")
-            terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True))
+            terms = list(self.view.oak_adapter.descendants(root_term,
+                                                      predicates=predicates,
+                                                      reflexive=True))
         else:
-            terms = list(view.oak_adapter.all_entity_curies())
+            terms = list(self.view.oak_adapter.all_entity_curies())
         if prefix is not None:
             terms = [t for t in terms if t.startswith(prefix)]
             if not terms:
                 raise ValueError(f"No terms found with prefix {prefix}")
 
         c = self.knowledge_source.client.get_collection(self.knowledge_source_collection,
-                                                        embedding_function=self.knowledge_source._embedding_function(model))
+                                                        embedding_function=self.knowledge_source._embedding_function(self.model))
 
         # build CURIE to object map
         curie2obj_id = {}
-        for o in tqdm(list(view.objects())):
+        for o in tqdm(list(self.view.objects())):
             curie2obj_id[o['original_id']] = o
 
         # get embeddings to manually do cosine similarity
@@ -82,7 +100,7 @@ def compare_cosine_sim_to_shared_ancestors(
         random.seed(seed)
         results = []
         for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"):
-            anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True))
+            anc = list(self.view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True))
 
             # choose random term to pair with
             if choose_subsuming_terms:
@@ -90,9 +108,9 @@ def compare_cosine_sim_to_shared_ancestors(
                 random_other_term = random.choice(list(set(anc) - set([term])))
             else:
                 random_other_term = random.choice(terms)
-            random_term_ancs = list(view.oak_adapter.ancestors(random_other_term,
-                                                               predicates=predicates,
-                                                               reflexive=True))
+            random_term_ancs = list(self.view.oak_adapter.ancestors(random_other_term,
+                                                                    predicates=predicates,
+                                                                    reflexive=True))
             # fraction of ancestors in common
             pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) /
                                len(list(set(anc))))
@@ -136,6 +154,6 @@ def compare_cosine_sim_to_shared_ancestors(
         plt.xlabel('Fraction of ancestors in common')
         plt.ylabel('Cosine similarity')
         # title = ontology name
-        plt.title(f'{ont}')
+        plt.title(f'{self.ont}')
         plt.show()
         return {"rsquared": r2}
diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index a375715..91e1055 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1637,15 +1637,15 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
                       "the same model that was used to build the collection.")
 
     agent = SubsumptionEvalAgent(knowledge_source=db,
-                                 knowledge_source_collection=collection)
+                                 knowledge_source_collection=collection,
+                                 view=view,
+                                 model=model,
+                                 ont=ont)
+
     response = (
-        agent.compare_cosine_sim_to_shared_ancestors(view=view,
-                                                     db=db,
-                                                     collection=collection,
-                                                     ont=ont,
+        agent.compare_cosine_sim_to_shared_ancestors(
                                                      num_terms=num_terms,
                                                      choose_subsuming_terms=choose_subsuming_terms,
-                                                     model=model,
                                                      prefix=prefix,
                                                      predicates=predicates,
                                                      root_term=root_term,

From b79f1ec800c003280b2d10b2b97c1411785c71b9 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 16 Feb 2024 09:56:51 -0500
Subject: [PATCH 12/18] Write out image file

---
 src/curate_gpt/agents/subsumption_eval_agent.py |  6 ++++++
 src/curate_gpt/cli.py                           | 13 +++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
index f5021ec..00369c9 100644
--- a/src/curate_gpt/agents/subsumption_eval_agent.py
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -35,6 +35,7 @@ def compare_cosine_sim_to_shared_ancestors(
         predicates: list = None,
         root_term: str = None,
         seed: int = 42,
+        img_file_name: str = None,
         **kwargs):
         """
         compare cosine similarity between two entities and fraction of shared ancestors
@@ -155,5 +156,10 @@ def compare_cosine_sim_to_shared_ancestors(
         plt.ylabel('Cosine similarity')
         # title = ontology name
         plt.title(f'{self.ont}')
+
+        # save to file
+        if img_file_name is not None:
+            plt.savefig(img_file_name)
+
         plt.show()
         return {"rsquared": r2}
diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 91e1055..78650ff 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -3,6 +3,7 @@
 import gzip
 import json
 import logging
+import os
 import sys
 import warnings
 from pathlib import Path
@@ -1612,9 +1613,10 @@ def _text_lookup(obj: Dict):
 @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare')
 @click.option('--choose_subsuming_terms', required=False, default=True, help='Whether to choose subsuming terms or just random terms')
 @click.option("--root_term", required=False, default=None, help="Root term to use for selecting terms to sample")
+@click.option("--output_dir", required=False, default=None, help="Directory to write output to")
 @click.argument("ont")
 def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms,
-                        choose_subsuming_terms, root_term, model, **kwargs):
+                        choose_subsuming_terms, root_term, output_dir, model, **kwargs):
     """
     Compare pairs of ontology terms (optionally where one subsumes the other) to
     determine whether similarity of LLM embeddings reflect subsumption relationships.
@@ -1642,14 +1644,21 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
                                  model=model,
                                  ont=ont)
 
+    if output_dir and not os.path.exists(output_dir):
+        os.mkdir(output_dir)
+    img_file_name = (os.path.join(output_dir,
+                                  f"cosine_sim_vs_shared_anc_{ont}_"
+                                  f"sub_{str(choose_subsuming_terms)}.png"))
     response = (
         agent.compare_cosine_sim_to_shared_ancestors(
                                                      num_terms=num_terms,
-                                                     choose_subsuming_terms=choose_subsuming_terms,
+                                                     choose_subsuming_terms=
+                                                     choose_subsuming_terms,
                                                      prefix=prefix,
                                                      predicates=predicates,
                                                      root_term=root_term,
                                                      seed=seed,
+                                                     img_file_name=img_file_name,
                                                      **kwargs))
     click.echo(f"r-squared: {response.get('rsquared')}")
 

From b4a680ca9529819d319d4f7857a8a83366f81d0e Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 16 Feb 2024 10:01:25 -0500
Subject: [PATCH 13/18] Bug

---
 src/curate_gpt/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 78650ff..1769449 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -1646,7 +1646,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter
 
     if output_dir and not os.path.exists(output_dir):
         os.mkdir(output_dir)
-    img_file_name = (os.path.join(output_dir,
+    img_file_name = (os.path.join(output_dir if output_dir else "",
                                   f"cosine_sim_vs_shared_anc_{ont}_"
                                   f"sub_{str(choose_subsuming_terms)}.png"))
     response = (

From af7b171559460b12faa8b971e7b9786a0a80bd1f Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 23 Feb 2024 15:12:10 -0500
Subject: [PATCH 14/18] Tidy up subsumption code a bit

---
 src/curate_gpt/agents/subsumption_eval_agent.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py
index 00369c9..0466c76 100644
--- a/src/curate_gpt/agents/subsumption_eval_agent.py
+++ b/src/curate_gpt/agents/subsumption_eval_agent.py
@@ -116,15 +116,17 @@ def compare_cosine_sim_to_shared_ancestors(
             pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) /
                                len(list(set(anc))))
 
-            id1 = curie2obj_id[term]['id']
-            id2 = curie2obj_id[random_other_term]['id']
+            try:
+                id1 = curie2obj_id[term]['id']
+                id2 = curie2obj_id[random_other_term]['id']
+            except KeyError as e:
+                raise KeyError(f"KeyError retrieving item from curie2obj_id: {e}")
 
             # calculate cosine sim
             try:
                 cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2]))
             except KeyError as e:
-                print(f"KeyError: {e}")
-                continue
+                raise KeyError(f"KeyError retrieving item from id2emb: {e}")
 
             # if debugging
             if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and
@@ -149,13 +151,12 @@ def compare_cosine_sim_to_shared_ancestors(
         plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red')
         # calculate r-squared value
         r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2
-        plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center',
+        plt.text(0.2, 0.9, f"R-squared: {round(r2, 2)}", ha='center',
                  va='center', transform=plt.gca().transAxes)
 
         plt.xlabel('Fraction of ancestors in common')
         plt.ylabel('Cosine similarity')
-        # title = ontology name
-        plt.title(f'{self.ont}')
+        plt.title(f'{self.ont.split(":")[-1]} subsuming: {choose_subsuming_terms}')
 
         # save to file
         if img_file_name is not None:

From 8415f7b7ad36f521af273d16e3d3b15094bc2ad1 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 23 Feb 2024 16:27:43 -0500
Subject: [PATCH 15/18] Remove unused import

---
 src/curate_gpt/cli.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py
index 1769449..e220704 100644
--- a/src/curate_gpt/cli.py
+++ b/src/curate_gpt/cli.py
@@ -20,8 +20,6 @@
 from llm.cli import load_conversation
 from oaklib import get_adapter
 from pydantic import BaseModel
-from tqdm import tqdm
-import numpy as np
 
 from curate_gpt import ChromaDBAdapter, __version__
 from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse

From 4fdf5161a5278416f889da4ffeb82f49dc06e3e1 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 23 Feb 2024 16:27:58 -0500
Subject: [PATCH 16/18] Add notebook for subsumption experiments

---
 notebooks/command-line/Subsumption-exp.ipynb | 157 +++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 notebooks/command-line/Subsumption-exp.ipynb

diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb
new file mode 100644
index 0000000..4bf57e9
--- /dev/null
+++ b/notebooks/command-line/Subsumption-exp.ipynb
@@ -0,0 +1,157 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Subsumption experiments\n",
+    "\n",
+    "Experiments to investigate the relationship between cosine similarity of LLM embeddings and fraction of shared ancestors for pairs of ontology terms where one is subsumed by the other"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "11ab2c3cabc2f8d2"
+  },
+  {
+   "cell_type": "raw",
+   "source": [
+    "Index ontology terms without relationship info (just label and definition) "
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "34719c597052c669"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001B[34mInstalling dependencies from lock file\u001B[39m\r\n",
+      "\r\n",
+      "No dependencies to install or update\r\n",
+      "\r\n",
+      "\u001B[39;1mInstalling\u001B[39;22m the current project: \u001B[36mcurate-gpt\u001B[39m (\u001B[39;1m0.0.0.post2.dev0+335f59e\u001B[39;22m)\u001B[1G\u001B[2K\u001B[39;1mInstalling\u001B[39;22m the current project: \u001B[36mcurate-gpt\u001B[39m (\u001B[32m0.0.0.post2.dev0+335f59e\u001B[39m)\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!poetry install"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-02-23T20:28:22.995963Z",
+     "start_time": "2024-02-23T20:28:16.250674Z"
+    }
+   },
+   "id": "c04ef952913dd018"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "ontologies = [\n",
+    "    (\"ont_hp_norel\", \"hp\"),\n",
+    "    (\"ont_mondo_norel\", \"mondo\"),\n",
+    "    (\"ont_go_norel\", \"go\"),\n",
+    "    (\"ont_foodon_norel\", \"foodon\"),\n",
+    "    (\"ont_chebi_norel\", \"chebi\")\n",
+    "]\n",
+    "\n",
+    "for chr_db_collection, ontology in ontologies:\n",
+    "    command = f\"!curategpt ontology index --index-fields label,definition -p stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n",
+    "    print(command)\n",
+    "    get_ipython().system(command)"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "15f7f0b0fcb8cc39"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: MONDO:0700096 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Mondo\n",
+    "!curategpt ontology subsumption --output_dir build --root_term MONDO:0700096 --prefix \"MONDO:\" -p ../../stagedb -c ont_mondo_norel sqlite:obo:mondo -m openai:\n",
+    "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term MONDO:0700096 --prefix \"MONDO:\" -p ../../stagedb -c ont_mondo_norel sqlite:obo:mondo -m openai:\n",
+    "\n",
+    "# HP\n",
+    "!curategpt ontology subsumption --output_dir build --root_term HP:0000118 --prefix \"HP:\" -p ../../stagedb -c ont_hp_norel sqlite:obo:hp -m openai:\n",
+    "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term HP:0000118 --prefix \"HP:\" -p ../../stagedb -c ont_hp_norel sqlite:obo:hp -m openai:\n",
+    "\n",
+    "# GO\n",
+    "!curategpt ontology subsumption --output_dir build --prefix \"GO:\" -p ../../stagedb -c ont_go_norel sqlite:obo:go -m openai:\n",
+    "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --prefix \"GO:\" -p ../../stagedb -c ont_go_norel sqlite:obo:go -m openai:\n",
+    "\n",
+    "# FOODON\n",
+    "!curategpt ontology subsumption --output_dir build --root_term FOODON:00002403 --prefix \"FOODON:\" -p ../../stagedb -c ont_foodon_norel sqlite:obo:foodon -m openai:\n",
+    "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term FOODON:00002403 --prefix \"FOODON:\" -p ../../stagedb -c ont_foodon_norel sqlite:obo:foodon -m openai:\n",
+    "\n",
+    "# CHEBI\n",
+    "!curategpt ontology subsumption --output_dir build --root_term CHEBI:59999 --prefix \"CHEBI:\" -p ../../stagedb -c ont_chebi_norel sqlite:obo:chebi -m openai:\n",
+    "!curategpt ontology subsumption --choose_subsuming_terms False --output_dir build --root_term CHEBI:59999 --prefix \"CHEBI:\" -p ../../stagedb -c ont_chebi_norel sqlite:obo:chebi -m openai:\n",
+    "\n",
+    "!open *png"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "is_executing": true,
+    "ExecuteTime": {
+     "start_time": "2024-02-23T21:26:07.255874Z"
+    }
+   },
+   "id": "3e2025479fee4a38"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "aa64889d863ebd6a"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From e39f786700cf9eacf41bb10d076f2f796bdb8d0b Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Fri, 23 Feb 2024 16:29:02 -0500
Subject: [PATCH 17/18] Correct path to stagedb

---
 notebooks/command-line/Subsumption-exp.ipynb | 31 ++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb
index 4bf57e9..426978c 100644
--- a/notebooks/command-line/Subsumption-exp.ipynb
+++ b/notebooks/command-line/Subsumption-exp.ipynb
@@ -64,7 +64,7 @@
     "]\n",
     "\n",
     "for chr_db_collection, ontology in ontologies:\n",
-    "    command = f\"!curategpt ontology index --index-fields label,definition -p stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n",
+    "    command = f\"!curategpt ontology index --index-fields label,definition -p ../../stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n",
     "    print(command)\n",
     "    get_ipython().system(command)"
    ],
@@ -86,7 +86,34 @@
       "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
       "  warnings.warn(\r\n",
       "Using root term: MONDO:0700096 to select terms to compare.\r\n",
-      "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n"
+      "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n",
+      "Building id2emb map: 42542it [00:00, 2673089.66it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 656.31it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.3702593848159516\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: MONDO:0700096 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 1939732.80it/s]\r\n",
+      "Building id2emb map: 42542it [00:00, 2888872.21it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 589.07it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.02021735186826436\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: HP:0000118 to select terms to compare.\r\n"
      ]
     }
    ],

From c79a0f5f9cf5e6a833103e59fcb12267514029aa Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee@gmail.com>
Date: Mon, 26 Feb 2024 10:16:52 -0500
Subject: [PATCH 18/18] Add notebook for subsumption experiments

---
 notebooks/command-line/Subsumption-exp.ipynb | 133 ++++++++++++++++++-
 1 file changed, 130 insertions(+), 3 deletions(-)

diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb
index 426978c..289bd59 100644
--- a/notebooks/command-line/Subsumption-exp.ipynb
+++ b/notebooks/command-line/Subsumption-exp.ipynb
@@ -75,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "outputs": [
     {
      "name": "stdout",
@@ -113,7 +113,134 @@
       "\r\n",
       "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
       "  warnings.warn(\r\n",
-      "Using root term: HP:0000118 to select terms to compare.\r\n"
+      "Using root term: HP:0000118 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 29499/29499 [00:00<00:00, 1745102.59it/s]\r\n",
+      "Building id2emb map: 29499it [00:00, 2759255.45it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 816.61it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.5089792065670653\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: HP:0000118 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 29499/29499 [00:00<00:00, 1836214.03it/s]\r\n",
+      "Building id2emb map: 29499it [00:00, 2972153.40it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 830.03it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.13296081180577732\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "100%|████████████████████████████████| 75893/75893 [00:00<00:00, 1917268.36it/s]\r\n",
+      "Building id2emb map: 75893it [00:00, 2946273.30it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 625.99it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.45925148278084876\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "100%|████████████████████████████████| 75893/75893 [00:00<00:00, 1741995.45it/s]\r\n",
+      "Building id2emb map: 75893it [00:00, 2828264.25it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 574.20it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.1351592896407047\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: FOODON:00002403 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 32387/32387 [00:00<00:00, 1970479.61it/s]\r\n",
+      "Building id2emb map: 32387it [00:00, 2767914.17it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 720.01it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.5717097474555206\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: FOODON:00002403 to select terms to compare.\r\n",
+      "100%|████████████████████████████████| 32387/32387 [00:00<00:00, 1821436.65it/s]\r\n",
+      "Building id2emb map: 32387it [00:00, 2744205.65it/s]\r\n",
+      "Choosing terms to compare:  31%|███▋        | 308/1000 [00:00<00:00, 744.24it/s]\r\n",
+      "Traceback (most recent call last):\r\n",
+      "  File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/agents/subsumption_eval_agent.py\", line 121, in compare_cosine_sim_to_shared_ancestors\r\n",
+      "    id2 = curie2obj_id[random_other_term]['id']\r\n",
+      "KeyError: 'FOODON:03412687'\r\n",
+      "\r\n",
+      "During handling of the above exception, another exception occurred:\r\n",
+      "\r\n",
+      "Traceback (most recent call last):\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/bin/curategpt\", line 6, in <module>\r\n",
+      "    sys.exit(main())\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1157, in __call__\r\n",
+      "    return self.main(*args, **kwargs)\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1078, in main\r\n",
+      "    rv = self.invoke(ctx)\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1688, in invoke\r\n",
+      "    return _process_result(sub_ctx.command.invoke(sub_ctx))\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1688, in invoke\r\n",
+      "    return _process_result(sub_ctx.command.invoke(sub_ctx))\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1434, in invoke\r\n",
+      "    return ctx.invoke(self.callback, **ctx.params)\r\n",
+      "  File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 783, in invoke\r\n",
+      "    return __callback(*args, **kwargs)\r\n",
+      "  File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/cli.py\", line 1651, in subsumption_command\r\n",
+      "    agent.compare_cosine_sim_to_shared_ancestors(\r\n",
+      "  File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/agents/subsumption_eval_agent.py\", line 123, in compare_cosine_sim_to_shared_ancestors\r\n",
+      "    raise KeyError(f\"KeyError retrieving item from curie2obj_id: {e}\")\r\n",
+      "KeyError: \"KeyError retrieving item from curie2obj_id: 'FOODON:03412687'\"\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: CHEBI:59999 to select terms to compare.\r\n",
+      "100%|██████████████████████████████| 166855/166855 [00:00<00:00, 1683980.96it/s]\r\n",
+      "Building id2emb map: 166855it [00:00, 2504385.80it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 643.53it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.22513629243465821\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n",
+      "\r\n",
+      "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n",
+      "  warnings.warn(\r\n",
+      "Using root term: CHEBI:59999 to select terms to compare.\r\n",
+      "100%|██████████████████████████████| 166855/166855 [00:00<00:00, 1737491.17it/s]\r\n",
+      "Building id2emb map: 166855it [00:00, 2696423.70it/s]\r\n",
+      "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 737.49it/s]\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n",
+      "  if pd.api.types.is_categorical_dtype(vector):\r\n",
+      "Figure(640x480)\r\n",
+      "r-squared: 0.0680209366780529\r\n",
+      "The file /Users/jtr4v/PythonProject/curate-gpt/notebooks/command-line/*png does not exist.\r\n"
      ]
     }
    ],
@@ -142,8 +269,8 @@
    ],
    "metadata": {
     "collapsed": false,
-    "is_executing": true,
     "ExecuteTime": {
+     "end_time": "2024-02-23T21:56:34.771347Z",
      "start_time": "2024-02-23T21:26:07.255874Z"
     }
    },