From 7ada3978c45f1fd43b6eb8e5296267c67e7060a1 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 8 Feb 2024 15:21:56 -0500 Subject: [PATCH 01/18] Add subsumption command (this should be moved to some module I guess) --- src/curate_gpt/cli.py | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 43bbbae..f66b089 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -5,6 +5,7 @@ import logging import sys from pathlib import Path +import random from typing import Any, Dict, List, Union import click @@ -17,6 +18,7 @@ from llm.cli import load_conversation from oaklib import get_adapter from pydantic import BaseModel +from tqdm import tqdm from curate_gpt import ChromaDBAdapter, __version__ from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse @@ -37,6 +39,8 @@ from curate_gpt.wrappers.literature.pubmed_wrapper import PubmedWrapper from curate_gpt.wrappers.ontology import OntologyWrapper +from oaklib.datamodels.vocabulary import IS_A, PART_OF + __all__ = [ "main", ] @@ -1595,6 +1599,64 @@ def _text_lookup(obj: Dict): db.update_collection_metadata(collection, object_type="OntologyClass") +@ontology.command(name="subsumption") +@path_option +@collection_option +@model_option +@click.option("--prefix", required=False, default=None, help="Prefix of terms to use, e.g. 'HP:'") +@click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)') +@click.option("--seed", required=False, default=42, help="Seed for random number generator") +@click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare') +@click.argument("ont") +def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, model, **kwargs): + """ + Compare pairs of ontology terms where one subsumes the other, or one does NOT + subsume the other, to determine whether LLM embeddings reflect subsumption + relationships. + + Example: + ------- + curategpt subsumption -c obo_hp $db/hp.db + + """ + if not predicates: + predicates = [IS_A, PART_OF] + + oak_adapter = get_adapter(ont) + view = OntologyWrapper(oak_adapter=oak_adapter) + db = ChromaDBAdapter(path, **kwargs) + db.text_lookup = view.text_field + + c = db.client.get_collection(collection) + + # get all terms + terms = list(view.oak_adapter.all_entity_curies()) + if prefix is not None: + terms = [t for t in terms if t.startswith(prefix)] + if not terms: + raise ValueError(f"No terms found with prefix {prefix}") + + # choose 1000 pseudo-random terms, get ancestor info, choose a random subsuming + # and non-subsuming term, calculate fraction of ancestors in common while we are + # at it + random.seed(seed) + ancs = [] + random_pairs = [] + for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): + anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False)) + ancs.append((term, anc)) + + # choose random term to pair with + random_other_term = random.choice(terms) + random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, + predicates=predicates, + reflexive=False)) + pair_shared_anc = len(set(anc).intersection( + set(random_term_ancs))) / len(anc) # fraction of ancestors in common + random_pairs.append((term, random_other_term, pair_shared_anc)) + + pass + @main.group() def view(): "Virtual store/wrapper" From 528793d6c1641ccb12f2763376cdf00cca5b52ba Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 9 Feb 2024 10:57:21 -0500 Subject: [PATCH 02/18] Add better comment/warning --- src/curate_gpt/cli.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index f66b089..293fab8 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -4,6 +4,7 @@ import json import logging import sys +import warnings from pathlib import Path import random from typing import Any, Dict, List, Union @@ -1627,6 +1628,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter db = ChromaDBAdapter(path, **kwargs) db.text_lookup = view.text_field + if model is None: + warnings.warn("No model specified, using default model. Note that if you must" + "the same model that was used to build the collection.") + c = db.client.get_collection(collection) # get all terms @@ -1636,9 +1641,9 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter if not terms: raise ValueError(f"No terms found with prefix {prefix}") - # choose 1000 pseudo-random terms, get ancestor info, choose a random subsuming - # and non-subsuming term, calculate fraction of ancestors in common while we are - # at it + # choose 1000 pseudo-random terms, for each, choose another random term, then + # calculate fraction of ancestors in common while we are at it. we'll compare with + # cosine similarity of embeddings later random.seed(seed) ancs = [] random_pairs = [] From 61888a62d31ecd3aa3a610c0d5f6da6fd07a47a9 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 13 Feb 2024 17:31:09 -0500 Subject: [PATCH 03/18] Finish first pass of subsumption command --- src/curate_gpt/cli.py | 87 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 293fab8..4469b9f 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -20,6 +20,7 @@ from oaklib import get_adapter from pydantic import BaseModel from tqdm import tqdm +import numpy as np from curate_gpt import ChromaDBAdapter, __version__ from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse @@ -1608,12 +1609,13 @@ def _text_lookup(obj: Dict): @click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)') @click.option("--seed", required=False, default=42, help="Seed for random number generator") @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare') +@click.option('--choose_subsuming_terms', required=False, default=False, help='Whether to choose subsuming terms or just random terms') @click.argument("ont") -def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, model, **kwargs): +def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, + choose_subsuming_terms, model, **kwargs): """ - Compare pairs of ontology terms where one subsumes the other, or one does NOT - subsume the other, to determine whether LLM embeddings reflect subsumption - relationships. + Compare pairs of ontology terms (optionally where one subsums the other) to + determine whether similarity of LLM embeddings reflect subsumption relationships. Example: ------- @@ -1641,26 +1643,85 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter if not terms: raise ValueError(f"No terms found with prefix {prefix}") - # choose 1000 pseudo-random terms, for each, choose another random term, then + c = db.client.get_collection(collection, + embedding_function=db._embedding_function(model)) + + # build CURIE to object map + curie2obj_id = {} + for o in tqdm(list(view.objects())): + curie2obj_id[o['original_id']] = o + + # get embeddings to manually do cosine similarity + d = c.get(include=['embeddings']) + ids = d['ids'] + emb = d['embeddings'] + # make id2emb map + id2emb = {} + for i, id in tqdm(enumerate(ids), desc="Building id2emb map"): + id2emb[id] = emb[i] + + # choose num_terms pseudo-random terms, for each, choose another random term, then # calculate fraction of ancestors in common while we are at it. we'll compare with # cosine similarity of embeddings later random.seed(seed) - ancs = [] - random_pairs = [] + results = [] for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False)) - ancs.append((term, anc)) # choose random term to pair with - random_other_term = random.choice(terms) + if choose_subsuming_terms: + random_other_term = random.choice(anc) + else: + random_other_term = random.choice(terms) random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, predicates=predicates, reflexive=False)) - pair_shared_anc = len(set(anc).intersection( - set(random_term_ancs))) / len(anc) # fraction of ancestors in common - random_pairs.append((term, random_other_term, pair_shared_anc)) + # fraction of ancestors in common + pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc) + + id1 = curie2obj_id[term]['id'] + id2 = curie2obj_id[random_other_term]['id'] + + # calculate cosine sim + cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) + + # if debugging + if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and + (cosine_sim > 0.85 or cosine_sim < 0.2)): + print(f"\nterm: {term} {(curie2obj_id[term]['label'])}," + f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])}," + f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}") + + results.append((term, random_other_term, pair_shared_anc, cosine_sim)) + pass + + # plot cosine similarity vs fraction of ancestors in common + # in matplotlib + import matplotlib.pyplot as plt + import seaborn as sns + import pandas as pd + df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim']) + sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim') + + # least squares fit + m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1) + # plot line + plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red') + # calculate r-squared value + r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2 + plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center', + va='center', transform=plt.gca().transAxes) + + plt.xlabel('Fraction of ancestors in common') + plt.ylabel('Cosine similarity') + # title = ontology name + plt.title(f'{ont}') + + + plt.show() + + # write results to file - pass @main.group() def view(): From 4b4319a76ea7426806628e0ebfd03f84d721a44a Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 14 Feb 2024 14:33:08 -0500 Subject: [PATCH 04/18] Add arg to select root term (to avoid selecting modifier terms that aren't of interest) --- src/curate_gpt/cli.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 4469b9f..375eff3 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1609,10 +1609,11 @@ def _text_lookup(obj: Dict): @click.option('--predicates', multiple=True, help='Predicates of interest (e.g., is_a, part_of)') @click.option("--seed", required=False, default=42, help="Seed for random number generator") @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare') -@click.option('--choose_subsuming_terms', required=False, default=False, help='Whether to choose subsuming terms or just random terms') +@click.option('--choose_subsuming_terms', required=False, default=True, help='Whether to choose subsuming terms or just random terms') +@click.option("--root_term", required=False, default=None, help="Root term to use for selecting terms to sample") @click.argument("ont") def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, - choose_subsuming_terms, model, **kwargs): + choose_subsuming_terms, root_term, model, **kwargs): """ Compare pairs of ontology terms (optionally where one subsums the other) to determine whether similarity of LLM embeddings reflect subsumption relationships. @@ -1637,7 +1638,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter c = db.client.get_collection(collection) # get all terms - terms = list(view.oak_adapter.all_entity_curies()) + if root_term is not None: + pass + else: + terms = list(view.oak_adapter.all_entity_curies()) if prefix is not None: terms = [t for t in terms if t.startswith(prefix)] if not terms: @@ -1666,7 +1670,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter random.seed(seed) results = [] for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): - anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=False)) + anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True)) # choose random term to pair with if choose_subsuming_terms: @@ -1675,7 +1679,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter random_other_term = random.choice(terms) random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, predicates=predicates, - reflexive=False)) + reflexive=True)) # fraction of ancestors in common pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc) @@ -1683,7 +1687,11 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter id2 = curie2obj_id[random_other_term]['id'] # calculate cosine sim - cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) + try: + cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) + except KeyError as e: + print(f"KeyError: {e}") + continue # if debugging if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and @@ -1716,12 +1724,8 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter plt.ylabel('Cosine similarity') # title = ontology name plt.title(f'{ont}') - - plt.show() - # write results to file - @main.group() def view(): From 9680d98fb04557e920394b12255b15f40ede2515 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 14 Feb 2024 14:36:24 -0500 Subject: [PATCH 05/18] Add arg to select root term (to avoid selecting modifier terms that aren't of interest) --- src/curate_gpt/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 375eff3..186a193 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1635,11 +1635,10 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter warnings.warn("No model specified, using default model. Note that if you must" "the same model that was used to build the collection.") - c = db.client.get_collection(collection) - # get all terms if root_term is not None: - pass + print(f"Using root term: {root_term} to select terms to compare.") + terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True)) else: terms = list(view.oak_adapter.all_entity_curies()) if prefix is not None: From e87013a9b274bfdd3c811bd46e39c63b540193bf Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 14 Feb 2024 16:43:14 -0500 Subject: [PATCH 06/18] Fix comment --- src/curate_gpt/cli.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 186a193..0bfd7c3 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1664,8 +1664,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter id2emb[id] = emb[i] # choose num_terms pseudo-random terms, for each, choose another random term, then - # calculate fraction of ancestors in common while we are at it. we'll compare with - # cosine similarity of embeddings later + # calculate fraction of ancestors in common, then calculate cosine similarity random.seed(seed) results = [] for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): @@ -1692,6 +1691,13 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter print(f"KeyError: {e}") continue + # this seems to have a few times in HPO + if cosine_sim == 1.0 and pair_shared_anc < 1.0: + print(f"term: {term} {(curie2obj_id[term]['label'])}," + f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])}," + f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}") + continue + # if debugging if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and (cosine_sim > 0.85 or cosine_sim < 0.2)): From e3c6e42fb7c381928e17991eaaccb882e70550c9 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 15 Feb 2024 13:18:43 -0500 Subject: [PATCH 07/18] Add some alpha to plot, fix some comments --- src/curate_gpt/cli.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 0bfd7c3..5f4a193 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1615,7 +1615,7 @@ def _text_lookup(obj: Dict): def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, choose_subsuming_terms, root_term, model, **kwargs): """ - Compare pairs of ontology terms (optionally where one subsums the other) to + Compare pairs of ontology terms (optionally where one subsumes the other) to determine whether similarity of LLM embeddings reflect subsumption relationships. Example: @@ -1672,14 +1672,16 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter # choose random term to pair with if choose_subsuming_terms: - random_other_term = random.choice(anc) + # do not choose term itself (remove term from list of ancestors) + random_other_term = random.choice(list(set(anc) - set([term]))) else: random_other_term = random.choice(terms) random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, predicates=predicates, reflexive=True)) # fraction of ancestors in common - pair_shared_anc = len(set(anc).intersection(set(random_term_ancs))) / len(anc) + pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) / + len(list(set(anc)))) id1 = curie2obj_id[term]['id'] id2 = curie2obj_id[random_other_term]['id'] @@ -1691,13 +1693,6 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter print(f"KeyError: {e}") continue - # this seems to have a few times in HPO - if cosine_sim == 1.0 and pair_shared_anc < 1.0: - print(f"term: {term} {(curie2obj_id[term]['label'])}," - f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])}," - f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}") - continue - # if debugging if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and (cosine_sim > 0.85 or cosine_sim < 0.2)): @@ -1714,7 +1709,9 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter import seaborn as sns import pandas as pd df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim']) - sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim') + + # plot with some alpha + sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5) # least squares fit m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1) @@ -1730,6 +1727,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter # title = ontology name plt.title(f'{ont}') plt.show() + pass @main.group() From beea4b84fc91c925c789a3b748bcd0baa3b319b7 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 15 Feb 2024 15:38:16 -0500 Subject: [PATCH 08/18] Move subsumption stuff to an agent to declutter cli.py --- .../agents/subsumption_eval_agent.py | 144 ++++++++++++++++++ src/curate_gpt/cli.py | 116 +++----------- 2 files changed, 164 insertions(+), 96 deletions(-) create mode 100644 src/curate_gpt/agents/subsumption_eval_agent.py diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py new file mode 100644 index 0000000..60f8bee --- /dev/null +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -0,0 +1,144 @@ +import logging +import random + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +import pandas as pd + +from dataclasses import dataclass + +from oaklib import get_adapter +from tqdm import tqdm + +from curate_gpt import ChromaDBAdapter +from curate_gpt.agents.base_agent import BaseAgent +from curate_gpt.wrappers.ontology import OntologyWrapper + +logger = logging.getLogger(__name__) + + +@dataclass +class SubsumptionEvalAgent(BaseAgent): + + """ + An agent to evaluate subsumption relations between entities: compare + cosine similarity between two entities and fraction of shared ancestors + + """ + + def compare_cosine_sim_to_shared_ancestors( + self, + view: OntologyWrapper, + db: ChromaDBAdapter, + collection: str, + ont: str, + num_terms: int, + choose_subsuming_terms: bool, + model: str, + prefix: str = None, + predicates: list = None, + root_term: str = None, + seed: int = 42, + **kwargs): + """ + Summarize a list of objects. + + Example: + ------- + + >>> print(response) + """ + + db = self.knowledge_source + + # get all terms + if root_term is not None: + print(f"Using root term: {root_term} to select terms to compare.") + terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True)) + else: + terms = list(view.oak_adapter.all_entity_curies()) + if prefix is not None: + terms = [t for t in terms if t.startswith(prefix)] + if not terms: + raise ValueError(f"No terms found with prefix {prefix}") + + c = db.client.get_collection(collection, + embedding_function=db._embedding_function(model)) + + # build CURIE to object map + curie2obj_id = {} + for o in tqdm(list(view.objects())): + curie2obj_id[o['original_id']] = o + + # get embeddings to manually do cosine similarity + d = c.get(include=['embeddings']) + ids = d['ids'] + emb = d['embeddings'] + # make id2emb map + id2emb = {} + for i, id in tqdm(enumerate(ids), desc="Building id2emb map"): + id2emb[id] = emb[i] + + # choose num_terms pseudo-random terms, for each, choose another random term, then + # calculate fraction of ancestors in common, then calculate cosine similarity + random.seed(seed) + results = [] + for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): + anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True)) + + # choose random term to pair with + if choose_subsuming_terms: + # do not choose term itself (remove term from list of ancestors) + random_other_term = random.choice(list(set(anc) - set([term]))) + else: + random_other_term = random.choice(terms) + random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, + predicates=predicates, + reflexive=True)) + # fraction of ancestors in common + pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) / + len(list(set(anc)))) + + id1 = curie2obj_id[term]['id'] + id2 = curie2obj_id[random_other_term]['id'] + + # calculate cosine sim + try: + cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) + except KeyError as e: + print(f"KeyError: {e}") + continue + + # if debugging + if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and + (cosine_sim > 0.85 or cosine_sim < 0.2)): + print(f"\nterm: {term} {(curie2obj_id[term]['label'])}," + f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])}," + f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}") + + results.append((term, random_other_term, pair_shared_anc, cosine_sim)) + pass + + # plot cosine similarity vs fraction of ancestors in common + # in matplotlib + df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim']) + + # plot with some alpha + sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5) + + # least squares fit + m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1) + # plot line + plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red') + # calculate r-squared value + r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2 + plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center', + va='center', transform=plt.gca().transAxes) + + plt.xlabel('Fraction of ancestors in common') + plt.ylabel('Cosine similarity') + # title = ontology name + plt.title(f'{ont}') + plt.show() + return {"rsquared": r2} diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 5f4a193..a375715 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -28,6 +28,7 @@ from curate_gpt.agents.dase_agent import DatabaseAugmentedStructuredExtraction from curate_gpt.agents.dragon_agent import DragonAgent from curate_gpt.agents.evidence_agent import EvidenceAgent +from curate_gpt.agents.subsumption_eval_agent import SubsumptionEvalAgent from curate_gpt.agents.summarization_agent import SummarizationAgent from curate_gpt.evaluation.dae_evaluator import DatabaseAugmentedCompletionEvaluator from curate_gpt.evaluation.evaluation_datamodel import StratifiedCollection, Task @@ -1623,111 +1624,34 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter curategpt subsumption -c obo_hp $db/hp.db """ - if not predicates: - predicates = [IS_A, PART_OF] - oak_adapter = get_adapter(ont) view = OntologyWrapper(oak_adapter=oak_adapter) db = ChromaDBAdapter(path, **kwargs) db.text_lookup = view.text_field + if not predicates: + predicates = [IS_A, PART_OF] + if model is None: warnings.warn("No model specified, using default model. Note that if you must" "the same model that was used to build the collection.") - # get all terms - if root_term is not None: - print(f"Using root term: {root_term} to select terms to compare.") - terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True)) - else: - terms = list(view.oak_adapter.all_entity_curies()) - if prefix is not None: - terms = [t for t in terms if t.startswith(prefix)] - if not terms: - raise ValueError(f"No terms found with prefix {prefix}") - - c = db.client.get_collection(collection, - embedding_function=db._embedding_function(model)) - - # build CURIE to object map - curie2obj_id = {} - for o in tqdm(list(view.objects())): - curie2obj_id[o['original_id']] = o - - # get embeddings to manually do cosine similarity - d = c.get(include=['embeddings']) - ids = d['ids'] - emb = d['embeddings'] - # make id2emb map - id2emb = {} - for i, id in tqdm(enumerate(ids), desc="Building id2emb map"): - id2emb[id] = emb[i] - - # choose num_terms pseudo-random terms, for each, choose another random term, then - # calculate fraction of ancestors in common, then calculate cosine similarity - random.seed(seed) - results = [] - for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): - anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True)) - - # choose random term to pair with - if choose_subsuming_terms: - # do not choose term itself (remove term from list of ancestors) - random_other_term = random.choice(list(set(anc) - set([term]))) - else: - random_other_term = random.choice(terms) - random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, - predicates=predicates, - reflexive=True)) - # fraction of ancestors in common - pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) / - len(list(set(anc)))) - - id1 = curie2obj_id[term]['id'] - id2 = curie2obj_id[random_other_term]['id'] - - # calculate cosine sim - try: - cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) - except KeyError as e: - print(f"KeyError: {e}") - continue - - # if debugging - if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and - (cosine_sim > 0.85 or cosine_sim < 0.2)): - print(f"\nterm: {term} {(curie2obj_id[term]['label'])}," - f" random_other_term: {random_other_term} {(curie2obj_id[random_other_term]['label'])}," - f" pair_shared_anc: {pair_shared_anc}, cosine_sim: {round(cosine_sim, 2)}") - - results.append((term, random_other_term, pair_shared_anc, cosine_sim)) - pass - - # plot cosine similarity vs fraction of ancestors in common - # in matplotlib - import matplotlib.pyplot as plt - import seaborn as sns - import pandas as pd - df = pd.DataFrame(results, columns=['term', 'random_other_term', 'pair_shared_anc', 'cosine_sim']) - - # plot with some alpha - sns.scatterplot(data=df, x='pair_shared_anc', y='cosine_sim', alpha=0.5) - - # least squares fit - m, b = np.polyfit(df['pair_shared_anc'], df['cosine_sim'], 1) - # plot line - plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red') - # calculate r-squared value - r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2 - plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center', - va='center', transform=plt.gca().transAxes) - - plt.xlabel('Fraction of ancestors in common') - plt.ylabel('Cosine similarity') - # title = ontology name - plt.title(f'{ont}') - plt.show() - pass + agent = SubsumptionEvalAgent(knowledge_source=db, + knowledge_source_collection=collection) + response = ( + agent.compare_cosine_sim_to_shared_ancestors(view=view, + db=db, + collection=collection, + ont=ont, + num_terms=num_terms, + choose_subsuming_terms=choose_subsuming_terms, + model=model, + prefix=prefix, + predicates=predicates, + root_term=root_term, + seed=seed, + **kwargs)) + click.echo(f"r-squared: {response.get('rsquared')}") @main.group() From 9f6bba5ea742871c7e7fccb9ec6666864f1d53de Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 15 Feb 2024 15:41:28 -0500 Subject: [PATCH 09/18] Refactor --- src/curate_gpt/agents/subsumption_eval_agent.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py index 60f8bee..81e6efc 100644 --- a/src/curate_gpt/agents/subsumption_eval_agent.py +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -30,7 +30,6 @@ class SubsumptionEvalAgent(BaseAgent): def compare_cosine_sim_to_shared_ancestors( self, view: OntologyWrapper, - db: ChromaDBAdapter, collection: str, ont: str, num_terms: int, @@ -50,7 +49,6 @@ def compare_cosine_sim_to_shared_ancestors( >>> print(response) """ - db = self.knowledge_source # get all terms if root_term is not None: @@ -63,8 +61,8 @@ def compare_cosine_sim_to_shared_ancestors( if not terms: raise ValueError(f"No terms found with prefix {prefix}") - c = db.client.get_collection(collection, - embedding_function=db._embedding_function(model)) + c = self.knowledge_source.client.get_collection(self.knowledge_source_collection, + embedding_function=self.knowledge_source._embedding_function(model)) # build CURIE to object map curie2obj_id = {} From f308558fb63001f34eaaa24d78f9c6147a2a21e6 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 15 Feb 2024 15:42:01 -0500 Subject: [PATCH 10/18] Refactor --- src/curate_gpt/agents/subsumption_eval_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py index 81e6efc..863b1b7 100644 --- a/src/curate_gpt/agents/subsumption_eval_agent.py +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -30,7 +30,6 @@ class SubsumptionEvalAgent(BaseAgent): def compare_cosine_sim_to_shared_ancestors( self, view: OntologyWrapper, - collection: str, ont: str, num_terms: int, choose_subsuming_terms: bool, From b657322db99a2229ff6a87f089a9ee58b0dfcbd0 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 15 Feb 2024 16:15:48 -0500 Subject: [PATCH 11/18] Tidy up, add documentation --- .../agents/subsumption_eval_agent.py | 56 ++++++++++++------- src/curate_gpt/cli.py | 12 ++-- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py index 863b1b7..f5021ec 100644 --- a/src/curate_gpt/agents/subsumption_eval_agent.py +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -7,11 +7,8 @@ import pandas as pd from dataclasses import dataclass - -from oaklib import get_adapter from tqdm import tqdm -from curate_gpt import ChromaDBAdapter from curate_gpt.agents.base_agent import BaseAgent from curate_gpt.wrappers.ontology import OntologyWrapper @@ -26,46 +23,67 @@ class SubsumptionEvalAgent(BaseAgent): cosine similarity between two entities and fraction of shared ancestors """ + view: OntologyWrapper = None + model: str = None + ont: str = None def compare_cosine_sim_to_shared_ancestors( self, - view: OntologyWrapper, - ont: str, num_terms: int, choose_subsuming_terms: bool, - model: str, prefix: str = None, predicates: list = None, root_term: str = None, seed: int = 42, **kwargs): """ - Summarize a list of objects. + compare cosine similarity between two entities and fraction of shared ancestors Example: - ------- - >>> print(response) + from oaklib.datamodels.vocabulary import IS_A, PART_OF + from oaklib.adapters import get_adapter + from oaklib.adapters.chroma import ChromaDBAdapter + from curate_gpt.agents.subsumption_eval_agent import SubsumptionEvalAgent + + oak_adapter = get_adapter("hp") + view = OntologyWrapper(oak_adapter=oak_adapter) + db = ChromaDBAdapter(path, **kwargs) + db.text_lookup = view.text_fie + predicates = [IS_A, PART_OF] + model = "openai:" # use the same model as was used in db for embeddings + agent = SubsumptionEvalAgent(knowledge_source=db, + knowledge_source_collection=collection, + view=view, + model=model, + ont="hp) + response = (agent.compare_cosine_sim_to_shared_ancestors(num_terms=num_terms, + choose_subsuming_terms=True, + prefix="HP:", + predicates=predicates, + root_term="HP:0000118", + print(response) """ - # get all terms if root_term is not None: print(f"Using root term: {root_term} to select terms to compare.") - terms = list(view.oak_adapter.descendants(root_term, predicates=predicates, reflexive=True)) + terms = list(self.view.oak_adapter.descendants(root_term, + predicates=predicates, + reflexive=True)) else: - terms = list(view.oak_adapter.all_entity_curies()) + terms = list(self.view.oak_adapter.all_entity_curies()) if prefix is not None: terms = [t for t in terms if t.startswith(prefix)] if not terms: raise ValueError(f"No terms found with prefix {prefix}") c = self.knowledge_source.client.get_collection(self.knowledge_source_collection, - embedding_function=self.knowledge_source._embedding_function(model)) + embedding_function=self.knowledge_source._embedding_function(self.model)) # build CURIE to object map curie2obj_id = {} - for o in tqdm(list(view.objects())): + for o in tqdm(list(self.view.objects())): curie2obj_id[o['original_id']] = o # get embeddings to manually do cosine similarity @@ -82,7 +100,7 @@ def compare_cosine_sim_to_shared_ancestors( random.seed(seed) results = [] for term in tqdm(random.sample(terms, num_terms), desc="Choosing terms to compare"): - anc = list(view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True)) + anc = list(self.view.oak_adapter.ancestors(term, predicates=predicates, reflexive=True)) # choose random term to pair with if choose_subsuming_terms: @@ -90,9 +108,9 @@ def compare_cosine_sim_to_shared_ancestors( random_other_term = random.choice(list(set(anc) - set([term]))) else: random_other_term = random.choice(terms) - random_term_ancs = list(view.oak_adapter.ancestors(random_other_term, - predicates=predicates, - reflexive=True)) + random_term_ancs = list(self.view.oak_adapter.ancestors(random_other_term, + predicates=predicates, + reflexive=True)) # fraction of ancestors in common pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) / len(list(set(anc)))) @@ -136,6 +154,6 @@ def compare_cosine_sim_to_shared_ancestors( plt.xlabel('Fraction of ancestors in common') plt.ylabel('Cosine similarity') # title = ontology name - plt.title(f'{ont}') + plt.title(f'{self.ont}') plt.show() return {"rsquared": r2} diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index a375715..91e1055 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1637,15 +1637,15 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter "the same model that was used to build the collection.") agent = SubsumptionEvalAgent(knowledge_source=db, - knowledge_source_collection=collection) + knowledge_source_collection=collection, + view=view, + model=model, + ont=ont) + response = ( - agent.compare_cosine_sim_to_shared_ancestors(view=view, - db=db, - collection=collection, - ont=ont, + agent.compare_cosine_sim_to_shared_ancestors( num_terms=num_terms, choose_subsuming_terms=choose_subsuming_terms, - model=model, prefix=prefix, predicates=predicates, root_term=root_term, From b79f1ec800c003280b2d10b2b97c1411785c71b9 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 16 Feb 2024 09:56:51 -0500 Subject: [PATCH 12/18] Write out image file --- src/curate_gpt/agents/subsumption_eval_agent.py | 6 ++++++ src/curate_gpt/cli.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py index f5021ec..00369c9 100644 --- a/src/curate_gpt/agents/subsumption_eval_agent.py +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -35,6 +35,7 @@ def compare_cosine_sim_to_shared_ancestors( predicates: list = None, root_term: str = None, seed: int = 42, + img_file_name: str = None, **kwargs): """ compare cosine similarity between two entities and fraction of shared ancestors @@ -155,5 +156,10 @@ def compare_cosine_sim_to_shared_ancestors( plt.ylabel('Cosine similarity') # title = ontology name plt.title(f'{self.ont}') + + # save to file + if img_file_name is not None: + plt.savefig(img_file_name) + plt.show() return {"rsquared": r2} diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 91e1055..78650ff 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -3,6 +3,7 @@ import gzip import json import logging +import os import sys import warnings from pathlib import Path @@ -1612,9 +1613,10 @@ def _text_lookup(obj: Dict): @click.option('--num_terms', required=False, default=1000, help='Number of term pairs to compare') @click.option('--choose_subsuming_terms', required=False, default=True, help='Whether to choose subsuming terms or just random terms') @click.option("--root_term", required=False, default=None, help="Root term to use for selecting terms to sample") +@click.option("--output_dir", required=False, default=None, help="Directory to write output to") @click.argument("ont") def subsumption_command(ont, path, collection, prefix, predicates, seed, num_terms, - choose_subsuming_terms, root_term, model, **kwargs): + choose_subsuming_terms, root_term, output_dir, model, **kwargs): """ Compare pairs of ontology terms (optionally where one subsumes the other) to determine whether similarity of LLM embeddings reflect subsumption relationships. @@ -1642,14 +1644,21 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter model=model, ont=ont) + if output_dir and not os.path.exists(output_dir): + os.mkdir(output_dir) + img_file_name = (os.path.join(output_dir, + f"cosine_sim_vs_shared_anc_{ont}_" + f"sub_{str(choose_subsuming_terms)}.png")) response = ( agent.compare_cosine_sim_to_shared_ancestors( num_terms=num_terms, - choose_subsuming_terms=choose_subsuming_terms, + choose_subsuming_terms= + choose_subsuming_terms, prefix=prefix, predicates=predicates, root_term=root_term, seed=seed, + img_file_name=img_file_name, **kwargs)) click.echo(f"r-squared: {response.get('rsquared')}") From b4a680ca9529819d319d4f7857a8a83366f81d0e Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 16 Feb 2024 10:01:25 -0500 Subject: [PATCH 13/18] Bug --- src/curate_gpt/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 78650ff..1769449 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -1646,7 +1646,7 @@ def subsumption_command(ont, path, collection, prefix, predicates, seed, num_ter if output_dir and not os.path.exists(output_dir): os.mkdir(output_dir) - img_file_name = (os.path.join(output_dir, + img_file_name = (os.path.join(output_dir if output_dir else "", f"cosine_sim_vs_shared_anc_{ont}_" f"sub_{str(choose_subsuming_terms)}.png")) response = ( From af7b171559460b12faa8b971e7b9786a0a80bd1f Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 23 Feb 2024 15:12:10 -0500 Subject: [PATCH 14/18] Tidy up subsumption code a bit --- src/curate_gpt/agents/subsumption_eval_agent.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/curate_gpt/agents/subsumption_eval_agent.py b/src/curate_gpt/agents/subsumption_eval_agent.py index 00369c9..0466c76 100644 --- a/src/curate_gpt/agents/subsumption_eval_agent.py +++ b/src/curate_gpt/agents/subsumption_eval_agent.py @@ -116,15 +116,17 @@ def compare_cosine_sim_to_shared_ancestors( pair_shared_anc = (len(set(anc).intersection(set(random_term_ancs))) / len(list(set(anc)))) - id1 = curie2obj_id[term]['id'] - id2 = curie2obj_id[random_other_term]['id'] + try: + id1 = curie2obj_id[term]['id'] + id2 = curie2obj_id[random_other_term]['id'] + except KeyError as e: + raise KeyError(f"KeyError retrieving item from curie2obj_id: {e}") # calculate cosine sim try: cosine_sim = np.dot(id2emb[id1], id2emb[id2]) / (np.linalg.norm(id2emb[id1]) * np.linalg.norm(id2emb[id2])) except KeyError as e: - print(f"KeyError: {e}") - continue + raise KeyError(f"KeyError retrieving item from id2emb: {e}") # if debugging if (logging.getLogger().getEffectiveLevel() == logging.DEBUG and @@ -149,13 +151,12 @@ def compare_cosine_sim_to_shared_ancestors( plt.plot(df['pair_shared_anc'], m*df['pair_shared_anc'] + b, color='red') # calculate r-squared value r2 = np.corrcoef(df['pair_shared_anc'], df['cosine_sim'])[0, 1]**2 - plt.text(0.1, 0.9, f"R-squared: {round(r2, 2)}", ha='center', + plt.text(0.2, 0.9, f"R-squared: {round(r2, 2)}", ha='center', va='center', transform=plt.gca().transAxes) plt.xlabel('Fraction of ancestors in common') plt.ylabel('Cosine similarity') - # title = ontology name - plt.title(f'{self.ont}') + plt.title(f'{self.ont.split(":")[-1]} subsuming: {choose_subsuming_terms}') # save to file if img_file_name is not None: From 8415f7b7ad36f521af273d16e3d3b15094bc2ad1 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 23 Feb 2024 16:27:43 -0500 Subject: [PATCH 15/18] Remove unused import --- src/curate_gpt/cli.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/curate_gpt/cli.py b/src/curate_gpt/cli.py index 1769449..e220704 100644 --- a/src/curate_gpt/cli.py +++ b/src/curate_gpt/cli.py @@ -20,8 +20,6 @@ from llm.cli import load_conversation from oaklib import get_adapter from pydantic import BaseModel -from tqdm import tqdm -import numpy as np from curate_gpt import ChromaDBAdapter, __version__ from curate_gpt.agents.chat_agent import ChatAgent, ChatResponse From 4fdf5161a5278416f889da4ffeb82f49dc06e3e1 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 23 Feb 2024 16:27:58 -0500 Subject: [PATCH 16/18] Add notebook for subsumption experiments --- notebooks/command-line/Subsumption-exp.ipynb | 157 +++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 notebooks/command-line/Subsumption-exp.ipynb diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb new file mode 100644 index 0000000..4bf57e9 --- /dev/null +++ b/notebooks/command-line/Subsumption-exp.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Subsumption experiments\n", + "\n", + "Experiments to investigate the relationship between cosine similarity of LLM embeddings and fraction of shared ancestors for pairs of ontology terms where one is subsumed by the other" + ], + "metadata": { + "collapsed": false + }, + "id": "11ab2c3cabc2f8d2" + }, + { + "cell_type": "raw", + "source": [ + "Index ontology terms without relationship info (just label and definition) " + ], + "metadata": { + "collapsed": false + }, + "id": "34719c597052c669" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[34mInstalling dependencies from lock file\u001B[39m\r\n", + "\r\n", + "No dependencies to install or update\r\n", + "\r\n", + "\u001B[39;1mInstalling\u001B[39;22m the current project: \u001B[36mcurate-gpt\u001B[39m (\u001B[39;1m0.0.0.post2.dev0+335f59e\u001B[39;22m)\u001B[1G\u001B[2K\u001B[39;1mInstalling\u001B[39;22m the current project: \u001B[36mcurate-gpt\u001B[39m (\u001B[32m0.0.0.post2.dev0+335f59e\u001B[39m)\r\n" + ] + } + ], + "source": [ + "!poetry install" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-23T20:28:22.995963Z", + "start_time": "2024-02-23T20:28:16.250674Z" + } + }, + "id": "c04ef952913dd018" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "ontologies = [\n", + " (\"ont_hp_norel\", \"hp\"),\n", + " (\"ont_mondo_norel\", \"mondo\"),\n", + " (\"ont_go_norel\", \"go\"),\n", + " (\"ont_foodon_norel\", \"foodon\"),\n", + " (\"ont_chebi_norel\", \"chebi\")\n", + "]\n", + "\n", + "for chr_db_collection, ontology in ontologies:\n", + " command = f\"!curategpt ontology index --index-fields label,definition -p stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n", + " print(command)\n", + " get_ipython().system(command)" + ], + "metadata": { + "collapsed": false + }, + "id": "15f7f0b0fcb8cc39" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: MONDO:0700096 to select terms to compare.\r\n", + "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n" + ] + } + ], + "source": [ + "# Mondo\n", + "!curategpt ontology subsumption --output_dir build --root_term MONDO:0700096 --prefix \"MONDO:\" -p ../../stagedb -c ont_mondo_norel sqlite:obo:mondo -m openai:\n", + "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term MONDO:0700096 --prefix \"MONDO:\" -p ../../stagedb -c ont_mondo_norel sqlite:obo:mondo -m openai:\n", + "\n", + "# HP\n", + "!curategpt ontology subsumption --output_dir build --root_term HP:0000118 --prefix \"HP:\" -p ../../stagedb -c ont_hp_norel sqlite:obo:hp -m openai:\n", + "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term HP:0000118 --prefix \"HP:\" -p ../../stagedb -c ont_hp_norel sqlite:obo:hp -m openai:\n", + "\n", + "# GO\n", + "!curategpt ontology subsumption --output_dir build --prefix \"GO:\" -p ../../stagedb -c ont_go_norel sqlite:obo:go -m openai:\n", + "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --prefix \"GO:\" -p ../../stagedb -c ont_go_norel sqlite:obo:go -m openai:\n", + "\n", + "# FOODON\n", + "!curategpt ontology subsumption --output_dir build --root_term FOODON:00002403 --prefix \"FOODON:\" -p ../../stagedb -c ont_foodon_norel sqlite:obo:foodon -m openai:\n", + "!curategpt ontology subsumption --output_dir build --choose_subsuming_terms False --root_term FOODON:00002403 --prefix \"FOODON:\" -p ../../stagedb -c ont_foodon_norel sqlite:obo:foodon -m openai:\n", + "\n", + "# CHEBI\n", + "!curategpt ontology subsumption --output_dir build --root_term CHEBI:59999 --prefix \"CHEBI:\" -p ../../stagedb -c ont_chebi_norel sqlite:obo:chebi -m openai:\n", + "!curategpt ontology subsumption --choose_subsuming_terms False --output_dir build --root_term CHEBI:59999 --prefix \"CHEBI:\" -p ../../stagedb -c ont_chebi_norel sqlite:obo:chebi -m openai:\n", + "\n", + "!open *png" + ], + "metadata": { + "collapsed": false, + "is_executing": true, + "ExecuteTime": { + "start_time": "2024-02-23T21:26:07.255874Z" + } + }, + "id": "3e2025479fee4a38" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "aa64889d863ebd6a" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e39f786700cf9eacf41bb10d076f2f796bdb8d0b Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 23 Feb 2024 16:29:02 -0500 Subject: [PATCH 17/18] Correct path to stagedb --- notebooks/command-line/Subsumption-exp.ipynb | 31 ++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb index 4bf57e9..426978c 100644 --- a/notebooks/command-line/Subsumption-exp.ipynb +++ b/notebooks/command-line/Subsumption-exp.ipynb @@ -64,7 +64,7 @@ "]\n", "\n", "for chr_db_collection, ontology in ontologies:\n", - " command = f\"!curategpt ontology index --index-fields label,definition -p stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n", + " command = f\"!curategpt ontology index --index-fields label,definition -p ../../stagedb -c {chr_db_collection} -m openai: sqlite:obo:{ontology}\"\n", " print(command)\n", " get_ipython().system(command)" ], @@ -86,7 +86,34 @@ "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", " warnings.warn(\r\n", "Using root term: MONDO:0700096 to select terms to compare.\r\n", - "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n" + "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 2024347.44it/s]\r\n", + "Building id2emb map: 42542it [00:00, 2673089.66it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 656.31it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.3702593848159516\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: MONDO:0700096 to select terms to compare.\r\n", + "100%|████████████████████████████████| 42542/42542 [00:00<00:00, 1939732.80it/s]\r\n", + "Building id2emb map: 42542it [00:00, 2888872.21it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 589.07it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.02021735186826436\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: HP:0000118 to select terms to compare.\r\n" ] } ], From c79a0f5f9cf5e6a833103e59fcb12267514029aa Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 26 Feb 2024 10:16:52 -0500 Subject: [PATCH 18/18] Add notebook for subsumption experiments --- notebooks/command-line/Subsumption-exp.ipynb | 133 ++++++++++++++++++- 1 file changed, 130 insertions(+), 3 deletions(-) diff --git a/notebooks/command-line/Subsumption-exp.ipynb b/notebooks/command-line/Subsumption-exp.ipynb index 426978c..289bd59 100644 --- a/notebooks/command-line/Subsumption-exp.ipynb +++ b/notebooks/command-line/Subsumption-exp.ipynb @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "outputs": [ { "name": "stdout", @@ -113,7 +113,134 @@ "\r\n", "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", " warnings.warn(\r\n", - "Using root term: HP:0000118 to select terms to compare.\r\n" + "Using root term: HP:0000118 to select terms to compare.\r\n", + "100%|████████████████████████████████| 29499/29499 [00:00<00:00, 1745102.59it/s]\r\n", + "Building id2emb map: 29499it [00:00, 2759255.45it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 816.61it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.5089792065670653\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: HP:0000118 to select terms to compare.\r\n", + "100%|████████████████████████████████| 29499/29499 [00:00<00:00, 1836214.03it/s]\r\n", + "Building id2emb map: 29499it [00:00, 2972153.40it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 830.03it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.13296081180577732\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "100%|████████████████████████████████| 75893/75893 [00:00<00:00, 1917268.36it/s]\r\n", + "Building id2emb map: 75893it [00:00, 2946273.30it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 625.99it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.45925148278084876\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "100%|████████████████████████████████| 75893/75893 [00:00<00:00, 1741995.45it/s]\r\n", + "Building id2emb map: 75893it [00:00, 2828264.25it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 574.20it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.1351592896407047\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: FOODON:00002403 to select terms to compare.\r\n", + "100%|████████████████████████████████| 32387/32387 [00:00<00:00, 1970479.61it/s]\r\n", + "Building id2emb map: 32387it [00:00, 2767914.17it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 720.01it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.5717097474555206\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: FOODON:00002403 to select terms to compare.\r\n", + "100%|████████████████████████████████| 32387/32387 [00:00<00:00, 1821436.65it/s]\r\n", + "Building id2emb map: 32387it [00:00, 2744205.65it/s]\r\n", + "Choosing terms to compare: 31%|███▋ | 308/1000 [00:00<00:00, 744.24it/s]\r\n", + "Traceback (most recent call last):\r\n", + " File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/agents/subsumption_eval_agent.py\", line 121, in compare_cosine_sim_to_shared_ancestors\r\n", + " id2 = curie2obj_id[random_other_term]['id']\r\n", + "KeyError: 'FOODON:03412687'\r\n", + "\r\n", + "During handling of the above exception, another exception occurred:\r\n", + "\r\n", + "Traceback (most recent call last):\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/bin/curategpt\", line 6, in \r\n", + " sys.exit(main())\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1157, in __call__\r\n", + " return self.main(*args, **kwargs)\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1078, in main\r\n", + " rv = self.invoke(ctx)\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1688, in invoke\r\n", + " return _process_result(sub_ctx.command.invoke(sub_ctx))\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1688, in invoke\r\n", + " return _process_result(sub_ctx.command.invoke(sub_ctx))\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 1434, in invoke\r\n", + " return ctx.invoke(self.callback, **ctx.params)\r\n", + " File \"/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/click/core.py\", line 783, in invoke\r\n", + " return __callback(*args, **kwargs)\r\n", + " File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/cli.py\", line 1651, in subsumption_command\r\n", + " agent.compare_cosine_sim_to_shared_ancestors(\r\n", + " File \"/Users/jtr4v/PythonProject/curate-gpt/src/curate_gpt/agents/subsumption_eval_agent.py\", line 123, in compare_cosine_sim_to_shared_ancestors\r\n", + " raise KeyError(f\"KeyError retrieving item from curie2obj_id: {e}\")\r\n", + "KeyError: \"KeyError retrieving item from curie2obj_id: 'FOODON:03412687'\"\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: CHEBI:59999 to select terms to compare.\r\n", + "100%|██████████████████████████████| 166855/166855 [00:00<00:00, 1683980.96it/s]\r\n", + "Building id2emb map: 166855it [00:00, 2504385.80it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 643.53it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.22513629243465821\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/pydantic/_internal/_fields.py:149: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\r\n", + "\r\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\r\n", + " warnings.warn(\r\n", + "Using root term: CHEBI:59999 to select terms to compare.\r\n", + "100%|██████████████████████████████| 166855/166855 [00:00<00:00, 1737491.17it/s]\r\n", + "Building id2emb map: 166855it [00:00, 2696423.70it/s]\r\n", + "Choosing terms to compare: 100%|███████████| 1000/1000 [00:01<00:00, 737.49it/s]\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "/Users/jtr4v/Library/Caches/pypoetry/virtualenvs/curate-gpt-Z25hmCIa-py3.9/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead\r\n", + " if pd.api.types.is_categorical_dtype(vector):\r\n", + "Figure(640x480)\r\n", + "r-squared: 0.0680209366780529\r\n", + "The file /Users/jtr4v/PythonProject/curate-gpt/notebooks/command-line/*png does not exist.\r\n" ] } ], @@ -142,8 +269,8 @@ ], "metadata": { "collapsed": false, - "is_executing": true, "ExecuteTime": { + "end_time": "2024-02-23T21:56:34.771347Z", "start_time": "2024-02-23T21:26:07.255874Z" } },