monarch-initiative · lucinvitae · Feb 7, 2024 · Feb 8, 2024
diff --git a/.gitignore b/.gitignore
@@ -139,4 +139,8 @@ dmypy.json
 stagedb/*
 *.rej
 tests/input/dbs/go-nucleus-chroma/*/*.bin
-tests/output/*
+tests/output/*
+
+# Local credential/settings overrides (e.g. etc/azure_custom.toml) and IDE metadata
+**_custom.*
+.idea/
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ subset of the functionality of the local app)
 
 You will first need to [install Poetry](https://python-poetry.org/docs/#installation).
 
-Then clone this repo 
+Then clone this repo
 
 ```
 git clone https://github.com/monarch-initiative/curate-gpt.git
@@ -243,7 +243,7 @@ label: mesenchymal stem cell of the apical papilla
 
 ### All-by-all comparisons
 
-You can compare all objects in one collection 
+You can compare all objects in one collection
 
 `curategpt all-by-all --threshold 0.80 -c ont_hp -X ont_mp --ids-only -t csv > ~/tmp/allxall.mp.hp.csv`
 
@@ -271,3 +271,10 @@ HP:5200134,Jumping,MP:0001401,jumpy,0.9011393233129765
 Note that CurateGPT has a separate component for using an LLM to evaluate candidate matches (see also https://arxiv.org/abs/2310.03666); this is
 not enabled by default, this would be expensive to run for a whole ontology.
 
+## Azure Support
+
+To use CurateGPT with Azure OpenAI Service, a few additional steps are required.
+
+1. Copy the settings template: `cp etc/azure.toml etc/azure_custom.toml`
+2. Fill out the settings for the chat and embedding model deployments in `etc/azure_custom.toml`.
+3. Set the environment variable `USE_AZURE=true` when running the code.
diff --git a/etc/azure.toml b/etc/azure.toml
@@ -0,0 +1,13 @@
+[chat_model]
+api_key = ""
+api_version = ""
+base_url = ""
+deployment_name = ""
+model_name = ""
+
+[embedding_model]
+api_key = ""
+api_version = ""
+base_url = ""
+deployment_name = ""
+model_name = ""
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ importlib-metadata = ">=6"
 oaklib = ">=0.5.15"
 beautifulsoup4 = ">=4.8.0"
 streamlit = ">=1.22.0"
-openai = ">=0.27.7"
+openai = "^1.11"
 wikipedia = ">=1.4.0"
 google-search-results = ">=2.4.2"
 chromadb = "^0.4.22"
@@ -24,7 +24,7 @@ pymongo = ">=4.4.1"
 linkml-runtime = "^1.6.3"
 python-ulid = "^1.1.0"
 sqlite-utils = "^3.34"
-gpt4all = "^1.0.8"
+gpt4all = "^2.0.1"
 httpx = "^0.24.1"
 eutils = "^0.6.0"
 matplotlib = "^3.7.2"
@@ -42,11 +42,12 @@ textract = "1.5.0"
 jsonpath-ng = "^1.5.3"
 pygithub = "^1.59.1"
 jsonlines = "^3.1.0"
-llm = "^0.12"
-llm-gpt4all = "^0.1.1"
+llm = "^0.13"
+llm-gpt4all = "^0.3"
 gspread = "^5.10.0"
 defusedxml = "^0.7.1"
 click-default-group = "^1.2.4"
+toml = "^0.10.2"
 
 [tool.poetry.dev-dependencies]
 pytest = ">=7.1.2"
@@ -129,4 +130,3 @@ reverse_relative = true
 [build-system]
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
 build-backend = "poetry_dynamic_versioning.backend"
-
diff --git a/src/curate_gpt/extract/openai_extractor.py b/src/curate_gpt/extract/openai_extractor.py
@@ -2,12 +2,14 @@
 import json
 import logging
 from dataclasses import dataclass
-from typing import List
+from typing import List, Union
 
-import openai
 import yaml
+from openai import OpenAI
+from openai.lib.azure import AzureOpenAI
 
 from curate_gpt.extract.extractor import AnnotatedObject, Extractor
+from curate_gpt.utils.azure import USE_AZURE, get_azure_settings
 
 FUNC_NAME = "extract_data"
 
@@ -26,6 +28,9 @@ class OpenAIExtractor(Extractor):
     # conversation: List[Dict[str, Any]] = None
     # conversation_mode: bool = False
 
+    def __init__(self, use_azure: bool = USE_AZURE):
+        """Create the extractor and eagerly build its chat-completions client.
+
+        :param use_azure: when true, the client is an ``AzureOpenAI`` instance
+            configured from the Azure settings file; otherwise a plain
+            ``OpenAI`` client. Defaults to the imported ``USE_AZURE`` flag.
+
+        NOTE(review): this does not call ``super().__init__()`` -- confirm that
+        the ``Extractor`` base class needs no initialisation of its own.
+        """
+        client = self.get_client(use_azure)
+        self._client = client
+
     def functions(self):
         return [
             {
@@ -90,7 +95,7 @@ def extract(
             }
         )
         print(yaml.dump(messages))
-        response = openai.ChatCompletion.create(
+        response = self._client.chat.completions.create(
             model=self.model,
             functions=self.functions(),
             messages=messages,
@@ -116,3 +121,17 @@ def extract(
                 raise e
             obj = {}
         return AnnotatedObject(object=obj)
+
+    def get_client(self, use_azure: bool, **kwargs) -> Union[OpenAI, AzureOpenAI]:
+        """Construct the OpenAI SDK client used for chat completions.
+
+        :param use_azure: build an ``AzureOpenAI`` client when true, a plain
+            ``OpenAI`` client otherwise.
+        :param kwargs: extra keyword arguments forwarded to the client constructor.
+        :return: the instantiated client.
+        """
+        if not use_azure:
+            return OpenAI(**kwargs)
+        azure_cfg = get_azure_settings()["chat_model"]
+        # Values from the settings file replace any same-named caller kwargs.
+        azure_kwargs = {
+            **kwargs,
+            "api_version": azure_cfg["api_version"],
+            "azure_endpoint": azure_cfg["base_url"],
+            "api_key": azure_cfg["api_key"],
+            "azure_deployment": azure_cfg["deployment_name"],
+        }
+        return AzureOpenAI(**azure_kwargs)
diff --git a/src/curate_gpt/store/chromadb_adapter.py b/src/curate_gpt/store/chromadb_adapter.py
@@ -26,6 +26,7 @@
     CollectionMetadata,
     DBAdapter,
 )
+from curate_gpt.utils.azure import USE_AZURE, get_azure_settings
 from curate_gpt.utils.vector_algorithms import mmr_diversified_search
 
 logger = logging.getLogger(__name__)
@@ -129,10 +130,7 @@ def _embedding_function(self, model: str = None) -> EmbeddingFunction:
         if model is None:
             raise ValueError("Model must be specified")
         if model.startswith("openai:"):
-            return embedding_functions.OpenAIEmbeddingFunction(
-                api_key=os.environ.get("OPENAI_API_KEY"),
-                model_name="text-embedding-ada-002",
-            )
+            return get_openai_embedding_function()
         return embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model)
 
     def insert(
@@ -567,3 +565,20 @@ def dump_then_load(self, collection: str = None, target: DBAdapter = None):
                 batched_obj[k] = result[k][i : i + batch_size]
             target_collection_obj.add(**batched_obj)
             i += batch_size
+
+
+def get_openai_embedding_function(use_azure: bool = USE_AZURE):
+    """Return the chromadb embedding function backed by OpenAI or Azure OpenAI.
+
+    :param use_azure: when true, configure the embedding function from the
+        ``embedding_model`` table of the Azure settings; otherwise use the
+        ``OPENAI_API_KEY`` environment variable with ``text-embedding-ada-002``.
+    :return: an ``OpenAIEmbeddingFunction`` instance.
+    """
+    if not use_azure:
+        return embedding_functions.OpenAIEmbeddingFunction(
+            api_key=os.environ.get("OPENAI_API_KEY"),
+            model_name="text-embedding-ada-002",
+        )
+    azure_cfg = get_azure_settings()["embedding_model"]
+    azure_kwargs = {
+        "api_key": azure_cfg["api_key"],
+        "model_name": azure_cfg["model_name"],
+        "api_base": azure_cfg["base_url"],
+        "api_type": "azure",
+        "api_version": azure_cfg["api_version"],
+        "deployment_id": azure_cfg["deployment_name"],
+    }
+    return embedding_functions.OpenAIEmbeddingFunction(**azure_kwargs)
diff --git a/src/curate_gpt/utils/azure.py b/src/curate_gpt/utils/azure.py
@@ -0,0 +1,43 @@
+import functools
+import logging
+import os
+from pathlib import Path
+from typing import TypedDict
+
+import toml
+
+# Working directory captured once at import time; used as the default base
+# directory when locating the ``etc/`` settings folder.
+CWD = Path.cwd()
+# Optional explicit path to a settings file; when set it takes precedence over
+# the files looked up under ``etc/``.
+SETTINGS_PATH = os.getenv("AZURE_SETTINGS_PATH")
+# Base name (without extension) of the settings files under ``etc/``.
+SETTINGS_NAME = os.getenv("AZURE_SETTINGS_NAME", "azure")
+# Azure is opt-in. The flag is parsed case-insensitively and ignores
+# surrounding whitespace so that e.g. ``USE_AZURE=True`` is not silently
+# treated as disabled.
+USE_AZURE = os.getenv("USE_AZURE", "false").strip().lower() in {"1", "true", "yes"}
+
+
+def read_toml(path: Path) -> dict:
+    """Parse the TOML file at ``path`` and return its contents as a dict.
+
+    :param path: location of the TOML file to read.
+    :return: the parsed document.
+    :raises OSError: if the file cannot be opened.
+    :raises toml.TomlDecodeError: if the file is not valid TOML.
+    """
+    # TOML documents are UTF-8 by specification, so do not rely on the
+    # platform's locale encoding when opening the file.
+    with path.open(encoding="utf-8") as cf:
+        return toml.load(cf)
+
+
+class ModelSettings(TypedDict):
+    """Connection settings for a single Azure OpenAI model deployment.
+
+    The keys mirror the entries of one table in the Azure settings TOML file.
+    """
+
+    api_key: str  # secret used to authenticate against the endpoint
+    api_version: str  # API version string passed through to the client
+    base_url: str  # passed to the SDK as the Azure endpoint / API base
+    deployment_name: str  # name of the Azure deployment to call
+    model_name: str  # model name (read when building the embedding function)
+
+
+class AzureSettings(TypedDict):
+    """Top-level layout of the Azure settings file: one table per model role."""
+
+    chat_model: ModelSettings  # read when building the chat-completions client
+    embedding_model: ModelSettings  # read when building the embedding function
+
+
+@functools.lru_cache(maxsize=1)
+def get_azure_settings(base_dir: Path = CWD) -> AzureSettings:
+    """Load the Azure OpenAI settings, caching the most recent result.
+
+    The settings file is chosen in this order:
+
+    1. the file named by the ``AZURE_SETTINGS_PATH`` environment variable;
+    2. ``<base_dir>/etc/<name>_custom.toml`` if it exists;
+    3. ``<base_dir>/etc/<name>.toml``;
+
+    where ``<name>`` comes from ``AZURE_SETTINGS_NAME`` (default ``azure``).
+
+    :param base_dir: directory containing the ``etc`` folder; defaults to the
+        working directory captured when this module was imported.
+    :return: the parsed settings, or an empty dict if the chosen file does not
+        exist (a warning is logged in that case).
+    """
+    if SETTINGS_PATH:
+        settings_path = Path(SETTINGS_PATH)
+    else:
+        etc_dir = base_dir / "etc"
+        settings_override = etc_dir / f"{SETTINGS_NAME}_custom.toml"
+        default_settings = etc_dir / f"{SETTINGS_NAME}.toml"
+        # A local ``*_custom.toml`` wins over the checked-in template.
+        settings_path = settings_override if settings_override.is_file() else default_settings
+    if not settings_path.is_file():
+        # Keep the best-effort contract (empty settings) but make the
+        # misconfiguration visible instead of surfacing later as a bare KeyError.
+        logging.getLogger(__name__).warning(
+            "Azure settings file not found: %s", settings_path
+        )
+        return {}
+    return read_toml(settings_path)