Merge pull request #91 from databricks/bge_v1_5_marketplace

Add fine-tuning example for bge-v1.5 from marketplace
databricks · Nov 14, 2023 · d67631f · d67631f
2 parents f40ae0e + 7c9cb76
commit d67631f
Show file tree

Hide file tree

Showing 7 changed files with 250 additions and 98 deletions.
diff --git a/llm-models/embedding/bge/README.md b/llm-models/embedding/bge/README.md
@@ -22,19 +22,18 @@ limitations under the License.
 
 | Model                                                                               | Language |              query instruction for retrieval\*              |
 |:------------------------------------------------------------------------------------|:--------:|:-----------------------------------------------------------:|
-| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en)                       | English  | `Represent this sentence for searching relevant passages: ` |
-| [BAAI/bge-base-en](https://huggingface.co/BAAI/bge-base-en)                         | English  | `Represent this sentence for searching relevant passages: ` |
-| [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en)                       | English  | `Represent this sentence for searching relevant passages: ` |
-| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh)                       | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
-| [BAAI/bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct) | Chinese  |                                                             |
-| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh)                         | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
-| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh)                       | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
+| [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)                       | English  | `Represent this sentence for searching relevant passages: ` |
+| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)                         | English  | `Represent this sentence for searching relevant passages: ` |
+| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)                       | English  | `Represent this sentence for searching relevant passages: ` |
+| [BAAI/bge-large-zh-v1.5](https://huggingface.co/BAAI/bge-large-zh-v1.5)                       | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
+| [BAAI/bge-base-zh-v1.5](https://huggingface.co/BAAI/bge-base-zh-v1.5)                         | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
+| [BAAI/bge-small-zh-v1.5](https://huggingface.co/BAAI/bge-small-zh-v1.5)                       | Chinese  |                    `为这个句子生成表示以用于检索相关文章：`                    |
 
 \*: If you need to retrieve **long** relevant passages for a **short** query (s2p retrieval task), add the instruction to the query; in other cases, no instruction is needed, so just use the original query directly. In all cases, **no instruction** needs to be added to passages.
 
 ## Example notebooks
-This folder contains the following examples for Llama 2 models:
-`
+This folder contains the following examples for BGE models:
+
 <!---
 <style>
 table th:first-of-type {
@@ -54,7 +53,7 @@ table th:nth-of-type(4) {
 
 |                      **File**                       |                                         **Description**                                          | **GPU Minimum Requirement** |
 |:---------------------------------------------------:|:------------------------------------------------------------------------------------------------:|:---------------------------:|
-|                 `01_load_inference`                 |    Environment setup and suggested configurations when inferencing BGE models on Databricks.     |         1xA10-24GB          |
-|            `02_mlflow_logging_inference`            | Save, register, and load BGE models with MLFlow, and create a Databricks model serving endpoint. |         1xA10-24GB          |
-|              `03_build_document_index`              |                        Build a vector store with faiss using BGE models.                         |         1xA10-24GB          |
-|              `04_fine_tune_embedding`               |                                       Fine-tune BGE models                                       |             N/A             |
+|                 `01_load_inference`                 |    Environment setup and suggested configurations when inferencing BGE models on Databricks.     |         1xT4          |
+|            `02_mlflow_logging_inference`            | Save, register, and load BGE models with MLFlow, and create a Databricks model serving endpoint. |         1xT4          |
+|              `03_build_document_index`              |                        Build a vector store with faiss using BGE models.                         |         1xT4          |
+|              `04_fine_tune_embedding`               |                                       Fine-tune BGE models.                                       |             1xT4             |
diff --git a/...edding/bge/bge-large/01_load_inference.py → ...g/bge/bge-large-v1.5/01_load_inference.py b/...edding/bge/bge-large/01_load_inference.py → ...g/bge/bge-large-v1.5/01_load_inference.py
@@ -1,24 +1,12 @@
 # Databricks notebook source
 # MAGIC %md
-# MAGIC # Run `bge-large-en` Embedding on Databricks
+# MAGIC # Run `bge-large-en-v1.5` Embedding on Databricks
 # MAGIC
-# MAGIC [bge-large-en (BAAI General Embedding) model](https://huggingface.co/BAAI/bge-large-en) can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. And it also can be used in vector database for LLMs.
+# MAGIC [bge-large-en-v1.5 (BAAI General Embedding) model](https://huggingface.co/BAAI/bge-large-en-v1.5) can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. It can also be used in vector databases for LLMs.
 # MAGIC
 # MAGIC Environment for this notebook:
-# MAGIC - Runtime: 13.3 GPU ML Runtime
-# MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure.
-# MAGIC
-
-# COMMAND ----------
-
-# MAGIC %pip install -U langchain==0.0.262
-# MAGIC dbutils.library.restartPython()
-
-# COMMAND ----------
-
-# MAGIC %md
-# MAGIC ## Inference
-# MAGIC The example in the model card should also work on Databricks with the same environment.
+# MAGIC - Runtime: 14.1 GPU ML Runtime
+# MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure
 
 # COMMAND ----------
 
@@ -32,7 +20,7 @@
 from sentence_transformers import SentenceTransformer, util
 
 
-model_name = "BAAI/bge-large-en"
+model_name = "BAAI/bge-large-en-v1.5"
 model = SentenceTransformer(model_name)
 
 
@@ -54,18 +42,20 @@
 # COMMAND ----------
 
 # MAGIC %md
-# MAGIC For s2p(short query to long passage) retrieval task, each short query should start with an instruction `Represent this sentence for searching relevant passages:`.
+# MAGIC For s2p (short query to long passage) retrieval task, each short query should start with an instruction `Represent this sentence for searching relevant passages:` for `bge` models.
 
 # COMMAND ----------
 
+instruction = "Represent this sentence for searching relevant passages: "
+
 queries = ["What type of organism is commonly used in preparation of foods such as cheese and yogurt?"]
 passages = [
   "Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.", 
   "Without Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere. The winds blow northwest to southeast or the reverse in the southern hemisphere.",
   "Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (ΔHvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (ΔHsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization."
   ]
-instruction = "Represent this sentence for searching relevant passages: "
 query_with_instruction = [instruction+q for q in queries]
+
 q_embeddings = model.encode(query_with_instruction, normalize_embeddings=True)
 p_embeddings = model.encode(passages, normalize_embeddings=True)
 
@@ -95,7 +85,3 @@
 
 scores = util.cos_sim(q_embeddings, p_embeddings)
 print("Cosine-Similarity scores:", scores)
-
-# COMMAND ----------
-
-
diff --git a/.../bge-large/02_mlflow_logging_inference.py → ...large-v1.5/02_mlflow_logging_inference.py b/.../bge-large/02_mlflow_logging_inference.py → ...large-v1.5/02_mlflow_logging_inference.py
@@ -1,28 +1,23 @@
 # Databricks notebook source
 # MAGIC %md
-# MAGIC # Manage `bge-large-en` model with MLFlow on Databricks
+# MAGIC # Manage `bge-large-en-v1.5` model with MLFlow on Databricks
 # MAGIC
-# MAGIC [bge-large-en (BAAI General Embedding) model](https://huggingface.co/BAAI/bge-large-en) can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. And it also can be used in vector database for LLMs.
+# MAGIC In this example, we demonstrate how to log the [bge-large-en-v1.5 model](https://huggingface.co/BAAI/bge-large-en-v1.5) to MLflow with the `sentence_transformers` flavor, manage the model with Unity Catalog, and create a model serving endpoint.
 # MAGIC
 # MAGIC Environment for this notebook:
-# MAGIC - Runtime: 13.3 GPU ML Runtime
-# MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure.
+# MAGIC - Runtime: 14.1 GPU ML Runtime
+# MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure
 # MAGIC
 
 # COMMAND ----------
 
-# MAGIC %pip install --upgrade "mlflow-skinny[databricks]>=2.4.1"
-# MAGIC dbutils.library.restartPython()
-
-# COMMAND ----------
-
 # MAGIC %md
 # MAGIC ## Log the model to MLFlow
 
 # COMMAND ----------
 
 from sentence_transformers import SentenceTransformer
-model_name = "BAAI/bge-large-en"
+model_name = "BAAI/bge-large-en-v1.5"
 
 model = SentenceTransformer(model_name)
 
@@ -62,9 +57,8 @@
 # COMMAND ----------
 
 # Register model to Unity Catalog
-# This may take 2.2 minutes to complete
 
-registered_name = "models.default.bge-large-en" # Note that the UC model name follows the pattern <catalog_name>.<schema_name>.<model_name>, corresponding to the catalog, schema, and registered model name
+registered_name = "models.default.bge_large_en_v1_5" # Note that the UC model name follows the pattern <catalog_name>.<schema_name>.<model_name>, corresponding to the catalog, schema, and registered model name
 result = mlflow.register_model(
     "runs:/"+run.info.run_id+"/bge-embedding",
     registered_name,
@@ -129,7 +123,7 @@
       "name": f'{model_version.name.replace(".", "_")}_{model_version.version}',
       "model_name": model_version.name,
       "model_version": model_version.version,
-      "workload_type": "GPU_MEDIUM",
+      "workload_type": "GPU_SMALL",
       "workload_size": "Small",
       "scale_to_zero_enabled": "False"
     }]

diff --git a/...e/bge-large/03_build_document_Index.py.py → ...bge-large-v1.5/03_build_document_Index.py b/...e/bge-large/03_build_document_Index.py.py → ...bge-large-v1.5/03_build_document_Index.py
@@ -1,12 +1,12 @@
 # Databricks notebook source
 # MAGIC %md
 # MAGIC
-# MAGIC # Build vector database with `bge-large-en`
+# MAGIC # Build vector database with `bge-large-en-v1.5`
 # MAGIC
-# MAGIC This notebook demostrates how to build a vector store with [faiss](https://github.com/facebookresearch/faiss) using [bge-large-en model](https://huggingface.co/BAAI/bge-large-en).
+# MAGIC This notebook demonstrates how to build a vector store with [faiss](https://github.com/facebookresearch/faiss) using [bge-large-en-v1.5 model](https://huggingface.co/BAAI/bge-large-en-v1.5).
 # MAGIC
 # MAGIC Environment for this notebook:
-# MAGIC - Runtime: 13.3 GPU ML Runtime
+# MAGIC - Runtime: 14.1 GPU ML Runtime
 # MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure.
 
 # COMMAND ----------
@@ -16,7 +16,7 @@
 
 # COMMAND ----------
 
-# MAGIC %pip install langchain==0.0.262 faiss-gpu==1.7.2
+# MAGIC %pip install faiss-gpu==1.7.2
 # MAGIC dbutils.library.restartPython()
 
 # COMMAND ----------
@@ -95,7 +95,7 @@ def get_chunks(text):
 
 text_inputs = text_df["chunks"].to_list()
 
-model_name = "BAAI/bge-large-en"
+model_name = "BAAI/bge-large-en-v1.5"
 model_kwargs = {'device': 'cuda'}
 encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
 model = HuggingFaceBgeEmbeddings(
@@ -125,7 +125,3 @@ def get_chunks(text):
 # COMMAND ----------
 
 vector_store.save_local(folder_path="/dbfs/peft-doc-embed/vector_store")
-
-# COMMAND ----------
-
-
diff --git a/...g/bge/bge-large/04_fine_tune_embedding.py → .../bge-large-v1.5/04_fine_tune_embedding.py b/...g/bge/bge-large/04_fine_tune_embedding.py → .../bge-large-v1.5/04_fine_tune_embedding.py
@@ -1,13 +1,13 @@
 # Databricks notebook source
 # MAGIC %md
 # MAGIC
-# MAGIC # Fine-Tune `bge-large-en` with Sentence Transformers
+# MAGIC # Fine-Tune `bge-large-en-v1.5` with Sentence Transformers
 # MAGIC
-# MAGIC This notebook demostrates how to fine tune [bge-large-en model](https://huggingface.co/BAAI/bge-large-en).
+# MAGIC This notebook demonstrates how to fine-tune [bge-large-en-v1.5 model](https://huggingface.co/BAAI/bge-large-en-v1.5).
 # MAGIC
 # MAGIC Environment for this notebook:
-# MAGIC - Runtime: 13.3 GPU ML Runtime
-# MAGIC - Instance: `g5.xlarge` on AWS or `Standard_NV36ads_A10_v5` on Azure.
+# MAGIC - Runtime: 14.1 GPU ML Runtime
+# MAGIC - Instance: `g4dn.xlarge` on AWS or `Standard_NC4as_T4_v3` on Azure
 
 # COMMAND ----------
 
@@ -16,7 +16,7 @@
 
 # COMMAND ----------
 
-output_model_path = "/dbfs/fine_tuned_bge_model"
+output_model_path = "/dbfs/fine_tuned_bge_v1_5_model"
 
 # COMMAND ----------
 
@@ -49,7 +49,7 @@
 # MAGIC %md
 # MAGIC ### Data preprocessing
 # MAGIC
-# MAGIC Convert the examples into InputExample's.
+# MAGIC Convert the examples into `InputExample`s.
 
 # COMMAND ----------
 
@@ -63,7 +63,7 @@ def create_dataset_for_multiple_loss(train_dataset):
 
     for text in texts:
       train_examples.append(InputExample(texts=[query, text]))
-  return train_example
+  return train_examples
 
 train_examples = create_dataset_for_multiple_loss(dataset)
 
@@ -78,7 +78,7 @@ def create_dataset_for_multiple_loss(train_dataset):
 # COMMAND ----------
 
 from sentence_transformers import SentenceTransformer
-model = SentenceTransformer('BAAI/bge-large-en')
+model = SentenceTransformer('BAAI/bge-large-en-v1.5')
 
 # COMMAND ----------
 
@@ -142,48 +142,28 @@ def predict(self, context, model_input):
 
 # COMMAND ----------
 
-# Register model
-# This may take 1 minutes to complete
-
-registered_name = "bge-embedding-model"
-
-
-result = mlflow.register_model(
-    "runs:/"+run.info.run_id+"/model",
-    registered_name,
-)
-
-# COMMAND ----------
-
 # MAGIC %md
-# MAGIC
-# MAGIC ## Test logged model
-# MAGIC
-# MAGIC The below code assumes that it is run in a separate notebook to avoid CUDA OOM.
-
-# COMMAND ----------
-
-dbutils.library.restartPython()
+# MAGIC Run model inference with the model logged in MLFlow.
 
 # COMMAND ----------
 
 import mlflow
 import pandas as pd
 
-registered_name = "bge-embedding-model"
+# Load model as a PyFuncModel.
+run_id = run.info.run_id
+logged_model = f"runs:/{run_id}/model"
 
-loaded_model = mlflow.pyfunc.load_model(f"models:/{registered_name}/1")
+loaded_model = mlflow.pyfunc.load_model(logged_model)
 
-# COMMAND ----------
-
-import pandas as pd
 # Predict on a Pandas DataFrame.
 test_df = pd.DataFrame(['London has 9,787,426 inhabitants at the 2011 census',
               'London is known for its finacial district'], columns=["text"])
 
 loaded_model.predict(test_df)
 
 # COMMAND ----------
+
 # If you need to search the long relevant passages to a short query,
 # you need to add the instruction `Represent this sentence for searching relevant passages:` to the query
 test_df = pd.DataFrame(['London has 9,787,426 inhabitants at the 2011 census',