Commit

Add vector search with embedding generation workload (#232)
* Add vector search with embedding generation workload
Signed-off-by: Vesa Pehkonen <[email protected]>

* Add vector search with embedding generation workload
Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated README.md with the license text.

Signed-off-by: Vesa Pehkonen <[email protected]>

* - Changed the workload from vectorsearch_embedding to semantic_search.
- Changed the dataset from MS MARCO to trec-covid.
- Moved the benchmark task runners DeletePipeline, DeleteMlModel, RegisterMlModel and DeployMlModel to the OS-benchmark repo.

Signed-off-by: Vesa Pehkonen <[email protected]>

* - Renamed the workload from semantic_search to treccovid_semantic_search.
- Added sample output for treccovid_semantic_search.
- Added a description of the test procedure.
- Simplified the treccovid_semantic_search workload configuration.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated the parameters of the treccovid workload.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Added files.txt to the treccovid workload.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated the documents URL for treccovid_semantic_search.

Signed-off-by: Vesa Pehkonen <[email protected]>

---------

Signed-off-by: Vesa Pehkonen <[email protected]>
vpehkone authored Jul 24, 2024
1 parent b7ff271 commit 417170f
Showing 8 changed files with 596 additions and 0 deletions.
261 changes: 261 additions & 0 deletions treccovid_semantic_search/README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions treccovid_semantic_search/files.txt
@@ -0,0 +1,2 @@
documents.json.bz2
queries.json.bz2
36 changes: 36 additions & 0 deletions treccovid_semantic_search/index.json
@@ -0,0 +1,36 @@
{
  "settings": {
    {%- if number_of_shards is defined %}
    "index.number_of_shards": {{number_of_shards}},
    {%- endif %}
    {%- if number_of_replicas is defined %}
    "index.number_of_replicas": {{number_of_replicas}},
    {%- endif %}
    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
    "index.knn": true,
    "default_pipeline": "nlp-ingest-pipeline"
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "text"
      },
      "passage_embedding": {
        "type": "knn_vector",
        "dimension": {{dimensions | default(768)}},
        "method": {
          {%- if engine is defined %}
          "engine": "{{engine}}",
          {%- endif %}
          "space_type": "{{space_type | default('l2')}}",
          "name": "{{method | default('hnsw')}}",
          "parameters": {}
        }
      },
      "text": {
        "type": "text"
      }
    }
  }
}
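
Because index.json is a Jinja2 template, it can be rendered outside the benchmark to preview the final index body. A minimal sketch, assuming the jinja2 package is installed and the file sits at the path shown below; the parameter values are illustrative:

    # Render index.json with sample parameters and pretty-print the result.
    import json
    from jinja2 import Template

    with open("treccovid_semantic_search/index.json") as f:
        template = Template(f.read())

    rendered = template.render(
        number_of_shards=1,
        number_of_replicas=0,
        dimensions=768,
        engine="lucene",   # optional: omitting it drops the "engine" key entirely
        space_type="l2",
        method="hnsw",
    )
    print(json.dumps(json.loads(rendered), indent=2))
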
72 changes: 72 additions & 0 deletions treccovid_semantic_search/operations/default.json
@@ -0,0 +1,72 @@
{
  "name": "delete-ingest-pipeline",
  "operation-type": "delete-pipeline",
  "id": "nlp-ingest-pipeline"
},
{
  "name": "create-ingest-pipeline",
  "operation-type": "put-pipeline",
  "param-source": "create-ingest-pipeline",
  "id": "nlp-ingest-pipeline",
  "body": {
    "description": "An NLP ingest pipeline",
    "processors": [
      {
        "text_embedding": {
          "model_id": "",
          "field_map": {
            "text": "passage_embedding"
          }
        }
      }
    ]
  }
},
{
  "name": "index-append",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(100)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "wait-until-merges-finish",
  "operation-type": "index-stats",
  "index": "_all",
  "condition": {
    "path": "_all.total.merges.current",
    "expected-value": 0
  },
  "retry-until-success": true,
  "include-in-reporting": false
},
{
  "name": "default",
  "operation-type": "search",
  "body": {
    "query": {
      "match_all": {}
    }
  }
},
{
  "name": "semantic-search",
  "operation-type": "search",
  "variable-queries": {{num_variable_queries | default(0)}},
  "param-source": "semantic-search-source",
  "body": {
    "_source": {
      "excludes": [
        "passage_embedding"
      ]
    },
    "query": {
      "neural": {
        "passage_embedding": {
          "query_text": "what types of rapid testing for Covid-19 have been developed?",
          "model_id": "",
          "k": {{k | default(10)}}
        }
      }
    }
  }
}
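
For reference, this is roughly the request the semantic-search operation issues once the placeholders are filled in. A sketch using the opensearch-py client; the host and the "my-model-id" value are assumptions (at run time the workload injects the real id from model_id.json):

    # Issue the same neural query by hand with opensearch-py.
    from opensearchpy import OpenSearch

    client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])

    response = client.search(
        index="treccovid",
        body={
            "_source": {"excludes": ["passage_embedding"]},
            "query": {
                "neural": {
                    "passage_embedding": {
                        "query_text": "what types of rapid testing for Covid-19 have been developed?",
                        "model_id": "my-model-id",  # placeholder for the deployed model id
                        "k": 10,
                    }
                }
            },
        },
    )
    for hit in response["hits"]["hits"]:
        print(hit["_score"], hit["_source"]["text"][:80])
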
112 changes: 112 additions & 0 deletions treccovid_semantic_search/test_procedures/default.json
@@ -0,0 +1,112 @@
{
  "name": "index-merge-search",
  "description": "Indexes the corpus with vector embeddings and then runs queries with vector embeddings.",
  "default": true,
  "schedule": [
    {
      "name": "cluster-settings",
      "operation": {
        "operation-type": "put-settings",
        "body": {
          "persistent": {
            "plugins": {
              "ml_commons": {
                "only_run_on_ml_node": "false",
                "native_memory_threshold": "99",
                "allow_registering_model_via_local_file": "true",
                "allow_registering_model_via_url": "true"
              }
            }
          }
        }
      }
    },
    {
      "operation": "delete-index"
    },
    {
      "operation": "delete-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "delete-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}"
      }
    },
    {
      "operation": {
        "operation-type": "register-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}",
        "model-version": "{{ model_version | default('1.0.1') }}",
        "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
        "model-config-file": "{{ model_config_file | default('') }}"
      }
    },
    {
      "operation": "deploy-ml-model"
    },
    {
      "operation": "create-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "create-index",
        "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} {
          "index.refresh_interval": "5s",
          "index.translog.flush_threshold_size": "1g"
        }{%- endif %}
      }
    },
    {
      "name": "check-cluster-health",
      "operation": {
        "operation-type": "cluster-health",
        "index": "treccovid",
        "request-params": {
          "wait_for_status": "{{cluster_health | default('green')}}",
          "wait_for_no_relocating_shards": "true"
        },
        "retry-until-success": true
      }
    },
    {
      "operation": "index-append",
      "warmup-time-period": 60,
      "clients": {{bulk_indexing_clients | default(1)}},
      "ignore-response-error-level": "{{error_level | default('non-fatal')}}"
    },
    {
      "name": "refresh-after-index",
      "operation": "refresh"
    },
    {
      "operation": {
        "operation-type": "force-merge",
        "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
        "max-num-segments": {{ force_merge_max_num_segments | tojson }}
        {%- endif %}
      }
    },
    {
      "name": "refresh-after-force-merge",
      "operation": "refresh"
    },
    {
      "operation": "wait-until-merges-finish"
    },
    {
      "operation": "default",
      "warmup-iterations": {{warmup_iterations | default(500) | tojson}},
      "iterations": {{iterations | default(500) | tojson}},
      "target-throughput": {{target_throughput | default(100) | tojson}},
      "clients": {{search_clients | default(1)}}
    },
    {
      "operation": "semantic-search",
      "warmup-iterations": {{warmup_iterations | default(100) | tojson}},
      "iterations": {{iterations | default(100) | tojson}},
      "target-throughput": {{target_throughput | default(10) | tojson}},
      "clients": {{search_clients | default(1)}}
    }
  ]
}
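
The Jinja parameters referenced in the schedule (bulk_indexing_clients, warmup_iterations, target_throughput, and so on) can be overridden at run time. A minimal sketch of a params file that could be passed to OSB via --workload-params; the file name and the values are illustrative, not defaults from this commit:

    {
      "bulk_size": 200,
      "bulk_indexing_clients": 2,
      "num_variable_queries": 100,
      "warmup_iterations": 100,
      "iterations": 200,
      "target_throughput": 20,
      "search_clients": 2,
      "k": 10
    }
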
32 changes: 32 additions & 0 deletions treccovid_semantic_search/workload.json
@@ -0,0 +1,32 @@
{% import "benchmark.helpers" as benchmark with context %}

{
  "version": 2,
  "description": "Trec-Covid is a dataset of documents about COVID-19.",
  "indices": [
    {
      "name": "treccovid",
      "body": "index.json"
    }
  ],
  "corpora": [
    {
      "name": "treccovid",
      "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
      "documents": [
        {
          "source-file": "documents.json.bz2",
          "document-count": 129192,
          "compressed-bytes": 51187469,
          "uncompressed-bytes": 211980208
        }
      ]
    }
  ],
  "operations": [
    {{ benchmark.collect(parts="operations/*.json") }}
  ],
  "test_procedures": [
    {{ benchmark.collect(parts="test_procedures/*.json") }}
  ]
}
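
With the files above in place, the workload can be run against an existing cluster. A typical invocation (the paths, hosts, and params file are assumptions, not part of this commit):

    opensearch-benchmark execute-test \
      --workload-path=/path/to/treccovid_semantic_search \
      --pipeline=benchmark-only \
      --target-hosts=localhost:9200 \
      --workload-params=/path/to/params.json
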
75 changes: 75 additions & 0 deletions treccovid_semantic_search/workload.py
@@ -0,0 +1,75 @@
import json
import os
import random
from pathlib import Path

from osbenchmark.workload.loader import Downloader
from osbenchmark.workload.loader import Decompressor

script_dir = os.path.dirname(os.path.realpath(__file__))


def ingest_pipeline_param_source(workload, params, **kwargs):
    # Fill in the model_id of the text_embedding processor from model_id.json
    # (written once the ML model is registered) unless it was set explicitly.
    model_id = params['body']['processors'][0]['text_embedding']['model_id']
    if not model_id:
        with open('model_id.json') as f:
            model_id = json.load(f)['model_id']
        params['body']['processors'][0]['text_embedding']['model_id'] = model_id
    return params


class QueryParamSource:
    def __init__(self, workload, params, **kwargs):
        if len(workload.indices) == 1:
            index = workload.indices[0].name
            if len(workload.indices[0].types) == 1:
                type = workload.indices[0].types[0].name
            else:
                type = None
        else:
            index = "_all"
            type = None

        self._params = params
        self._params['index'] = index
        self._params['type'] = type
        self._params['variable-queries'] = params.get("variable-queries", 0)
        self.infinite = True

        # If variable queries are requested, fetch and decompress the query
        # corpus described in workload_queries.json.
        if self._params['variable-queries'] > 0:
            with open(os.path.join(script_dir, 'workload_queries.json')) as f:
                d = json.load(f)
            source_file = d['source-file']
            base_url = d['base-url']
            compressed_bytes = d['compressed-bytes']
            uncompressed_bytes = d['uncompressed-bytes']
            compressed_path = os.path.join(script_dir, source_file)
            uncompressed_path = os.path.join(script_dir, Path(source_file).stem)
            if not os.path.exists(compressed_path):
                downloader = Downloader(False, False)
                downloader.download(base_url, None, compressed_path, compressed_bytes)
            if not os.path.exists(uncompressed_path):
                decompressor = Decompressor()
                decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes)

    def partition(self, partition_index, total_partitions):
        return self

    def params(self):
        params = self._params
        # The deployed model id is written to model_id.json by the model
        # registration/deployment tasks; inject it into the neural query.
        with open('model_id.json') as f:
            params['body']['query']['neural']['passage_embedding']['model_id'] = json.load(f)['model_id']
        count = self._params.get("variable-queries", 0)
        if count > 0:
            # Replace the static query text with one picked at random from
            # the downloaded query corpus.
            with open(os.path.join(script_dir, 'queries.json')) as f:
                lines = f.read().splitlines()
            query_text = json.loads(random.choice(lines))['text']
            params['body']['query']['neural']['passage_embedding']['query_text'] = query_text
        return params


def register(registry):
    registry.register_param_source("semantic-search-source", QueryParamSource)
    registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)
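
Both param sources read model_id.json from the working directory; the register-ml-model and deploy-ml-model tasks are expected to write it once the model is deployed. Its assumed shape is a single JSON object (the id shown is a placeholder):

    {"model_id": "my-model-id"}
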
6 changes: 6 additions & 0 deletions treccovid_semantic_search/workload_queries.json
@@ -0,0 +1,6 @@
{
  "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
  "source-file": "queries.json.bz2",
  "compressed-bytes": 4310,
  "uncompressed-bytes": 16552
}
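
queries.json.bz2 decompresses to queries.json, which QueryParamSource reads line by line, picking a random line and using its "text" field as the query text. An illustrative line (the value is invented for the example):

    {"text": "what are the early symptoms of COVID-19?"}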
