Commit

Add vector search with embedding generation workload (#232)
* Add vector search with embedding generation workload
Signed-off-by: Vesa Pehkonen <[email protected]>

* Add vector search with embedding generation workload
Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated README.md with the license text.

Signed-off-by: Vesa Pehkonen <[email protected]>

* - Changed the workload from vectorsearch_embedding to semantic_search.
- Changed the dataset from MS MARCO to trec-covid.
- Moved the benchmark task runners DeletePipeline, DeleteMlModel, RegisterMlModel and DeployMlModel to the OS-benchmark repo.

Signed-off-by: Vesa Pehkonen <[email protected]>

* - Renamed the workload from semantic_search to treccovid_semantic_search.
- Added sample output for treccovid_semantic_search.
- Added a description of the test procedure.
- Simplified the treccovid_semantic_search workload configuration.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated the parameters of the treccovid workload.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Added files.txt to the treccovid workload.

Signed-off-by: Vesa Pehkonen <[email protected]>

* Updated the documents URL for treccovid_semantic_search.

Signed-off-by: Vesa Pehkonen <[email protected]>

---------

Signed-off-by: Vesa Pehkonen <[email protected]>
vpehkone authored Jul 24, 2024
1 parent b7ff271 commit 417170f
Showing 8 changed files with 596 additions and 0 deletions.
261 changes: 261 additions & 0 deletions treccovid_semantic_search/README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions treccovid_semantic_search/files.txt
@@ -0,0 +1,2 @@
documents.json.bz2
queries.json.bz2
36 changes: 36 additions & 0 deletions treccovid_semantic_search/index.json
@@ -0,0 +1,36 @@
{
  "settings": {
    {%- if number_of_shards is defined %}
    "index.number_of_shards": {{number_of_shards}},
    {%- endif %}
    {%- if number_of_replicas is defined %}
    "index.number_of_replicas": {{number_of_replicas}},
    {%- endif %}
    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
    "index.knn": true,
    "default_pipeline": "nlp-ingest-pipeline"
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "text"
      },
      "passage_embedding": {
        "type": "knn_vector",
        "dimension": {{dimensions | default(768)}},
        "method": {
          {%- if engine is defined %}
          "engine": "{{engine}}",
          {%- endif %}
          "space_type": "{{space_type | default('l2')}}",
          "name": "{{method | default('hnsw')}}",
          "parameters": {}
        }
      },
      "text": {
        "type": "text"
      }
    }
  }
}
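
Because index.json is a Jinja2 template, it can be rendered outside the benchmark to preview the final index body. A minimal sketch, assuming the jinja2 package is installed and the file sits at the path shown below; the parameter values are illustrative:

    # Render index.json with sample parameters and pretty-print the result.
    import json
    from jinja2 import Template

    with open("treccovid_semantic_search/index.json") as f:
        template = Template(f.read())

    rendered = template.render(
        number_of_shards=1,
        number_of_replicas=0,
        dimensions=768,
        engine="lucene",   # optional: omitting it drops the "engine" key entirely
        space_type="l2",
        method="hnsw",
    )
    print(json.dumps(json.loads(rendered), indent=2))
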
72 changes: 72 additions & 0 deletions treccovid_semantic_search/operations/default.json
@@ -0,0 +1,72 @@
{
  "name": "delete-ingest-pipeline",
  "operation-type": "delete-pipeline",
  "id": "nlp-ingest-pipeline"
},
{
  "name": "create-ingest-pipeline",
  "operation-type": "put-pipeline",
  "param-source": "create-ingest-pipeline",
  "id": "nlp-ingest-pipeline",
  "body": {
    "description": "An NLP ingest pipeline",
    "processors": [
      {
        "text_embedding": {
          "model_id": "",
          "field_map": {
            "text": "passage_embedding"
          }
        }
      }
    ]
  }
},
{
  "name": "index-append",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(100)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "wait-until-merges-finish",
  "operation-type": "index-stats",
  "index": "_all",
  "condition": {
    "path": "_all.total.merges.current",
    "expected-value": 0
  },
  "retry-until-success": true,
  "include-in-reporting": false
},
{
  "name": "default",
  "operation-type": "search",
  "body": {
    "query": {
      "match_all": {}
    }
  }
},
{
  "name": "semantic-search",
  "operation-type": "search",
  "variable-queries": {{num_variable_queries | default(0)}},
  "param-source": "semantic-search-source",
  "body": {
    "_source": {
      "excludes": [
        "passage_embedding"
      ]
    },
    "query": {
      "neural": {
        "passage_embedding": {
          "query_text": "what types of rapid testing for Covid-19 have been developed?",
          "model_id": "",
          "k": {{k | default(10)}}
        }
      }
    }
  }
}
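
For reference, this is roughly the request the semantic-search operation issues once the placeholders are filled in. A sketch using the opensearch-py client; the host and the "my-model-id" value are assumptions (at run time the workload injects the real id from model_id.json):

    # Issue the same neural query by hand with opensearch-py.
    from opensearchpy import OpenSearch

    client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])

    response = client.search(
        index="treccovid",
        body={
            "_source": {"excludes": ["passage_embedding"]},
            "query": {
                "neural": {
                    "passage_embedding": {
                        "query_text": "what types of rapid testing for Covid-19 have been developed?",
                        "model_id": "my-model-id",  # placeholder for the deployed model id
                        "k": 10,
                    }
                }
            },
        },
    )
    for hit in response["hits"]["hits"]:
        print(hit["_score"], hit["_source"]["text"][:80])
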
112 changes: 112 additions & 0 deletions treccovid_semantic_search/test_procedures/default.json
@@ -0,0 +1,112 @@
{
  "name": "index-merge-search",
  "description": "Indexes the corpus with vector embeddings and then runs queries with vector embeddings.",
  "default": true,
  "schedule": [
    {
      "name": "cluster-settings",
      "operation": {
        "operation-type": "put-settings",
        "body": {
          "persistent": {
            "plugins": {
              "ml_commons": {
                "only_run_on_ml_node": "false",
                "native_memory_threshold": "99",
                "allow_registering_model_via_local_file": "true",
                "allow_registering_model_via_url": "true"
              }
            }
          }
        }
      }
    },
    {
      "operation": "delete-index"
    },
    {
      "operation": "delete-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "delete-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}"
      }
    },
    {
      "operation": {
        "operation-type": "register-ml-model",
        "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}",
        "model-version": "{{ model_version | default('1.0.1') }}",
        "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
        "model-config-file": "{{ model_config_file | default('') }}"
      }
    },
    {
      "operation": "deploy-ml-model"
    },
    {
      "operation": "create-ingest-pipeline"
    },
    {
      "operation": {
        "operation-type": "create-index",
        "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} {
          "index.refresh_interval": "5s",
          "index.translog.flush_threshold_size": "1g"
        }{%- endif %}
      }
    },
    {
      "name": "check-cluster-health",
      "operation": {
        "operation-type": "cluster-health",
        "index": "treccovid",
        "request-params": {
          "wait_for_status": "{{cluster_health | default('green')}}",
          "wait_for_no_relocating_shards": "true"
        },
        "retry-until-success": true
      }
    },
    {
      "operation": "index-append",
      "warmup-time-period": 60,
      "clients": {{bulk_indexing_clients | default(1)}},
      "ignore-response-error-level": "{{error_level | default('non-fatal')}}"
    },
    {
      "name": "refresh-after-index",
      "operation": "refresh"
    },
    {
      "operation": {
        "operation-type": "force-merge",
        "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
        "max-num-segments": {{ force_merge_max_num_segments | tojson }}
        {%- endif %}
      }
    },
    {
      "name": "refresh-after-force-merge",
      "operation": "refresh"
    },
    {
      "operation": "wait-until-merges-finish"
    },
    {
      "operation": "default",
      "warmup-iterations": {{warmup_iterations | default(500) | tojson}},
      "iterations": {{iterations | default(500) | tojson}},
      "target-throughput": {{target_throughput | default(100) | tojson}},
      "clients": {{search_clients | default(1)}}
    },
    {
      "operation": "semantic-search",
      "warmup-iterations": {{warmup_iterations | default(100) | tojson}},
      "iterations": {{iterations | default(100) | tojson}},
      "target-throughput": {{target_throughput | default(10) | tojson}},
      "clients": {{search_clients | default(1)}}
    }
  ]
}
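
The Jinja parameters referenced in the schedule (bulk_indexing_clients, warmup_iterations, target_throughput, and so on) can be overridden at run time. A minimal sketch of a params file that could be passed to OSB via --workload-params; the file name and the values are illustrative, not defaults from this commit:

    {
      "bulk_size": 200,
      "bulk_indexing_clients": 2,
      "num_variable_queries": 100,
      "warmup_iterations": 100,
      "iterations": 200,
      "target_throughput": 20,
      "search_clients": 2,
      "k": 10
    }
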
32 changes: 32 additions & 0 deletions treccovid_semantic_search/workload.json
@@ -0,0 +1,32 @@
{% import "benchmark.helpers" as benchmark with context %}

{
  "version": 2,
  "description": "Trec-Covid is a dataset of documents about COVID-19.",
  "indices": [
    {
      "name": "treccovid",
      "body": "index.json"
    }
  ],
  "corpora": [
    {
      "name": "treccovid",
      "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
      "documents": [
        {
          "source-file": "documents.json.bz2",
          "document-count": 129192,
          "compressed-bytes": 51187469,
          "uncompressed-bytes": 211980208
        }
      ]
    }
  ],
  "operations": [
    {{ benchmark.collect(parts="operations/*.json") }}
  ],
  "test_procedures": [
    {{ benchmark.collect(parts="test_procedures/*.json") }}
  ]
}
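
With the files above in place, the workload can be run against an existing cluster. A typical invocation (the paths, hosts, and params file are assumptions, not part of this commit):

    opensearch-benchmark execute-test \
      --workload-path=/path/to/treccovid_semantic_search \
      --pipeline=benchmark-only \
      --target-hosts=localhost:9200 \
      --workload-params=/path/to/params.json
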
75 changes: 75 additions & 0 deletions treccovid_semantic_search/workload.py
@@ -0,0 +1,75 @@
import json
import os
import random
from pathlib import Path

from osbenchmark.workload.loader import Downloader
from osbenchmark.workload.loader import Decompressor

script_dir = os.path.dirname(os.path.realpath(__file__))


def ingest_pipeline_param_source(workload, params, **kwargs):
    # Fill in the model_id of the text_embedding processor from model_id.json
    # (written once the ML model is registered) unless it was set explicitly.
    model_id = params['body']['processors'][0]['text_embedding']['model_id']
    if not model_id:
        with open('model_id.json') as f:
            model_id = json.load(f)['model_id']
        params['body']['processors'][0]['text_embedding']['model_id'] = model_id
    return params


class QueryParamSource:
    def __init__(self, workload, params, **kwargs):
        if len(workload.indices) == 1:
            index = workload.indices[0].name
            if len(workload.indices[0].types) == 1:
                type = workload.indices[0].types[0].name
            else:
                type = None
        else:
            index = "_all"
            type = None

        self._params = params
        self._params['index'] = index
        self._params['type'] = type
        self._params['variable-queries'] = params.get("variable-queries", 0)
        self.infinite = True

        # If variable queries are requested, fetch and decompress the query
        # corpus described in workload_queries.json.
        if self._params['variable-queries'] > 0:
            with open(os.path.join(script_dir, 'workload_queries.json')) as f:
                d = json.load(f)
            source_file = d['source-file']
            base_url = d['base-url']
            compressed_bytes = d['compressed-bytes']
            uncompressed_bytes = d['uncompressed-bytes']
            compressed_path = os.path.join(script_dir, source_file)
            uncompressed_path = os.path.join(script_dir, Path(source_file).stem)
            if not os.path.exists(compressed_path):
                downloader = Downloader(False, False)
                downloader.download(base_url, None, compressed_path, compressed_bytes)
            if not os.path.exists(uncompressed_path):
                decompressor = Decompressor()
                decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes)

    def partition(self, partition_index, total_partitions):
        return self

    def params(self):
        params = self._params
        # The deployed model id is written to model_id.json by the model
        # registration/deployment tasks; inject it into the neural query.
        with open('model_id.json') as f:
            params['body']['query']['neural']['passage_embedding']['model_id'] = json.load(f)['model_id']
        count = self._params.get("variable-queries", 0)
        if count > 0:
            # Replace the static query text with one picked at random from
            # the downloaded query corpus.
            with open(os.path.join(script_dir, 'queries.json')) as f:
                lines = f.read().splitlines()
            query_text = json.loads(random.choice(lines))['text']
            params['body']['query']['neural']['passage_embedding']['query_text'] = query_text
        return params


def register(registry):
    registry.register_param_source("semantic-search-source", QueryParamSource)
    registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)
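
Both param sources read model_id.json from the working directory; the register-ml-model and deploy-ml-model tasks are expected to write it once the model is deployed. Its assumed shape is a single JSON object (the id shown is a placeholder):

    {"model_id": "my-model-id"}
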
6 changes: 6 additions & 0 deletions treccovid_semantic_search/workload_queries.json
@@ -0,0 +1,6 @@
{
  "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
  "source-file": "queries.json.bz2",
  "compressed-bytes": 4310,
  "uncompressed-bytes": 16552
}
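
queries.json.bz2 decompresses to queries.json, which QueryParamSource reads line by line, picking a random line and using its "text" field as the query text. An illustrative line (the value is invented for the example):

    {"text": "what are the early symptoms of COVID-19?"}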
