From 417170f177b8bd28eb5f2b96ff47419d0cadc2a6 Mon Sep 17 00:00:00 2001
From: vpehkone <101240162+vpehkone@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:19:18 -0700
Subject: [PATCH] Add vector search with embedding generation workload (#232)

* Add vector search with embedding generation workload

Signed-off-by: Vesa Pehkonen

* Add vector search with embedding generation workload

Signed-off-by: Vesa Pehkonen

* Updated README.md with the license text.

Signed-off-by: Vesa Pehkonen

* - Changed the workload from vectorsearch_embedding to semantic_search.
- Changed dataset from ms marco to trec-covid.
- Moved benchmark task runners DeletePipeline, DeleteMlModel, RegisterMlModel and DeployMlModel to OS-benchmark repo.

Signed-off-by: Vesa Pehkonen

* - Changed the workload name semantic_search to treccovid_semantic_search.
- Added the sample output for treccovid_semantic_search.
- Added description of test procedure.
- Simplified treccovid_semantic_search workload configuration.

Signed-off-by: Vesa Pehkonen

* Updated parameters of treccovid workload.

Signed-off-by: Vesa Pehkonen

* Added files.txt to treccovid workload.

Signed-off-by: Vesa Pehkonen

* Updated the documents url for treccovid_semantic_search.

Signed-off-by: Vesa Pehkonen

---------

Signed-off-by: Vesa Pehkonen
---
 treccovid_semantic_search/README.md           | 261 ++++++++++++++++++
 treccovid_semantic_search/files.txt           |   2 +
 treccovid_semantic_search/index.json          |  36 +++
 .../operations/default.json                   |  72 +++++
 .../test_procedures/default.json              | 112 ++++++++
 treccovid_semantic_search/workload.json       |  32 +++
 treccovid_semantic_search/workload.py         |  75 +++++
 .../workload_queries.json                     |   6 +
 8 files changed, 596 insertions(+)
 create mode 100644 treccovid_semantic_search/README.md
 create mode 100644 treccovid_semantic_search/files.txt
 create mode 100644 treccovid_semantic_search/index.json
 create mode 100644 treccovid_semantic_search/operations/default.json
 create mode 100644 treccovid_semantic_search/test_procedures/default.json
 create mode 100644 treccovid_semantic_search/workload.json
 create mode 100644 treccovid_semantic_search/workload.py
 create mode 100644 treccovid_semantic_search/workload_queries.json

diff --git a/treccovid_semantic_search/README.md b/treccovid_semantic_search/README.md
new file mode 100644
index 00000000..1aecb084
--- /dev/null
+++ b/treccovid_semantic_search/README.md
@@ -0,0 +1,261 @@
+## Trec-Covid Semantic Search workload
+
+This workload uses an OpenSearch pretrained model and the ML Commons plugin to generate vector embeddings. It is based on the neural search tutorial: https://opensearch.org/docs/latest/search-plugins/neural-search-tutorial/
+
+### Dataset
+
+Trec-Covid is a collection of documents about COVID-19.
+- Trec-Covid website: https://ir.nist.gov/covidSubmit/index.html
+- Dataset: https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
+
+### Example document and query
+```json
+{
+  "_id": "2b73a28n",
+  "title": "Role of endothelin-1 in lung disease",
+  "text": "Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases.....",
+  "metadata": {
+    "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/",
+    "pubmed_id": "11686871"
+  }
+}
+```
+```json
+{
+  "query": {
+    "neural": {
+      "passage_embedding": {
+        "query_text": "what types of rapid testing for Covid-19 have been developed?",
+        "model_id": "LSmIG44BlTi78mODPYgy",
+        "k": 10
+      }
+    }
+  }
+}
+```
+
+### Procedures
+
+#### Index, force-merge and search
+
+This procedure runs all tasks of the workload. It first deletes the current index, ingest pipeline, and ML model, then registers and deploys the model and recreates the pipeline and index. Next it indexes the corpus, generating a vector embedding for each document, and force-merges the index. Finally it runs a match-all query followed by the semantic search.
+
+### Workload tasks
+
+- cluster-settings
+- delete-index
+- delete-ingest-pipeline
+- delete-ml-model
+- register-ml-model
+- deploy-ml-model
+- create-ingest-pipeline
+- create-index
+- check-cluster-health
+- index-append
+- refresh-after-index
+- force-merge
+- refresh-after-force-merge
+- wait-until-merges-finish
+- default
+- semantic-search
+
+### Parameters
+
+This workload allows [specifying the following parameters](#specifying-workload-parameters) using the `--workload-params` option to OpenSearch Benchmark:
+
+* `bulk_size` (default: 100)
+* `bulk_indexing_clients` (default: 1): Number of clients that issue bulk indexing requests.
+* `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested.
+* `number_of_replicas` (default: 0)
+* `number_of_shards` (default: 1)
+* `query_cache_enabled` (default: false)
+* `requests_cache_enabled` (default: false)
+* `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index.
+* `force_merge_max_num_segments` (default: unset): An integer specifying the maximum number of segments the force-merge operation should produce.
+* `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly.
+* `cluster_health` (default: "green"): The minimum required cluster health.
+* `error_level` (default: "non-fatal"): Available for bulk operations only to specify ignore-response-error-level.
+* `target_throughput` (default: operation-specific): Number of requests per second; set to `""` for no limit.
+* `search_clients`: Number of clients that issue search requests.
+* `model_name` (default: huggingface/sentence-transformers/all-mpnet-base-v2): OpenSearch-provided pretrained model name.
+* `model_version` (default: 1.0.1): Model version.
+* `model_format` (default: TORCH_SCRIPT): Model format.
+* `dimensions` (default: 768): Vector dimension; must match the model.
+* `engine` (default: lucene): The approximate k-NN library to use for indexing and search (see the mapping sketch after this list).
+* `method` (default: hnsw): K-NN search algorithm.
+* `space_type` (default: l2): The vector space used to calculate the distance between vectors.
+* `k` (default: 10): Number of nearest neighbors to return.
+* `warmup_iterations`: Number of warmup iterations each search client executes.
+* `iterations`: Number of test iterations each search client executes.
+* `num_variable_queries` (default: 0): Number of distinct queries to use for the semantic search task; 0 means a single fixed query, and the maximum is 50.
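+
+The `dimensions`, `engine`, `method`, and `space_type` parameters are templated into the `knn_vector` mapping in `index.json`. With the defaults above, and `engine` set explicitly, the `passage_embedding` field would render roughly as follows (a sketch of the generated mapping, not a file in this workload):
+
+```json
+"passage_embedding": {
+  "type": "knn_vector",
+  "dimension": 768,
+  "method": {
+    "engine": "lucene",
+    "space_type": "l2",
+    "name": "hnsw",
+    "parameters": {}
+  }
+}
+```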
+
+### Specifying Workload Parameters
+
+Example:
+```json
+{
+  "index_settings": {
+    "index.number_of_shards": 1,
+    "index.number_of_replicas": 0
+  },
+  "bulk_indexing_clients": 2,
+  "ingest_percentage": 20,
+  "search_clients": 10,
+  "target_throughput": "",
+  "iterations": 100,
+  "warmup_iterations": 100,
+  "k": 100,
+  "num_variable_queries": 100
+}
+```
+
+Save it as `params.json` and provide it to OpenSearch Benchmark with `--workload-params="/path/to/params.json"`. Simple parameters can also be overridden inline, for example `--workload-params=search_clients:2`.
+
+### Sample command and output
+
+```
+./opensearch-benchmark execute-test --workload=treccovid_semantic_search \
+    --target-hosts=:9200 --pipeline=benchmark-only --workload-params=params.json
+
+   ____                  _____                      __       ____                  __                         __
+  / __ \____  ___  ____ / ___/___  ____ ___________/ /_     / __ )___  ____  _____/ /_  ____ ___  ____ ______/ /__
+ / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \   / __  / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/
+/ /_/ / /_/ /  __/ / / /__/ /  __/ /_/ / /  / /__/ / / /  / /_/ /  __/ / / / /__/ / / / / / / / / /_/ / /  / ,<
+\____/ .___/\___/_/ /_/____/\___/\__,_/_/   \___/_/ /_/  /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/  /_/|_|
+    /_/
+
+[INFO] [Test Execution ID]: b6117408-73b8-4fc0-ba5d-f324cb3e1844
+[INFO] Executing test with workload [treccovid_semantic_search], test_procedure [index-merge-search] and provision_config_instance ['external'] with version [2.13.0].
+
+Running cluster-settings                 [100% done]
+Running delete-index                     [100% done]
+Running delete-ingest-pipeline           [100% done]
+Running delete-ml-model                  [100% done]
+Running register-ml-model                [100% done]
+Running deploy-ml-model                  [100% done]
+Running create-ingest-pipeline           [100% done]
+Running create-index                     [100% done]
+Running check-cluster-health             [100% done]
+Running index-append                     [100% done]
+Running refresh-after-index              [100% done]
+Running force-merge                      [100% done]
+Running refresh-after-force-merge        [100% done]
+Running wait-until-merges-finish         [100% done]
+Running default                          [100% done]
+Running semantic-search                  [100% done]
+
+------------------------------------------------------
+    _______             __   _____
+   / ____(_)___  ____ _/ /  / ___/_________  ________
+  / /_  / / __ \/ __ `/ /   \__ \/ ___/ __ \/ ___/ _ \
+ / __/ / / / / / /_/ / /   ___/ / /__/ /_/ / /  /  __/
+/_/   /_/_/ /_/\__,_/_/   /____/\___/\____/_/   \___/
+------------------------------------------------------
+
+| Metric | Task | Value | Unit |
+|---------------------------------------------------------------:|-------------------------:|------------:|-------:|
+| Cumulative indexing time of primary shards | | 0.433717 | min |
+| Min cumulative indexing time across primary shards | | 0 | min |
+| Median cumulative indexing time across primary shards | | 0.00015 | min |
+| Max cumulative indexing time across primary shards | | 0.171 | min |
+| Cumulative indexing throttle time of primary shards | | 0 | min |
+| Min cumulative indexing throttle time across primary shards | | 0 | min |
+| Median cumulative indexing throttle time across primary shards | | 0 | min |
+| Max cumulative indexing throttle time across primary shards | | 0 | min |
+| Cumulative merge time of primary shards | | 0.374233 | min |
+| Cumulative merge count of primary shards | | 8 | |
+| Min cumulative merge time across primary shards | | 0 | min |
+| Median cumulative merge time across primary shards | | 0.00055 | min |
+| Max cumulative merge time across primary shards | | 0.345033 | min |
+| Cumulative merge throttle time of primary shards | | 0.33885 | min |
+| Min cumulative merge throttle time across primary shards | | 0 | min |
+| Median cumulative merge throttle time across primary shards | | 0 | min |
+| Max cumulative merge throttle time across primary shards | | 0.33885 | min |
+| Cumulative refresh time of primary shards | | 0.10995 | min |
+| Cumulative refresh count of primary shards | | 162 | |
+| Min cumulative refresh time across primary shards | | 0 | min |
+| Median cumulative refresh time across primary shards | | 0.000783333 | min |
+| Max cumulative refresh time across primary shards | | 0.0343667 | min |
+| Cumulative flush time of primary shards | | 0.00885 | min |
+| Cumulative flush count of primary shards | | 4 | |
+| Min cumulative flush time across primary shards | | 0 | min |
+| Median cumulative flush time across primary shards | | 0 | min |
+| Max cumulative flush time across primary shards | | 0.00885 | min |
+| Total Young Gen GC time | | 0.523 | s |
+| Total Young Gen GC count | | 24 | |
+| Total Old Gen GC time | | 0 | s |
+| Total Old Gen GC count | | 0 | |
+| Store size | | 2.18146 | GB |
+| Translog size | | 0.0721766 | GB |
+| Heap used for segments | | 0 | MB |
+| Heap used for doc values | | 0 | MB |
+| Heap used for terms | | 0 | MB |
+| Heap used for norms | | 0 | MB |
+| Heap used for points | | 0 | MB |
+| Heap used for stored fields | | 0 | MB |
+| Segment count | | 50 | |
+| Min Throughput | index-append | 108.82 | docs/s |
+| Mean Throughput | index-append | 110.47 | docs/s |
+| Median Throughput | index-append | 110.6 | docs/s |
+| Max Throughput | index-append | 111.68 | docs/s |
+| 50th percentile latency | index-append | 3465.01 | ms |
+| 90th percentile latency | index-append | 3588.01 | ms |
+| 100th percentile latency | index-append | 3764.87 | ms |
+| 50th percentile service time | index-append | 3465.01 | ms |
+| 90th percentile service time | index-append | 3588.01 | ms |
+| 100th percentile service time | index-append | 3764.87 | ms |
+| error rate | index-append | 0 | % |
+| Min Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Mean Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Median Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Max Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| 100th percentile latency | wait-until-merges-finish | 10.6818 | ms |
+| 100th percentile service time | wait-until-merges-finish | 10.6818 | ms |
+| error rate | wait-until-merges-finish | 0 | % |
+| Min Throughput | default | 1030.78 | ops/s |
+| Mean Throughput | default | 1030.78 | ops/s |
+| Median Throughput | default | 1030.78 | ops/s |
+| Max Throughput | default | 1030.78 | ops/s |
+| 50th percentile latency | default | 8.11098 | ms |
+| 90th percentile latency | default | 10.5718 | ms |
+| 99th percentile latency | default | 12.5866 | ms |
+| 99.9th percentile latency | default | 13.8164 | ms |
+| 100th percentile latency | default | 14.1444 | ms |
+| 50th percentile service time | default | 8.11098 | ms |
+| 90th percentile service time | default | 10.5718 | ms |
+| 99th percentile service time | default | 12.5866 | ms |
+| 99.9th percentile service time | default | 13.8164 | ms |
+| 100th percentile service time | default | 14.1444 | ms |
+| error rate | default | 0 | % |
+| Min Throughput | semantic-search | 110.75 | ops/s |
+| Mean Throughput | semantic-search | 112.87 | ops/s |
+| Median Throughput | semantic-search | 112.98 | ops/s |
+| Max Throughput | semantic-search | 114.51 | ops/s |
+| 50th percentile latency | semantic-search | 82.0484 | ms |
+| 90th percentile latency | semantic-search | 99.8155 | ms |
+| 99th percentile latency | semantic-search | 125.478 | ms |
+| 99.9th percentile latency | semantic-search | 139.749 | ms |
+| 100th percentile latency | semantic-search | 144.083 | ms |
+| 50th percentile service time | semantic-search | 82.0484 | ms |
+| 90th percentile service time | semantic-search | 99.8155 | ms |
+| 99th percentile service time | semantic-search | 125.478 | ms |
+| 99.9th percentile service time | semantic-search | 139.749 | ms |
+| 100th percentile service time | semantic-search | 144.083 | ms |
+| error rate | semantic-search | 0 | % |
+
+
+---------------------------------
+[INFO] SUCCESS (took 266 seconds)
+```
+
+### License
+
+The data is distributed under the same license as the original dataset.
+```
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+```
+Trec-Covid [1] is part of the COVID-19 Open Research dataset [2], which is licensed under Apache 2.0.
+
+[1] https://arxiv.org/pdf/2005.04474v1.pdf
+[2] https://github.com/allenai/cord19/
diff --git a/treccovid_semantic_search/files.txt b/treccovid_semantic_search/files.txt
new file mode 100644
index 00000000..20fb5d6f
--- /dev/null
+++ b/treccovid_semantic_search/files.txt
@@ -0,0 +1,2 @@
+documents.json.bz2
+queries.json.bz2
diff --git a/treccovid_semantic_search/index.json b/treccovid_semantic_search/index.json
new file mode 100644
index 00000000..3ba5b385
--- /dev/null
+++ b/treccovid_semantic_search/index.json
@@ -0,0 +1,36 @@
+{
+  "settings": {
+    {%- if number_of_shards is defined %}
+    "index.number_of_shards": {{number_of_shards}},
+    {%- endif %}
+    {%- if number_of_replicas is defined %}
+    "index.number_of_replicas": {{number_of_replicas}},
+    {%- endif %}
+    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
+    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
+    "index.knn": true,
+    "default_pipeline": "nlp-ingest-pipeline"
+  },
+  "mappings": {
+    "properties": {
+      "id": {
+        "type": "text"
+      },
+      "passage_embedding": {
+        "type": "knn_vector",
+        "dimension": {{dimensions | default(768)}},
+        "method": {
+          {%- if engine is defined %}
+          "engine": "{{engine}}",
+          {%- endif %}
+          "space_type": "{{space_type | default('l2')}}",
+          "name": "{{method | default('hnsw')}}",
+          "parameters": {}
+        }
+      },
+      "text": {
+        "type": "text"
+      }
+    }
+  }
+}
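Because the index sets `default_pipeline` to `nlp-ingest-pipeline` (defined in `operations/default.json` below), every bulk-indexed document passes through the text-embedding processor before it is stored. An indexed document would therefore look roughly like this (a hypothetical sketch; the values are illustrative and the real vector has 768 dimensions):

```json
{
  "id": "2b73a28n",
  "text": "Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity...",
  "passage_embedding": [0.0412, -0.0139, 0.0275]
}
```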
"condition": { + "path": "_all.total.merges.current", + "expected-value": 0 + }, + "retry-until-success": true, + "include-in-reporting": false + }, + { + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + { + "name": "semantic-search", + "operation-type": "search", + "num-variable-queries": {{num_variable_queries | default(0)}}, + "param-source": "semantic-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + } + } diff --git a/treccovid_semantic_search/test_procedures/default.json b/treccovid_semantic_search/test_procedures/default.json new file mode 100644 index 00000000..12c1f675 --- /dev/null +++ b/treccovid_semantic_search/test_procedures/default.json @@ -0,0 +1,112 @@ + { + "name": "index-merge-search", + "description": "Indexes the corpus with vector embedding and then runs queries with vector embedding.", + "default": true, + "schedule": [ + { + "name": "cluster-settings", + "operation": { + "operation-type": "put-settings", + "body": { + "persistent": { + "plugins": { + "ml_commons": { + "only_run_on_ml_node": "false", + "native_memory_threshold": "99", + "allow_registering_model_via_local_file": "true", + "allow_registering_model_via_url": "true" + } + } + } + } + } + }, + { + "operation": "delete-index" + }, + { + "operation": "delete-ingest-pipeline" + }, + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + }, + { + "operation": "create-ingest-pipeline" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { + "index.refresh_interval": "5s", + "index.translog.flush_threshold_size": "1g" + }{%- endif %} + } + }, + { + "name": "check-cluster-health", + "operation": { + "operation-type": "cluster-health", + "index": "treccovid", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index-append", + "warmup-time-period": 60, + "clients": {{bulk_indexing_clients | default(1)}}, + "ignore-response-error-level": "{{error_level | default('non-fatal')}}" + }, + { + "name": "refresh-after-index", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200{%- if force_merge_max_num_segments is defined %}, + "max-num-segments": {{ force_merge_max_num_segments | tojson }} + {%- endif %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "operation": "wait-until-merges-finish" + }, + { + "operation": "default", + "warmup-iterations": {{warmup_iterations | default(500) | tojson}}, + "iterations": {{iterations | default(500) | tojson }}, + "target-throughput": {{ target_throughput | 
diff --git a/treccovid_semantic_search/test_procedures/default.json b/treccovid_semantic_search/test_procedures/default.json
new file mode 100644
index 00000000..12c1f675
--- /dev/null
+++ b/treccovid_semantic_search/test_procedures/default.json
@@ -0,0 +1,112 @@
+    {
+      "name": "index-merge-search",
+      "description": "Indexes the corpus with vector embedding and then runs queries with vector embedding.",
+      "default": true,
+      "schedule": [
+        {
+          "name": "cluster-settings",
+          "operation": {
+            "operation-type": "put-settings",
+            "body": {
+              "persistent": {
+                "plugins": {
+                  "ml_commons": {
+                    "only_run_on_ml_node": "false",
+                    "native_memory_threshold": "99",
+                    "allow_registering_model_via_local_file": "true",
+                    "allow_registering_model_via_url": "true"
+                  }
+                }
+              }
+            }
+          }
+        },
+        {
+          "operation": "delete-index"
+        },
+        {
+          "operation": "delete-ingest-pipeline"
+        },
+        {
+          "operation": {
+            "operation-type": "delete-ml-model",
+            "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}"
+          }
+        },
+        {
+          "operation": {
+            "operation-type": "register-ml-model",
+            "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2') }}",
+            "model-version": "{{ model_version | default('1.0.1') }}",
+            "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
+            "model-config-file": "{{ model_config_file | default('') }}"
+          }
+        },
+        {
+          "operation": "deploy-ml-model"
+        },
+        {
+          "operation": "create-ingest-pipeline"
+        },
+        {
+          "operation": {
+            "operation-type": "create-index",
+            "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} {
+              "index.refresh_interval": "5s",
+              "index.translog.flush_threshold_size": "1g"
+            }{%- endif %}
+          }
+        },
+        {
+          "name": "check-cluster-health",
+          "operation": {
+            "operation-type": "cluster-health",
+            "index": "treccovid",
+            "request-params": {
+              "wait_for_status": "{{cluster_health | default('green')}}",
+              "wait_for_no_relocating_shards": "true"
+            },
+            "retry-until-success": true
+          }
+        },
+        {
+          "operation": "index-append",
+          "warmup-time-period": 60,
+          "clients": {{bulk_indexing_clients | default(1)}},
+          "ignore-response-error-level": "{{error_level | default('non-fatal')}}"
+        },
+        {
+          "name": "refresh-after-index",
+          "operation": "refresh"
+        },
+        {
+          "operation": {
+            "operation-type": "force-merge",
+            "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
+            "max-num-segments": {{ force_merge_max_num_segments | tojson }}
+            {%- endif %}
+          }
+        },
+        {
+          "name": "refresh-after-force-merge",
+          "operation": "refresh"
+        },
+        {
+          "operation": "wait-until-merges-finish"
+        },
+        {
+          "operation": "default",
+          "warmup-iterations": {{warmup_iterations | default(500) | tojson}},
+          "iterations": {{iterations | default(500) | tojson}},
+          "target-throughput": {{ target_throughput | default(100) | tojson}},
+          "clients": {{ search_clients | default(1) }}
+        },
+        {
+          "operation": "semantic-search",
+          "warmup-iterations": {{warmup_iterations | default(100) | tojson}},
+          "iterations": {{iterations | default(100) | tojson}},
+          "target-throughput": {{ target_throughput | default(10) | tojson}},
+          "clients": {{ search_clients | default(1) }}
+        }
+      ]
+    }
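For reference, the `cluster-settings` task at the top of this schedule sends the following body to the cluster settings API. These ML Commons settings allow the model to be registered from a URL and to run on nodes that are not dedicated ML nodes (shown here as a standalone request body; the flattened dotted-key form would be equivalent):

```json
{
  "persistent": {
    "plugins": {
      "ml_commons": {
        "only_run_on_ml_node": "false",
        "native_memory_threshold": "99",
        "allow_registering_model_via_local_file": "true",
        "allow_registering_model_via_url": "true"
      }
    }
  }
}
```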
diff --git a/treccovid_semantic_search/workload.json b/treccovid_semantic_search/workload.json
new file mode 100644
index 00000000..761d1d0e
--- /dev/null
+++ b/treccovid_semantic_search/workload.json
@@ -0,0 +1,32 @@
+{% import "benchmark.helpers" as benchmark with context %}
+
+{
+  "version": 2,
+  "description": "Trec-Covid is a collection of documents about COVID-19.",
+  "indices": [
+    {
+      "name": "treccovid",
+      "body": "index.json"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "treccovid",
+      "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
+      "documents": [
+        {
+          "source-file": "documents.json.bz2",
+          "document-count": 129192,
+          "compressed-bytes": 51187469,
+          "uncompressed-bytes": 211980208
+        }
+      ]
+    }
+  ],
+  "operations": [
+    {{ benchmark.collect(parts="operations/*.json") }}
+  ],
+  "test_procedures": [
+    {{ benchmark.collect(parts="test_procedures/*.json") }}
+  ]
+}
diff --git a/treccovid_semantic_search/workload.py b/treccovid_semantic_search/workload.py
new file mode 100644
index 00000000..1eaa0436
--- /dev/null
+++ b/treccovid_semantic_search/workload.py
@@ -0,0 +1,75 @@
+import random
+import os
+import json
+from pathlib import Path
+
+from osbenchmark.workload.loader import Downloader
+from osbenchmark.workload.loader import Decompressor
+
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+def ingest_pipeline_param_source(workload, params, **kwargs):
+    # Fill in the ID of the deployed embedding model, read from the
+    # model_id.json file written when the model was registered.
+    model_id = params['body']['processors'][0]['text_embedding']['model_id']
+    if not model_id:
+        with open('model_id.json') as f:
+            d = json.loads(f.read())
+        params['body']['processors'][0]['text_embedding']['model_id'] = d['model_id']
+    return params
+
+class QueryParamSource:
+    def __init__(self, workload, params, **kwargs):
+        if len(workload.indices) == 1:
+            index = workload.indices[0].name
+            if len(workload.indices[0].types) == 1:
+                type = workload.indices[0].types[0].name
+            else:
+                type = None
+        else:
+            index = "_all"
+            type = None
+
+        self._params = params
+        self._params['index'] = index
+        self._params['type'] = type
+        # The semantic-search operation passes this as "num-variable-queries";
+        # store it under a single internal key.
+        self._params['variable-queries'] = params.get("num-variable-queries", 0)
+        self.infinite = True
+
+        if self._params['variable-queries'] > 0:
+            # Download and decompress the query corpus on first use.
+            with open(script_dir + os.sep + 'workload_queries.json', 'r') as f:
+                d = json.loads(f.read())
+            source_file = d['source-file']
+            base_url = d['base-url']
+            compressed_bytes = d['compressed-bytes']
+            uncompressed_bytes = d['uncompressed-bytes']
+            compressed_path = script_dir + os.sep + source_file
+            uncompressed_path = script_dir + os.sep + Path(source_file).stem
+            if not os.path.exists(compressed_path):
+                downloader = Downloader(False, False)
+                downloader.download(base_url, None, compressed_path, compressed_bytes)
+            if not os.path.exists(uncompressed_path):
+                decompressor = Decompressor()
+                decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes)
+
+    def partition(self, partition_index, total_partitions):
+        return self
+
+    def params(self):
+        params = self._params
+        # Inject the ID of the deployed model into the neural query.
+        with open('model_id.json', 'r') as f:
+            d = json.loads(f.read())
+        params['body']['query']['neural']['passage_embedding']['model_id'] = d['model_id']
+        count = self._params.get("variable-queries", 0)
+        if count > 0:
+            # Pick a random query text from the downloaded query corpus.
+            with open(script_dir + os.sep + 'queries.json', 'r') as f:
+                lines = f.read().splitlines()
+            line = random.choice(lines)
+            query_text = json.loads(line)['text']
+            params['body']['query']['neural']['passage_embedding']['query_text'] = query_text
+        return params
+
+def register(registry):
+    registry.register_param_source("semantic-search-source", QueryParamSource)
+    registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)
diff --git a/treccovid_semantic_search/workload_queries.json b/treccovid_semantic_search/workload_queries.json
new file mode 100644
index 00000000..d445066d
--- /dev/null
+++ b/treccovid_semantic_search/workload_queries.json
@@ -0,0 +1,6 @@
+{
+  "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
+  "source-file": "queries.json.bz2",
+  "compressed-bytes": 4310,
+  "uncompressed-bytes": 16552
+}