From 37215ab4dcefb5204c6e46bec0864dbbbaed60ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Ba=CC=88uerle?=
Date: Thu, 2 Nov 2023 11:22:32 +0100
Subject: [PATCH] feat: add eleuther harness script

---
 examples/eleuther_harness/harness.ipynb | 353 ++++++++++++++++++++++++
 1 file changed, 353 insertions(+)
 create mode 100644 examples/eleuther_harness/harness.ipynb

diff --git a/examples/eleuther_harness/harness.ipynb b/examples/eleuther_harness/harness.ipynb
new file mode 100644
index 0000000..c00e7f2
--- /dev/null
+++ b/examples/eleuther_harness/harness.ipynb
@@ -0,0 +1,353 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# EleutherAI Harness\n",
+    "\n",
+    "This notebook can be used to run tasks of the EleutherAI harness on most types of models and upload the results to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To run the harness, first clone and install its codebase.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git clone https://github.com/EleutherAI/lm-evaluation-harness.git\n",
+    "%pip install -e lm-evaluation-harness"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from zeno_client import ZenoClient, ZenoMetric\n",
+    "import pandas as pd\n",
+    "from lm_eval import evaluator, tasks, utils\n",
+    "import json\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "import numpy as np\n",
+    "import numbers\n",
+    "\n",
+    "API_KEY = os.environ[\"ZENO_API_KEY\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Settings\n",
+    "\n",
+    "The following settings can be configured to your needs.\n",
+    "\n",
+    "**Warning: This might take a very long time to run depending on your hardware. We recommend setting the `limit` argument to evaluate only a subset of each task.**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "args = {\n",
+    "    \"model\": \"hf-causal\",  # for huggingface models\n",
+    "    \"model_args\": \"pretrained=EleutherAI/gpt-neo-2.7B\",  # this specifies the HF model to use\n",
+    "    \"model_name\": \"gpt-neo-2.7B\",  # used to identify the model on upload\n",
+    "    \"tasks\": \"bigbench_geometric_shapes\",  # tasks to run, multiple tasks separated by commas\n",
+    "    \"num_fewshot\": 3,  # number of few-shot examples to include in the prompt\n",
+    "    \"device\": \"cuda\",  # device to run the model on, \"cuda\" if you have a GPU\n",
+    "    \"output_path\": \"./output/latest_results.json\",  # path to write results to\n",
+    "    \"limit\": 250,  # limit on the number of examples to run\n",
+    "    \"no_cache\": False,  # set to True to disable caching of model results\n",
+    "    \"output_base_path\": \"./output\",  # where to write the outputs to\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Running Evaluation\n",
+    "\n",
+    "This evaluates the model on the selected tasks and writes the results to the output folder. This step might take a while.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "task_names = utils.pattern_match(args[\"tasks\"].split(\",\"), tasks.ALL_TASKS)\n",
+    "results = evaluator.simple_evaluate(\n",
+    "    model=args[\"model\"],\n",
+    "    model_args=args[\"model_args\"],\n",
+    "    tasks=task_names,\n",
+    "    num_fewshot=args[\"num_fewshot\"],\n",
+    "    batch_size=None,\n",
+    "    max_batch_size=None,\n",
+    "    device=args[\"device\"],\n",
+    "    no_cache=args[\"no_cache\"],\n",
+    "    limit=args[\"limit\"],\n",
+    "    description_dict={},\n",
+    "    decontamination_ngrams_path=None,\n",
+    "    check_integrity=None,\n",
+    "    write_out=True,\n",
+    "    output_base_path=Path(args[\"output_base_path\"], \"tasks\"),\n",
+    ")\n",
+    "\n",
+    "dumped = json.dumps(results, indent=2)\n",
+    "\n",
+    "os.makedirs(args[\"output_base_path\"], exist_ok=True)\n",
+    "with open(Path(args[\"output_base_path\"], \"latest_results.json\"), \"w\") as f:\n",
+    "    f.write(dumped)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get all Output Files for the Tasks\n",
+    "\n",
+    "Loads all task output files so they can be combined and uploaded to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get all output files for the tasks\n",
+    "dir_path = Path(args[\"output_base_path\"], \"tasks\")\n",
+    "files = list(dir_path.glob(\"*\"))\n",
+    "tasks_dict = [\n",
+    "    {\n",
+    "        \"name\": file.name.replace(\"_write_out_info.json\", \"\").replace(\"bigbench_\", \"\"),\n",
+    "        \"file\": file,\n",
+    "    }\n",
+    "    for file in files\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helper Functions to Format the Data\n",
+    "\n",
+    "These functions are used to get the data into the right shape to be uploaded to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def common_prefix(strings: list[str]):\n",
+    "    prefix: list[str] = []\n",
+    "    prefix_lines: list[list[str]] = list(map(lambda x: x.splitlines(), strings))\n",
+    "    index = 0\n",
+    "    while all(i[index] == prefix_lines[0][index] for i in prefix_lines):\n",
+    "        prefix.append(prefix_lines[0][index])\n",
+    "        index += 1\n",
+    "    common_lines = list(map(lambda x: x[index], prefix_lines))\n",
+    "    common_in_line = os.path.commonprefix(common_lines)\n",
+    "    prefix.append(common_in_line)\n",
+    "    common = \"\\n\".join(prefix)\n",
+    "    common_in_line_answers: list[list[str]] = list(\n",
+    "        map(lambda x: [x.replace(common_in_line, \"\")], common_lines)\n",
+    "    )\n",
+    "    remaining_answers = list(map(lambda x: x[index + 1 :], prefix_lines))\n",
+    "    answers = list(map(list.__add__, common_in_line_answers, remaining_answers))\n",
+    "    answers = list(map(lambda x: \"\\n\".join(x), answers))\n",
+    "    return {\"prefix\": common, \"answers\": answers}\n",
+    "\n",
+    "\n",
+    "def get_prefix_answers(data):\n",
+    "    prefix_answers = []\n",
+    "    keys = list(map(lambda x: x.keys(), data))\n",
+    "    prompt_keys = list(\n",
+    "        map(lambda x: list(filter(lambda y: y.startswith(\"prompt_\"), x)), keys)\n",
+    "    )\n",
+    "    for element_index, element in enumerate(data):\n",
+    "        prompts = list(map(lambda x: element[x], prompt_keys[element_index]))\n",
+    "        prefix_answers.append(common_prefix(prompts))\n",
+    "    return prefix_answers\n",
+    "\n",
+    "\n",
+    "def generate_dataset(data, prefix_answers, task_name):\n",
+    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
+    "    labels = list(map(lambda x: x[\"truth\"], data))\n",
+    "    df = pd.DataFrame(\n",
+    "        {\n",
+    "            \"id\": ids,\n",
+    "            \"data\": list(\n",
+    "                map(\n",
+    "                    lambda x: x[\"prefix\"]\n",
+    "                    + \"\\n\"\n",
+    "                    + \"\\n\".join(list(map(lambda y: \"- \" + y, x[\"answers\"]))),\n",
+    "                    prefix_answers,\n",
+    "                )\n",
+    "            ),\n",
+    "            \"task\": task_name,\n",
+    "            \"labels\": labels,\n",
+    "        }\n",
+    "    )\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def generate_system_df(data, prefix_answers, df, task_name):\n",
+    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
+    "    keys = list(map(lambda x: x.keys(), data))\n",
+    "    logit_keys = list(\n",
+    "        map(lambda x: list(filter(lambda y: y.startswith(\"logit_\"), x)), keys)\n",
+    "    )\n",
+    "    answers = []\n",
+    "    correct_list = []\n",
+    "    for element_index, element in enumerate(data):\n",
+    "        answer = \"\"\n",
+    "        correct = False\n",
+    "        logits = list(\n",
+    "            filter(\n",
+    "                lambda y: isinstance(y, numbers.Number),\n",
+    "                map(lambda x: element[x], logit_keys[element_index]),\n",
+    "            )\n",
+    "        )\n",
+    "        if \"acc_norm\" in keys[element_index]:\n",
+    "            norm_logits = logits / np.array(\n",
+    "                [float(len(i)) for i in prefix_answers[element_index][\"answers\"]]\n",
+    "            )\n",
+    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(norm_logits)]\n",
+    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
+    "            answer = (\n",
+    "                answer\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Raw Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Norm Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), norm_logits))\n",
+    "            )\n",
+    "        else:\n",
+    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(logits)]\n",
+    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
+    "            answer = (\n",
+    "                answer\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
+    "            )\n",
+    "        answers.append(answer)\n",
+    "        correct_list.append(correct)\n",
+    "    system_df = pd.DataFrame({\"id\": ids, \"output\": answers, \"correct\": correct_list})\n",
+    "    return system_df\n",
+    "\n",
+    "\n",
+    "# Accumulate data for all tasks\n",
+    "for index, task in enumerate(tasks_dict):\n",
+    "    data = json.load(open(task[\"file\"]))\n",
+    "    prefix_answers = get_prefix_answers(data)\n",
+    "    if index == 0:\n",
+    "        df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
+    "        system_df = generate_system_df(data, prefix_answers, df, task[\"name\"])\n",
+    "    else:\n",
+    "        current_df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
+    "        df = pd.concat([df, current_df])\n",
+    "        system_df = pd.concat(\n",
+    "            [\n",
+    "                system_df,\n",
+    "                generate_system_df(data, prefix_answers, current_df, task[\"name\"]),\n",
+    "            ]\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create the Zeno Project\n",
+    "\n",
+    "This will load an existing project if you already have a project with the same name.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = ZenoClient(API_KEY)\n",
+    "\n",
+    "project = client.create_project(\n",
+    "    name=\"Eleuther Harness Project\",\n",
+    "    view=\"text-classification\",\n",
+    "    metrics=[ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Upload the Dataset\n",
+    "\n",
+    "Uploads the task data to Zeno.\n",
+    "\n",
+    "**Only run this if you want to upload fresh data. This will delete all existing systems.**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"labels\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Upload the System\n",
+    "\n",
+    "Uploads the model output to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_system(\n",
+    "    system_df,\n",
+    "    name=args[\"model_name\"],\n",
+    "    id_column=\"id\",\n",
+    "    output_column=\"output\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}