From 37215ab4dcefb5204c6e46bec0864dbbbaed60ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Ba=CC=88uerle?=
Date: Thu, 2 Nov 2023 11:22:32 +0100
Subject: [PATCH] feat: add eleuther harness script

---
 examples/eleuther_harness/harness.ipynb | 353 ++++++++++++++++++++++++
 1 file changed, 353 insertions(+)
 create mode 100644 examples/eleuther_harness/harness.ipynb

diff --git a/examples/eleuther_harness/harness.ipynb b/examples/eleuther_harness/harness.ipynb
new file mode 100644
index 0000000..c00e7f2
--- /dev/null
+++ b/examples/eleuther_harness/harness.ipynb
@@ -0,0 +1,353 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# EleutherAI Harness\n",
+    "\n",
+    "This notebook can be used to run tasks of the EleutherAI harness on most types of models and upload the results to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To run the harness, first clone and install its codebase.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git clone https://github.com/EleutherAI/lm-evaluation-harness.git\n",
+    "%pip install -e lm-evaluation-harness"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from zeno_client import ZenoClient, ZenoMetric\n",
+    "import pandas as pd\n",
+    "from lm_eval import evaluator, tasks, utils\n",
+    "import json\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "import numpy as np\n",
+    "import numbers\n",
+    "\n",
+    "API_KEY = os.environ[\"ZENO_API_KEY\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Settings\n",
+    "\n",
+    "The following settings can be configured to your needs.\n",
+    "\n",
+    "**Warning: This might take a very long time to run depending on your hardware. We recommend setting the `limit` argument to evaluate only a subset of each task.**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "args = {\n",
+    "    \"model\": \"hf-causal\",  # for huggingface models\n",
+    "    \"model_args\": \"pretrained=EleutherAI/gpt-neo-2.7B\",  # this specifies the HF model to use\n",
+    "    \"model_name\": \"gpt-neo-2.7B\",  # used to identify the model on upload\n",
+    "    \"tasks\": \"bigbench_geometric_shapes\",  # tasks to run, multiple tasks separated by commas\n",
+    "    \"num_fewshot\": 3,  # number of few-shot examples to include in the prompt\n",
+    "    \"device\": \"cuda\",  # device to run the model on, \"cuda\" if you have a GPU\n",
+    "    \"output_path\": \"./output/latest_results.json\",  # path to write results to\n",
+    "    \"limit\": 250,  # limit on the number of examples to run\n",
+    "    \"no_cache\": False,  # set to True to disable caching of model results\n",
+    "    \"output_base_path\": \"./output\",  # where to write the outputs to\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Running Evaluation\n",
+    "\n",
+    "This evaluates the model on the selected tasks and writes the results to the output folder. This step might take a while.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "task_names = utils.pattern_match(args[\"tasks\"].split(\",\"), tasks.ALL_TASKS)\n",
+    "results = evaluator.simple_evaluate(\n",
+    "    model=args[\"model\"],\n",
+    "    model_args=args[\"model_args\"],\n",
+    "    tasks=task_names,\n",
+    "    num_fewshot=args[\"num_fewshot\"],\n",
+    "    batch_size=None,\n",
+    "    max_batch_size=None,\n",
+    "    device=args[\"device\"],\n",
+    "    no_cache=args[\"no_cache\"],\n",
+    "    limit=args[\"limit\"],\n",
+    "    description_dict={},\n",
+    "    decontamination_ngrams_path=None,\n",
+    "    check_integrity=None,\n",
+    "    write_out=True,\n",
+    "    output_base_path=Path(args[\"output_base_path\"], \"tasks\"),\n",
+    ")\n",
+    "\n",
+    "dumped = json.dumps(results, indent=2)\n",
+    "\n",
+    "os.makedirs(args[\"output_base_path\"], exist_ok=True)\n",
+    "with open(Path(args[\"output_base_path\"], \"latest_results.json\"), \"w\") as f:\n",
+    "    f.write(dumped)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get all Output Files for the Tasks\n",
+    "\n",
+    "Loads all task output files so they can be combined and uploaded to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get all output files for the tasks\n",
+    "dir_path = Path(args[\"output_base_path\"], \"tasks\")\n",
+    "files = list(dir_path.glob(\"*\"))\n",
+    "tasks_dict = [\n",
+    "    {\n",
+    "        \"name\": file.name.replace(\"_write_out_info.json\", \"\").replace(\"bigbench_\", \"\"),\n",
+    "        \"file\": file,\n",
+    "    }\n",
+    "    for file in files\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helper Functions to Format the Data\n",
+    "\n",
+    "These functions are used to get the data into the right shape to be uploaded to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def common_prefix(strings: list[str]):\n",
+    "    prefix: list[str] = []\n",
+    "    prefix_lines: list[list[str]] = list(map(lambda x: x.splitlines(), strings))\n",
+    "    index = 0\n",
+    "    while all(i[index] == prefix_lines[0][index] for i in prefix_lines):\n",
+    "        prefix.append(prefix_lines[0][index])\n",
+    "        index += 1\n",
+    "    common_lines = list(map(lambda x: x[index], prefix_lines))\n",
+    "    common_in_line = os.path.commonprefix(common_lines)\n",
+    "    prefix.append(common_in_line)\n",
+    "    common = \"\\n\".join(prefix)\n",
+    "    common_in_line_answers: list[list[str]] = list(\n",
+    "        map(lambda x: [x.replace(common_in_line, \"\")], common_lines)\n",
+    "    )\n",
+    "    remaining_answers = list(map(lambda x: x[index + 1 :], prefix_lines))\n",
+    "    answers = list(map(list.__add__, common_in_line_answers, remaining_answers))\n",
+    "    answers = list(map(lambda x: \"\\n\".join(x), answers))\n",
+    "    return {\"prefix\": common, \"answers\": answers}\n",
+    "\n",
+    "\n",
+    "def get_prefix_answers(data):\n",
+    "    prefix_answers = []\n",
+    "    keys = list(map(lambda x: x.keys(), data))\n",
+    "    prompt_keys = list(\n",
+    "        map(lambda x: list(filter(lambda y: y.startswith(\"prompt_\"), x)), keys)\n",
+    "    )\n",
+    "    for element_index, element in enumerate(data):\n",
+    "        prompts = list(map(lambda x: element[x], prompt_keys[element_index]))\n",
+    "        prefix_answers.append(common_prefix(prompts))\n",
+    "    return prefix_answers\n",
+    "\n",
+    "\n",
+    "def generate_dataset(data, prefix_answers, task_name):\n",
+    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
+    "    labels = list(map(lambda x: x[\"truth\"], data))\n",
+    "    df = pd.DataFrame(\n",
+    "        {\n",
+    "            \"id\": ids,\n",
+    "            \"data\": list(\n",
+    "                map(\n",
+    "                    lambda x: x[\"prefix\"]\n",
+    "                    + \"\\n\"\n",
+    "                    + \"\\n\".join(list(map(lambda y: \"- \" + y, x[\"answers\"]))),\n",
+    "                    prefix_answers,\n",
+    "                )\n",
+    "            ),\n",
+    "            \"task\": task_name,\n",
+    "            \"labels\": labels,\n",
+    "        }\n",
+    "    )\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def generate_system_df(data, prefix_answers, df, task_name):\n",
+    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
+    "    keys = list(map(lambda x: x.keys(), data))\n",
+    "    logit_keys = list(\n",
+    "        map(lambda x: list(filter(lambda y: y.startswith(\"logit_\"), x)), keys)\n",
+    "    )\n",
+    "    answers = []\n",
+    "    correct_list = []\n",
+    "    for element_index, element in enumerate(data):\n",
+    "        answer = \"\"\n",
+    "        correct = False\n",
+    "        logits = list(\n",
+    "            filter(\n",
+    "                lambda y: isinstance(y, numbers.Number),\n",
+    "                map(lambda x: element[x], logit_keys[element_index]),\n",
+    "            )\n",
+    "        )\n",
+    "        if \"acc_norm\" in keys[element_index]:\n",
+    "            norm_logits = logits / np.array(\n",
+    "                [float(len(i)) for i in prefix_answers[element_index][\"answers\"]]\n",
+    "            )\n",
+    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(norm_logits)]\n",
+    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
+    "            answer = (\n",
+    "                answer\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Raw Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Norm Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), norm_logits))\n",
+    "            )\n",
+    "        else:\n",
+    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(logits)]\n",
+    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
+    "            answer = (\n",
+    "                answer\n",
+    "                + \"\\n\\n\"\n",
+    "                + \"Pred.: \"\n",
+    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
+    "            )\n",
+    "        answers.append(answer)\n",
+    "        correct_list.append(correct)\n",
+    "    system_df = pd.DataFrame({\"id\": ids, \"output\": answers, \"correct\": correct_list})\n",
+    "    return system_df\n",
+    "\n",
+    "\n",
+    "# Accumulate data for all tasks\n",
+    "for index, task in enumerate(tasks_dict):\n",
+    "    data = json.load(open(task[\"file\"]))\n",
+    "    prefix_answers = get_prefix_answers(data)\n",
+    "    if index == 0:\n",
+    "        df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
+    "        system_df = generate_system_df(data, prefix_answers, df, task[\"name\"])\n",
+    "    else:\n",
+    "        current_df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
+    "        df = pd.concat([df, current_df])\n",
+    "        system_df = pd.concat(\n",
+    "            [\n",
+    "                system_df,\n",
+    "                generate_system_df(data, prefix_answers, current_df, task[\"name\"]),\n",
+    "            ]\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create the Zeno Project\n",
+    "\n",
+    "This will load an existing project if you already have a project with the same name.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = ZenoClient(API_KEY)\n",
+    "\n",
+    "project = client.create_project(\n",
+    "    name=\"Eleuther Harness Project\",\n",
+    "    view=\"text-classification\",\n",
+    "    metrics=[ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Upload the Dataset\n",
+    "\n",
+    "Uploads the task data to Zeno.\n",
+    "\n",
+    "**Only run this if you want to upload fresh data. This will delete all existing systems.**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"labels\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Upload the System\n",
+    "\n",
+    "Uploads the model output to Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project.upload_system(\n",
+    "    system_df,\n",
+    "    name=args[\"model_name\"],\n",
+    "    id_column=\"id\",\n",
+    "    output_column=\"output\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}