feat: add open llm leaderboard notebooks to examples

zeno-ml · Oct 18, 2023 · 2ceaea9 · 2ceaea9
1 parent 5ea5b48
commit 2ceaea9
Show file tree

Hide file tree

Showing 6 changed files with 806 additions and 0 deletions.
diff --git a/examples/open_llm_leaderboard/README.md b/examples/open_llm_leaderboard/README.md
@@ -0,0 +1,23 @@
+# Open LLM Leaderboard
+
+This example downloads data from the [Open LLM Leaderboard][1] and ingests it
+into a Zeno Project.
+There are four tasks in the leaderboard, for each task, there is one notebook
+to upload your data.
+
+You can configure which models to upload data for.
+
+## Setup
+
+To run this example, you'll need to install the requirements.
+
+```bash
+pip install -r requirements.txt
+```
+
+You also need to add an environment variable named `ZENO_API_KEY` that contains
+your API key to be able to upload data to Zeno.
+
+Then, simply run any of the notebooks.
+
+[1]: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
diff --git a/examples/open_llm_leaderboard/arc.ipynb b/examples/open_llm_leaderboard/arc.ipynb
@@ -0,0 +1,190 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ARC Task\n",
+    "\n",
+    "ARC is a question-answering task with grade-school science questions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from zeno_client import ZenoClient, ZenoMetric\n",
+    "import datasets\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Feel free to change the list of models used.\n",
+    "You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.\n",
+    "Some of them might not have associated data, you can check this by clicking on the little icon next to the model name.\n",
+    "If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "models = [\"meta-llama/Llama-2-70b-hf\", \"mistralai/Mistral-7B-v0.1\", \"tiiuae/falcon-40b\", \"Riiid/sheep-duck-llama-2-70b-v1.1\", \"AIDC-ai-business/Marcoroni-70B-v1\", \"ICBU-NPU/FashionGPT-70B-V1.1\", \"adonlee/LLaMA_2_70B_LoRA\", \"uni-tianyan/Uni-TianYan\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_data(model: str):\n",
+    "    data_path = \"details_\" + model.replace(\"/\", \"__\")\n",
+    "    return datasets.load_dataset(\n",
+    "        \"open-llm-leaderboard/\" + data_path,\n",
+    "        \"harness_arc_challenge_25\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n",
+    "\n",
+    "def generate_dataset(df):\n",
+    "    df_lim = df[[\"example\", \"choices\", \"gold\"]]\n",
+    "    df_lim.loc[:, \"data\"] = df_lim.apply(lambda x: \"\\n\" + x[\"example\"] + \"\\n\" + \"\\n\".join(f\"{labels[i]}: {x}\" for i,x in enumerate(x['choices'])), axis=1)\n",
+    "    df_lim.loc[:, \"num_choices\"] = df_lim.apply(lambda x: str(len(x['choices'])), axis=1)\n",
+    "    df_lim.loc[:, \"label\"] = df_lim.apply(lambda x: labels[x[\"gold\"]], axis=1)\n",
+    "    df_lim = df_lim.drop(columns=[\"example\", \"choices\", \"gold\"])\n",
+    "    df_lim[\"id\"] = df_lim.index \n",
+    "    return df_lim\n",
+    "\n",
+    "def generate_system(df):\n",
+    "    df_system = df[[\"predictions\", \"acc_norm\", \"choices\", \"acc\"]]\n",
+    "    df_system[\"answer_raw\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions'])], axis=1)\n",
+    "    df_system[\"answer_norm\"] = df_system.apply(lambda x: labels[np.argmax(x['predictions']/np.array([float(len(i)) for i in x['choices']]))], axis=1)\n",
+    "    df_system[\"predictions\"] = df_system.apply(lambda x: x['answer_norm'] + \"\\n\\n\" + \"Raw Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions'])) + \"\\nNorm Pred.: \" + \", \".join(map(lambda y: str(round(y, 2)), x['predictions']/np.array([float(len(i)) for i in x['choices']]))), axis=1)\n",
+    "    df_system[\"correct\"] = df_system.apply(lambda x: True if x['acc_norm'] > 0 else False, axis=1)\n",
+    "    df_system[\"correct_raw\"] = df_system.apply(lambda x: True if x['acc'] > 0 else False, axis=1)\n",
+    "    df_system = df_system.drop(columns=[\"acc_norm\", \"choices\", \"acc\"])\n",
+    "    df_system[\"id\"] = df_system.index\n",
+    "    return df_system"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Make sure you have your Zeno API key in your environment variables."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "API_KEY = os.environ[\"ZENO_API_KEY\"]\n",
+    "client = ZenoClient(API_KEY)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lets create a project to hold the data for the ARC task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proj = client.create_project(\n",
+    "    name=\"ARC\", \n",
+    "    view=\"text-classification\", \n",
+    "    description=\"ARC (https://arxiv.org/abs/1803.05457) task in the Open-LLM-Leaderboard.\",\n",
+    "    metrics=[\n",
+    "        ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let us now upload the data to the project we just created."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = generate_dataset(get_data(models[0])['latest'].to_pandas())\n",
+    "print(\"\\nYour dataset has {} rows\\n\".format(len(df)))\n",
+    "num_rows = len(df)\n",
+    "proj.upload_dataset(df, id_column=\"id\", label_column=\"label\", data_column=\"data\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, let us upload all the model outputs for the models we specified above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for model in models:\n",
+    "    dataset = get_data(model)['latest'].to_pandas()\n",
+    "    if len(dataset) != num_rows:\n",
+    "        print(\"Skipping {} because it has {} rows instead of {}\".format(model, len(dataset), num_rows))\n",
+    "        continue\n",
+    "    df_system = generate_system(dataset)\n",
+    "    proj.upload_system(df_system, name=model.replace('/', \"__\"), output_column=\"predictions\", id_column=\"id\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "compare",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}