From d0fef2ffd3f77ca72753b26560d43fc6061f5c94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20B=C3=A4uerle?= <alex@a13x.io>
Date: Thu, 21 Dec 2023 16:47:46 +0100
Subject: [PATCH] fix: update harness with link to eleuther docs (#197)

* fix: update harness with link to eleuther docs

Duplicating the harness notebook code here does not make much sense; it should be maintained in one central place, so the README now links to the lm-evaluation-harness docs instead.

* lint
---
 examples/eleuther_harness/README.md        |  25 +-
 examples/eleuther_harness/harness.ipynb    | 353 ---------------------
 examples/eleuther_harness/requirements.txt |   3 -
 3 files changed, 3 insertions(+), 378 deletions(-)
 delete mode 100644 examples/eleuther_harness/harness.ipynb
 delete mode 100644 examples/eleuther_harness/requirements.txt

diff --git a/examples/eleuther_harness/README.md b/examples/eleuther_harness/README.md
index 9b1f03e..46e7652 100644
--- a/examples/eleuther_harness/README.md
+++ b/examples/eleuther_harness/README.md
@@ -1,27 +1,8 @@
 # EleutherAI Harness
 
 Use Zeno to visualize the data from the [Eleuther LM Evaluation Harness][1]!
-There is a notebook for running harness tasks on diferent models and uploading
-the results to Zeno.
-
-You can configure tasks, models, and more directly in the notebook, get a task
-list at:
-[Task List](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md)
-
-## Setup
-
-To run this example, first install the requirements:
-
-```bash
-pip install -r requirements.txt
-```
-
-You'll then need to get an API key from Zeno Hub.
-Create an account at [https://hub.zenoml.com](https://hub.zenoml.com) and navigate
-to [your account page](https://hub.zenoml.com/account) to get an API key.
-Add this key as an environment variable, `ZENO_API_KEY`.
-
-You can now run the notebook to create a Zeno Project for any of the
-harness tasks.
+You can find all the details about how to upload harness results to Zeno in the
+[lm-evaluation-harness repo][2].
 
 [1]: https://github.com/EleutherAI/lm-evaluation-harness
+[2]: https://github.com/EleutherAI/lm-evaluation-harness#visualizing-results
diff --git a/examples/eleuther_harness/harness.ipynb b/examples/eleuther_harness/harness.ipynb
deleted file mode 100644
index c00e7f2..0000000
--- a/examples/eleuther_harness/harness.ipynb
+++ /dev/null
@@ -1,353 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Eleuther AI Harness\n",
-    "\n",
-    "This notebook can be used to run tasks of the eleuther AI harness on most types of models and upload the results to Zeno.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To run the harness, first clone and install its codebase.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!git clone git@github.com:EleutherAI/lm-evaluation-harness.git\n",
-    "%pip install -e lm-evaluation-harness"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from zeno_client import ZenoClient, ZenoMetric\n",
-    "import pandas as pd\n",
-    "from lm_eval import evaluator, tasks, utils\n",
-    "import json\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "import numpy as np\n",
-    "import numbers\n",
-    "\n",
-    "API_KEY = os.environ[\"ZENO_API_KEY\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Settings\n",
-    "\n",
-    "The following settings can be configured to your needs.\n",
-    "\n",
-    "**Warning: This might take very long to run depending on your hardware. We recommend setting the limit argument so as to only evaluate a subset of the task.**\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "args = {\n",
-    "    \"model\": \"hf-causal\",  # for huggingface models\n",
-    "    \"model_args\": \"pretrained=EleutherAI/gpt-neo-2.7B\",  # this specifies the HF model to use\n",
-    "    \"model_name\": \"llama-7b\",  # used to identify the model on upload\n",
-    "    \"tasks\": \"bigbench_geometric_shapes\",  # tasks to run, multiple tasks separated by commas\n",
-    "    \"num_fewshot\": 3,  # num fewshot examples to include in the prompt\n",
-    "    \"device\": \"cuda\",  # device to run the model on, \"cuda\" if you have a GPU\n",
-    "    \"output_path\": \"./output/latest_results.json\",  # path to write results to\n",
-    "    \"limit\": 250,  # limit on the number of examples to run\n",
-    "    \"no_cache\": False,  # don't cache results\n",
-    "    \"output_base_path\": \"./output\",  # where to write the outputs to\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Running Evaluation\n",
-    "\n",
-    "This evaluates the model on the selected tasks and write the results to the outputs folder. This step might take a while.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "task_names = utils.pattern_match(task_names.split(\",\"), tasks.ALL_TASKS)\n",
-    "results = evaluator.simple_evaluate(\n",
-    "    model=args[\"model\"],\n",
-    "    model_args=args[\"model_args\"],\n",
-    "    tasks=task_names,\n",
-    "    num_fewshot=args[\"num_fewshot\"],\n",
-    "    batch_size=None,\n",
-    "    max_batch_size=None,\n",
-    "    device=args[\"device\"],\n",
-    "    no_cache=args[\"no_cache\"],\n",
-    "    limit=args[\"limit\"],\n",
-    "    description_dict={},\n",
-    "    decontamination_ngrams_path=None,\n",
-    "    check_integrity=None,\n",
-    "    write_out=True,\n",
-    "    output_base_path=Path(args[\"output_base_path\"], \"tasks\"),\n",
-    ")\n",
-    "\n",
-    "dumped = json.dumps(results, indent=2)\n",
-    "\n",
-    "os.makedirs(args[\"output_base_path\"], exist_ok=True)\n",
-    "with open(Path(args[\"output_base_path\"], \"latest_results.json\"), \"w\") as f:\n",
-    "    f.write(dumped)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Get all Output Files for the Tasks\n",
-    "\n",
-    "Loads all task output files to be able to combine them and upload them to Zeno.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get all output files for the tasks\n",
-    "dir_path = Path(args[\"output_base_path\"], \"tasks\")\n",
-    "files = list(dir_path.glob(\"*\"))\n",
-    "tasks_dict = [\n",
-    "    {\n",
-    "        \"name\": file.name.replace(\"_write_out_info.json\", \"\").replace(\"bigbench_\", \"\"),\n",
-    "        \"file\": file,\n",
-    "    }\n",
-    "    for file in files\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Helper Functions to Format the Data\n",
-    "\n",
-    "These functions are used to get the data in the right shape to be uploaded to Zeno.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def common_prefix(strings: list[str]):\n",
-    "    prefix: list[str] = []\n",
-    "    prefix_lines: list[list[str]] = list(map(lambda x: x.splitlines(), strings))\n",
-    "    index = 0\n",
-    "    while all(i[index] == prefix_lines[0][index] for i in prefix_lines):\n",
-    "        prefix.append(prefix_lines[0][index])\n",
-    "        index += 1\n",
-    "    common_lines = list(map(lambda x: x[index], prefix_lines))\n",
-    "    common_in_line = os.path.commonprefix(common_lines)\n",
-    "    prefix.append(common_in_line)\n",
-    "    common = \"\\n\".join(prefix)\n",
-    "    common_in_line_answers: list[list[str]] = list(\n",
-    "        map(lambda x: [x.replace(common_in_line, \"\")], common_lines)\n",
-    "    )\n",
-    "    remaining_answers = list(map(lambda x: x[index + 1 :], prefix_lines))\n",
-    "    answers = list(map(list.__add__, common_in_line_answers, remaining_answers))\n",
-    "    answers = list(map(lambda x: \"\\n\".join(x), answers))\n",
-    "    return {\"prefix\": common, \"answers\": answers}\n",
-    "\n",
-    "\n",
-    "def get_prefix_answers(data):\n",
-    "    prefix_answers = []\n",
-    "    keys = list(map(lambda x: x.keys(), data))\n",
-    "    prompt_keys = list(\n",
-    "        map(lambda x: list(filter(lambda y: y.startswith(\"prompt_\"), x)), keys)\n",
-    "    )\n",
-    "    for element_index, element in enumerate(data):\n",
-    "        prompts = list(map(lambda x: element[x], prompt_keys[element_index]))\n",
-    "        prefix_answers.append(common_prefix(prompts))\n",
-    "    return prefix_answers\n",
-    "\n",
-    "\n",
-    "def generate_dataset(data, prefix_answers, task_name):\n",
-    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
-    "    labels = list(map(lambda x: x[\"truth\"], data))\n",
-    "    df = pd.DataFrame(\n",
-    "        {\n",
-    "            \"id\": ids,\n",
-    "            \"data\": list(\n",
-    "                map(\n",
-    "                    lambda x: x[\"prefix\"]\n",
-    "                    + \"\\n\"\n",
-    "                    + \"\\n\".join(list(map(lambda y: \"- \" + y, x[\"answers\"]))),\n",
-    "                    prefix_answers,\n",
-    "                )\n",
-    "            ),\n",
-    "            \"task\": task_name,\n",
-    "            \"labels\": labels,\n",
-    "        }\n",
-    "    )\n",
-    "    return df\n",
-    "\n",
-    "\n",
-    "def generate_system_df(data, prefix_answers, df, task_name):\n",
-    "    ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n",
-    "    keys = list(map(lambda x: x.keys(), data))\n",
-    "    logit_keys = list(\n",
-    "        map(lambda x: list(filter(lambda y: y.startswith(\"logit_\"), x)), keys)\n",
-    "    )\n",
-    "    answers = []\n",
-    "    correct_list = []\n",
-    "    for element_index, element in enumerate(data):\n",
-    "        answer = \"\"\n",
-    "        correct = False\n",
-    "        logits = list(\n",
-    "            filter(\n",
-    "                lambda y: isinstance(y, numbers.Number),\n",
-    "                map(lambda x: element[x], logit_keys[element_index]),\n",
-    "            )\n",
-    "        )\n",
-    "        if \"acc_norm\" in keys[element_index]:\n",
-    "            norm_logits = logits / np.array(\n",
-    "                [float(len(i)) for i in prefix_answers[element_index][\"answers\"]]\n",
-    "            )\n",
-    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(norm_logits)]\n",
-    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
-    "            answer = (\n",
-    "                answer\n",
-    "                + \"\\n\\n\"\n",
-    "                + \"Raw Pred.: \"\n",
-    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
-    "                + \"\\n\\n\"\n",
-    "                + \"Norm Pred.: \"\n",
-    "                + \", \".join(map(lambda y: str(round(y, 2)), norm_logits))\n",
-    "            )\n",
-    "        else:\n",
-    "            answer = prefix_answers[element_index][\"answers\"][np.argmax(logits)]\n",
-    "            correct = df[\"labels\"][element_index].endswith(answer)\n",
-    "            answer = (\n",
-    "                answer\n",
-    "                + \"\\n\\n\"\n",
-    "                + \"Pred.: \"\n",
-    "                + \", \".join(map(lambda y: str(round(y, 2)), logits))\n",
-    "            )\n",
-    "        answers.append(answer)\n",
-    "        correct_list.append(correct)\n",
-    "    system_df = pd.DataFrame({\"id\": ids, \"output\": answers, \"correct\": correct_list})\n",
-    "    return system_df\n",
-    "\n",
-    "\n",
-    "# Accumulate data for all tasks\n",
-    "for index, task in enumerate(tasks_dict):\n",
-    "    data = json.load(open(task[\"file\"]))\n",
-    "    prefix_answers = get_prefix_answers(data)\n",
-    "    if index == 0:\n",
-    "        df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
-    "        system_df = generate_system_df(data, prefix_answers, df, task[\"name\"])\n",
-    "    else:\n",
-    "        current_df = generate_dataset(data, prefix_answers, task[\"name\"])\n",
-    "        df = pd.concat([df, current_df])\n",
-    "        system_df = pd.concat(\n",
-    "            [\n",
-    "                system_df,\n",
-    "                generate_system_df(data, prefix_answers, current_df, task[\"name\"]),\n",
-    "            ]\n",
-    "        )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Create the Zeno Project\n",
-    "\n",
-    "This will load an existing project if you already have a project with the same name.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "client = ZenoClient(API_KEY, endpoint=ENDPOINT)\n",
-    "\n",
-    "project = client.create_project(\n",
-    "    name=\"Eleuther Harness Project\",\n",
-    "    view=\"text-classification\",\n",
-    "    metrics=[ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])],\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Upload the Dataset\n",
-    "\n",
-    "Uploads the task data to Zeno.\n",
-    "\n",
-    "**Only run this if you want to upload fresh data. This will delete all existing systems.**\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "project.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"labels\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Upload the System\n",
-    "\n",
-    "Uploads the model output to Zeno.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "project.upload_system(\n",
-    "    system_df,\n",
-    "    name=args[\"model_name\"],\n",
-    "    id_column=\"id\",\n",
-    "    output_column=\"output\",\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/examples/eleuther_harness/requirements.txt b/examples/eleuther_harness/requirements.txt
deleted file mode 100644
index ccb9078..0000000
--- a/examples/eleuther_harness/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-zeno_client
-pandas
-numpy
\ No newline at end of file