From d0fef2ffd3f77ca72753b26560d43fc6061f5c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20B=C3=A4uerle?= Date: Thu, 21 Dec 2023 16:47:46 +0100 Subject: [PATCH] fix: update harness with link to eleuther docs (#197) * fix: update harness with link to eleuther docs It does not seem to make a ton of sense if we duplicate the code here. Should probably have one central place to maintain it. * lint --- examples/eleuther_harness/README.md | 25 +- examples/eleuther_harness/harness.ipynb | 353 --------------------- examples/eleuther_harness/requirements.txt | 3 - 3 files changed, 3 insertions(+), 378 deletions(-) delete mode 100644 examples/eleuther_harness/harness.ipynb delete mode 100644 examples/eleuther_harness/requirements.txt diff --git a/examples/eleuther_harness/README.md b/examples/eleuther_harness/README.md index 9b1f03e..46e7652 100644 --- a/examples/eleuther_harness/README.md +++ b/examples/eleuther_harness/README.md @@ -1,27 +1,8 @@ # EleutherAI Harness Use Zeno to visualize the data from the [Eleuther LM Evaluation Harness][1]! -There is a notebook for running harness tasks on diferent models and uploading -the results to Zeno. - -You can configure tasks, models, and more directly in the notebook, get a task -list at: -[Task List](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) - -## Setup - -To run this example, first install the requirements: - -```bash -pip install -r requirements.txt -``` - -You'll then need to get an API key from Zeno Hub. -Create an account at [https://hub.zenoml.com](https://hub.zenoml.com) and navigate -to [your account page](https://hub.zenoml.com/account) to get an API key. -Add this key as an environment variable, `ZENO_API_KEY`. - -You can now run the notebook to create a Zeno Project for any of the -harness tasks. +You can find all the details about how to upload harness results to Zeno in the +[lm-evaluation-harness repo][2]. [1]: https://github.com/EleutherAI/lm-evaluation-harness +[2]: https://github.com/EleutherAI/lm-evaluation-harness#visualizing-results diff --git a/examples/eleuther_harness/harness.ipynb b/examples/eleuther_harness/harness.ipynb deleted file mode 100644 index c00e7f2..0000000 --- a/examples/eleuther_harness/harness.ipynb +++ /dev/null @@ -1,353 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Eleuther AI Harness\n", - "\n", - "This notebook can be used to run tasks of the eleuther AI harness on most types of models and upload the results to Zeno.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To run the harness, first clone and install its codebase.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!git clone git@github.com:EleutherAI/lm-evaluation-harness.git\n", - "%pip install -e lm-evaluation-harness" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zeno_client import ZenoClient, ZenoMetric\n", - "import pandas as pd\n", - "from lm_eval import evaluator, tasks, utils\n", - "import json\n", - "import os\n", - "from pathlib import Path\n", - "import numpy as np\n", - "import numbers\n", - "\n", - "API_KEY = os.environ[\"ZENO_API_KEY\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Settings\n", - "\n", - "The following settings can be configured to your needs.\n", - "\n", - "**Warning: This might take very long to run depending on your hardware. We recommend setting the limit argument so as to only evaluate a subset of the task.**\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "args = {\n", - " \"model\": \"hf-causal\", # for huggingface models\n", - " \"model_args\": \"pretrained=EleutherAI/gpt-neo-2.7B\", # this specifies the HF model to use\n", - " \"model_name\": \"llama-7b\", # used to identify the model on upload\n", - " \"tasks\": \"bigbench_geometric_shapes\", # tasks to run, multiple tasks separated by commas\n", - " \"num_fewshot\": 3, # num fewshot examples to include in the prompt\n", - " \"device\": \"cuda\", # device to run the model on, \"cuda\" if you have a GPU\n", - " \"output_path\": \"./output/latest_results.json\", # path to write results to\n", - " \"limit\": 250, # limit on the number of examples to run\n", - " \"no_cache\": False, # don't cache results\n", - " \"output_base_path\": \"./output\", # where to write the outputs to\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running Evaluation\n", - "\n", - "This evaluates the model on the selected tasks and write the results to the outputs folder. This step might take a while.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "task_names = utils.pattern_match(task_names.split(\",\"), tasks.ALL_TASKS)\n", - "results = evaluator.simple_evaluate(\n", - " model=args[\"model\"],\n", - " model_args=args[\"model_args\"],\n", - " tasks=task_names,\n", - " num_fewshot=args[\"num_fewshot\"],\n", - " batch_size=None,\n", - " max_batch_size=None,\n", - " device=args[\"device\"],\n", - " no_cache=args[\"no_cache\"],\n", - " limit=args[\"limit\"],\n", - " description_dict={},\n", - " decontamination_ngrams_path=None,\n", - " check_integrity=None,\n", - " write_out=True,\n", - " output_base_path=Path(args[\"output_base_path\"], \"tasks\"),\n", - ")\n", - "\n", - "dumped = json.dumps(results, indent=2)\n", - "\n", - "os.makedirs(args[\"output_base_path\"], exist_ok=True)\n", - "with open(Path(args[\"output_base_path\"], \"latest_results.json\"), \"w\") as f:\n", - " f.write(dumped)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get all Output Files for the Tasks\n", - "\n", - "Loads all task output files to be able to combine them and upload them to Zeno.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get all output files for the tasks\n", - "dir_path = Path(args[\"output_base_path\"], \"tasks\")\n", - "files = list(dir_path.glob(\"*\"))\n", - "tasks_dict = [\n", - " {\n", - " \"name\": file.name.replace(\"_write_out_info.json\", \"\").replace(\"bigbench_\", \"\"),\n", - " \"file\": file,\n", - " }\n", - " for file in files\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Helper Functions to Format the Data\n", - "\n", - "These functions are used to get the data in the right shape to be uploaded to Zeno.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def common_prefix(strings: list[str]):\n", - " prefix: list[str] = []\n", - " prefix_lines: list[list[str]] = list(map(lambda x: x.splitlines(), strings))\n", - " index = 0\n", - " while all(i[index] == prefix_lines[0][index] for i in prefix_lines):\n", - " prefix.append(prefix_lines[0][index])\n", - " index += 1\n", - " common_lines = list(map(lambda x: x[index], prefix_lines))\n", - " common_in_line = os.path.commonprefix(common_lines)\n", - " prefix.append(common_in_line)\n", - " common = \"\\n\".join(prefix)\n", - " common_in_line_answers: list[list[str]] = list(\n", - " map(lambda x: [x.replace(common_in_line, \"\")], common_lines)\n", - " )\n", - " remaining_answers = list(map(lambda x: x[index + 1 :], prefix_lines))\n", - " answers = list(map(list.__add__, common_in_line_answers, remaining_answers))\n", - " answers = list(map(lambda x: \"\\n\".join(x), answers))\n", - " return {\"prefix\": common, \"answers\": answers}\n", - "\n", - "\n", - "def get_prefix_answers(data):\n", - " prefix_answers = []\n", - " keys = list(map(lambda x: x.keys(), data))\n", - " prompt_keys = list(\n", - " map(lambda x: list(filter(lambda y: y.startswith(\"prompt_\"), x)), keys)\n", - " )\n", - " for element_index, element in enumerate(data):\n", - " prompts = list(map(lambda x: element[x], prompt_keys[element_index]))\n", - " prefix_answers.append(common_prefix(prompts))\n", - " return prefix_answers\n", - "\n", - "\n", - "def generate_dataset(data, prefix_answers, task_name):\n", - " ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n", - " labels = list(map(lambda x: x[\"truth\"], data))\n", - " df = pd.DataFrame(\n", - " {\n", - " \"id\": ids,\n", - " \"data\": list(\n", - " map(\n", - " lambda x: x[\"prefix\"]\n", - " + \"\\n\"\n", - " + \"\\n\".join(list(map(lambda y: \"- \" + y, x[\"answers\"]))),\n", - " prefix_answers,\n", - " )\n", - " ),\n", - " \"task\": task_name,\n", - " \"labels\": labels,\n", - " }\n", - " )\n", - " return df\n", - "\n", - "\n", - "def generate_system_df(data, prefix_answers, df, task_name):\n", - " ids = list(map(lambda x: task_name + str(x[\"doc_id\"]), data))\n", - " keys = list(map(lambda x: x.keys(), data))\n", - " logit_keys = list(\n", - " map(lambda x: list(filter(lambda y: y.startswith(\"logit_\"), x)), keys)\n", - " )\n", - " answers = []\n", - " correct_list = []\n", - " for element_index, element in enumerate(data):\n", - " answer = \"\"\n", - " correct = False\n", - " logits = list(\n", - " filter(\n", - " lambda y: isinstance(y, numbers.Number),\n", - " map(lambda x: element[x], logit_keys[element_index]),\n", - " )\n", - " )\n", - " if \"acc_norm\" in keys[element_index]:\n", - " norm_logits = logits / np.array(\n", - " [float(len(i)) for i in prefix_answers[element_index][\"answers\"]]\n", - " )\n", - " answer = prefix_answers[element_index][\"answers\"][np.argmax(norm_logits)]\n", - " correct = df[\"labels\"][element_index].endswith(answer)\n", - " answer = (\n", - " answer\n", - " + \"\\n\\n\"\n", - " + \"Raw Pred.: \"\n", - " + \", \".join(map(lambda y: str(round(y, 2)), logits))\n", - " + \"\\n\\n\"\n", - " + \"Norm Pred.: \"\n", - " + \", \".join(map(lambda y: str(round(y, 2)), norm_logits))\n", - " )\n", - " else:\n", - " answer = prefix_answers[element_index][\"answers\"][np.argmax(logits)]\n", - " correct = df[\"labels\"][element_index].endswith(answer)\n", - " answer = (\n", - " answer\n", - " + \"\\n\\n\"\n", - " + \"Pred.: \"\n", - " + \", \".join(map(lambda y: str(round(y, 2)), logits))\n", - " )\n", - " answers.append(answer)\n", - " correct_list.append(correct)\n", - " system_df = pd.DataFrame({\"id\": ids, \"output\": answers, \"correct\": correct_list})\n", - " return system_df\n", - "\n", - "\n", - "# Accumulate data for all tasks\n", - "for index, task in enumerate(tasks_dict):\n", - " data = json.load(open(task[\"file\"]))\n", - " prefix_answers = get_prefix_answers(data)\n", - " if index == 0:\n", - " df = generate_dataset(data, prefix_answers, task[\"name\"])\n", - " system_df = generate_system_df(data, prefix_answers, df, task[\"name\"])\n", - " else:\n", - " current_df = generate_dataset(data, prefix_answers, task[\"name\"])\n", - " df = pd.concat([df, current_df])\n", - " system_df = pd.concat(\n", - " [\n", - " system_df,\n", - " generate_system_df(data, prefix_answers, current_df, task[\"name\"]),\n", - " ]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the Zeno Project\n", - "\n", - "This will load an existing project if you already have a project with the same name.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = ZenoClient(API_KEY, endpoint=ENDPOINT)\n", - "\n", - "project = client.create_project(\n", - " name=\"Eleuther Harness Project\",\n", - " view=\"text-classification\",\n", - " metrics=[ZenoMetric(name=\"accuracy\", type=\"mean\", columns=[\"correct\"])],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Upload the Dataset\n", - "\n", - "Uploads the task data to Zeno.\n", - "\n", - "**Only run this if you want to upload fresh data. This will delete all existing systems.**\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "project.upload_dataset(df, id_column=\"id\", data_column=\"data\", label_column=\"labels\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Upload the System\n", - "\n", - "Uploads the model output to Zeno.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "project.upload_system(\n", - " system_df,\n", - " name=args[\"model_name\"],\n", - " id_column=\"id\",\n", - " output_column=\"output\",\n", - ")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/eleuther_harness/requirements.txt b/examples/eleuther_harness/requirements.txt deleted file mode 100644 index ccb9078..0000000 --- a/examples/eleuther_harness/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -zeno_client -pandas -numpy \ No newline at end of file