diff --git a/examples/rag/README.md b/examples/rag/README.md new file mode 100644 index 0000000..d825295 --- /dev/null +++ b/examples/rag/README.md @@ -0,0 +1,7 @@ +# Retrieval Augmented Generation (RAG) Evaluation with RAGAS + +This example shows how to use the [ragas](https://docs.ragas.io/) library to evaluate retrieval augmented generation (RAG) models +and visualize the results using Zeno. + +The [rag-eval.ipynb](rag-eval.ipynb) notebook walks through an example on the [ +Financial Opinion Mining and Question Answering (fiqa)](https://sites.google.com/view/fiqa/) dataset. diff --git a/examples/rag/rag-eval.ipynb b/examples/rag/rag-eval.ipynb new file mode 100644 index 0000000..d8be722 --- /dev/null +++ b/examples/rag/rag-eval.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import dotenv\n", + "from datasets import load_dataset\n", + "import pandas as pd\n", + "from zeno_client import ZenoClient, ZenoMetric\n", + "from ragas.metrics import (\n", + " answer_relevancy,\n", + " faithfulness,\n", + " context_recall,\n", + " context_precision,\n", + ")\n", + "from ragas import evaluate\n", + "\n", + "dotenv.load_dotenv(override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n", + "fiqa_eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = evaluate(\n", + " fiqa_eval[\"baseline\"],\n", + " metrics=[\n", + " context_precision,\n", + " faithfulness,\n", + " answer_relevancy,\n", + " context_recall,\n", + " ],\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = result.to_pandas()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = ZenoClient(os.environ[\"ZENO_API_KEY\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "project = client.create_project(\n", + " name=\"ragas fiqa eval\",\n", + " description=\"Evaluation of RAG model using ragas on the fiqa dataset\",\n", + " public=True,\n", + " view={\n", + " \"data\": {\n", + " \"type\": \"vstack\",\n", + " \"keys\": {\n", + " \"question\": {\"type\": \"markdown\"},\n", + " \"texts\": {\n", + " \"type\": \"list\",\n", + " \"elements\": {\"type\": \"markdown\"},\n", + " \"border\": True,\n", + " \"pad\": True,\n", + " },\n", + " },\n", + " },\n", + " \"label\": {\n", + " \"type\": \"markdown\",\n", + " },\n", + " \"output\": {\n", + " \"type\": \"vstack\",\n", + " \"keys\": {\n", + " \"answer\": {\"type\": \"markdown\"},\n", + " \"ground_truths\": {\n", + " \"type\": \"list\",\n", + " \"elements\": {\"type\": \"markdown\"},\n", + " \"border\": True,\n", + " \"pad\": True,\n", + " },\n", + " },\n", + " },\n", + " \"size\": \"large\",\n", + " },\n", + " metrics=[\n", + " ZenoMetric(\n", + " name=\"context_precision\", type=\"mean\", columns=[\"context_precision\"]\n", + " ),\n", + " ZenoMetric(name=\"faithfulness\", type=\"mean\", columns=[\"faithfulness\"]),\n", + " ZenoMetric(name=\"answer_relevancy\", type=\"mean\", columns=[\"answer_relevancy\"]),\n", + " ZenoMetric(name=\"context_recall\", type=\"mean\", columns=[\"context_recall\"]),\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_df = pd.DataFrame(\n", + " {\n", + " \"data\": df.apply(\n", + " lambda x: {\"question\": x[\"question\"], \"texts\": list(x[\"contexts\"])}, axis=1\n", + " ),\n", + " \"label\": df[\"ground_truths\"].apply(lambda x: \"\\n\".join(x)),\n", + " }\n", + ")\n", + "data_df[\"id\"] = data_df.index" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "project.upload_dataset(\n", + " data_df, id_column=\"id\", data_column=\"data\", label_column=\"label\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_df = df[\n", + " [\n", + " \"context_precision\",\n", + " \"faithfulness\",\n", + " \"answer_relevancy\",\n", + " \"context_recall\",\n", + " ]\n", + "].copy()\n", + "output_df[\"output\"] = df.apply(\n", + " lambda x: {\"answer\": x[\"answer\"], \"ground_truths\": list(x[\"ground_truths\"])}, axis=1\n", + ")\n", + "output_df[\"id\"] = output_df.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "project.upload_system(\n", + " output_df, name=\"Base System\", id_column=\"id\", output_column=\"output\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "zeno-build", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt new file mode 100644 index 0000000..da40437 --- /dev/null +++ b/examples/rag/requirements.txt @@ -0,0 +1,5 @@ +ragas +python-dotenv +datasets +pandas +zeno-client