diff --git a/.prettierignore b/.prettierignore
index dc083186..7bb74ff7 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -1,4 +1,5 @@
 node_modules
 build
 .docusaurus
-docs/api
\ No newline at end of file
+docs/api
+docs/evaluation
\ No newline at end of file
diff --git a/docs/evaluation/how_to_guides/static/view_experiment.gif b/docs/evaluation/how_to_guides/static/view_experiment.gif
index 50538de0..0e04c88b 100644
Binary files a/docs/evaluation/how_to_guides/static/view_experiment.gif and b/docs/evaluation/how_to_guides/static/view_experiment.gif differ
diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx
index c119cfc7..5ca01805 100644
--- a/docs/evaluation/index.mdx
+++ b/docs/evaluation/index.mdx
@@ -16,7 +16,7 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls";
 
 This quick start will get you up and running with our evaluation SDK and Experiments UI.
 
-## 1. Install LangSmith
+## 1. Install Dependencies
-## 4. Run your evaluation
-
-<CodeTabs
-  tabs={[
-    python({caption: "Requires `langsmith>=0.2.0`"})`
-      from langsmith import Client
-
-      # 1. Create and/or select your dataset
-      client = Client()
-      dataset = client.clone_public_dataset(
-          "https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d"
-      )
-
-      # 2. Define an evaluator
-      def is_concise(outputs: dict, reference_outputs: dict) -> bool:
-          return len(outputs["answer"]) < (3 * len(reference_outputs["answer"]))
-
-      # 3. Define the interface to your app
-      def chatbot(inputs: dict) -> dict:
-          return {"answer": inputs["question"] + " is a good question. I don't know the answer."}
-
-      # 4. Run an evaluation
-      experiment_results = client.evaluate(
-          chatbot,
-          data=dataset,
-          evaluators=[is_concise],
-          experiment_prefix="my first experiment ",
-          max_concurrency=4,
-      )
-    `,
-    typescript({caption: "Requires `langsmith>=0.2.9`"})`
-      import { Client } from "langsmith";
-      import { evaluate } from "langsmith/evaluation";
-      import type { EvaluationResult } from "langsmith/evaluation";
-
-      // 1. Define a dataset
-      const client = new Client();
-      const datasetName = "my first dataset";
-      const dataset = await client.clonePublicDataset(
-        "https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d",
-        { datasetName: datasetName }
-      );
-
-      // 2. Define an evaluator
-      function isConcise({ outputs, referenceOutputs }: { outputs?: Record<string, any>, referenceOutputs?: Record<string, any> }): EvaluationResult {
-        const score = outputs?.answer.length < 3 * referenceOutputs?.answer.length;
-        return { key: "is_concise", score: score };
-      }
-
-      // 3. Run an evaluation
-      await evaluate(
-        (inputs: { question: string }) => {
-          return {
-            answer: inputs.question + " Good question. I don't know the answer"
-          };
-        },
-        {
-          data: datasetName,
-          evaluators: [isConcise],
-          experimentPrefix: "my first experiment ",
-          maxConcurrency: 4,
-        }
-      );
-    `,
-  ]}
-  groupId="client-language"
-/>
-
-## 5. View Experiments UI
+## 3. Import dependencies
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `from langsmith import wrappers, Client
+from pydantic import BaseModel, Field
+from openai import OpenAI
+
+client = Client()
+openai_client = wrappers.wrap_openai(OpenAI())`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `import { Client } from "langsmith";
+import OpenAI from "openai";
+import { z } from "zod";
+import { zodResponseFormat } from "openai/helpers/zod";
+import type { EvaluationResult } from "langsmith/evaluation";
+import { evaluate } from "langsmith/evaluation";
+
+const client = new Client();
+
+const openai = new OpenAI();`,
+  },
+  ]}
+  groupId="client-language"
+/>
+
+## 4. Create a dataset
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `
+# For other dataset creation methods, see:
+# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
+# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
+
+# Create inputs and reference outputs
+examples = [
+    (
+        "Which country is Mount Kilimanjaro located in?",
+        "Mount Kilimanjaro is located in Tanzania.",
+    ),
+    (
+        "What is Earth's lowest point?",
+        "Earth's lowest point is The Dead Sea.",
+    ),
+]
+
+inputs = [{"question": input_prompt} for input_prompt, _ in examples]
+outputs = [{"answer": output_answer} for _, output_answer in examples]
+
+# Programmatically create a dataset in LangSmith
+dataset = client.create_dataset(
+    dataset_name="Sample dataset", description="A sample dataset in LangSmith."
+)
+
+# Add examples to the dataset
+client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `
+// For other dataset creation methods, see:
+// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
+// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
+
+// Create inputs and reference outputs
+const examples: [string, string][] = [
+  [
+    "Which country is Mount Kilimanjaro located in?",
+    "Mount Kilimanjaro is located in Tanzania.",
+  ],
+  [
+    "What is Earth's lowest point?",
+    "Earth's lowest point is The Dead Sea.",
+  ],
+];
+
+const inputs = examples.map(([inputPrompt]) => ({
+  question: inputPrompt,
+}));
+const outputs = examples.map(([, outputAnswer]) => ({
+  answer: outputAnswer,
+}));
+
+// Programmatically create a dataset in LangSmith
+const dataset = await client.createDataset("Sample dataset", {
+  description: "A sample dataset in LangSmith.",
+});
+
+// Add examples to the dataset
+await client.createExamples({
+  inputs,
+  outputs,
+  datasetId: dataset.id,
+});
+`,
+  },
+  ]}
+  groupId="client-language"
+/>
+
+## 5. Define what you're evaluating
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# Define the application logic you want to evaluate inside a target function
+# The SDK will automatically send the inputs from the dataset to your target function
+def target(inputs: dict) -> dict:
+    response = openai_client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            { "role": "system", "content": "Answer the following question accurately" },
+            { "role": "user", "content": inputs["question"] },
+        ],
+    )
+    return { "response": response.choices[0].message.content.strip() }
+`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `// Define the application logic you want to evaluate inside a target function
+// The SDK will automatically send the inputs from the dataset to your target function
+async function target(inputs: string): Promise<{ response: string }> {
+  const response = await openai.chat.completions.create({
+    model: "gpt-4o-mini",
+    messages: [
+      { role: "system", content: "Answer the following question accurately" },
+      { role: "user", content: inputs },
+    ],
+  });
+  return { response: response.choices[0].message.content?.trim() || "" };
+}
+`,
+  },
+  ]}
+  groupId="client-language"
+/>
+
+## 6. Define evaluator
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# Define instructions for the LLM judge evaluator
+instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
+- False: No conceptual match and similarity
+- True: Most or full conceptual match and similarity
+- Key criteria: Concept should match, not exact wording.
+"""
+
+# Define output schema for the LLM judge
+class Grade(BaseModel):
+    score: bool = Field(
+        description="Boolean that indicates whether the response is accurate relative to the reference answer"
+    )
+
+# Define LLM judge that grades the accuracy of the response relative to reference output
+def accuracy(outputs: dict, reference_outputs: dict) -> bool:
+    response = openai_client.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        messages=[
+            { "role": "system", "content": instructions },
+            {
+                "role": "user",
+                "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+                Student's Answer: {outputs["response"]}"""
+            },
+        ],
+        response_format=Grade,
+    )
+    return response.choices[0].message.parsed.score`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `// Define instructions for the LLM judge evaluator
+const instructions = \`Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
+- False: No conceptual match and similarity
+- True: Most or full conceptual match and similarity
+- Key criteria: Concept should match, not exact wording.
+\`;
+
+// Define context for the LLM judge evaluator
+const context = \`Ground Truth answer: {reference}; Student's Answer: {prediction}\`;
+
+// Define output schema for the LLM judge
+const ResponseSchema = z.object({
+  score: z
+    .boolean()
+    .describe(
+      "Boolean that indicates whether the response is accurate relative to the reference answer"
+    ),
+});
+
+// Define LLM judge that grades the accuracy of the response relative to reference output
+async function accuracy({
+  outputs,
+  referenceOutputs,
+}: {
+  outputs?: Record<string, any>;
+  referenceOutputs?: Record<string, any>;
+}): Promise<EvaluationResult> {
+  const response = await openai.chat.completions.create({
+    model: "gpt-4o-mini",
+    messages: [
+      { role: "system", content: instructions },
+      { role: "user", content: context.replace("{prediction}", outputs?.response || "").replace("{reference}", referenceOutputs?.answer || "") }
+    ],
+    response_format: zodResponseFormat(ResponseSchema, "response")
+  });
+
+  return {
+    key: "accuracy",
+    score: ResponseSchema.parse(JSON.parse(response.choices[0].message.content || "")).score,
+  };
+}`,
+  },
+  ]}
+  groupId="client-language"
+/>
+
+## 7. Run and view results
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# After running the evaluation, a link will be provided to view the results in langsmith
+experiment_results = client.evaluate(
+    target,
+    data="Sample dataset",
+    evaluators=[
+        accuracy,
+        # can add multiple evaluators here
+    ],
+    experiment_prefix="first-eval-in-langsmith",
+    max_concurrency=2,
+)
+`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `// After running the evaluation, a link will be provided to view the results in langsmith
+await evaluate(
+  (exampleInput) => {
+    return target(exampleInput.question);
+  },
+  {
+    data: "Sample dataset",
+    evaluators: [
+      accuracy,
+      // can add multiple evaluators here
+    ],
+    experimentPrefix: "first-eval-in-langsmith",
+    maxConcurrency: 2,
+  }
+);
+`,
+  },
+  ]}
+  groupId="client-language"
+/>
 
 Click the link printed out by your evaluation run to access the LangSmith Experiments UI, and explore the results of your evaluation.
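Note: the snippets in this diff assume the quickstart's API key / environment setup step (unchanged here, so not shown) has already been completed. Below is a minimal sketch of that setup in Python; the variable names (`LANGSMITH_TRACING`, `LANGSMITH_API_KEY`, `OPENAI_API_KEY`) come from the LangSmith SDK's standard configuration rather than from this diff, so treat them as an assumption.

```python
# Minimal sketch of the environment setup assumed by the quickstart snippets above.
# Variable names are the SDK's standard ones (assumption); replace the placeholders with real keys.
import os

os.environ["LANGSMITH_TRACING"] = "true"  # enables tracing for the wrapped OpenAI client (assumption)
os.environ["LANGSMITH_API_KEY"] = "<your-langsmith-api-key>"
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"
```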