WIP: New eval quickstart #575

Merged · 15 commits · Dec 18, 2024
3 changes: 2 additions & 1 deletion .prettierignore
@@ -1,4 +1,5 @@
node_modules
build
.docusaurus
docs/api
docs/api
docs/evaluation
Binary file modified docs/evaluation/how_to_guides/static/view_experiment.gif
330 changes: 273 additions & 57 deletions docs/evaluation/index.mdx
@@ -16,21 +16,21 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls";

This quick start will get you up and running with our evaluation SDK and Experiments UI.

## 1. Install LangSmith
## 1. Install Dependencies

<CodeTabs
tabs={[
{
value: "python",
label: "Python",
language: "bash",
content: `pip install -U langsmith`,
content: `pip install -U langsmith openai pydantic`,
},
{
value: "typescript",
label: "TypeScript",
language: "bash",
content: `yarn add langsmith`,
content: `yarn add langsmith openai zod`,
},
]}
groupId="client-language"
@@ -45,76 +45,292 @@ To create an API key head to the <RegionalUrl text='Settings page' suffix='/sett
<CodeTabs
tabs={[
ShellBlock(`export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=<your-api-key>`),
export LANGCHAIN_API_KEY="<your-langchain-api-key>"
# This example uses OpenAI, but you can use any LLM provider
export OPENAI_API_KEY="<your-openai-api-key>"`),
]}
groupId="client-language"
/>
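
If you'd rather configure these from code than from your shell, a minimal Python sketch (assuming the same variable names as above) is:

```python
import os

# Set these before constructing the LangSmith and OpenAI clients.
# The OpenAI key is only needed because this quickstart's example app calls OpenAI.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "<your-langchain-api-key>"
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"
```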

## 4. Run your evaluation
## 3. Import Dependencies

<CodeTabs
tabs={[
python({caption: "Requires `langsmith>=0.2.0`"})`
from langsmith import Client
{
value: "python",
label: "Python",
content: `from langsmith import wrappers, Client
from pydantic import BaseModel, Field
from openai import OpenAI

# 1. Create and/or select your dataset
client = Client()
dataset = client.clone_public_dataset(
"https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d"
)
client = Client()
openai_client = wrappers.wrap_openai(OpenAI())`,
},
{
value: "typescript",
label: "TypeScript",
content: `import { Client } from "langsmith";
import OpenAI from "openai";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import type { EvaluationResult } from "langsmith/evaluation";
import { evaluate } from "langsmith/evaluation";

# 2. Define an evaluator
def is_concise(outputs: dict, reference_outputs: dict) -> bool:
return len(outputs["answer"]) < (3 * len(reference_outputs["answer"]))

# 3. Define the interface to your app
def chatbot(inputs: dict) -> dict:
return {"answer": inputs["question"] + " is a good question. I don't know the answer."}

# 4. Run an evaluation
experiment_results = client.evaluate(
chatbot,
data=dataset,
evaluators=[is_concise],
experiment_prefix="my first experiment ",
max_concurrency=4,
)

const client = new Client();

const openai = new OpenAI();`,
Review comment (Contributor): to match python. Suggested change: `const openai = new OpenAI();` → `const openaiClient = new OpenAI();`.

},
]}
groupId="client-language"
/>

## 4. Create a dataset

<CodeTabs
tabs={[
{
value: "python",
label: "Python",
content: `# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

# Create inputs and reference outputs
examples = [
(
"Which country is Mount Kilimanjaro located in?",
"Mount Kilimanjaro is located in Tanzania.",
),
(
"What is Earth's lowest point?",
"Earth's lowest point is The Dead Sea.",
),
]

inputs = [{"question": input_prompt} for input_prompt, _ in examples]
outputs = [{"answer": output_answer} for _, output_answer in examples]

# Programmatically create a dataset in LangSmith
dataset = client.create_dataset(
dataset_name="Sample dataset", description="A sample dataset in LangSmith."
)

# Add examples to the dataset
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
`,
typescript({caption: "Requires `langsmith>=0.2.9`"})`
import { Client } from "langsmith";
import { evaluate } from "langsmith/evaluation";
import type { EvaluationResult } from "langsmith/evaluation";
},
{
value: "typescript",
label: "TypeScript",
content: `
// For other dataset creation methods, see:
// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application

// Create inputs and reference outputs
const examples: [string, string][] = [
[
"Which country is Mount Kilimanjaro located in?",
"Mount Kilimanjaro is located in Tanzania.",
],
[
"What is Earth's lowest point?",
"Earth's lowest point is The Dead Sea.",
],
];

// 1. Define a dataset
const client = new Client();
const datasetName = "my first dataset"
const dataset = await client.clonePublicDataset(
"https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d",
{ datasetName: datasetName }
)\n
// 2. Define an evaluator
function isConcise({ outputs, referenceOutputs }: { outputs?: Record<string, any>, referenceOutputs?: Record<string, any> }): EvaluationResult {
const score = outputs?.answer.length < 3 \* referenceOutputs?.answer.length;
return { key: "is_concise", score: score };
}\n
// 3. Run an evaluation
await evaluate(
(inputs: { question: string }) => {
return {
answer: inputs.question + " Good question. I don't know the answer"
};
}, {
data: datasetName,
evaluators: [isConcise],
experimentPrefix: "my first experiment ",
maxConcurrency: 4,
});`,
const inputs = examples.map(([inputPrompt]) => ({
question: inputPrompt,
}));
const outputs = examples.map(([, outputAnswer]) => ({
answer: outputAnswer,
}));

// Programmatically create a dataset in LangSmith
const dataset = await client.createDataset("Sample dataset", {
description: "A sample dataset in LangSmith.",
});

// Add examples to the dataset
await client.createExamples({
inputs,
outputs,
datasetId: dataset.id,
});
`,
},
]}
groupId="client-language"
/>
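
As an optional sanity check (not part of the original quickstart), you can read the examples back out of the dataset using the Python client from step 3:

```python
# List the examples that were just uploaded to confirm the dataset contents.
for example in client.list_examples(dataset_id=dataset.id):
    print(example.inputs["question"], "->", example.outputs["answer"])
```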

## 5. View Experiments UI
## 5. Define what you're evaluating

<CodeTabs
tabs={[
{
value: "python",
label: "Python",
content: `# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
response = openai_client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{ "role": "system", "content": "Answer the following question accurately" },
{ "role": "user", "content": inputs["question"] },
],
)
return { "response": response.choices[0].message.content.strip() }
Review comment on lines +180 to +184 (Contributor): Python formatting nit. Suggested change — drop the padding spaces inside the dict braces:

{"role": "system", "content": "Answer the following question accurately"},
{"role": "user", "content": inputs["question"]},
],
)
return {"response": response.choices[0].message.content.strip()}

`},
{
value: "typescript",
label: "TypeScript",
content: `// Define the application logic you want to evaluate inside a target function
// The SDK will automatically send the inputs from the dataset to your target function
async function target(inputs: string): Promise<{ response: string }> {
Review comment (Contributor): is inputs not a Record in JS as well? Didn't realize you could have a target that just takes a string.

Reply (Contributor Author): I just destructured it into a string before I sent it over to target.

const response = await openai.chat.completions.create({
model: "gpt-4o-mini",
messages: [
{ role: "system", content: "Answer the following question accurately" },
{ role: "user", content: inputs },
],
});
return { response: response.choices[0].message.content?.trim() || "" };
}
`,
},
]}
groupId="client-language"
/>
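
Before running a full experiment, it can be worth smoke-testing the target on a single dataset-shaped input. A minimal sketch against the Python target above (the question is just an illustrative value):

```python
# Call the target directly to confirm it returns the expected {"response": ...} shape.
sample = target({"question": "Which country is Mount Kilimanjaro located in?"})
print(sample["response"])
```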

## 6. Define evaluator

<CodeTabs
tabs={[
{
value: "python",
label: "Python",
content: `# Define instructions for the LLM judge evaluator
instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
- False: No conceptual match or similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
"""

# Define output schema for the LLM judge
class Grade(BaseModel):
score: bool = Field(
description="Boolean that indicates whether the response is accurate relative to the reference answer"
)

# Define LLM judge that grades the accuracy of the response relative to reference output
def accuracy(outputs: dict, reference_outputs: dict) -> bool:
response = openai_client.beta.chat.completions.parse(
model="gpt-4o-mini",
messages=[
{ "role": "system", "content": instructions },
{
"role": "user",
"content": f"""Ground Truth answer: {reference_outputs["answer"]};
Student's Answer: {outputs["response"]}"""
},
],
Review comment on lines +231 to +238 (Contributor): formatting nit. Suggested change — build the user message in a separate variable:

to_evaluate = (
    f"Ground Truth Answer: {reference_outputs['answer']}\n"
    f"Student Answer: {outputs['response']}"
)
messages=[
    {"role": "system", "content": instructions},
    {"role": "user", "content": to_evaluate},
],

response_format=Grade,
)
return response.choices[0].message.parsed.score`,
},
{
value: "typescript",
label: "TypeScript",
content: `// Define instructions for the LLM judge evaluator
const instructions = \`Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
- False: No conceptual match and similarity
- True: Most or full conceptual match and similarity
- Key criteria: Concept should match, not exact wording.
\`;

// Define context for the LLM judge evaluator
const context = \`Ground Truth answer: {reference}; Student's Answer: {prediction}\`;

// Define output schema for the LLM judge
const ResponseSchema = z.object({
score: z
.boolean()
.describe(
"Boolean that indicates whether the response is accurate relative to the reference answer"
),
});

// Define LLM judge that grades the accuracy of the response relative to reference output
async function accuracy({
outputs,
referenceOutputs,
}: {
outputs?: Record<string, string>;
referenceOutputs?: Record<string, string>;
}): Promise<EvaluationResult> {
const response = await openai.chat.completions.create({
model: "gpt-4o-mini",
messages: [
{ role: "system", content: instructions },
{ role: "user", content: context.replace("{prediction}", outputs?.answer || "").replace("{reference}", referenceOutputs?.answer || "") }
],
response_format: zodResponseFormat(ResponseSchema, "response")
});

return {
key: "accuracy",
score: ResponseSchema.parse(JSON.parse(response.choices[0].message.content || "")).score,
};
}`,
},
]}
groupId="client-language"
/>
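
Likewise, you can exercise the evaluator on its own with hand-written outputs and reference outputs (illustrative values, not from a dataset run) to confirm it returns a boolean:

```python
# Run the LLM-as-judge evaluator on a single hand-crafted pair.
is_accurate = accuracy(
    outputs={"response": "Kilimanjaro is in Tanzania."},
    reference_outputs={"answer": "Mount Kilimanjaro is located in Tanzania."},
)
print(is_accurate)  # True if the judge finds the answers conceptually equivalent
```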

## 7. Run and view results

<CodeTabs tabs={[

{
value: "python",
label: "Python",
content: `# After running the evaluation, a link will be provided to view the results in LangSmith
experiment_results = client.evaluate(
target,
data="Sample dataset",
evaluators=[
accuracy,
# can add multiple evaluators here
],
experiment_prefix="first-eval-in-langsmith",
max_concurrency=2,
)
`},
{
value: "typescript",
label: "TypeScript",
content: `// After running the evaluation, a link will be provided to view the results in LangSmith
await evaluate(
(exampleInput) => {
return target(exampleInput.question);
},
{
data: "Sample dataset",
evaluators: [
accuracy,
// can add multiple evaluators here
],
experimentPrefix: "first-eval-in-langsmith",
maxConcurrency: 2,
}
);
`,
},
]}
groupId="client-language"
/>

Click the link printed out by your evaluation run to access the LangSmith Experiments UI, and explore the results of your evaluation.
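
If you also want to inspect results locally, the Python `experiment_results` object can be converted to a pandas DataFrame — this sketch assumes a recent `langsmith` SDK that exposes `to_pandas()` and that `pandas` is installed:

```python
# Convert the experiment results to a DataFrame for local inspection.
df = experiment_results.to_pandas()
print(df.head())
```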
