diff --git a/.prettierignore b/.prettierignore
index dc083186..7bb74ff7 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -1,4 +1,5 @@
node_modules
build
.docusaurus
-docs/api
\ No newline at end of file
+docs/api
+docs/evaluation
\ No newline at end of file
diff --git a/docs/evaluation/how_to_guides/static/view_experiment.gif b/docs/evaluation/how_to_guides/static/view_experiment.gif
index 50538de0..0e04c88b 100644
Binary files a/docs/evaluation/how_to_guides/static/view_experiment.gif and b/docs/evaluation/how_to_guides/static/view_experiment.gif differ
diff --git a/docs/evaluation/index.mdx b/docs/evaluation/index.mdx
index c119cfc7..5ca01805 100644
--- a/docs/evaluation/index.mdx
+++ b/docs/evaluation/index.mdx
@@ -16,7 +16,7 @@ import { RegionalUrl } from "@site/src/components/RegionalUrls";
This quick start will get you up and running with our evaluation SDK and Experiments UI.
-## 1. Install LangSmith
+## 1. Install dependencies
-## 4. Run your evaluation
+## 3. Import dependencies
=0.2.0`"})`
- from langsmith import Client
+ {
+ value: "python",
+ label: "Python",
+ content: `from langsmith import wrappers, Client
+from pydantic import BaseModel, Field
+from openai import OpenAI
- # 1. Create and/or select your dataset
- client = Client()
- dataset = client.clone_public_dataset(
- "https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d"
- )
+client = Client()
+openai_client = wrappers.wrap_openai(OpenAI())`,
+ },
+ {
+ value: "typescript",
+ label: "TypeScript",
+ content: `import { Client } from "langsmith";
+import OpenAI from "openai";
+import { z } from "zod";
+import { zodResponseFormat } from "openai/helpers/zod";
+import type { EvaluationResult } from "langsmith/evaluation";
+import { evaluate } from "langsmith/evaluation";
- # 2. Define an evaluator
- def is_concise(outputs: dict, reference_outputs: dict) -> bool:
- return len(outputs["answer"]) < (3 * len(reference_outputs["answer"]))
-
- # 3. Define the interface to your app
- def chatbot(inputs: dict) -> dict:
- return {"answer": inputs["question"] + " is a good question. I don't know the answer."}
-
- # 4. Run an evaluation
- experiment_results = client.evaluate(
- chatbot,
- data=dataset,
- evaluators=[is_concise],
- experiment_prefix="my first experiment ",
- max_concurrency=4,
- )
+const client = new Client();
+
+const openai = new OpenAI();`,
+ },
+ ]}
+ groupId="client-language"
+/>
+
+## 4. Create a dataset
+
+=0.2.9`"})`
-import { Client } from "langsmith";
-import { evaluate } from "langsmith/evaluation";
-import type { EvaluationResult } from "langsmith/evaluation";
+ },
+ {
+ value: "typescript",
+ label: "TypeScript",
+ content: `
+// For other dataset creation methods, see:
+// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
+// https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
+
+// Create inputs and reference outputs
+const examples: [string, string][] = [
+ [
+ "Which country is Mount Kilimanjaro located in?",
+ "Mount Kilimanjaro is located in Tanzania.",
+ ],
+ [
+ "What is Earth's lowest point?",
+ "Earth's lowest point is The Dead Sea.",
+ ],
+];
-// 1. Define a dataset
-const client = new Client();
-const datasetName = "my first dataset"
-const dataset = await client.clonePublicDataset(
-"https://smith.langchain.com/public/a63525f9-bdf2-4512-83e3-077dc9417f96/d",
-{ datasetName: datasetName }
-)\n
-// 2. Define an evaluator
-function isConcise({ outputs, referenceOutputs }: { outputs?: Record, referenceOutputs?: Record }): EvaluationResult {
-const score = outputs?.answer.length < 3 \* referenceOutputs?.answer.length;
-return { key: "is_concise", score: score };
-}\n
-// 3. Run an evaluation
-await evaluate(
-(inputs: { question: string }) => {
-return {
-answer: inputs.question + " Good question. I don't know the answer"
-};
-}, {
-data: datasetName,
-evaluators: [isConcise],
-experimentPrefix: "my first experiment ",
-maxConcurrency: 4,
-});`,
+const inputs = examples.map(([inputPrompt]) => ({
+ question: inputPrompt,
+}));
+const outputs = examples.map(([, outputAnswer]) => ({
+ answer: outputAnswer,
+}));
+
+// Programmatically create a dataset in LangSmith
+const dataset = await client.createDataset("Sample dataset", {
+ description: "A sample dataset in LangSmith.",
+});
+
+// Add examples to the dataset
+await client.createExamples({
+ inputs,
+ outputs,
+ datasetId: dataset.id,
+});
+`,
+},
]}
groupId="client-language"
/>
-## 5. View Experiments UI
+## 5. Define what you're evaluating
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# Define the application logic you want to evaluate inside a target function
+# The SDK will automatically send the inputs from the dataset to your target function
+def target(inputs: dict) -> dict:
+ response = openai_client.chat.completions.create(
+ model="gpt-4o-mini",
+ messages=[
+ { "role": "system", "content": "Answer the following question accurately" },
+ { "role": "user", "content": inputs["question"] },
+ ],
+ )
+ return { "response": response.choices[0].message.content.strip() }
+ `},
+ {
+ value: "typescript",
+ label: "TypeScript",
+ content: `// Define the application logic you want to evaluate inside a target function
+// The SDK will automatically send the inputs from the dataset to your target function
+async function target(inputs: string): Promise<{ response: string }> {
+ const response = await openai.chat.completions.create({
+ model: "gpt-4o-mini",
+ messages: [
+ { role: "system", content: "Answer the following question accurately" },
+ { role: "user", content: inputs },
+ ],
+ });
+ return { response: response.choices[0].message.content?.trim() || "" };
+}
+`,
+ },
+ ]}
+ groupId="client-language"
+/>
+
+## 6. Define an evaluator
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# Define instructions for the LLM judge evaluator
+instructions = """Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
+- False: No conceptual match and similarity
+- True: Most or full conceptual match and similarity
+- Key criteria: Concept should match, not exact wording.
+"""
+
+# Define output schema for the LLM judge
+class Grade(BaseModel):
+    score: bool = Field(
+        description="Boolean that indicates whether the response is accurate relative to the reference answer"
+    )
+
+# Define LLM judge that grades the accuracy of the response relative to reference output
+def accuracy(outputs: dict, reference_outputs: dict) -> bool:
+ response = openai_client.beta.chat.completions.parse(
+ model="gpt-4o-mini",
+ messages=[
+ { "role": "system", "content": instructions },
+ {
+ "role": "user",
+ "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+ Student's Answer: {outputs["response"]}"""
+ },
+ ],
+ response_format=Grade,
+ )
+ return response.choices[0].message.parsed.score`,
+ },
+ {
+ value: "typescript",
+ label: "TypeScript",
+ content: `// Define instructions for the LLM judge evaluator
+const instructions = \`Evaluate Student Answer against Ground Truth for conceptual similarity and classify true or false:
+- False: No conceptual match and similarity
+- True: Most or full conceptual match and similarity
+- Key criteria: Concept should match, not exact wording.
+\`;
+
+// Define context for the LLM judge evaluator
+const context = \`Ground Truth answer: {reference}; Student's Answer: {prediction}\`;
+
+// Define output schema for the LLM judge
+const ResponseSchema = z.object({
+ score: z
+ .boolean()
+ .describe(
+ "Boolean that indicates whether the response is accurate relative to the reference answer"
+ ),
+});
+
+// Define LLM judge that grades the accuracy of the response relative to reference output
+async function accuracy({
+ outputs,
+ referenceOutputs,
+}: {
+  outputs?: Record<string, string>;
+  referenceOutputs?: Record<string, string>;
+}): Promise<EvaluationResult> {
+ const response = await openai.chat.completions.create({
+ model: "gpt-4o-mini",
+ messages: [
+ { role: "system", content: instructions },
+      { role: "user", content: context.replace("{prediction}", outputs?.response || "").replace("{reference}", referenceOutputs?.answer || "") }
+ ],
+ response_format: zodResponseFormat(ResponseSchema, "response")
+ });
+
+ return {
+ key: "accuracy",
+ score: ResponseSchema.parse(JSON.parse(response.choices[0].message.content || "")).score,
+ };
+}`,
+ },
+ ]}
+ groupId="client-language"
+/>
+
+## 7. Run and view results
+
+<CodeTabs
+  tabs={[
+  {
+    value: "python",
+    label: "Python",
+    content: `# After running the evaluation, a link will be provided to view the results in langsmith
+experiment_results = client.evaluate(
+    target,
+    data="Sample dataset",
+    evaluators=[
+        accuracy,
+        # can add multiple evaluators here
+    ],
+    experiment_prefix="first-eval-in-langsmith",
+    max_concurrency=2,
+)`,
+  },
+  {
+    value: "typescript",
+    label: "TypeScript",
+    content: `// After running the evaluation, a link will be provided to view the results in langsmith
+await evaluate(
+  (exampleInput) => {
+ return target(exampleInput.question);
+ },
+ {
+ data: "Sample dataset",
+ evaluators: [
+ accuracy,
+ // can add multiple evaluators here
+ ],
+ experimentPrefix: "first-eval-in-langsmith",
+ maxConcurrency: 2,
+ }
+);
+`,
+ },
+ ]}
+ groupId="client-language"
+/>
Click the link printed out by your evaluation run to access the LangSmith Experiments UI, and explore the results of your evaluation.
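Beyond the Experiments UI, the results can also be inspected locally. A minimal sketch, assuming the Python tab above was used (so `experiment_results` is the object returned by `client.evaluate(...)`), that `pandas` is installed, and a `langsmith` version whose experiment results expose `to_pandas()`:

```python
# Minimal sketch: inspect evaluation results locally instead of (or in addition to)
# the Experiments UI. Assumes `experiment_results` was returned by client.evaluate(...)
# above and that pandas is installed.
df = experiment_results.to_pandas()

# Each row pairs a dataset example with the target's output and evaluator feedback.
print(df.head())
```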