From 1983a5f808d4e03ce30043f15d5f2ed7b5c1670e Mon Sep 17 00:00:00 2001 From: Collin Dutter Date: Tue, 7 Jan 2025 13:15:00 -0800 Subject: [PATCH] Use structured output when generating evaluation steps --- .github/ISSUE_TEMPLATE/preapproved.md | 4 ++-- CHANGELOG.md | 3 +++ CONTRIBUTING.md | 10 +++++---- griptape/engines/eval/eval_engine.py | 5 ++--- .../templates/engines/eval/results/system.j2 | 2 -- .../templates/engines/eval/steps/system.j2 | 2 -- tests/unit/engines/eval/test_eval_engine.py | 22 ++++++++----------- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/preapproved.md b/.github/ISSUE_TEMPLATE/preapproved.md index bf5525abd..59331f550 100644 --- a/.github/ISSUE_TEMPLATE/preapproved.md +++ b/.github/ISSUE_TEMPLATE/preapproved.md @@ -1,6 +1,6 @@ --- -name: Pre-Discussed and Approved Topics -about: Only for topics already discussed and approved in the GitHub Discussions section. +name: Pre-Discussed and Approved Topics +about: Only for topics already discussed and approved in the GitHub Discussions section. --- Is this a reproducible bug? If not, please open a new [Github Discussion](https://github.com/orgs/griptape-ai/discussions/new/choose). diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cab7c778..acdde9dd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecated - `FuturesExecutorMixin.futures_executor`. Use `FuturesExecutorMixin.create_futures_executor` instead. +### Changed + +- `EvalEngine` to use structured output when generating evaluation steps. ## [1.1.1] - 2025-01-03 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2db9d8225..8a85628d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,8 +16,8 @@ friendly to new contributors are tagged with "good first issue". **I have a bug!** 1. Search the issue tracker and discussions for similar issues. -2. If you don't have steps to reproduce, open a discussion. -3. If you have steps to reproduce, open an issue. +1. If you don't have steps to reproduce, open a discussion. +1. If you have steps to reproduce, open an issue. **I have an idea for a feature!** @@ -26,7 +26,7 @@ friendly to new contributors are tagged with "good first issue". **I've implemented a feature!** 1. If there is an issue for the feature, open a pull request. -2. If there is no issue, open a discussion and link to your branch. +1. If there is no issue, open a discussion and link to your branch. **I have a question!** @@ -58,7 +58,6 @@ Pull requests should be associated with a previously accepted issue. **If you open a pull request for something that wasn't previously discussed,** it may be closed or remain stale for an indefinite period of time. 
- > [!NOTE] > > **Pull requests are NOT a place to discuss feature design.** Please do @@ -75,16 +74,19 @@ The [Griptape Extension Template](https://github.com/griptape-ai/griptape-extens ## Dev Environment Install all dependencies via Make: + ```shell make install ``` Run tests: + ```shell make test/unit ``` Run checks: + ```shell make check ``` diff --git a/griptape/engines/eval/eval_engine.py b/griptape/engines/eval/eval_engine.py index b3bd258ba..045c0d643 100644 --- a/griptape/engines/eval/eval_engine.py +++ b/griptape/engines/eval/eval_engine.py @@ -11,7 +11,6 @@ from griptape.configs import Defaults from griptape.engines import BaseEvalEngine from griptape.mixins.serializable_mixin import SerializableMixin -from griptape.rules import JsonSchemaRule from griptape.utils import J2 if TYPE_CHECKING: @@ -89,7 +88,6 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]: system_prompt = self.generate_steps_system_template.render( evaluation_params=", ".join(param for param in evaluation_params), criteria=self.criteria, - json_schema_rule=JsonSchemaRule(STEPS_SCHEMA.json_schema("Output Format")), ) user_prompt = self.generate_steps_user_template.render() @@ -99,6 +97,7 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]: Message(system_prompt, role=Message.SYSTEM_ROLE), Message(user_prompt, role=Message.USER_ROLE), ], + output_schema=STEPS_SCHEMA, ), ).to_artifact() @@ -111,7 +110,6 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s evaluation_params=", ".join(param for param in evaluation_params), evaluation_steps=self.evaluation_steps, evaluation_text="\n\n".join(f"{key}: {value}" for key, value in evaluation_params.items()), - json_schema_rule=JsonSchemaRule(RESULTS_SCHEMA.json_schema("Output Format")), ) user_prompt = self.generate_results_user_template.render() @@ -121,6 +119,7 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s Message(system_prompt, role=Message.SYSTEM_ROLE), Message(user_prompt, role=Message.USER_ROLE), ], + output_schema=RESULTS_SCHEMA, ), ).to_text() diff --git a/griptape/templates/engines/eval/results/system.j2 b/griptape/templates/engines/eval/results/system.j2 index 0fb8583ab..3cd9c0780 100644 --- a/griptape/templates/engines/eval/results/system.j2 +++ b/griptape/templates/engines/eval/results/system.j2 @@ -6,5 +6,3 @@ Evaluation Steps: {{ evaluation_steps }} {{ evaluation_text }} - -{{ json_schema_rule }} diff --git a/griptape/templates/engines/eval/steps/system.j2 b/griptape/templates/engines/eval/steps/system.j2 index e6792ba85..5c3f70a77 100644 --- a/griptape/templates/engines/eval/steps/system.j2 +++ b/griptape/templates/engines/eval/steps/system.j2 @@ -3,5 +3,3 @@ You MUST make it clear how to evaluate {{ evaluation_params }} in relation to on Evaluation Criteria: {{ criteria }} - -{{ json_schema_rule }} diff --git a/tests/unit/engines/eval/test_eval_engine.py b/tests/unit/engines/eval/test_eval_engine.py index 5a7dcb80a..7625a80d2 100644 --- a/tests/unit/engines/eval/test_eval_engine.py +++ b/tests/unit/engines/eval/test_eval_engine.py @@ -13,13 +13,11 @@ def engine(self): return EvalEngine( criteria="foo", prompt_driver=MockPromptDriver( - mock_output=json.dumps( - { - "steps": ["mock output"], - "score": 0.0, - "reason": "mock output", - } - ), + mock_structured_output={ + "steps": ["mock output"], + "score": 0.0, + "reason": "mock output", + } ), ) @@ -74,12 +72,10 @@ def test_evaluate(self): engine = EvalEngine( evaluation_steps=["foo"], 
prompt_driver=MockPromptDriver( - mock_output=json.dumps( - { - "score": 0.0, - "reason": "mock output", - } - ), + mock_structured_output={ + "score": 0.0, + "reason": "mock output", + } ), ) score, reason = engine.evaluate(
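
For reviewers who want to exercise the change end to end, here is a minimal usage sketch pieced together from the hunks above; it is an illustration, not part of the patch. The `EvalEngine(criteria=...)` constructor and the `(score, reason)` return shape come directly from the tests, while the keyword arguments passed to `evaluate()` are assumptions for illustration (the test call is truncated in this excerpt), and the engine is assumed to fall back to the prompt driver configured via `griptape.configs.Defaults` when none is passed.

```python
# Usage sketch, assuming a griptape build with this patch applied and a
# default prompt driver configured (e.g. through griptape.configs.Defaults).
from griptape.engines import EvalEngine

engine = EvalEngine(criteria="Determine whether the output answers the input question.")

# The tests above unpack a (score, reason) tuple from evaluate().
# The keyword names below are assumptions; judging by _generate_steps /
# _generate_results, the engine treats them as arbitrary named evaluation
# params that are rendered into the steps and results prompts.
score, reason = engine.evaluate(
    input="What color is the sky?",
    actual_output="The sky is blue.",
)

print(score, reason)
```

The substantive change, as the `eval_engine.py` hunks show, is that `STEPS_SCHEMA` and `RESULTS_SCHEMA` are now passed as `output_schema` on the prompt call, so the driver returns data that already matches the schema, rather than the schema being injected into the system prompt through a `JsonSchemaRule` and the reply parsed back out of free-form text.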