From 1983a5f808d4e03ce30043f15d5f2ed7b5c1670e Mon Sep 17 00:00:00 2001 From: Collin Dutter Date: Tue, 7 Jan 2025 13:15:00 -0800 Subject: [PATCH] Use structured output when generating evaluation steps --- .github/ISSUE_TEMPLATE/preapproved.md | 4 ++-- CHANGELOG.md | 3 +++ CONTRIBUTING.md | 10 +++++---- griptape/engines/eval/eval_engine.py | 5 ++--- .../templates/engines/eval/results/system.j2 | 2 -- .../templates/engines/eval/steps/system.j2 | 2 -- tests/unit/engines/eval/test_eval_engine.py | 22 ++++++++----------- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/preapproved.md b/.github/ISSUE_TEMPLATE/preapproved.md index bf5525abd..59331f550 100644 --- a/.github/ISSUE_TEMPLATE/preapproved.md +++ b/.github/ISSUE_TEMPLATE/preapproved.md @@ -1,6 +1,6 @@ --- -name: Pre-Discussed and Approved Topics -about: Only for topics already discussed and approved in the GitHub Discussions section. +name: Pre-Discussed and Approved Topics +about: Only for topics already discussed and approved in the GitHub Discussions section. --- Is this a reproducible bug? If not, please open a new [Github Discussion](https://github.com/orgs/griptape-ai/discussions/new/choose). diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cab7c778..acdde9dd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecated - `FuturesExecutorMixin.futures_executor`. Use `FuturesExecutorMixin.create_futures_executor` instead. +### Changed + +- `EvalEngine` to use structured output when generating evaluation steps. ## [1.1.1] - 2025-01-03 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2db9d8225..8a85628d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,8 +16,8 @@ friendly to new contributors are tagged with "good first issue". **I have a bug!** 1. Search the issue tracker and discussions for similar issues. -2. If you don't have steps to reproduce, open a discussion. -3. If you have steps to reproduce, open an issue. +1. If you don't have steps to reproduce, open a discussion. +1. If you have steps to reproduce, open an issue. **I have an idea for a feature!** @@ -26,7 +26,7 @@ friendly to new contributors are tagged with "good first issue". **I've implemented a feature!** 1. If there is an issue for the feature, open a pull request. -2. If there is no issue, open a discussion and link to your branch. +1. If there is no issue, open a discussion and link to your branch. **I have a question!** @@ -58,7 +58,6 @@ Pull requests should be associated with a previously accepted issue. **If you open a pull request for something that wasn't previously discussed,** it may be closed or remain stale for an indefinite period of time. 
- > [!NOTE] > > **Pull requests are NOT a place to discuss feature design.** Please do @@ -75,16 +74,19 @@ The [Griptape Extension Template](https://github.com/griptape-ai/griptape-extens ## Dev Environment Install all dependencies via Make: + ```shell make install ``` Run tests: + ```shell make test/unit ``` Run checks: + ```shell make check ``` diff --git a/griptape/engines/eval/eval_engine.py b/griptape/engines/eval/eval_engine.py index b3bd258ba..045c0d643 100644 --- a/griptape/engines/eval/eval_engine.py +++ b/griptape/engines/eval/eval_engine.py @@ -11,7 +11,6 @@ from griptape.configs import Defaults from griptape.engines import BaseEvalEngine from griptape.mixins.serializable_mixin import SerializableMixin -from griptape.rules import JsonSchemaRule from griptape.utils import J2 if TYPE_CHECKING: @@ -89,7 +88,6 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]: system_prompt = self.generate_steps_system_template.render( evaluation_params=", ".join(param for param in evaluation_params), criteria=self.criteria, - json_schema_rule=JsonSchemaRule(STEPS_SCHEMA.json_schema("Output Format")), ) user_prompt = self.generate_steps_user_template.render() @@ -99,6 +97,7 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]: Message(system_prompt, role=Message.SYSTEM_ROLE), Message(user_prompt, role=Message.USER_ROLE), ], + output_schema=STEPS_SCHEMA, ), ).to_artifact() @@ -111,7 +110,6 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s evaluation_params=", ".join(param for param in evaluation_params), evaluation_steps=self.evaluation_steps, evaluation_text="\n\n".join(f"{key}: {value}" for key, value in evaluation_params.items()), - json_schema_rule=JsonSchemaRule(RESULTS_SCHEMA.json_schema("Output Format")), ) user_prompt = self.generate_results_user_template.render() @@ -121,6 +119,7 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s Message(system_prompt, role=Message.SYSTEM_ROLE), Message(user_prompt, role=Message.USER_ROLE), ], + output_schema=RESULTS_SCHEMA, ), ).to_text() diff --git a/griptape/templates/engines/eval/results/system.j2 b/griptape/templates/engines/eval/results/system.j2 index 0fb8583ab..3cd9c0780 100644 --- a/griptape/templates/engines/eval/results/system.j2 +++ b/griptape/templates/engines/eval/results/system.j2 @@ -6,5 +6,3 @@ Evaluation Steps: {{ evaluation_steps }} {{ evaluation_text }} - -{{ json_schema_rule }} diff --git a/griptape/templates/engines/eval/steps/system.j2 b/griptape/templates/engines/eval/steps/system.j2 index e6792ba85..5c3f70a77 100644 --- a/griptape/templates/engines/eval/steps/system.j2 +++ b/griptape/templates/engines/eval/steps/system.j2 @@ -3,5 +3,3 @@ You MUST make it clear how to evaluate {{ evaluation_params }} in relation to on Evaluation Criteria: {{ criteria }} - -{{ json_schema_rule }} diff --git a/tests/unit/engines/eval/test_eval_engine.py b/tests/unit/engines/eval/test_eval_engine.py index 5a7dcb80a..7625a80d2 100644 --- a/tests/unit/engines/eval/test_eval_engine.py +++ b/tests/unit/engines/eval/test_eval_engine.py @@ -13,13 +13,11 @@ def engine(self): return EvalEngine( criteria="foo", prompt_driver=MockPromptDriver( - mock_output=json.dumps( - { - "steps": ["mock output"], - "score": 0.0, - "reason": "mock output", - } - ), + mock_structured_output={ + "steps": ["mock output"], + "score": 0.0, + "reason": "mock output", + } ), ) @@ -74,12 +72,10 @@ def test_evaluate(self): engine = EvalEngine( evaluation_steps=["foo"], 
prompt_driver=MockPromptDriver( - mock_output=json.dumps( - { - "score": 0.0, - "reason": "mock output", - } - ), + mock_structured_output={ + "score": 0.0, + "reason": "mock output", + } ), ) score, reason = engine.evaluate(
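
For reviewers who want to exercise the change end to end, here is a minimal usage sketch pieced together from the hunks above; it is an illustration, not part of the patch. The `EvalEngine(criteria=...)` constructor and the `(score, reason)` return shape come directly from the tests, while the keyword arguments passed to `evaluate()` are assumptions for illustration (the test call is truncated in this excerpt), and the engine is assumed to fall back to the prompt driver configured via `griptape.configs.Defaults` when none is passed.

```python
# Usage sketch, assuming a griptape build with this patch applied and a
# default prompt driver configured (e.g. through griptape.configs.Defaults).
from griptape.engines import EvalEngine

engine = EvalEngine(criteria="Determine whether the output answers the input question.")

# The tests above unpack a (score, reason) tuple from evaluate().
# The keyword names below are assumptions; judging by _generate_steps /
# _generate_results, the engine treats them as arbitrary named evaluation
# params that are rendered into the steps and results prompts.
score, reason = engine.evaluate(
    input="What color is the sky?",
    actual_output="The sky is blue.",
)

print(score, reason)
```

The substantive change, as the `eval_engine.py` hunks show, is that `STEPS_SCHEMA` and `RESULTS_SCHEMA` are now passed as `output_schema` on the prompt call, so the driver returns data that already matches the schema, rather than the schema being injected into the system prompt through a `JsonSchemaRule` and the reply parsed back out of free-form text.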