Use structured output when generating evaluation steps (#1519)
collindutter authored Jan 8, 2025
1 parent bc148b5 commit 4ff7fce
Showing 5 changed files with 14 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -22,6 +22,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated

- `FuturesExecutorMixin.futures_executor`. Use `FuturesExecutorMixin.create_futures_executor` instead.
+### Changed
+
+- `EvalEngine` to use structured output when generating evaluation steps.

## [1.1.1] - 2025-01-03

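For context, here is a minimal sketch of how `EvalEngine` can be driven once evaluation steps are generated via structured output, modeled on the unit test updated in this commit. `MockPromptDriver` and `mock_structured_output` come from that test; the import paths and the keyword arguments passed to `evaluate()` are assumptions for illustration, and a real structured-output-capable prompt driver would replace the mock in practice.

    # Sketch only, modeled on tests/unit/engines/eval/test_eval_engine.py below.
    # The evaluate() keyword names are assumptions; the mock stands in for a
    # prompt driver that supports structured output.
    from griptape.engines import EvalEngine
    from tests.mocks.mock_prompt_driver import MockPromptDriver

    engine = EvalEngine(
        criteria="Is the actual output factually consistent with the input?",
        prompt_driver=MockPromptDriver(
            mock_structured_output={
                "steps": ["mock output"],
                "score": 0.0,
                "reason": "mock output",
            }
        ),
    )

    # evaluate() returns a (score, reason) tuple, as the test below relies on.
    score, reason = engine.evaluate(
        input="What is the capital of France?",  # assumed parameter name
        actual_output="Paris",  # assumed parameter name
    )
    print(score, reason)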
5 changes: 2 additions & 3 deletions griptape/engines/eval/eval_engine.py
@@ -11,7 +11,6 @@
from griptape.configs import Defaults
from griptape.engines import BaseEvalEngine
from griptape.mixins.serializable_mixin import SerializableMixin
-from griptape.rules import JsonSchemaRule
from griptape.utils import J2

if TYPE_CHECKING:
@@ -89,7 +88,6 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]:
        system_prompt = self.generate_steps_system_template.render(
            evaluation_params=", ".join(param for param in evaluation_params),
            criteria=self.criteria,
-            json_schema_rule=JsonSchemaRule(STEPS_SCHEMA.json_schema("Output Format")),
        )
        user_prompt = self.generate_steps_user_template.render()

@@ -99,6 +97,7 @@ def _generate_steps(self, evaluation_params: dict[str, str]) -> list[str]:
                    Message(system_prompt, role=Message.SYSTEM_ROLE),
                    Message(user_prompt, role=Message.USER_ROLE),
                ],
+                output_schema=STEPS_SCHEMA,
            ),
        ).to_artifact()

@@ -111,7 +110,6 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s
            evaluation_params=", ".join(param for param in evaluation_params),
            evaluation_steps=self.evaluation_steps,
            evaluation_text="\n\n".join(f"{key}: {value}" for key, value in evaluation_params.items()),
-            json_schema_rule=JsonSchemaRule(RESULTS_SCHEMA.json_schema("Output Format")),
        )
        user_prompt = self.generate_results_user_template.render()

@@ -121,6 +119,7 @@ def _generate_results(self, evaluation_params: dict[str, str]) -> tuple[float, s
                    Message(system_prompt, role=Message.SYSTEM_ROLE),
                    Message(user_prompt, role=Message.USER_ROLE),
                ],
+                output_schema=RESULTS_SCHEMA,
            ),
        ).to_text()

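The `eval_engine.py` change amounts to moving the JSON schema out of the rendered prompt text (`JsonSchemaRule`) and onto the `PromptStack` via `output_schema`. Below is a minimal standalone sketch of that pattern, not the engine's exact code: the `Schema` shape is an assumption (only the name `STEPS_SCHEMA` appears in this diff), the `griptape.common` import path is assumed, and how the returned artifact is parsed is driver-dependent and not shown here.

    # Sketch of the structured-output pattern used above, under the assumptions
    # stated in the lead-in.
    from schema import Schema

    from griptape.common import Message, PromptStack

    STEPS_SCHEMA = Schema({"steps": [str]})  # assumed shape


    def generate_steps(prompt_driver, system_prompt: str, user_prompt: str):
        artifact = prompt_driver.run(
            PromptStack(
                messages=[
                    Message(system_prompt, role=Message.SYSTEM_ROLE),
                    Message(user_prompt, role=Message.USER_ROLE),
                ],
                # Previously the schema was injected into the system prompt via
                # JsonSchemaRule(STEPS_SCHEMA.json_schema("Output Format")); now it
                # rides along with the PromptStack.
                output_schema=STEPS_SCHEMA,
            ),
        ).to_artifact()
        # Parsing the artifact into a list of steps depends on the prompt driver
        # and is not shown in this diff.
        return artifact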
2 changes: 0 additions & 2 deletions griptape/templates/engines/eval/results/system.j2
@@ -6,5 +6,3 @@ Evaluation Steps:
{{ evaluation_steps }}

{{ evaluation_text }}
-
-{{ json_schema_rule }}
2 changes: 0 additions & 2 deletions griptape/templates/engines/eval/steps/system.j2
@@ -3,5 +3,3 @@ You MUST make it clear how to evaluate {{ evaluation_params }} in relation to on

Evaluation Criteria:
{{ criteria }}
-
-{{ json_schema_rule }}
22 changes: 9 additions & 13 deletions tests/unit/engines/eval/test_eval_engine.py
@@ -13,13 +13,11 @@ def engine(self):
        return EvalEngine(
            criteria="foo",
            prompt_driver=MockPromptDriver(
-                mock_output=json.dumps(
-                    {
-                        "steps": ["mock output"],
-                        "score": 0.0,
-                        "reason": "mock output",
-                    }
-                ),
+                mock_structured_output={
+                    "steps": ["mock output"],
+                    "score": 0.0,
+                    "reason": "mock output",
+                }
            ),
        )

@@ -74,12 +72,10 @@ def test_evaluate(self):
        engine = EvalEngine(
            evaluation_steps=["foo"],
            prompt_driver=MockPromptDriver(
-                mock_output=json.dumps(
-                    {
-                        "score": 0.0,
-                        "reason": "mock output",
-                    }
-                ),
+                mock_structured_output={
+                    "score": 0.0,
+                    "reason": "mock output",
+                }
            ),
        )
        score, reason = engine.evaluate(
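The test updates above swap `mock_output=json.dumps(...)` for the already-parsed `mock_structured_output` dict. A hedged sketch of that test pattern written out in full follows; the arguments passed to `evaluate()` are assumptions, since the call site is truncated in this view.

    # Sketch of the updated test pattern; not the repository's exact test.
    import pytest

    from griptape.engines import EvalEngine
    from tests.mocks.mock_prompt_driver import MockPromptDriver


    class TestEvalEngineStructuredOutput:
        @pytest.fixture()
        def engine(self) -> EvalEngine:
            return EvalEngine(
                evaluation_steps=["foo"],
                prompt_driver=MockPromptDriver(
                    mock_structured_output={"score": 0.0, "reason": "mock output"}
                ),
            )

        def test_evaluate(self, engine: EvalEngine) -> None:
            score, reason = engine.evaluate(input="foo", actual_output="bar")  # assumed kwargs
            assert score == 0.0
            assert reason == "mock output"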