[OPIK-531] error tracking sdk: implement error tracking in evaluate calls (#855)

* Draft error tracking

* Implement error tracking for the opik.track decorator, add new unit tests

* Update tests

* Fix non-integration type hints

* Update decorator-based integrations

* Update anthropic and openai tests to include error_info when LLM provider raises an exception

* Fix lint errors

* Update langchain integration. Add new test that handles error coming from openai

* Fix tests

* Fix langchain integration to properly handle errors

* Update pre-commit

* Add show-fixes argument to ruff

* Add verbose flag for ruff

* Remove unused fixture

* Move openai configuration check fixture to conftest for library integration tests

* Fix lint errors

* Remove verbose flag for ruff

* Add diff flag for ruff

* Remove --show-fixes flag for ruff

* Reorder imports

* Add noqa for ANY_STRING import

* Fix lint error

* Fix example

* Add error logging to evaluations

* Add ErrorInfoDict type

* Fix lint errors

* Add missing docstrings

* Update error_info_collector

* Remove error info from update_current_span/trace functions. Update the type hint in scorer.py

* Add unit test for evaluate with failing task
alexkuzmik authored Dec 12, 2024
1 parent 8f75709 commit c4acf28
Showing 3 changed files with 90 additions and 2 deletions.
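
The change threads a new error_info field through evaluation traces. The field is typed as opik.types.ErrorInfoDict and populated by opik.decorator.error_info_collector.collect(...), both imported in scorer.py below. The following is a rough, hypothetical sketch only — inferred from those imports and from the keys the new unit test asserts (exception_type, message, traceback); the actual SDK implementation may differ:

import traceback
from typing import TypedDict


class ErrorInfoDict(TypedDict):
    # Keys matching what the new unit test asserts on the trace.
    exception_type: str  # e.g. "Exception"
    message: str         # str(exception), e.g. "some-error-message"
    traceback: str       # formatted traceback text


def collect(exception: Exception) -> ErrorInfoDict:
    """Build an ErrorInfoDict from a raised exception."""
    return ErrorInfoDict(
        exception_type=type(exception).__name__,
        message=str(exception),
        traceback="".join(
            traceback.format_exception(
                type(exception), exception, exception.__traceback__
            )
        ),
    )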
17 changes: 16 additions & 1 deletion sdks/python/src/opik/evaluation/scorer.py
@@ -4,10 +4,13 @@

from typing import List, Optional, Dict, Any, Union, Callable
from .types import LLMTask
from opik.types import ErrorInfoDict

from opik.api_objects.dataset import dataset, dataset_item
from opik.api_objects.experiment import experiment, experiment_item
from opik.api_objects import opik_client, trace
from opik import context_storage, opik_context, exceptions
from opik.decorator import error_info_collector

from . import test_case, test_result
from .metrics import arguments_helpers, score_result, base_metric
@@ -72,6 +75,8 @@ def _process_item(
],
) -> test_result.TestResult:
try:
error_info: Optional[ErrorInfoDict] = None

trace_data = trace.TraceData(
input=item.get_content(),
name="evaluation_task",
@@ -80,8 +85,13 @@
)
context_storage.set_trace_data(trace_data)
item_content = item.get_content()

LOGGER.debug("Task started, input: %s", item_content)
task_output_ = task(item_content)
try:
task_output_ = task(item_content)
except Exception as exception:
error_info = error_info_collector.collect(exception)
raise
LOGGER.debug("Task finished, output: %s", task_output_)

opik_context.update_current_trace(output=task_output_)
@@ -107,7 +117,12 @@

finally:
trace_data = context_storage.pop_trace_data() # type: ignore

assert trace_data is not None

if error_info is not None:
trace_data.error_info = error_info

trace_data.init_end_time()
client.trace(**trace_data.__dict__)
experiment_item_ = experiment_item.ExperimentItem(
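
To see the shape of the new control flow without the surrounding SDK machinery, here is a self-contained sketch (hypothetical names and simplified trace handling, not the SDK's internals) of the pattern the hunks above add to _process_item: capture error info when the task raises, re-raise, and still log the trace in the finally block with error_info attached:

import traceback
from typing import Any, Callable, Dict, Optional


def run_task_with_error_tracking(
    task: Callable[[Dict[str, Any]], Any],
    item_content: Dict[str, Any],
) -> Dict[str, Any]:
    error_info: Optional[Dict[str, str]] = None
    trace: Dict[str, Any] = {
        "name": "evaluation_task",
        "input": item_content,
        "output": None,
    }
    try:
        try:
            trace["output"] = task(item_content)
        except Exception as exception:
            # Stand-in for error_info_collector.collect(exception) in the diff.
            error_info = {
                "exception_type": type(exception).__name__,
                "message": str(exception),
                "traceback": traceback.format_exc(),
            }
            raise  # the caller still sees the original exception
    finally:
        # Runs on success and on failure: the trace is logged either way,
        # with error_info attached only when the task raised.
        if error_info is not None:
            trace["error_info"] = error_info
        print("logging trace:", trace)  # stand-in for client.trace(**trace_data.__dict__)
    return trace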
@@ -7,6 +7,7 @@

from opik.config import OPIK_PROJECT_DEFAULT_NAME
from opik.integrations.llama_index import LlamaIndexCallbackHandler

from ...testlib import ANY_BUT_NONE, TraceModel, assert_equal


74 changes: 73 additions & 1 deletion sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -1,12 +1,14 @@
import mock
import pytest

from typing import Dict, Any

import opik
from opik.api_objects.dataset import dataset_item
from opik.api_objects import opik_client
from opik import evaluation, exceptions, url_helpers
from opik.evaluation import metrics
from ...testlib import ANY_BUT_NONE, assert_equal
from ...testlib import ANY_BUT_NONE, ANY_STRING, assert_equal
from ...testlib.models import (
TraceModel,
FeedbackScoreModel,
@@ -277,3 +279,73 @@ def say_task(dataset_item: Dict[str, Any]):
)

mock_dataset.__internal_api__get_items_as_dataclasses__.assert_called_once()


def test_evaluate__exception_raised_from_the_task__error_info_added_to_the_trace(
fake_backend,
):
mock_dataset = mock.MagicMock(spec=["__internal_api__get_items_as_dataclasses__"])
mock_dataset.name = "the-dataset-name"
mock_dataset.__internal_api__get_items_as_dataclasses__.return_value = [
dataset_item.DatasetItem(
id="dataset-item-id-1",
input={"message": "say hello"},
reference="hello",
),
]

def say_task(dataset_item: Dict[str, Any]):
raise Exception("some-error-message")

mock_experiment = mock.Mock()
mock_create_experiment = mock.Mock()
mock_create_experiment.return_value = mock_experiment

mock_get_experiment_url = mock.Mock()
mock_get_experiment_url.return_value = "any_url"

with mock.patch.object(
opik_client.Opik, "create_experiment", mock_create_experiment
):
with mock.patch.object(
url_helpers, "get_experiment_url", mock_get_experiment_url
):
with pytest.raises(Exception):
evaluation.evaluate(
dataset=mock_dataset,
task=say_task,
experiment_name="the-experiment-name",
scoring_metrics=[],
task_threads=1,
)
opik.flush_tracker()

mock_dataset.__internal_api__get_items_as_dataclasses__.assert_called_once()

mock_create_experiment.assert_called_once_with(
dataset_name="the-dataset-name",
name="the-experiment-name",
experiment_config=None,
prompt=None,
)

mock_experiment.insert.assert_called_once_with(experiment_items=mock.ANY)
EXPECTED_TRACE_TREE = TraceModel(
id=ANY_BUT_NONE,
name="evaluation_task",
input={
"input": {"message": "say hello"},
"reference": "hello",
},
output=None,
start_time=ANY_BUT_NONE,
end_time=ANY_BUT_NONE,
error_info={
"exception_type": "Exception",
"message": "some-error-message",
"traceback": ANY_STRING(),
},
spans=[],
)

assert_equal(EXPECTED_TRACE_TREE, fake_backend.trace_trees[0])
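
From a user's point of view, the behavior the test locks in looks roughly like the following. This is a hypothetical usage sketch, not part of the commit: the client construction, dataset name, and exception type are placeholders, and it assumes a configured Opik workspace with an existing dataset:

import opik
from opik import evaluation


def failing_task(dataset_item):
    raise ValueError("model call failed")


client = opik.Opik()
dataset = client.get_dataset(name="my-dataset")  # placeholder dataset name

try:
    evaluation.evaluate(
        dataset=dataset,
        task=failing_task,
        experiment_name="error-tracking-demo",
        scoring_metrics=[],
        task_threads=1,
    )
except ValueError:
    # As in the test above, the task's exception propagates out of evaluate();
    # the trace logged for the item should now carry error_info with
    # exception_type, message, and traceback.
    pass

opik.flush_tracker()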
