Added nb_samples parameter in evaluate function (#355)
* Added nb_samples parameter in evaluate function

* Fixed test
jverre authored Oct 7, 2024
1 parent e95988d commit 1096e06
Showing 5 changed files with 45 additions and 18 deletions.
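
For a quick picture of the new option, here is a minimal usage sketch. The dataset name, task, and metric below are placeholders (not part of this commit), and the import path for evaluate is assumed from the SDK layout touched in this commit (opik/evaluation/evaluator.py):

from typing import Any, Dict

from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import IsJson

client = Opik()
dataset = client.get_or_create_dataset(name="my-dataset")  # placeholder dataset


def llm_task(item) -> Dict[str, Any]:
    # Placeholder task: a real task would call an LLM on item.input.
    return {"output": "{}"}


# nb_samples caps the evaluation at the first N dataset items;
# leaving it unset (None) evaluates the whole dataset, as before.
evaluate(
    experiment_name="my-experiment",
    dataset=dataset,
    task=llm_task,
    scoring_metrics=[IsJson()],
    nb_samples=10,
)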
12 changes: 7 additions & 5 deletions sdks/python/examples/evaluation_example.py
@@ -1,7 +1,6 @@
from typing import Dict, Any

from opik.evaluation.metrics import (
Contains,
IsJson,
Hallucination,
)
@@ -16,13 +15,15 @@

openai_client = track_openai(openai.OpenAI())

contains_hello = Contains(searched_value="hello", name="ContainsHello")
contains_bye = Contains(searched_value="bye", name="ContainsBye")
# contains_hello = Contains(searched_value="hello", name="ContainsHello")
# contains_bye = Contains(searched_value="bye", name="ContainsBye")
is_json = IsJson()
hallucination = Hallucination()

client = Opik()
dataset = client.create_dataset(name="My 42 dataset", description="For storing stuff")
dataset = client.get_or_create_dataset(
name="My 42 dataset", description="For storing stuff"
)
# dataset = client.get_dataset(name="My 42 dataset")

json = """
@@ -69,5 +70,6 @@ def llm_task(item: DatasetItem) -> Dict[str, Any]:
experiment_name="My experiment",
dataset=dataset,
task=llm_task,
scoring_metrics=[contains_hello, contains_bye, is_json, hallucination],
nb_samples=2,
scoring_metrics=[is_json, hallucination],
)
36 changes: 28 additions & 8 deletions sdks/python/src/opik/api_objects/dataset/dataset.py
@@ -179,12 +179,17 @@ def to_json(self) -> str:

return converters.to_json(dataset_items, keys_mapping={})

def get_all_items(self) -> List[dataset_item.DatasetItem]:
def get_items(
self, nb_samples: Optional[int] = None
) -> List[dataset_item.DatasetItem]:
"""
Retrieve all items from the dataset.
Retrieve a fixed number of dataset items.
Args:
nb_samples: The number of samples to retrieve.
Returns:
A list of DatasetItem objects representing all items in the dataset.
A list of DatasetItem objects representing the samples.
"""
results: List[dataset_item.DatasetItem] = []

@@ -194,8 +199,11 @@ def get_all_items(self) -> List[dataset_item.DatasetItem]:
last_retrieved_id=results[-1].id if len(results) > 0 else None,
)

previous_results_size = len(results)
if nb_samples is not None and len(results) == nb_samples:
break

item_bytes = b"".join(stream)
stream_results: List[dataset_item.DatasetItem] = []
for line in item_bytes.split(b"\n"):
if len(line) == 0:
continue
@@ -212,15 +220,27 @@ def get_all_items(self) -> List[dataset_item.DatasetItem]:
source=item_content.get("source"), # type: ignore
)

stream_results.append(item)
results.append(item)

if len(stream_results) == 0:
break
# Break the loop if we have enough samples
if nb_samples is not None and len(results) == nb_samples:
break

results.extend(stream_results)
# Break the loop if we have not received any new samples
if len(results) == previous_results_size:
break

return results

def get_all_items(self) -> List[dataset_item.DatasetItem]:
"""
Retrieve all items from the dataset.
Returns:
A list of DatasetItem objects representing all items in the dataset.
"""
return self.get_items()

def insert_from_json(
self,
json_array: str,
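The new get_items loop streams items page by page and exits early in two cases: the requested number of samples has been collected, or a page brings nothing new. A simplified standalone sketch of that stopping logic, with a hypothetical fetch_page helper standing in for the backend stream call:

from typing import Callable, Iterable, List, Optional


def collect_samples(
    fetch_page: Callable[[Optional[str]], Iterable[dict]],
    nb_samples: Optional[int] = None,
) -> List[dict]:
    # Simplified sketch of the early-exit pagination used by get_items.
    results: List[dict] = []
    while True:
        # Stop once the requested number of samples has been reached.
        if nb_samples is not None and len(results) == nb_samples:
            break

        previous_results_size = len(results)
        last_id = results[-1]["id"] if results else None
        for item in fetch_page(last_id):  # hypothetical page fetch
            results.append(item)
            # Stop mid-page as soon as enough samples were collected.
            if nb_samples is not None and len(results) == nb_samples:
                break

        # Stop when the stream returned no new items (end of dataset).
        if len(results) == previous_results_size:
            break

    return results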
4 changes: 4 additions & 0 deletions sdks/python/src/opik/evaluation/evaluator.py
@@ -17,6 +17,7 @@ def evaluate(
experiment_name: Optional[str] = None,
experiment_config: Optional[Dict[str, Any]] = None,
verbose: int = 1,
nb_samples: Optional[int] = None,
task_threads: int = 16,
) -> evaluation_result.EvaluationResult:
"""
@@ -42,6 +43,8 @@
verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
0 - no outputs, 1 - outputs are enabled (default).
nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
task_threads: amount of thread workers to run tasks. If set to 1, no additional
threads are created, all tasks are executed sequentially in the current thread.
@@ -55,6 +58,7 @@
dataset_=dataset,
task=task,
scoring_metrics=scoring_metrics,
nb_samples=nb_samples,
workers=task_threads,
verbose=verbose,
)
5 changes: 3 additions & 2 deletions sdks/python/src/opik/evaluation/tasks_scorer.py
@@ -2,7 +2,7 @@
import logging
from concurrent import futures

from typing import List
from typing import List, Optional
from .types import LLMTask
from opik.api_objects.dataset import dataset, dataset_item
from opik.api_objects import opik_client, trace
@@ -96,9 +96,10 @@ def run(
task: LLMTask,
scoring_metrics: List[base_metric.BaseMetric],
workers: int,
nb_samples: Optional[int],
verbose: int,
) -> List[test_result.TestResult]:
dataset_items = dataset_.get_all_items()
dataset_items = dataset_.get_items(nb_samples=nb_samples)
test_cases: List[test_result.TestResult]

if workers == 1:
6 changes: 3 additions & 3 deletions sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -21,7 +21,7 @@ def test_evaluate_happyflow(fake_streamer):

mock_dataset = mock.Mock()
mock_dataset.name = "the-dataset-name"
mock_dataset.get_all_items.return_value = [
mock_dataset.get_items.return_value = [
dataset_item.DatasetItem(
id="dataset-item-id-1",
input={"input": "say hello"},
@@ -133,7 +133,7 @@ def test_evaluate___output_key_is_missing_in_task_output_dict__equals_metric_mis
# to compute Equals metric score.
mock_dataset = mock.Mock()
mock_dataset.name = "the-dataset-name"
mock_dataset.get_all_items.return_value = [
mock_dataset.get_items.return_value = [
dataset_item.DatasetItem(
id="dataset-item-id-1",
input={"input": "say hello"},
@@ -158,4 +158,4 @@ def say_task(dataset_item: dataset_item.DatasetItem):
task_threads=1,
)

mock_dataset.get_all_items.assert_called_once()
mock_dataset.get_items.assert_called_once()
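
A possible stricter assertion for these tests, not part of this commit: since evaluate() defaults nb_samples to None and tasks_scorer.run forwards it to the dataset as a keyword argument, the call could be verified explicitly.

# Hypothetical follow-up assertion (sketch, not in this commit):
mock_dataset.get_items.assert_called_once_with(nb_samples=None)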
