sync 09-07-24
aisi-inspect committed Jul 9, 2024
1 parent 53fe15a commit 696fb05
Showing 42 changed files with 1,516 additions and 583 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,8 @@
- Match scorers now return answers consistently even when there is no match.
- Relocate tool-related types into a new top-level `inspect_ai.tool` module (previous imports still work for now, but result in a runtime deprecation warning).
- Decouple tools entirely from solvers and task state (previously they had ways to interact with metadata; removing this coupling will enable tool use in lower-level interactions with models). Accordingly, the `call_tools()` function now operates directly on messages rather than task state.
- Use multiple scorers to score a task (by passing a list of scorers rather than a single scorer).
- Support for complex scorers that return multiple scores and metrics (use a dictionary in the `@scorer` declaration to map metrics to the score value to apply the metrics to).
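A minimal sketch of how these two additions might be used together. The task, dataset, and scorer/metric names below are illustrative assumptions, not code from this commit:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import Score, Target, accuracy, match, mean, scorer
from inspect_ai.solver import TaskState, generate


# hypothetical scorer returning a dictionary score value; the dict passed to
# @scorer maps each value key to the metrics applied to that key
@scorer(metrics={"correct": [accuracy()], "length": [mean()]})
def answer_quality():
    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion
        return Score(
            value={
                "correct": 1.0 if target.text in answer else 0.0,
                "length": float(len(answer)),
            },
            answer=answer,
        )

    return score


@task
def demo():
    # multiple scorers: pass a list rather than a single scorer
    return Task(
        dataset=example_dataset("theory_of_mind"),
        plan=generate(),
        scorer=[match(), answer_quality()],
    )
```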

## v0.3.17 (25 June 2024)

5 changes: 3 additions & 2 deletions src/inspect_ai/_cli/score.py
@@ -89,6 +89,7 @@ async def score(
# print results
display().print(f"\n{eval_log.eval.task}")
if eval_log.results:
for name, metric in eval_log.results.metrics.items():
display().print(f"{name}: {metric.value}")
for score in eval_log.results.scores:
for name, metric in score.metrics.items():
display().print(f"{name}: {metric.value}")
display().print(f"log: {log_file}\n")
22 changes: 12 additions & 10 deletions src/inspect_ai/_display/rich.py
@@ -379,17 +379,19 @@ def task_interrupted(
def task_results(results: EvalResults) -> tuple[RenderableType, RenderableType]:
theme = rich_theme()
output: dict[str, str] = {}
for name, metric in results.metrics.items():
value = (
"1.0"
if metric.value == 1
else (
str(metric.value)
if isinstance(metric.value, int)
else f"{metric.value:.3g}"
for score in results.scores:
for name, metric in score.metrics.items():
value = (
"1.0"
if metric.value == 1
else (
str(metric.value)
if isinstance(metric.value, int)
else f"{metric.value:.3g}"
)
)
)
output[name] = value
key = f"{score.name}/{name}" if len(results.scores) > 1 else name
output[key] = value
metrics = f"[{theme.metric}]{task_dict(output, True)}[/{theme.metric}]"

return (metrics, "")
42 changes: 27 additions & 15 deletions src/inspect_ai/_eval/score.py
@@ -8,6 +8,7 @@
from inspect_ai._util.platform import platform_init
from inspect_ai._util.registry import (
registry_create,
registry_log_name,
)
from inspect_ai.log import (
EvalLog,
@@ -22,12 +23,12 @@
from .task.util import task_run_dir


def score(log: EvalLog, scorer: Scorer) -> EvalLog:
def score(log: EvalLog, scorers: Scorer | list[Scorer]) -> EvalLog:
"""Score an evaluation log.
Args:
log (EvalLog): Evaluation log.
scorer (Scorer): Scorer to apply to log
scorers (Scorer | list[Scorer]): Scorer or list of scorers to apply to log
metrics: (list[Metric]): Additional metrics to compute
(Scorer built-in metrics are always computed).
@@ -37,15 +38,18 @@ def score(log: EvalLog, scorer: Scorer) -> EvalLog:
# standard platform init for top level entry points
platform_init()

return asyncio.run(score_async(log, scorer))
# resolve scorers into a list
scorers = [scorers] if isinstance(scorers, Scorer) else scorers

return asyncio.run(score_async(log, scorers))

async def score_async(log: EvalLog, scorer: Scorer) -> EvalLog:

async def score_async(log: EvalLog, scorers: list[Scorer]) -> EvalLog:
"""Score an evaluation log.
Args:
log (EvalLog): Evaluation log.
scorer (Scorer): Scorer to apply to log
scorers (list[Scorer]): Scorers to apply to log
Returns:
Log with scores yielded by scorer.
@@ -78,23 +82,23 @@ def progress() -> None:
p.update(1)

tasks = [
run_score_task(state, Target(sample.target), scorer, progress)
run_score_task(state, Target(sample.target), scorers, progress)
for (sample, state) in zip(log.samples, states)
]

# do scoring
scores = await asyncio.gather(*tasks)
scores: list[dict[str, Score]] = await asyncio.gather(*tasks)

# write them back (gather ensures that they come back in the same order)
for index, score in enumerate(scores):
log.samples[index].score = score
log.samples[index].scores = score

# collect metrics from EvalLog (they may overlap w/ the scorer metrics,
# that will be taken care of in eval_results)
log_metrics = metrics_from_log(log)

# compute metrics
log.results = eval_results(scores, scorer, log_metrics)
log.results = eval_results(scores, scorers, log_metrics)

return log

@@ -119,7 +123,7 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
display().print(f"Aggregating scores for task: {task_name}")
if task.scorer and log.samples:
log.results = eval_results(
[sample.score for sample in log.samples if isinstance(sample.score, Score)],
[sample.scores for sample in log.samples if sample.scores is not None],
task.scorer,
task.metrics,
)
@@ -129,17 +133,25 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
async def run_score_task(
state: TaskState,
target: Target,
scorer: Scorer,
scorers: list[Scorer],
progress: Callable[..., None],
) -> Score:
result = await scorer(state, target)
) -> dict[str, Score]:
results: dict[str, Score] = {}
for scorer in scorers:
result = await scorer(state, target)
results[registry_log_name(scorer)] = result

progress()
return result
return results


def metrics_from_log(log: EvalLog) -> list[Metric]:
return (
[metric_from_log(metric) for metric in log.results.metrics.values()]
[
metric_from_log(metric)
for score in log.results.scores
for metric in score.metrics.values()
]
if log.results
else []
)
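Taken together, the changes to `score()` and `score_async()` above mean an existing log can be re-scored with several scorers at once. A hedged sketch of the call (import paths assumed; log paths are hypothetical and scorer choices illustrative):

```python
from inspect_ai import score
from inspect_ai.log import read_eval_log, write_eval_log
from inspect_ai.scorer import includes, match

# re-score an existing log with two scorers: each sample ends up with a dict
# of scores keyed by scorer name, and results carry one entry per scorer
log = read_eval_log("./logs/example-eval.json")  # hypothetical path
scored = score(log, scorers=[match(), includes()])
write_eval_log(scored, "./logs/example-eval-rescored.json")
```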
4 changes: 2 additions & 2 deletions src/inspect_ai/_eval/task/log.py
@@ -122,7 +122,7 @@ def log_sample(
epoch: int,
sample: Sample,
state: TaskState,
score: Score | None,
scores: dict[str, Score] | None,
flush: bool = False,
) -> None:
# log
@@ -137,7 +137,7 @@ def log_sample(
metadata=state.metadata if state.metadata else {},
messages=state.messages,
output=state.output,
score=score,
scores=scores,
),
flush,
)
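Given the field rename above (`score` → `scores`, now a `dict[str, Score]` keyed by scorer name), reading per-sample results back from a log would look roughly like this (log path hypothetical):

```python
from inspect_ai.log import read_eval_log

log = read_eval_log("./logs/example-eval.json")  # hypothetical path
for sample in log.samples or []:
    # one entry per scorer, keyed by the scorer's registry name
    for scorer_name, sample_score in (sample.scores or {}).items():
        print(f"sample {sample.id} (epoch {sample.epoch}): {scorer_name} = {sample_score.value}")
```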
184 changes: 154 additions & 30 deletions src/inspect_ai/_eval/task/results.py
@@ -1,5 +1,6 @@
import re
from copy import deepcopy
from typing import Any, cast

from inspect_ai._util.registry import (
registry_info,
@@ -10,42 +11,156 @@
from inspect_ai.log import (
EvalMetric,
EvalResults,
EvalScorer,
EvalScore,
)
from inspect_ai.scorer import Metric, Score, Scorer
from inspect_ai.scorer._scorer import SCORER_METRICS, scorer_metrics


def eval_results(
scores: list[Score], scorer: Scorer | None, metrics: list[Metric] = []
scores: list[dict[str, Score]],
scorers: list[Scorer] | None,
metrics: list[Metric] = [],
) -> EvalResults:
# record scorer
results = EvalResults()
if scorer:
# extract non-metrics metadata
metadata = deepcopy(registry_info(scorer).metadata)
del metadata[SCORER_METRICS]
if scorers:
result_scores = []
for scorer in scorers:
# extract non-metrics metadata
metadata = deepcopy(registry_info(scorer).metadata)
del metadata[SCORER_METRICS]

# this scorer
scorer_name = registry_log_name(scorer)

# scores for this scorer
resolved_scores = [
score[scorer_name] for score in scores if scorer_name in score
]

# Compute metrics for this scorer
targets = target_metrics(scorer, metrics)
if isinstance(targets, list):
# If there is a simple list of metrics
# just compute the metrics for this scorer
result_scores.extend(
scorer_for_metrics(
scorer_name=scorer_name,
scorer=scorer,
metadata=metadata,
scores=resolved_scores,
metrics=targets,
)
)
else:
# If there is a dictionary of metrics, apply
# the metrics to the values within the scores
# (corresponding by key) and emit an EvalScorer for
# each key (which effectively creates multiple scorers
# by expanding a dictionary score value into multiple
# results with metrics)
result_scores.extend(
scorers_from_metric_dict(
scorer_name=scorer_name,
scorer=scorer,
metadata=metadata,
scores=resolved_scores,
metrics=targets,
)
)
# build results
results.scorer = EvalScorer(
name=registry_log_name(scorer),
results.scores = result_scores

return results
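Downstream consumers (such as the CLI and rich display changes earlier in this commit) now read results as a list of per-scorer entries. A rough sketch of that shape, assuming `results` is an `EvalResults` produced by `eval_results()` above:

```python
# assumes `results` is an EvalResults returned by eval_results()
for eval_score in results.scores:
    # eval_score.name is the scorer name, or a dictionary key for scorers
    # that return dictionary score values
    for metric_name, metric in eval_score.metrics.items():
        print(f"{eval_score.name} {metric_name}: {metric.value}")
```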


def scorer_for_metrics(
scorer_name: str,
scorer: Scorer,
metadata: dict[str, Any],
scores: list[Score],
metrics: list[Metric],
) -> list[EvalScore]:
results: list[EvalScore] = []
# we want to use simple names for metrics in the metrics dict
# (i.e. without package prefixes). we do this by getting the
# unqualified name, then appending a suffix if there are duplicates
# this keeps the code straightforward and intuitive for users
# programming against the log (e.g. metrics["accuracy"]) vs.
# metrics["pkgname/accuracy"])
list_metrics: dict[str, EvalMetric] = {}
for metric in metrics:
key = metrics_unique_key(
registry_unqualified_name(metric), list(list_metrics.keys())
)

list_metrics[key] = EvalMetric(
name=registry_log_name(metric),
value=cast(float, metric(scores)),
)

# build results
results.append(
EvalScore(
scorer=scorer_name,
name=scorer_name,
params=registry_params(scorer),
metadata=metadata if len(metadata.keys()) > 0 else None,
metrics=list_metrics,
)
)
return results


# we want to use simple names for metrics in the metrics dict
# (i.e. without package prefixes). we do this by getting the
# unqualified name, then appending a suffix if there are duplicates
# this keeps the code straightforward and intuitive for users
# programming against the log (e.g. metrics["accuracy"]) vs.
# metrics["pkgname/accuracy"])
for metric in target_metrics(scorer, metrics):
key = metrics_unique_key(
registry_unqualified_name(metric), list(results.metrics.keys())
def scorers_from_metric_dict(
scorer_name: str,
scorer: Scorer,
metadata: dict[str, Any],
scores: list[Score],
metrics: dict[str, list[Metric]],
) -> list[EvalScore]:
results: list[EvalScore] = []
for metric_key, metric_list in metrics.items():
# filter scores to a list of scalars with the value of the metric name
metric_scores: list[Score] = []
for score in scores:
if isinstance(score.value, dict):
if metric_key in score.value:
# Convert the score into a simple scalar value to apply metrics
metric_score = deepcopy(score)
metric_score.value = cast(float, score.value[metric_key])
metric_scores.append(metric_score)
else:
raise TypeError(
f"key '{metric_key}' isn't present in the score value dictionary"
)
else:
raise TypeError(
"dictionary of metrics specific for a non-dictionary score"
)

result_metrics: dict[str, EvalMetric] = {}
for target_metric in metric_list:
# compute the metric value
metric_name = registry_log_name(target_metric)
result_metrics[metric_name] = EvalMetric(
name=metric_name,
value=cast(float, target_metric(metric_scores)),
)
results.metrics[key] = EvalMetric(
name=registry_log_name(metric), value=metric(scores)

# create a scorer result for this metric
# TODO: What if there is a separate simple scorer which has a name collision with
# a score created by this scorer
results.append(
EvalScore(
scorer=scorer_name,
name=metric_key,
params=registry_params(scorer),
metadata=metadata if len(metadata.keys()) > 0 else None,
metrics=result_metrics,
)
)
return results


Expand All @@ -64,14 +179,23 @@ def metrics_unique_key(key: str, existing: list[str]) -> str:


# build a list of metrics (scorer built-in metrics + de-duplicated additional metrics)
def target_metrics(scorer: Scorer, metrics: list[Metric]) -> list[Metric]:
target_metrics = scorer_metrics(scorer)
target_metrics_names = [registry_log_name(metric) for metric in target_metrics]
target_metrics.extend(
[
metric
for metric in metrics
if registry_log_name(metric) not in target_metrics_names
]
)
return target_metrics
def target_metrics(
scorer: Scorer, metrics: list[Metric]
) -> list[Metric] | dict[str, list[Metric]]:
output_metrics = scorer_metrics(scorer)

if isinstance(output_metrics, dict):
if isinstance(metrics, dict):
output_metrics.update(metrics)
return output_metrics
else:
output_metrics_names = [registry_log_name(metric) for metric in output_metrics]
if isinstance(metrics, list):
output_metrics.extend(
[
metric
for metric in metrics
if registry_log_name(metric) not in output_metrics_names
]
)
return output_metrics
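The merge above is what allows task-level metrics to supplement a scorer's built-in metrics. A brief sketch of supplying them, assuming `Task` accepts a `metrics` argument (the diff passes `task.metrics` to `eval_results()` in score.py above); dataset and metric choices are illustrative:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import bootstrap_std, match
from inspect_ai.solver import generate


@task
def demo_with_extra_metrics():
    return Task(
        dataset=example_dataset("theory_of_mind"),
        plan=generate(),
        scorer=match(),
        # merged with (and de-duplicated against) the scorer's built-in
        # metrics by target_metrics()
        metrics=[bootstrap_std()],
    )
```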
