sync 09-07-24
aisi-inspect committed Jul 9, 2024
1 parent 53fe15a commit 696fb05
Showing 42 changed files with 1,516 additions and 583 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,8 @@
- Match scorers now return answers consistently even when there is no match.
- Relocate tool-related types into a new top-level `inspect_ai.tool` module (previous imports still work for now, but result in a runtime deprecation warning).
- Decouple tools entirely from solvers and task state (previously they had ways to interact with metadata; removing this coupling will enable tool use in lower-level interactions with models). Accordingly, the `call_tools()` function now operates directly on messages rather than task state.
- Use multiple scorers to score a task (by passing a list of scorers rather than a single scorer).
- Support for complex scorers that return multiple scores and metrics (use a dictionary in the `@scorer` declaration to map metrics to the score value to apply the metrics to).
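A minimal sketch of how these two additions might be used together. The task, dataset, and scorer/metric names below are illustrative assumptions, not code from this commit:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import Score, Target, accuracy, match, mean, scorer
from inspect_ai.solver import TaskState, generate


# hypothetical scorer returning a dictionary score value; the dict passed to
# @scorer maps each value key to the metrics applied to that key
@scorer(metrics={"correct": [accuracy()], "length": [mean()]})
def answer_quality():
    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion
        return Score(
            value={
                "correct": 1.0 if target.text in answer else 0.0,
                "length": float(len(answer)),
            },
            answer=answer,
        )

    return score


@task
def demo():
    # multiple scorers: pass a list rather than a single scorer
    return Task(
        dataset=example_dataset("theory_of_mind"),
        plan=generate(),
        scorer=[match(), answer_quality()],
    )
```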

## v0.3.17 (25 June 2024)

5 changes: 3 additions & 2 deletions src/inspect_ai/_cli/score.py
@@ -89,6 +89,7 @@ async def score(
# print results
display().print(f"\n{eval_log.eval.task}")
if eval_log.results:
for name, metric in eval_log.results.metrics.items():
display().print(f"{name}: {metric.value}")
for score in eval_log.results.scores:
for name, metric in score.metrics.items():
display().print(f"{name}: {metric.value}")
display().print(f"log: {log_file}\n")
22 changes: 12 additions & 10 deletions src/inspect_ai/_display/rich.py
@@ -379,17 +379,19 @@ def task_interrupted(
def task_results(results: EvalResults) -> tuple[RenderableType, RenderableType]:
theme = rich_theme()
output: dict[str, str] = {}
for name, metric in results.metrics.items():
value = (
"1.0"
if metric.value == 1
else (
str(metric.value)
if isinstance(metric.value, int)
else f"{metric.value:.3g}"
for score in results.scores:
for name, metric in score.metrics.items():
value = (
"1.0"
if metric.value == 1
else (
str(metric.value)
if isinstance(metric.value, int)
else f"{metric.value:.3g}"
)
)
)
output[name] = value
key = f"{score.name}/{name}" if len(results.scores) > 1 else name
output[key] = value
metrics = f"[{theme.metric}]{task_dict(output, True)}[/{theme.metric}]"

return (metrics, "")
42 changes: 27 additions & 15 deletions src/inspect_ai/_eval/score.py
@@ -8,6 +8,7 @@
from inspect_ai._util.platform import platform_init
from inspect_ai._util.registry import (
registry_create,
registry_log_name,
)
from inspect_ai.log import (
EvalLog,
@@ -22,12 +23,12 @@
from .task.util import task_run_dir


def score(log: EvalLog, scorer: Scorer) -> EvalLog:
def score(log: EvalLog, scorers: Scorer | list[Scorer]) -> EvalLog:
"""Score an evaluation log.
Args:
log (EvalLog): Evaluation log.
scorer (Scorer): Scorer to apply to log
scorers (Scorer | list[Scorer]): Scorer or list of scorers to apply to log
metrics: (list[Metric]): Additional metrics to compute
(Scorer built-in metrics are always computed).
@@ -37,15 +38,18 @@ def score(log: EvalLog, scorer: Scorer) -> EvalLog:
# standard platform init for top level entry points
platform_init()

return asyncio.run(score_async(log, scorer))
# resolve scorers into a list
scorers = [scorers] if isinstance(scorers, Scorer) else scorers

return asyncio.run(score_async(log, scorers))

async def score_async(log: EvalLog, scorer: Scorer) -> EvalLog:

async def score_async(log: EvalLog, scorers: list[Scorer]) -> EvalLog:
"""Score an evaluation log.
Args:
log (EvalLog): Evaluation log.
scorer (Scorer): Scorer to apply to log
scorers (list[Scorer]): Scorers to apply to log
Returns:
Log with scores yielded by scorer.
@@ -78,23 +82,23 @@ def progress() -> None:
p.update(1)

tasks = [
run_score_task(state, Target(sample.target), scorer, progress)
run_score_task(state, Target(sample.target), scorers, progress)
for (sample, state) in zip(log.samples, states)
]

# do scoring
scores = await asyncio.gather(*tasks)
scores: list[dict[str, Score]] = await asyncio.gather(*tasks)

# write them back (gather ensures that they come back in the same order)
for index, score in enumerate(scores):
log.samples[index].score = score
log.samples[index].scores = score

# collect metrics from EvalLog (they may overlap w/ the scorer metrics,
# that will be taken care of in eval_results)
log_metrics = metrics_from_log(log)

# compute metrics
log.results = eval_results(scores, scorer, log_metrics)
log.results = eval_results(scores, scorers, log_metrics)

return log

@@ -119,7 +123,7 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
display().print(f"Aggregating scores for task: {task_name}")
if task.scorer and log.samples:
log.results = eval_results(
[sample.score for sample in log.samples if isinstance(sample.score, Score)],
[sample.scores for sample in log.samples if sample.scores is not None],
task.scorer,
task.metrics,
)
@@ -129,17 +133,25 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
async def run_score_task(
state: TaskState,
target: Target,
scorer: Scorer,
scorers: list[Scorer],
progress: Callable[..., None],
) -> Score:
result = await scorer(state, target)
) -> dict[str, Score]:
results: dict[str, Score] = {}
for scorer in scorers:
result = await scorer(state, target)
results[registry_log_name(scorer)] = result

progress()
return result
return results


def metrics_from_log(log: EvalLog) -> list[Metric]:
return (
[metric_from_log(metric) for metric in log.results.metrics.values()]
[
metric_from_log(metric)
for score in log.results.scores
for metric in score.metrics.values()
]
if log.results
else []
)
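Taken together, the changes to `score()` and `score_async()` above mean an existing log can be re-scored with several scorers at once. A hedged sketch of the call (import paths assumed; log paths are hypothetical and scorer choices illustrative):

```python
from inspect_ai import score
from inspect_ai.log import read_eval_log, write_eval_log
from inspect_ai.scorer import includes, match

# re-score an existing log with two scorers: each sample ends up with a dict
# of scores keyed by scorer name, and results carry one entry per scorer
log = read_eval_log("./logs/example-eval.json")  # hypothetical path
scored = score(log, scorers=[match(), includes()])
write_eval_log(scored, "./logs/example-eval-rescored.json")
```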
4 changes: 2 additions & 2 deletions src/inspect_ai/_eval/task/log.py
@@ -122,7 +122,7 @@ def log_sample(
epoch: int,
sample: Sample,
state: TaskState,
score: Score | None,
scores: dict[str, Score] | None,
flush: bool = False,
) -> None:
# log
@@ -137,7 +137,7 @@ def log_sample(
metadata=state.metadata if state.metadata else {},
messages=state.messages,
output=state.output,
score=score,
scores=scores,
),
flush,
)
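Given the field rename above (`score` → `scores`, now a `dict[str, Score]` keyed by scorer name), reading per-sample results back from a log would look roughly like this (log path hypothetical):

```python
from inspect_ai.log import read_eval_log

log = read_eval_log("./logs/example-eval.json")  # hypothetical path
for sample in log.samples or []:
    # one entry per scorer, keyed by the scorer's registry name
    for scorer_name, sample_score in (sample.scores or {}).items():
        print(f"sample {sample.id} (epoch {sample.epoch}): {scorer_name} = {sample_score.value}")
```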
184 changes: 154 additions & 30 deletions src/inspect_ai/_eval/task/results.py
@@ -1,5 +1,6 @@
import re
from copy import deepcopy
from typing import Any, cast

from inspect_ai._util.registry import (
registry_info,
@@ -10,42 +11,156 @@
from inspect_ai.log import (
EvalMetric,
EvalResults,
EvalScorer,
EvalScore,
)
from inspect_ai.scorer import Metric, Score, Scorer
from inspect_ai.scorer._scorer import SCORER_METRICS, scorer_metrics


def eval_results(
scores: list[Score], scorer: Scorer | None, metrics: list[Metric] = []
scores: list[dict[str, Score]],
scorers: list[Scorer] | None,
metrics: list[Metric] = [],
) -> EvalResults:
# record scorer
results = EvalResults()
if scorer:
# extract non-metrics metadata
metadata = deepcopy(registry_info(scorer).metadata)
del metadata[SCORER_METRICS]
if scorers:
result_scores = []
for scorer in scorers:
# extract non-metrics metadata
metadata = deepcopy(registry_info(scorer).metadata)
del metadata[SCORER_METRICS]

# this scorer
scorer_name = registry_log_name(scorer)

# scores for this scorer
resolved_scores = [
score[scorer_name] for score in scores if scorer_name in score
]

# Compute metrics for this scorer
targets = target_metrics(scorer, metrics)
if isinstance(targets, list):
# If there is a simple list of metrics
# just compute the metrics for this scorer
result_scores.extend(
scorer_for_metrics(
scorer_name=scorer_name,
scorer=scorer,
metadata=metadata,
scores=resolved_scores,
metrics=targets,
)
)
else:
# If there is a dictionary of metrics, apply
# the metrics to the values within the scores
# (corresponding by key) and emit an EvalScorer for
# each key (which effectively creates multiple scorers
# by expanding a dictionary score value into multiple
# results with metrics)
result_scores.extend(
scorers_from_metric_dict(
scorer_name=scorer_name,
scorer=scorer,
metadata=metadata,
scores=resolved_scores,
metrics=targets,
)
)
# build results
results.scorer = EvalScorer(
name=registry_log_name(scorer),
results.scores = result_scores

return results
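Downstream consumers (such as the CLI and rich display changes earlier in this commit) now read results as a list of per-scorer entries. A rough sketch of that shape, assuming `results` is an `EvalResults` produced by `eval_results()` above:

```python
# assumes `results` is an EvalResults returned by eval_results()
for eval_score in results.scores:
    # eval_score.name is the scorer name, or a dictionary key for scorers
    # that return dictionary score values
    for metric_name, metric in eval_score.metrics.items():
        print(f"{eval_score.name} {metric_name}: {metric.value}")
```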


def scorer_for_metrics(
scorer_name: str,
scorer: Scorer,
metadata: dict[str, Any],
scores: list[Score],
metrics: list[Metric],
) -> list[EvalScore]:
results: list[EvalScore] = []
# we want to use simple names for metrics in the metrics dict
# (i.e. without package prefixes). we do this by getting the
# unqualified name, then appending a suffix if there are duplicates
# this keeps the code straightforward and intuitive for users
# programming against the log (e.g. metrics["accuracy"]) vs.
# metrics["pkgname/accuracy"])
list_metrics: dict[str, EvalMetric] = {}
for metric in metrics:
key = metrics_unique_key(
registry_unqualified_name(metric), list(list_metrics.keys())
)

list_metrics[key] = EvalMetric(
name=registry_log_name(metric),
value=cast(float, metric(scores)),
)

# build results
results.append(
EvalScore(
scorer=scorer_name,
name=scorer_name,
params=registry_params(scorer),
metadata=metadata if len(metadata.keys()) > 0 else None,
metrics=list_metrics,
)
)
return results


# we want to use simple names for metrics in the metrics dict
# (i.e. without package prefixes). we do this by getting the
# unqualified name, then appending a suffix if there are duplicates
# this keeps the code straightforward and intuitive for users
# programming against the log (e.g. metrics["accuracy"]) vs.
# metrics["pkgname/accuracy"])
for metric in target_metrics(scorer, metrics):
key = metrics_unique_key(
registry_unqualified_name(metric), list(results.metrics.keys())
def scorers_from_metric_dict(
scorer_name: str,
scorer: Scorer,
metadata: dict[str, Any],
scores: list[Score],
metrics: dict[str, list[Metric]],
) -> list[EvalScore]:
results: list[EvalScore] = []
for metric_key, metric_list in metrics.items():
# filter scores to a list of scalars with the value of the metric name
metric_scores: list[Score] = []
for score in scores:
if isinstance(score.value, dict):
if metric_key in score.value:
# Convert the score into a simple scalar value to apply metrics
metric_score = deepcopy(score)
metric_score.value = cast(float, score.value[metric_key])
metric_scores.append(metric_score)
else:
raise TypeError(
f"key '{metric_key}' isn't present in the score value dictionary"
)
else:
raise TypeError(
"dictionary of metrics specific for a non-dictionary score"
)

result_metrics: dict[str, EvalMetric] = {}
for target_metric in metric_list:
# compute the metric value
metric_name = registry_log_name(target_metric)
result_metrics[metric_name] = EvalMetric(
name=metric_name,
value=cast(float, target_metric(metric_scores)),
)
results.metrics[key] = EvalMetric(
name=registry_log_name(metric), value=metric(scores)

# create a scorer result for this metric
# TODO: What if there is a separate simple scorer which has a name collision with
# a score created by this scorer
results.append(
EvalScore(
scorer=scorer_name,
name=metric_key,
params=registry_params(scorer),
metadata=metadata if len(metadata.keys()) > 0 else None,
metrics=result_metrics,
)
)
return results


Expand All @@ -64,14 +179,23 @@ def metrics_unique_key(key: str, existing: list[str]) -> str:


# build a list of metrics (scorer built-in metrics + de-duplicated additional metrics)
def target_metrics(scorer: Scorer, metrics: list[Metric]) -> list[Metric]:
target_metrics = scorer_metrics(scorer)
target_metrics_names = [registry_log_name(metric) for metric in target_metrics]
target_metrics.extend(
[
metric
for metric in metrics
if registry_log_name(metric) not in target_metrics_names
]
)
return target_metrics
def target_metrics(
scorer: Scorer, metrics: list[Metric]
) -> list[Metric] | dict[str, list[Metric]]:
output_metrics = scorer_metrics(scorer)

if isinstance(output_metrics, dict):
if isinstance(metrics, dict):
output_metrics.update(metrics)
return output_metrics
else:
output_metrics_names = [registry_log_name(metric) for metric in output_metrics]
if isinstance(metrics, list):
output_metrics.extend(
[
metric
for metric in metrics
if registry_log_name(metric) not in output_metrics_names
]
)
return output_metrics
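The merge above is what allows task-level metrics to supplement a scorer's built-in metrics. A brief sketch of supplying them, assuming `Task` accepts a `metrics` argument (the diff passes `task.metrics` to `eval_results()` in score.py above); dataset and metric choices are illustrative:

```python
from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import bootstrap_std, match
from inspect_ai.solver import generate


@task
def demo_with_extra_metrics():
    return Task(
        dataset=example_dataset("theory_of_mind"),
        plan=generate(),
        scorer=match(),
        # merged with (and de-duplicated against) the scorer's built-in
        # metrics by target_metrics()
        metrics=[bootstrap_std()],
    )
```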
