diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fb9152c6..a238977a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ - Match scorers now return answers consistently even when there is no match. - Relocate tool related types into a new top-level `inspect_ai.tool` module (previous imports still work for now, but result in a runtime deprecation warning) - Decouple tools entirely from solvers and task state (previously they had ways to interact with metadata, removing this coupling will enable tool use in lower level interactions with models). Accordingly, the `call_tools()` function now operates directly on messages rather than task state. +- Use multiple scorers to score a task (by passing a list of scorers rather than a single scorer). +- Support for complex scorers that return multiple scores and metrics (use a dictionary in the `@scorer` declaration to map metrics to the score value to apply the metrics to). ## v0.3.17 (25 June 2024) diff --git a/src/inspect_ai/_cli/score.py b/src/inspect_ai/_cli/score.py index 7f6367308..4575ffe07 100644 --- a/src/inspect_ai/_cli/score.py +++ b/src/inspect_ai/_cli/score.py @@ -89,6 +89,7 @@ async def score( # print results display().print(f"\n{eval_log.eval.task}") if eval_log.results: - for name, metric in eval_log.results.metrics.items(): - display().print(f"{name}: {metric.value}") + for score in eval_log.results.scores: + for name, metric in score.metrics.items(): + display().print(f"{name}: {metric.value}") display().print(f"log: {log_file}\n") diff --git a/src/inspect_ai/_display/rich.py b/src/inspect_ai/_display/rich.py index 62054c2d1..35e4d2f06 100644 --- a/src/inspect_ai/_display/rich.py +++ b/src/inspect_ai/_display/rich.py @@ -379,17 +379,19 @@ def task_interrupted( def task_results(results: EvalResults) -> tuple[RenderableType, RenderableType]: theme = rich_theme() output: dict[str, str] = {} - for name, metric in results.metrics.items(): - value = ( - "1.0" - if metric.value == 1 - else ( - str(metric.value) - if isinstance(metric.value, int) - else f"{metric.value:.3g}" + for score in results.scores: + for name, metric in score.metrics.items(): + value = ( + "1.0" + if metric.value == 1 + else ( + str(metric.value) + if isinstance(metric.value, int) + else f"{metric.value:.3g}" + ) ) - ) - output[name] = value + key = f"{score.name}/{name}" if len(results.scores) > 1 else name + output[key] = value metrics = f"[{theme.metric}]{task_dict(output, True)}[/{theme.metric}]" return (metrics, "") diff --git a/src/inspect_ai/_eval/score.py b/src/inspect_ai/_eval/score.py index db8330523..c2bf9ee24 100644 --- a/src/inspect_ai/_eval/score.py +++ b/src/inspect_ai/_eval/score.py @@ -8,6 +8,7 @@ from inspect_ai._util.platform import platform_init from inspect_ai._util.registry import ( registry_create, + registry_log_name, ) from inspect_ai.log import ( EvalLog, @@ -22,12 +23,12 @@ from .task.util import task_run_dir -def score(log: EvalLog, scorer: Scorer) -> EvalLog: +def score(log: EvalLog, scorers: Scorer | list[Scorer]) -> EvalLog: """Score an evaluation log. Args: log (EvalLog): Evaluation log. - scorer (Scorer): Scorer to apply to log + scorers (Scorer | list[Scorer]): Scorer(s) to apply to log metrics: (list[Metric]): Additional metrics to compute (Scorer built-in metrics are always computed). 
@@ -37,15 +38,18 @@ def score(log: EvalLog, scorer: Scorer) -> EvalLog: # standard platform init for top level entry points platform_init() - return asyncio.run(score_async(log, scorer)) + # resolve scorers into a list + scorers = [scorers] if isinstance(scorers, Scorer) else scorers + return asyncio.run(score_async(log, scorers)) -async def score_async(log: EvalLog, scorer: Scorer) -> EvalLog: + +async def score_async(log: EvalLog, scorers: list[Scorer]) -> EvalLog: """Score an evaluation log. Args: log (EvalLog): Evaluation log. - scorer (Scorer): Scorer to apply to log + scorers (list[Scorer]): Scorers to apply to log Returns: Log with scores yielded by scorer. @@ -78,23 +82,23 @@ def progress() -> None: p.update(1) tasks = [ - run_score_task(state, Target(sample.target), scorer, progress) + run_score_task(state, Target(sample.target), scorers, progress) for (sample, state) in zip(log.samples, states) ] # do scoring - scores = await asyncio.gather(*tasks) + scores: list[dict[str, Score]] = await asyncio.gather(*tasks) # write them back (gather ensures that they come back in the same order) for index, score in enumerate(scores): - log.samples[index].score = score + log.samples[index].scores = score # collect metrics from EvalLog (they may overlap w/ the scorer metrics, # that will be taken care of in eval_results) log_metrics = metrics_from_log(log) # compute metrics - log.results = eval_results(scores, scorer, log_metrics) + log.results = eval_results(scores, scorers, log_metrics) return log @@ -119,7 +123,7 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog: display().print(f"Aggregating scores for task: {task_name}") if task.scorer and log.samples: log.results = eval_results( - [sample.score for sample in log.samples if isinstance(sample.score, Score)], + [sample.scores for sample in log.samples if sample.scores is not None], task.scorer, task.metrics, ) @@ -129,17 +133,25 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog: async def run_score_task( state: TaskState, target: Target, - scorer: Scorer, + scorers: list[Scorer], progress: Callable[..., None], -) -> Score: - result = await scorer(state, target) +) -> dict[str, Score]: + results: dict[str, Score] = {} + for scorer in scorers: + result = await scorer(state, target) + results[registry_log_name(scorer)] = result + progress() - return result + return results def metrics_from_log(log: EvalLog) -> list[Metric]: return ( - [metric_from_log(metric) for metric in log.results.metrics.values()] + [ + metric_from_log(metric) + for score in log.results.scores + for metric in score.metrics.values() + ] if log.results else [] ) diff --git a/src/inspect_ai/_eval/task/log.py b/src/inspect_ai/_eval/task/log.py index 110eb753c..a9c844c85 100644 --- a/src/inspect_ai/_eval/task/log.py +++ b/src/inspect_ai/_eval/task/log.py @@ -122,7 +122,7 @@ def log_sample( epoch: int, sample: Sample, state: TaskState, - score: Score | None, + scores: dict[str, Score] | None, flush: bool = False, ) -> None: # log @@ -137,7 +137,7 @@ def log_sample( metadata=state.metadata if state.metadata else {}, messages=state.messages, output=state.output, - score=score, + scores=scores, ), flush, ) diff --git a/src/inspect_ai/_eval/task/results.py b/src/inspect_ai/_eval/task/results.py index 193ea669f..1bbb33aa0 100644 --- a/src/inspect_ai/_eval/task/results.py +++ b/src/inspect_ai/_eval/task/results.py @@ -1,5 +1,6 @@ import re from copy import deepcopy +from typing import Any, cast from inspect_ai._util.registry import ( registry_info, @@ -10,42 
+11,156 @@ from inspect_ai.log import ( EvalMetric, EvalResults, - EvalScorer, + EvalScore, ) from inspect_ai.scorer import Metric, Score, Scorer from inspect_ai.scorer._scorer import SCORER_METRICS, scorer_metrics def eval_results( - scores: list[Score], scorer: Scorer | None, metrics: list[Metric] = [] + scores: list[dict[str, Score]], + scorers: list[Scorer] | None, + metrics: list[Metric] = [], ) -> EvalResults: # record scorer results = EvalResults() - if scorer: - # extract non-metrics metadata - metadata = deepcopy(registry_info(scorer).metadata) - del metadata[SCORER_METRICS] + if scorers: + result_scores = [] + for scorer in scorers: + # extract non-metrics metadata + metadata = deepcopy(registry_info(scorer).metadata) + del metadata[SCORER_METRICS] + # this scorer + scorer_name = registry_log_name(scorer) + + # scores for this scorer + resolved_scores = [ + score[scorer_name] for score in scores if scorer_name in score + ] + + # Compute metrics for this scorer + targets = target_metrics(scorer, metrics) + if isinstance(targets, list): + # If there is a simple list of metrics + # just compute the metrics for this scorer + result_scores.extend( + scorer_for_metrics( + scorer_name=scorer_name, + scorer=scorer, + metadata=metadata, + scores=resolved_scores, + metrics=targets, + ) + ) + else: + # If there is a dictionary of metrics, apply + # the metrics to the values within the scores + # (corresponding by key) and emit an EvalScorer for + # each key (which effectively creates multiple scorers + # by expanding a dictionary score value into multiple + # results with metrics) + result_scores.extend( + scorers_from_metric_dict( + scorer_name=scorer_name, + scorer=scorer, + metadata=metadata, + scores=resolved_scores, + metrics=targets, + ) + ) # build results - results.scorer = EvalScorer( - name=registry_log_name(scorer), + results.scores = result_scores + + return results + + +def scorer_for_metrics( + scorer_name: str, + scorer: Scorer, + metadata: dict[str, Any], + scores: list[Score], + metrics: list[Metric], +) -> list[EvalScore]: + results: list[EvalScore] = [] + # we want to use simple names for metrics in the metrics dict + # (i.e. without package prefixes). we do this by getting the + # unqualified name, then appending a suffix if there are duplicates + # this keeps the code straightforward and intuitive for users + # programming against the log (e.g. metrics["accuracy"]) vs. + # metrics["pkgname/accuracy"]) + list_metrics: dict[str, EvalMetric] = {} + for metric in metrics: + key = metrics_unique_key( + registry_unqualified_name(metric), list(list_metrics.keys()) + ) + + list_metrics[key] = EvalMetric( + name=registry_log_name(metric), + value=cast(float, metric(scores)), + ) + + # build results + results.append( + EvalScore( + scorer=scorer_name, + name=scorer_name, params=registry_params(scorer), metadata=metadata if len(metadata.keys()) > 0 else None, + metrics=list_metrics, ) + ) + return results + - # we want to use simple names for metrics in the metrics dict - # (i.e. without package prefixes). we do this by getting the - # unqualified name, then appending a suffix if there are duplicates - # this keeps the code straightforward and intuitive for users - # programming against the log (e.g. metrics["accuracy"]) vs. 
- # metrics["pkgname/accuracy"]) - for metric in target_metrics(scorer, metrics): - key = metrics_unique_key( - registry_unqualified_name(metric), list(results.metrics.keys()) +def scorers_from_metric_dict( + scorer_name: str, + scorer: Scorer, + metadata: dict[str, Any], + scores: list[Score], + metrics: dict[str, list[Metric]], +) -> list[EvalScore]: + results: list[EvalScore] = [] + for metric_key, metric_list in metrics.items(): + # filter scores to a list of scalars with the value of the metric name + metric_scores: list[Score] = [] + for score in scores: + if isinstance(score.value, dict): + if metric_key in score.value: + # Convert the score into a simple scalar value to apply metrics + metric_score = deepcopy(score) + metric_score.value = cast(float, score.value[metric_key]) + metric_scores.append(metric_score) + else: + raise TypeError( + f"key '{metric_key}' isn't present in the score value dictionary" + ) + else: + raise TypeError( + "dictionary of metrics specified for a non-dictionary score" + ) + + result_metrics: dict[str, EvalMetric] = {} + for target_metric in metric_list: + # compute the metric value + metric_name = registry_log_name(target_metric) + result_metrics[metric_name] = EvalMetric( + name=metric_name, + value=cast(float, target_metric(metric_scores)), ) - results.metrics[key] = EvalMetric( - name=registry_log_name(metric), value=metric(scores) + + # create a scorer result for this metric + # TODO: What if there is a separate simple scorer which has a name collision with + # a score created by this scorer + results.append( + EvalScore( + scorer=scorer_name, + name=metric_key, + params=registry_params(scorer), + metadata=metadata if len(metadata.keys()) > 0 else None, + metrics=result_metrics, ) + ) return results @@ -64,14 +179,23 @@ def metrics_unique_key(key: str, existing: list[str]) -> str: # build a list of metrics (scorer built-in metrics + de-duplicated additional metrics) -def target_metrics(scorer: Scorer, metrics: list[Metric]) -> list[Metric]: - target_metrics = scorer_metrics(scorer) - target_metrics_names = [registry_log_name(metric) for metric in target_metrics] - target_metrics.extend( - [ - metric - for metric in metrics - if registry_log_name(metric) not in target_metrics_names - ] - ) - return target_metrics +def target_metrics( + scorer: Scorer, metrics: list[Metric] +) -> list[Metric] | dict[str, list[Metric]]: + output_metrics = scorer_metrics(scorer) + + if isinstance(output_metrics, dict): + if isinstance(metrics, dict): + output_metrics.update(metrics) + return output_metrics + else: + output_metrics_names = [registry_log_name(metric) for metric in output_metrics] + if isinstance(metrics, list): + output_metrics.extend( + [ + metric + for metric in metrics + if registry_log_name(metric) not in output_metrics_names + ] + ) + return output_metrics diff --git a/src/inspect_ai/_eval/task/run.py b/src/inspect_ai/_eval/task/run.py index b5479b65e..67b16811b 100644 --- a/src/inspect_ai/_eval/task/run.py +++ b/src/inspect_ai/_eval/task/run.py @@ -141,7 +141,12 @@ async def task_run( else task.plan ) score = score and task.scorer is not None - scorer: Scorer | None = task.scorer if (score and task.scorer) else None + scorers: list[Scorer] | None = task.scorer if (score and task.scorer) else None + scorer_profiles = ( + [registry_log_name(scorer) for scorer in scorers if is_registry_object(scorer)] + if scorers is not None + else ["(none)"] + ) # compute steps (steps = samples * steps in plan + 1 for scorer) steps = len(samples) * ( @@ -153,7 +158,7 @@ 
async def task_run( name=task.name, model=model_name, dataset=task.dataset.name or "(samples)", - scorer=(registry_log_name(scorer) if is_registry_object(scorer) else "(none)"), + scorer=", ".join(scorer_profiles), samples=len(samples), steps=steps, eval_config=config, @@ -206,7 +211,7 @@ async def generate( toolenv_cleanup=toolenv_cleanup, plan=plan, max_messages=config.max_messages, - scorer=scorer, + scorers=scorers, generate=generate, progress=progress, logger=logger if log_samples else None, @@ -221,11 +226,14 @@ async def generate( scores = await asyncio.gather(*sample_coroutines) # compute and record metrics if we have scores - completed_scores = [score for score in scores if isinstance(score, Score)] + completed_scores = [ + score_dict for score_dict in scores if isinstance(score_dict, dict) + ] + if len(completed_scores) > 0: results = eval_results( scores=completed_scores, - scorer=scorer, + scorers=scorers, metrics=task.metrics, ) logger.log_results(results) @@ -289,14 +297,14 @@ async def task_run_sample( toolenv_cleanup: bool, plan: Plan, max_messages: int | None, - scorer: Scorer | None, + scorers: list[Scorer] | None, generate: Generate, progress: Callable[..., None], logger: TaskLogger | None, log_images: bool, sample_source: EvalSampleSource | None, semaphore: asyncio.Semaphore | None, -) -> Score | None: +) -> dict[str, Score] | None: # if there is an existing sample then tick off its progress, log it, and return it if sample_source and sample.id is not None: previous_sample = sample_source(sample.id, state.epoch) @@ -309,7 +317,7 @@ async def task_run_sample( logger.log_event("sample", previous_sample, False) # return score - return previous_sample.score + return previous_sample.scores # use semaphore if provided semaphore_cm: asyncio.Semaphore | contextlib.AbstractAsyncContextManager[None] = ( @@ -356,7 +364,15 @@ async def task_run_sample( ) # score it - result = await scorer(state, Target(sample.target)) if scorer else None + results: dict[str, Score] = {} + if scorers: + for scorer in scorers: + scorer_name = registry_log_name(scorer) + score_result = ( + await scorer(state, Target(sample.target)) if scorer else None + ) + if score_result is not None: + results[scorer_name] = score_result progress() # log it @@ -366,10 +382,10 @@ async def task_run_sample( state = (await states_with_base64_images([state]))[0] # log the sample - logger.log_sample(state.epoch, sample, state, result, True) + logger.log_sample(state.epoch, sample, state, results, True) # return - return result + return results async def resolve_dataset( diff --git a/src/inspect_ai/_eval/task/task.py b/src/inspect_ai/_eval/task/task.py index c1129946f..6252b1bbf 100644 --- a/src/inspect_ai/_eval/task/task.py +++ b/src/inspect_ai/_eval/task/task.py @@ -25,7 +25,7 @@ class Task: dataset (Dataset | Sequence[Sample]): Dataset to evaluate plan: (Plan | Solver | list[Solver]): Default plan. If not specified defaults to generate(), a normal call to the model. - scorer: (Scorer | None): Scorer used to evaluate model output. + scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output. metrics (list[Metric]): Additional metrics to compute beyond the base metrics provided by the scorer. config (GenerateConfig): Model generation config. 
@@ -45,7 +45,7 @@ def __init__( self, dataset: Dataset | Sequence[Sample], plan: Plan | Solver | list[Solver] = generate(), - scorer: Scorer | None = None, + scorer: Scorer | list[Scorer] | None = None, metrics: list[Metric] = [], config: GenerateConfig = GenerateConfig(), tool_environment: str | tuple[str, str] | None = None, @@ -58,7 +58,13 @@ def __init__( dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset)) ) self.plan = plan if isinstance(plan, Plan) else Plan(plan) - self.scorer = scorer + self.scorer = ( + scorer + if isinstance(scorer, list) + else [scorer] + if scorer is not None + else None + ) self.metrics = metrics self.config = config self.tool_environment = ( diff --git a/src/inspect_ai/_view/view.py b/src/inspect_ai/_view/view.py index 25844d189..5be2137f0 100644 --- a/src/inspect_ai/_view/view.py +++ b/src/inspect_ai/_view/view.py @@ -23,7 +23,7 @@ ) from inspect_ai._util.dotenv import init_dotenv from inspect_ai._util.error import exception_message -from inspect_ai._util.file import FileSystem, file, filesystem +from inspect_ai._util.file import FileSystem, filesystem from inspect_ai._util.http import InspectHTTPRequestHandler from inspect_ai.log._file import ( eval_log_json, @@ -180,9 +180,8 @@ def handle_log(self) -> None: pass if contents is None: # normal read - with file(path, "rb") as f: - # read file and determine its length - contents = f.read() + log = read_eval_log(path, header_only=False) + contents = eval_log_json(log).encode() # respond with the log length = len(contents) diff --git a/src/inspect_ai/_view/www/App.css b/src/inspect_ai/_view/www/App.css index 9984392c3..5e243f91f 100644 --- a/src/inspect_ai/_view/www/App.css +++ b/src/inspect_ai/_view/www/App.css @@ -4,6 +4,18 @@ --bs-popover-max-width: 50%; } +#app { + height: 100vh; + overflow-y: hidden; +} + +.app-main-grid { + display: grid; + height: 100vh; + overflow-y: hidden; + grid-template-rows: minmax(65px, max-content) max-content 1fr; +} + .modal { --bs-modal-margin: 0.5rem; } @@ -12,6 +24,10 @@ --bs-backdrop-opacity: 0.4; } +body[class^="vscode-"] .app-main-grid { + grid-template-rows: minmax(55px, max-content) max-content 1fr; +} + body[class^="vscode-"] { --bs-body-bg: var(--vscode-editor-background); --bs-card-bg: var(--vscode-editor-background); @@ -57,21 +73,20 @@ body[class^="vscode-"] .modal-content { background-clip: unset; } +body[class^="vscode-"] .multi-score-label { + margin-bottom: 5px; +} + body[class^="vscode-"] { min-width: 400px; } body[class^="vscode-"] .navbar-brand { - font-size: 0.9em; -} - -body[class^="vscode-"] .navbar-brand > div { - margin-top: 0.3em; + font-size: 1em; } -body[class^="vscode-"] .navbar-metrics { - margin-top: -0.4em; - font-size: 0.7em; +body[class^="vscode-"] .navbar-brand .navbar-secondary-text { + font-size: 0.8em; } body[class^="vscode-"] .navbar #sidebarToggle > i.bi { @@ -186,7 +201,6 @@ body[class^="vscode-"] .sidebar .list-group { :root { --bs-navbar-padding-y: 0; --bs-navbar-brand-padding-y: 0; - --navbar-height: 50px; --sidebar-width: 500px; } @@ -199,7 +213,6 @@ body { .navbar { padding-top: 0; padding-bottom: 0; - height: var(--navbar-height); background-color: var(--bs-light); } @@ -209,7 +222,12 @@ body { .navbar-brand { font-weight: 400; - font-size: 1.2em; + font-size: 1.4em; +} + +.navbar-text { + padding-top: 0px; + padding-bottom: 0px; } #sidebarToggle > i.bi { @@ -230,7 +248,6 @@ body { .workspace { display: flex; flex-direction: column; - height: calc(100vh - var(--navbar-height)); } .workspace.full-screen { @@ 
-548,7 +565,6 @@ table.table.table-sm td { .tab-tools > * { flex: 0 1 auto; height: 1.5rem; - max-width: 150px; margin-left: 0.5rem; } diff --git a/src/inspect_ai/_view/www/App.mjs b/src/inspect_ai/_view/www/App.mjs index 966e54f1d..cf4031769 100644 --- a/src/inspect_ai/_view/www/App.mjs +++ b/src/inspect_ai/_view/www/App.mjs @@ -107,8 +107,9 @@ export function App() { capabilities, ); if (logContents) { + const log = logContents.parsed; setCurrentLog({ - contents: logContents.parsed, + contents: log, name: targetLog.name, raw: logContents.raw, }); @@ -265,39 +266,39 @@ export function App() { // if there are no log files, then don't show sidebar const fullScreen = filteredLogs.files.length === 1 && !filteredLogs.log_dir; - const appEnvelope = [ - html` <${Navbar} - file=${currentLog.name} - logs=${filteredLogs} - task=${currentLog.contents?.eval?.task} - model=${currentLog.contents?.eval?.model} - metrics=${currentLog.contents?.results?.metrics} - samples=${currentLog.contents?.samples} - status=${currentLog.contents?.status} - offcanvas=${offcanvas} - />`, - ]; - if (!fullScreen) { - appEnvelope.push(html` - <${Sidebar} - logs=${filteredLogs} - logHeaders=${logHeaders} - loading=${headersLoading} - offcanvas=${offcanvas} - selectedIndex=${selected} - onSelectedIndexChanged=${(index) => { - setSelected(index); - - // hide the sidebar offcanvas - var myOffcanvas = document.getElementById("sidebarOffCanvas"); - var bsOffcanvas = bootstrap.Offcanvas.getInstance(myOffcanvas); - if (bsOffcanvas) { - bsOffcanvas.hide(); - } - }} - /> - `); - } + const navbar = html` <${Navbar} + file=${currentLog.name} + logs=${filteredLogs} + task=${currentLog.contents?.eval?.task} + model=${currentLog.contents?.eval?.model} + results=${currentLog.contents?.results} + samples=${currentLog.contents?.samples} + status=${currentLog.contents?.status} + offcanvas=${offcanvas} + />`; + + const sidebar = + !fullScreen && currentLog.contents + ? html` + <${Sidebar} + logs=${filteredLogs} + logHeaders=${logHeaders} + loading=${headersLoading} + offcanvas=${offcanvas} + selectedIndex=${selected} + onSelectedIndexChanged=${(index) => { + setSelected(index); + + // hide the sidebar offcanvas + var myOffcanvas = document.getElementById("sidebarOffCanvas"); + var bsOffcanvas = bootstrap.Offcanvas.getInstance(myOffcanvas); + if (bsOffcanvas) { + bsOffcanvas.hide(); + } + }} + /> + ` + : ""; const workspace = useMemo(() => { if (status.error) { @@ -317,10 +318,14 @@ export function App() { } }, [logs, currentLog, selected, fullScreen, offcanvas, status]); + const fullScreenClz = fullScreen ? " full-screen" : ""; + const offcanvasClz = offcanvas ? " off-canvas" : ""; + return html` <${AppErrorBoundary}> -
- ${appEnvelope} + ${sidebar} +
+ ${navbar} <${ProgressBar} animating=${status.loading} /> ${workspace}
diff --git a/src/inspect_ai/_view/www/log-schema.json b/src/inspect_ai/_view/www/log-schema.json index 94c1cf5b0..fa8bf07ef 100644 --- a/src/inspect_ai/_view/www/log-schema.json +++ b/src/inspect_ai/_view/www/log-schema.json @@ -652,24 +652,13 @@ }, "EvalResults": { "properties": { - "scorer": { - "anyOf": [ - { - "$ref": "#/$defs/EvalScorer" - }, - { - "type": "null" - } - ], - "default": null - }, - "metrics": { - "additionalProperties": { - "$ref": "#/$defs/EvalMetric" + "scores": { + "default": [], + "items": { + "$ref": "#/$defs/EvalScore" }, - "default": {}, - "title": "Metrics", - "type": "object" + "title": "Scores", + "type": "array" }, "metadata": { "anyOf": [ @@ -686,7 +675,7 @@ }, "title": "EvalResults", "type": "object", - "required": ["scorer", "metrics", "metadata"], + "required": ["scores", "metadata"], "additionalProperties": false }, "EvalRevision": { @@ -807,16 +796,20 @@ "output": { "$ref": "#/$defs/ModelOutput" }, - "score": { + "scores": { "anyOf": [ { - "$ref": "#/$defs/Score" + "additionalProperties": { + "$ref": "#/$defs/Score" + }, + "type": "object" }, { "type": "null" } ], - "default": null + "default": null, + "title": "Scores" }, "metadata": { "title": "Metadata", @@ -831,24 +824,36 @@ "target", "messages", "output", - "score", + "scores", "metadata" ], "title": "EvalSample", "type": "object", "additionalProperties": false }, - "EvalScorer": { + "EvalScore": { "properties": { "name": { "title": "Name", "type": "string" }, + "scorer": { + "title": "Scorer", + "type": "string" + }, "params": { "default": {}, "title": "Params", "type": "object" }, + "metrics": { + "additionalProperties": { + "$ref": "#/$defs/EvalMetric" + }, + "default": [], + "title": "Metrics", + "type": "object" + }, "metadata": { "anyOf": [ { @@ -862,8 +867,8 @@ "title": "Metadata" } }, - "required": ["name", "params", "metadata"], - "title": "EvalScorer", + "required": ["name", "scorer", "params", "metrics", "metadata"], + "title": "EvalScore", "type": "object", "additionalProperties": false }, @@ -1633,7 +1638,7 @@ }, "properties": { "version": { - "default": 1, + "default": 2, "title": "Version", "type": "integer" }, diff --git a/src/inspect_ai/_view/www/log.d.ts b/src/inspect_ai/_view/www/log.d.ts index 559958154..cde6dc97b 100644 --- a/src/inspect_ai/_view/www/log.d.ts +++ b/src/inspect_ai/_view/www/log.d.ts @@ -57,10 +57,12 @@ export type NumChoices = number | null; export type Logprobs = boolean | null; export type TopLogprobs = number | null; export type Name2 = string; -export type Metadata1 = {} | null; +export type Scorer = string; export type Name3 = string; export type Value = number; +export type Metadata1 = {} | null; export type Metadata2 = {} | null; +export type Scores = EvalScore[]; export type Metadata3 = {} | null; export type StartedAt = string; export type CompletedAt = string; @@ -131,14 +133,16 @@ export type Bytes1 = number[] | null; export type Content4 = Logprob[]; export type Choices1 = ChatCompletionChoice[]; export type Error = string | null; +export type Scores1 = { + [k: string]: Score; +} | null; export type Value1 = | string | number - | number | boolean - | (string | number | number | boolean)[] + | (string | number | boolean)[] | { - [k: string]: string | number | number | boolean | null; + [k: string]: string | number | boolean | null; }; export type Answer = string | null; export type Explanation = string | null; @@ -248,14 +252,15 @@ export interface GenerateConfig { top_logprobs: TopLogprobs; } export interface EvalResults { - scorer: EvalScorer | 
null; - metrics: Metrics; + scores: Scores; metadata: Metadata3; } -export interface EvalScorer { +export interface EvalScore { name: Name2; + scorer: Scorer; params: Params1; - metadata: Metadata1; + metrics: Metrics; + metadata: Metadata2; } export interface Params1 {} export interface Metrics { @@ -265,7 +270,7 @@ export interface EvalMetric { name: Name3; value: Value; options: Options; - metadata: Metadata2; + metadata: Metadata1; } export interface Options {} export interface EvalStats { @@ -294,7 +299,7 @@ export interface EvalSample { target: Target; messages: Messages; output: ModelOutput; - score: Score | null; + scores: Scores1; metadata: Metadata5; } export interface ChatMessageSystem { diff --git a/src/inspect_ai/_view/www/src/Constants.mjs b/src/inspect_ai/_view/www/src/Constants.mjs index bdbadaad5..5956e614f 100644 --- a/src/inspect_ai/_view/www/src/Constants.mjs +++ b/src/inspect_ai/_view/www/src/Constants.mjs @@ -29,6 +29,8 @@ export const icons = { critical: "bi bi-fire", }, menu: "bi bi-list", + messages: "bi bi-chat-right-text", + metadata: "bi bi-table", model: "bi bi-cpu", "toggle-right": "bi bi-chevron-right", more: "bi bi-zoom-in", diff --git a/src/inspect_ai/_view/www/src/components/CopyButton.mjs b/src/inspect_ai/_view/www/src/components/CopyButton.mjs index 5641a5d4b..413da8928 100644 --- a/src/inspect_ai/_view/www/src/components/CopyButton.mjs +++ b/src/inspect_ai/_view/www/src/components/CopyButton.mjs @@ -4,7 +4,12 @@ import { icons } from "../Constants.mjs"; export const CopyButton = ({ value }) => { return html` diff --git a/src/inspect_ai/_view/www/src/navbar/Navbar.mjs b/src/inspect_ai/_view/www/src/navbar/Navbar.mjs index c73ded971..d41cfaaae 100644 --- a/src/inspect_ai/_view/www/src/navbar/Navbar.mjs +++ b/src/inspect_ai/_view/www/src/navbar/Navbar.mjs @@ -13,7 +13,7 @@ export const Navbar = ({ model, status, samples, - metrics, + results, offcanvas, }) => { const toggleOffCanClass = offcanvas ? "" : " d-md-none"; @@ -21,7 +21,7 @@ export const Navbar = ({ let statusPanel; if (status === "success") { - statusPanel = html`<${ResultsPanel} results="${metrics}" />`; + statusPanel = html`<${ResultsPanel} results="${results}" />`; } else if (status === "cancelled") { statusPanel = html`<${CanceledPanel} sampleCount=${samples?.length || 0} @@ -34,7 +34,12 @@ export const Navbar = ({ const navbarContents = logFileName ? html`