${statusPanel}
@@ -128,7 +141,7 @@ export const Navbar = ({
@@ -171,43 +184,109 @@ const RunningPanel = () => {
const ResultsPanel = ({ results }) => {
// Map the scores into a list of key/values
- const metrics = results
- ? Object.keys(results).map((key) => {
- return { name: key, value: results[key].value };
- })
- : [];
+ const scorers = {};
+ results.scores.forEach((score) => {
+ scorers[score.name] = Object.keys(score.metrics).map((key) => {
+ return { name: key, value: score.metrics[key].value };
+ });
+ });
- return html`
- ${metrics.map((metric, i) => {
- return html`
-
- ${metric.name}
-
-
- ${formatPrettyDecimal(metric.value)}
-
-
`;
- })}
+ if (results.scores.length === 1) {
+ const metrics = Object.values(scorers)[0];
+ return html`
+ ${metrics.map((metric, i) => {
+ return html`<${VerticalMetric} metric=${metric} isFirst=${i === 0} />`;
+ })}
+
`;
+ } else {
+ return html`
+ ${results.scores.map((score, index) => {
+ return html`<${MultiScorerMetric}
+ scorer=${score}
+ isFirst=${index === 0}
+ />`;
+ })}
+
`;
+ }
+};
+
+const VerticalMetric = ({ metric, isFirst }) => {
+ return html`
+
+ ${metric.name}
+
+
+ ${formatPrettyDecimal(metric.value)}
+
+
`;
+};
+
+const MultiScorerMetric = ({ scorer, isFirst }) => {
+ const baseFontSize = Object.keys(scorer.metrics).length === 1 ? 0.9 : 0.7;
+ return html`
+
+ ${scorer.name}
+
+
+ ${Object.keys(scorer.metrics).map((key) => {
+ const metric = scorer.metrics[key];
+ return html`
${metric.name}
+
+ ${formatPrettyDecimal(metric.value)}
+
`;
+ })}
+
`;
};
diff --git a/src/inspect_ai/_view/www/src/samples/SampleDisplay.mjs b/src/inspect_ai/_view/www/src/samples/SampleDisplay.mjs
index b4b9151cf..85a4ca3b4 100644
--- a/src/inspect_ai/_view/www/src/samples/SampleDisplay.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SampleDisplay.mjs
@@ -7,12 +7,8 @@ import { TabSet, TabPanel } from "../components/TabSet.mjs";
import { inputString } from "../utils/Format.mjs";
-import { sharedStyles } from "../Constants.mjs";
-import {
- arrayToString,
- shortenCompletion,
- answerForSample,
-} from "../utils/Format.mjs";
+import { icons, sharedStyles } from "../Constants.mjs";
+import { arrayToString, shortenCompletion } from "../utils/Format.mjs";
import { SampleScoreView } from "./SampleScoreView.mjs";
import { MarkdownDiv } from "../components/MarkdownDiv.mjs";
@@ -67,25 +63,45 @@ export const SampleDisplay = ({
// The core tabs
const tabs = [
html`
- <${TabPanel} id=${msgTabId} title="Messages" onSelected=${onSelectedTab} selected=${
+ <${TabPanel} id=${msgTabId} title="Messages" icon=${icons.messages} onSelected=${onSelectedTab} selected=${
selectedTab === msgTabId || selectedTab === undefined
}>
<${ChatView} key=${`${baseId}-chat`} id=${`${baseId}-chat`} messages=${
sample.messages
}/>
${TabPanel}>`,
- html`
- <${TabPanel} id=${scoringTabId} title="Scoring" onSelected=${onSelectedTab} selected=${
- selectedTab === scoringTabId
- }>
- <${SampleScoreView}
- sample=${sample}
- context=${context}
- sampleDescriptor=${sampleDescriptor}
- />
- ${TabPanel}>`,
];
+ const scorerNames = Object.keys(sample.scores);
+ if (scorerNames.length === 1) {
+ tabs.push(html`
+ <${TabPanel} id=${scoringTabId} title="Scoring" icon=${icons.scorer} onSelected=${onSelectedTab} selected=${
+ selectedTab === scoringTabId
+ }>
+ <${SampleScoreView}
+ sample=${sample}
+ context=${context}
+ sampleDescriptor=${sampleDescriptor}
+ scorer=${Object.keys(sample.scores)[0]}
+ />
+ ${TabPanel}>`);
+ } else {
+ for (const scorer of Object.keys(sample.scores)) {
+ const tabId = `score-${scorer}`;
+ tabs.push(html`
+ <${TabPanel} id="${tabId}" title="${scorer}" icon=${icons.scorer} onSelected=${onSelectedTab} selected=${
+ selectedTab === tabId
+ }>
+ <${SampleScoreView}
+ sample=${sample}
+ context=${context}
+ sampleDescriptor=${sampleDescriptor}
+ scorer=${scorer}
+ />
+ ${TabPanel}>`);
+ }
+ }
+
const sampleMetadatas = metadataViewsForSample(baseId, sample, context);
if (sampleMetadatas.length > 0) {
tabs.push(
@@ -93,6 +109,7 @@ export const SampleDisplay = ({
<${TabPanel}
id=${metdataTabId}
title="Metadata"
+ icon=${icons.metadata}
onSelected=${onSelectedTab}
selected=${selectedTab === metdataTabId}>
${sampleMetadatas}
@@ -201,7 +218,9 @@ const SampleSummary = ({ id, sample, sampleDescriptor }) => {
});
}
- const fullAnswer = sample ? answerForSample(sample) : undefined;
+ const fullAnswer = sample
+ ? sampleDescriptor.selectedScorer(sample).answer()
+ : undefined;
if (fullAnswer) {
columns.push({
label: "Answer",
@@ -219,11 +238,7 @@ const SampleSummary = ({ id, sample, sampleDescriptor }) => {
columns.push({
label: "Score",
- value: sampleDescriptor?.scoreDescriptor.render
- ? sampleDescriptor.scoreDescriptor.render(sample?.score?.value)
- : sample?.score?.value === null
- ? "null"
- : sample?.score?.value,
+ value: sampleDescriptor?.selectedScore(sample).render(),
size: "minmax(2em, auto)",
center: true,
});
diff --git a/src/inspect_ai/_view/www/src/samples/SampleList.mjs b/src/inspect_ai/_view/www/src/samples/SampleList.mjs
index fc05e1d5d..3cbb050a9 100644
--- a/src/inspect_ai/_view/www/src/samples/SampleList.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SampleList.mjs
@@ -4,11 +4,7 @@ import { useEffect, useMemo } from "preact/hooks";
import { sharedStyles } from "../Constants.mjs";
import { MarkdownDiv } from "../components/MarkdownDiv.mjs";
-import {
- shortenCompletion,
- arrayToString,
- answerForSample,
-} from "../utils/Format.mjs";
+import { shortenCompletion, arrayToString } from "../utils/Format.mjs";
import { EmptyPanel } from "../components/EmptyPanel.mjs";
import { VirtualList } from "../components/VirtualList.mjs";
import { inputString } from "../utils/Format.mjs";
@@ -25,6 +21,7 @@ export const SampleList = (props) => {
style,
selectedIndex,
setSelectedIndex,
+ selectedScore,
nextSample,
prevSample,
showSample,
@@ -97,6 +94,7 @@ export const SampleList = (props) => {
sampleDescriptor=${sampleDescriptor}
selected=${selectedIndex === index}
setSelected=${setSelectedIndex}
+ selectedScore=${selectedScore}
showSample=${showSample}
/>
`;
@@ -268,7 +266,9 @@ const SampleRow = ({
${sample
? html`
<${MarkdownDiv}
- markdown=${shortenCompletion(answerForSample(sample))}
+ markdown=${shortenCompletion(
+ sampleDescriptor.selectedScorer(sample).answer(),
+ )}
style=${{ paddingLeft: "0" }}
class="no-last-para-padding"
/>
@@ -283,11 +283,7 @@ const SampleRow = ({
display: "flex",
}}
>
- ${sampleDescriptor?.scoreDescriptor.render
- ? sampleDescriptor.scoreDescriptor.render(sample?.score?.value)
- : sample?.score?.value === null
- ? "null"
- : sample?.score?.value}
+ ${sampleDescriptor?.selectedScore(sample).render()}
`;
diff --git a/src/inspect_ai/_view/www/src/samples/SampleScoreView.mjs b/src/inspect_ai/_view/www/src/samples/SampleScoreView.mjs
index 11cb14011..821944840 100644
--- a/src/inspect_ai/_view/www/src/samples/SampleScoreView.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SampleScoreView.mjs
@@ -2,10 +2,10 @@ import { html } from "htm/preact";
import {
arrayToString,
shortenCompletion,
- answerForSample,
inputString,
} from "../utils/Format.mjs";
import { MarkdownDiv } from "../components/MarkdownDiv.mjs";
+import { SampleScores } from "./SampleScores.mjs";
const labelStyle = {
paddingRight: "2em",
@@ -13,7 +13,12 @@ const labelStyle = {
paddingBottom: "0",
};
-export const SampleScoreView = ({ sample, sampleDescriptor, style }) => {
+export const SampleScoreView = ({
+ sample,
+ sampleDescriptor,
+ style,
+ scorer,
+}) => {
const scoreInput = [inputString(sample.input)];
if (sample.choices && sample.choices.length > 0) {
scoreInput.push("");
@@ -24,6 +29,10 @@ export const SampleScoreView = ({ sample, sampleDescriptor, style }) => {
);
}
+ const scorerDescriptor = sampleDescriptor.scorer(sample, scorer);
+ const explanation = scorerDescriptor.explanation() || "(No Explanation)";
+ const answer = scorerDescriptor.answer();
+
return html`
{
-
+ |
<${MarkdownDiv}
markdown=${arrayToString(
arrayToString(sample?.target || "none"),
@@ -63,26 +78,25 @@ export const SampleScoreView = ({ sample, sampleDescriptor, style }) => {
class="no-last-para-padding"
/>
|
-
+ |
<${MarkdownDiv}
class="no-last-para-padding"
- markdown=${shortenCompletion(answerForSample(sample))}
+ markdown=${shortenCompletion(answer)}
style=${{ paddingLeft: "0" }}
/>
|
-
- ${sampleDescriptor?.scoreDescriptor.render
- ? sampleDescriptor.scoreDescriptor.render(sample?.score?.value)
- : sample?.score?.value === null
- ? "null"
- : sample?.score?.value}
+ |
+ <${SampleScores}
+ sample=${sample}
+ sampleDescriptor=${sampleDescriptor}
+ scorer=${scorer}
+ />
|
- ${sample?.score?.explanation &&
- sample?.score?.explanation !== answerForSample(sample)
+ ${explanation && explanation !== answer
? html`
@@ -95,9 +109,7 @@ export const SampleScoreView = ({ sample, sampleDescriptor, style }) => {
- <${MarkdownDiv} markdown=${arrayToString(
- sample?.score?.explanation,
- )} style=${{ paddingLeft: "0" }} class="no-last-para-padding"/>
+ <${MarkdownDiv} markdown=${arrayToString(explanation)} style=${{ paddingLeft: "0" }} class="no-last-para-padding"/>
|
{
+ const scores = scorer
+ ? sampleDescriptor.scorer(sample, scorer).scores()
+ : sampleDescriptor.selectedScorer(sample).scores();
+
+ if (scores.length === 1) {
+ return scores[0].rendered();
+ } else {
+ const rows = scores.map((score) => {
+ return html`
${score.name}
+
${score.rendered()}
`;
+ });
+ return html`
+ ${rows}
+
`;
+ }
+};
diff --git a/src/inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs b/src/inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs
index 88262a4cd..b79cf02c5 100644
--- a/src/inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs
@@ -5,7 +5,6 @@ import {
formatDecimalNoTrailingZeroes,
inputString,
arrayToString,
- answerForSample,
} from "../utils/Format.mjs";
import { RenderedContent } from "../components/RenderedContent.mjs";
import { isNumeric } from "../utils/Type.mjs";
@@ -22,16 +21,99 @@ export const kScoreTypeOther = "other";
export const kScoreTypeObject = "object";
export const kScoreTypeBoolean = "boolean";
-export const samplesDescriptor = (samples, epochs, context) => {
+export const samplesDescriptor = (
+ selectedScore,
+ scorers,
+ samples,
+ epochs,
+ context,
+) => {
if (!samples) {
return undefined;
}
+ const score = (sample, scorer = selectedScore?.scorer) => {
+ if (sample.scores[scorer]) {
+ return sample.scores[scorer];
+ } else {
+ return undefined;
+ }
+ };
+
+ // function for retrieving the sample score value
+ const scoreValue = (sample) => {
+ // no scores, no value
+ if (Object.keys(sample.scores).length === 0 || !selectedScore) {
+ return undefined;
+ }
+
+ if (
+ selectedScore.scorer !== selectedScore.name &&
+ sample.scores[selectedScore.scorer] &&
+ sample.scores[selectedScore.scorer].value
+ ) {
+ return sample.scores[selectedScore.scorer].value[selectedScore.name];
+ } else if (sample.scores[selectedScore.name]) {
+ return sample.scores[selectedScore.name].value;
+ } else {
+ return undefined;
+ }
+ };
+
+ // Retrieve the answer for a sample
+ const scoreAnswer = (sample, scorer) => {
+ if (sample) {
+ const sampleScore = score(sample, scorer);
+ if (sampleScore && sampleScore.answer) {
+ return sampleScore.answer;
+ } else if (sample.output.choices && sample.output.choices.length > 0) {
+ const content = sample.output.choices[0].message.content;
+ if (typeof content === "string") {
+ return content;
+ } else {
+ // TODO: Support image completions.
+ return content.length > 0 ? content[0].text : "";
+ }
+ }
+ } else {
+ return undefined;
+ }
+ };
+
+ const scoreExplanation = (sample, scorer) => {
+ if (sample) {
+ const sampleScore = score(sample, scorer);
+ if (sampleScore && sampleScore.explanation) {
+ return sampleScore.explanation;
+ }
+ }
+ return undefined;
+ };
+
const uniqScoreValues = [
...new Set(
samples
- .filter((sample) => !!sample.score)
- .map((sample) => sample.score.value)
+ .filter((sample) => !!sample.scores)
+ .filter((sample) => {
+ // There is no selected scorer, so include this value
+ if (!selectedScore) {
+ return true;
+ }
+
+ if (selectedScore.scorer !== selectedScore.name) {
+ return (
+ Object.keys(sample.scores).includes(selectedScore.scorer) &&
+ Object.keys(sample.scores[selectedScore.scorer].value).includes(
+ selectedScore.name,
+ )
+ );
+ } else {
+ return Object.keys(sample.scores).includes(selectedScore.name);
+ }
+ })
+ .map((sample) => {
+ return scoreValue(sample);
+ })
.filter((value) => {
return value !== null;
}),
@@ -58,10 +140,7 @@ export const samplesDescriptor = (samples, epochs, context) => {
(previous, current) => {
previous[0] = Math.max(previous[0], inputString(current.input).length);
previous[1] = Math.max(previous[1], arrayToString(current.target).length);
- previous[2] = Math.max(
- previous[2],
- answerForSample(current)?.length || 0,
- );
+ previous[2] = Math.max(previous[2], scoreAnswer(current)?.length || 0);
return previous;
},
[0, 0, 0],
@@ -74,7 +153,103 @@ export const samplesDescriptor = (samples, epochs, context) => {
target: sizes[1] / base,
answer: sizes[2] / base,
};
- return { scoreDescriptor, epochs, messageShape };
+
+ const scoreRendered = (sample) => {
+ const score = scoreValue(sample);
+ if (score === null || score === "undefined") {
+ return "null";
+ } else if (scoreDescriptor.render) {
+ return scoreDescriptor.render(score);
+ } else {
+ return score;
+ }
+ };
+
+ const scorerDescriptor = (sample, scorer) => {
+ return {
+ explanation: () => {
+ return scoreExplanation(sample, scorer);
+ },
+ answer: () => {
+ return scoreAnswer(sample, scorer);
+ },
+ scores: () => {
+ if (!sample || !sample.scores) {
+ return [];
+ }
+
+ // Make a list of all the valid score names (this is
+ // used to distinguish between dictionaries that contain
+ // scores that should be treated as standalone scores and
+ // dictionaries that just contain random values, which is allowed)
+ const scoreNames = scorers.map((score) => {
+ return score.name;
+ });
+ const sampleScorer = sample.scores[scorer];
+ const scoreVal = sampleScorer.value;
+ if (typeof scoreVal === "object") {
+ const names = Object.keys(scoreVal);
+ if (
+ names.find((name) => {
+ return !scoreNames.includes(name);
+ })
+ ) {
+ // Since this dictionary contains keys which are not scores
+ // we just treat it like an opaque dictionary
+ return [
+ {
+ name: scorer,
+ rendered: () => {
+ return scoreDescriptor.render(scoreVal);
+ },
+ },
+ ];
+ } else {
+ // Since this dictionary contains keys which are scores
+ // we actually render the individual scores
+ const scores = names.map((name) => {
+ return {
+ name,
+ rendered: () => {
+ return scoreDescriptor.render(scoreVal[name]);
+ },
+ };
+ });
+ return scores;
+ }
+ } else {
+ return [
+ {
+ name: scorer,
+ rendered: () => {
+ return scoreDescriptor.render(scoreVal);
+ },
+ },
+ ];
+ }
+ },
+ };
+ };
+
+ return {
+ scoreDescriptor,
+ epochs,
+ messageShape,
+ selectedScore: (sample) => {
+ return {
+ value: scoreValue(sample),
+ render: () => {
+ return scoreRendered(sample);
+ },
+ };
+ },
+ scorer: (sample, scorer) => {
+ return scorerDescriptor(sample, scorer);
+ },
+ selectedScorer: (sample) => {
+ return scorerDescriptor(sample, selectedScore?.scorer);
+ },
+ };
};
const scoreCategorizers = [
@@ -162,12 +337,11 @@ const scoreCategorizers = [
scoreType: kScoreTypeObject,
categories,
render: (score) => {
- if (score === null) {
+ if (score === null || score === undefined) {
return "[null]";
}
const scores = [];
-
const keys = Object.keys(score);
keys.forEach((key, index) => {
const value = score[key];
diff --git a/src/inspect_ai/_view/www/src/samples/SamplesTab.mjs b/src/inspect_ai/_view/www/src/samples/SamplesTab.mjs
index 414830f7c..8cb651036 100644
--- a/src/inspect_ai/_view/www/src/samples/SamplesTab.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SamplesTab.mjs
@@ -16,6 +16,8 @@ export const SamplesTab = (props) => {
sort,
epoch,
context,
+ selectedScore,
+ //setSelectedScore,
} = props;
const [selectedIndex, setSelectedIndex] = useState(0);
@@ -169,6 +171,7 @@ export const SamplesTab = (props) => {
sampleDescriptor=${sampleDescriptor}
selectedIndex=${selectedIndex}
setSelectedIndex=${setSelectedIndex}
+ selectedScore=${selectedScore}
nextSample=${nextSample}
prevSample=${previousSample}
showSample=${showSample}
diff --git a/src/inspect_ai/_view/www/src/samples/SamplesTools.mjs b/src/inspect_ai/_view/www/src/samples/SamplesTools.mjs
index 3dbacfd46..2a3eb5668 100644
--- a/src/inspect_ai/_view/www/src/samples/SamplesTools.mjs
+++ b/src/inspect_ai/_view/www/src/samples/SamplesTools.mjs
@@ -3,6 +3,7 @@ import { html } from "htm/preact";
import { EpochFilter } from "./tools/EpochFilter.mjs";
import { SortFilter } from "./tools/SortFilter.mjs";
import { SampleFilter } from "./tools/SampleFilter.mjs";
+import { SelectScorer } from "./tools/SelectScorer.mjs";
export const SampleTools = (props) => {
const {
@@ -14,10 +15,24 @@ export const SampleTools = (props) => {
setSort,
epochs,
sampleDescriptor,
+ score,
+ setScore,
+ scores,
} = props;
const hasEpochs = epochs > 1;
const tools = [];
+
+ if (scores.length > 1) {
+ tools.push(
+ html`<${SelectScorer}
+ scores=${scores}
+ score=${score}
+ setScore=${setScore}
+ />`,
+ );
+ }
+
if (hasEpochs) {
tools.push(
html`<${EpochFilter}
@@ -37,7 +52,12 @@ export const SampleTools = (props) => {
);
tools.push(
- html`<${SortFilter} sort=${sort} setSort=${setSort} epochs=${hasEpochs} />`,
+ html`<${SortFilter}
+ sampleDescriptor=${sampleDescriptor}
+ sort=${sort}
+ setSort=${setSort}
+ epochs=${hasEpochs}
+ />`,
);
return tools;
diff --git a/src/inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs b/src/inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs
index ba8c6af25..cd42d7975 100644
--- a/src/inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs
+++ b/src/inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs
@@ -20,12 +20,13 @@ export const SampleFilter = ({ descriptor, filter, filterChanged }) => {
filterChanged({
value: val,
filterFn: (sample, value) => {
- if (typeof sample.score.value === "string") {
- return sample.score.value.toLowerCase() === value?.toLowerCase();
- } else if (typeof sample.score.value === "object") {
- return JSON.stringify(sample.score.value) == value;
+ const score = descriptor.selectedScore(sample);
+ if (typeof score.value === "string") {
+ return score.value.toLowerCase() === value?.toLowerCase();
+ } else if (typeof score.value === "object") {
+ return JSON.stringify(score.value) == value;
} else {
- return sample.score.value === value;
+ return score.value === value;
}
},
});
@@ -35,7 +36,7 @@ export const SampleFilter = ({ descriptor, filter, filterChanged }) => {
const filterInput = (e) => {
filterChanged({
value: e.currentTarget.value,
- filterFn: filterText,
+ filterFn: filterText(descriptor),
});
};
@@ -76,6 +77,7 @@ export const SampleFilter = ({ descriptor, filter, filterChanged }) => {
class="form-control"
value=${filter.value}
placeholder="Filter Samples (score)"
+ style=${{ width: "150px" }}
onInput=${filterInput}
/>
`;
@@ -128,72 +130,75 @@ const SelectFilter = ({ value, options, filterFn }) => {
`;
};
-const filterText = (sample, value) => {
- if (!value) {
- return true;
- } else {
- if (isNumeric(value)) {
- if (typeof sample.score.value === "number") {
- return sample.score.value === Number(value);
- } else {
- return Number(sample.score.value) === Number(value);
- }
+const filterText = (descriptor) => {
+ return (sample, value) => {
+ const score = descriptor.selectedScore(sample);
+ if (!value) {
+ return true;
} else {
- const filters = [
- {
- prefix: ">=",
- fn: (score, val) => {
- return score >= val;
+ if (isNumeric(value)) {
+ if (typeof score.value === "number") {
+ return score.value === Number(value);
+ } else {
+ return Number(score.value) === Number(value);
+ }
+ } else {
+ const filters = [
+ {
+ prefix: ">=",
+ fn: (score, val) => {
+ return score >= val;
+ },
},
- },
- {
- prefix: "<=",
- fn: (score, val) => {
- return score <= val;
+ {
+ prefix: "<=",
+ fn: (score, val) => {
+ return score <= val;
+ },
},
- },
- {
- prefix: ">",
- fn: (score, val) => {
- return score > val;
+ {
+ prefix: ">",
+ fn: (score, val) => {
+ return score > val;
+ },
},
- },
- {
- prefix: "<",
- fn: (score, val) => {
- return score < val;
+ {
+ prefix: "<",
+ fn: (score, val) => {
+ return score < val;
+ },
},
- },
- {
- prefix: "=",
- fn: (score, val) => {
- return score === val;
+ {
+ prefix: "=",
+ fn: (score, val) => {
+ return score === val;
+ },
},
- },
- {
- prefix: "!=",
- fn: (score, val) => {
- return score !== val;
+ {
+ prefix: "!=",
+ fn: (score, val) => {
+ return score !== val;
+ },
},
- },
- ];
+ ];
- for (const filter of filters) {
- if (value?.startsWith(filter.prefix)) {
- const val = value.slice(filter.prefix.length).trim();
- if (!val) {
- return true;
- }
+ for (const filter of filters) {
+ if (value?.startsWith(filter.prefix)) {
+ const val = value.slice(filter.prefix.length).trim();
+ if (!val) {
+ return true;
+ }
- const num = Number(val);
- return filter.fn(sample.score.value, num);
+ const num = Number(val);
+ return filter.fn(score.value, num);
+ }
+ }
+ if (typeof score.value === "string") {
+ return score.value.toLowerCase() === value?.toLowerCase();
+ } else {
+ return score.value === value;
}
- }
- if (typeof sample.score.value === "string") {
- return sample.score.value.toLowerCase() === value?.toLowerCase();
- } else {
- return sample.score.value === value;
}
}
- }
+ };
};
diff --git a/src/inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs b/src/inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs
new file mode 100644
index 000000000..c6640305e
--- /dev/null
+++ b/src/inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs
@@ -0,0 +1,117 @@
+import { html } from "htm/preact";
+
+export const SelectScorer = ({ scores, score, setScore }) => {
+ const scorers = scores.reduce((accum, scorer) => {
+ if (
+ !accum.find((sc) => {
+ return scorer.scorer === sc.scorer;
+ })
+ ) {
+ accum.push(scorer);
+ }
+ return accum;
+ }, []);
+
+ if (scorers.length === 1) {
+ // There is only a single scorer in play, just show the list of available scores
+ return html`
+
+ Score:
+ <${ScoreSelector}
+ scores=${scores}
+ selectedIndex=${scoreIndex(score, scores)}
+ selectedIndexChanged=${(index) => {
+ setScore(scores[index]);
+ }}
+ />
+
+ `;
+ } else {
+ // selected scorer
+
+ const scorerScores = scores.filter((sc) => {
+ return sc.scorer === score.scorer;
+ });
+
+ const selectors = [
+ html`<${ScorerSelector}
+ scorers=${scorers}
+ selectedIndex=${scorerIndex(score, scorers)}
+ selectedIndexChanged=${(index) => {
+ setScore(scorers[index]);
+ }}
+ />`,
+ ];
+ if (scorerScores.length > 1) {
+ selectors.push(
+ html`<${ScoreSelector}
+ style=${{ marginLeft: "1em" }}
+ scores=${scorerScores}
+ selectedIndex=${scoreIndex(score, scorerScores)}
+ selectedIndexChanged=${(index) => {
+ setScore(scorerScores[index]);
+ }}
+ />`,
+ );
+ }
+
+ // There are multiple scorers, so show a scorer selector and, if the selected scorer has more than one score, a score selector
+ return html`
+
+ Scorer:
+ ${selectors}
+
+ `;
+ }
+};
+
+const ScoreSelector = ({
+ scores,
+ selectedIndex,
+ selectedIndexChanged,
+ style,
+}) => {
+ return html`
`;
+};
+
+const ScorerSelector = ({ scorers, selectedIndex, selectedIndexChanged }) => {
+ return html`
`;
+};
+
+const scoreIndex = (score, scores) =>
+ scores.findIndex((sc) => {
+ return sc.name === score.name && sc.scorer === score.scorer;
+ });
+
+const scorerIndex = (score, scores) =>
+ scores.findIndex((sc) => {
+ return sc.scorer === score.scorer;
+ });
diff --git a/src/inspect_ai/_view/www/src/samples/tools/SortFilter.mjs b/src/inspect_ai/_view/www/src/samples/tools/SortFilter.mjs
index e0a352efa..dbd067bdc 100644
--- a/src/inspect_ai/_view/www/src/samples/tools/SortFilter.mjs
+++ b/src/inspect_ai/_view/www/src/samples/tools/SortFilter.mjs
@@ -10,7 +10,7 @@ const kScoreDescVal = "score-desc";
export const kDefaultSort = kSampleAscVal;
-export const SortFilter = ({ sort, setSort, epochs }) => {
+export const SortFilter = ({ sampleDescriptor, sort, setSort, epochs }) => {
const options = [
{ label: "sample asc", val: kSampleAscVal },
{ label: "sample desc", val: kSampleDescVal },
@@ -25,14 +25,16 @@ export const SortFilter = ({ sort, setSort, epochs }) => {
val: kEpochDescVal,
});
}
- options.push({
- label: "score asc",
- val: kScoreAscVal,
- });
- options.push({
- label: "score desc",
- val: kScoreDescVal,
- });
+ if (sampleDescriptor?.scoreDescriptor?.compare) {
+ options.push({
+ label: "score asc",
+ val: kScoreAscVal,
+ });
+ options.push({
+ label: "score desc",
+ val: kScoreDescVal,
+ });
+ }
return html`
{
return b.epoch - a.epoch;
case kScoreAscVal:
return sampleDescriptor.scoreDescriptor.compare(
- a.score.value,
- b.score.value,
+ sampleDescriptor.selectedScore(a).value,
+ sampleDescriptor.selectedScore(b).value,
);
case kScoreDescVal:
return sampleDescriptor.scoreDescriptor.compare(
- b.score.value,
- a.score.value,
+ sampleDescriptor.selectedScore(b).value,
+ sampleDescriptor.selectedScore(a).value,
);
}
});
diff --git a/src/inspect_ai/_view/www/src/sidebar/Sidebar.mjs b/src/inspect_ai/_view/www/src/sidebar/Sidebar.mjs
index b728e1a52..a729fb36e 100644
--- a/src/inspect_ai/_view/www/src/sidebar/Sidebar.mjs
+++ b/src/inspect_ai/_view/www/src/sidebar/Sidebar.mjs
@@ -53,10 +53,13 @@ export const Sidebar = ({
-
- <${ProgressBar} animating=${loading} style=${{ marginTop: "-2px" }} />
+
+ <${ProgressBar} animating=${loading} />
-
+
${logs.files.map((file, index) => {
const active = index === selectedIndex ? " active" : "";
const logHeader = logHeaders[file.name];
@@ -69,7 +72,15 @@ export const Sidebar = ({
const model = logHeader?.eval?.model;
const dataset = logHeader?.eval?.dataset;
- const scorer = logHeader?.results?.scorer?.name;
+ const scorer = logHeader?.results?.scores
+ ?.map((scorer) => {
+ return scorer.name;
+ })
+ .join(",");
+ const scorerLabel =
+ (logHeader?.results?.scores || []).length === 1
+ ? "scorer"
+ : "scorers";
const completed = logHeader?.stats?.completed_at;
const time = completed ? new Date(completed) : undefined;
@@ -114,57 +125,7 @@ export const Sidebar = ({
`
: ""}
- ${logHeader?.results?.metrics
- ? html`
- ${Object.keys(logHeader?.results.metrics).map(
- (metric) => {
- return html`
-
-
- ${logHeader?.results.metrics[metric].name}
-
-
- ${formatPrettyDecimal(
- logHeader?.results.metrics[metric].value,
- )}
-
-
- `;
- },
- )}
-
`
- : logHeader?.status === "error"
- ? html`
- Eval Error
-
`
- : logHeader?.status === "cancelled"
- ? html`
- Cancelled
-
`
- : logHeader?.status === "started"
- ? html`
- Loading...
-
`
- : ""}
+ <${EvalStatus} logHeader=${logHeader} />
@@ -186,7 +147,7 @@ export const Sidebar = ({
}}
>
dataset: ${dataset.name || "(samples)"}scorer: ${scorer}
+ >${scorerLabel}: ${scorer}
`
: ""}
@@ -212,6 +173,129 @@ const prettyDir = (path) => {
}
};
+const EvalStatus = ({ logHeader }) => {
+ switch (logHeader.status) {
+ case "error":
+ return html`<${StatusError} message="Error" />`;
+
+ case "cancelled":
+ return html`<${StatusCancelled} message="Cancelled" />`;
+
+ case "started":
+ return html`<${StatusRunning} message="Running" />`;
+
+ default:
+ if (logHeader?.results?.scores && logHeader.results.scores.length > 0) {
+ if (logHeader.results.scores.length === 1) {
+ return html`<${SidebarScore}
+ scorer=${logHeader.results.scores[0]}
+ />`;
+ } else {
+ return html`<${SidebarScores} scores=${logHeader.results.scores} />`;
+ }
+ } else {
+ return "";
+ }
+ }
+};
+
+const SidebarScore = ({ scorer }) => {
+ return html`
+ ${Object.keys(scorer.metrics).map((metric) => {
+ return html`
+
+
${scorer.metrics[metric].name}
+
+ ${formatPrettyDecimal(scorer.metrics[metric].value)}
+
+
+ `;
+ })}
+
`;
+};
+
+const SidebarScores = ({ scores }) => {
+ return html`
+ ${scores.map((score) => {
+ const name = score.name;
+ return html`
+
+
+ ${name}
+
+
+ ${Object.keys(score.metrics).map((key) => {
+ const metric = score.metrics[key];
+ return html`
${metric.name}
+
+ ${formatPrettyDecimal(metric.value)}
+
`;
+ })}
+
+
+ `;
+ })}
+
`;
+};
+
+const StatusCancelled = ({ message }) => {
+ return html`
${message}
`;
+};
+
+const StatusRunning = ({ message }) => {
+ return html`
+ ${message}
+
`;
+};
+
+const StatusError = ({ message }) => {
+ return html`
${message}
`;
+};
+
const LogDirectoryTitle = ({ log_dir, offcanvas }) => {
if (log_dir) {
const displayDir = prettyDir(log_dir);
diff --git a/src/inspect_ai/_view/www/src/title/TitleBlock.mjs b/src/inspect_ai/_view/www/src/title/TitleBlock.mjs
index 97d481166..1cfc02d3f 100644
--- a/src/inspect_ai/_view/www/src/title/TitleBlock.mjs
+++ b/src/inspect_ai/_view/www/src/title/TitleBlock.mjs
@@ -35,11 +35,12 @@ export const TitleBlock = ({ log, status }) => {
`,
});
+ const label = log?.results?.scores?.length > 1 ? "Scorers" : "Scorer";
values.push({
size: "auto",
- value: html`<${LabeledValue} label="Scorer" style=${staticColStyle}>
+ value: html`<${LabeledValue} label="${label}" style=${staticColStyle}>
<${ScorerSummary}
- scorer=${log?.results?.scorer} />
+ scorers=${log?.results?.scores} />
${LabeledValue}>`,
});
@@ -90,13 +91,19 @@ const DatasetSummary = ({ dataset, samples, epochs, style }) => {
`;
};
-const ScorerSummary = ({ scorer }) => {
- if (!scorer) {
+const ScorerSummary = ({ scorers }) => {
+ if (!scorers) {
return "";
}
const summary = [];
- summary.push(scorer.name);
+ summary.push(
+ scorers
+ .map((scorer) => {
+ return scorer.name;
+ })
+ .join(", "),
+ );
return summary;
};
diff --git a/src/inspect_ai/_view/www/src/utils/Format.mjs b/src/inspect_ai/_view/www/src/utils/Format.mjs
index 157d50bd7..ca30897d5 100644
--- a/src/inspect_ai/_view/www/src/utils/Format.mjs
+++ b/src/inspect_ai/_view/www/src/utils/Format.mjs
@@ -36,24 +36,6 @@ export const shortenCompletion = (completion) => {
return shortened || completion;
};
-export const answerForSample = (sample) => {
- if (sample) {
- if (sample.score?.answer) {
- return sample.score.answer;
- } else if (sample.output.choices && sample.output.choices.length > 0) {
- const content = sample.output.choices[0].message.content;
- if (typeof content === "string") {
- return content;
- } else {
- // TODO: Support image completions.
- return content.length > 0 ? content[0].text : "";
- }
- }
- } else {
- return undefined;
- }
-};
-
// Gets a string for a sample input
export const inputString = (input) => {
if (typeof input === "string") {
diff --git a/src/inspect_ai/_view/www/src/workspace/WorkSpace.mjs b/src/inspect_ai/_view/www/src/workspace/WorkSpace.mjs
index 3e88151c4..05fc2a21d 100644
--- a/src/inspect_ai/_view/www/src/workspace/WorkSpace.mjs
+++ b/src/inspect_ai/_view/www/src/workspace/WorkSpace.mjs
@@ -36,23 +36,21 @@ export const WorkSpace = (props) => {
const divRef = useRef();
const codeRef = useRef();
+ // alias the log for the workspace
const workspaceLog = props.log;
+
+ // State tracking for the view
const [currentTaskId, setCurrentTaskId] = useState(
workspaceLog?.contents?.eval?.run_id,
);
-
- // State tracking for the view
- const [state, setState] = useState({
- logFiltered: undefined,
- viewState: {
- selectedTab: kEvalTabId,
- openSamples: [],
- filter: {},
- epoch: "all",
- sort: kDefaultSort,
- renderedCode: false,
- },
- });
+ const [selectedTab, setSelectedTab] = useState(kEvalTabId);
+ const [scores, setScores] = useState([]);
+ const [score, setScore] = useState(undefined);
+ const [samplesDesc, setSamplesDesc] = useState(undefined);
+ const [filter, setFilter] = useState({});
+ const [epoch, setEpoch] = useState("all");
+ const [sort, setSort] = useState(kDefaultSort);
+ const [renderedCode, setRenderedCode] = useState(false);
// Context is shared with most/all components and
// allows for global information to pass between components
@@ -63,12 +61,68 @@ export const WorkSpace = (props) => {
},
};
- const sampleDescriptor = useMemo(() => {
- return samplesDescriptor(
+ const clearSampleTools = useCallback(() => {
+ setEpoch("all");
+ setFilter({});
+ setSort(kDefaultSort);
+ }, [setEpoch, setFilter, setSort]);
+
+ // Display the log
+ useEffect(() => {
+ if (
+ workspaceLog.contents &&
+ workspaceLog.contents.eval?.run_id !== currentTaskId
+ ) {
+ const defaultTab =
+ workspaceLog.contents?.status !== "error" ? kEvalTabId : kInfoTabId;
+ setSelectedTab(defaultTab);
+ if (divRef.current) {
+ divRef.current.scrollTop = 0;
+ }
+ }
+ }, [workspaceLog, divRef, currentTaskId, setSelectedTab]);
+
+ useEffect(() => {
+ // Select the default scorer to use
+ const scorer = workspaceLog?.contents?.results?.scores[0]
+ ? {
+ name: workspaceLog.contents.results.scores[0].name,
+ scorer: workspaceLog.contents.results.scores[0].scorer,
+ }
+ : undefined;
+ const scorers = (workspaceLog.contents?.results?.scores || []).map(
+ (score) => {
+ return {
+ name: score.name,
+ scorer: score.scorer,
+ };
+ },
+ );
+
+ // Reset state
+ setScores(scorers);
+ setScore(scorer);
+ clearSampleTools();
+ setRenderedCode(false);
+ }, [workspaceLog, setScores, setScore, setEpoch, setFilter, setRenderedCode]);
+
+ useEffect(() => {
+ clearSampleTools();
+ }, [score]);
+
+ useEffect(() => {
+ const sampleDescriptor = samplesDescriptor(
+ score,
+ scores,
workspaceLog.contents?.samples,
workspaceLog.contents?.eval?.config?.epochs || 1,
context,
);
+ setSamplesDesc(sampleDescriptor);
+ }, [workspaceLog, score, scores, setSamplesDesc]);
+
+ useEffect(() => {
+ setCurrentTaskId(workspaceLog.contents?.eval?.run_id);
}, [workspaceLog]);
// Tabs that are available within the app
@@ -89,28 +143,33 @@ export const WorkSpace = (props) => {
return html` <${SamplesTab}
task=${workspaceLog.contents?.eval?.task}
model=${workspaceLog.contents?.eval?.model}
+ selectedScore=${score}
+ setSelectedScore=${setScore}
samples=${workspaceLog.contents?.samples}
- sampleDescriptor=${sampleDescriptor}
- filter=${state.viewState.filter}
- sort=${state.viewState.sort}
- epoch=${state.viewState.epoch}
+ sampleDescriptor=${samplesDesc}
+ filter=${filter}
+ sort=${sort}
+ epoch=${epoch}
context=${context}
/>`;
},
- tools: (state) => {
+ tools: () => {
// Don't show tools if there is only a single sample
if (workspaceLog.contents?.samples?.length <= 1) {
return "";
}
return html`<${SampleTools}
- epoch=${state.viewState.epoch}
+ epoch=${epoch}
epochs=${workspaceLog.contents?.eval?.config?.epochs}
setEpoch=${setEpoch}
- filter=${state.viewState.filter}
- filterChanged=${filterChanged}
- sort=${state.viewState.sort}
+ filter=${filter}
+ filterChanged=${setFilter}
+ sort=${sort}
setSort=${setSort}
- sampleDescriptor=${sampleDescriptor}
+ score=${score}
+ setScore=${setScore}
+ scores=${scores}
+ sampleDescriptor=${samplesDesc}
/>`;
},
};
@@ -190,7 +249,7 @@ export const WorkSpace = (props) => {
/>`,
);
} else {
- if (codeRef.current && !state.viewState.renderedCode) {
+ if (codeRef.current && !renderedCode) {
if (workspaceLog.raw.length < kPrismRenderMaxSize) {
codeRef.current.innerHTML = Prism.highlight(
workspaceLog.raw,
@@ -203,9 +262,7 @@ export const WorkSpace = (props) => {
codeRef.current.appendChild(textNode);
}
- const viewState = state.viewState;
- viewState.renderedCode = true;
- setState({ viewState });
+ setRenderedCode(true);
}
renderedContent.push(
html`
@@ -248,40 +305,18 @@ export const WorkSpace = (props) => {
};
return resolvedTabs;
- }, [state, workspaceLog]);
-
- const setSelectedTab = (currentState, selectedTab) => {
- const viewState = currentState.viewState;
- viewState.selectedTab = selectedTab;
- setState({ viewState });
- };
-
- const filterChanged = useCallback(
- (filter) => {
- const viewState = state.viewState;
- viewState.filter = filter;
- setState({ viewState });
- },
- [state, setState],
- );
-
- const setEpoch = useCallback(
- (epoch) => {
- const viewState = state.viewState;
- viewState.epoch = epoch;
- setState({ viewState });
- },
- [state],
- );
-
- const setSort = useCallback(
- (sort) => {
- const viewState = state.viewState;
- viewState.sort = sort;
- setState({ viewState });
- },
- [state],
- );
+ }, [
+ samplesDesc,
+ workspaceLog,
+ filter,
+ setFilter,
+ epoch,
+ setEpoch,
+ sort,
+ setSort,
+ renderedCode,
+ setRenderedCode,
+ ]);
const copyFeedback = useCallback(
(e) => {
@@ -301,36 +336,9 @@ export const WorkSpace = (props) => {
}, 1250);
}
},
- [state],
+ [renderedCode],
);
- // Display the log
- useEffect(() => {
- if (workspaceLog.contents && workspaceLog.eval?.run_id !== currentTaskId) {
- const defaultTab =
- workspaceLog.contents?.status !== "error" ? kEvalTabId : kInfoTabId;
- setSelectedTab(state, defaultTab);
- if (divRef.current) {
- divRef.current.scrollTop = 0;
- }
- }
-
- // Reset state
- const newState = {
- openSamples: [],
- filter: {},
- epoch: "all",
- sort: kDefaultSort,
- renderedCode: false,
- };
-
- setState({ viewState: { ...state.viewState, ...newState } });
- }, [workspaceLog, divRef, currentTaskId]);
-
- useEffect(() => {
- setCurrentTaskId(workspaceLog.contents?.eval?.run_id);
- }, [workspaceLog]);
-
// Compute the tools for this tab
const tabTools = Object.keys(tabs)
.map((key) => {
@@ -338,32 +346,27 @@ export const WorkSpace = (props) => {
return tab;
})
.filter((tab) => {
- return tab.id === state.viewState.selectedTab;
+ return tab.id === selectedTab;
})
.map((tab) => {
if (tab.tools) {
- const tools = tab.tools(state);
+ const tools = tab.tools();
return tools;
} else {
return "";
}
});
- const selectTab = (event) => {
- const id = event.currentTarget.id;
- setSelectedTab(state, id);
- };
-
return html`<${WorkspaceDisplay}
divRef=${divRef}
tabs=${tabs}
tabTools=${tabTools}
log=${workspaceLog}
- selectedTab=${state.viewState.selectedTab}
+ selectedTab=${selectedTab}
fullScreen=${props.fullScreen}
offcanvas=${props.offcanvas}
context=${context}
- selectTab=${selectTab}
+ setSelectedTab=${setSelectedTab}
afterBodyElements=${afterBodyElements}
/>`;
};
@@ -373,9 +376,7 @@ const WorkspaceDisplay = ({
selectedTab,
tabs,
tabTools,
- selectTab,
- fullScreen,
- offcanvas,
+ setSelectedTab,
divRef,
context,
afterBodyElements,
@@ -383,11 +384,9 @@ const WorkspaceDisplay = ({
if (log.contents === undefined) {
return html`<${EmptyPanel} />`;
} else {
- const fullScreenClz = fullScreen ? " full-screen" : "";
- const offcanvasClz = offcanvas ? " off-canvas" : "";
-
- return html`
<${TitleBlock}
created=${log.contents?.eval.created}
@@ -426,7 +425,10 @@ const WorkspaceDisplay = ({
return html`<${TabPanel}
id=${tab.id}
title="${tab.label}"
- onSelected="${selectTab}"
+ onSelected=${(e) => {
+ const id = e.currentTarget.id;
+ setSelectedTab(id);
+ }}
selected=${selectedTab === tab.id}
scrollable=${!!tab.scrollable}>
${tab.content()}
diff --git a/src/inspect_ai/log/__init__.py b/src/inspect_ai/log/__init__.py
index 14fb46f13..6763178fd 100644
--- a/src/inspect_ai/log/__init__.py
+++ b/src/inspect_ai/log/__init__.py
@@ -15,7 +15,7 @@
EvalResults,
EvalRevision,
EvalSample,
- EvalScorer,
+ EvalScore,
EvalSpec,
EvalStats,
LoggingLevel,
@@ -34,7 +34,7 @@
"EvalResults",
"EvalRevision",
"EvalSample",
- "EvalScorer",
+ "EvalScore",
"EvalSpec",
"EvalStats",
"EvalLogInfo",
diff --git a/src/inspect_ai/log/_file.py b/src/inspect_ai/log/_file.py
index 2e9803097..e77b8a2b3 100644
--- a/src/inspect_ai/log/_file.py
+++ b/src/inspect_ai/log/_file.py
@@ -27,6 +27,8 @@
Recorder,
)
+LOG_SCHEMA_VERSION = 2
+
class EvalLogInfo(FileInfo):
task: str
@@ -120,7 +122,7 @@ def read_eval_log(log_file: str | FileInfo, header_only: bool = False) -> EvalLo
# verify we know about this version of the log file format
def validate_version(ver: int) -> None:
- if ver > 1:
+ if ver > LOG_SCHEMA_VERSION:
raise ValueError(f"Unable to read version {ver} of log format.")
# header-only uses json-stream
@@ -136,8 +138,10 @@ def read_field(field: str) -> Any:
return None
# fail for unknown version
- version = read_field("version")
- validate_version(version)
+ validate_version(read_field("version"))
+
+ # set the version to the schema version we'll be returning
+ version = LOG_SCHEMA_VERSION
results = read_field("results")
error = read_field("error")
@@ -171,6 +175,9 @@ def read_field(field: str) -> Any:
# fail for unknown version
validate_version(log.version)
+ # set the version to the schema version we'll be returning
+ log.version = LOG_SCHEMA_VERSION
+
# prune if header_only
if header_only:
log.samples = None
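
A minimal sketch (illustrative only, hypothetical log path) of the reader-side effect of LOG_SCHEMA_VERSION: read_eval_log() validates the on-disk version and then reports the current schema version on the returned log.

    from inspect_ai.log import read_eval_log

    # hypothetical path; header_only skips sample bodies
    log = read_eval_log("./logs/2024-05-05T07-59-35_wikipedia.json", header_only=True)
    assert log.version == 2  # LOG_SCHEMA_VERSION: older logs are reported at the current schema
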
diff --git a/src/inspect_ai/log/_log.py b/src/inspect_ai/log/_log.py
index 4e5713984..a7604a2d2 100644
--- a/src/inspect_ai/log/_log.py
+++ b/src/inspect_ai/log/_log.py
@@ -1,5 +1,6 @@
import abc
import asyncio
+import logging
import os
import sys
import traceback
@@ -9,7 +10,7 @@
import click
import tenacity
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, model_validator
from rich.console import Console, RenderableType
from rich.traceback import Traceback
@@ -23,6 +24,8 @@
)
from inspect_ai.scorer import Score
+SCORER_PLACEHOLDER = "88F74D2C"
+
class EvalConfig(BaseModel):
limit: int | tuple[int, int] | None = Field(default=None)
@@ -75,12 +78,41 @@ class EvalSample(BaseModel):
output: ModelOutput
"""Model output from sample."""
- score: Score | None = Field(default=None)
- """Score for sample."""
+ @property
+ def score(self) -> Score | None:
+ """Score for sample (deprecated)."""
+ logging.warning(
+ "The 'score' field is deprecated. Access sample scores through 'scores' instead."
+ )
+ return list(self.scores.values())[0] if self.scores else None
+
+ scores: dict[str, Score] | None = Field(default=None)
+ """Scores for sample."""
metadata: dict[str, Any]
"""Additional sample metadata."""
+ @model_validator(mode="before")
+ @classmethod
+ def convert_score_to_scores(
+ cls: Type["EvalSample"], values: dict[str, Any]
+ ) -> dict[str, Any]:
+ if "score" in values:
+ # There cannot be a scorers property too
+ if "scores" in values:
+ raise TypeError(
+ "Unexpected value `scores` present when `score` has already been specified."
+ )
+
+ # Convert the score to the new schema
+ score = values["score"]
+ values["scores"] = {SCORER_PLACEHOLDER: score}
+
+ # Get rid of the old 'score' property
+ del values["score"]
+
+ return values
+
class EvalPlanStep(BaseModel):
solver: str
@@ -90,17 +122,6 @@ class EvalPlanStep(BaseModel):
"""Parameters used to instantiate solver."""
-class EvalScorer(BaseModel):
- name: str
- """Scorer name."""
-
- params: dict[str, Any] = Field(default={})
- """Parameters specified when creating scorer."""
-
- metadata: dict[str, Any] | None = Field(default=None)
- """Additional scorer metadata."""
-
-
class EvalPlan(BaseModel):
name: str = Field(default="plan")
"""Plan name."""
@@ -129,16 +150,74 @@ class EvalMetric(BaseModel):
"""Additional metadata associated with metric."""
+class EvalScore(BaseModel):
+ name: str
+ """Score name."""
+
+ scorer: str
+ """Scorer name."""
+
+ params: dict[str, Any] = Field(default={})
+ """Parameters specified when creating scorer."""
+
+ metrics: dict[str, EvalMetric] = Field(default={})
+ """Metrics computed for this scorer."""
+
+ metadata: dict[str, Any] | None = Field(default=None)
+ """Additional scorer metadata."""
+
+
class EvalResults(BaseModel):
- scorer: EvalScorer | None = Field(default=None)
- """Scorer used to compute results"""
+ @property
+ def scorer(self) -> EvalScore | None:
+ """Scorer used to compute results (deprecated)."""
+ logging.warning(
+ "The 'scorer' field is deprecated. Use 'scorers' instead.",
+ )
+ return self.scores[0] if self.scores else None
+
+ @property
+ def metrics(self) -> dict[str, EvalMetric]:
+ """Metrics computed (deprecated)."""
+ logging.warning(
+ "The 'metrics' field is deprecated. Access metrics through 'scorers' instead."
+ )
+ return self.scores[0].metrics if self.scores else {}
- metrics: dict[str, EvalMetric] = Field(default={})
- """Metrics computed."""
+ scores: list[EvalScore] = Field(default=[])
+ """Scorers used to compute results"""
metadata: dict[str, Any] | None = Field(default=None)
"""Additional results metadata."""
+ @model_validator(mode="before")
+ @classmethod
+ def convert_scorer_to_scorers(
+ cls: Type["EvalResults"], values: dict[str, Any]
+ ) -> dict[str, Any]:
+ if "scorer" in values:
+ # There cannot be a scorers property too
+ if "scores" in values:
+ raise TypeError(
+ "Unexpected value `scores` present when `scorer` has already been specified."
+ )
+
+ # Gather metrics (if present)
+ metrics = values.pop("metrics", None)
+ # Convert the scorer to the new schema
+ score = values["scorer"]
+ if metrics:
+ score["metrics"] = metrics
+ score["scorer"] = score["name"]
+ values["scores"] = [score]
+
+ # Get rid of the 'scorer' property
+ del values["scorer"]
+
+ return values
+
class EvalDataset(BaseModel):
name: str | None = Field(default=None)
@@ -316,7 +395,7 @@ def from_log_record(record: LogRecord) -> "LoggingMessage":
class EvalLog(BaseModel):
- version: int = Field(default=1)
+ version: int = Field(default=2)
"""Eval log file format version."""
status: Literal["started", "success", "cancelled", "error"] = Field(
@@ -345,6 +424,17 @@ class EvalLog(BaseModel):
logging: list[LoggingMessage] = Field(default=[])
"""Logging message captured during eval."""
+ @model_validator(mode="after")
+ def populate_scorer_name_for_samples(self) -> "EvalLog":
+ if self.samples and self.results and self.results.scores:
+ scorer_name = self.results.scores[0].name
+ for sample in self.samples:
+ if sample.scores and SCORER_PLACEHOLDER in sample.scores:
+ sample.scores[scorer_name] = sample.scores[SCORER_PLACEHOLDER]
+ del sample.scores[SCORER_PLACEHOLDER]
+
+ return self
+
LogEvent = Literal["plan", "sample", "score", "results", "scorer", "logging"]
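
A rough illustration (values invented) of what the before-validators above do with a v1 payload: the legacy scorer/metrics fields are folded into the new scores list.

    from inspect_ai.log import EvalResults

    legacy_results = {
        "scorer": {"name": "model_graded_fact", "params": {}},
        "metrics": {"accuracy": {"name": "accuracy", "value": 1.0}},
    }

    # the EvalResults before-validator rewrites this into the new shape
    results = EvalResults.model_validate(legacy_results)
    assert results.scores[0].scorer == "model_graded_fact"
    assert results.scores[0].metrics["accuracy"].value == 1.0
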
diff --git a/src/inspect_ai/scorer/_metric.py b/src/inspect_ai/scorer/_metric.py
index 9f14b370b..1baf31217 100644
--- a/src/inspect_ai/scorer/_metric.py
+++ b/src/inspect_ai/scorer/_metric.py
@@ -153,7 +153,7 @@ class Metric(Protocol):
Metric value
"""
- def __call__(self, scores: list[Score]) -> int | float: ...
+ def __call__(self, scores: list[Score]) -> Value: ...
MetricType = TypeVar("MetricType", Callable[..., Metric], type[Metric])
diff --git a/src/inspect_ai/scorer/_scorer.py b/src/inspect_ai/scorer/_scorer.py
index 4e894c463..b77bfb6eb 100644
--- a/src/inspect_ai/scorer/_scorer.py
+++ b/src/inspect_ai/scorer/_scorer.py
@@ -87,12 +87,14 @@ def scorer_create(name: str, **kwargs: Any) -> Scorer:
def scorer(
- metrics: list[Metric], name: str | None = None, **metadata: Any
+ metrics: list[Metric] | dict[str, list[Metric]],
+ name: str | None = None,
+ **metadata: Any,
) -> Callable[[Callable[..., Scorer]], Callable[..., Scorer]]:
r"""Decorator for registering scorers.
Args:
- metrics (list[Metric]): One or more metrics to calculate
+ metrics (list[Metric] | dict[str, list[Metric]]): One or more metrics to calculate
over the scores.
name (str | None):
Optional name for scorer. If the decorator has no name
@@ -144,8 +146,12 @@ def scorer_wrapper(*args: Any, **kwargs: Any) -> Scorer:
return wrapper
-def scorer_metrics(scorer: Scorer) -> list[Metric]:
- return cast(list[Metric], registry_info(scorer).metadata[SCORER_METRICS])
+def scorer_metrics(scorer: Scorer) -> list[Metric] | dict[str, list[Metric]]:
+ metrics_raw = registry_info(scorer).metadata[SCORER_METRICS]
+ if isinstance(metrics_raw, dict):
+ return cast(dict[str, list[Metric]], metrics_raw)
+ else:
+ return cast(list[Metric], metrics_raw)
SCORER_METRICS = "metrics"
diff --git a/tests/log/test_eval_log.py b/tests/log/test_eval_log.py
index daf7a9fc6..5e640e63b 100644
--- a/tests/log/test_eval_log.py
+++ b/tests/log/test_eval_log.py
@@ -59,7 +59,7 @@ def test_fail_invalid():
def test_fail_version():
- check_log_raises(log_path("log_version_2"))
+ check_log_raises(log_path("log_version_3"))
def check_log_raises(log_file):
diff --git a/tests/log/test_eval_log/log_version_2.txt b/tests/log/test_eval_log/log_version_3.txt
similarity index 75%
rename from tests/log/test_eval_log/log_version_2.txt
rename to tests/log/test_eval_log/log_version_3.txt
index 63e92fb01..470e79b73 100644
--- a/tests/log/test_eval_log/log_version_2.txt
+++ b/tests/log/test_eval_log/log_version_3.txt
@@ -1,5 +1,5 @@
{
- "version": 2,
+ "version": 3,
"status": "success",
"eval": {
"task": "wikipedia",
@@ -31,22 +31,22 @@
"config": {}
},
"results": {
- "scorer": {
+ "scorers": [{
"name": "model_graded_fact",
"params": {}
- },
- "metrics": {
- "accuracy": {
- "name": "accuracy",
- "value": 1,
- "options": {}
- },
- "bootstrap_std": {
- "name": "bootstrap_std",
- "value": 0.0,
- "options": {}
+ "metrics": {
+ "accuracy": {
+ "name": "accuracy",
+ "value": 1,
+ "options": {}
+ },
+ "bootstrap_std": {
+ "name": "bootstrap_std",
+ "value": 0.0,
+ "options": {}
+ }
}
- }
+ }]
},
"stats": {
"started_at": "2024-05-05T07:59:35",
diff --git a/tests/scorer/test_metric.py b/tests/scorer/test_metric.py
index 7a4d2ef33..0d1a6a39e 100644
--- a/tests/scorer/test_metric.py
+++ b/tests/scorer/test_metric.py
@@ -96,7 +96,7 @@ def check_log(log):
check_log(log)
# eval log w/ different scorer (that still uses accuracy)
- log = score(log, scorer=includes())
+ log = score(log, scorers=[includes()])
check_log(log)
diff --git a/tests/scorer/test_multiscorer.py b/tests/scorer/test_multiscorer.py
new file mode 100644
index 000000000..d2a877b4d
--- /dev/null
+++ b/tests/scorer/test_multiscorer.py
@@ -0,0 +1,103 @@
+import random
+
+from inspect_ai import Task, eval
+from inspect_ai.dataset import Sample
+from inspect_ai.scorer import Score, Target, bootstrap_std, mean, scorer
+from inspect_ai.solver import TaskState
+
+
+@scorer(metrics=[mean(), bootstrap_std()])
+def rand_score():
+ async def score(state: TaskState, target: Target):
+ answer = state.output.completion
+ return Score(value=random.randint(1, 100), answer=answer)
+
+ return score
+
+
+@scorer(metrics=[mean(), bootstrap_std()])
+def another_rand_score():
+ async def score(state: TaskState, target: Target):
+ answer = state.output.completion
+ return Score(value=random.randint(1, 100), answer=answer)
+
+ return score
+
+
+@scorer(
+ metrics={"a_count": [mean(), bootstrap_std()], "e_count": [mean(), bootstrap_std()]}
+)
+def letter_count():
+ async def score(state: TaskState, target: Target):
+ answer = state.output.completion
+ a_count = answer.count("a")
+ e_count = answer.count("e")
+ return Score(value={"a_count": a_count, "e_count": e_count}, answer=answer)
+
+ return score
+
+
+def check_log(log, scorers, metrics):
+ # core checks
+ assert log.results
+ assert log.results.scores
+ assert len(log.results.scores) == len(scorers)
+
+ scorer_names = [scorer.name for scorer in log.results.scores]
+ assert all(scorer in scorer_names for scorer in scorers)
+ assert all(
+ all(metric in scorer.metrics for metric in metrics)
+ for scorer in log.results.scores
+ )
+
+ # test deprecated fields for now
+ assert log.results.scorer is not None
+ assert log.results.metrics is not None
+
+
+# test a single scorer
+def test_single_scorer() -> None:
+ task = Task(
+ dataset=[Sample(input="What is 1 + 1?", target=["2", "2.0", "Two"])],
+ scorer=rand_score(),
+ )
+
+ # normal eval
+ log = eval(tasks=task, model="mockllm/model")[0]
+ check_log(log, ["rand_score"], ["mean", "bootstrap_std"])
+
+
+# test two scorers
+def test_multi_scorer() -> None:
+ task = Task(
+ dataset=[Sample(input="What is 1 + 1?", target=["2", "2.0", "Two"])],
+ scorer=[rand_score(), another_rand_score()],
+ )
+
+ # normal eval
+ log = eval(tasks=task, model="mockllm/model")[0]
+ check_log(log, ["rand_score", "another_rand_score"], ["mean", "bootstrap_std"])
+
+
+# test dictionary scorer
+def test_dict_scorer() -> None:
+ task = Task(
+ dataset=[Sample(input="What is 1 + 1?", target=["2", "2.0", "Two"])],
+ scorer=letter_count(),
+ )
+
+ # normal eval
+ log = eval(tasks=task, model="mockllm/model")[0]
+ check_log(log, ["a_count", "e_count"], ["mean", "bootstrap_std"])
+
+
+# test blend of dictionary and simple scorers
+def test_blend_scorer() -> None:
+ task = Task(
+ dataset=[Sample(input="What is 1 + 1?", target=["2", "2.0", "Two"])],
+ scorer=[letter_count(), rand_score()],
+ )
+
+ # normal eval
+ log = eval(tasks=task, model="mockllm/model")[0]
+ check_log(log, ["a_count", "e_count", "rand_score"], ["mean", "bootstrap_std"])
diff --git a/tools/vscode/assets/www/view/view-overrides.css b/tools/vscode/assets/www/view/view-overrides.css
index aba01b072..ebbff341c 100644
--- a/tools/vscode/assets/www/view/view-overrides.css
+++ b/tools/vscode/assets/www/view/view-overrides.css
@@ -46,6 +46,5 @@ body[class^="vscode-"] code:not(.sourceCode) {
to truly fix, remove 'navbar-brand' from metrics div and use `navbar-metrics`
to properly style it */
body[class^="vscode-"] .navbar > div > .navbar-text:not(.navbar-brand) > div > div > div:last-of-type {
- margin-top: -10px;
transform: scale(0.7);
}
\ No newline at end of file