Skip to content

Commit

Permalink
Merge branch 'UKGovernmentBEIS:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
andrei-apollo authored Jan 6, 2025
2 parents 86736cc + 5393dbd commit c879e4e
Show file tree
Hide file tree
Showing 20 changed files with 612 additions and 253 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ default_language_version:
python: python3.11
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.5
rev: v0.8.6
hooks:
# Run the linter.
- id: ruff
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
- Inspect View: never truncate tool result images and display at default width of 800px.
- Inspect View: display tool error messages in transcript when tool errors occur.
- Inspect View: display any completed samples even if the task fails because of an error
- Inspect View: don't display the 'input' column heading if there isn't an input
- Open AI: Handle additional bad request status codes (mapping them to appropriate `StopReason`)
- Open AI: Use new `max_completion_tokens` option for o1 full.
- Sandboxes: Apply dataset filters (limit and sample id) prior to sandbox initialisation.
- Tool parameters with a default of `None` are now supported.
- More fine-grained HTML escaping for sample transcripts displayed in the terminal.
- Fix an issue that would result in an error when a state or storage value used a tilde or slash in the key name.

## v0.3.56 (01 January 2025)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ dev = [
"pytest-cov",
"pytest-dotenv",
"pytest-xdist",
"ruff==0.8.5", # match version specified in .pre-commit-config.yaml
"ruff==0.8.6", # match version specified in .pre-commit-config.yaml
"textual-dev>=0.86.2",
"types-PyYAML",
"types-beautifulsoup4",
Expand Down
27 changes: 16 additions & 11 deletions src/inspect_ai/_eval/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from .task.run import TaskRunOptions, task_run
from .task.rundir import task_run_dir_switching
from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
from .task.util import task_run_dir
from .task.util import slice_dataset, task_run_dir

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -70,12 +70,23 @@ async def eval_run(
# get cwd before switching to task dir
eval_wd = os.getcwd()

# ensure sample ids
for resolved_task in tasks:
# add sample ids to dataset if they aren't there (start at 1 not 0)
task = resolved_task.task
for id, sample in enumerate(task.dataset):
if sample.id is None:
sample.id = id + 1

# Ensure sample ids are unique
ensure_unique_ids(task.dataset)

# run startup pass for the sandbox environments
shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
if has_sandbox:
cleanup = eval_config.sandbox_cleanup is not False
shutdown_sandbox_environments = await startup_sandbox_environments(
resolve_sandbox_environment(eval_sandbox), tasks, cleanup
resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
)

# resolve solver and solver spec
Expand Down Expand Up @@ -146,14 +157,6 @@ async def eval_run(
else:
task.fail_on_error = task_eval_config.fail_on_error

# add sample ids to dataset if they aren't there (start at 1 not 0)
for id, sample in enumerate(task.dataset):
if sample.id is None:
sample.id = id + 1

# Ensure sample ids are unique
ensure_unique_ids(task.dataset)

# create and track the logger
logger = TaskLogger(
task_name=task.name,
Expand Down Expand Up @@ -340,13 +343,15 @@ async def worker() -> None:
async def startup_sandbox_environments(
eval_sandbox: SandboxEnvironmentSpec | None,
tasks: list[ResolvedTask],
config: EvalConfig,
cleanup: bool,
) -> Callable[[], Awaitable[None]]:
# find unique sandboxenvs
sandboxenvs: Set[TaskSandboxEnvironment] = set()
for task in tasks:
# resolve each sample and add to sandboxenvs
for sample in task.task.dataset:
dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
for sample in dataset:
sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
if sandbox is not None and sandbox not in sandboxenvs:
sandboxenvs.add(sandbox)
Expand Down
12 changes: 11 additions & 1 deletion src/inspect_ai/_util/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,20 @@ def json_changes(
paths = json_change.path.split("/")[1:]
replaced = before
for path in paths:
index: Any = int(path) if path.isnumeric() else path
decoded_path = decode_json_pointer_segment(path)
index: Any = (
int(decoded_path) if decoded_path.isnumeric() else decoded_path
)
replaced = replaced[index]
json_change.replaced = replaced
changes.append(json_change)
return changes
else:
return None


def decode_json_pointer_segment(segment: str) -> str:
    """Decode a single JSON Pointer reference token (RFC 6901).

    JSON Pointers escape "~" as "~0" and "/" as "~1" because both characters
    are special. Per the RFC, "~1" must be unescaped before "~0" so that an
    encoded literal "~1" (written "~01") round-trips correctly.
    (https://www.rfc-editor.org/rfc/rfc6901)
    """
    decoded = segment.replace("~1", "/")
    return decoded.replace("~0", "~")
13 changes: 10 additions & 3 deletions src/inspect_ai/_util/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,16 @@ def read_file(f: TextIO) -> list[TraceRecord]:


def rotate_trace_files() -> None:
rotate_files = list_trace_files()[10:]
for file in rotate_files:
file.file.unlink(missing_ok=True)
# if multiple inspect processes start up at once they
# will all be attempting to rotate at the same time,
# which can lead to FileNotFoundError -- ignore these
# errors if they occur
try:
rotate_files = list_trace_files()[10:]
for file in rotate_files:
file.file.unlink(missing_ok=True)
except FileNotFoundError:
pass


def compress_trace_log(log_handler: FileHandler) -> Callable[[], None]:
Expand Down
33 changes: 32 additions & 1 deletion src/inspect_ai/_util/transcript.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import re
from typing import Any

from rich.align import AlignMethod
Expand All @@ -19,13 +20,43 @@ def transcript_code_theme() -> str:
def transcript_markdown(content: str, *, escape: bool = False) -> Markdown:
code_theme = transcript_code_theme()
return Markdown(
html.escape(content) if escape else content,
html_escape_markdown(content) if escape else content,
code_theme=code_theme,
inline_code_lexer="python",
inline_code_theme=code_theme,
)


def html_escape_markdown(content: str) -> str:
    """HTML-escape markdown content, leaving fenced code blocks untouched.

    Lines inside a fenced code block (three or more backticks) are passed
    through verbatim; all other lines are escaped with `html.escape`
    (quotes are left alone so markdown link/quote syntax survives).

    Args:
        content: Markdown text to escape.

    Returns:
        The content with non-code-block lines HTML-escaped, joined with "\\n".
    """
    codeblock_pattern = re.compile("`{3,}")
    current_codeblock = ""
    escaped: list[str] = []
    for line in content.splitlines():
        # inside a codeblock: pass the line through, closing the block if
        # this line contains the matching fence
        if current_codeblock:
            if current_codeblock in line:
                current_codeblock = ""
            escaped.append(line)
            continue

        # opening fence: record it and pass the line through
        match = codeblock_pattern.search(line)
        if match:
            current_codeblock = match[0]
            escaped.append(line)
            continue

        # ordinary line outside any codeblock: escape it
        # (the original also re-checked current_codeblock here, but that
        # branch was unreachable — both truthy paths above `continue`)
        escaped.append(html.escape(line, quote=False))

    return "\n".join(escaped)


def set_transcript_markdown_options(markdown: Markdown) -> None:
code_theme = transcript_code_theme()
markdown.code_theme = code_theme
Expand Down
4 changes: 2 additions & 2 deletions src/inspect_ai/_view/www/dist/assets/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25838,7 +25838,7 @@ ${events}
[selectedIndex]
);
const listStyle = { ...style2, flex: "1", overflowY: "auto", outline: "none" };
const { limit, answer, target } = gridColumns(sampleDescriptor);
const { input, limit, answer, target } = gridColumns(sampleDescriptor);
const headerRow = m$1`<div
style=${{
display: "grid",
Expand All @@ -25852,7 +25852,7 @@ ${events}
}}
>
<div>Id</div>
<div>Input</div>
<div>${input !== "0" ? "Input" : ""}</div>
<div>${target !== "0" ? "Target" : ""}</div>
<div>${answer !== "0" ? "Answer" : ""}</div>
<div>${limit !== "0" ? "Limit" : ""}</div>
Expand Down
4 changes: 2 additions & 2 deletions src/inspect_ai/_view/www/src/samples/SampleList.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ export const SampleList = (props) => {
);

const listStyle = { ...style, flex: "1", overflowY: "auto", outline: "none" };
const { limit, answer, target } = gridColumns(sampleDescriptor);
const { input, limit, answer, target } = gridColumns(sampleDescriptor);

const headerRow = html`<div
style=${{
Expand All @@ -176,7 +176,7 @@ export const SampleList = (props) => {
}}
>
<div>Id</div>
<div>Input</div>
<div>${input !== "0" ? "Input" : ""}</div>
<div>${target !== "0" ? "Target" : ""}</div>
<div>${answer !== "0" ? "Answer" : ""}</div>
<div>${limit !== "0" ? "Limit" : ""}</div>
Expand Down
20 changes: 17 additions & 3 deletions src/inspect_ai/model/_providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
Logprobs,
ModelOutput,
ModelUsage,
StopReason,
)
from .openai_o1 import generate_o1
from .util import (
Expand Down Expand Up @@ -262,7 +263,10 @@ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, An
model=self.model_name,
)
if config.max_tokens is not None:
params["max_tokens"] = config.max_tokens
if self.is_o1_full():
params["max_completion_tokens"] = config.max_tokens
else:
params["max_tokens"] = config.max_tokens
if config.frequency_penalty is not None:
params["frequency_penalty"] = config.frequency_penalty
if config.stop_seqs is not None:
Expand Down Expand Up @@ -303,13 +307,23 @@ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, An

# convert some well known bad request errors into ModelOutput
def handle_bad_request(self, e: BadRequestError) -> ModelOutput:
if e.status_code == 400 and e.code == "context_length_exceeded":
if e.status_code == 400:
# extract message
if isinstance(e.body, dict) and "message" in e.body.keys():
content = str(e.body.get("message"))
else:
content = e.message

# narrow stop_reason
if e.code == "context_length_exceeded":
stop_reason: StopReason = "model_length"
elif e.code == "invalid_prompt":
stop_reason = "content_filter"
else:
stop_reason = "unknown"

return ModelOutput.from_content(
model=self.model_name, content=content, stop_reason="model_length"
model=self.model_name, content=content, stop_reason=stop_reason
)
else:
raise e
Expand Down
16 changes: 10 additions & 6 deletions src/inspect_ai/model/_providers/openai_o1.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from inspect_ai.tool import ToolCall, ToolInfo

from .._model_call import ModelCall
from .._model_output import ModelUsage
from .._model_output import ModelUsage, StopReason
from .._providers.util import (
ChatAPIHandler,
ChatAPIMessage,
Expand Down Expand Up @@ -89,12 +89,16 @@ def model_call() -> ModelCall:


def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput:
if ex.code == "invalid_prompt":
return ModelOutput.from_content(
model=model, content=str(ex), stop_reason="content_filter"
)
if ex.code == "context_length_exceeded":
stop_reason: StopReason = "model_length"
elif ex.code == "invalid_prompt":
stop_reason = "content_filter"
else:
raise ex
stop_reason = "unknown"

return ModelOutput.from_content(
model=model, content=str(ex), stop_reason=stop_reason
)


def chat_messages(
Expand Down
5 changes: 5 additions & 0 deletions tools/vscode/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## 0.3.49

- Improve code lens detection of Inspect tasks (ty @tobiasraabe)
- Use icon to reflect log status in log listing activity panel (red = error, yellow = cancelled, green = running)

## 0.3.48

- Properly shutdown the `inspect view` process when exiting VSCode.
Expand Down
13 changes: 13 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-cancelled.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 13 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-error.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 14 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-started.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion tools/vscode/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"author": {
"name": "UK AI Safety Institute"
},
"version": "0.3.48",
"version": "0.3.49",
"license": "MIT",
"homepage": "https://inspect.ai-safety-institute.org.uk/",
"repository": {
Expand Down
Loading

0 comments on commit c879e4e

Please sign in to comment.