Skip to content

Commit

Permalink
Merge branch 'UKGovernmentBEIS:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
andrei-apollo authored Jan 6, 2025
2 parents 86736cc + 5393dbd commit c879e4e
Show file tree
Hide file tree
Showing 20 changed files with 612 additions and 253 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ default_language_version:
python: python3.11
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.5
rev: v0.8.6
hooks:
# Run the linter.
- id: ruff
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@
- Inspect View: never truncate tool result images and display at default width of 800px.
- Inspect View: display tool error messages in transcript when tool errors occur.
- Inspect View: display any completed samples even if the task fails because of an error
- Inspect View: don't display the 'input' column heading if there isn't an input
- Open AI: Handle additional bad request status codes (mapping them to appropriate `StopReason`)
- Open AI: Use new `max_completion_tokens` option for o1 full.
- Sandboxes: Apply dataset filters (limit and sample id) prior to sandbox initialisation.
- Tool parameters with a default of `None` are now supported.
- More fine-grained HTML escaping for sample transcripts displayed in the terminal.
- Fix an issue that would result in an error when a state or storage value used a tilde or slash in the key name.

## v0.3.56 (01 January 2025)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ dev = [
"pytest-cov",
"pytest-dotenv",
"pytest-xdist",
"ruff==0.8.5", # match version specified in .pre-commit-config.yaml
"ruff==0.8.6", # match version specified in .pre-commit-config.yaml
"textual-dev>=0.86.2",
"types-PyYAML",
"types-beautifulsoup4",
Expand Down
27 changes: 16 additions & 11 deletions src/inspect_ai/_eval/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from .task.run import TaskRunOptions, task_run
from .task.rundir import task_run_dir_switching
from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
from .task.util import task_run_dir
from .task.util import slice_dataset, task_run_dir

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -70,12 +70,23 @@ async def eval_run(
# get cwd before switching to task dir
eval_wd = os.getcwd()

# ensure sample ids
for resolved_task in tasks:
# add sample ids to dataset if they aren't there (start at 1 not 0)
task = resolved_task.task
for id, sample in enumerate(task.dataset):
if sample.id is None:
sample.id = id + 1

# Ensure sample ids are unique
ensure_unique_ids(task.dataset)

# run startup pass for the sandbox environments
shutdown_sandbox_environments: Callable[[], Awaitable[None]] | None = None
if has_sandbox:
cleanup = eval_config.sandbox_cleanup is not False
shutdown_sandbox_environments = await startup_sandbox_environments(
resolve_sandbox_environment(eval_sandbox), tasks, cleanup
resolve_sandbox_environment(eval_sandbox), tasks, eval_config, cleanup
)

# resolve solver and solver spec
Expand Down Expand Up @@ -146,14 +157,6 @@ async def eval_run(
else:
task.fail_on_error = task_eval_config.fail_on_error

# add sample ids to dataset if they aren't there (start at 1 not 0)
for id, sample in enumerate(task.dataset):
if sample.id is None:
sample.id = id + 1

# Ensure sample ids are unique
ensure_unique_ids(task.dataset)

# create and track the logger
logger = TaskLogger(
task_name=task.name,
Expand Down Expand Up @@ -340,13 +343,15 @@ async def worker() -> None:
async def startup_sandbox_environments(
eval_sandbox: SandboxEnvironmentSpec | None,
tasks: list[ResolvedTask],
config: EvalConfig,
cleanup: bool,
) -> Callable[[], Awaitable[None]]:
# find unique sandboxenvs
sandboxenvs: Set[TaskSandboxEnvironment] = set()
for task in tasks:
# resolve each sample and add to sandboxenvs
for sample in task.task.dataset:
dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
for sample in dataset:
sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
if sandbox is not None and sandbox not in sandboxenvs:
sandboxenvs.add(sandbox)
Expand Down
12 changes: 11 additions & 1 deletion src/inspect_ai/_util/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,20 @@ def json_changes(
paths = json_change.path.split("/")[1:]
replaced = before
for path in paths:
index: Any = int(path) if path.isnumeric() else path
decoded_path = decode_json_pointer_segment(path)
index: Any = (
int(decoded_path) if decoded_path.isnumeric() else decoded_path
)
replaced = replaced[index]
json_change.replaced = replaced
changes.append(json_change)
return changes
else:
return None


def decode_json_pointer_segment(segment: str) -> str:
    """Decode a single JSON Pointer reference token (RFC 6901).

    JSON Pointers escape "~" as "~0" and "/" as "~1" because both characters
    are special. Per the RFC, "~1" must be unescaped before "~0" so that an
    encoded literal "~1" (written "~01") round-trips correctly.
    (https://www.rfc-editor.org/rfc/rfc6901)
    """
    decoded = segment.replace("~1", "/")
    return decoded.replace("~0", "~")
13 changes: 10 additions & 3 deletions src/inspect_ai/_util/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,16 @@ def read_file(f: TextIO) -> list[TraceRecord]:


def rotate_trace_files() -> None:
rotate_files = list_trace_files()[10:]
for file in rotate_files:
file.file.unlink(missing_ok=True)
# if multiple inspect processes start up at once they
# will all be attempting to rotate at the same time,
# which can lead to FileNotFoundError -- ignore these
# errors if they occur
try:
rotate_files = list_trace_files()[10:]
for file in rotate_files:
file.file.unlink(missing_ok=True)
except FileNotFoundError:
pass


def compress_trace_log(log_handler: FileHandler) -> Callable[[], None]:
Expand Down
33 changes: 32 additions & 1 deletion src/inspect_ai/_util/transcript.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import re
from typing import Any

from rich.align import AlignMethod
Expand All @@ -19,13 +20,43 @@ def transcript_code_theme() -> str:
def transcript_markdown(content: str, *, escape: bool = False) -> Markdown:
code_theme = transcript_code_theme()
return Markdown(
html.escape(content) if escape else content,
html_escape_markdown(content) if escape else content,
code_theme=code_theme,
inline_code_lexer="python",
inline_code_theme=code_theme,
)


def html_escape_markdown(content: str) -> str:
    """HTML-escape markdown content, leaving fenced code blocks untouched.

    Lines inside a fenced code block (three or more backticks) are passed
    through verbatim; all other lines are escaped with `html.escape`
    (quotes are left alone so markdown link/quote syntax survives).

    Args:
        content: Markdown text to escape.

    Returns:
        The content with non-code-block lines HTML-escaped, joined with "\\n".
    """
    codeblock_pattern = re.compile("`{3,}")
    current_codeblock = ""
    escaped: list[str] = []
    for line in content.splitlines():
        # inside a codeblock: pass the line through, closing the block if
        # this line contains the matching fence
        if current_codeblock:
            if current_codeblock in line:
                current_codeblock = ""
            escaped.append(line)
            continue

        # opening fence: record it and pass the line through
        match = codeblock_pattern.search(line)
        if match:
            current_codeblock = match[0]
            escaped.append(line)
            continue

        # ordinary line outside any codeblock: escape it
        # (the original also re-checked current_codeblock here, but that
        # branch was unreachable — both truthy paths above `continue`)
        escaped.append(html.escape(line, quote=False))

    return "\n".join(escaped)


def set_transcript_markdown_options(markdown: Markdown) -> None:
code_theme = transcript_code_theme()
markdown.code_theme = code_theme
Expand Down
4 changes: 2 additions & 2 deletions src/inspect_ai/_view/www/dist/assets/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25838,7 +25838,7 @@ ${events}
[selectedIndex]
);
const listStyle = { ...style2, flex: "1", overflowY: "auto", outline: "none" };
const { limit, answer, target } = gridColumns(sampleDescriptor);
const { input, limit, answer, target } = gridColumns(sampleDescriptor);
const headerRow = m$1`<div
style=${{
display: "grid",
Expand All @@ -25852,7 +25852,7 @@ ${events}
}}
>
<div>Id</div>
<div>Input</div>
<div>${input !== "0" ? "Input" : ""}</div>
<div>${target !== "0" ? "Target" : ""}</div>
<div>${answer !== "0" ? "Answer" : ""}</div>
<div>${limit !== "0" ? "Limit" : ""}</div>
Expand Down
4 changes: 2 additions & 2 deletions src/inspect_ai/_view/www/src/samples/SampleList.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ export const SampleList = (props) => {
);

const listStyle = { ...style, flex: "1", overflowY: "auto", outline: "none" };
const { limit, answer, target } = gridColumns(sampleDescriptor);
const { input, limit, answer, target } = gridColumns(sampleDescriptor);

const headerRow = html`<div
style=${{
Expand All @@ -176,7 +176,7 @@ export const SampleList = (props) => {
}}
>
<div>Id</div>
<div>Input</div>
<div>${input !== "0" ? "Input" : ""}</div>
<div>${target !== "0" ? "Target" : ""}</div>
<div>${answer !== "0" ? "Answer" : ""}</div>
<div>${limit !== "0" ? "Limit" : ""}</div>
Expand Down
20 changes: 17 additions & 3 deletions src/inspect_ai/model/_providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
Logprobs,
ModelOutput,
ModelUsage,
StopReason,
)
from .openai_o1 import generate_o1
from .util import (
Expand Down Expand Up @@ -262,7 +263,10 @@ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, An
model=self.model_name,
)
if config.max_tokens is not None:
params["max_tokens"] = config.max_tokens
if self.is_o1_full():
params["max_completion_tokens"] = config.max_tokens
else:
params["max_tokens"] = config.max_tokens
if config.frequency_penalty is not None:
params["frequency_penalty"] = config.frequency_penalty
if config.stop_seqs is not None:
Expand Down Expand Up @@ -303,13 +307,23 @@ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, An

# convert some well known bad request errors into ModelOutput
def handle_bad_request(self, e: BadRequestError) -> ModelOutput:
if e.status_code == 400 and e.code == "context_length_exceeded":
if e.status_code == 400:
# extract message
if isinstance(e.body, dict) and "message" in e.body.keys():
content = str(e.body.get("message"))
else:
content = e.message

# narrow stop_reason
if e.code == "context_length_exceeded":
stop_reason: StopReason = "model_length"
elif e.code == "invalid_prompt":
stop_reason = "content_filter"
else:
stop_reason = "unknown"

return ModelOutput.from_content(
model=self.model_name, content=content, stop_reason="model_length"
model=self.model_name, content=content, stop_reason=stop_reason
)
else:
raise e
Expand Down
16 changes: 10 additions & 6 deletions src/inspect_ai/model/_providers/openai_o1.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from inspect_ai.tool import ToolCall, ToolInfo

from .._model_call import ModelCall
from .._model_output import ModelUsage
from .._model_output import ModelUsage, StopReason
from .._providers.util import (
ChatAPIHandler,
ChatAPIMessage,
Expand Down Expand Up @@ -89,12 +89,16 @@ def model_call() -> ModelCall:


def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput:
if ex.code == "invalid_prompt":
return ModelOutput.from_content(
model=model, content=str(ex), stop_reason="content_filter"
)
if ex.code == "context_length_exceeded":
stop_reason: StopReason = "model_length"
elif ex.code == "invalid_prompt":
stop_reason = "content_filter"
else:
raise ex
stop_reason = "unknown"

return ModelOutput.from_content(
model=model, content=str(ex), stop_reason=stop_reason
)


def chat_messages(
Expand Down
5 changes: 5 additions & 0 deletions tools/vscode/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## 0.3.49

- Improve code lens detection of Inspect tasks (ty @tobiasraabe)
- Use icon to reflect log status in log listing activity panel (red = error, yellow = cancelled, green = running)

## 0.3.48

- Properly shutdown the `inspect view` process when exiting VSCode.
Expand Down
13 changes: 13 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-cancelled.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 13 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-error.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 14 additions & 0 deletions tools/vscode/assets/icon/eval-treeview-started.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion tools/vscode/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"author": {
"name": "UK AI Safety Institute"
},
"version": "0.3.48",
"version": "0.3.49",
"license": "MIT",
"homepage": "https://inspect.ai-safety-institute.org.uk/",
"repository": {
Expand Down
Loading

0 comments on commit c879e4e

Please sign in to comment.