Provisional implementation of computer tool.

UKGovernmentBEIS · Jan 9, 2025 · 38414f5 · 38414f5
1 parent abc3cf6
commit 38414f5
Show file tree

Hide file tree

Showing 35 changed files with 1,472 additions and 55 deletions.
diff --git a/examples/computer/compose.yaml b/examples/computer/compose.yaml
@@ -0,0 +1,16 @@
+services:
+  default:
+    # Temporary internal image until the official one is available
+    image: inspect-computer-tool
+    init: true
+
+    # If you launch only a single container, you can connect to it over VNC by
+    # enabling the following port mapping
+    # ports:
+    #   - "5900:5900"
+
+    # If you launch multiple containers, you can connect to each one over VNC by
+    # enabling the following port mapping, but you will need to look up the host port
+    # assigned to each container with `docker inspect <container_id_or_name>`
+    # ports:
+    #   - "5900"
diff --git a/examples/computer/computer.py b/examples/computer/computer.py
@@ -0,0 +1,58 @@
+from inspect_ai import Task, task
+from inspect_ai.dataset import Sample
+from inspect_ai.scorer import includes
+from inspect_ai.solver import system_message
+from inspect_ai.solver._basic_agent import basic_agent
+from inspect_ai.tool import computer
+
+# System prompt passed to system_message() as the agent's init step in
+# computer_task: it asks the model to state its intent before each tool action
+# and to check the resulting screenshot before moving on.
+SYSTEM_MESSAGE = """
+Before each step, please clearly explain your intent for performing a tool action: "I expect tool command X to ...".
+
+After each step, carefully evaluate the resulting screenshot to see if the command achieved the right outcome.
+
+Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. Only when
+you confirm a step was executed correctly should you move on to the next one.
+
+Note that launching applications from the bottom task bar requires a single left click.
+"""
+
+
+@task
+def computer_task():
+    return Task(
+        dataset=[
+            Sample(
+                input="Use the computer tool to find the contents of the file '/tmp/flag.txt'. When you've found it, report its contents.",
+                target="Frunobulax",
+                files={"/tmp/flag.txt": "flag.txt"},
+            ),
+            Sample(
+                input="Use the computer tool and open the spreadsheet '/tmp/moonWeight.ods`. Enter your weight as 156 lbs. Report the result from cell B2. Important: If the 'Tip of the Day' dialog appears, you'll need to close it before proceeding. Important: You may need to install a spreadsheet using a command like 'sudo apt-get install -y libreoffice'.",
+                target="26",
+                files={"/tmp/moonWeight.ods": "moonWeight.ods"},
+            ),
+            Sample(
+                input="Use the computer tool to launch a terminal. Type 'Trudging across the tundra. Mile after Mile.' into the terminal. Important: Make sure that the terminal window is active before typing. When you are done, please use the submit tool to record the result of hitting enter in the terminal after entering that text.",
+                target="bash: Trudging: command not found",
+            ),
+            Sample(
+                input="Use the computer tool to launch a calculator. Calculate 123 x 456. Report the result.",
+                target="56088",
+            ),
+            Sample(
+                input='Draw a smiley face with a paint program. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw.',
+                target="Art is in the eye of the beholder",
+            ),
+            # Sample(
+            #     input='Draw a smiley face with a paint program.  The face should be a solid yellow circle on a light gray background with solid black circles for eyes. The mouth should be a curved black line. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw. Also, in XPaint, the toolbar contains the following tools on each row. 1. Pencil, Dynamic Pencil, Dot Pencil 2. Brush, Spray, Smear 3. Segment, Polygonal Line, Arc 4. Arrowhead, Text, Erase 5. Box, Filled Box, Box Region 6. Oval, Filled Oval, Oval Region 7. Freehand Shape, Filled Freehand Shape, Freehand Shape Region 8. Polygon, Filled Polygon, Polygon Region 9. Spline Curve, Filled Spline Curve, Spline Curve Region 10. Fill, Gradient Fill, Fractal Fill',
+            #     target='Art is in the eye of the beholder',
+            # ),
+        ],
+        solver=basic_agent(
+            init=system_message(SYSTEM_MESSAGE),
+            tools=[computer()],
+            max_messages=100,
+        ),
+        scorer=includes(),
+        sandbox="docker",
+    )
diff --git a/examples/computer/flag.txt b/examples/computer/flag.txt
@@ -0,0 +1 @@
+Frunobulax
diff --git a/examples/computer/moonWeight.ods b/examples/computer/moonWeight.ods
diff --git a/examples/hello_computer.py b/examples/hello_computer.py
diff --git a/examples/intervention/computer-compose.yaml b/examples/intervention/computer-compose.yaml
@@ -0,0 +1,7 @@
+services:
+  default:
+    # Temporary internal image until the official one is available
+    image: inspect-computer-tool
+    init: true
+    ports:
+      - "5900:5900"
diff --git a/examples/intervention/intervention.py b/examples/intervention/intervention.py
@@ -1,4 +1,5 @@
 from textwrap import dedent
+from typing import Literal
 
 from rich.prompt import Prompt
 
@@ -12,23 +13,36 @@
     system_message,
     use_tools,
 )
-from inspect_ai.tool import bash, python
+from inspect_ai.tool import bash, computer, python
 from inspect_ai.util import input_screen
 
 
 @task
-def intervention():
-    return Task(
-        solver=[
-            system_prompt(),
-            user_prompt(),
-            use_tools([bash(), python()]),
-            agent_loop(),
-        ],
-        sandbox="docker",
+def intervention(mode: Literal["basic", "computer"] = "basic") -> Task:
+    return (
+        Task(
+            solver=[
+                system_prompt(),
+                user_prompt(),
+                use_tools([bash(), python()]),
+                agent_loop(),
+            ],
+            sandbox="docker",
+        )
+        if mode == "basic"
+        else Task(
+            solver=[
+                system_prompt(),
+                user_prompt(),
+                use_tools([computer()]),
+                agent_loop(),
+            ],
+            sandbox=("docker", "computer-compose.yaml"),
+        )
     )
 
 
+# TODO: Customize the prompt based on the mode above??
 @solver
 def system_prompt():
     SYSTEM_PROMPT = dedent("""

diff --git a/src/inspect_ai/model/_model.py b/src/inspect_ai/model/_model.py
@@ -165,7 +165,7 @@ def tools_required(self) -> bool:
         return False
 
     def tool_result_images(self) -> bool:
-        """Tool results can containe images"""
+        """Tool results can contain images"""
         return False
 
 
@@ -713,23 +713,29 @@ def tool_result_images_reducer(
     messages: list[ChatMessage],
     message: ChatMessage,
 ) -> list[ChatMessage]:
-    # append the message
-    messages.append(message)
-
     # if there are tool result images, pull them out into a ChatUserMessage
     if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
+        tool_message = ChatMessageTool(
+            content=message.content.copy(), tool_call_id=message.tool_call_id
+        )
+        assert isinstance(tool_message.content, list)
+        messages.append(tool_message)
+
         user_content: list[Content] = []
-        for i in range(0, len(message.content)):
-            if isinstance(message.content[i], ContentImage):
+        for i in range(0, len(tool_message.content)):
+            if isinstance(tool_message.content[i], ContentImage):
                 user_content.append(message.content[i])
-                message.content[i] = ContentText(
+                tool_message.content[i] = ContentText(
                     text="Image content is in the message below."
                 )
         if len(user_content) > 0:
             messages.append(
                 ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
             )
 
+    else:
+        messages.append(message)
+
     # return messages
     return messages
 

diff --git a/src/inspect_ai/tool/__init__.py b/src/inspect_ai/tool/__init__.py
@@ -14,6 +14,7 @@
 from ._tool_info import ToolInfo
 from ._tool_params import ToolParam, ToolParams
 from ._tool_with import tool_with
+from ._tools._computer import computer
 from ._tools._execute import bash, python
 from ._tools._web_browser import web_browser
 from ._tools._web_search import web_search
@@ -23,6 +24,7 @@
     "python",
     "web_browser",
     "web_search",
+    "computer",
     "tool",
     "tool_with",
     "Tool",

diff --git a/src/inspect_ai/tool/_tools/_computer/__init__.py b/src/inspect_ai/tool/_tools/_computer/__init__.py
@@ -0,0 +1,3 @@
+from ._computer import computer
+
+__all__ = ["computer"]
diff --git a/src/inspect_ai/tool/_tools/_computer/_common.py b/src/inspect_ai/tool/_tools/_computer/_common.py
@@ -0,0 +1,136 @@
+import json
+import logging
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from inspect_ai._util.content import ContentText
+from inspect_ai.model import ContentImage
+from inspect_ai.tool import ToolError, ToolResult
+from inspect_ai.util import sandbox
+
+# Names of the actions the in-sandbox computer tool is invoked with; the
+# wrapper functions in this module pass one of these right after ``--action``.
+Action = Literal[
+    "key",
+    "type",
+    "mouse_move",
+    "left_click",
+    "left_click_drag",
+    "right_click",
+    "middle_click",
+    "double_click",
+    "screenshot",
+    "cursor_position",
+]
+
+# Module-level logger used by _send_cmd to trace every command and its result.
+log = logging.getLogger(__name__)
+# log = MockLogger()
+# NOTE(review): this sets the logger's level to DEBUG as an import-time side
+# effect, regardless of any logging configuration applied earlier — confirm
+# this is intended outside of provisional debugging.
+log.setLevel(logging.DEBUG)
+
+
+class ToolExecResult(BaseModel):
+    """Parsed form of the JSON object the computer tool prints on stdout.
+
+    All fields are optional and default to ``None``.
+    """
+
+    # Textual output of the action, if any.
+    output: str | None = Field(default=None)
+    # Error text; a non-empty value makes _send_cmd raise ToolError.
+    error: str | None = Field(default=None)
+    # Image payload; _send_cmd embeds it in a ``data:image/png;base64,`` URL.
+    base64_image: str | None = Field(default=None)
+
+
+async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
+    from inspect_ai.log._samples import sample_active
+
+    sample = sample_active()
+    assert sample
+    sample_id = sample.sample.id
+    assert sample_id
+
+    cmd = ["python3", "-m", "computer_tool.computer_tool", "--action"] + cmdTail
+    log.info(f"(sample={sample_id}) Executing command: {cmd}")
+
+    try:
+        raw_exec_result = await sandbox().exec(cmd, timeout=timeout)
+
+        if not raw_exec_result.success:
+            raise Exception(
+                f"Failure executing command: ${cmd} {raw_exec_result.stderr}"
+            )
+
+        result = ToolExecResult(**json.loads(raw_exec_result.stdout))
+
+        if result.error:
+            log.warning(
+                f"(sample={sample_id}) Tool returned an error. Raising ToolError('{result.error}'"
+            )
+            raise ToolError(result.error)
+
+        image = (
+            ContentImage(image=f"data:image/png;base64,{result.base64_image}")
+            if result.base64_image
+            else None
+        )
+        text = result.output if result.output and len(result.output) > 0 else None
+
+        if text is not None and image is not None:
+            log.info(
+                f"(sample={sample_id}) ToolResult([ContentText('{text}'), ContentImage])"
+            )
+            return [ContentText(text=text), image]
+
+        if text is not None:
+            log.info(f"(sample={sample_id}) ToolResult('{text}')")
+            return text
+
+        if image is not None:
+            log.info(f"(sample={sample_id}) ToolResult([ContentImage])")
+            return [image]
+
+        log.warning(
+            "(sample={sample_id}) Tool returned neither output nor image - returning ToolResult('OK')"
+        )
+        return "OK"
+    except ToolError:
+        raise
+    except Exception as e:
+        log.error(f"(sample={sample_id}) Sandbox.exec threw for {cmd}...re-raising {e}")
+        raise e
+
+
+async def cursor_position(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["cursor_position"], timeout=timeout)
+
+
+async def screenshot(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["screenshot"], timeout=timeout)
+
+
+async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout
+    )
+
+
+async def left_click(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["left_click"], timeout=timeout)
+
+
+async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(
+        ["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout
+    )
+
+
+async def right_click(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["right_click"], timeout=timeout)
+
+
+async def middle_click(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["middle_click"], timeout=timeout)
+
+
+async def double_click(timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["double_click"], timeout=timeout)
+
+
+async def press_key(key: str, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["key", "--text", key], timeout=timeout)
+
+
+async def type(text: str, timeout: int | None = None) -> ToolResult:
+    return await _send_cmd(["type", "--text", text], timeout=timeout)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from ._computer import computer

		__all__ = ["computer"]