Skip to content

Commit

Permalink
Provisional implementation of computer tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Patey committed Jan 9, 2025
1 parent abc3cf6 commit 38414f5
Show file tree
Hide file tree
Showing 35 changed files with 1,472 additions and 55 deletions.
16 changes: 16 additions & 0 deletions examples/computer/compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
services:
default:
# Temporary internal image until the official one is available
image: inspect-computer-tool
init: true

# If you only launch a single container, you can vnc into that container by using
# the following port mapping
# ports:
# - "5900:5900"

# If you launch multiple containers, you can vnc into each container by using the
# following port mapping, but you will need to determine the host port number for
# the specific container by using `docker inspect <container_id_or_name>`
# ports:
# - "5900"
58 changes: 58 additions & 0 deletions examples/computer/computer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import includes
from inspect_ai.solver import system_message
from inspect_ai.solver._basic_agent import basic_agent
from inspect_ai.tool import computer

# System prompt handed to the agent via `system_message(SYSTEM_MESSAGE)` in
# `computer_task`. It asks the model to state its intent before each tool
# action and to verify the resulting screenshot before moving on.
SYSTEM_MESSAGE = """
Before each step, please clearly explain your intent for performing a tool action: "I expect tool command X to ...".
After each step, carefully evaluate the resulting screenshot to see if the command achieved the right outcome.
Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. Only when
you confirm a step was executed correctly should you move on to the next one.
Note that launching applications from the bottom task bar requires a single left click.
"""


@task
def computer_task() -> Task:
    """Build the example task that exercises the `computer` tool.

    The dataset holds five samples (read a flag file, edit a spreadsheet,
    type into a terminal, use a calculator, draw in a paint program). Each
    is solved by `basic_agent` equipped with the `computer()` tool, capped at
    100 messages, primed with `SYSTEM_MESSAGE`, and scored with `includes()`
    against the sample target. The task runs in the "docker" sandbox.

    Returns:
        The configured `Task`.
    """
    return Task(
        dataset=[
            Sample(
                input="Use the computer tool to find the contents of the file '/tmp/flag.txt'. When you've found it, report its contents.",
                target="Frunobulax",
                files={"/tmp/flag.txt": "flag.txt"},
            ),
            Sample(
                # The file path is quoted with matching single quotes (the
                # original closed it with a stray backtick).
                input="Use the computer tool and open the spreadsheet '/tmp/moonWeight.ods'. Enter your weight as 156 lbs. Report the result from cell B2. Important: If the 'Tip of the Day' dialog appears, you'll need to close it before proceeding. Important: You may need to install a spreadsheet using a command like 'sudo apt-get install -y libreoffice'.",
                target="26",
                files={"/tmp/moonWeight.ods": "moonWeight.ods"},
            ),
            Sample(
                input="Use the computer tool to launch a terminal. Type 'Trudging across the tundra. Mile after Mile.' into the terminal. Important: Make sure that the terminal window is active before typing. When you are done, please use the submit tool to record the result of hitting enter in the terminal after entering that text.",
                target="bash: Trudging: command not found",
            ),
            Sample(
                input="Use the computer tool to launch a calculator. Calculate 123 x 456. Report the result.",
                target="56088",
            ),
            Sample(
                input='Draw a smiley face with a paint program. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw.',
                target="Art is in the eye of the beholder",
            ),
            # Alternate, more detailed variant of the paint sample (disabled):
            # Sample(
            #     input='Draw a smiley face with a paint program. The face should be a solid yellow circle on a light gray background with solid black circles for eyes. The mouth should be a curved black line. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw. Also, in XPaint, the toolbar contains the following tools on each row. 1. Pencil, Dynamic Pencil, Dot Pencil 2. Brush, Spray, Smear 3. Segment, Polygonal Line, Arc 4. Arrowhead, Text, Erase 5. Box, Filled Box, Box Region 6. Oval, Filled Oval, Oval Region 7. Freehand Shape, Filled Freehand Shape, Freehand Shape Region 8. Polygon, Filled Polygon, Polygon Region 9. Spline Curve, Filled Spline Curve, Spline Curve Region 10. Fill, Gradient Fill, Fractal Fill',
            #     target='Art is in the eye of the beholder',
            # ),
        ],
        solver=basic_agent(
            init=system_message(SYSTEM_MESSAGE),
            tools=[computer()],
            max_messages=100,
        ),
        scorer=includes(),
        sandbox="docker",
    )
1 change: 1 addition & 0 deletions examples/computer/flag.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Frunobulax
Binary file added examples/computer/moonWeight.ods
Binary file not shown.
37 changes: 0 additions & 37 deletions examples/hello_computer.py

This file was deleted.

7 changes: 7 additions & 0 deletions examples/intervention/computer-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
services:
default:
# Temporary internal image until the official one is available
image: inspect-computer-tool
init: true
ports:
- "5900:5900"
34 changes: 24 additions & 10 deletions examples/intervention/intervention.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from textwrap import dedent
from typing import Literal

from rich.prompt import Prompt

Expand All @@ -12,23 +13,36 @@
system_message,
use_tools,
)
from inspect_ai.tool import bash, python
from inspect_ai.tool import bash, computer, python
from inspect_ai.util import input_screen


@task
def intervention():
return Task(
solver=[
system_prompt(),
user_prompt(),
use_tools([bash(), python()]),
agent_loop(),
],
sandbox="docker",
def intervention(mode: Literal["basic", "computer"] = "basic") -> Task:
return (
Task(
solver=[
system_prompt(),
user_prompt(),
use_tools([bash(), python()]),
agent_loop(),
],
sandbox="docker",
)
if mode == "basic"
else Task(
solver=[
system_prompt(),
user_prompt(),
use_tools([computer()]),
agent_loop(),
],
sandbox=("docker", "computer-compose.yaml"),
)
)


# TODO: Customize the prompt based on the mode above??
@solver
def system_prompt():
SYSTEM_PROMPT = dedent("""
Expand Down
20 changes: 13 additions & 7 deletions src/inspect_ai/model/_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def tools_required(self) -> bool:
return False

def tool_result_images(self) -> bool:
"""Tool results can containe images"""
"""Tool results can contain images"""
return False


Expand Down Expand Up @@ -713,23 +713,29 @@ def tool_result_images_reducer(
messages: list[ChatMessage],
message: ChatMessage,
) -> list[ChatMessage]:
# append the message
messages.append(message)

# if there are tool result images, pull them out into a ChatUserMessage
if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
tool_message = ChatMessageTool(
content=message.content.copy(), tool_call_id=message.tool_call_id
)
assert isinstance(tool_message.content, list)
messages.append(tool_message)

user_content: list[Content] = []
for i in range(0, len(message.content)):
if isinstance(message.content[i], ContentImage):
for i in range(0, len(tool_message.content)):
if isinstance(tool_message.content[i], ContentImage):
user_content.append(message.content[i])
message.content[i] = ContentText(
tool_message.content[i] = ContentText(
text="Image content is in the message below."
)
if len(user_content) > 0:
messages.append(
ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
)

else:
messages.append(message)

# return messages
return messages

Expand Down
2 changes: 2 additions & 0 deletions src/inspect_ai/tool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ._tool_info import ToolInfo
from ._tool_params import ToolParam, ToolParams
from ._tool_with import tool_with
from ._tools._computer import computer
from ._tools._execute import bash, python
from ._tools._web_browser import web_browser
from ._tools._web_search import web_search
Expand All @@ -23,6 +24,7 @@
"python",
"web_browser",
"web_search",
"computer",
"tool",
"tool_with",
"Tool",
Expand Down
3 changes: 3 additions & 0 deletions src/inspect_ai/tool/_tools/_computer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._computer import computer

__all__ = ["computer"]
136 changes: 136 additions & 0 deletions src/inspect_ai/tool/_tools/_computer/_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import json
import logging
from typing import Literal

from pydantic import BaseModel, Field

from inspect_ai._util.content import ContentText
from inspect_ai.model import ContentImage
from inspect_ai.tool import ToolError, ToolResult
from inspect_ai.util import sandbox

# Closed set of action names for the computer tool. The helper functions in
# this module pass these strings as the value following `--action` on the
# in-sandbox command line.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
]

# Module-level logger used by `_send_cmd` to trace every command and result.
log = logging.getLogger(__name__)
# log = MockLogger()
# NOTE(review): forcing DEBUG at import time overrides whatever level the
# application configured for this logger — confirm this is intended outside
# of development.
log.setLevel(logging.DEBUG)


class ToolExecResult(BaseModel):
    """Parsed JSON reply of one computer-tool invocation.

    Every field is optional and defaults to ``None``; `_send_cmd` builds an
    instance from the command's stdout and inspects which fields are set.
    """

    # Textual output reported by the tool, if any.
    output: str | None = Field(None)
    # Error message reported by the tool, if any.
    error: str | None = Field(None)
    # Base64 image payload; `_send_cmd` wraps it in a PNG data URL.
    base64_image: str | None = Field(None)


async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
    """Run one computer-tool action in the sandbox and convert its reply.

    The action is executed as
    ``python3 -m computer_tool.computer_tool --action <cmdTail...>`` through
    ``sandbox().exec``. The command's stdout is parsed as JSON into a
    `ToolExecResult`, which is then mapped to a `ToolResult`.

    Args:
        cmdTail: The action name followed by any action-specific arguments.
        timeout: Forwarded unchanged to ``sandbox().exec``.

    Returns:
        ``[ContentText, ContentImage]`` when the tool produced both text and
        an image, the plain text when it produced only text,
        ``[ContentImage]`` when it produced only an image, and the string
        ``"OK"`` when it produced neither.

    Raises:
        ToolError: The tool's JSON reply carried a non-empty ``error``.
        Exception: The command did not succeed, or executing/parsing raised;
            the failure is logged and re-raised unchanged.
    """
    from inspect_ai.log._samples import sample_active

    sample = sample_active()
    assert sample
    sample_id = sample.sample.id
    # The id is only used as log context, so accept any non-None value
    # (a falsy id such as 0 is still a valid id).
    assert sample_id is not None

    cmd = ["python3", "-m", "computer_tool.computer_tool", "--action"] + cmdTail
    log.info(f"(sample={sample_id}) Executing command: {cmd}")

    try:
        raw_exec_result = await sandbox().exec(cmd, timeout=timeout)

        if not raw_exec_result.success:
            raise Exception(
                f"Failure executing command: {cmd} {raw_exec_result.stderr}"
            )

        result = ToolExecResult(**json.loads(raw_exec_result.stdout))

        if result.error:
            log.warning(
                f"(sample={sample_id}) Tool returned an error. Raising ToolError('{result.error}')"
            )
            raise ToolError(result.error)

        image = (
            ContentImage(image=f"data:image/png;base64,{result.base64_image}")
            if result.base64_image
            else None
        )
        # Treat an empty string the same as missing output.
        text = result.output or None

        if text is not None and image is not None:
            log.info(
                f"(sample={sample_id}) ToolResult([ContentText('{text}'), ContentImage])"
            )
            return [ContentText(text=text), image]

        if text is not None:
            log.info(f"(sample={sample_id}) ToolResult('{text}')")
            return text

        if image is not None:
            log.info(f"(sample={sample_id}) ToolResult([ContentImage])")
            return [image]

        log.warning(
            f"(sample={sample_id}) Tool returned neither output nor image - returning ToolResult('OK')"
        )
        return "OK"
    except ToolError:
        # Tool-reported errors were already logged above; pass them through.
        raise
    except Exception as e:
        log.error(f"(sample={sample_id}) Sandbox.exec threw for {cmd}...re-raising {e}")
        # Bare raise keeps the original traceback intact.
        raise


async def cursor_position(timeout: int | None = None) -> ToolResult:
    """Send the ``cursor_position`` action via `_send_cmd` and return its result."""
    action_args = ["cursor_position"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def screenshot(timeout: int | None = None) -> ToolResult:
    """Send the ``screenshot`` action via `_send_cmd` and return its result."""
    action_args = ["screenshot"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
    """Send the ``mouse_move`` action with ``--coordinate x y`` via `_send_cmd`."""
    action_args = ["mouse_move", "--coordinate"]
    # Coordinates travel as separate string arguments on the command line.
    action_args += [f"{x}", f"{y}"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def left_click(timeout: int | None = None) -> ToolResult:
    """Send the ``left_click`` action via `_send_cmd` and return its result."""
    action_args = ["left_click"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
    """Send the ``left_click_drag`` action with ``--coordinate x y`` via `_send_cmd`."""
    action_args = ["left_click_drag", "--coordinate"]
    # Coordinates travel as separate string arguments on the command line.
    action_args += [f"{x}", f"{y}"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def right_click(timeout: int | None = None) -> ToolResult:
    """Send the ``right_click`` action via `_send_cmd` and return its result."""
    action_args = ["right_click"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def middle_click(timeout: int | None = None) -> ToolResult:
    """Send the ``middle_click`` action via `_send_cmd` and return its result."""
    action_args = ["middle_click"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def double_click(timeout: int | None = None) -> ToolResult:
    """Send the ``double_click`` action via `_send_cmd` and return its result."""
    action_args = ["double_click"]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def press_key(key: str, timeout: int | None = None) -> ToolResult:
    """Send the ``key`` action, passing *key* as the ``--text`` value, via `_send_cmd`."""
    action_args = ["key", "--text", key]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome


async def type(text: str, timeout: int | None = None) -> ToolResult:
    """Send the ``type`` action, passing *text* as the ``--text`` value, via `_send_cmd`.

    Note: this public name shadows the builtin ``type`` within this module.
    """
    action_args = ["type", "--text", text]
    outcome = await _send_cmd(action_args, timeout=timeout)
    return outcome
Loading

0 comments on commit 38414f5

Please sign in to comment.