Skip to content

Commit

Permalink
Plumb timeout through the tool and do final pass on tool doc strings.
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Patey committed Jan 8, 2025
1 parent bc7fa51 commit 9f9ce6a
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 79 deletions.
55 changes: 26 additions & 29 deletions src/inspect_ai/tool/_tools/_computer/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class ToolExecResult(BaseModel):
base64_image: str | None = Field(default=None)


async def _send_cmd(cmdTail: list[str]) -> ToolResult:
async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult:
from inspect_ai.log._samples import sample_active

sample = sample_active()
Expand All @@ -45,7 +45,7 @@ async def _send_cmd(cmdTail: list[str]) -> ToolResult:
log.info(f"(sample={sample_id}) Executing command: {cmd}")

try:
raw_exec_result = await sandbox().exec(cmd)
raw_exec_result = await sandbox().exec(cmd, timeout=timeout)

if not raw_exec_result.success:
raise Exception(
Expand Down Expand Up @@ -92,48 +92,45 @@ async def _send_cmd(cmdTail: list[str]) -> ToolResult:
raise e


# Sends the "cursor_position" command through `_send_cmd` and returns its result.
# Rendered diff pair: the first definition takes no arguments; the second adds an
# optional `timeout` (default None) and forwards it unchanged to `_send_cmd`.
async def cursor_position() -> ToolResult:
return await _send_cmd(["cursor_position"])
async def cursor_position(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["cursor_position"], timeout=timeout)


# Sends the "screenshot" command through `_send_cmd` and returns its result.
# Rendered diff pair: the first definition takes no arguments; the second adds an
# optional `timeout` (default None) and forwards it unchanged to `_send_cmd`.
async def screenshot() -> ToolResult:
return await _send_cmd(["screenshot"])
async def screenshot(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["screenshot"], timeout=timeout)


# Sends "mouse_move --coordinate <x> <y>" through `_send_cmd`; x and y are
# stringified with f-strings before being placed in the argument list.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def mouse_move(x: int, y: int) -> ToolResult:
return await _send_cmd(["mouse_move", "--coordinate", f"{x}", f"{y}"])
async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult:
return await _send_cmd(
["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout
)


# Sends the "left_click" command through `_send_cmd` and returns its result.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def left_click() -> ToolResult:
return await _send_cmd(["left_click"])
async def left_click(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["left_click"], timeout=timeout)


# Sends "left_click_drag --coordinate <x> <y>" through `_send_cmd`; x and y are
# stringified with f-strings before being placed in the argument list.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def left_click_drag(x: int, y: int) -> ToolResult:
return await _send_cmd(["left_click_drag", "--coordinate", f"{x}", f"{y}"])
async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult:
return await _send_cmd(
["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout
)


# Sends the "right_click" command through `_send_cmd` and returns its result.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def right_click() -> ToolResult:
return await _send_cmd(["right_click"])
async def right_click(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["right_click"], timeout=timeout)


# Sends the "middle_click" command through `_send_cmd` and returns its result.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def middle_click() -> ToolResult:
return await _send_cmd(["middle_click"])
async def middle_click(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["middle_click"], timeout=timeout)


# Sends the "double_click" command through `_send_cmd` and returns its result.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
async def double_click() -> ToolResult:
return await _send_cmd(["double_click"])
async def double_click(timeout: int | None = None) -> ToolResult:
return await _send_cmd(["double_click"], timeout=timeout)


# Sends "key --text <key>" through `_send_cmd` and returns its result.
# Rendered diff pair: the first definition rewrites a bare "*" to "KP_Multiply"
# before sending; the second adds an optional `timeout` (default None), forwards
# it to `_send_cmd`, and passes `key` through with no rewriting.
# NOTE(review): the TODO below describes "*" being expanded to a file list on a
# command line. The `timeout` version drops that workaround -- confirm escaping
# is handled elsewhere (not visible in this view) before relying on it.
async def press_key(key: str) -> ToolResult:
# TODO: Temporary partial fix for lack of escaping of user input
# When the model wants to key "*", it turns into a command line
# ending in "-- *", which expands to a list of all files and folders
# and hilarity ensues
if key == "*":
key = "KP_Multiply"
res = await _send_cmd(["key", "--text", key])
return res
async def press_key(key: str, timeout: int | None = None) -> ToolResult:
return await _send_cmd(["key", "--text", key], timeout=timeout)


# Sends "type --text <text>" through `_send_cmd` and returns its result.
# Rendered diff pair: the second definition adds an optional `timeout`
# (default None) that is forwarded unchanged to `_send_cmd`.
# NOTE: the function name shadows the builtin `type` in the scope where it is
# defined; callers in this view reach it as `common.type(...)`.
async def type(text: str) -> ToolResult:
return await _send_cmd(["type", "--text", text])
async def type(text: str, timeout: int | None = None) -> ToolResult:
return await _send_cmd(["type", "--text", text], timeout=timeout)
31 changes: 16 additions & 15 deletions src/inspect_ai/tool/_tools/_computer/_computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,9 @@ async def execute(
coordinate: list[int] | None = None,
) -> ToolResult:
"""
Use this tool to interact with the computer.
Use this tool to interact with a computer.
Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You must click on desktop menus or icons to start applications.
* Before taking any action, it's wise to consult the result of a screenshot action to determine current state
of the computer. Without doing so, you may not be able to complete the task.
Use a mouse and keyboard to interact with a computer's desktop GUI.
Keep in mind that icons require double clicks to open while other UI affordances like menu items and buttons require a single click.
Expand Down Expand Up @@ -79,9 +76,13 @@ async def execute(
)

if action == "mouse_move":
return await common.mouse_move(coordinate[0], coordinate[1])
return await common.mouse_move(
coordinate[0], coordinate[1], timeout=timeout
)
elif action == "left_click_drag":
return await common.left_click_drag(coordinate[0], coordinate[1])
return await common.left_click_drag(
coordinate[0], coordinate[1], timeout=timeout
)

if action in ("key", "type"):
if text is None:
Expand All @@ -92,9 +93,9 @@ async def execute(
raise ToolParsingError(output=f"{text} must be a string")

if action == "key":
return await common.press_key(text)
return await common.press_key(text, timeout=timeout)
elif action == "type":
return await common.type(text)
return await common.type(text, timeout=timeout)

if action in (
"left_click",
Expand All @@ -110,17 +111,17 @@ async def execute(
raise ToolParsingError(f"coordinate is not accepted for {action}")

if action == "screenshot":
return await common.screenshot()
return await common.screenshot(timeout=timeout)
elif action == "cursor_position":
return await common.cursor_position()
return await common.cursor_position(timeout=timeout)
elif action == "left_click":
return await common.left_click()
return await common.left_click(timeout=timeout)
elif action == "right_click":
return await common.right_click()
return await common.right_click(timeout=timeout)
elif action == "middle_click":
return await common.middle_click()
return await common.middle_click(timeout=timeout)
elif action == "double_click":
return await common.double_click()
return await common.double_click(timeout=timeout)

raise ToolParsingError(f"Invalid action: {action}")

Expand Down
71 changes: 36 additions & 35 deletions src/inspect_ai/tool/_tools/_computer/_computer_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Type alias: a callable taking one `str` and returning either a `ToolResult`
# or an awaitable that resolves to one.
ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]


def computer_split() -> list[Tool]:
def computer_split(timeout: int | None = None) -> list[Tool]:
"""
Computer interaction tools.
Expand All @@ -22,7 +22,7 @@ def computer_split() -> list[Tool]:
computer_screenshot(),
computer_mouse_move(),
computer_left_click(),
computer_left_double_click(),
computer_double_click(),
computer_left_click_drag(),
computer_right_click(),
computer_key(),
Expand All @@ -31,41 +31,41 @@ def computer_split() -> list[Tool]:


# Tool factory: returns an `execute` coroutine that delegates to
# `common.cursor_position`.
# Rendered diff: where two near-identical lines appear in sequence, the first is
# the variant without `timeout` and the second is the variant with it. In the
# `timeout` variant the factory argument is captured by `execute` and forwarded.
@tool()
def computer_cursor_position() -> Tool:
def computer_cursor_position(timeout: int | None = None) -> Tool:
async def execute() -> ToolResult:
"""
Get the current mouse cursor position.
Get the current (x, y) pixel coordinate of the cursor on the screen.
Args:
None
Returns:
A `str` of the form "x y" where x and y are the current mouse coordinates.
"""
return await common.cursor_position()
return await common.cursor_position(timeout=timeout)

return execute


# Tool factory: returns an `execute` coroutine that delegates to
# `common.screenshot`.
# Rendered diff: where two near-identical lines appear in sequence, the first is
# the variant without `timeout` and the second is the variant with it. In the
# `timeout` variant the factory argument is captured by `execute` and forwarded.
@tool()
def computer_screenshot() -> Tool:
def computer_screenshot(timeout: int | None = None) -> Tool:
async def execute() -> ToolResult:
"""
Take a screenshot of the screen.
Take a screenshot.
Args:
None
Returns:
A `list` with a single `ContentImage` of the screen.
"""
return await common.screenshot()
return await common.screenshot(timeout=timeout)

return execute


@tool()
def computer_mouse_move() -> Tool:
def computer_mouse_move(timeout: int | None = None) -> Tool:
async def execute(x: int, y: int) -> ToolResult:
"""
Move the cursor to a specified (x, y) pixel coordinate on the screen.
Expand All @@ -75,15 +75,15 @@ async def execute(x: int, y: int) -> ToolResult:
y: Y coordinate of the mouse destination.
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.mouse_move(x, y)
return await common.mouse_move(x, y, timeout=timeout)

return execute


@tool()
def computer_left_click() -> Tool:
def computer_left_click(timeout: int | None = None) -> Tool:
async def execute() -> ToolResult:
"""
Click the left mouse button.
Expand All @@ -92,15 +92,15 @@ async def execute() -> ToolResult:
None
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.left_click()
return await common.left_click(timeout=timeout)

return execute


@tool()
def computer_left_double_click() -> Tool:
def computer_double_click(timeout: int | None = None) -> Tool:
async def execute() -> ToolResult:
"""
Double-click the left mouse button.
Expand All @@ -109,33 +109,33 @@ async def execute() -> ToolResult:
None
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.double_click()
return await common.double_click(timeout=timeout)

return execute


# Tool factory: returns an `execute(x, y)` coroutine that delegates to
# `common.left_click_drag`.
# Rendered diff: where two near-identical lines appear in sequence, the first is
# the variant without `timeout` and the second is the variant with it. This
# includes the docstring, whose summary and "Returns" text each appear in two
# wordings.
@tool()
def computer_left_click_drag() -> Tool:
def computer_left_click_drag(timeout: int | None = None) -> Tool:
async def execute(x: int, y: int) -> ToolResult:
"""
Click the left button and drag to a specified (x, y) pixel coordinate on the screen.
Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
Args:
x: X coordinate of the mouse destination.
y: Y coordinate of the mouse destination.
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.left_click_drag(x, y)
return await common.left_click_drag(x, y, timeout=timeout)

return execute


@tool()
def computer_right_click() -> Tool:
def computer_right_click(timeout: int | None = None) -> Tool:
async def execute() -> ToolResult:
"""
Click the right mouse button.
Expand All @@ -144,48 +144,49 @@ async def execute() -> ToolResult:
None
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.right_click()
return await common.right_click(timeout=timeout)

return execute


# keysym list is from https://gist.github.com/rvaiya/be31f42049a4b5ad46666a8e120d9843
# Tool factory: returns an `execute(key)` coroutine that delegates to
# `common.press_key`.
# Rendered diff: where two near-identical lines appear in sequence, the first is
# the variant without `timeout` and the second is the variant with it. This
# includes the docstring, whose summary, `key` description, and "Returns" text
# each appear in two wordings.
@tool()
def computer_key() -> Tool:
def computer_key(timeout: int | None = None) -> Tool:
async def execute(key: str) -> ToolResult:
"""
Press the specified key.
Press a key or key-combination on the keyboard.
Args:
key: The key to press. Can be any valid keysym name such as:
"BackSpace", "Tab", "Return", "Escape", "Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down",
key: The key or key-combination to press. Can be any key name supported by xdotool's `key` such as:
"Return", "Escape", "alt+Tab", "BackSpace", "Tab", "ctrl+s", "KP_0" (for the numpad 0 key),
"Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down",
"F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12",
"Shift_L", "Shift_R", "Control_L", "Control_R", "Alt_L", "Alt_R", "Scroll_Lock", "Num_Lock", "Caps_Lock", "Pause",
"KP_Multiply", "KP_Home", "KP_Up", "KP_Prior", "KP_Subtract", "KP_Left", "KP_Begin", "KP_Right", "KP_Add", "KP_End","KP_Down",
"KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal",
"KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal"
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.press_key(key)
return await common.press_key(key, timeout=timeout)

return execute


# Tool factory: returns an `execute(text)` coroutine that delegates to
# `common.type`.
# Rendered diff: where two near-identical lines appear in sequence, the first is
# the variant without `timeout` and the second is the variant with it. This
# includes the docstring, whose `text` description and "Returns" text each
# appear in two wordings.
@tool()
def computer_type() -> Tool:
def computer_type(timeout: int | None = None) -> Tool:
async def execute(text: str) -> ToolResult:
"""
Type a string of text on the keyboard.
Args:
text: The text to type.
text: The text to type. If the text contains spaces, enclose it in quotes.
Returns:
The `str` "OK" on success.
A `list` with a single `ContentImage` of the screen.
"""
return await common.type(text)
return await common.type(text, timeout=timeout)

return execute

0 comments on commit 9f9ce6a

Please sign in to comment.