diff --git a/src/inspect_ai/tool/_tools/_computer/_common.py b/src/inspect_ai/tool/_tools/_computer/_common.py index 8d0244b8c..78b458b34 100644 --- a/src/inspect_ai/tool/_tools/_computer/_common.py +++ b/src/inspect_ai/tool/_tools/_computer/_common.py @@ -33,7 +33,7 @@ class ToolExecResult(BaseModel): base64_image: str | None = Field(default=None) -async def _send_cmd(cmdTail: list[str]) -> ToolResult: +async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult: from inspect_ai.log._samples import sample_active sample = sample_active() @@ -45,7 +45,7 @@ async def _send_cmd(cmdTail: list[str]) -> ToolResult: log.info(f"(sample={sample_id}) Executing command: {cmd}") try: - raw_exec_result = await sandbox().exec(cmd) + raw_exec_result = await sandbox().exec(cmd, timeout=timeout) if not raw_exec_result.success: raise Exception( @@ -92,48 +92,45 @@ async def _send_cmd(cmdTail: list[str]) -> ToolResult: raise e -async def cursor_position() -> ToolResult: - return await _send_cmd(["cursor_position"]) +async def cursor_position(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["cursor_position"], timeout=timeout) -async def screenshot() -> ToolResult: - return await _send_cmd(["screenshot"]) +async def screenshot(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["screenshot"], timeout=timeout) -async def mouse_move(x: int, y: int) -> ToolResult: - return await _send_cmd(["mouse_move", "--coordinate", f"{x}", f"{y}"]) +async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult: + return await _send_cmd( + ["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout + ) -async def left_click() -> ToolResult: - return await _send_cmd(["left_click"]) +async def left_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["left_click"], timeout=timeout) -async def left_click_drag(x: int, y: int) -> ToolResult: - return await _send_cmd(["left_click_drag", "--coordinate", 
f"{x}", f"{y}"]) +async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult: + return await _send_cmd( + ["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout + ) -async def right_click() -> ToolResult: - return await _send_cmd(["right_click"]) +async def right_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["right_click"], timeout=timeout) -async def middle_click() -> ToolResult: - return await _send_cmd(["middle_click"]) +async def middle_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["middle_click"], timeout=timeout) -async def double_click() -> ToolResult: - return await _send_cmd(["double_click"]) +async def double_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["double_click"], timeout=timeout) -async def press_key(key: str) -> ToolResult: - # TODO: Temporary partial fix for lack of escaping of user input - # When the model wants to key "*", it turns into a command line - # ending in "-- *", which expands to a list of all files and folders - # and hilarity ensues - if key == "*": - key = "KP_Multiply" - res = await _send_cmd(["key", "--text", key]) - return res +async def press_key(key: str, timeout: int | None = None) -> ToolResult: + return await _send_cmd(["key", "--text", key], timeout=timeout) -async def type(text: str) -> ToolResult: - return await _send_cmd(["type", "--text", text]) +async def type(text: str, timeout: int | None = None) -> ToolResult: + return await _send_cmd(["type", "--text", text], timeout=timeout) diff --git a/src/inspect_ai/tool/_tools/_computer/_computer.py b/src/inspect_ai/tool/_tools/_computer/_computer.py index a0af8da1b..96e9e1b73 100644 --- a/src/inspect_ai/tool/_tools/_computer/_computer.py +++ b/src/inspect_ai/tool/_tools/_computer/_computer.py @@ -27,12 +27,9 @@ async def execute( coordinate: list[int] | None = None, ) -> ToolResult: """ - Use this tool to interact with the computer. 
+ Use this tool to interact with a computer. - Use a mouse and keyboard to interact with a computer, and take screenshots. - * This is an interface to a desktop GUI. You must click on desktop menus or icons to start applications. - * Before taking any action, it's wise to consult the result of a screenshot action to determine current state - of the computer. Without doing so, you may not be able to complete the task. + Use a mouse and keyboard to interact with a computer's desktop GUI. Keep in mind that icons require double clicks to open while other UI affordances like menu items and buttons require a single click. @@ -79,9 +76,13 @@ async def execute( ) if action == "mouse_move": - return await common.mouse_move(coordinate[0], coordinate[1]) + return await common.mouse_move( + coordinate[0], coordinate[1], timeout=timeout + ) elif action == "left_click_drag": - return await common.left_click_drag(coordinate[0], coordinate[1]) + return await common.left_click_drag( + coordinate[0], coordinate[1], timeout=timeout + ) if action in ("key", "type"): if text is None: @@ -92,9 +93,9 @@ async def execute( raise ToolParsingError(output=f"{text} must be a string") if action == "key": - return await common.press_key(text) + return await common.press_key(text, timeout=timeout) elif action == "type": - return await common.type(text) + return await common.type(text, timeout=timeout) if action in ( "left_click", @@ -110,17 +111,17 @@ async def execute( raise ToolParsingError(f"coordinate is not accepted for {action}") if action == "screenshot": - return await common.screenshot() + return await common.screenshot(timeout=timeout) elif action == "cursor_position": - return await common.cursor_position() + return await common.cursor_position(timeout=timeout) elif action == "left_click": - return await common.left_click() + return await common.left_click(timeout=timeout) elif action == "right_click": - return await common.right_click() + return await 
common.right_click(timeout=timeout) elif action == "middle_click": - return await common.middle_click() + return await common.middle_click(timeout=timeout) elif action == "double_click": - return await common.double_click() + return await common.double_click(timeout=timeout) raise ToolParsingError(f"Invalid action: {action}") diff --git a/src/inspect_ai/tool/_tools/_computer/_computer_split.py b/src/inspect_ai/tool/_tools/_computer/_computer_split.py index e3ad197aa..9c0eb5ed6 100644 --- a/src/inspect_ai/tool/_tools/_computer/_computer_split.py +++ b/src/inspect_ai/tool/_tools/_computer/_computer_split.py @@ -7,7 +7,7 @@ ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]] -def computer_split() -> list[Tool]: +def computer_split(timeout: int | None = None) -> list[Tool]: """ Computer interaction tools. @@ -22,7 +22,7 @@ def computer_split() -> list[Tool]: computer_screenshot(), computer_mouse_move(), computer_left_click(), - computer_left_double_click(), + computer_double_click(), computer_left_click_drag(), computer_right_click(), computer_key(), @@ -31,10 +31,10 @@ def computer_split() -> list[Tool]: @tool() -def computer_cursor_position() -> Tool: +def computer_cursor_position(timeout: int | None = None) -> Tool: async def execute() -> ToolResult: """ - Get the current mouse cursor position. + Get the current (x, y) pixel coordinate of the cursor on the screen. Args: None @@ -42,16 +42,16 @@ async def execute() -> ToolResult: Returns: A `str` of the form "x y" where x and y are the current mouse coordinates. """ - return await common.cursor_position() + return await common.cursor_position(timeout=timeout) return execute @tool() -def computer_screenshot() -> Tool: +def computer_screenshot(timeout: int | None = None) -> Tool: async def execute() -> ToolResult: """ - Take a screenshot of the screen. + Take a screenshot. Args: None @@ -59,13 +59,13 @@ async def execute() -> ToolResult: Returns: A `list` with a single `ContentImage` of the screen. 
""" - return await common.screenshot() + return await common.screenshot(timeout=timeout) return execute @tool() -def computer_mouse_move() -> Tool: +def computer_mouse_move(timeout: int | None = None) -> Tool: async def execute(x: int, y: int) -> ToolResult: """ Move the cursor to a specified (x, y) pixel coordinate on the screen. @@ -75,15 +75,15 @@ async def execute(x: int, y: int) -> ToolResult: y: Y coordinate of the mouse destination. Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.mouse_move(x, y) + return await common.mouse_move(x, y, timeout=timeout) return execute @tool() -def computer_left_click() -> Tool: +def computer_left_click(timeout: int | None = None) -> Tool: async def execute() -> ToolResult: """ Click the left mouse button. @@ -92,15 +92,15 @@ async def execute() -> ToolResult: None Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.left_click() + return await common.left_click(timeout=timeout) return execute @tool() -def computer_left_double_click() -> Tool: +def computer_double_click(timeout: int | None = None) -> Tool: async def execute() -> ToolResult: """ Double-click the left mouse button. @@ -109,33 +109,33 @@ async def execute() -> ToolResult: None Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.double_click() + return await common.double_click(timeout=timeout) return execute @tool() -def computer_left_click_drag() -> Tool: +def computer_left_click_drag(timeout: int | None = None) -> Tool: async def execute(x: int, y: int) -> ToolResult: """ - Click the left button and drag to a specified (x, y) pixel coordinate on the screen. + Click and drag the cursor to a specified (x, y) pixel coordinate on the screen. Args: x: X coordinate of the mouse destination. y: Y coordinate of the mouse destination. 
Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.left_click_drag(x, y) + return await common.left_click_drag(x, y, timeout=timeout) return execute @tool() -def computer_right_click() -> Tool: +def computer_right_click(timeout: int | None = None) -> Tool: async def execute() -> ToolResult: """ Click the right mouse button. @@ -144,48 +144,49 @@ async def execute() -> ToolResult: None Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.right_click() + return await common.right_click(timeout=timeout) return execute # keysm list is from https://gist.github.com/rvaiya/be31f42049a4b5ad46666a8e120d9843 @tool() -def computer_key() -> Tool: +def computer_key(timeout: int | None = None) -> Tool: async def execute(key: str) -> ToolResult: """ - Press the specified key. + Press a key or key-combination on the keyboard. Args: - key: The key to press. Can be any valid keysym name such as: - "BackSpace", "Tab", "Return", "Escape", "Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down", + key: The key or key-combination to press. Can be any key name supported by xdotool's `key` such as: + "Return", "Escape", "alt+Tab", "BackSpace", "Tab", "ctrl+s", "KP_0" (for the numpad 0 key), + "Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down", "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12", "Shift_L", "Shift_R", "Control_L", "Control_R", "Alt_L", "Alt_R", "Scroll_Lock", "Num_Lock", "Caps_Lock", "Pause", "KP_Multiply", "KP_Home", "KP_Up", "KP_Prior", "KP_Subtract", "KP_Left", "KP_Begin", "KP_Right", "KP_Add", "KP_End","KP_Down", - "KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal", + "KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal" Returns: - The `str` "OK" on success. 
+ A `list` with a single `ContentImage` of the screen. """ - return await common.press_key(key) + return await common.press_key(key, timeout=timeout) return execute @tool() -def computer_type() -> Tool: +def computer_type(timeout: int | None = None) -> Tool: async def execute(text: str) -> ToolResult: """ Type a string of text on the keyboard. Args: - text: The text to type. + text: The text to type. If the text contains spaces, enclose it in quotes. Returns: - The `str` "OK" on success. + A `list` with a single `ContentImage` of the screen. """ - return await common.type(text) + return await common.type(text, timeout=timeout) return execute