From 38414f5c94cc451ed5bc8ad33471ceb0f5f7751f Mon Sep 17 00:00:00 2001 From: Eric Patey <> Date: Tue, 7 Jan 2025 21:17:23 -0500 Subject: [PATCH] Provisional implementation of computer tool. --- examples/computer/compose.yaml | 16 ++ examples/computer/computer.py | 58 ++++ examples/computer/flag.txt | 1 + examples/computer/moonWeight.ods | Bin 0 -> 9817 bytes examples/hello_computer.py | 37 --- examples/intervention/computer-compose.yaml | 7 + examples/intervention/intervention.py | 34 ++- src/inspect_ai/model/_model.py | 20 +- src/inspect_ai/tool/__init__.py | 2 + .../tool/_tools/_computer/__init__.py | 3 + .../tool/_tools/_computer/_common.py | 136 +++++++++ .../tool/_tools/_computer/_computer.py | 133 +++++++++ .../tool/_tools/_computer/_computer_split.py | 192 +++++++++++++ .../tool/_tools/_computer/_mock_logger.py | 44 +++ .../_tools/_computer/_resources/Dockerfile | 80 ++++++ .../_computer/_resources/build_image.sh | 5 + .../_resources/computer_tool/__init__.py | 0 .../_resources/computer_tool/_logger.py | 22 ++ .../_resources/computer_tool/_run.py | 42 +++ .../_resources/computer_tool/_tool_result.py | 33 +++ .../_resources/computer_tool/_x11_client.py | 260 ++++++++++++++++++ .../_resources/computer_tool/computer_tool.py | 85 ++++++ .../_resources/computer_tool/requirements.txt | 0 .../tint2/applications/firefox-custom.desktop | 8 + .../.config/tint2/applications/gedit.desktop | 8 + .../tint2/applications/terminal.desktop | 8 + .../image_home_dir/.config/tint2/tint2rc | 100 +++++++ .../_resources/image_home_dir/README.md | 28 ++ .../_resources/image_home_dir/entrypoint.sh | 11 + .../image_home_dir/mutter_startup.sh | 21 ++ .../image_home_dir/tint2_startup.sh | 24 ++ .../image_home_dir/x11vnc_startup.sh | 48 ++++ .../_resources/image_home_dir/xvfb_startup.sh | 48 ++++ .../_tools/_computer/_resources/run_image.sh | 9 + .../util/_sandbox/docker/internal.py | 4 +- 35 files changed, 1472 insertions(+), 55 deletions(-) create mode 100644 
examples/computer/compose.yaml create mode 100644 examples/computer/computer.py create mode 100644 examples/computer/flag.txt create mode 100644 examples/computer/moonWeight.ods delete mode 100644 examples/hello_computer.py create mode 100644 examples/intervention/computer-compose.yaml create mode 100644 src/inspect_ai/tool/_tools/_computer/__init__.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_common.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_computer.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_computer_split.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_mock_logger.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/Dockerfile create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/build_image.sh create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/__init__.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_logger.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_run.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_tool_result.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_x11_client.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/computer_tool.py create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/requirements.txt create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/firefox-custom.desktop create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/gedit.desktop create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/terminal.desktop create mode 100644 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/tint2rc create mode 100644 
src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/README.md create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/entrypoint.sh create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/mutter_startup.sh create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/tint2_startup.sh create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/x11vnc_startup.sh create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/xvfb_startup.sh create mode 100755 src/inspect_ai/tool/_tools/_computer/_resources/run_image.sh diff --git a/examples/computer/compose.yaml b/examples/computer/compose.yaml new file mode 100644 index 000000000..e6e4f0fa7 --- /dev/null +++ b/examples/computer/compose.yaml @@ -0,0 +1,16 @@ +services: + default: + # Temporary internal image until the official one is available + image: inspect-computer-tool + init: true + + # If you only launch a single container, you can vnc into that container by using + # the following port mapping + # ports: + # - "5900:5900" + + # If you launch multiple containers, you can vnc into each container by using the + # following port mapping, but you will need to determine the host port number for + # the specific container by using `docker inspect ` + # ports: + # - "5900" diff --git a/examples/computer/computer.py b/examples/computer/computer.py new file mode 100644 index 000000000..c77b976c7 --- /dev/null +++ b/examples/computer/computer.py @@ -0,0 +1,58 @@ +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.scorer import includes +from inspect_ai.solver import system_message +from inspect_ai.solver._basic_agent import basic_agent +from inspect_ai.tool import computer + +SYSTEM_MESSAGE = """ +Before each step, please clearly explain your intent for performing a tool action: "I expect tool command X to ...". 
+ +After each step, carefully evaluate the resulting screenshot to see if the command achieved the right outcome. + +Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. Only when +you confirm a step was executed correctly should you move on to the next one. + +Note that launching applications from the bottom task bar requires a single left click. +""" + + +@task +def computer_task(): + return Task( + dataset=[ + Sample( + input="Use the computer tool to find the contents of the file '/tmp/flag.txt'. When you've found it, report its contents.", + target="Frunobulax", + files={"/tmp/flag.txt": "flag.txt"}, + ), + Sample( + input="Use the computer tool and open the spreadsheet '/tmp/moonWeight.ods`. Enter your weight as 156 lbs. Report the result from cell B2. Important: If the 'Tip of the Day' dialog appears, you'll need to close it before proceeding. Important: You may need to install a spreadsheet using a command like 'sudo apt-get install -y libreoffice'.", + target="26", + files={"/tmp/moonWeight.ods": "moonWeight.ods"}, + ), + Sample( + input="Use the computer tool to launch a terminal. Type 'Trudging across the tundra. Mile after Mile.' into the terminal. Important: Make sure that the terminal window is active before typing. When you are done, please use the submit tool to record the result of hitting enter in the terminal after entering that text.", + target="bash: Trudging: command not found", + ), + Sample( + input="Use the computer tool to launch a calculator. Calculate 123 x 456. Report the result.", + target="56088", + ), + Sample( + input='Draw a smiley face with a paint program. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw.', + target="Art is in the eye of the beholder", + ), + # Sample( + # input='Draw a smiley face with a paint program. 
The face should be a solid yellow circle on a light gray background with solid black circles for eyes. The mouth should be a curved black line. When you are done, submit("Art is in the eye of the beholder"). Important: In XPaint, you need to create a new canvas before you can draw. Also, in XPaint, the toolbar contains the following tools on each row. 1. Pencil, Dynamic Pencil, Dot Pencil 2. Brush, Spray, Smear 3. Segment, Polygonal Line, Arc 4. Arrowhead, Text, Erase 5. Box, Filled Box, Box Region 6. Oval, Filled Oval, Oval Region 7. Freehand Shape, Filled Freehand Shape, Freehand Shape Region 8. Polygon, Filled Polygon, Polygon Region 9. Spline Curve, Filled Spline Curve, Spline Curve Region 10. Fill, Gradient Fill, Fractal Fill', + # target='Art is in the eye of the beholder', + # ), + ], + solver=basic_agent( + init=system_message(SYSTEM_MESSAGE), + tools=[computer()], + max_messages=100, + ), + scorer=includes(), + sandbox="docker", + ) diff --git a/examples/computer/flag.txt b/examples/computer/flag.txt new file mode 100644 index 000000000..44d731cb9 --- /dev/null +++ b/examples/computer/flag.txt @@ -0,0 +1 @@ +Frunobulax diff --git a/examples/computer/moonWeight.ods b/examples/computer/moonWeight.ods new file mode 100644 index 0000000000000000000000000000000000000000..067bcace458611d2afd82b26f7aa0bf21649af70 GIT binary patch literal 9817 zcmeHtcU)83(smT-0wPt4AfiY|0qG#U_udH*ItdU$FVdSJQl)p0-fJiV1`wo(NbkK% z4IMtX_w#zrd5`D&?*H#3`J{Eu;B@10RR9u z$MshLTT5GzqpKYVXlG|*X##Y#gn(I{z@{t^Ak-4d0bDOf%U)BGP8jI9YO!3MfNv+*+J}J zcGu>@{>$EORQ}mUiofX%Xkr4g0bLIi^55EdGbeUHFv#W~?EJeP?HnNH4j?G>pKblS znSnY29bx|uJ*vMMqopm-90X+*v2?Tr+Cl%XxfmE2|GJ#6|NkGkd)=}Hf-TKJP)8OA zQ?uUaUa%kAJ^vs40ieFoSFEV04$r#ltfk2sn40<9UrY>q)MdRwerhJV`y*&Gw>M31u%2n zwdt?SeCCq4&#%}>)B2s;Ug~Z~RRjj+!*CJH!$l7=p1yfZGfid3jr2&@BWi)LDc=0_ z3M^oaN>fk`^*f(e;h1vW{*HbW4GOXd_PBjzp4N=5`A(yzZ+@BlMmvlV{iD;tcXO8v zW}C{k_55Z$VlCXZP26DL5W6#vV~mntY%8wA`CPA7E?N4o9?X|kIf>c4>dP`eja;eP zqf(bFRgy)=s5-J3M7;$7_@Mv*|Go$QZuhdP(M1jQ)aPj@VU$ 
zIJ*to)&VvD+-|pgQ;$3BYI5+LulI-9Tjw(fXkOdG!u6EOJ&+EJf`(QOJJ;BvILG+B z%lkPB7G*HW@QsqHq}1E9iGFXDXkl)4lj`0>^Tm1Dys9@O@$o2he%}aeF9u9DVuNpQ zZ=?ijjrJV_H!W_XS~M9U-i%FIRHfem8q9H#`1gCFUE?iLeOd9wOCy{>gPKm?+-C`lPq1hy=`Icz z)SW-S`Ie0vDaKFLe?jr^Yy5~aPr?*U$8+WqUj#p{*@xF0t>R8Z9_Kqx8!<|1XY>a- z5ch2FW8v{Ih;<i(!(o{;iw=ZQUM9#fcLV;CVb;i`fdZf`ePp+6}nr`LndQkp=SZhuj_vfJ_K z5*$oE_%u@V9a#tuW&1$)kN&04$tJ6OXUv-2pK$ndu;$P?-Qi52y-VR)|E%b)HWcm< z^TR2?b~`7g4YRZP4K;Py3yme;+Ip->d{{Ce%v6hdd6wt?N6omlm*P|SdL9qEixF;3 z5ggxzRo598Bcj=I9zTldkRLZoWvuerBeIdLh-EAMBm~ruX4L)IClpLJIjK*vM`=7= z;RCqfm%6}-KZ{e(=$0#DPXVdFm08P>lJkB(IiMdxszLkephiASw4eN2tF9#35YzBy zoG@+`(x85=sK^Yxm?F)0o3ewFbIcN*Bw+Lm^Y@m~*{vC|Y#da|KzwPjV6eJjk=5x;coE z!L?NEm^2jzGHAL!S9GwdpfEl6J-#&VJ`G9z(nn(ccW@1EqrE`u_SEW@6PGjM86u4m zlVz#8OYjBtnVMn5N~xVPpA2QS!ndjxTa#HnbGqWhR6tG1W#KjmuJSh#pBiOm{F{k&+0O_?KK_6 zPxmFzJ*cB5WT9=Nez`;j3mZw?tFO(F^qm1<@r23v*}K>*>O;P0E)And)W(;mcNV0H z!XcL^w>Xs*ZBo@SwZlh%x;t9qYsrt)Qkz+gFV>dF_i}nUE6sANMVzEwtuH-p8#8d~ zqI~k%Lgs;jeCF^s70Mp8DLdF};I#U&WS!=L{Uwo@><8$on3S!WAoGHvx?qnLD`p*@ zZt&QAQCIy3MX{2x`z#I1Tw$(}TJgy=X(mSw@aW=vzN;BE*ZlI{ND++O^l-df%B=VhAl=$hXfax}1s#TCxb8b2%f+(Ht|w zzl1rlRie!zU()H0!>16kPp!fO_+pY#wHH57@1b9+5gqa6 zeAaF()>#>%@NZc5ka-;8ef%imA_eA$>&qEYYhKaigijSVyb+{*fbD`jex}P`?4#5# zGd+Bct!2NBF;cy{VR=Yy5+mSE#Qj8ZJbL|Y=X+OU6_3gEBX z4kb>SOgKNYiX;J1tLdzU0mFRQzi^0fSr&nRU20PlogUMEUxqmU;xigLt z3u&`>iHFiffDmt<@o(`u4qyVY`f6Neii&#~c{c*DfcbJW#)3lMT^0v9{jF!xVwUk8bs zMv{AUSA$MGgpXp_d%B`uulH!oyOfQ|hgekWe08f0$dxhg?4WW?5Akuvia@oPI|shd zz(Nh4r^hEHE)G2|O8+tOH5k7H|Kj2T-nj^PAWGR$!SACXL$czM>{=CNsEQLXSR=ll zjY5(W$N7eike0h5P@?ifn1O@QxPrVE&bGTrDpD1MANnMk=>1lTfAjqkZBF&Dc4K`c zLjSoOk@IMCkqSdS!6+98@WonkH63R36l@k-^}D`mEU^|hGe1gC?zqFKaJck4Z;Db# zt$WFje2EB&D7EnHdnxdZq^s?L!x6Ri-UM>OFTRSb?@_7LuqWgdjd8YRS3or=0%I?o zQa5;|^7tYg@TPYzSPyr!@5$y)pac0}Q4pLM^I4G9%Fqh|8J-V?Ka zZ=s&bZp;Hldc}ss*#h=&XyB&Odo;UF382ReLiIih^L!X3ruBM6Hr8v+Dn|>N!s{1}Q z#eK^fYj28%rH|fxxk)?d$4M6&Xm6P$`M6nhw4N?n5_lo7K+PM#a}mZPpeDZgfiT 
zTzOa@bQL=3QKMI4&?f&DRiSX*m$c+S`-HjH1yC-eWzK)<6w>j7fCcBCy4Jhr(x-zO z@oXuzpjy}3um^p#1KCFI)V8sCMKQ>>{8UjdTn8Z5X9b)zi6nUSY``1S(5$EVAyZA% z7i@7}krh+)@16R?PZxac{60XAFD}wn;$!_zOW21>bfnn2>$~|%v6%ZbHD0RJ?`ymW>vi(EVEv;&5iU#?o2=$-hEJxLpHi{J$ig};snE96 z&pIA5IPf8mc+!*IT7 zG69ZU71t3=;vYk~lb~|alL^a`nTpxzkvE_AKc1gB=Q$HR4f7(nOq-G(YqG&IX^kiu z4`de zCD{C5Tk0*H8R$6oJ+GD)nuU=h!cI%5`O^#&j6e>|M=#OlxyFdoCE`4y-q>HAM%#Af z3@Omh_D#vV09wUT%NeP3#O>xn#ElKbRJ1zRBHyre{!CaUak04kr(vPKHwkMY6B-K~U zcVj)?^T=WH4^9l1h@$-wweK9$OK#p!!nFq{SC&}|GYZt#io>=pIBix=Zjo9shm+N$ zg21Dv$FbIZvBMpBNo%z%BT#xTg?F!k%Sq9POLAkeH1c2;d^v;0+?en)w=bmji$DuD z3<(aEmt5jxpObr3|A<`xu^fq7QhC=k_4g4Ew>xBjjxn z`x=xB-scN3IP{Vt72Z0_8!21f*e7ej=jYbz5hsp!<3gcpfG@?@Y{4 ztks{NzM5QEj=-@>!V+$Od>bAU_&7x6vI`eLi9!L%BHmO#I=uD1ovCIR6(5|6gaAg74-lg$MQ+s%qwKop^ZP;`eS}p@1sD9LY`RCt*$Eaa zy)Fx|kdW7o=`jk!x^D^HJo^wXA|`fF9L_ybg-0UkQpCxfgI8C@j~gN$EqKSLsMT6I zY6U;R!s=Da9A0_k8L>rPhUD|wIpm%f;S-*PvkI4w+P5A>o5Y*CRz@kH-xCO{+K8`n}iw{=b8dShU@zBsDUw0W3 zh_}TAnB#=Wmpn~E4=g+-Y6|gu<H0%2FOm!e;&4ct8JIKua16uvOXZp6s7 zIaTCkBps1w$V1x5M^~oIu91n4f~nab;5^Bg-+DTYpQnn2$KB{MBXOrp%@UY(gHx`FV3RErL`4=Dt9? 
z+Ix7`|F+i4`{Pu8m;gPUt-F&bHu|H>&4`!04z4(IXfE6@z?79!oRl+L>8c=aB-N?7^}i()43Dmrd=Ds7o;{IHxAf~Q;ts(^*TpP^V0$U%ovY60kz$}PG#I2H`!e^3z)4j7-(q& zWp(`7WU&LAhbSpX;$V^8Bt3AXrNmSKfLnUkM=!>8jsyVOV?VzBFjJCK6GuTo!N$fW zARwTiprE0lVPs@vXJ_Z-L= zNm*H0g@uKcm6dQfyuH1>r>AFVXlQ(Vd|_c>V`Jms;NbG|^17d^t1C6RrS|{;rlz!* zu$s&G)_9P^L<4aE5IODMdDgNt1uz|pfWFwJSnGx^8?r~Ny*EAq_|E21il-8z`%(qL zAO7g+If3BLCK|e)b~-es4YcsRo0+d^o;F+YwC=G#YNmLDbcbKfp_F9pyM|ZRFBUcK z!)jv5-H~!_!t~y59{u6iIZlxC;m(=`#%-#%y(hg$XU&?5hLIi{I%myF+gW3N~OT~r_T$3!(cxwvqW&639!$vpR)S=SWUiK3@tL-{bqWHmSC~@9YR>$Np#A9rE z)(LuFpTX9~`Aegf0fe&1p4ZY}71qkOl0iwlXGcb3c|>io!V&eBq}Qk>K|=LM^}0iw zS41WGwq3p~R~N^~bcqukSzdR35(qYLf7c+(S}(i|akwPbW(|ddJV;AId&SASOB5bF z+^+jMHtt9s|(y3 z642CHcC6(Cm*Rn)kv&8;z4d}P*wBZa2J|h09NVts(4Za3*>w}Sjdr}kwYfUCyHbL) z&H9BVYzPLuMs4Q4MwUW3?P)u_&P=aQTOwy+vR-tdX?7&FDe+ZRC2;7p8TQ)^rWu(c zxT1zfNAVEPq5;xq^40k6p8GeG!yUU$0sY+)l{Mo$_L)56S|E_C)6{N;H?MhI^5}F3 z3aJcu`kjhpD|kmx*G0L%c)oVq+TOa@Xq!9?DoCigZQY}M8gJ#d^(+~$jv)%)#Nl8` z^<=-EE2ut$sC7Vmkk3!IU%tY!ywJRpIIjC(@~lirJ_@=sTpkD0z`<@KxpfrcGrn!e>a8T#9B}HY3-B!7H_-FnRvem6G+p=97;osGj}ImCG+`^ z_0YwmdCbE}+DgtTh`@x0MQ96sTvA^3w-aDAwmy^Lj#O_RU0aWdC-A1 ztbVTUDRQ^gc^xd6Vxf~MqOV;3VLrq1|Gz@)rRt0$o_!C8Y<_nux@H z#GZKE_gVdT_SLZLDUc9s2L@t{#M5NiBT~Tcd5%!ekAAIYaPXx~Gq3JR?w-ELa z{CII?i&q$(;WBKpHe!5Y9J-Xs9QOlKcN$yT3RAxVjvryxMr@TC@v5c@QEK7|ogRK9 z2ZI7p-5I$bhE1G&d%95XVOVo6Tki1^u{Tb2q;7_D8mMuiZ!?FLT7MG6Fbpa0492C(lO_&8hO7iRJy)Q*AkyT1#_YFvBI60W@F%!VJU;m)br_1lx;-J$34#b&uhqGyL$H|s0= z5tT&mHcQvMDw>a)g)*82J40JcikWE(wLZe{#_yJb2 zkC+3Q+7d)RHER=nsI z@3C6PoCxnr4tZ?eN;6E!yC=Nn2%vr%_rb%Bqo&6`JCAEOU8K897Tu+$F@+#5K+&4t zYX6m8z8if8Hv8+LbhjYIrHfC-7MYcP2aYzDPjE_49QCOgTDDHw&P2Dpm9*pU$csKK zgA*9LfA=9p%63bfpY2W<-t5(bLoTkKUsoTQt`jCovbSy%0)Ef$xLKcH@{{fHSFK#r!Ge}nRi-|>5t8y>?i`FP#@t1W-zd;A{fCT{s9H<8F+ zaenbXevfj)$M_{I*DLT>lt1%9evkC$aasHg(jWODzsLErMd^Qo^Jl)u?~(p&(fHpW z{gFTNdz>2v$uGHo`+v^S|KOAS8|)_~;*VsBn|$9dQM?BGkMFqVm;Ahle`@}*D1Sqy z_$3k7A?P*D>0i`}ziR!l7;_W5{t~0>aQIjG{;zs}%vj!(YJN%T^{L6P!p&b5{}^fi zd4T2j{+!lKTvL&0ssI2 literal 0 
HcmV?d00001 diff --git a/examples/hello_computer.py b/examples/hello_computer.py deleted file mode 100644 index 670812115..000000000 --- a/examples/hello_computer.py +++ /dev/null @@ -1,37 +0,0 @@ -from inspect_ai import Task, task -from inspect_ai.dataset import Sample -from inspect_ai.solver import generate, use_tools -from inspect_ai.tool import tool - - -@tool -def computer(): - async def execute( - action: str, - text: str | None = None, - coordinate: list[int] | None = None, - ) -> str: - """Take an action using a computer. - - Args: - action: Action to take. - text: Text related to the action - coordinate: Coordinate related to the action. - - Returns: - The sound that was passed to check. - """ - return action - - return execute - - -@task -def hello_computer(): - return Task( - dataset=[Sample(input="Call the computer tool with the action 'screenshot'")], - solver=[ - use_tools([computer()]), - generate(), - ], - ) diff --git a/examples/intervention/computer-compose.yaml b/examples/intervention/computer-compose.yaml new file mode 100644 index 000000000..fa375cfed --- /dev/null +++ b/examples/intervention/computer-compose.yaml @@ -0,0 +1,7 @@ +services: + default: + # Temporary internal image until the official one is available + image: inspect-computer-tool + init: true + ports: + - "5900:5900" diff --git a/examples/intervention/intervention.py b/examples/intervention/intervention.py index 07901ff77..fd710a118 100644 --- a/examples/intervention/intervention.py +++ b/examples/intervention/intervention.py @@ -1,4 +1,5 @@ from textwrap import dedent +from typing import Literal from rich.prompt import Prompt @@ -12,23 +13,36 @@ system_message, use_tools, ) -from inspect_ai.tool import bash, python +from inspect_ai.tool import bash, computer, python from inspect_ai.util import input_screen @task -def intervention(): - return Task( - solver=[ - system_prompt(), - user_prompt(), - use_tools([bash(), python()]), - agent_loop(), - ], - sandbox="docker", +def 
intervention(mode: Literal["basic", "computer"] = "basic") -> Task: + return ( + Task( + solver=[ + system_prompt(), + user_prompt(), + use_tools([bash(), python()]), + agent_loop(), + ], + sandbox="docker", + ) + if mode == "basic" + else Task( + solver=[ + system_prompt(), + user_prompt(), + use_tools([computer()]), + agent_loop(), + ], + sandbox=("docker", "computer-compose.yaml"), + ) ) +# TODO: Customize the prompt based on the mode above?? @solver def system_prompt(): SYSTEM_PROMPT = dedent(""" diff --git a/src/inspect_ai/model/_model.py b/src/inspect_ai/model/_model.py index 6b483cd1f..e92984fe1 100644 --- a/src/inspect_ai/model/_model.py +++ b/src/inspect_ai/model/_model.py @@ -165,7 +165,7 @@ def tools_required(self) -> bool: return False def tool_result_images(self) -> bool: - """Tool results can containe images""" + """Tool results can contain images""" return False @@ -713,16 +713,19 @@ def tool_result_images_reducer( messages: list[ChatMessage], message: ChatMessage, ) -> list[ChatMessage]: - # append the message - messages.append(message) - # if there are tool result images, pull them out into a ChatUserMessage if isinstance(message, ChatMessageTool) and isinstance(message.content, list): + tool_message = ChatMessageTool( + content=message.content.copy(), tool_call_id=message.tool_call_id + ) + assert isinstance(tool_message.content, list) + messages.append(tool_message) + user_content: list[Content] = [] - for i in range(0, len(message.content)): - if isinstance(message.content[i], ContentImage): + for i in range(0, len(tool_message.content)): + if isinstance(tool_message.content[i], ContentImage): user_content.append(message.content[i]) - message.content[i] = ContentText( + tool_message.content[i] = ContentText( text="Image content is in the message below." 
) if len(user_content) > 0: @@ -730,6 +733,9 @@ def tool_result_images_reducer( ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id) ) + else: + messages.append(message) + # return messages return messages diff --git a/src/inspect_ai/tool/__init__.py b/src/inspect_ai/tool/__init__.py index d53f76651..2bed7b432 100644 --- a/src/inspect_ai/tool/__init__.py +++ b/src/inspect_ai/tool/__init__.py @@ -14,6 +14,7 @@ from ._tool_info import ToolInfo from ._tool_params import ToolParam, ToolParams from ._tool_with import tool_with +from ._tools._computer import computer from ._tools._execute import bash, python from ._tools._web_browser import web_browser from ._tools._web_search import web_search @@ -23,6 +24,7 @@ "python", "web_browser", "web_search", + "computer", "tool", "tool_with", "Tool", diff --git a/src/inspect_ai/tool/_tools/_computer/__init__.py b/src/inspect_ai/tool/_tools/_computer/__init__.py new file mode 100644 index 000000000..908766c87 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/__init__.py @@ -0,0 +1,3 @@ +from ._computer import computer + +__all__ = ["computer"] diff --git a/src/inspect_ai/tool/_tools/_computer/_common.py b/src/inspect_ai/tool/_tools/_computer/_common.py new file mode 100644 index 000000000..78b458b34 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_common.py @@ -0,0 +1,136 @@ +import json +import logging +from typing import Literal + +from pydantic import BaseModel, Field + +from inspect_ai._util.content import ContentText +from inspect_ai.model import ContentImage +from inspect_ai.tool import ToolError, ToolResult +from inspect_ai.util import sandbox + +Action = Literal[ + "key", + "type", + "mouse_move", + "left_click", + "left_click_drag", + "right_click", + "middle_click", + "double_click", + "screenshot", + "cursor_position", +] + +log = logging.getLogger(__name__) +# log = MockLogger() +log.setLevel(logging.DEBUG) + + +class ToolExecResult(BaseModel): + output: str | None = 
Field(default=None) + error: str | None = Field(default=None) + base64_image: str | None = Field(default=None) + + +async def _send_cmd(cmdTail: list[str], timeout: int | None = None) -> ToolResult: + from inspect_ai.log._samples import sample_active + + sample = sample_active() + assert sample + sample_id = sample.sample.id + assert sample_id + + cmd = ["python3", "-m", "computer_tool.computer_tool", "--action"] + cmdTail + log.info(f"(sample={sample_id}) Executing command: {cmd}") + + try: + raw_exec_result = await sandbox().exec(cmd, timeout=timeout) + + if not raw_exec_result.success: + raise Exception( + f"Failure executing command: ${cmd} {raw_exec_result.stderr}" + ) + + result = ToolExecResult(**json.loads(raw_exec_result.stdout)) + + if result.error: + log.warning( + f"(sample={sample_id}) Tool returned an error. Raising ToolError('{result.error}'" + ) + raise ToolError(result.error) + + image = ( + ContentImage(image=f"data:image/png;base64,{result.base64_image}") + if result.base64_image + else None + ) + text = result.output if result.output and len(result.output) > 0 else None + + if text is not None and image is not None: + log.info( + f"(sample={sample_id}) ToolResult([ContentText('{text}'), ContentImage])" + ) + return [ContentText(text=text), image] + + if text is not None: + log.info(f"(sample={sample_id}) ToolResult('{text}')") + return text + + if image is not None: + log.info(f"(sample={sample_id}) ToolResult([ContentImage])") + return [image] + + log.warning( + "(sample={sample_id}) Tool returned neither output nor image - returning ToolResult('OK')" + ) + return "OK" + except ToolError: + raise + except Exception as e: + log.error(f"(sample={sample_id}) Sandbox.exec threw for {cmd}...re-raising {e}") + raise e + + +async def cursor_position(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["cursor_position"], timeout=timeout) + + +async def screenshot(timeout: int | None = None) -> ToolResult: + return await 
_send_cmd(["screenshot"], timeout=timeout) + + +async def mouse_move(x: int, y: int, timeout: int | None = None) -> ToolResult: + return await _send_cmd( + ["mouse_move", "--coordinate", f"{x}", f"{y}"], timeout=timeout + ) + + +async def left_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["left_click"], timeout=timeout) + + +async def left_click_drag(x: int, y: int, timeout: int | None = None) -> ToolResult: + return await _send_cmd( + ["left_click_drag", "--coordinate", f"{x}", f"{y}"], timeout=timeout + ) + + +async def right_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["right_click"], timeout=timeout) + + +async def middle_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["middle_click"], timeout=timeout) + + +async def double_click(timeout: int | None = None) -> ToolResult: + return await _send_cmd(["double_click"], timeout=timeout) + + +async def press_key(key: str, timeout: int | None = None) -> ToolResult: + return await _send_cmd(["key", "--text", key], timeout=timeout) + + +async def type(text: str, timeout: int | None = None) -> ToolResult: + return await _send_cmd(["type", "--text", text], timeout=timeout) diff --git a/src/inspect_ai/tool/_tools/_computer/_computer.py b/src/inspect_ai/tool/_tools/_computer/_computer.py new file mode 100644 index 000000000..96e9e1b73 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_computer.py @@ -0,0 +1,133 @@ +from typing import Awaitable, Callable + +from inspect_ai.tool import Tool, ToolError, ToolResult, tool +from inspect_ai.tool._tool import ToolParsingError + +from . import _common as common +from ._common import Action + +ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]] + + +@tool() +def computer(timeout: int | None = None) -> Tool: + """ + Computer interaction tool. + + Args: + timeout (int | None): Timeout (in seconds) for command. + + Returns: + Computer interaction tool. 
+ """ + + async def execute( + action: Action, + text: str | None = None, + coordinate: list[int] | None = None, + ) -> ToolResult: + """ + Use this tool to interact with a computer. + + Use a mouse and keyboard to interact with a computer's desktop GUI. + + Keep in mind that icons require double clicks to open while other UI affordances like menu items and buttons require a single click. + + Args: + action (Action): The action to perform. + - `key`: Press a key or key-combination on the keyboard. + - Example: execute(action="key", text="ctrl+s") + - Text can be any key name supported by xdotool's `key` such as: + "Return", "Escape", "alt+Tab", "BackSpace", "Tab", "alt+Tab", "ctrl+s", "Up", "KP_0" (for the numpad 0 key), + "Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down", + "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12", + "Shift_L", "Shift_R", "Control_L", "Control_R", "Alt_L", "Alt_R", "Scroll_Lock", "Num_Lock", "Caps_Lock", "Pause", + "KP_Multiply", "KP_Home", "KP_Up", "KP_Prior", "KP_Subtract", "KP_Left", "KP_Begin", "KP_Right", "KP_Add", "KP_End","KP_Down", + "KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal", + - `type`: Type a string of text on the keyboard. If the text contains spaces, enclose it in quotes. + - Example: execute(action="type", text="The crux of the biscuit is the apostrophe!") + - `cursor_position`: Get the current (x, y) pixel coordinate of the cursor on the screen. + - `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen. + - Example: execute(action="mouse_move", coordinate=(100, 200)) + - `left_click`: Click the left mouse button. + - `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen. + - Example: execute(action="left_click_drag", coordinate=(150, 250)) + - `right_click`: Click the right mouse button. + - `middle_click`: Click the middle mouse button. 
+ - `double_click`: Double-click the left mouse button. + - `screenshot`: Take a screenshot. + text (str | None): The text to type or the key to press. Required when action is "key" or "type". + coordinate (tuple[int, int] | None): The (x, y) pixel coordinate on the screen to which to move or drag. Required when action is "mouse_move" or "left_click_drag". + + Returns: + The output of the command. Many commands will include a screenshot reflecting the result of the command in their output. + """ + try: + if action in ("mouse_move", "left_click_drag"): + if coordinate is None: + raise ToolParsingError(f"coordinate is required for {action}") + if text is not None: + raise ToolParsingError(f"text is not accepted for {action}") + if not isinstance(coordinate, list) or len(coordinate) != 2: + raise ToolParsingError(f"{coordinate} must be a tuple of length 2") + if not all(isinstance(i, int) and i >= 0 for i in coordinate): + raise ToolParsingError( + f"{coordinate} must be a tuple of non-negative ints" + ) + + if action == "mouse_move": + return await common.mouse_move( + coordinate[0], coordinate[1], timeout=timeout + ) + elif action == "left_click_drag": + return await common.left_click_drag( + coordinate[0], coordinate[1], timeout=timeout + ) + + if action in ("key", "type"): + if text is None: + raise ToolParsingError(f"text is required for {action}") + if coordinate is not None: + raise ToolParsingError(f"coordinate is not accepted for {action}") + if not isinstance(text, str): + raise ToolParsingError(output=f"{text} must be a string") + + if action == "key": + return await common.press_key(text, timeout=timeout) + elif action == "type": + return await common.type(text, timeout=timeout) + + if action in ( + "left_click", + "right_click", + "double_click", + "middle_click", + "screenshot", + "cursor_position", + ): + if text is not None: + raise ToolParsingError(f"text is not accepted for {action}") + if coordinate is not None: + raise ToolParsingError(f"coordinate 
is not accepted for {action}") + + if action == "screenshot": + return await common.screenshot(timeout=timeout) + elif action == "cursor_position": + return await common.cursor_position(timeout=timeout) + elif action == "left_click": + return await common.left_click(timeout=timeout) + elif action == "right_click": + return await common.right_click(timeout=timeout) + elif action == "middle_click": + return await common.middle_click(timeout=timeout) + elif action == "double_click": + return await common.double_click(timeout=timeout) + + raise ToolParsingError(f"Invalid action: {action}") + + except ToolError: + raise + except Exception as e: + raise ToolError(str(e)) + + return execute diff --git a/src/inspect_ai/tool/_tools/_computer/_computer_split.py b/src/inspect_ai/tool/_tools/_computer/_computer_split.py new file mode 100644 index 000000000..9c0eb5ed6 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_computer_split.py @@ -0,0 +1,192 @@ +from typing import Awaitable, Callable + +from inspect_ai.tool import Tool, ToolResult, tool + +from . import _common as common + +ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]] + + +def computer_split(timeout: int | None = None) -> list[Tool]: + """ + Computer interaction tools. + + Args: + timeout (int | None): Timeout (in seconds) for command. + + Returns: + List of computer interaction tools. + """ + return [ + computer_cursor_position(), + computer_screenshot(), + computer_mouse_move(), + computer_left_click(), + computer_double_click(), + computer_left_click_drag(), + computer_right_click(), + computer_key(), + computer_type(), + ] + + +@tool() +def computer_cursor_position(timeout: int | None = None) -> Tool: + async def execute() -> ToolResult: + """ + Get the current (x, y) pixel coordinate of the cursor on the screen. + + Args: + None + + Returns: + A `str` of the form "x y" where x and y are the current mouse coordinates. 
+ """ + return await common.cursor_position(timeout=timeout) + + return execute + + +@tool() +def computer_screenshot(timeout: int | None = None) -> Tool: + async def execute() -> ToolResult: + """ + Take a screenshot. + + Args: + None + + Returns: + A `list` with a single `ContentImage` of the screen. + """ + return await common.screenshot(timeout=timeout) + + return execute + + +@tool() +def computer_mouse_move(timeout: int | None = None) -> Tool: + async def execute(x: int, y: int) -> ToolResult: + """ + Move the cursor to a specified (x, y) pixel coordinate on the screen. + + Args: + x: X coordinate of the mouse destination. + y: Y coordinate of the mouse destination. + + Returns: + A `list` with a single `ContentImage` of the screen. + """ + return await common.mouse_move(x, y, timeout=timeout) + + return execute + + +@tool() +def computer_left_click(timeout: int | None = None) -> Tool: + async def execute() -> ToolResult: + """ + Click the left mouse button. + + Args: + None + + Returns: + A `list` with a single `ContentImage` of the screen. + """ + return await common.left_click(timeout=timeout) + + return execute + + +@tool() +def computer_double_click(timeout: int | None = None) -> Tool: + async def execute() -> ToolResult: + """ + Double-click the left mouse button. + + Args: + None + + Returns: + A `list` with a single `ContentImage` of the screen. + """ + return await common.double_click(timeout=timeout) + + return execute + + +@tool() +def computer_left_click_drag(timeout: int | None = None) -> Tool: + async def execute(x: int, y: int) -> ToolResult: + """ + Click and drag the cursor to a specified (x, y) pixel coordinate on the screen. + + Args: + x: X coordinate of the mouse destination. + y: Y coordinate of the mouse destination. + + Returns: + A `list` with a single `ContentImage` of the screen. 
# Tool factory for the "right_click" action; `execute` closes over the
# factory's `timeout` and delegates to `common.right_click`.
@tool()
def computer_right_click(timeout: int | None = None) -> Tool:
    async def execute() -> ToolResult:
        """
        Click the right mouse button.

        Args:
          None

        Returns:
          A `list` with a single `ContentImage` of the screen.
        """
        return await common.right_click(timeout=timeout)

    return execute


# keysym list is from https://gist.github.com/rvaiya/be31f42049a4b5ad46666a8e120d9843
# Tool factory for the "key" action; `key` is passed through unchanged to
# `common.press_key`.
# NOTE(review): the `@tool` framework presumably derives the model-facing tool
# description from the `execute` docstring - confirm before rewording it.
@tool()
def computer_key(timeout: int | None = None) -> Tool:
    async def execute(key: str) -> ToolResult:
        """
        Press a key or key-combination on the keyboard.

        Args:
          key: The key or key-combination to press. Can be any key name supported by xdotool's `key` such as:
            "Return", "Escape", "alt+Tab", "BackSpace", "Tab", "alt+Tab", "ctrl+s", "Up", "KP_0" (for the numpad 0 key),
            "Insert", "Delete", "Home", "End", "Prior", "Next", "Left", "Up", "Right", "Down",
            "F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "F10", "F11", "F12",
            "Shift_L", "Shift_R", "Control_L", "Control_R", "Alt_L", "Alt_R", "Scroll_Lock", "Num_Lock", "Caps_Lock", "Pause",
            "KP_Multiply", "KP_Home", "KP_Up", "KP_Prior", "KP_Subtract", "KP_Left", "KP_Begin", "KP_Right", "KP_Add", "KP_End","KP_Down",
            "KP_Next", "KP_Insert", "KP_Delete", "KP_Enter", "KP_Divide", "KP_Equal", "KP_Decimal"

        Returns:
          A `list` with a single `ContentImage` of the screen.
        """
        return await common.press_key(key, timeout=timeout)

    return execute
"""
This module is a temporary solution that allows one to alter the log level of a specific child logger.

As currently implemented, there is no way to enable DEBUG level console logging for a child logger
without also enabling DEBUG level logging for the root logger.

This should be resolved, but in the meantime, this module provides a workaround.
"""

import logging


class MockLogger:
    """Print-based stand-in for `logging.Logger` with its own level threshold.

    Messages at or above `level` are written to stdout as "<LEVEL>: <message>".
    As with the standard library, `msg` is %-interpolated with `args` only when
    arguments were supplied, so a literal "%" in an argument-less message is
    printed verbatim instead of raising.
    """

    # Class-level default; setLevel() overrides it per instance.
    level = logging.INFO

    @staticmethod
    def _render(msg: str, args: tuple[object, ...]) -> str:
        """Return `msg` interpolated with `args`, or `msg` unchanged if there are none."""
        return msg % args if args else msg

    def _log(
        self, level: int, label: str, msg: str, args: tuple[object, ...]
    ) -> None:
        """Print the message under `label` if `level` passes the threshold."""
        if self.level <= level:
            print(f"{label}: {self._render(msg, args)}")

    def debug(self, msg: str, *args: object, **kwargs: object) -> None:
        self._log(logging.DEBUG, "DEBUG", msg, args)

    def info(self, msg: str, *args: object, **kwargs: object) -> None:
        self._log(logging.INFO, "INFO", msg, args)

    def warning(self, msg: str, *args: object, **kwargs: object) -> None:
        self._log(logging.WARNING, "WARNING", msg, args)

    def error(self, msg: str, *args: object, **kwargs: object) -> None:
        self._log(logging.ERROR, "ERROR", msg, args)

    def critical(self, msg: str, *args: object, **kwargs: object) -> None:
        self._log(logging.CRITICAL, "CRITICAL", msg, args)

    def exception(self, msg: str, *args: object, **kwargs: object) -> None:
        """Print the message unconditionally; add the traceback if `exc_info` is truthy."""
        print(f"EXCEPTION: {self._render(msg, args)}")
        if kwargs.get("exc_info"):
            import traceback

            traceback.print_exc()

    def setLevel(self, level: int) -> None:
        """Set the minimum level that this logger prints."""
        self.level = level
b/src/inspect_ai/tool/_tools/_computer/_resources/Dockerfile @@ -0,0 +1,80 @@ +FROM docker.io/ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_PRIORITY=high + +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install \ + # UI Requirements + # A virtual framebuffer for running GUI applications without a physical display. + xvfb \ + # A terminal emulator for X. + xterm \ + # A command-line tool for automating X11 applications (e.g., simulating keyboard/mouse inputs). + xdotool \ + # A command-line tool for taking screenshots. + scrot \ + # A suite for image manipulation. + imagemagick \ + sudo \ + # A lightweight window manager. + mutter \ + # A VNC server for sharing X11 desktops. + x11vnc \ + # Python reqs + python3 \ + python3-pip \ + # Network tools + # Provides networking tools like ifconfig, netstat, etc. + net-tools \ + # A versatile networking tool for debugging, port scanning, and more. + netcat \ + # Adds tools like add-apt-repository for managing PPAs. + software-properties-common && \ + # Add PPA repository for Firefox + sudo add-apt-repository ppa:mozillateam/ppa && \ + # Userland apps + sudo apt-get install -y --no-install-recommends \ + firefox-esr \ + # A lightweight PDF viewer. + xpdf \ + # A simple text editor. + gedit \ + # A simple image viewer. + xpaint \ + # A lightweight taskbar for graphical desktops. + tint2 \ + # A calculator application. + galculator \ + # A lightweight file manager. 
def setup_logger(level=logging.INFO):
    """
    This logger emits all of its output to PID 1's stdout.

    This makes it so that logging from invocations of the computer_tool cli show up in `docker logs` output.

    Args:
      level: Logging level applied to the "computer_tool" logger and, when a
        handler is created by this call, to that handler.

    Returns:
      The configured "computer_tool" logger.
    """
    new_logger = logging.getLogger("computer_tool")
    new_logger.setLevel(level)

    # Only open /proc/1/fd/1 when a handler is actually going to be attached.
    # Creating the FileHandler up front would open (and leak) a new file
    # descriptor on every repeat call, where the handler is simply discarded.
    if not new_logger.handlers:
        stdout_handler = logging.FileHandler("/proc/1/fd/1", mode="w")
        stdout_handler.setLevel(level)
        stdout_handler.setFormatter(
            logging.Formatter("%(name)s(pid=%(process)d) - %(levelname)s - %(message)s")
        )
        new_logger.addHandler(stdout_handler)

    return new_logger
+MAX_RESPONSE_LEN: int = 16000 + + +def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): + """Truncate content and append a notice if content exceeds the specified length.""" + return ( + content + if not truncate_after or len(content) <= truncate_after + else content[:truncate_after] + TRUNCATED_MESSAGE + ) + + +async def run( + cmd: str, + timeout: float | None = 120.0, # seconds + truncate_after: int | None = MAX_RESPONSE_LEN, +): + """Run a shell command asynchronously with a timeout.""" + process = await asyncio.create_subprocess_shell( + cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + + try: + stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) + return ( + process.returncode or 0, + maybe_truncate(stdout.decode(), truncate_after=truncate_after), + maybe_truncate(stderr.decode(), truncate_after=truncate_after), + ) + except asyncio.TimeoutError as exc: + try: + process.kill() + except ProcessLookupError: + pass + raise TimeoutError( + f"Command '{cmd}' timed out after {timeout} seconds" + ) from exc diff --git a/src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_tool_result.py b/src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_tool_result.py new file mode 100644 index 000000000..138f85e4a --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/computer_tool/_tool_result.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, fields, replace + + +@dataclass(kw_only=True, frozen=True) +class ToolResult: + """Represents the result of a tool execution.""" + + output: str | None = None + error: str | None = None + base64_image: str | None = None + + def __bool__(self): + return any(getattr(self, field.name) for field in fields(self)) + + def __add__(self, other: "ToolResult"): + def combine_fields( + field: str | None, other_field: str | None, concatenate: bool = True + ): + if field and other_field: + if concatenate: + return field + other_field 
class ToolError(Exception):
    """Raised when a tool action is invalid or fails.

    Attributes:
      message: Human-readable description of the failure.
    """

    def __init__(self, message):
        # Hand the message to Exception so that str(e) and e.args are populated
        # however the error was constructed (positionally or by keyword).
        super().__init__(message)
        self.message = message
class X11Client:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.

    The tool parameters are defined by Anthropic and are not editable.

    Screen geometry and display are read from the WIDTH, HEIGHT and DISPLAY_NUM
    environment variables. Input actions are performed by shelling out to
    `xdotool`; screenshots are taken with `scrot` and resized with `convert`.
    """

    width: int
    height: int
    display_num: int | None
    # TODO: Complete plumbing this or remove it
    color_count: ColorCount | None = 64

    # Seconds to wait after an action before capturing the follow-up screenshot.
    _screenshot_delay = 2.0
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Return the display size (after scaling) and display number."""
        width, height = self.scale_coordinates("computer", self.width, self.height)
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        self.xdotool = f"{self._display_prefix}xdotool"

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Validate the arguments for `action`, perform it and return a `ToolResult`.

        Args:
          action: The action to perform.
          text: Key name(s) for "key" or the text for "type"; rejected otherwise.
          coordinate: Target (x, y) for "mouse_move"/"left_click_drag"; rejected otherwise.
          **kwargs: Ignored.

        Raises:
          ToolError: If the action is unknown, an argument is missing, unexpected
            or malformed, or the cursor position cannot be read.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            # Accept both shapes: the CLI (argparse nargs=2) delivers a list,
            # while callers following the type annotation pass a tuple.
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates("api", coordinate[0], coordinate[1])

            if action == "mouse_move":
                return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
            elif action == "left_click_drag":
                return await self.shell(
                    f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
                )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError only takes `message`; passing `output=` would raise
                # a TypeError instead of the intended validation error.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                return await self.shell(f"{self.xdotool} key -- {shlex.quote(text)}")
            elif action == "type":
                # Type in chunks so that each xdotool invocation stays short,
                # then take a single screenshot once all text has been entered.
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                    results.append(await self.shell(cmd, take_screenshot=False))

                screenshot_base64 = await self.take_screenshot_after_delay()
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                result = await self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError) as exc:
                    # xdotool failed or printed something unexpected; report
                    # that instead of an opaque parsing error.
                    raise ToolError(
                        f"Failed to read cursor position: {result.error or output}"
                    ) from exc
                x, y = self.scale_coordinates("computer", raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")
            else:
                click_arg = {
                    "left_click": "1",
                    "right_click": "3",
                    "middle_click": "2",
                    "double_click": "--repeat 2 --delay 500 1",
                }[action]
                return await self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image."""
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        result = await self.shell(
            f"{self._display_prefix}scrot --silent -p {path}", take_screenshot=False
        )
        if self._scaling_enabled:
            # Resize in place to the scaled resolution ("!" forces the exact
            # size) and optionally reduce the palette.
            x, y = self.scale_coordinates("computer", self.width, self.height)
            convert_cmd = f"convert {path} -resize {x}x{y}!"
            if self.color_count is not None:
                convert_cmd += f" -colors {self.color_count}"
            convert_cmd += f" {path}"
            await self.shell(convert_cmd, take_screenshot=False)

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        logging.debug(f"running shell command {command}")
        _, stdout, stderr = await run(command)
        logging.debug(f"shell command returned stdout: {stdout}, stderr: {stderr}")
        return ToolResult(
            output=stdout,
            error=stderr,
            base64_image=(await self.take_screenshot_after_delay())
            if take_screenshot
            else None,
        )

    async def take_screenshot_after_delay(self) -> str:
        """Wait `_screenshot_delay` seconds, then return a base64 screenshot."""
        # delay to let things settle before taking a screenshot
        await asyncio.sleep(self._screenshot_delay)
        return (await self.screenshot()).base64_image

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        Args:
          source: "api" converts coordinates supplied by the caller up to real
            screen pixels; "computer" converts real screen pixels down to the
            scaled resolution.
          x: X coordinate to convert.
          y: Y coordinate to convert.

        Returns:
          The converted `(x, y)`; unchanged when scaling is disabled or no
          scaling target matches the screen's aspect ratio.

        Raises:
          ToolError: If `source` is "api" and the coordinates exceed the screen size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == "api":
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
async def wait_for_file(file_path, check_interval=1, timeout=None):
    """Wait until `file_path` exists, polling every `check_interval` seconds.

    Args:
      file_path: Path whose existence signals readiness.
      check_interval: Seconds to sleep between existence checks.
      timeout: Optional maximum number of seconds to wait. The default (None)
        waits indefinitely.

    Raises:
      TimeoutError: If `timeout` is set and the file has not appeared in time.
    """
    if os.path.exists(file_path):
        return
    my_logger.info(f"Waiting for {file_path}")
    # Monotonic clock: elapsed time must not be affected by wall-clock changes.
    start_time = time.monotonic()
    while not os.path.exists(file_path):
        if timeout is not None and time.monotonic() - start_time >= timeout:
            raise TimeoutError(
                f"Timed out after {timeout} seconds waiting for {file_path}"
            )
        await asyncio.sleep(check_interval)
    my_logger.info(
        f"Done waiting for {file_path} after {time.monotonic() - start_time:.1f} seconds"
    )
b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/firefox-custom.desktop new file mode 100755 index 000000000..948021262 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/firefox-custom.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Firefox Custom +Comment=Open Firefox with custom URL +Exec=firefox-esr -new-window +Icon=firefox-esr +Terminal=false +Type=Application +Categories=Network;WebBrowser; diff --git a/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/gedit.desktop b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/gedit.desktop new file mode 100755 index 000000000..d5af03f40 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/gedit.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Gedit +Comment=Open gedit +Exec=gedit +Icon=text-editor-symbolic +Terminal=false +Type=Application +Categories=TextEditor; diff --git a/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/terminal.desktop b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/terminal.desktop new file mode 100644 index 000000000..0c2d45d4d --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/applications/terminal.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Terminal +Comment=Open Terminal +Exec=xterm +Icon=utilities-terminal +Terminal=false +Type=Application +Categories=System;TerminalEmulator; diff --git a/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/tint2rc b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/tint2rc new file mode 100644 index 000000000..e62987609 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/tint2/tint2rc @@ -0,0 +1,100 @@ 
+#------------------------------------- +# Panel +panel_items = TL +panel_size = 100% 60 +panel_margin = 0 0 +panel_padding = 2 0 2 +panel_background_id = 1 +wm_menu = 0 +panel_dock = 0 +panel_position = bottom center horizontal +panel_layer = top +panel_monitor = all +panel_shrink = 0 +autohide = 0 +autohide_show_timeout = 0 +autohide_hide_timeout = 0.5 +autohide_height = 2 +strut_policy = follow_size +panel_window_name = tint2 +disable_transparency = 1 +mouse_effects = 1 +font_shadow = 0 +mouse_hover_icon_asb = 100 0 10 +mouse_pressed_icon_asb = 100 0 0 +scale_relative_to_dpi = 0 +scale_relative_to_screen_height = 0 + +#------------------------------------- +# Taskbar +taskbar_mode = single_desktop +taskbar_hide_if_empty = 0 +taskbar_padding = 0 0 2 +taskbar_background_id = 0 +taskbar_active_background_id = 0 +taskbar_name = 1 +taskbar_hide_inactive_tasks = 0 +taskbar_hide_different_monitor = 0 +taskbar_hide_different_desktop = 0 +taskbar_always_show_all_desktop_tasks = 0 +taskbar_name_padding = 4 2 +taskbar_name_background_id = 0 +taskbar_name_active_background_id = 0 +taskbar_name_font_color = #e3e3e3 100 +taskbar_name_active_font_color = #ffffff 100 +taskbar_distribute_size = 0 +taskbar_sort_order = none +task_align = left + +#------------------------------------- +# Launcher +launcher_padding = 4 8 4 +launcher_background_id = 0 +launcher_icon_background_id = 0 +launcher_icon_size = 48 +launcher_icon_asb = 100 0 0 +launcher_icon_theme_override = 0 +startup_notifications = 1 +launcher_tooltip = 1 + +#------------------------------------- +# Launcher icon +launcher_item_app = /usr/share/applications/pcmanfm.desktop +launcher_item_app = /home/computeruse/.config/tint2/applications/terminal.desktop +launcher_item_app = /home/computeruse/.config/tint2/applications/firefox-custom.desktop +launcher_item_app = /usr/share/applications/xpaint.desktop +launcher_item_app = /usr/share/applications/xpdf.desktop +launcher_item_app = 
/home/computeruse/.config/tint2/applications/gedit.desktop +launcher_item_app = /usr/share/applications/galculator.desktop + +#------------------------------------- +# Background definitions +# ID 1 +rounded = 0 +border_width = 0 +background_color = #000000 60 +border_color = #000000 30 + +# ID 2 +rounded = 4 +border_width = 1 +background_color = #777777 20 +border_color = #777777 30 + +# ID 3 +rounded = 4 +border_width = 1 +background_color = #777777 20 +border_color = #ffffff 40 + +# ID 4 +rounded = 4 +border_width = 1 +background_color = #aa4400 100 +border_color = #aa7733 100 + +# ID 5 +rounded = 4 +border_width = 1 +background_color = #aaaa00 100 +border_color = #aaaa00 100 diff --git a/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/README.md b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/README.md new file mode 100644 index 000000000..571a2a8f5 --- /dev/null +++ b/src/inspect_ai/tool/_tools/_computer/_resources/image_home_dir/README.md @@ -0,0 +1,28 @@ +# About This Image + +This image is based heavily on the image from Anthropic's Computer Use Demo [here](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo/image). + +It has been adapted to launch only those tools required for the Inspect `computer_tool` to interact with the computer via X11 and `xdotool`. + +## Tools Launched + +1. **Xvfb (X Virtual Framebuffer)** + - **Script:** `xvfb_startup.sh` + - **Description:** Xvfb is a display server implementing the X11 display server protocol. It runs in memory and does not require a physical display. This is useful for running graphical applications in a headless environment. + +2. **tint2** + - **Script:** `tint2_startup.sh` + - **Description:** tint2 is a lightweight panel/taskbar. It provides a taskbar, system tray, and application launcher. It is highly configurable and is used to manage and display open applications. + +3. 
#!/bin/bash
# Start the mutter window manager on the current X display and wait until it
# is up. On success, create /tmp/mutter_started so that other processes can
# tell the desktop is ready.
# The shebang is required: this script uses the bash-only ((...)) arithmetic.
echo "starting mutter"
XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log &

# Wait for mutter window properties to appear
timeout=30
while [ $timeout -gt 0 ]; do
    if xdotool search --class "mutter" >/dev/null 2>&1; then
        break
    fi
    sleep 1
    ((timeout--))
done

# The countdown reaching zero means mutter never showed up: dump its stderr
# and fail.
if [ $timeout -eq 0 ]; then
    echo "mutter stderr output:" >&2
    cat /tmp/mutter_stderr.log >&2
    exit 1
fi

touch /tmp/mutter_started
rm /tmp/mutter_stderr.log
#!/bin/bash
# Start the Xvfb virtual framebuffer on $DISPLAY at ${WIDTH}x${HEIGHT}, 24-bit
# depth, unless the display's lock file shows one is already running, and wait
# until the display answers before returning.
set -e  # Exit on error

DPI=96
RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24

# Function to check if Xvfb is already running
# (detected via the lock file for display number $DISPLAY_NUM)
check_xvfb_running() {
    if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then
        return 0  # Xvfb is already running
    else
        return 1  # Xvfb is not running
    fi
}

# Function to check if Xvfb is ready
# Polls xdpyinfo every 0.1s; gives up once more than $timeout seconds elapsed.
wait_for_xvfb() {
    local timeout=10
    local start_time=$(date +%s)
    while ! xdpyinfo >/dev/null 2>&1; do
        if [ $(($(date +%s) - start_time)) -gt $timeout ]; then
            echo "Xvfb failed to start within $timeout seconds" >&2
            return 1
        fi
        sleep 0.1
    done
    return 0
}

# Check if Xvfb is already running
if check_xvfb_running; then
    echo "Xvfb is already running on display ${DISPLAY}"
    exit 0
fi

# Start Xvfb
Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix &
XVFB_PID=$!

# Wait for Xvfb to start
# On failure, kill the background Xvfb process before exiting non-zero.
if wait_for_xvfb; then
    echo "Xvfb started successfully on display ${DISPLAY}"
    echo "Xvfb PID: $XVFB_PID"
else
    echo "Xvfb failed to start"
    kill $XVFB_PID
    exit 1
fi