From a57c20db6438440d97bc0f7a4cc1e9b0a0f736e6 Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Mon, 26 May 2025 12:19:54 -0400 Subject: [PATCH 1/8] TS CU initial --- README.md | 15 +- templates/typescript/computer-use/.gitignore | 39 ++ templates/typescript/computer-use/README.md | 5 + templates/typescript/computer-use/index.ts | 84 +++ templates/typescript/computer-use/loop.ts | 400 ++++++++++++++ .../typescript/computer-use/package.json | 13 + .../typescript/computer-use/tools/computer.ts | 488 ++++++++++++++++++ .../typescript/computer-use/tsconfig.json | 31 ++ 8 files changed, 1074 insertions(+), 1 deletion(-) create mode 100644 templates/typescript/computer-use/.gitignore create mode 100644 templates/typescript/computer-use/README.md create mode 100644 templates/typescript/computer-use/index.ts create mode 100644 templates/typescript/computer-use/loop.ts create mode 100644 templates/typescript/computer-use/package.json create mode 100644 templates/typescript/computer-use/tools/computer.ts create mode 100644 templates/typescript/computer-use/tsconfig.json diff --git a/README.md b/README.md index 8df711d..54e4132 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ create-kernel-app [app-name] [options] - `sample-app`: Basic template with Playwright integration - `browser-use`: Template with Browser Use SDK (Python only) - `stagehand`: Template with Stagehand SDK (Typescript only) + - `persistent-browser`: Implements `sample-app` using a persistent browser + - `computer-use`: Implements a prompt loop using Anthropic Computer Use ### Examples @@ -51,6 +53,11 @@ Create a Typescript application with Stagehand template: npx @onkernel/create-kernel-app my-app --language typescript --template stagehand ``` +Create a Typescript application with Computer Use template: +```bash +npx @onkernel/create-kernel-app my-app --language typescript --template computer-use +``` + Create a Python application with a sample app: ```bash npx @onkernel/create-kernel-app my-app 
--language python --template sample-app @@ -60,6 +67,7 @@ Create a Python application with Browser Use template: ```bash npx @onkernel/create-kernel-app my-app --language python --template browser-use ``` +``` ## Next Steps @@ -82,7 +90,7 @@ export KERNEL_API_KEY= 4. Deploy your application: ```bash # Typscript -kernel deploy index.ts # --env OPENAI_API_KEY=XXX if Stagehand +kernel deploy index.ts # --env OPENAI_API_KEY=XXX if Stagehand; --env ANTHROPIC_API_KEY=XXX if Computer Use # Python kernel deploy main.py # --env OPENAI_API_KEY=XXX if Browser Use @@ -98,6 +106,9 @@ kernel invoke ts-basic get-page-title --payload '{"url": "https://www.google.com # Typescript + Stagehand kernel invoke ts-stagehand stagehand-task --payload '{"query": "Best wired earbuds"}' +# Typescript + Computer Use +kernel invoke ts-cu cu-task --payload '{"query": "Search for the top 3 restaurants in NYC according to Pete Wells"}' + # Python + Sample App kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -114,6 +125,8 @@ These are the sample apps currently available when you run `npx @onkernel/create | **sample-app** | Returns the page title of a specified URL | Playwright | `{ url }` | | **browser-use** | Completes a specified task | Browser Use | `{ task }` | | **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` | +| **persistent-browser** | Implements `sample-app` using a persistent browser | Playwright | `{ url }` | +| **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` | ## Documentation diff --git a/templates/typescript/computer-use/.gitignore b/templates/typescript/computer-use/.gitignore new file mode 100644 index 0000000..9325515 --- /dev/null +++ b/templates/typescript/computer-use/.gitignore @@ -0,0 +1,39 @@ +# Dependencies +node_modules/ +package-lock.json + +# TypeScript +*.tsbuildinfo +dist/ +build/ + +# Environment +.env +.env.local +.env.*.local + +# IDE 
+.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +logs/ +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Testing +coverage/ +.nyc_output/ + +# Misc +.cache/ +.temp/ +.tmp/ \ No newline at end of file diff --git a/templates/typescript/computer-use/README.md b/templates/typescript/computer-use/README.md new file mode 100644 index 0000000..e8075fb --- /dev/null +++ b/templates/typescript/computer-use/README.md @@ -0,0 +1,5 @@ +# Kernel TypeScript Sample App - Computer Use + +This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use. + +See the [docs](https://docs.onkernel.com/quickstart) for information. \ No newline at end of file diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts new file mode 100644 index 0000000..58a4911 --- /dev/null +++ b/templates/typescript/computer-use/index.ts @@ -0,0 +1,84 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { samplingLoop, APIProvider, ToolVersion } from './loop'; +import type { BetaMessageParam } from './loop'; +import type { ToolResult } from './tools/computer'; + +const kernel = new Kernel(); + +const app = kernel.app('ts-cu'); + +interface QueryInput { + query: string; +} + +interface QueryOutput { + result: string; +} + +// Anthropic callbacks for handling loop output +const cuOutputCallback = (block: any) => { + console.log('Output block:', block); +}; + +const cuToolOutputCallback = (result: ToolResult, id: string) => { + console.log('Tool output:', { id, result }); +}; + +const cuApiResponseCallback = (request: any, response: any, error: any) => { + if (error) { + console.error('API error:', error); + } else { + console.log('API response:', { request, response }); + } +}; + +app.action( + 'cu-task', + async (ctx: KernelContext, payload?: QueryInput): Promise => { + if (!payload?.query) { + throw new Error('Query is required'); + } + + const kernelBrowser = await 
kernel.browsers.create({ + invocation_id: ctx.invocation_id, + }); + + console.log("Kernel browser live view url: ", kernelBrowser.browser_live_view_url); + + // Initialize messages with the user's query + const messages: BetaMessageParam[] = [{ + role: 'user', + content: payload.query + }]; + + // Run the sampling loop + const finalMessages = await samplingLoop({ + model: 'claude-3-opus-20240229', + provider: APIProvider.ANTHROPIC, + messages, + outputCallback: cuOutputCallback, + toolOutputCallback: cuToolOutputCallback, + apiResponseCallback: cuApiResponseCallback, + apiKey: process.env.ANTHROPIC_API_KEY || '', + toolVersion: ToolVersion.V20250124, + }); + + // Extract the final result from the messages + if (finalMessages.length === 0) { + throw new Error('No messages were generated during the sampling loop'); + } + + const lastMessage = finalMessages[finalMessages.length - 1]; + if (!lastMessage) { + throw new Error('Failed to get the last message from the sampling loop'); + } + + const result = typeof lastMessage.content === 'string' + ? lastMessage.content + : lastMessage.content.map(block => + block.type === 'text' ? 
block.text : '' + ).join(''); + + return { result }; + }, +); diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts new file mode 100644 index 0000000..d3f4923 --- /dev/null +++ b/templates/typescript/computer-use/loop.ts @@ -0,0 +1,400 @@ +import { Anthropic } from '@anthropic-ai/sdk'; +import { DateTime } from 'luxon'; +import type { ToolResult } from './tools/computer'; +import { Action_20241022, Action_20250124, ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; + +export enum APIProvider { + ANTHROPIC = 'anthropic' +} + +export enum ToolVersion { + V20241022 = '20241022', + V20250124 = '20250124', +} + +export interface BetaMessageParam { + role: 'user' | 'assistant'; + content: BetaContentBlockParam[] | string; +} + +export interface BetaContentBlockParam { + type: string; + text?: string; + name?: string; + input?: Record; + id?: string; + cache_control?: { + type: 'ephemeral'; + }; +} + +export interface BetaToolResultBlockParam { + type: 'tool_result'; + content: (BetaTextBlockParam | BetaImageBlockParam)[] | string; + tool_use_id: string; + is_error: boolean; +} + +export interface BetaTextBlockParam { + type: 'text'; + text: string; +} + +export interface BetaImageBlockParam { + type: 'image'; + source: { + type: 'base64'; + media_type: 'image/png'; + data: string; + }; +} + +export interface BetaMessage { + content: Array<{ + type: string; + text?: string; + name?: string; + input?: Record; + id?: string; + thinking?: any; + signature?: string; + }>; +} + +const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31'; + +// System prompt optimized for the environment +const SYSTEM_PROMPT = ` +* You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. +* When you connect to the display, Chromium is already open. Use that browser to complete your tasks. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. 
Either that, or make sure you scroll down to see everything before deciding something isn't available. +* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. + + + +* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there. +* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly. +`; + +// Tool collection class to manage available tools +class ToolCollection { + private tools: (ComputerTool20241022 | ComputerTool20250124)[]; + + constructor(...tools: (ComputerTool20241022 | ComputerTool20250124)[]) { + this.tools = tools; + } + + toParams(): any[] { + return this.tools.map(tool => tool.toParams()); + } + + async run(name: string, toolInput: { action: Action_20241022 | Action_20250124 } & Record): Promise { + const tool = this.tools.find(t => t.name === name); + if (!tool) { + throw new Error(`Tool ${name} not found`); + } + + // Type guard to ensure action matches the tool version + if (tool instanceof ComputerTool20241022) { + if (!Object.values(Action_20241022).includes(toolInput.action as Action_20241022)) { + throw new Error(`Invalid action ${toolInput.action} for tool version 20241022`); + } + return await tool.call(toolInput as { action: Action_20241022 } & Record); + } else if (tool instanceof ComputerTool20250124) { + if (!Object.values(Action_20250124).includes(toolInput.action as 
Action_20250124)) { + throw new Error(`Invalid action ${toolInput.action} for tool version 20250124`); + } + return await tool.call(toolInput as { action: Action_20250124 } & Record); + } + + throw new Error(`Unsupported tool version for ${name}`); + } +} + +// Tool groups by version +const TOOL_GROUPS_BY_VERSION: Record = { + [ToolVersion.V20241022]: { + tools: [ComputerTool20241022], + beta_flag: 'tools-2024-10-22', + }, + [ToolVersion.V20250124]: { + tools: [ComputerTool20250124], + beta_flag: 'tools-2025-01-24', + }, +}; + +export async function samplingLoop({ + model, + provider, + systemPromptSuffix, + messages, + outputCallback, + toolOutputCallback, + apiResponseCallback, + apiKey, + onlyNMostRecentImages, + maxTokens = 4096, + toolVersion, + thinkingBudget, + tokenEfficientToolsBeta = false, +}: { + model: string; + provider: APIProvider; + systemPromptSuffix?: string; + messages: BetaMessageParam[]; + outputCallback: (block: BetaContentBlockParam) => void; + toolOutputCallback: (result: ToolResult, id: string) => void; + apiResponseCallback: (request: any, response: any, error: any) => void; + apiKey: string; + onlyNMostRecentImages?: number; + maxTokens?: number; + toolVersion: ToolVersion; + thinkingBudget?: number; + tokenEfficientToolsBeta?: boolean; +}): Promise { + const toolGroup = TOOL_GROUPS_BY_VERSION[toolVersion]; + const toolCollection = new ToolCollection(...toolGroup.tools.map(Tool => new Tool())); + + const system: BetaTextBlockParam = { + type: 'text', + text: `${SYSTEM_PROMPT}${systemPromptSuffix ? ' ' + systemPromptSuffix : ''}`, + }; + + while (true) { + let enablePromptCaching = false; + const betas: string[] = toolGroup.beta_flag ? 
[toolGroup.beta_flag] : []; + + if (tokenEfficientToolsBeta) { + betas.push('token-efficient-tools-2025-02-19'); + } + + let imageTruncationThreshold = onlyNMostRecentImages || 0; + let client: Anthropic; + + if (provider === APIProvider.ANTHROPIC) { + client = new Anthropic({ apiKey, maxRetries: 4 }); + enablePromptCaching = true; + } else { + throw new Error(`Unsupported provider: ${provider}`); + } + + if (enablePromptCaching) { + betas.push(PROMPT_CACHING_BETA_FLAG); + injectPromptCaching(messages); + onlyNMostRecentImages = 0; + (system as any).cache_control = { type: 'ephemeral' }; + } + + if (onlyNMostRecentImages) { + maybeFilterToNMostRecentImages( + messages, + onlyNMostRecentImages, + imageTruncationThreshold + ); + } + + const extraBody: Record = {}; + if (thinkingBudget) { + extraBody.thinking = { type: 'enabled', budget_tokens: thinkingBudget }; + } + + try { + // Use beta API for messages + const response = await (client as Anthropic).beta.messages.create({ + max_tokens: maxTokens, + messages: messages as any, // Type assertion needed for beta API + model, + system: [system], + tools: toolCollection.toParams(), + betas, + ...extraBody, + }); + + apiResponseCallback( + response._request_id, + response, + null + ); + + const responseParams = responseToParams(response as unknown as BetaMessage); + + messages.push({ + role: 'assistant', + content: responseParams, + }); + + const toolResultContent: BetaToolResultBlockParam[] = []; + + for (const contentBlock of responseParams) { + outputCallback(contentBlock); + + if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input) { + const toolInput = { + action: contentBlock.input.action as Action_20241022 | Action_20250124, + ...contentBlock.input + }; + + const result = await toolCollection.run( + contentBlock.name, + toolInput + ); + + const toolResult = makeApiToolResult(result, contentBlock.id!); + toolResultContent.push(toolResult); + toolOutputCallback(result, contentBlock.id!); + } 
+ } + + if (toolResultContent.length === 0) { + return messages; + } + + messages.push({ + content: toolResultContent, + role: 'user', + }); + } catch (error: any) { + apiResponseCallback(error.request, error.response || error.body, error); + return messages; + } + } +} + +function maybeFilterToNMostRecentImages( + messages: BetaMessageParam[], + imagesToKeep: number, + minRemovalThreshold: number +): void { + if (imagesToKeep === undefined) return; + + const toolResultBlocks = messages + .flatMap(message => { + if (!message || !Array.isArray(message.content)) return []; + return message.content.filter(item => + typeof item === 'object' && item.type === 'tool_result' + ); + }) as BetaToolResultBlockParam[]; + + let totalImages = 0; + for (const toolResult of toolResultBlocks) { + if (Array.isArray(toolResult.content)) { + totalImages += toolResult.content.filter( + content => typeof content === 'object' && content.type === 'image' + ).length; + } + } + + let imagesToRemove = totalImages - imagesToKeep; + imagesToRemove -= imagesToRemove % minRemovalThreshold; + + for (const toolResult of toolResultBlocks) { + if (Array.isArray(toolResult.content)) { + const newContent = []; + for (const content of toolResult.content) { + if (typeof content === 'object' && content.type === 'image') { + if (imagesToRemove > 0) { + imagesToRemove--; + continue; + } + } + newContent.push(content); + } + toolResult.content = newContent; + } + } +} + +function responseToParams(response: BetaMessage): BetaContentBlockParam[] { + const res: BetaContentBlockParam[] = []; + + for (const block of response.content) { + if (block.type === 'text' && block.text) { + res.push({ type: 'text', text: block.text }); + } else if (block.type === 'thinking') { + const thinkingBlock: any = { + type: 'thinking', + thinking: block.thinking, + }; + if (block.signature) { + thinkingBlock.signature = block.signature; + } + res.push(thinkingBlock); + } else { + res.push(block as BetaContentBlockParam); + } + } 
+ + return res; +} + +function injectPromptCaching(messages: BetaMessageParam[]): void { + let breakpointsRemaining = 3; + + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i]; + if (!message) continue; + if (message.role === 'user' && Array.isArray(message.content)) { + if (breakpointsRemaining > 0) { + breakpointsRemaining--; + const lastContent = message.content[message.content.length - 1]; + if (lastContent) { + (lastContent as any).cache_control = { type: 'ephemeral' }; + } + } else { + const lastContent = message.content[message.content.length - 1]; + if (lastContent) { + delete (lastContent as any).cache_control; + } + break; + } + } + } +} + +function makeApiToolResult( + result: ToolResult, + toolUseId: string +): BetaToolResultBlockParam { + const toolResultContent: (BetaTextBlockParam | BetaImageBlockParam)[] = []; + let isError = false; + + if (result.error) { + isError = true; + toolResultContent.push({ + type: 'text', + text: maybePrependSystemToolResult(result, result.error), + }); + } else { + if (result.output) { + toolResultContent.push({ + type: 'text', + text: maybePrependSystemToolResult(result, result.output), + }); + } + if (result.base64Image) { + toolResultContent.push({ + type: 'image', + source: { + type: 'base64', + media_type: 'image/png', + data: result.base64Image, + }, + }); + } + } + + return { + type: 'tool_result', + content: toolResultContent, + tool_use_id: toolUseId, + is_error: isError, + }; +} + +function maybePrependSystemToolResult(result: ToolResult, resultText: string): string { + if (result.system) { + return `${result.system}\n${resultText}`; + } + return resultText; +} diff --git a/templates/typescript/computer-use/package.json b/templates/typescript/computer-use/package.json new file mode 100644 index 0000000..f5ca974 --- /dev/null +++ b/templates/typescript/computer-use/package.json @@ -0,0 +1,13 @@ +{ + "name": "ts-cu", + "module": "index.ts", + "type": "module", + "private": true, + 
"peerDependencies": { + "typescript": "^5" + }, + "dependencies": { + "@onkernel/sdk": "0.3.0" + } + } + \ No newline at end of file diff --git a/templates/typescript/computer-use/tools/computer.ts b/templates/typescript/computer-use/tools/computer.ts new file mode 100644 index 0000000..3de5ac6 --- /dev/null +++ b/templates/typescript/computer-use/tools/computer.ts @@ -0,0 +1,488 @@ +import { exec } from 'child_process'; +import { promisify } from 'util'; +import { mkdir, readFile } from 'fs/promises'; +import { existsSync } from 'fs'; +import { join } from 'path'; +import { v4 as uuidv4 } from 'uuid'; + +export enum Action_20241022 { + KEY = 'key', + TYPE = 'type', + MOUSE_MOVE = 'mouse_move', + LEFT_CLICK = 'left_click', + LEFT_CLICK_DRAG = 'left_click_drag', + RIGHT_CLICK = 'right_click', + MIDDLE_CLICK = 'middle_click', + DOUBLE_CLICK = 'double_click', + TRIPLE_CLICK = 'triple_click', + SCREENSHOT = 'screenshot', + CURSOR_POSITION = 'cursor_position', +} + +export enum Action_20250124 { + // Include all actions from 20241022 + KEY = Action_20241022.KEY, + TYPE = Action_20241022.TYPE, + MOUSE_MOVE = Action_20241022.MOUSE_MOVE, + LEFT_CLICK = Action_20241022.LEFT_CLICK, + LEFT_CLICK_DRAG = Action_20241022.LEFT_CLICK_DRAG, + RIGHT_CLICK = Action_20241022.RIGHT_CLICK, + MIDDLE_CLICK = Action_20241022.MIDDLE_CLICK, + DOUBLE_CLICK = Action_20241022.DOUBLE_CLICK, + TRIPLE_CLICK = Action_20241022.TRIPLE_CLICK, + SCREENSHOT = Action_20241022.SCREENSHOT, + CURSOR_POSITION = Action_20241022.CURSOR_POSITION, + // Add new actions + LEFT_MOUSE_DOWN = 'left_mouse_down', + LEFT_MOUSE_UP = 'left_mouse_up', + SCROLL = 'scroll', + HOLD_KEY = 'hold_key', + WAIT = 'wait', +} + +export interface ToolResult { + output?: string; + error?: string; + base64Image?: string; + system?: string; +} + +export class ToolError extends Error { + constructor(message: string) { + super(message); + this.name = 'ToolError'; + } +} + +export interface BaseAnthropicTool { + name: string; + apiType: 
string; + toParams(): any; +} + +const execAsync = promisify(exec); + +const OUTPUT_DIR = '/tmp/outputs'; +const TYPING_DELAY_MS = 12; +const TYPING_GROUP_SIZE = 50; + +type ScrollDirection = 'up' | 'down' | 'left' | 'right'; + +interface Resolution { + width: number; + height: number; +} + +const MAX_SCALING_TARGETS: Record = { + XGA: { width: 1024, height: 768 }, // 4:3 + WXGA: { width: 1280, height: 800 }, // 16:10 + FWXGA: { width: 1366, height: 768 }, // ~16:9 +}; + +const CLICK_BUTTONS: Record = { + left_click: 1, + right_click: 3, + middle_click: 2, + double_click: '--repeat 2 --delay 10 1', + triple_click: '--repeat 3 --delay 10 1', +}; + +enum ScalingSource { + COMPUTER = 'computer', + API = 'api', +} + +interface ComputerToolOptions { + display_height_px: number; + display_width_px: number; + display_number: number | null; +} + +function chunks(s: string, chunkSize: number): string[] { + return Array.from({ length: Math.ceil(s.length / chunkSize) }, (_, i) => + s.slice(i * chunkSize, (i + 1) * chunkSize) + ); +} + +export class BaseComputerTool { + name: 'computer' = 'computer'; + width: number; + height: number; + displayNum: number | null; + protected _screenshotDelay = 2.0; + protected _scalingEnabled = true; + protected _displayPrefix: string; + protected xdotool: string; + + constructor() { + this.width = parseInt(process.env.WIDTH || '0'); + this.height = parseInt(process.env.HEIGHT || '0'); + + if (!this.width || !this.height) { + throw new Error('WIDTH, HEIGHT must be set'); + } + + const displayNum = process.env.DISPLAY_NUM; + if (displayNum !== undefined) { + this.displayNum = parseInt(displayNum); + this._displayPrefix = `DISPLAY=:${this.displayNum} `; + } else { + this.displayNum = null; + this._displayPrefix = ''; + } + + this.xdotool = `${this._displayPrefix}xdotool`; + } + + get options(): ComputerToolOptions { + const [width, height] = this.scaleCoordinates( + ScalingSource.COMPUTER, + this.width, + this.height + ); + return { + 
display_width_px: width, + display_height_px: height, + display_number: this.displayNum, + }; + } + + protected async shell(command: string, takeScreenshot = true): Promise { + try { + const { stdout, stderr } = await execAsync(command); + let base64Image: string | undefined; + + if (takeScreenshot) { + await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); + base64Image = (await this.screenshot()).base64Image; + } + + return { output: stdout, error: stderr, base64Image }; + } catch (error) { + throw new ToolError(`Command failed: ${error}`); + } + } + + protected validateAndGetCoordinates(coordinate: [number, number] | null = null): [number, number] { + if (!Array.isArray(coordinate) || coordinate.length !== 2) { + throw new ToolError(`${coordinate} must be a tuple of length 2`); + } + if (!coordinate.every(i => typeof i === 'number' && i >= 0)) { + throw new ToolError(`${coordinate} must be a tuple of non-negative numbers`); + } + + return this.scaleCoordinates(ScalingSource.API, coordinate[0], coordinate[1]); + } + + protected scaleCoordinates(source: ScalingSource, x: number, y: number): [number, number] { + if (!this._scalingEnabled) { + return [x, y]; + } + + const ratio = this.width / this.height; + let targetDimension: Resolution | null = null; + + for (const dimension of Object.values(MAX_SCALING_TARGETS)) { + if (Math.abs(dimension.width / dimension.height - ratio) < 0.02) { + if (dimension.width < this.width) { + targetDimension = dimension; + } + break; + } + } + + if (!targetDimension) { + return [x, y]; + } + + const xScalingFactor = targetDimension.width / this.width; + const yScalingFactor = targetDimension.height / this.height; + + if (source === ScalingSource.API) { + if (x > this.width || y > this.height) { + throw new ToolError(`Coordinates ${x}, ${y} are out of bounds`); + } + return [Math.round(x / xScalingFactor), Math.round(y / yScalingFactor)]; + } + + return [Math.round(x * xScalingFactor), Math.round(y * 
yScalingFactor)]; + } + + async screenshot(): Promise { + const outputDir = OUTPUT_DIR; + await mkdir(outputDir, { recursive: true }); + const path = join(outputDir, `screenshot_${uuidv4()}.png`); + + let screenshotCmd: string; + if (existsSync('/usr/bin/gnome-screenshot')) { + screenshotCmd = `${this._displayPrefix}gnome-screenshot -f ${path} -p`; + } else { + screenshotCmd = `${this._displayPrefix}scrot -p ${path}`; + } + + const result = await this.shell(screenshotCmd, false); + + if (this._scalingEnabled) { + const [x, y] = this.scaleCoordinates(ScalingSource.COMPUTER, this.width, this.height); + await this.shell(`convert ${path} -resize ${x}x${y}! ${path}`, false); + } + + if (existsSync(path)) { + const imageBuffer = await readFile(path); + return { + ...result, + base64Image: imageBuffer.toString('base64'), + }; + } + + throw new ToolError(`Failed to take screenshot: ${result.error}`); + } + + async call(params: { + action: Action_20241022 | Action_20250124; + text?: string; + coordinate?: [number, number]; + scrollDirection?: ScrollDirection; + scrollAmount?: number; + duration?: number; + key?: string; + [key: string]: any; + }): Promise { + const { action, text, coordinate, ...kwargs } = params; + + if (action === Action_20241022.MOUSE_MOVE || action === Action_20241022.LEFT_CLICK_DRAG) { + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + if (text !== undefined) { + throw new ToolError(`text is not accepted for ${action}`); + } + + const [x, y] = this.validateAndGetCoordinates(coordinate); + + if (action === Action_20241022.MOUSE_MOVE) { + return await this.shell(`${this.xdotool} mousemove --sync ${x} ${y}`); + } else { + return await this.shell( + `${this.xdotool} mousedown 1 mousemove --sync ${x} ${y} mouseup 1` + ); + } + } + + if (action === Action_20241022.KEY || action === Action_20241022.TYPE) { + if (text === undefined) { + throw new ToolError(`text is required for ${action}`); + } + if (coordinate !== 
undefined) { + throw new ToolError(`coordinate is not accepted for ${action}`); + } + if (typeof text !== 'string') { + throw new ToolError(`${text} must be a string`); + } + + if (action === Action_20241022.KEY) { + return await this.shell(`${this.xdotool} key -- ${text}`); + } else { + const results: ToolResult[] = []; + for (const chunk of chunks(text, TYPING_GROUP_SIZE)) { + const escapedChunk = chunk.replace(/'/g, "'\\''"); + results.push( + await this.shell( + `${this.xdotool} type --delay ${TYPING_DELAY_MS} -- '${escapedChunk}'`, + false + ) + ); + } + const screenshot = await this.screenshot(); + return { + output: results.map(r => r.output || '').join(''), + error: results.map(r => r.error || '').join(''), + base64Image: screenshot.base64Image, + }; + } + } + + if ([ + Action_20241022.LEFT_CLICK, + Action_20241022.RIGHT_CLICK, + Action_20241022.DOUBLE_CLICK, + Action_20241022.MIDDLE_CLICK, + Action_20241022.SCREENSHOT, + Action_20241022.CURSOR_POSITION, + ].includes(action as Action_20241022)) { + if (text !== undefined) { + throw new ToolError(`text is not accepted for ${action}`); + } + if (coordinate !== undefined) { + throw new ToolError(`coordinate is not accepted for ${action}`); + } + + if (action === Action_20241022.SCREENSHOT) { + return await this.screenshot(); + } else if (action === Action_20241022.CURSOR_POSITION) { + const result = await this.shell(`${this.xdotool} getmouselocation --shell`, false); + const output = result.output || ''; + const xMatch = output.match(/X=(\d+)/); + const yMatch = output.match(/Y=(\d+)/); + + if (!xMatch?.[1] || !yMatch?.[1]) { + throw new ToolError('Failed to parse cursor position'); + } + + const x = parseInt(xMatch[1], 10); + const y = parseInt(yMatch[1], 10); + + if (isNaN(x) || isNaN(y)) { + throw new ToolError('Invalid cursor position values'); + } + + const [scaledX, scaledY] = this.scaleCoordinates(ScalingSource.COMPUTER, x, y); + return { ...result, output: `X=${scaledX},Y=${scaledY}` }; + } else { + 
return await this.shell(`${this.xdotool} click ${CLICK_BUTTONS[action]}`); + } + } + + throw new ToolError(`Invalid action: ${action}`); + } +} + +export class ComputerTool20241022 extends BaseComputerTool implements BaseAnthropicTool { + apiType: 'computer_20241022' = 'computer_20241022'; + + toParams(): any { + return { + name: this.name, + type: this.apiType, + ...this.options, + }; + } +} + +export class ComputerTool20250124 extends BaseComputerTool implements BaseAnthropicTool { + apiType: 'computer_20250124' = 'computer_20250124'; + + toParams(): any { + return { + name: this.name, + type: this.apiType, + ...this.options, + }; + } + + async call(params: { + action: Action_20250124; + text?: string; + coordinate?: [number, number]; + scrollDirection?: ScrollDirection; + scrollAmount?: number; + duration?: number; + key?: string; + [key: string]: any; + }): Promise { + const { action, text, coordinate, scrollDirection, scrollAmount, duration, key, ...kwargs } = params; + + if (action === Action_20250124.LEFT_MOUSE_DOWN || action === Action_20250124.LEFT_MOUSE_UP) { + if (coordinate !== undefined) { + throw new ToolError(`coordinate is not accepted for ${action}`); + } + const command = `${this.xdotool} ${action === Action_20250124.LEFT_MOUSE_DOWN ? 
'mousedown' : 'mouseup'} 1`; + return await this.shell(command); + } + + if (action === Action_20250124.SCROLL) { + if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) { + throw new ToolError(`${scrollDirection} must be 'up', 'down', 'left', or 'right'`); + } + if (typeof scrollAmount !== 'number' || scrollAmount < 0) { + throw new ToolError(`${scrollAmount} must be a non-negative number`); + } + + let mouseMovePart = ''; + if (coordinate !== undefined) { + const [x, y] = this.validateAndGetCoordinates(coordinate); + mouseMovePart = `mousemove --sync ${x} ${y}`; + } + + const scrollButton = { + up: 4, + down: 5, + left: 6, + right: 7, + }[scrollDirection]; + + const commandParts = [this.xdotool, mouseMovePart]; + if (text) { + commandParts.push(`keydown ${text}`); + } + commandParts.push(`click --repeat ${scrollAmount} ${scrollButton}`); + if (text) { + commandParts.push(`keyup ${text}`); + } + + return await this.shell(commandParts.join(' ')); + } + + if (action === Action_20250124.HOLD_KEY || action === Action_20250124.WAIT) { + if (duration === undefined || typeof duration !== 'number') { + throw new ToolError(`${duration} must be a number`); + } + if (duration < 0) { + throw new ToolError(`${duration} must be non-negative`); + } + if (duration > 100) { + throw new ToolError(`${duration} is too long`); + } + + if (action === Action_20250124.HOLD_KEY) { + if (text === undefined) { + throw new ToolError(`text is required for ${action}`); + } + const escapedKeys = text.replace(/'/g, "'\\''"); + const commandParts = [ + this.xdotool, + `keydown '${escapedKeys}'`, + `sleep ${duration}`, + `keyup '${escapedKeys}'`, + ]; + return await this.shell(commandParts.join(' ')); + } + + if (action === Action_20250124.WAIT) { + await new Promise(resolve => setTimeout(resolve, duration * 1000)); + return await this.screenshot(); + } + } + + if ([ + Action_20250124.LEFT_CLICK, + Action_20250124.RIGHT_CLICK, + Action_20250124.DOUBLE_CLICK, + 
Action_20250124.TRIPLE_CLICK, + Action_20250124.MIDDLE_CLICK, + ].includes(action)) { + if (text !== undefined) { + throw new ToolError(`text is not accepted for ${action}`); + } + + let mouseMovePart = ''; + if (coordinate !== undefined) { + const [x, y] = this.validateAndGetCoordinates(coordinate); + mouseMovePart = `mousemove --sync ${x} ${y}`; + } + + const commandParts = [this.xdotool, mouseMovePart]; + if (key) { + commandParts.push(`keydown ${key}`); + } + commandParts.push(`click ${CLICK_BUTTONS[action]}`); + if (key) { + commandParts.push(`keyup ${key}`); + } + + return await this.shell(commandParts.join(' ')); + } + + return await super.call(params); + } +} diff --git a/templates/typescript/computer-use/tsconfig.json b/templates/typescript/computer-use/tsconfig.json new file mode 100644 index 0000000..39959d0 --- /dev/null +++ b/templates/typescript/computer-use/tsconfig.json @@ -0,0 +1,31 @@ +{ + "compilerOptions": { + // Environment setup & latest features + "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + + // Bundler mode + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + + // Best practices + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + + // Some stricter flags (disabled by default) + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false + }, + "include": ["./**/*.ts", "./**/*.tsx"], + "exclude": ["node_modules", "dist"] +} + \ No newline at end of file From 4240eb318c245cfe18cb916e766fcde25385bc27 Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Mon, 26 May 2025 16:19:14 -0400 Subject: [PATCH 2/8] Replace xdotool and gnome-screenshot with playwright --- templates/typescript/computer-use/index.ts | 28 +- templates/typescript/computer-use/loop.ts | 105 ++-- 
.../typescript/computer-use/package.json | 5 +- .../typescript/computer-use/pnpm-lock.yaml | 79 +++ .../typescript/computer-use/tools/computer.ts | 552 ++++++------------ 5 files changed, 340 insertions(+), 429 deletions(-) create mode 100644 templates/typescript/computer-use/pnpm-lock.yaml diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts index 58a4911..2d1268c 100644 --- a/templates/typescript/computer-use/index.ts +++ b/templates/typescript/computer-use/index.ts @@ -1,7 +1,7 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; -import { samplingLoop, APIProvider, ToolVersion } from './loop'; -import type { BetaMessageParam } from './loop'; +import { samplingLoop } from './loop'; import type { ToolResult } from './tools/computer'; +import { chromium } from 'playwright'; const kernel = new Kernel(); @@ -45,24 +45,30 @@ app.action( console.log("Kernel browser live view url: ", kernelBrowser.browser_live_view_url); - // Initialize messages with the user's query - const messages: BetaMessageParam[] = [{ - role: 'user', - content: payload.query - }]; + const browser = await chromium.connectOverCDP(kernelBrowser.cdp_ws_url); + const context = await browser.contexts()[0]; + const page = await context?.pages()[0]; + if (!page) { + throw new Error('Error getting initial page'); + } + await page.waitForTimeout(10000); // Run the sampling loop const finalMessages = await samplingLoop({ - model: 'claude-3-opus-20240229', - provider: APIProvider.ANTHROPIC, - messages, + model: 'claude-sonnet-4-20250514', + messages: [{ + role: 'user', + content: payload.query + }], outputCallback: cuOutputCallback, toolOutputCallback: cuToolOutputCallback, apiResponseCallback: cuApiResponseCallback, apiKey: process.env.ANTHROPIC_API_KEY || '', - toolVersion: ToolVersion.V20250124, + playwrightPage: page, }); + await browser.close(); + // Extract the final result from the messages if (finalMessages.length === 0) { throw new Error('No 
messages were generated during the sampling loop'); diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts index d3f4923..902f0fc 100644 --- a/templates/typescript/computer-use/loop.ts +++ b/templates/typescript/computer-use/loop.ts @@ -1,15 +1,44 @@ import { Anthropic } from '@anthropic-ai/sdk'; import { DateTime } from 'luxon'; import type { ToolResult } from './tools/computer'; -import { Action_20241022, Action_20250124, ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; +import { ComputerTool20241022, ComputerTool20250124, Action } from './tools/computer'; +import type { Page } from 'playwright'; -export enum APIProvider { - ANTHROPIC = 'anthropic' +export type ToolVersion = 'computer_use_20250124' | 'computer_use_20241022' | 'computer_use_20250429'; +export type BetaFlag = 'computer-use-2024-10-22' | 'computer-use-2025-01-24' | 'computer-use-2025-04-29'; + +const DEFAULT_TOOL_VERSION: ToolVersion = 'computer_use_20250124'; + +interface ToolGroup { + readonly version: ToolVersion; + readonly tools: (typeof ComputerTool20241022 | typeof ComputerTool20250124)[]; + readonly beta_flag: BetaFlag | null; } -export enum ToolVersion { - V20241022 = '20241022', - V20250124 = '20250124', +const TOOL_GROUPS: ToolGroup[] = [ + { + version: 'computer_use_20241022', + tools: [ComputerTool20241022], + beta_flag: 'computer-use-2024-10-22', + }, + { + version: 'computer_use_20250124', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, + { + version: 'computer_use_20250429', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, +]; + +const TOOL_GROUPS_BY_VERSION: Record = Object.fromEntries( + TOOL_GROUPS.map(group => [group.version, group]) +) as Record; + +export enum APIProvider { + ANTHROPIC = 'anthropic' } export interface BetaMessageParam { @@ -66,7 +95,7 @@ const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31'; // System prompt optimized for the 
environment const SYSTEM_PROMPT = ` * You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. -* When you connect to the display, Chromium is already open. Use that browser to complete your tasks. +* When you connect to the display, CHROMIUM IS ALREADY OPEN. The url bar is not visible but it is there. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. @@ -86,10 +115,16 @@ class ToolCollection { } toParams(): any[] { - return this.tools.map(tool => tool.toParams()); + const params = this.tools.map(tool => { + const toolParams = tool.toParams(); + console.log('Individual tool params:', JSON.stringify(toolParams, null, 2)); + return toolParams; + }); + console.log('All tool params:', JSON.stringify(params, null, 2)); + return params; } - async run(name: string, toolInput: { action: Action_20241022 | Action_20250124 } & Record): Promise { + async run(name: string, toolInput: { action: Action } & Record): Promise { const tool = this.tools.find(t => t.name === name); if (!tool) { throw new Error(`Tool ${name} not found`); @@ -97,36 +132,23 @@ class ToolCollection { // Type guard to ensure action matches the tool version if (tool instanceof ComputerTool20241022) { - if (!Object.values(Action_20241022).includes(toolInput.action as Action_20241022)) { + if (!Object.values(Action).includes(toolInput.action)) { throw new Error(`Invalid action ${toolInput.action} for tool version 20241022`); } - return await tool.call(toolInput as { action: Action_20241022 } & Record); + return await tool.call(toolInput); } else if (tool instanceof 
ComputerTool20250124) { - if (!Object.values(Action_20250124).includes(toolInput.action as Action_20250124)) { + if (!Object.values(Action).includes(toolInput.action)) { throw new Error(`Invalid action ${toolInput.action} for tool version 20250124`); } - return await tool.call(toolInput as { action: Action_20250124 } & Record); + return await tool.call(toolInput); } throw new Error(`Unsupported tool version for ${name}`); } } -// Tool groups by version -const TOOL_GROUPS_BY_VERSION: Record = { - [ToolVersion.V20241022]: { - tools: [ComputerTool20241022], - beta_flag: 'tools-2024-10-22', - }, - [ToolVersion.V20250124]: { - tools: [ComputerTool20250124], - beta_flag: 'tools-2025-01-24', - }, -}; - export async function samplingLoop({ model, - provider, systemPromptSuffix, messages, outputCallback, @@ -138,9 +160,9 @@ export async function samplingLoop({ toolVersion, thinkingBudget, tokenEfficientToolsBeta = false, + playwrightPage, }: { model: string; - provider: APIProvider; systemPromptSuffix?: string; messages: BetaMessageParam[]; outputCallback: (block: BetaContentBlockParam) => void; @@ -149,12 +171,14 @@ export async function samplingLoop({ apiKey: string; onlyNMostRecentImages?: number; maxTokens?: number; - toolVersion: ToolVersion; + toolVersion?: ToolVersion; thinkingBudget?: number; tokenEfficientToolsBeta?: boolean; + playwrightPage: Page; }): Promise { - const toolGroup = TOOL_GROUPS_BY_VERSION[toolVersion]; - const toolCollection = new ToolCollection(...toolGroup.tools.map(Tool => new Tool())); + const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; + const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; + const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(playwrightPage))); const system: BetaTextBlockParam = { type: 'text', @@ -162,7 +186,6 @@ export async function samplingLoop({ }; while (true) { - let enablePromptCaching = false; const betas: 
string[] = toolGroup.beta_flag ? [toolGroup.beta_flag] : []; if (tokenEfficientToolsBeta) { @@ -170,15 +193,10 @@ export async function samplingLoop({ } let imageTruncationThreshold = onlyNMostRecentImages || 0; - let client: Anthropic; - - if (provider === APIProvider.ANTHROPIC) { - client = new Anthropic({ apiKey, maxRetries: 4 }); - enablePromptCaching = true; - } else { - throw new Error(`Unsupported provider: ${provider}`); - } + const client = new Anthropic({ apiKey, maxRetries: 4 }); + const enablePromptCaching = true; + if (enablePromptCaching) { betas.push(PROMPT_CACHING_BETA_FLAG); injectPromptCaching(messages); @@ -199,14 +217,17 @@ export async function samplingLoop({ extraBody.thinking = { type: 'enabled', budget_tokens: thinkingBudget }; } + const toolParams = toolCollection.toParams(); + console.log('Tool parameters being sent to Anthropic:', JSON.stringify(toolParams, null, 2)); + try { // Use beta API for messages - const response = await (client as Anthropic).beta.messages.create({ + const response = await client.beta.messages.create({ max_tokens: maxTokens, messages: messages as any, // Type assertion needed for beta API model, system: [system], - tools: toolCollection.toParams(), + tools: toolParams, betas, ...extraBody, }); @@ -231,7 +252,7 @@ export async function samplingLoop({ if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input) { const toolInput = { - action: contentBlock.input.action as Action_20241022 | Action_20250124, + action: contentBlock.input.action as Action, ...contentBlock.input }; diff --git a/templates/typescript/computer-use/package.json b/templates/typescript/computer-use/package.json index f5ca974..53eac73 100644 --- a/templates/typescript/computer-use/package.json +++ b/templates/typescript/computer-use/package.json @@ -7,7 +7,10 @@ "typescript": "^5" }, "dependencies": { - "@onkernel/sdk": "0.3.0" + "@onkernel/sdk": "0.3.0", + "playwright": "^1.52.0", + "@anthropic-ai/sdk": "0.52.0", + "luxon": 
"3.6.0" } } \ No newline at end of file diff --git a/templates/typescript/computer-use/pnpm-lock.yaml b/templates/typescript/computer-use/pnpm-lock.yaml new file mode 100644 index 0000000..05dca11 --- /dev/null +++ b/templates/typescript/computer-use/pnpm-lock.yaml @@ -0,0 +1,79 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@anthropic-ai/sdk': + specifier: 0.52.0 + version: 0.52.0 + '@onkernel/sdk': + specifier: 0.3.0 + version: 0.3.0 + luxon: + specifier: 3.6.0 + version: 3.6.0 + playwright: + specifier: ^1.52.0 + version: 1.52.0 + typescript: + specifier: ^5 + version: 5.8.3 + +packages: + + '@anthropic-ai/sdk@0.52.0': + resolution: {integrity: sha512-d4c+fg+xy9e46c8+YnrrgIQR45CZlAi7PwdzIfDXDM6ACxEZli1/fxhURsq30ZpMZy6LvSkr41jGq5aF5TD7rQ==} + hasBin: true + + '@onkernel/sdk@0.3.0': + resolution: {integrity: sha512-15hL/6qqM+wA63cR5O8dtUdOBtcPmOZylB2zgq7q2nGJrqzCS99M2onyrjV31CDLowf9bmnHHjXMLzWvcO7L0g==} + + fsevents@2.3.2: + resolution: {integrity: sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + luxon@3.6.0: + resolution: {integrity: sha512-WE7p0p7W1xji9qxkLYsvcIxZyfP48GuFrWIBQZIsbjCyf65dG1rv4n83HcOyEyhvzxJCrUoObCRNFgRNIQ5KNA==} + engines: {node: '>=12'} + + playwright-core@1.52.0: + resolution: {integrity: sha512-l2osTgLXSMeuLZOML9qYODUQoPPnUsKsb5/P6LJ2e6uPKXUdPK5WYhN4z03G+YNbWmGDY4YENauNu4ZKczreHg==} + engines: {node: '>=18'} + hasBin: true + + playwright@1.52.0: + resolution: {integrity: sha512-JAwMNMBlxJ2oD1kce4KPtMkDeKGHQstdpFPcPH3maElAXon/QZeTvtsfXmTMRyO9TslfoYOXkSsvao2nE1ilTw==} + engines: {node: '>=18'} + hasBin: true + + typescript@5.8.3: + resolution: {integrity: sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==} + engines: {node: '>=14.17'} + hasBin: true + +snapshots: + + '@anthropic-ai/sdk@0.52.0': 
{} + + '@onkernel/sdk@0.3.0': {} + + fsevents@2.3.2: + optional: true + + luxon@3.6.0: {} + + playwright-core@1.52.0: {} + + playwright@1.52.0: + dependencies: + playwright-core: 1.52.0 + optionalDependencies: + fsevents: 2.3.2 + + typescript@5.8.3: {} diff --git a/templates/typescript/computer-use/tools/computer.ts b/templates/typescript/computer-use/tools/computer.ts index 3de5ac6..914ac5b 100644 --- a/templates/typescript/computer-use/tools/computer.ts +++ b/templates/typescript/computer-use/tools/computer.ts @@ -1,11 +1,7 @@ -import { exec } from 'child_process'; -import { promisify } from 'util'; -import { mkdir, readFile } from 'fs/promises'; -import { existsSync } from 'fs'; -import { join } from 'path'; -import { v4 as uuidv4 } from 'uuid'; - -export enum Action_20241022 { +import type { Page } from 'playwright'; + +export enum Action { + // Base actions KEY = 'key', TYPE = 'type', MOUSE_MOVE = 'mouse_move', @@ -17,22 +13,7 @@ export enum Action_20241022 { TRIPLE_CLICK = 'triple_click', SCREENSHOT = 'screenshot', CURSOR_POSITION = 'cursor_position', -} - -export enum Action_20250124 { - // Include all actions from 20241022 - KEY = Action_20241022.KEY, - TYPE = Action_20241022.TYPE, - MOUSE_MOVE = Action_20241022.MOUSE_MOVE, - LEFT_CLICK = Action_20241022.LEFT_CLICK, - LEFT_CLICK_DRAG = Action_20241022.LEFT_CLICK_DRAG, - RIGHT_CLICK = Action_20241022.RIGHT_CLICK, - MIDDLE_CLICK = Action_20241022.MIDDLE_CLICK, - DOUBLE_CLICK = Action_20241022.DOUBLE_CLICK, - TRIPLE_CLICK = Action_20241022.TRIPLE_CLICK, - SCREENSHOT = Action_20241022.SCREENSHOT, - CURSOR_POSITION = Action_20241022.CURSOR_POSITION, - // Add new actions + // Extended actions (20250124) LEFT_MOUSE_DOWN = 'left_mouse_down', LEFT_MOUSE_UP = 'left_mouse_up', SCROLL = 'scroll', @@ -40,6 +21,10 @@ export enum Action_20250124 { WAIT = 'wait', } +// For backward compatibility +export type Action_20241022 = Action; +export type Action_20250124 = Action; + export interface ToolResult { output?: string; 
error?: string; @@ -60,107 +45,35 @@ export interface BaseAnthropicTool { toParams(): any; } -const execAsync = promisify(exec); - -const OUTPUT_DIR = '/tmp/outputs'; const TYPING_DELAY_MS = 12; -const TYPING_GROUP_SIZE = 50; type ScrollDirection = 'up' | 'down' | 'left' | 'right'; -interface Resolution { - width: number; - height: number; -} - -const MAX_SCALING_TARGETS: Record = { - XGA: { width: 1024, height: 768 }, // 4:3 - WXGA: { width: 1280, height: 800 }, // 16:10 - FWXGA: { width: 1366, height: 768 }, // ~16:9 -}; - -const CLICK_BUTTONS: Record = { - left_click: 1, - right_click: 3, - middle_click: 2, - double_click: '--repeat 2 --delay 10 1', - triple_click: '--repeat 3 --delay 10 1', -}; - -enum ScalingSource { - COMPUTER = 'computer', - API = 'api', -} - -interface ComputerToolOptions { - display_height_px: number; - display_width_px: number; - display_number: number | null; -} - -function chunks(s: string, chunkSize: number): string[] { - return Array.from({ length: Math.ceil(s.length / chunkSize) }, (_, i) => - s.slice(i * chunkSize, (i + 1) * chunkSize) - ); -} - -export class BaseComputerTool { +export class ComputerTool implements BaseAnthropicTool { name: 'computer' = 'computer'; - width: number; - height: number; - displayNum: number | null; + protected page: Page; protected _screenshotDelay = 2.0; - protected _scalingEnabled = true; - protected _displayPrefix: string; - protected xdotool: string; - - constructor() { - this.width = parseInt(process.env.WIDTH || '0'); - this.height = parseInt(process.env.HEIGHT || '0'); - - if (!this.width || !this.height) { - throw new Error('WIDTH, HEIGHT must be set'); - } + protected version: '20241022' | '20250124'; - const displayNum = process.env.DISPLAY_NUM; - if (displayNum !== undefined) { - this.displayNum = parseInt(displayNum); - this._displayPrefix = `DISPLAY=:${this.displayNum} `; - } else { - this.displayNum = null; - this._displayPrefix = ''; - } - - this.xdotool = `${this._displayPrefix}xdotool`; 
+ constructor(page: Page, version: '20241022' | '20250124' = '20250124') { + this.page = page; + this.version = version; } - get options(): ComputerToolOptions { - const [width, height] = this.scaleCoordinates( - ScalingSource.COMPUTER, - this.width, - this.height - ); - return { - display_width_px: width, - display_height_px: height, - display_number: this.displayNum, - }; + get apiType(): 'computer_20241022' | 'computer_20250124' { + return this.version === '20241022' ? 'computer_20241022' : 'computer_20250124'; } - protected async shell(command: string, takeScreenshot = true): Promise { - try { - const { stdout, stderr } = await execAsync(command); - let base64Image: string | undefined; - - if (takeScreenshot) { - await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); - base64Image = (await this.screenshot()).base64Image; - } - - return { output: stdout, error: stderr, base64Image }; - } catch (error) { - throw new ToolError(`Command failed: ${error}`); - } + toParams(): any { + const params = { + name: this.name, + type: this.apiType, + display_width_px: 1280, + display_height_px: 720, + display_number: null, + }; + console.log('ComputerTool toParams:', JSON.stringify(params, null, 2)); + return params; } protected validateAndGetCoordinates(coordinate: [number, number] | null = null): [number, number] { @@ -170,76 +83,23 @@ export class BaseComputerTool { if (!coordinate.every(i => typeof i === 'number' && i >= 0)) { throw new ToolError(`${coordinate} must be a tuple of non-negative numbers`); } - - return this.scaleCoordinates(ScalingSource.API, coordinate[0], coordinate[1]); - } - - protected scaleCoordinates(source: ScalingSource, x: number, y: number): [number, number] { - if (!this._scalingEnabled) { - return [x, y]; - } - - const ratio = this.width / this.height; - let targetDimension: Resolution | null = null; - - for (const dimension of Object.values(MAX_SCALING_TARGETS)) { - if (Math.abs(dimension.width / dimension.height - 
ratio) < 0.02) { - if (dimension.width < this.width) { - targetDimension = dimension; - } - break; - } - } - - if (!targetDimension) { - return [x, y]; - } - - const xScalingFactor = targetDimension.width / this.width; - const yScalingFactor = targetDimension.height / this.height; - - if (source === ScalingSource.API) { - if (x > this.width || y > this.height) { - throw new ToolError(`Coordinates ${x}, ${y} are out of bounds`); - } - return [Math.round(x / xScalingFactor), Math.round(y / yScalingFactor)]; - } - - return [Math.round(x * xScalingFactor), Math.round(y * yScalingFactor)]; + return coordinate; } async screenshot(): Promise { - const outputDir = OUTPUT_DIR; - await mkdir(outputDir, { recursive: true }); - const path = join(outputDir, `screenshot_${uuidv4()}.png`); - - let screenshotCmd: string; - if (existsSync('/usr/bin/gnome-screenshot')) { - screenshotCmd = `${this._displayPrefix}gnome-screenshot -f ${path} -p`; - } else { - screenshotCmd = `${this._displayPrefix}scrot -p ${path}`; - } - - const result = await this.shell(screenshotCmd, false); - - if (this._scalingEnabled) { - const [x, y] = this.scaleCoordinates(ScalingSource.COMPUTER, this.width, this.height); - await this.shell(`convert ${path} -resize ${x}x${y}! 
${path}`, false); - } - - if (existsSync(path)) { - const imageBuffer = await readFile(path); + try { + await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); + const screenshot = await this.page.screenshot({ type: 'png' }); return { - ...result, - base64Image: imageBuffer.toString('base64'), + base64Image: screenshot.toString('base64'), }; + } catch (error) { + throw new ToolError(`Failed to take screenshot: ${error}`); } - - throw new ToolError(`Failed to take screenshot: ${result.error}`); } async call(params: { - action: Action_20241022 | Action_20250124; + action: Action; text?: string; coordinate?: [number, number]; scrollDirection?: ScrollDirection; @@ -248,149 +108,36 @@ export class BaseComputerTool { key?: string; [key: string]: any; }): Promise { - const { action, text, coordinate, ...kwargs } = params; - - if (action === Action_20241022.MOUSE_MOVE || action === Action_20241022.LEFT_CLICK_DRAG) { - if (!coordinate) { - throw new ToolError(`coordinate is required for ${action}`); - } - if (text !== undefined) { - throw new ToolError(`text is not accepted for ${action}`); - } + console.log('ComputerTool.call called with params:', JSON.stringify(params, null, 2)); + const { action, text, coordinate, scrollDirection, scrollAmount, duration, ...kwargs } = params; - const [x, y] = this.validateAndGetCoordinates(coordinate); - - if (action === Action_20241022.MOUSE_MOVE) { - return await this.shell(`${this.xdotool} mousemove --sync ${x} ${y}`); - } else { - return await this.shell( - `${this.xdotool} mousedown 1 mousemove --sync ${x} ${y} mouseup 1` - ); - } + if (action === Action.SCREENSHOT) { + this.validateText(text, false, action); + this.validateCoordinate(coordinate, false, action); + return await this.screenshot(); } - if (action === Action_20241022.KEY || action === Action_20241022.TYPE) { - if (text === undefined) { - throw new ToolError(`text is required for ${action}`); - } - if (coordinate !== undefined) { - throw new 
ToolError(`coordinate is not accepted for ${action}`); - } - if (typeof text !== 'string') { - throw new ToolError(`${text} must be a string`); - } - - if (action === Action_20241022.KEY) { - return await this.shell(`${this.xdotool} key -- ${text}`); - } else { - const results: ToolResult[] = []; - for (const chunk of chunks(text, TYPING_GROUP_SIZE)) { - const escapedChunk = chunk.replace(/'/g, "'\\''"); - results.push( - await this.shell( - `${this.xdotool} type --delay ${TYPING_DELAY_MS} -- '${escapedChunk}'`, - false - ) - ); - } - const screenshot = await this.screenshot(); - return { - output: results.map(r => r.output || '').join(''), - error: results.map(r => r.error || '').join(''), - base64Image: screenshot.base64Image, - }; + if (action === Action.CURSOR_POSITION) { + this.validateText(text, false, action); + this.validateCoordinate(coordinate, false, action); + const position = await this.page.evaluate(() => { + const selection = window.getSelection(); + const range = selection?.getRangeAt(0); + const rect = range?.getBoundingClientRect(); + return rect ? 
{ x: rect.x, y: rect.y } : null; + }); + + if (!position) { + throw new ToolError('Failed to get cursor position'); } + + return { output: `X=${position.x},Y=${position.y}` }; } - if ([ - Action_20241022.LEFT_CLICK, - Action_20241022.RIGHT_CLICK, - Action_20241022.DOUBLE_CLICK, - Action_20241022.MIDDLE_CLICK, - Action_20241022.SCREENSHOT, - Action_20241022.CURSOR_POSITION, - ].includes(action as Action_20241022)) { - if (text !== undefined) { - throw new ToolError(`text is not accepted for ${action}`); - } - if (coordinate !== undefined) { - throw new ToolError(`coordinate is not accepted for ${action}`); + if (action === Action.SCROLL) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available in version 20250124`); } - - if (action === Action_20241022.SCREENSHOT) { - return await this.screenshot(); - } else if (action === Action_20241022.CURSOR_POSITION) { - const result = await this.shell(`${this.xdotool} getmouselocation --shell`, false); - const output = result.output || ''; - const xMatch = output.match(/X=(\d+)/); - const yMatch = output.match(/Y=(\d+)/); - - if (!xMatch?.[1] || !yMatch?.[1]) { - throw new ToolError('Failed to parse cursor position'); - } - - const x = parseInt(xMatch[1], 10); - const y = parseInt(yMatch[1], 10); - - if (isNaN(x) || isNaN(y)) { - throw new ToolError('Invalid cursor position values'); - } - - const [scaledX, scaledY] = this.scaleCoordinates(ScalingSource.COMPUTER, x, y); - return { ...result, output: `X=${scaledX},Y=${scaledY}` }; - } else { - return await this.shell(`${this.xdotool} click ${CLICK_BUTTONS[action]}`); - } - } - - throw new ToolError(`Invalid action: ${action}`); - } -} - -export class ComputerTool20241022 extends BaseComputerTool implements BaseAnthropicTool { - apiType: 'computer_20241022' = 'computer_20241022'; - - toParams(): any { - return { - name: this.name, - type: this.apiType, - ...this.options, - }; - } -} - -export class ComputerTool20250124 extends BaseComputerTool 
implements BaseAnthropicTool { - apiType: 'computer_20250124' = 'computer_20250124'; - - toParams(): any { - return { - name: this.name, - type: this.apiType, - ...this.options, - }; - } - - async call(params: { - action: Action_20250124; - text?: string; - coordinate?: [number, number]; - scrollDirection?: ScrollDirection; - scrollAmount?: number; - duration?: number; - key?: string; - [key: string]: any; - }): Promise { - const { action, text, coordinate, scrollDirection, scrollAmount, duration, key, ...kwargs } = params; - - if (action === Action_20250124.LEFT_MOUSE_DOWN || action === Action_20250124.LEFT_MOUSE_UP) { - if (coordinate !== undefined) { - throw new ToolError(`coordinate is not accepted for ${action}`); - } - const command = `${this.xdotool} ${action === Action_20250124.LEFT_MOUSE_DOWN ? 'mousedown' : 'mouseup'} 1`; - return await this.shell(command); - } - - if (action === Action_20250124.SCROLL) { if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) { throw new ToolError(`${scrollDirection} must be 'up', 'down', 'left', or 'right'`); } @@ -398,91 +145,146 @@ export class ComputerTool20250124 extends BaseComputerTool implements BaseAnthro throw new ToolError(`${scrollAmount} must be a non-negative number`); } - let mouseMovePart = ''; - if (coordinate !== undefined) { + if (coordinate) { const [x, y] = this.validateAndGetCoordinates(coordinate); - mouseMovePart = `mousemove --sync ${x} ${y}`; + await this.page.mouse.move(x, y); } - const scrollButton = { - up: 4, - down: 5, - left: 6, - right: 7, - }[scrollDirection]; + const amount = scrollAmount || 100; + await this.page.mouse.wheel(0, scrollDirection === 'down' ? 
amount : -amount); + return await this.screenshot(); + } - const commandParts = [this.xdotool, mouseMovePart]; - if (text) { - commandParts.push(`keydown ${text}`); + if (action === Action.WAIT) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available in version 20250124`); } - commandParts.push(`click --repeat ${scrollAmount} ${scrollButton}`); - if (text) { - commandParts.push(`keyup ${text}`); - } - - return await this.shell(commandParts.join(' ')); + this.validateDuration(duration, action); + await new Promise(resolve => setTimeout(resolve, duration! * 1000)); + return await this.screenshot(); } - if (action === Action_20250124.HOLD_KEY || action === Action_20250124.WAIT) { - if (duration === undefined || typeof duration !== 'number') { - throw new ToolError(`${duration} must be a number`); - } - if (duration < 0) { - throw new ToolError(`${duration} must be non-negative`); - } - if (duration > 100) { - throw new ToolError(`${duration} is too long`); + // Handle mouse movement and drag + if (action === Action.MOUSE_MOVE || action === Action.LEFT_CLICK_DRAG) { + this.validateText(text, false, action); + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); } - if (action === Action_20250124.HOLD_KEY) { - if (text === undefined) { - throw new ToolError(`text is required for ${action}`); - } - const escapedKeys = text.replace(/'/g, "'\\''"); - const commandParts = [ - this.xdotool, - `keydown '${escapedKeys}'`, - `sleep ${duration}`, - `keyup '${escapedKeys}'`, - ]; - return await this.shell(commandParts.join(' ')); + const [x, y] = this.validateAndGetCoordinates(coordinate); + if (action === Action.MOUSE_MOVE) { + await this.page.mouse.move(x, y); + } else { + await this.page.mouse.down(); + await this.page.mouse.move(x, y); + await this.page.mouse.up(); } + return await this.screenshot(); + } + + // Handle keyboard actions + if (action === Action.KEY || action === Action.TYPE || action === 
Action.HOLD_KEY) { + this.validateText(text, true, action); + this.validateCoordinate(coordinate, false, action); - if (action === Action_20250124.WAIT) { - await new Promise(resolve => setTimeout(resolve, duration * 1000)); - return await this.screenshot(); + if (action === Action.HOLD_KEY) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available in version 20250124`); + } + this.validateDuration(duration, action); + await this.page.keyboard.down(text!); + await new Promise(resolve => setTimeout(resolve, duration! * 1000)); + await this.page.keyboard.up(text!); + } else if (action === Action.KEY) { + await this.page.keyboard.press(text!); + } else { + await this.page.keyboard.type(text!, { delay: TYPING_DELAY_MS }); } + return await this.screenshot(); } + // Handle mouse clicks if ([ - Action_20250124.LEFT_CLICK, - Action_20250124.RIGHT_CLICK, - Action_20250124.DOUBLE_CLICK, - Action_20250124.TRIPLE_CLICK, - Action_20250124.MIDDLE_CLICK, + Action.LEFT_CLICK, + Action.RIGHT_CLICK, + Action.DOUBLE_CLICK, + Action.MIDDLE_CLICK, + Action.TRIPLE_CLICK, + Action.LEFT_MOUSE_DOWN, + Action.LEFT_MOUSE_UP, ].includes(action)) { - if (text !== undefined) { - throw new ToolError(`text is not accepted for ${action}`); - } + this.validateText(text, false, action); + this.validateCoordinate(coordinate, false, action); - let mouseMovePart = ''; - if (coordinate !== undefined) { - const [x, y] = this.validateAndGetCoordinates(coordinate); - mouseMovePart = `mousemove --sync ${x} ${y}`; + if (action === Action.LEFT_MOUSE_DOWN || action === Action.LEFT_MOUSE_UP) { + if (this.version !== '20250124') { + throw new ToolError(`${action} is only available in version 20250124`); + } + if (action === Action.LEFT_MOUSE_DOWN) { + await this.page.mouse.down(); + } else { + await this.page.mouse.up(); + } + } else { + const button = { + [Action.LEFT_CLICK]: 'left' as const, + [Action.RIGHT_CLICK]: 'right' as const, + [Action.MIDDLE_CLICK]: 'middle' as const, + 
[Action.DOUBLE_CLICK]: 'left' as const, + [Action.TRIPLE_CLICK]: 'left' as const, + }[action]; + + if (action === Action.DOUBLE_CLICK) { + await this.page.mouse.dblclick(0, 0, { button }); + } else if (action === Action.TRIPLE_CLICK) { + await this.page.mouse.click(0, 0, { button, clickCount: 3 }); + } else { + await this.page.mouse.click(0, 0, { button }); + } } + return await this.screenshot(); + } - const commandParts = [this.xdotool, mouseMovePart]; - if (key) { - commandParts.push(`keydown ${key}`); - } - commandParts.push(`click ${CLICK_BUTTONS[action]}`); - if (key) { - commandParts.push(`keyup ${key}`); - } + throw new ToolError(`Invalid action: ${action}`); + } + + protected validateText(text: string | undefined, required: boolean, action: string): void { + if (required && text === undefined) { + throw new ToolError(`text is required for ${action}`); + } + if (text !== undefined && typeof text !== 'string') { + throw new ToolError(`${text} must be a string`); + } + } + + protected validateCoordinate(coordinate: [number, number] | undefined, allowed: boolean, action: string): void { + if (!allowed && coordinate !== undefined) { + throw new ToolError(`coordinate is not accepted for ${action}`); + } + } - return await this.shell(commandParts.join(' ')); + protected validateDuration(duration: number | undefined, action: string): void { + if (duration === undefined || typeof duration !== 'number') { + throw new ToolError(`${duration} must be a number`); + } + if (duration < 0) { + throw new ToolError(`${duration} must be non-negative`); } + if (duration > 100) { + throw new ToolError(`${duration} is too long`); + } + } +} + +// For backward compatibility +export class ComputerTool20241022 extends ComputerTool { + constructor(page: Page) { + super(page, '20241022'); + } +} - return await super.call(params); +export class ComputerTool20250124 extends ComputerTool { + constructor(page: Page) { + super(page, '20250124'); } } From 
6007d40808dc95b9375e498c49402398356c26bf Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Mon, 26 May 2025 17:03:46 -0400 Subject: [PATCH 3/8] CU may be dum, but at least its looping correctly --- templates/typescript/computer-use/index.ts | 3 +- templates/typescript/computer-use/loop.ts | 132 +++++++++--- .../typescript/computer-use/tools/computer.ts | 194 ++++++++++++++++-- 3 files changed, 289 insertions(+), 40 deletions(-) diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts index 2d1268c..2f29a65 100644 --- a/templates/typescript/computer-use/index.ts +++ b/templates/typescript/computer-use/index.ts @@ -21,7 +21,7 @@ const cuOutputCallback = (block: any) => { }; const cuToolOutputCallback = (result: ToolResult, id: string) => { - console.log('Tool output:', { id, result }); + console.log('Tool output:', { id, result: Object.keys(result) }); }; const cuApiResponseCallback = (request: any, response: any, error: any) => { @@ -51,7 +51,6 @@ app.action( if (!page) { throw new Error('Error getting initial page'); } - await page.waitForTimeout(10000); // Run the sampling loop const finalMessages = await samplingLoop({ diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts index 902f0fc..55ac87f 100644 --- a/templates/typescript/computer-use/loop.ts +++ b/templates/typescript/computer-use/loop.ts @@ -218,13 +218,24 @@ export async function samplingLoop({ } const toolParams = toolCollection.toParams(); - console.log('Tool parameters being sent to Anthropic:', JSON.stringify(toolParams, null, 2)); + console.log('=== TOOL AVAILABILITY ==='); + console.log('Tools being sent to AI:', JSON.stringify(toolParams, null, 2)); + console.log('Available actions:', Object.values(Action)); + console.log('======================='); try { // Use beta API for messages + console.log('=== AI REQUEST ==='); + console.log('Messages being sent:', messages.map(m => ({ + role: m.role, + content: 
Array.isArray(m.content) + ? m.content.map(c => c.type === 'image' ? 'IMAGE' : c) + : m.content + }))); + const response = await client.beta.messages.create({ max_tokens: maxTokens, - messages: messages as any, // Type assertion needed for beta API + messages: messages as any, model, system: [system], tools: toolParams, @@ -232,50 +243,110 @@ export async function samplingLoop({ ...extraBody, }); - apiResponseCallback( - response._request_id, - response, - null - ); - + console.log('=== AI RESPONSE ==='); + console.log('Stop reason:', response.stop_reason); const responseParams = responseToParams(response as unknown as BetaMessage); + // Log the AI's response without the full base64 data + const loggableContent = responseParams.map(block => { + if (block.type === 'tool_use') { + return { + type: 'tool_use', + name: block.name, + input: block.input + }; + } + return block; + }); + console.log('AI response content:', loggableContent); + + // Always add the assistant's response to messages messages.push({ role: 'assistant', content: responseParams, }); + // Check if the AI has completed its task + if (response.stop_reason === 'end_turn') { + console.log('AI has completed its task, ending loop'); + return messages; + } + const toolResultContent: BetaToolResultBlockParam[] = []; + let hasToolUse = false; for (const contentBlock of responseParams) { - outputCallback(contentBlock); - if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input) { + console.log('=== TOOL USE ATTEMPT ==='); + console.log('Tool:', contentBlock.name); + console.log('Action:', contentBlock.input.action); + + hasToolUse = true; const toolInput = { action: contentBlock.input.action as Action, ...contentBlock.input }; - const result = await toolCollection.run( - contentBlock.name, - toolInput - ); - - const toolResult = makeApiToolResult(result, contentBlock.id!); - toolResultContent.push(toolResult); - toolOutputCallback(result, contentBlock.id!); + try { + // Execute tool 
without logging the full result + const result = await toolCollection.run( + contentBlock.name, + toolInput + ); + + // Just log the result type and size + console.log('Tool execution completed'); + if (result.base64Image) { + console.log('Result contains image of size:', result.base64Image.length); + } + if (result.output) { + console.log('Result contains output:', result.output); + } + if (result.error) { + console.log('Result contains error:', result.error); + } + + // Create and add tool result without logging it + const toolResult = makeApiToolResult(result, contentBlock.id!); + toolResultContent.push(toolResult); + + // Call output callback without logging + toolOutputCallback(result, contentBlock.id!); + + console.log('Tool result added to messages'); + } catch (error: unknown) { + console.error('=== TOOL EXECUTION ERROR ==='); + console.error('Error executing tool:', contentBlock.name); + if (error instanceof Error) { + console.error('Error message:', error.message); + } + throw error; + } } } - if (toolResultContent.length === 0) { + // Only end the loop if there are no tool results AND no tool use was attempted + if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { + console.log('No tool use or results, and not waiting for tool use, ending loop'); return messages; } - messages.push({ - content: toolResultContent, - role: 'user', - }); + if (toolResultContent.length > 0) { + console.log('Adding tool results to messages'); + messages.push({ + role: 'user', + content: toolResultContent, + }); + console.log('Tool results added, message count:', messages.length); + } + + console.log('=== LOOP CONTINUING ==='); + console.log('Next API call will have', messages.length, 'messages'); } catch (error: any) { + console.error('=== ERROR IN LOOP ==='); + console.error('Error type:', error.constructor.name); + console.error('Error message:', error.message); + console.error('Error stack:', error.stack); 
apiResponseCallback(error.request, error.response || error.body, error); return messages; } @@ -377,23 +448,31 @@ function makeApiToolResult( result: ToolResult, toolUseId: string ): BetaToolResultBlockParam { + console.log('=== MAKING API TOOL RESULT ==='); + console.log('Tool use ID:', toolUseId); + console.log('Result type:', result.error ? 'error' : 'success'); + const toolResultContent: (BetaTextBlockParam | BetaImageBlockParam)[] = []; let isError = false; if (result.error) { + console.log('Processing error result'); isError = true; toolResultContent.push({ type: 'text', text: maybePrependSystemToolResult(result, result.error), }); } else { + console.log('Processing success result'); if (result.output) { + console.log('Adding output text'); toolResultContent.push({ type: 'text', text: maybePrependSystemToolResult(result, result.output), }); } if (result.base64Image) { + console.log('Adding base64 image'); toolResultContent.push({ type: 'image', source: { @@ -405,12 +484,15 @@ function makeApiToolResult( } } - return { - type: 'tool_result', + console.log('Final tool result content types:', toolResultContent.map(c => c.type)); + const finalResult: BetaToolResultBlockParam = { + type: 'tool_result' as const, content: toolResultContent, tool_use_id: toolUseId, is_error: isError, }; + console.log('=== API TOOL RESULT COMPLETE ==='); + return finalResult; } function maybePrependSystemToolResult(result: ToolResult, resultText: string): string { diff --git a/templates/typescript/computer-use/tools/computer.ts b/templates/typescript/computer-use/tools/computer.ts index 914ac5b..3890201 100644 --- a/templates/typescript/computer-use/tools/computer.ts +++ b/templates/typescript/computer-use/tools/computer.ts @@ -55,6 +55,68 @@ export class ComputerTool implements BaseAnthropicTool { protected _screenshotDelay = 2.0; protected version: '20241022' | '20250124'; + // Map of common key aliases to Playwright key names + private readonly keyMap: Record = { + 'Return': 
'Enter', + 'Enter': 'Enter', + 'Escape': 'Escape', + 'Tab': 'Tab', + 'Backspace': 'Backspace', + 'Delete': 'Delete', + 'ArrowUp': 'ArrowUp', + 'ArrowDown': 'ArrowDown', + 'ArrowLeft': 'ArrowLeft', + 'ArrowRight': 'ArrowRight', + 'Home': 'Home', + 'End': 'End', + 'PageUp': 'PageUp', + 'PageDown': 'PageDown', + 'Space': ' ', + ' ': ' ', + }; + + // Map of modifier keys to their Playwright equivalents + private readonly modifierKeys: Record = { + 'Ctrl': 'Control', + 'Control': 'Control', + 'Alt': 'Alt', + 'Shift': 'Shift', + 'Meta': 'Meta', + 'Command': 'Meta', + 'Win': 'Meta', + }; + + // Map of key combinations to their components + private readonly keyCombinations: Record = { + 'ctrl+a': ['Control', 'a'], + 'ctrl+c': ['Control', 'c'], + 'ctrl+v': ['Control', 'v'], + 'ctrl+x': ['Control', 'x'], + 'ctrl+z': ['Control', 'z'], + 'ctrl+y': ['Control', 'y'], + 'ctrl+f': ['Control', 'f'], + 'alt+tab': ['Alt', 'Tab'], + 'alt+f4': ['Alt', 'F4'], + 'alt+enter': ['Alt', 'Enter'], + }; + + private isModifierKey(key: string | undefined): boolean { + return key !== undefined && key in this.modifierKeys; + } + + private getPlaywrightKey(key: string | undefined): string { + if (!key) { + throw new ToolError('Key cannot be undefined'); + } + const definedKey = key; // TypeScript now knows key is defined + // First check if it's a modifier key + if (this.isModifierKey(definedKey)) { + return this.modifierKeys[definedKey]; + } + // Then check the regular key map + return this.keyMap[definedKey] || definedKey; + } + constructor(page: Page, version: '20241022' | '20250124' = '20250124') { this.page = page; this.version = version; @@ -88,12 +150,31 @@ export class ComputerTool implements BaseAnthropicTool { async screenshot(): Promise { try { + console.log('Starting screenshot...'); + console.log('Waiting for screenshot delay:', this._screenshotDelay * 1000, 'ms'); await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); + console.log('Screenshot delay 
complete'); + + console.log('Taking screenshot...'); const screenshot = await this.page.screenshot({ type: 'png' }); + console.log('Screenshot taken, size:', screenshot.length, 'bytes'); + + console.log('Converting to base64...'); + const base64 = screenshot.toString('base64'); + console.log('Base64 conversion complete, length:', base64.length); + + console.log('Returning screenshot result'); return { - base64Image: screenshot.toString('base64'), + base64Image: base64, }; } catch (error) { + console.error('=== SCREENSHOT ERROR ==='); + console.error('Error taking screenshot:', error); + if (error instanceof Error) { + console.error('Error message:', error.message); + console.error('Error stack:', error.stack); + } + console.error('========================'); throw new ToolError(`Failed to take screenshot: ${error}`); } } @@ -103,13 +184,26 @@ export class ComputerTool implements BaseAnthropicTool { text?: string; coordinate?: [number, number]; scrollDirection?: ScrollDirection; + scroll_amount?: number; scrollAmount?: number; duration?: number; key?: string; [key: string]: any; }): Promise { console.log('ComputerTool.call called with params:', JSON.stringify(params, null, 2)); - const { action, text, coordinate, scrollDirection, scrollAmount, duration, ...kwargs } = params; + const { + action, + text, + coordinate, + scrollDirection: scrollDirectionParam, + scroll_amount, + scrollAmount, + duration, + ...kwargs + } = params; + + const scrollDirection = scrollDirectionParam || kwargs.scroll_direction; + const scrollAmountValue = scrollAmount || scroll_amount; if (action === Action.SCREENSHOT) { this.validateText(text, false, action); @@ -138,20 +232,33 @@ export class ComputerTool implements BaseAnthropicTool { if (this.version !== '20250124') { throw new ToolError(`${action} is only available in version 20250124`); } + + console.log('Scroll parameters:', { scrollDirection, scrollAmountValue }); + if (!scrollDirection || !['up', 'down', 'left', 
'right'].includes(scrollDirection)) { - throw new ToolError(`${scrollDirection} must be 'up', 'down', 'left', or 'right'`); + throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`); } - if (typeof scrollAmount !== 'number' || scrollAmount < 0) { - throw new ToolError(`${scrollAmount} must be a non-negative number`); + if (typeof scrollAmountValue !== 'number' || scrollAmountValue < 0) { + throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`); } if (coordinate) { const [x, y] = this.validateAndGetCoordinates(coordinate); + console.log(`Moving mouse to scroll coordinates: [${x}, ${y}]`); await this.page.mouse.move(x, y); + await this.page.waitForTimeout(100); } - const amount = scrollAmount || 100; - await this.page.mouse.wheel(0, scrollDirection === 'down' ? amount : -amount); + const amount = scrollAmountValue || 100; + console.log(`Scrolling ${scrollDirection} by ${amount} pixels`); + + if (scrollDirection === 'down' || scrollDirection === 'up') { + await this.page.mouse.wheel(0, scrollDirection === 'down' ? amount : -amount); + } else { + await this.page.mouse.wheel(scrollDirection === 'right' ? amount : -amount, 0); + } + + await this.page.waitForTimeout(500); return await this.screenshot(); } @@ -192,14 +299,40 @@ export class ComputerTool implements BaseAnthropicTool { throw new ToolError(`${action} is only available in version 20250124`); } this.validateDuration(duration, action); - await this.page.keyboard.down(text!); + const key = this.getPlaywrightKey(text!); + console.log(`Holding key: ${key}`); + await this.page.keyboard.down(key); await new Promise(resolve => setTimeout(resolve, duration! 
* 1000)); - await this.page.keyboard.up(text!); + await this.page.keyboard.up(key); } else if (action === Action.KEY) { - await this.page.keyboard.press(text!); + // Handle key combinations (e.g., ctrl+a) + const keyCombo = this.keyCombinations[text!]; + if (keyCombo) { + console.log('Pressing key combination:', keyCombo); + for (const key of keyCombo) { + await this.page.keyboard.down(this.getPlaywrightKey(key)); + } + for (const key of keyCombo.reverse()) { + await this.page.keyboard.up(this.getPlaywrightKey(key)); + } + } else { + const key = this.getPlaywrightKey(text!); + console.log(`Pressing key: ${key}`); + if (this.isModifierKey(text!)) { + // For modifier keys, use down/up instead of press + await this.page.keyboard.down(key); + await this.page.waitForTimeout(100); + await this.page.keyboard.up(key); + } else { + await this.page.keyboard.press(key); + } + } } else { + // For typing, add a small delay between characters await this.page.keyboard.type(text!, { delay: TYPING_DELAY_MS }); } + // Add a small delay after keyboard actions + await this.page.waitForTimeout(500); return await this.screenshot(); } @@ -216,6 +349,18 @@ export class ComputerTool implements BaseAnthropicTool { this.validateText(text, false, action); this.validateCoordinate(coordinate, false, action); + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + + const [x, y] = this.validateAndGetCoordinates(coordinate); + console.log(`Moving mouse to coordinates: [${x}, ${y}]`); + + // Move mouse to position first + await this.page.mouse.move(x, y); + // Add a small delay to ensure the mouse has moved + await this.page.waitForTimeout(100); + if (action === Action.LEFT_MOUSE_DOWN || action === Action.LEFT_MOUSE_UP) { if (this.version !== '20250124') { throw new ToolError(`${action} is only available in version 20250124`); @@ -235,13 +380,16 @@ export class ComputerTool implements BaseAnthropicTool { }[action]; if (action === Action.DOUBLE_CLICK) { - await 
this.page.mouse.dblclick(0, 0, { button }); + await this.page.mouse.dblclick(x, y, { button }); } else if (action === Action.TRIPLE_CLICK) { - await this.page.mouse.click(0, 0, { button, clickCount: 3 }); + await this.page.mouse.click(x, y, { button, clickCount: 3 }); } else { - await this.page.mouse.click(0, 0, { button }); + await this.page.mouse.click(x, y, { button }); } } + + // Add a delay after clicking to ensure the action is complete + await this.page.waitForTimeout(500); return await this.screenshot(); } @@ -258,6 +406,26 @@ export class ComputerTool implements BaseAnthropicTool { } protected validateCoordinate(coordinate: [number, number] | undefined, allowed: boolean, action: string): void { + // For mouse actions, coordinates are required + if ([ + Action.LEFT_CLICK, + Action.RIGHT_CLICK, + Action.MIDDLE_CLICK, + Action.DOUBLE_CLICK, + Action.TRIPLE_CLICK, + Action.MOUSE_MOVE, + Action.LEFT_CLICK_DRAG, + Action.LEFT_MOUSE_DOWN, + Action.LEFT_MOUSE_UP, + ].includes(action as Action)) { + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + this.validateAndGetCoordinates(coordinate); + return; + } + + // For other actions, coordinates are not allowed if (!allowed && coordinate !== undefined) { throw new ToolError(`coordinate is not accepted for ${action}`); } From b70f44c7f7287a6c556a1cc42a76df4e3a768be8 Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Mon, 26 May 2025 18:35:41 -0400 Subject: [PATCH 4/8] Clean up --- templates/typescript/computer-use/index.ts | 22 +---- templates/typescript/computer-use/loop.ts | 90 +++---------------- .../typescript/computer-use/tools/computer.ts | 34 +------ 3 files changed, 22 insertions(+), 124 deletions(-) diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts index 2f29a65..862cbe0 100644 --- a/templates/typescript/computer-use/index.ts +++ b/templates/typescript/computer-use/index.ts @@ -15,21 +15,8 @@ interface QueryOutput { 
result: string; } -// Anthropic callbacks for handling loop output -const cuOutputCallback = (block: any) => { - console.log('Output block:', block); -}; - -const cuToolOutputCallback = (result: ToolResult, id: string) => { - console.log('Tool output:', { id, result: Object.keys(result) }); -}; - -const cuApiResponseCallback = (request: any, response: any, error: any) => { - if (error) { - console.error('API error:', error); - } else { - console.log('API response:', { request, response }); - } +const errorResponseCallback = (request: any, response: any, error: any) => { + console.error('Error response callback - API response:', { request, response, error }); }; app.action( @@ -59,11 +46,10 @@ app.action( role: 'user', content: payload.query }], - outputCallback: cuOutputCallback, - toolOutputCallback: cuToolOutputCallback, - apiResponseCallback: cuApiResponseCallback, + errorResponseCallback, apiKey: process.env.ANTHROPIC_API_KEY || '', playwrightPage: page, + cdpUrl: kernelBrowser.cdp_ws_url, }); await browser.close(); diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts index 55ac87f..2ebdd35 100644 --- a/templates/typescript/computer-use/loop.ts +++ b/templates/typescript/computer-use/loop.ts @@ -116,11 +116,8 @@ class ToolCollection { toParams(): any[] { const params = this.tools.map(tool => { - const toolParams = tool.toParams(); - console.log('Individual tool params:', JSON.stringify(toolParams, null, 2)); - return toolParams; + return tool.toParams(); }); - console.log('All tool params:', JSON.stringify(params, null, 2)); return params; } @@ -151,9 +148,7 @@ export async function samplingLoop({ model, systemPromptSuffix, messages, - outputCallback, - toolOutputCallback, - apiResponseCallback, + errorResponseCallback, apiKey, onlyNMostRecentImages, maxTokens = 4096, @@ -161,13 +156,12 @@ export async function samplingLoop({ thinkingBudget, tokenEfficientToolsBeta = false, playwrightPage, + cdpUrl, }: { model: 
string; systemPromptSuffix?: string; messages: BetaMessageParam[]; - outputCallback: (block: BetaContentBlockParam) => void; - toolOutputCallback: (result: ToolResult, id: string) => void; - apiResponseCallback: (request: any, response: any, error: any) => void; + errorResponseCallback: (request: any, response: any, error: any) => void; apiKey: string; onlyNMostRecentImages?: number; maxTokens?: number; @@ -175,6 +169,7 @@ export async function samplingLoop({ thinkingBudget?: number; tokenEfficientToolsBeta?: boolean; playwrightPage: Page; + cdpUrl: string; }): Promise { const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; @@ -218,21 +213,9 @@ export async function samplingLoop({ } const toolParams = toolCollection.toParams(); - console.log('=== TOOL AVAILABILITY ==='); - console.log('Tools being sent to AI:', JSON.stringify(toolParams, null, 2)); - console.log('Available actions:', Object.values(Action)); - console.log('======================='); try { - // Use beta API for messages - console.log('=== AI REQUEST ==='); - console.log('Messages being sent:', messages.map(m => ({ - role: m.role, - content: Array.isArray(m.content) - ? m.content.map(c => c.type === 'image' ? 
'IMAGE' : c) - : m.content - }))); - + // Use beta API for messages const response = await client.beta.messages.create({ max_tokens: maxTokens, messages: messages as any, @@ -243,11 +226,8 @@ export async function samplingLoop({ ...extraBody, }); - console.log('=== AI RESPONSE ==='); - console.log('Stop reason:', response.stop_reason); const responseParams = responseToParams(response as unknown as BetaMessage); - // Log the AI's response without the full base64 data const loggableContent = responseParams.map(block => { if (block.type === 'tool_use') { return { @@ -258,17 +238,18 @@ export async function samplingLoop({ } return block; }); - console.log('AI response content:', loggableContent); + console.log('=== LLM RESPONSE ==='); + console.log('Stop reason:', response.stop_reason); + console.log(loggableContent); + console.log("===") - // Always add the assistant's response to messages messages.push({ role: 'assistant', content: responseParams, }); - // Check if the AI has completed its task if (response.stop_reason === 'end_turn') { - console.log('AI has completed its task, ending loop'); + console.log('LLM has completed its task, ending loop'); return messages; } @@ -277,10 +258,6 @@ export async function samplingLoop({ for (const contentBlock of responseParams) { if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input) { - console.log('=== TOOL USE ATTEMPT ==='); - console.log('Tool:', contentBlock.name); - console.log('Action:', contentBlock.input.action); - hasToolUse = true; const toolInput = { action: contentBlock.input.action as Action, @@ -288,35 +265,14 @@ export async function samplingLoop({ }; try { - // Execute tool without logging the full result const result = await toolCollection.run( contentBlock.name, toolInput ); - - // Just log the result type and size - console.log('Tool execution completed'); - if (result.base64Image) { - console.log('Result contains image of size:', result.base64Image.length); - } - if (result.output) 
{ - console.log('Result contains output:', result.output); - } - if (result.error) { - console.log('Result contains error:', result.error); - } - - // Create and add tool result without logging it + const toolResult = makeApiToolResult(result, contentBlock.id!); toolResultContent.push(toolResult); - - // Call output callback without logging - toolOutputCallback(result, contentBlock.id!); - - console.log('Tool result added to messages'); } catch (error: unknown) { - console.error('=== TOOL EXECUTION ERROR ==='); - console.error('Error executing tool:', contentBlock.name); if (error instanceof Error) { console.error('Error message:', error.message); } @@ -325,29 +281,20 @@ export async function samplingLoop({ } } - // Only end the loop if there are no tool results AND no tool use was attempted + // End the loop if there are no tool results AND no tool use was attempted if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { console.log('No tool use or results, and not waiting for tool use, ending loop'); return messages; } if (toolResultContent.length > 0) { - console.log('Adding tool results to messages'); messages.push({ role: 'user', content: toolResultContent, }); - console.log('Tool results added, message count:', messages.length); } - - console.log('=== LOOP CONTINUING ==='); - console.log('Next API call will have', messages.length, 'messages'); } catch (error: any) { - console.error('=== ERROR IN LOOP ==='); - console.error('Error type:', error.constructor.name); - console.error('Error message:', error.message); - console.error('Error stack:', error.stack); - apiResponseCallback(error.request, error.response || error.body, error); + errorResponseCallback(error.request, error.response || error.body, error); return messages; } } @@ -448,31 +395,24 @@ function makeApiToolResult( result: ToolResult, toolUseId: string ): BetaToolResultBlockParam { - console.log('=== MAKING API TOOL RESULT ==='); - console.log('Tool use ID:', 
toolUseId); - console.log('Result type:', result.error ? 'error' : 'success'); const toolResultContent: (BetaTextBlockParam | BetaImageBlockParam)[] = []; let isError = false; if (result.error) { - console.log('Processing error result'); isError = true; toolResultContent.push({ type: 'text', text: maybePrependSystemToolResult(result, result.error), }); } else { - console.log('Processing success result'); if (result.output) { - console.log('Adding output text'); toolResultContent.push({ type: 'text', text: maybePrependSystemToolResult(result, result.output), }); } if (result.base64Image) { - console.log('Adding base64 image'); toolResultContent.push({ type: 'image', source: { @@ -484,14 +424,12 @@ function makeApiToolResult( } } - console.log('Final tool result content types:', toolResultContent.map(c => c.type)); const finalResult: BetaToolResultBlockParam = { type: 'tool_result' as const, content: toolResultContent, tool_use_id: toolUseId, is_error: isError, }; - console.log('=== API TOOL RESULT COMPLETE ==='); return finalResult; } diff --git a/templates/typescript/computer-use/tools/computer.ts b/templates/typescript/computer-use/tools/computer.ts index 3890201..8a882ce 100644 --- a/templates/typescript/computer-use/tools/computer.ts +++ b/templates/typescript/computer-use/tools/computer.ts @@ -111,7 +111,7 @@ export class ComputerTool implements BaseAnthropicTool { const definedKey = key; // TypeScript now knows key is defined // First check if it's a modifier key if (this.isModifierKey(definedKey)) { - return this.modifierKeys[definedKey]; + return this.modifierKeys[definedKey] as string; } // Then check the regular key map return this.keyMap[definedKey] || definedKey; @@ -134,7 +134,6 @@ export class ComputerTool implements BaseAnthropicTool { display_height_px: 720, display_number: null, }; - console.log('ComputerTool toParams:', JSON.stringify(params, null, 2)); return params; } @@ -151,30 +150,14 @@ export class ComputerTool implements BaseAnthropicTool { 
async screenshot(): Promise { try { console.log('Starting screenshot...'); - console.log('Waiting for screenshot delay:', this._screenshotDelay * 1000, 'ms'); await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); - console.log('Screenshot delay complete'); - - console.log('Taking screenshot...'); const screenshot = await this.page.screenshot({ type: 'png' }); console.log('Screenshot taken, size:', screenshot.length, 'bytes'); - - console.log('Converting to base64...'); - const base64 = screenshot.toString('base64'); - console.log('Base64 conversion complete, length:', base64.length); - - console.log('Returning screenshot result'); + return { - base64Image: base64, + base64Image: screenshot.toString('base64'), }; } catch (error) { - console.error('=== SCREENSHOT ERROR ==='); - console.error('Error taking screenshot:', error); - if (error instanceof Error) { - console.error('Error message:', error.message); - console.error('Error stack:', error.stack); - } - console.error('========================'); throw new ToolError(`Failed to take screenshot: ${error}`); } } @@ -190,7 +173,6 @@ export class ComputerTool implements BaseAnthropicTool { key?: string; [key: string]: any; }): Promise { - console.log('ComputerTool.call called with params:', JSON.stringify(params, null, 2)); const { action, text, @@ -232,9 +214,7 @@ export class ComputerTool implements BaseAnthropicTool { if (this.version !== '20250124') { throw new ToolError(`${action} is only available in version 20250124`); } - - console.log('Scroll parameters:', { scrollDirection, scrollAmountValue }); - + if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) { throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`); } @@ -244,13 +224,11 @@ export class ComputerTool implements BaseAnthropicTool { if (coordinate) { const [x, y] = this.validateAndGetCoordinates(coordinate); - console.log(`Moving mouse to scroll 
coordinates: [${x}, ${y}]`); await this.page.mouse.move(x, y); await this.page.waitForTimeout(100); } const amount = scrollAmountValue || 100; - console.log(`Scrolling ${scrollDirection} by ${amount} pixels`); if (scrollDirection === 'down' || scrollDirection === 'up') { await this.page.mouse.wheel(0, scrollDirection === 'down' ? amount : -amount); @@ -300,7 +278,6 @@ export class ComputerTool implements BaseAnthropicTool { } this.validateDuration(duration, action); const key = this.getPlaywrightKey(text!); - console.log(`Holding key: ${key}`); await this.page.keyboard.down(key); await new Promise(resolve => setTimeout(resolve, duration! * 1000)); await this.page.keyboard.up(key); @@ -308,7 +285,6 @@ export class ComputerTool implements BaseAnthropicTool { // Handle key combinations (e.g., ctrl+a) const keyCombo = this.keyCombinations[text!]; if (keyCombo) { - console.log('Pressing key combination:', keyCombo); for (const key of keyCombo) { await this.page.keyboard.down(this.getPlaywrightKey(key)); } @@ -317,7 +293,6 @@ export class ComputerTool implements BaseAnthropicTool { } } else { const key = this.getPlaywrightKey(text!); - console.log(`Pressing key: ${key}`); if (this.isModifierKey(text!)) { // For modifier keys, use down/up instead of press await this.page.keyboard.down(key); @@ -354,7 +329,6 @@ export class ComputerTool implements BaseAnthropicTool { } const [x, y] = this.validateAndGetCoordinates(coordinate); - console.log(`Moving mouse to coordinates: [${x}, ${y}]`); // Move mouse to position first await this.page.mouse.move(x, y); From 8a97ab6e05823a71ad09e365f125cbdb4559b363 Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Mon, 26 May 2025 19:27:41 -0400 Subject: [PATCH 5/8] organize in files --- templates/typescript/computer-use/README.md | 2 + templates/typescript/computer-use/index.ts | 60 +-- templates/typescript/computer-use/loop.ts | 287 +------------ .../computer-use/tools/collection.ts | 62 +++ .../typescript/computer-use/tools/computer.ts | 
392 +++++------------- .../computer-use/tools/types/computer.ts | 64 +++ .../computer-use/tools/utils/keyboard.ts | 54 +++ .../computer-use/tools/utils/validator.ts | 67 +++ .../typescript/computer-use/types/beta.ts | 49 +++ .../computer-use/utils/message-processing.ts | 79 ++++ .../computer-use/utils/tool-results.ts | 49 +++ 11 files changed, 569 insertions(+), 596 deletions(-) create mode 100644 templates/typescript/computer-use/tools/collection.ts create mode 100644 templates/typescript/computer-use/tools/types/computer.ts create mode 100644 templates/typescript/computer-use/tools/utils/keyboard.ts create mode 100644 templates/typescript/computer-use/tools/utils/validator.ts create mode 100644 templates/typescript/computer-use/types/beta.ts create mode 100644 templates/typescript/computer-use/utils/message-processing.ts create mode 100644 templates/typescript/computer-use/utils/tool-results.ts diff --git a/templates/typescript/computer-use/README.md b/templates/typescript/computer-use/README.md index e8075fb..7465e25 100644 --- a/templates/typescript/computer-use/README.md +++ b/templates/typescript/computer-use/README.md @@ -2,4 +2,6 @@ This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use. +It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright. + See the [docs](https://docs.onkernel.com/quickstart) for information. 
\ No newline at end of file diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts index 862cbe0..ce46d60 100644 --- a/templates/typescript/computer-use/index.ts +++ b/templates/typescript/computer-use/index.ts @@ -39,37 +39,41 @@ app.action( throw new Error('Error getting initial page'); } - // Run the sampling loop - const finalMessages = await samplingLoop({ - model: 'claude-sonnet-4-20250514', - messages: [{ - role: 'user', - content: payload.query - }], - errorResponseCallback, - apiKey: process.env.ANTHROPIC_API_KEY || '', - playwrightPage: page, - cdpUrl: kernelBrowser.cdp_ws_url, - }); + try { + // Run the sampling loop + const finalMessages = await samplingLoop({ + model: 'claude-sonnet-4-20250514', + messages: [{ + role: 'user', + content: payload.query + }], + errorResponseCallback, + apiKey: process.env.ANTHROPIC_API_KEY || '', + playwrightPage: page, + }); - await browser.close(); + // Extract the final result from the messages + if (finalMessages.length === 0) { + throw new Error('No messages were generated during the sampling loop'); + } - // Extract the final result from the messages - if (finalMessages.length === 0) { - throw new Error('No messages were generated during the sampling loop'); - } + const lastMessage = finalMessages[finalMessages.length - 1]; + if (!lastMessage) { + throw new Error('Failed to get the last message from the sampling loop'); + } - const lastMessage = finalMessages[finalMessages.length - 1]; - if (!lastMessage) { - throw new Error('Failed to get the last message from the sampling loop'); - } + const result = typeof lastMessage.content === 'string' + ? lastMessage.content + : lastMessage.content.map(block => + block.type === 'text' ? block.text : '' + ).join(''); - const result = typeof lastMessage.content === 'string' - ? lastMessage.content - : lastMessage.content.map(block => - block.type === 'text' ? 
block.text : '' - ).join(''); - - return { result }; + return { result }; + } catch (error) { + console.error('Error in sampling loop:', error); + throw error; + } finally { + await browser.close(); + } }, ); diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts index 2ebdd35..9ea4ef9 100644 --- a/templates/typescript/computer-use/loop.ts +++ b/templates/typescript/computer-use/loop.ts @@ -1,96 +1,11 @@ import { Anthropic } from '@anthropic-ai/sdk'; import { DateTime } from 'luxon'; -import type { ToolResult } from './tools/computer'; -import { ComputerTool20241022, ComputerTool20250124, Action } from './tools/computer'; import type { Page } from 'playwright'; - -export type ToolVersion = 'computer_use_20250124' | 'computer_use_20241022' | 'computer_use_20250429'; -export type BetaFlag = 'computer-use-2024-10-22' | 'computer-use-2025-01-24' | 'computer-use-2025-04-29'; - -const DEFAULT_TOOL_VERSION: ToolVersion = 'computer_use_20250124'; - -interface ToolGroup { - readonly version: ToolVersion; - readonly tools: (typeof ComputerTool20241022 | typeof ComputerTool20250124)[]; - readonly beta_flag: BetaFlag | null; -} - -const TOOL_GROUPS: ToolGroup[] = [ - { - version: 'computer_use_20241022', - tools: [ComputerTool20241022], - beta_flag: 'computer-use-2024-10-22', - }, - { - version: 'computer_use_20250124', - tools: [ComputerTool20250124], - beta_flag: 'computer-use-2025-01-24', - }, - { - version: 'computer_use_20250429', - tools: [ComputerTool20250124], - beta_flag: 'computer-use-2025-01-24', - }, -]; - -const TOOL_GROUPS_BY_VERSION: Record = Object.fromEntries( - TOOL_GROUPS.map(group => [group.version, group]) -) as Record; - -export enum APIProvider { - ANTHROPIC = 'anthropic' -} - -export interface BetaMessageParam { - role: 'user' | 'assistant'; - content: BetaContentBlockParam[] | string; -} - -export interface BetaContentBlockParam { - type: string; - text?: string; - name?: string; - input?: Record; - id?: 
string; - cache_control?: { - type: 'ephemeral'; - }; -} - -export interface BetaToolResultBlockParam { - type: 'tool_result'; - content: (BetaTextBlockParam | BetaImageBlockParam)[] | string; - tool_use_id: string; - is_error: boolean; -} - -export interface BetaTextBlockParam { - type: 'text'; - text: string; -} - -export interface BetaImageBlockParam { - type: 'image'; - source: { - type: 'base64'; - media_type: 'image/png'; - data: string; - }; -} - -export interface BetaMessage { - content: Array<{ - type: string; - text?: string; - name?: string; - input?: Record; - id?: string; - thinking?: any; - signature?: string; - }>; -} - -const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31'; +import type { BetaMessageParam, BetaTextBlock } from './types/beta'; +import { ToolCollection, DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, type ToolVersion } from './tools/collection'; +import { responseToParams, maybeFilterToNMostRecentImages, injectPromptCaching, PROMPT_CACHING_BETA_FLAG } from './utils/message-processing'; +import { makeApiToolResult } from './utils/tool-results'; +import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; // System prompt optimized for the environment const SYSTEM_PROMPT = ` @@ -102,48 +17,10 @@ const SYSTEM_PROMPT = ` -* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there. +* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the search bar on the center of the screenwhere it says "Search or enter address", and enter the appropriate search term or URL there. 
* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly. `; -// Tool collection class to manage available tools -class ToolCollection { - private tools: (ComputerTool20241022 | ComputerTool20250124)[]; - - constructor(...tools: (ComputerTool20241022 | ComputerTool20250124)[]) { - this.tools = tools; - } - - toParams(): any[] { - const params = this.tools.map(tool => { - return tool.toParams(); - }); - return params; - } - - async run(name: string, toolInput: { action: Action } & Record): Promise { - const tool = this.tools.find(t => t.name === name); - if (!tool) { - throw new Error(`Tool ${name} not found`); - } - - // Type guard to ensure action matches the tool version - if (tool instanceof ComputerTool20241022) { - if (!Object.values(Action).includes(toolInput.action)) { - throw new Error(`Invalid action ${toolInput.action} for tool version 20241022`); - } - return await tool.call(toolInput); - } else if (tool instanceof ComputerTool20250124) { - if (!Object.values(Action).includes(toolInput.action)) { - throw new Error(`Invalid action ${toolInput.action} for tool version 20250124`); - } - return await tool.call(toolInput); - } - - throw new Error(`Unsupported tool version for ${name}`); - } -} - export async function samplingLoop({ model, systemPromptSuffix, @@ -156,7 +33,6 @@ export async function samplingLoop({ thinkingBudget, tokenEfficientToolsBeta = false, playwrightPage, - cdpUrl, }: { model: string; systemPromptSuffix?: string; @@ -169,13 +45,12 @@ export async function samplingLoop({ thinkingBudget?: number; tokenEfficientToolsBeta?: boolean; playwrightPage: Page; - cdpUrl: string; }): Promise { const selectedVersion = toolVersion || 
DEFAULT_TOOL_VERSION; const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(playwrightPage))); - const system: BetaTextBlockParam = { + const system: BetaTextBlock = { type: 'text', text: `${SYSTEM_PROMPT}${systemPromptSuffix ? ' ' + systemPromptSuffix : ''}`, }; @@ -215,7 +90,6 @@ export async function samplingLoop({ const toolParams = toolCollection.toParams(); try { - // Use beta API for messages const response = await client.beta.messages.create({ max_tokens: maxTokens, messages: messages as any, @@ -226,7 +100,7 @@ export async function samplingLoop({ ...extraBody, }); - const responseParams = responseToParams(response as unknown as BetaMessage); + const responseParams = responseToParams(response as any); const loggableContent = responseParams.map(block => { if (block.type === 'tool_use') { @@ -253,14 +127,14 @@ export async function samplingLoop({ return messages; } - const toolResultContent: BetaToolResultBlockParam[] = []; + const toolResultContent = []; let hasToolUse = false; for (const contentBlock of responseParams) { if (contentBlock.type === 'tool_use' && contentBlock.name && contentBlock.input) { hasToolUse = true; const toolInput = { - action: contentBlock.input.action as Action, + action: contentBlock.input.action, ...contentBlock.input }; @@ -281,7 +155,6 @@ export async function samplingLoop({ } } - // End the loop if there are no tool results AND no tool use was attempted if (toolResultContent.length === 0 && !hasToolUse && response.stop_reason !== 'tool_use') { console.log('No tool use or results, and not waiting for tool use, ending loop'); return messages; @@ -299,143 +172,3 @@ export async function samplingLoop({ } } } - -function maybeFilterToNMostRecentImages( - messages: BetaMessageParam[], - imagesToKeep: number, - minRemovalThreshold: number -): void { - if (imagesToKeep === undefined) 
return; - - const toolResultBlocks = messages - .flatMap(message => { - if (!message || !Array.isArray(message.content)) return []; - return message.content.filter(item => - typeof item === 'object' && item.type === 'tool_result' - ); - }) as BetaToolResultBlockParam[]; - - let totalImages = 0; - for (const toolResult of toolResultBlocks) { - if (Array.isArray(toolResult.content)) { - totalImages += toolResult.content.filter( - content => typeof content === 'object' && content.type === 'image' - ).length; - } - } - - let imagesToRemove = totalImages - imagesToKeep; - imagesToRemove -= imagesToRemove % minRemovalThreshold; - - for (const toolResult of toolResultBlocks) { - if (Array.isArray(toolResult.content)) { - const newContent = []; - for (const content of toolResult.content) { - if (typeof content === 'object' && content.type === 'image') { - if (imagesToRemove > 0) { - imagesToRemove--; - continue; - } - } - newContent.push(content); - } - toolResult.content = newContent; - } - } -} - -function responseToParams(response: BetaMessage): BetaContentBlockParam[] { - const res: BetaContentBlockParam[] = []; - - for (const block of response.content) { - if (block.type === 'text' && block.text) { - res.push({ type: 'text', text: block.text }); - } else if (block.type === 'thinking') { - const thinkingBlock: any = { - type: 'thinking', - thinking: block.thinking, - }; - if (block.signature) { - thinkingBlock.signature = block.signature; - } - res.push(thinkingBlock); - } else { - res.push(block as BetaContentBlockParam); - } - } - - return res; -} - -function injectPromptCaching(messages: BetaMessageParam[]): void { - let breakpointsRemaining = 3; - - for (let i = messages.length - 1; i >= 0; i--) { - const message = messages[i]; - if (!message) continue; - if (message.role === 'user' && Array.isArray(message.content)) { - if (breakpointsRemaining > 0) { - breakpointsRemaining--; - const lastContent = message.content[message.content.length - 1]; - if (lastContent) { 
- (lastContent as any).cache_control = { type: 'ephemeral' }; - } - } else { - const lastContent = message.content[message.content.length - 1]; - if (lastContent) { - delete (lastContent as any).cache_control; - } - break; - } - } - } -} - -function makeApiToolResult( - result: ToolResult, - toolUseId: string -): BetaToolResultBlockParam { - - const toolResultContent: (BetaTextBlockParam | BetaImageBlockParam)[] = []; - let isError = false; - - if (result.error) { - isError = true; - toolResultContent.push({ - type: 'text', - text: maybePrependSystemToolResult(result, result.error), - }); - } else { - if (result.output) { - toolResultContent.push({ - type: 'text', - text: maybePrependSystemToolResult(result, result.output), - }); - } - if (result.base64Image) { - toolResultContent.push({ - type: 'image', - source: { - type: 'base64', - media_type: 'image/png', - data: result.base64Image, - }, - }); - } - } - - const finalResult: BetaToolResultBlockParam = { - type: 'tool_result' as const, - content: toolResultContent, - tool_use_id: toolUseId, - is_error: isError, - }; - return finalResult; -} - -function maybePrependSystemToolResult(result: ToolResult, resultText: string): string { - if (result.system) { - return `${result.system}\n${resultText}`; - } - return resultText; -} diff --git a/templates/typescript/computer-use/tools/collection.ts b/templates/typescript/computer-use/tools/collection.ts new file mode 100644 index 0000000..0cf67bf --- /dev/null +++ b/templates/typescript/computer-use/tools/collection.ts @@ -0,0 +1,62 @@ +import { ComputerTool20241022, ComputerTool20250124 } from './computer'; +import { Action } from './types/computer'; +import type { ToolResult } from './types/computer'; +import type { Page } from 'playwright'; + +export type ToolVersion = 'computer_use_20250124' | 'computer_use_20241022' | 'computer_use_20250429'; + +export const DEFAULT_TOOL_VERSION: ToolVersion = 'computer_use_20250429'; + +interface ToolGroup { + readonly version: 
ToolVersion; + readonly tools: (typeof ComputerTool20241022 | typeof ComputerTool20250124)[]; + readonly beta_flag: string; +} + +export const TOOL_GROUPS: ToolGroup[] = [ + { + version: 'computer_use_20241022', + tools: [ComputerTool20241022], + beta_flag: 'computer-use-2024-10-22', + }, + { + version: 'computer_use_20250124', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, + // 20250429 version inherits from 20250124 + { + version: 'computer_use_20250429', + tools: [ComputerTool20250124], + beta_flag: 'computer-use-2025-01-24', + }, +]; + +export const TOOL_GROUPS_BY_VERSION: Record = Object.fromEntries( + TOOL_GROUPS.map(group => [group.version, group]) +) as Record; + +export class ToolCollection { + private tools: Map; + + constructor(...tools: (ComputerTool20241022 | ComputerTool20250124)[]) { + this.tools = new Map(tools.map(tool => [tool.name, tool])); + } + + toParams(): any[] { + return Array.from(this.tools.values()).map(tool => tool.toParams()); + } + + async run(name: string, toolInput: { action: Action } & Record): Promise { + const tool = this.tools.get(name); + if (!tool) { + throw new Error(`Tool ${name} not found`); + } + + if (!Object.values(Action).includes(toolInput.action)) { + throw new Error(`Invalid action ${toolInput.action} for tool ${name}`); + } + + return await tool.call(toolInput); + } +} \ No newline at end of file diff --git a/templates/typescript/computer-use/tools/computer.ts b/templates/typescript/computer-use/tools/computer.ts index 8a882ce..f654ab5 100644 --- a/templates/typescript/computer-use/tools/computer.ts +++ b/templates/typescript/computer-use/tools/computer.ts @@ -1,121 +1,41 @@ import type { Page } from 'playwright'; - -export enum Action { - // Base actions - KEY = 'key', - TYPE = 'type', - MOUSE_MOVE = 'mouse_move', - LEFT_CLICK = 'left_click', - LEFT_CLICK_DRAG = 'left_click_drag', - RIGHT_CLICK = 'right_click', - MIDDLE_CLICK = 'middle_click', - DOUBLE_CLICK = 'double_click', - 
TRIPLE_CLICK = 'triple_click', - SCREENSHOT = 'screenshot', - CURSOR_POSITION = 'cursor_position', - // Extended actions (20250124) - LEFT_MOUSE_DOWN = 'left_mouse_down', - LEFT_MOUSE_UP = 'left_mouse_up', - SCROLL = 'scroll', - HOLD_KEY = 'hold_key', - WAIT = 'wait', -} - -// For backward compatibility -export type Action_20241022 = Action; -export type Action_20250124 = Action; - -export interface ToolResult { - output?: string; - error?: string; - base64Image?: string; - system?: string; -} - -export class ToolError extends Error { - constructor(message: string) { - super(message); - this.name = 'ToolError'; - } -} - -export interface BaseAnthropicTool { - name: string; - apiType: string; - toParams(): any; -} +import { Action, ToolError } from './types/computer'; +import type { ActionParams, BaseAnthropicTool, ToolResult } from './types/computer'; +import { KeyboardUtils } from './utils/keyboard'; +import { ActionValidator } from './utils/validator'; const TYPING_DELAY_MS = 12; -type ScrollDirection = 'up' | 'down' | 'left' | 'right'; - export class ComputerTool implements BaseAnthropicTool { name: 'computer' = 'computer'; protected page: Page; protected _screenshotDelay = 2.0; protected version: '20241022' | '20250124'; - // Map of common key aliases to Playwright key names - private readonly keyMap: Record = { - 'Return': 'Enter', - 'Enter': 'Enter', - 'Escape': 'Escape', - 'Tab': 'Tab', - 'Backspace': 'Backspace', - 'Delete': 'Delete', - 'ArrowUp': 'ArrowUp', - 'ArrowDown': 'ArrowDown', - 'ArrowLeft': 'ArrowLeft', - 'ArrowRight': 'ArrowRight', - 'Home': 'Home', - 'End': 'End', - 'PageUp': 'PageUp', - 'PageDown': 'PageDown', - 'Space': ' ', - ' ': ' ', - }; - - // Map of modifier keys to their Playwright equivalents - private readonly modifierKeys: Record = { - 'Ctrl': 'Control', - 'Control': 'Control', - 'Alt': 'Alt', - 'Shift': 'Shift', - 'Meta': 'Meta', - 'Command': 'Meta', - 'Win': 'Meta', - }; - - // Map of key combinations to their components - private 
readonly keyCombinations: Record = { - 'ctrl+a': ['Control', 'a'], - 'ctrl+c': ['Control', 'c'], - 'ctrl+v': ['Control', 'v'], - 'ctrl+x': ['Control', 'x'], - 'ctrl+z': ['Control', 'z'], - 'ctrl+y': ['Control', 'y'], - 'ctrl+f': ['Control', 'f'], - 'alt+tab': ['Alt', 'Tab'], - 'alt+f4': ['Alt', 'F4'], - 'alt+enter': ['Alt', 'Enter'], - }; - - private isModifierKey(key: string | undefined): boolean { - return key !== undefined && key in this.modifierKeys; - } - - private getPlaywrightKey(key: string | undefined): string { - if (!key) { - throw new ToolError('Key cannot be undefined'); - } - const definedKey = key; // TypeScript now knows key is defined - // First check if it's a modifier key - if (this.isModifierKey(definedKey)) { - return this.modifierKeys[definedKey] as string; - } - // Then check the regular key map - return this.keyMap[definedKey] || definedKey; - } + private readonly mouseActions = new Set([ + Action.LEFT_CLICK, + Action.RIGHT_CLICK, + Action.MIDDLE_CLICK, + Action.DOUBLE_CLICK, + Action.TRIPLE_CLICK, + Action.MOUSE_MOVE, + Action.LEFT_CLICK_DRAG, + Action.LEFT_MOUSE_DOWN, + Action.LEFT_MOUSE_UP, + ]); + + private readonly keyboardActions = new Set([ + Action.KEY, + Action.TYPE, + Action.HOLD_KEY, + ]); + + private readonly systemActions = new Set([ + Action.SCREENSHOT, + Action.CURSOR_POSITION, + Action.SCROLL, + Action.WAIT, + ]); constructor(page: Page, version: '20241022' | '20250124' = '20250124') { this.page = page; @@ -137,14 +57,68 @@ export class ComputerTool implements BaseAnthropicTool { return params; } - protected validateAndGetCoordinates(coordinate: [number, number] | null = null): [number, number] { - if (!Array.isArray(coordinate) || coordinate.length !== 2) { - throw new ToolError(`${coordinate} must be a tuple of length 2`); + private getMouseButton(action: Action): 'left' | 'right' | 'middle' { + switch (action) { + case Action.LEFT_CLICK: + case Action.DOUBLE_CLICK: + case Action.TRIPLE_CLICK: + case Action.LEFT_CLICK_DRAG: 
+ case Action.LEFT_MOUSE_DOWN: + case Action.LEFT_MOUSE_UP: + return 'left'; + case Action.RIGHT_CLICK: + return 'right'; + case Action.MIDDLE_CLICK: + return 'middle'; + default: + throw new ToolError(`Invalid mouse action: ${action}`); } - if (!coordinate.every(i => typeof i === 'number' && i >= 0)) { - throw new ToolError(`${coordinate} must be a tuple of non-negative numbers`); + } + + private async handleMouseAction(action: Action, coordinate: [number, number]): Promise { + const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); + await this.page.mouse.move(x, y); + await this.page.waitForTimeout(100); + + if (action === Action.LEFT_MOUSE_DOWN) { + await this.page.mouse.down(); + } else if (action === Action.LEFT_MOUSE_UP) { + await this.page.mouse.up(); + } else { + const button = this.getMouseButton(action); + if (action === Action.DOUBLE_CLICK) { + await this.page.mouse.dblclick(x, y, { button }); + } else if (action === Action.TRIPLE_CLICK) { + await this.page.mouse.click(x, y, { button, clickCount: 3 }); + } else { + await this.page.mouse.click(x, y, { button }); + } + } + + await this.page.waitForTimeout(500); + return await this.screenshot(); + } + + private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise { + if (action === Action.HOLD_KEY) { + const key = KeyboardUtils.getPlaywrightKey(text); + await this.page.keyboard.down(key); + await new Promise(resolve => setTimeout(resolve, duration! 
* 1000)); + await this.page.keyboard.up(key); + } else if (action === Action.KEY) { + const keys = KeyboardUtils.parseKeyCombination(text); + for (const key of keys) { + await this.page.keyboard.down(key); + } + for (const key of keys.reverse()) { + await this.page.keyboard.up(key); + } + } else { + await this.page.keyboard.type(text, { delay: TYPING_DELAY_MS }); } - return coordinate; + + await this.page.waitForTimeout(500); + return await this.screenshot(); } async screenshot(): Promise { @@ -162,17 +136,7 @@ export class ComputerTool implements BaseAnthropicTool { } } - async call(params: { - action: Action; - text?: string; - coordinate?: [number, number]; - scrollDirection?: ScrollDirection; - scroll_amount?: number; - scrollAmount?: number; - duration?: number; - key?: string; - [key: string]: any; - }): Promise { + async call(params: ActionParams): Promise { const { action, text, @@ -184,18 +148,13 @@ export class ComputerTool implements BaseAnthropicTool { ...kwargs } = params; - const scrollDirection = scrollDirectionParam || kwargs.scroll_direction; - const scrollAmountValue = scrollAmount || scroll_amount; + ActionValidator.validateActionParams(params, this.mouseActions, this.keyboardActions); if (action === Action.SCREENSHOT) { - this.validateText(text, false, action); - this.validateCoordinate(coordinate, false, action); return await this.screenshot(); } if (action === Action.CURSOR_POSITION) { - this.validateText(text, false, action); - this.validateCoordinate(coordinate, false, action); const position = await this.page.evaluate(() => { const selection = window.getSelection(); const range = selection?.getRangeAt(0); @@ -214,7 +173,10 @@ export class ComputerTool implements BaseAnthropicTool { if (this.version !== '20250124') { throw new ToolError(`${action} is only available in version 20250124`); } - + + const scrollDirection = scrollDirectionParam || kwargs.scroll_direction; + const scrollAmountValue = scrollAmount || scroll_amount; + if 
(!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) { throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`); } @@ -223,7 +185,7 @@ export class ComputerTool implements BaseAnthropicTool { } if (coordinate) { - const [x, y] = this.validateAndGetCoordinates(coordinate); + const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); await this.page.mouse.move(x, y); await this.page.waitForTimeout(100); } @@ -244,178 +206,26 @@ export class ComputerTool implements BaseAnthropicTool { if (this.version !== '20250124') { throw new ToolError(`${action} is only available in version 20250124`); } - this.validateDuration(duration, action); await new Promise(resolve => setTimeout(resolve, duration! * 1000)); return await this.screenshot(); } - // Handle mouse movement and drag - if (action === Action.MOUSE_MOVE || action === Action.LEFT_CLICK_DRAG) { - this.validateText(text, false, action); + if (this.mouseActions.has(action)) { if (!coordinate) { throw new ToolError(`coordinate is required for ${action}`); } - - const [x, y] = this.validateAndGetCoordinates(coordinate); - if (action === Action.MOUSE_MOVE) { - await this.page.mouse.move(x, y); - } else { - await this.page.mouse.down(); - await this.page.mouse.move(x, y); - await this.page.mouse.up(); - } - return await this.screenshot(); - } - - // Handle keyboard actions - if (action === Action.KEY || action === Action.TYPE || action === Action.HOLD_KEY) { - this.validateText(text, true, action); - this.validateCoordinate(coordinate, false, action); - - if (action === Action.HOLD_KEY) { - if (this.version !== '20250124') { - throw new ToolError(`${action} is only available in version 20250124`); - } - this.validateDuration(duration, action); - const key = this.getPlaywrightKey(text!); - await this.page.keyboard.down(key); - await new Promise(resolve => setTimeout(resolve, duration! 
* 1000)); - await this.page.keyboard.up(key); - } else if (action === Action.KEY) { - // Handle key combinations (e.g., ctrl+a) - const keyCombo = this.keyCombinations[text!]; - if (keyCombo) { - for (const key of keyCombo) { - await this.page.keyboard.down(this.getPlaywrightKey(key)); - } - for (const key of keyCombo.reverse()) { - await this.page.keyboard.up(this.getPlaywrightKey(key)); - } - } else { - const key = this.getPlaywrightKey(text!); - if (this.isModifierKey(text!)) { - // For modifier keys, use down/up instead of press - await this.page.keyboard.down(key); - await this.page.waitForTimeout(100); - await this.page.keyboard.up(key); - } else { - await this.page.keyboard.press(key); - } - } - } else { - // For typing, add a small delay between characters - await this.page.keyboard.type(text!, { delay: TYPING_DELAY_MS }); - } - // Add a small delay after keyboard actions - await this.page.waitForTimeout(500); - return await this.screenshot(); + return await this.handleMouseAction(action, coordinate); } - // Handle mouse clicks - if ([ - Action.LEFT_CLICK, - Action.RIGHT_CLICK, - Action.DOUBLE_CLICK, - Action.MIDDLE_CLICK, - Action.TRIPLE_CLICK, - Action.LEFT_MOUSE_DOWN, - Action.LEFT_MOUSE_UP, - ].includes(action)) { - this.validateText(text, false, action); - this.validateCoordinate(coordinate, false, action); - - if (!coordinate) { - throw new ToolError(`coordinate is required for ${action}`); - } - - const [x, y] = this.validateAndGetCoordinates(coordinate); - - // Move mouse to position first - await this.page.mouse.move(x, y); - // Add a small delay to ensure the mouse has moved - await this.page.waitForTimeout(100); - - if (action === Action.LEFT_MOUSE_DOWN || action === Action.LEFT_MOUSE_UP) { - if (this.version !== '20250124') { - throw new ToolError(`${action} is only available in version 20250124`); - } - if (action === Action.LEFT_MOUSE_DOWN) { - await this.page.mouse.down(); - } else { - await this.page.mouse.up(); - } - } else { - const button 
= { - [Action.LEFT_CLICK]: 'left' as const, - [Action.RIGHT_CLICK]: 'right' as const, - [Action.MIDDLE_CLICK]: 'middle' as const, - [Action.DOUBLE_CLICK]: 'left' as const, - [Action.TRIPLE_CLICK]: 'left' as const, - }[action]; - - if (action === Action.DOUBLE_CLICK) { - await this.page.mouse.dblclick(x, y, { button }); - } else if (action === Action.TRIPLE_CLICK) { - await this.page.mouse.click(x, y, { button, clickCount: 3 }); - } else { - await this.page.mouse.click(x, y, { button }); - } + if (this.keyboardActions.has(action)) { + if (!text) { + throw new ToolError(`text is required for ${action}`); } - - // Add a delay after clicking to ensure the action is complete - await this.page.waitForTimeout(500); - return await this.screenshot(); + return await this.handleKeyboardAction(action, text, duration); } throw new ToolError(`Invalid action: ${action}`); } - - protected validateText(text: string | undefined, required: boolean, action: string): void { - if (required && text === undefined) { - throw new ToolError(`text is required for ${action}`); - } - if (text !== undefined && typeof text !== 'string') { - throw new ToolError(`${text} must be a string`); - } - } - - protected validateCoordinate(coordinate: [number, number] | undefined, allowed: boolean, action: string): void { - // For mouse actions, coordinates are required - if ([ - Action.LEFT_CLICK, - Action.RIGHT_CLICK, - Action.MIDDLE_CLICK, - Action.DOUBLE_CLICK, - Action.TRIPLE_CLICK, - Action.MOUSE_MOVE, - Action.LEFT_CLICK_DRAG, - Action.LEFT_MOUSE_DOWN, - Action.LEFT_MOUSE_UP, - ].includes(action as Action)) { - if (!coordinate) { - throw new ToolError(`coordinate is required for ${action}`); - } - this.validateAndGetCoordinates(coordinate); - return; - } - - // For other actions, coordinates are not allowed - if (!allowed && coordinate !== undefined) { - throw new ToolError(`coordinate is not accepted for ${action}`); - } - } - - protected validateDuration(duration: number | undefined, action: 
string): void { - if (duration === undefined || typeof duration !== 'number') { - throw new ToolError(`${duration} must be a number`); - } - if (duration < 0) { - throw new ToolError(`${duration} must be non-negative`); - } - if (duration > 100) { - throw new ToolError(`${duration} is too long`); - } - } } // For backward compatibility diff --git a/templates/typescript/computer-use/tools/types/computer.ts b/templates/typescript/computer-use/tools/types/computer.ts new file mode 100644 index 0000000..4d4cb27 --- /dev/null +++ b/templates/typescript/computer-use/tools/types/computer.ts @@ -0,0 +1,64 @@ +export enum Action { + // Mouse actions + MOUSE_MOVE = 'mouse_move', + LEFT_CLICK = 'left_click', + RIGHT_CLICK = 'right_click', + MIDDLE_CLICK = 'middle_click', + DOUBLE_CLICK = 'double_click', + TRIPLE_CLICK = 'triple_click', + LEFT_CLICK_DRAG = 'left_click_drag', + LEFT_MOUSE_DOWN = 'left_mouse_down', + LEFT_MOUSE_UP = 'left_mouse_up', + + // Keyboard actions + KEY = 'key', + TYPE = 'type', + HOLD_KEY = 'hold_key', + + // System actions + SCREENSHOT = 'screenshot', + CURSOR_POSITION = 'cursor_position', + SCROLL = 'scroll', + WAIT = 'wait', +} + +// For backward compatibility +export type Action_20241022 = Action; +export type Action_20250124 = Action; + +export type MouseButton = 'left' | 'right' | 'middle'; +export type ScrollDirection = 'up' | 'down' | 'left' | 'right'; +export type Coordinate = [number, number]; +export type Duration = number; + +export interface ActionParams { + action: Action; + text?: string; + coordinate?: Coordinate; + scrollDirection?: ScrollDirection; + scroll_amount?: number; + scrollAmount?: number; + duration?: Duration; + key?: string; + [key: string]: any; +} + +export interface ToolResult { + output?: string; + error?: string; + base64Image?: string; + system?: string; +} + +export interface BaseAnthropicTool { + name: string; + apiType: string; + toParams(): any; +} + +export class ToolError extends Error { + constructor(message: 
/**
 * Translates key names as they arrive from the model (alias spellings such as
 * "ctrl" or "return", in any letter case) into the key names that Playwright's
 * keyboard API expects.
 *
 * Playwright key names are case-sensitive ("Shift", "Tab", "F5"), so every
 * lookup here is done on a lower-cased copy and resolves to the canonical
 * spelling.
 */
export class KeyboardUtils {
  /** Canonical Playwright modifier key names. */
  private static readonly modifierNames: readonly string[] = ['Control', 'Alt', 'Shift', 'Meta'];

  /**
   * Lower-cased modifier spelling -> canonical Playwright modifier name.
   * The canonical names are listed as well so "shift" / "ALT" resolve too.
   */
  private static readonly modifierKeyMap: ReadonlyMap<string, string> = new Map<string, string>([
    ['ctrl', 'Control'],
    ['command', 'Meta'],
    ['win', 'Meta'],
    ['control', 'Control'],
    ['alt', 'Alt'],
    ['shift', 'Shift'],
    ['meta', 'Meta'],
  ]);

  /**
   * Lower-cased key spelling -> Playwright key name. Holds the two aliases
   * ("return", "space") plus the named (non-character) Playwright keys, so
   * that a differently-cased spelling is canonicalised instead of rejected.
   */
  private static readonly keyMap: ReadonlyMap<string, string> = new Map<string, string>([
    ['return', 'Enter'],
    ['space', ' '],
    ...[
      'Enter',
      'Tab',
      'Escape',
      'Backspace',
      'Delete',
      'Insert',
      'Home',
      'End',
      'PageUp',
      'PageDown',
      'ArrowUp',
      'ArrowDown',
      'ArrowLeft',
      'ArrowRight',
      ...Array.from({ length: 12 }, (_, i) => `F${i + 1}`),
    ].map((name): [string, string] => [name.toLowerCase(), name]),
  ]);

  /**
   * Reports whether `key` names a modifier (Control, Alt, Shift, Meta or one
   * of their aliases), compared case-insensitively.
   *
   * @param key - Key name to test; `undefined` or empty yields `false`.
   * @returns `true` when the key resolves to a Playwright modifier.
   */
  static isModifierKey(key: string | undefined): boolean {
    if (!key) return false;
    const canonical = KeyboardUtils.modifierKeyMap.get(key.toLowerCase()) ?? key;
    return KeyboardUtils.modifierNames.includes(canonical);
  }

  /**
   * Resolves a single key name to its Playwright spelling.
   *
   * @param key - Key name or alias, in any letter case.
   * @returns The canonical Playwright name when the key is known; otherwise
   *   `key` unchanged (single characters keep their case).
   * @throws Error when `key` is `undefined` or empty.
   */
  static getPlaywrightKey(key: string | undefined): string {
    if (!key) {
      throw new Error('Key cannot be undefined');
    }

    const lookup = key.toLowerCase();
    // Map lookups (unlike `in` on an object literal) never match inherited
    // names such as "constructor".
    return KeyboardUtils.keyMap.get(lookup) ?? KeyboardUtils.modifierKeyMap.get(lookup) ?? key;
  }

  /**
   * Splits a "+"-separated combination (e.g. "ctrl+shift+Tab") into
   * Playwright key names, in the order given.
   *
   * @param combo - Combination string; whitespace around each key is ignored.
   * @returns One Playwright key name per segment.
   * @throws Error when `combo` is empty or contains an empty segment.
   */
  static parseKeyCombination(combo: string): string[] {
    if (!combo) {
      throw new Error('Key combination cannot be empty');
    }

    return combo.split('+').map(segment => {
      const token = segment.trim();
      if (!token) {
        throw new Error('Invalid key combination: empty key');
      }
      // Single characters are still lower-cased ("ctrl+A" -> "a"); named keys
      // keep their spelling so getPlaywrightKey can canonicalise them.
      return KeyboardUtils.getPlaywrightKey(token.length === 1 ? token.toLowerCase() : token);
    });
  }
}
/**
 * Stateless argument checks for computer-use tool calls. Every failed check
 * is reported by throwing a `ToolError`.
 */
export class ActionValidator {
  /**
   * Checks the `text` argument of an action.
   *
   * @param text - Value supplied by the caller, if any.
   * @param required - Whether the action cannot run without `text`.
   * @param action - Action name, used only in the error message.
   * @throws ToolError when `text` is required but undefined, or is present
   *   but not a string.
   */
  static validateText(text: string | undefined, required: boolean, action: string): void {
    if (required && text === undefined) {
      throw new ToolError(`text is required for ${action}`);
    }
    if (text !== undefined && typeof text !== 'string') {
      throw new ToolError(`${text} must be a string`);
    }
  }

  /**
   * Checks the `coordinate` argument of an action. A coordinate that is
   * present is always shape-checked, even when it is not required.
   *
   * @param coordinate - Value supplied by the caller, if any.
   * @param required - Whether the action cannot run without a coordinate.
   * @param action - Action name, used only in the error message.
   * @throws ToolError when a required coordinate is missing or a supplied one
   *   is malformed.
   */
  static validateCoordinate(coordinate: Coordinate | undefined, required: boolean, action: string): void {
    if (required && !coordinate) {
      throw new ToolError(`coordinate is required for ${action}`);
    }
    if (coordinate) {
      ActionValidator.validateAndGetCoordinates(coordinate);
    }
  }

  /**
   * Checks the `duration` argument: it must be a real number in [0, 100].
   *
   * @param duration - Value supplied by the caller, if any.
   * @param action - Action name (currently unused; kept for a uniform signature).
   * @throws ToolError when `duration` is missing, not a number, `NaN`,
   *   negative, or greater than 100.
   */
  static validateDuration(duration: Duration | undefined, action: string): void {
    // NaN has typeof 'number' and fails neither range comparison below, so it
    // has to be rejected explicitly.
    if (duration === undefined || typeof duration !== 'number' || Number.isNaN(duration)) {
      throw new ToolError(`${duration} must be a number`);
    }
    if (duration < 0) {
      throw new ToolError(`${duration} must be non-negative`);
    }
    if (duration > 100) {
      throw new ToolError(`${duration} is too long`);
    }
  }

  /**
   * Verifies that `coordinate` is a pair of finite, non-negative numbers.
   *
   * @param coordinate - Candidate `[x, y]` pair.
   * @returns The same `coordinate`, unchanged, once it has passed the checks.
   * @throws ToolError when it is not a 2-element array or an element is not a
   *   finite non-negative number.
   */
  static validateAndGetCoordinates(coordinate: Coordinate): Coordinate {
    if (!Array.isArray(coordinate) || coordinate.length !== 2) {
      throw new ToolError(`${coordinate} must be a tuple of length 2`);
    }
    // Number.isFinite rejects Infinity, which `>= 0` alone would let through.
    const isValid = coordinate.every(value => typeof value === 'number' && Number.isFinite(value) && value >= 0);
    if (!isValid) {
      throw new ToolError(`${coordinate} must be a tuple of non-negative numbers`);
    }
    return coordinate;
  }

  /**
   * Runs every applicable argument check for one tool call.
   *
   * @param params - The call's parameters.
   * @param mouseActions - Actions for which `coordinate` is required.
   * @param keyboardActions - Actions for which `text` is required.
   * @throws ToolError on the first failed check (text, then coordinate, then
   *   duration for `hold_key` / `wait`).
   */
  static validateActionParams(
    params: ActionParams,
    mouseActions: ReadonlySet<Action>,
    keyboardActions: ReadonlySet<Action>,
  ): void {
    const { action, text, coordinate, duration } = params;

    ActionValidator.validateText(text, keyboardActions.has(action), action);
    ActionValidator.validateCoordinate(coordinate, mouseActions.has(action), action);

    if (action === Action.HOLD_KEY || action === Action.WAIT) {
      ActionValidator.validateDuration(duration, action);
    }
  }
}
/**
 * Converts the content of an API response into blocks that can be appended to
 * the conversation as the assistant turn.
 *
 * - Text blocks are reduced to `{ type, text }`; empty text blocks are dropped
 *   rather than echoed back.
 * - Thinking blocks keep their fields; `signature` is included only when set.
 * - Every other block is passed through unchanged.
 *
 * @param response - Message returned by the API.
 * @returns The blocks to send back, in their original order.
 */
export function responseToParams(response: BetaMessage): BetaContentBlock[] {
  return response.content.flatMap((block): BetaContentBlock[] => {
    if (block.type === 'text') {
      return block.text ? [{ type: 'text', text: block.text }] : [];
    }
    if (block.type === 'thinking') {
      const { thinking, signature, ...rest } = block;
      return [{ ...rest, thinking, ...(signature && { signature }) }];
    }
    return [block];
  });
}

/**
 * Removes the oldest screenshots from tool results so that roughly
 * `imagesToKeep` remain. Mutates the `tool_result` blocks inside `messages`.
 *
 * The number removed is rounded down to a multiple of `minRemovalThreshold`,
 * so images are dropped in chunks rather than one per call.
 *
 * @param messages - Conversation history; edited in place.
 * @param imagesToKeep - Target number of images to retain; `0` disables filtering.
 * @param minRemovalThreshold - Chunk size for removals. A value that is not
 *   positive is treated as `1` (remove exactly the excess).
 */
export function maybeFilterToNMostRecentImages(
  messages: BetaMessageParam[],
  imagesToKeep: number,
  minRemovalThreshold: number
): void {
  if (!imagesToKeep) return;

  const toolResults = messages
    .flatMap(message => (Array.isArray(message?.content) ? message.content : []))
    .filter((item): item is BetaToolResultBlock =>
      typeof item === 'object' && item.type === 'tool_result'
    );

  let totalImages = 0;
  for (const toolResult of toolResults) {
    if (!Array.isArray(toolResult.content)) continue;
    totalImages += toolResult.content.filter(
      part => typeof part === 'object' && part.type === 'image'
    ).length;
  }

  // A zero threshold would make the division below produce NaN/Infinity and
  // silently disable filtering; a negative one could over-count.
  const chunkSize = minRemovalThreshold > 0 ? minRemovalThreshold : 1;
  let imagesToRemove = Math.floor((totalImages - imagesToKeep) / chunkSize) * chunkSize;

  // Tool results are visited oldest-first, so the earliest images go first.
  for (const toolResult of toolResults) {
    if (!Array.isArray(toolResult.content)) continue;
    toolResult.content = toolResult.content.filter(part => {
      const isImage = typeof part === 'object' && part.type === 'image';
      if (isImage && imagesToRemove > 0) {
        imagesToRemove--;
        return false;
      }
      return true;
    });
  }
}

const PROMPT_CACHING_BETA_FLAG = 'prompt-caching-2024-07-31';

/**
 * Marks the last content block of the three most recent array-content user
 * messages with an ephemeral `cache_control` breakpoint, and clears the marker
 * from the next older such message. Mutates `messages` in place.
 *
 * Messages older than that are not visited, and user messages whose content
 * is a plain string are skipped.
 *
 * @param messages - Conversation history; edited in place.
 */
export function injectPromptCaching(messages: BetaMessageParam[]): void {
  let breakpointsRemaining = 3;

  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i];
    if (!message) continue;
    if (message.role !== 'user' || !Array.isArray(message.content)) continue;

    const lastContent = message.content[message.content.length - 1];

    if (breakpointsRemaining > 0) {
      breakpointsRemaining--;
      if (lastContent) {
        lastContent.cache_control = { type: 'ephemeral' };
      }
      continue;
    }

    // Out of breakpoints: clear the marker a previous call may have left on
    // this message, then stop.
    if (lastContent) {
      delete lastContent.cache_control;
    }
    break;
  }
}

export { PROMPT_CACHING_BETA_FLAG };
Modify prompt --- templates/typescript/computer-use/index.ts | 3 ++- templates/typescript/computer-use/loop.ts | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/templates/typescript/computer-use/index.ts b/templates/typescript/computer-use/index.ts index ce46d60..3b649b6 100644 --- a/templates/typescript/computer-use/index.ts +++ b/templates/typescript/computer-use/index.ts @@ -45,10 +45,11 @@ app.action( model: 'claude-sonnet-4-20250514', messages: [{ role: 'user', - content: payload.query + content: payload.query, }], errorResponseCallback, apiKey: process.env.ANTHROPIC_API_KEY || '', + thinkingBudget: 1024, playwrightPage: page, }); diff --git a/templates/typescript/computer-use/loop.ts b/templates/typescript/computer-use/loop.ts index 9ea4ef9..0b0ec24 100644 --- a/templates/typescript/computer-use/loop.ts +++ b/templates/typescript/computer-use/loop.ts @@ -11,14 +11,21 @@ import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; const SYSTEM_PROMPT = ` * You are utilising an Ubuntu virtual machine using ${process.arch} architecture with internet access. * When you connect to the display, CHROMIUM IS ALREADY OPEN. The url bar is not visible but it is there. -* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. -* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* If you need to navigate to a new page, use ctrl+l to focus the url bar and then enter the url. +* You won't be able to see the url bar from the screenshot but ctrl-l still works. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. 
+* Either that, or make sure you scroll down to see everything before deciding something isn't available. +* When using your computer function calls, they take a while to run and send back to you. +* Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. +* After each step, take a screenshot and carefully evaluate if you have achieved the right outcome. +* Explicitly show your thinking: "I have evaluated step X..." If not correct, try again. +* Only when you confirm a step was executed correctly should you move on to the next one. -* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the search bar on the center of the screenwhere it says "Search or enter address", and enter the appropriate search term or URL there. -* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly. +* When using Chromium, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". +* Instead, click on the search bar on the center of the screen where it says "Search or enter address", and enter the appropriate search term or URL there. 
`; export async function samplingLoop({ From 59be9b0e56c523811f6f170e09a762771a1f8738 Mon Sep 17 00:00:00 2001 From: Catherine Jue Date: Wed, 28 May 2025 16:33:50 -0400 Subject: [PATCH 7/8] Lint and CLI --- index.ts | 17 ++- templates/typescript/computer-use/index.ts | 14 +- templates/typescript/computer-use/loop.ts | 142 ++++++++++-------- .../typescript/computer-use/package.json | 27 ++-- .../computer-use/tools/collection.ts | 7 +- .../typescript/computer-use/tools/computer.ts | 2 +- .../computer-use/tools/types/computer.ts | 4 +- .../computer-use/tools/utils/keyboard.ts | 13 ++ .../computer-use/tools/utils/validator.ts | 4 +- .../typescript/computer-use/types/beta.ts | 53 ++++--- .../computer-use/utils/message-processing.ts | 6 +- templates/typescript/stagehand/index.ts | 10 +- 12 files changed, 176 insertions(+), 123 deletions(-) diff --git a/index.ts b/index.ts index 5201291..6727c41 100644 --- a/index.ts +++ b/index.ts @@ -17,7 +17,8 @@ type TemplateKey = | "sample-app" | "browser-use" | "stagehand" - | "persistent-browser"; + | "persistent-browser" + | "computer-use"; type LanguageInfo = { name: string; shorthand: string }; type TemplateInfo = { name: string; @@ -32,6 +33,7 @@ const TEMPLATE_SAMPLE_APP = "sample-app"; const TEMPLATE_BROWSER_USE = "browser-use"; const TEMPLATE_STAGEHAND = "stagehand"; const TEMPLATE_PERSISTENT_BROWSER = "persistent-browser"; +const TEMPLATE_COMPUTER_USE = "computer-use"; const LANGUAGE_SHORTHAND_TS = "ts"; const LANGUAGE_SHORTHAND_PY = "py"; @@ -66,6 +68,11 @@ const TEMPLATES: Record = { "Implements a persistent browser that maintains state across invocations", languages: [LANGUAGE_TYPESCRIPT], }, + [TEMPLATE_COMPUTER_USE]: { + name: "Computer Use", + description: "Implements the Anthropic Computer Use SDK", + languages: [LANGUAGE_TYPESCRIPT], + }, }; const INVOKE_SAMPLES: Record< @@ -79,6 +86,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke ts-stagehand stagehand-task --payload \'{"query": "Best wired earbuds"}\'', 
[TEMPLATE_PERSISTENT_BROWSER]: 'kernel invoke ts-persistent-browser persistent-browser-task --payload \'{"url": "https://news.ycombinator.com/"}\'', + [TEMPLATE_COMPUTER_USE]: + 'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -299,10 +308,12 @@ function printNextSteps( ): void { // Determine which sample command to show based on language and template const deployCommand = - language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_SAMPLE_APP + language === LANGUAGE_TYPESCRIPT && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_PERSISTENT_BROWSER) ? "kernel deploy index.ts" : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_STAGEHAND ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" + : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE + ? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX" : language === LANGUAGE_PYTHON && template === TEMPLATE_SAMPLE_APP ? "kernel deploy main.py" : language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE @@ -341,7 +352,7 @@ program ) .option( "-t, --template