/**
 * Test script for the browser-use agent.
 *
 * Runs the agent on browser tasks one at a time, writing full event traces
 * to files for analysis. Each task produces a trace file in debug/browser-agent-traces/.
 *
 * Usage:
 *   bun agents/browser-use/browser-use.test.ts [taskIndex]
 *
 * If taskIndex is provided, runs only that task (0-based). Otherwise runs all tasks.
 */

import * as fs from 'fs'
import * as path from 'path'

import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk'

import type { AgentDefinition } from '@codebuff/sdk'

/** Directory, relative to the current working directory, that receives one JSON trace per task run. */
const TRACE_DIR = path.join(process.cwd(), 'debug', 'browser-agent-traces')

/** A single browser task to hand to the agent. */
interface TaskDefinition {
  /** Short identifier; also used as part of the trace file name. */
  name: string
  /** Instruction sent to the agent as its prompt. */
  prompt: string
  /** Optional URL forwarded to the agent as the `url` run parameter. */
  url?: string
}

/** Tasks run by this script, addressed on the command line by their 0-based position. */
const TASKS: TaskDefinition[] = [
  {
    name: 'wikipedia-search',
    prompt:
      'Navigate to Wikipedia, search for "TypeScript programming language", and tell me the first sentence of the article.',
    url: 'https://en.wikipedia.org',
  },
  {
    name: 'hacker-news-top',
    prompt:
      'Navigate to Hacker News and tell me the titles of the top 3 stories on the front page.',
    url: 'https://news.ycombinator.com',
  },
  {
    name: 'example-form',
    prompt:
      'Navigate to https://httpbin.org/forms/post and fill out the form with: customer name "Test User", telephone "555-1234", size "Medium", topping "Bacon", and submit the form. Report what the server response shows.',
    url: 'https://httpbin.org/forms/post',
  },
]

/** One agent event as stored in a trace file. */
interface TraceEvent {
  /** ISO-8601 time at which the event was recorded by this script. */
  timestamp: string
  /** The event's `type` discriminator. */
  type: string
  /** The complete event object as received from the SDK. */
  data: Record<string, unknown>
}

/**
 * Runs a single browser task through the `browser-use` agent and saves its trace.
 *
 * Every event emitted during the run is recorded with a timestamp and echoed to
 * the console in abbreviated form. Once the run settles, the task, duration,
 * final output and recorded events are written as pretty-printed JSON into
 * `TRACE_DIR` (created on demand).
 *
 * @param client - SDK client used to execute the agent.
 * @param task - Task to run; `task.url`, when set, is passed as the `url` param.
 * @param agentDefinitions - Agent definitions forwarded to the run.
 * @param taskIndex - Position of the task; used only for console labelling.
 * @returns Whether the run succeeded, the trace file path, and the raw output.
 *   A run counts as successful only when it produced an output whose type is
 *   not `'error'`.
 * @throws Re-throws whatever `client.run` rejects with, after attempting to
 *   save a partial trace of the events recorded so far.
 */
async function runTask(
  client: CodebuffClient,
  task: TaskDefinition,
  agentDefinitions: AgentDefinition[],
  taskIndex: number,
): Promise<{ success: boolean; traceFile: string; output: unknown }> {
  const events: TraceEvent[] = []
  const startTime = Date.now()
  const banner = '='.repeat(60)
  const rule = '─'.repeat(60)

  const elapsedSeconds = (): string =>
    ((Date.now() - startTime) / 1000).toFixed(1)

  // Serializes everything recorded so far and returns the path written.
  const writeTrace = (
    duration: string,
    output: unknown,
    error?: string,
  ): string => {
    const trace = {
      task: {
        name: task.name,
        prompt: task.prompt,
        url: task.url,
      },
      duration: `${duration}s`,
      output,
      // Only present when the run itself threw.
      ...(error === undefined ? {} : { error }),
      eventCount: events.length,
      events,
    }

    // ':' and '.' are replaced so the timestamp is usable in a file name.
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
    const traceFile = path.join(TRACE_DIR, `${timestamp}_${task.name}.json`)
    // Do not depend on the caller having created the directory.
    fs.mkdirSync(TRACE_DIR, { recursive: true })
    fs.writeFileSync(traceFile, JSON.stringify(trace, null, 2))
    return traceFile
  }

  console.log(`\n${banner}`)
  console.log(`Task ${taskIndex}: ${task.name}`)
  console.log(`Prompt: ${task.prompt}`)
  console.log(`${banner}\n`)

  let runState: Awaited<ReturnType<CodebuffClient['run']>>
  try {
    runState = await client.run({
      agent: 'browser-use',
      prompt: task.prompt,
      params: task.url ? { url: task.url } : undefined,
      agentDefinitions,
      maxAgentSteps: 30,
      handleEvent: (event) => {
        events.push({
          timestamp: new Date().toISOString(),
          type: event.type,
          data: event as Record<string, unknown>,
        })

        switch (event.type) {
          case 'text':
            process.stdout.write(event.text ?? '')
            break
          case 'tool_call':
            console.log(`\n[Tool Call] ${event.toolName}`)
            break
          case 'tool_result': {
            // JSON.stringify yields undefined for an undefined value.
            const serialized = JSON.stringify(event.output) ?? 'undefined'
            // Add the ellipsis only when something was actually cut off.
            const preview =
              serialized.length > 200
                ? `${serialized.slice(0, 200)}...`
                : serialized
            console.log(`[Tool Result] ${preview}`)
            break
          }
          case 'error':
            console.error(`[Error] ${event.message}`)
            break
          case 'subagent_start':
            console.log(`[Subagent Start] ${event.agentType}`)
            break
          case 'subagent_finish':
            console.log(`[Subagent Finish] ${event.agentType}`)
            break
        }
      },
    })
  } catch (err: unknown) {
    // Keep the events gathered so far; they are the main debugging aid.
    const reason = err instanceof Error ? err.message : String(err)
    try {
      const partialTrace = writeTrace(elapsedSeconds(), undefined, reason)
      console.error(`[Error] Run aborted: ${reason}`)
      console.error(`Partial trace: ${partialTrace}`)
    } catch (writeErr: unknown) {
      // Never let a trace-write failure mask the original error.
      console.error('Failed to write partial trace:', writeErr)
    }
    throw err
  }

  const duration = elapsedSeconds()
  const output = runState.output
  const traceFile = writeTrace(duration, output)

  // A run that yields no output at all is not a success.
  const success = output != null && output.type !== 'error'

  console.log(`\n${rule}`)
  console.log(`Result: ${success ? '✅ SUCCESS' : '❌ FAILURE'}`)
  console.log(`Duration: ${duration}s`)
  console.log(`Events: ${events.length}`)
  console.log(`Trace: ${traceFile}`)

  if (output?.type === 'error') {
    console.log(`Error: ${output.message}`)
  } else if (output?.type === 'structuredOutput') {
    const data = output.value as Record<string, unknown> | null
    console.log(`Status: ${data?.overallStatus}`)
    console.log(`Summary: ${data?.summary}`)
    if (data && Array.isArray(data.lessons) && data.lessons.length > 0) {
      console.log(`Lessons:`)
      for (const lesson of data.lessons) {
        console.log(` - ${lesson}`)
      }
    }
  }
  console.log(rule)

  return { success, traceFile, output }
}

/**
 * Script entry point: selects the tasks to run, loads the local agents, runs
 * each selected task in sequence and prints a pass/fail summary.
 *
 * The optional first command-line argument is a 0-based index into `TASKS`;
 * without it every task runs. The process exits with status 1 when the index
 * is invalid or the `browser-use` agent cannot be found, and the exit code is
 * set to 1 when at least one task did not succeed.
 */
async function main(): Promise<void> {
  fs.mkdirSync(TRACE_DIR, { recursive: true })

  const taskIndexArg = process.argv[2]
  let tasksToRun: Array<{ task: TaskDefinition; index: number }>
  if (taskIndexArg === undefined) {
    tasksToRun = TASKS.map((task, index) => ({ task, index }))
  } else {
    // Accept plain non-negative integers only; input such as "1abc" is rejected.
    const index = /^\d+$/.test(taskIndexArg) ? Number(taskIndexArg) : NaN
    const task = TASKS[index]
    if (!task) {
      console.error(
        `Invalid task index: ${taskIndexArg}. Available: 0-${TASKS.length - 1}`,
      )
      process.exit(1)
    }
    tasksToRun = [{ task, index }]
  }

  const agents = await loadLocalAgents({
    agentsPath: path.join(process.cwd(), 'agents'),
    verbose: true,
  })
  const agentDefinitions = Object.values(agents) as AgentDefinition[]

  const browserAgent = agentDefinitions.find((a) => a.id === 'browser-use')
  if (!browserAgent) {
    console.error('browser-use agent not found in agents/ directory')
    process.exit(1)
  }
  console.log(`Loaded browser-use agent (model: ${browserAgent.model})`)

  const client = new CodebuffClient({
    apiKey: process.env.CODEBUFF_API_KEY,
    cwd: process.cwd(),
  })

  const results: Array<{ name: string; success: boolean; traceFile: string }> =
    []

  // Tasks run one at a time so console output and traces do not interleave.
  for (const { task, index } of tasksToRun) {
    const result = await runTask(client, task, agentDefinitions, index)
    results.push({
      name: task.name,
      success: result.success,
      traceFile: result.traceFile,
    })
  }

  const banner = '='.repeat(60)
  console.log(`\n${banner}`)
  console.log('SUMMARY')
  console.log(banner)
  for (const r of results) {
    console.log(` ${r.success ? '✅' : '❌'} ${r.name} → ${r.traceFile}`)
  }
  const passed = results.filter((r) => r.success).length
  console.log(`\n${passed}/${results.length} tasks passed`)

  // Report failures through the exit status so callers and CI can detect them.
  if (passed < results.length) {
    process.exitCode = 1
  }
}

// Run only when this file is the program's entry module, not when imported.
if (import.meta.main) {
  main().catch((err: unknown) => {
    console.error('Fatal error:', err)
    process.exit(1)
  })
}
// (diff-view page footer, not part of the script: "0 commit comments")