From 5aab401c94f5268953a77611cbbeec1840b39e06 Mon Sep 17 00:00:00 2001 From: James Pine Date: Fri, 6 Mar 2026 22:51:19 -0800 Subject: [PATCH 1/2] fix: improve browser tool instructions and element resolution - Rewrite browser section in worker prompt with parameter table, concrete JSON examples, and explicit act_kind emphasis - Add multi-strategy element selector (native tags, aria-label, title attr) to handle implicit ARIA roles on native HTML elements - Improve all error messages to include valid values and examples - Update tool description to front-load act_kind requirement --- prompts/en/tools/browser_description.md.j2 | 2 +- prompts/en/worker.md.j2 | 60 ++++++++--- src/tools/browser.rs | 117 +++++++++++++++++---- 3 files changed, 143 insertions(+), 36 deletions(-) diff --git a/prompts/en/tools/browser_description.md.j2 b/prompts/en/tools/browser_description.md.j2 index 26500c354..09b471411 100644 --- a/prompts/en/tools/browser_description.md.j2 +++ b/prompts/en/tools/browser_description.md.j2 @@ -1 +1 @@ -Browser automation tool. Launch a headless Chrome browser, navigate pages, interact with elements, take screenshots, and extract page content. Workflow: launch → navigate → snapshot (get element refs) → act (click/type by ref) → screenshot. Element refs like "e1", "e2" are assigned during snapshot and used in act calls. \ No newline at end of file +Browser automation tool. Workflow: launch → navigate → snapshot → act → close. The `act` action REQUIRES `act_kind` (click, type, press_key, hover, scroll_into_view, focus) and an `element_ref` from the last snapshot (`element_ref` is optional only for press_key, which requires `key` instead). Example: {"action": "act", "act_kind": "click", "element_ref": "e3"}. Always snapshot before acting — refs reset on navigation. Use `navigate` to go to URLs (not `open`, which creates new tabs). 
\ No newline at end of file diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index 111f0b18f..f4579e10b 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -93,24 +93,60 @@ Run a subprocess with specific arguments. Use this for programs that need struct Automate a headless Chrome browser. Use this for web scraping, testing web interfaces, filling out forms, or any task requiring browser interaction. -**Workflow:** - -1. `launch` — Start the browser -2. `navigate` — Go to a URL -3. `snapshot` — Get the page's accessibility tree with element refs (e1, e2, e3...) -4. `act` — Interact with elements by ref: `click`, `type`, `press_key`, `hover`, `scroll_into_view`, `focus` -5. `screenshot` — Capture the page or a specific element +**Workflow:** launch → navigate → snapshot → act → (repeat snapshot/act as needed) → close + +**Actions:** + +| Action | Required params | Description | +|--------|----------------|-------------| +| `launch` | — | Start or reconnect to the browser. Always call first. | +| `navigate` | `url` | Go to a URL in the active tab. | +| `open` | `url` (optional) | Open a **new** tab. Don't use this to navigate — use `navigate` instead. | +| `tabs` | — | List all open tabs. | +| `focus` | `target_id` | Switch to a tab by target ID. | +| `close_tab` | `target_id` (optional) | Close a tab (active tab if omitted). | +| `snapshot` | — | Get the accessibility tree with interactive element refs. | +| `act` | `act_kind`, `element_ref` | Interact with an element. **`act_kind` is mandatory.** | +| `screenshot` | `full_page` (optional) | Capture the viewport (or full page). | +| `evaluate` | `script` | Run JavaScript in the page. Last resort — prefer snapshot+act. | +| `content` | — | Get page HTML (large, use sparingly). | {%- if browser_persist_session %} -6. `close` — Detach from the browser when done (tabs and session are preserved for the next worker) +| `close` | — | Detach from the browser. 
Tabs and session are preserved for the next worker. | {%- else %} -6. `close` — Shut down the browser when done +| `close` | — | Shut down the browser when done. | {%- endif %} -**Multi-tab support:** Use `open` to create new tabs, `tabs` to list them, `focus` to switch between them, `close_tab` to close one. +**The `act` action — IMPORTANT:** + +The `act_kind` parameter is **always required** when `action` is `act`. Valid values: + +- `click` — Click the element. Requires: `element_ref`. +- `type` — Type text into the element. Requires: `element_ref`, `text`. +- `press_key` — Press a key (e.g., "Enter", "Tab", "Escape"). Requires: `key`. Optional: `element_ref`. +- `hover` — Hover over the element. Requires: `element_ref`. +- `scroll_into_view` — Scroll element into viewport. Requires: `element_ref`. +- `focus` — Focus the element. Requires: `element_ref`. + +Examples: +```json +{"action": "act", "act_kind": "click", "element_ref": "e3"} +{"action": "act", "act_kind": "type", "element_ref": "e5", "text": "hello@example.com"} +{"action": "act", "act_kind": "press_key", "key": "Enter"} +``` + +**Element refs:** + +- Refs like `e0`, `e1`, `e2` are assigned by `snapshot` and reset on each snapshot or navigation. +- Always run `snapshot` before using `act` — stale refs will fail. +- If an `act` call fails with "Could not find node", run `snapshot` again to get fresh refs, then retry with the new ref. +- Don't pass `url` or `text` when you mean `act_kind` — these are different parameters. -**Element refs** are assigned during `snapshot` and look like "e1", "e2". Always snapshot before interacting — refs reset on each snapshot or navigation. +**Common mistakes to avoid:** -**Additional actions:** `content` (get page HTML), `evaluate` (run JavaScript, if enabled in config). +- Calling `act` without `act_kind` — this will always error. Every `act` call needs an `act_kind`. +- Using `open` to navigate — `open` creates a new tab. 
Use `navigate` to go to a URL in the current tab. +- Using `evaluate` to click buttons — use `snapshot` + `act` with `act_kind: "click"` instead. Only use `evaluate` when the accessibility tree doesn't expose what you need. +- Retrying the exact same failed call — read the error, fix the parameters, then retry. ### secret_set diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 90186b262..6c310886d 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -972,7 +972,11 @@ impl BrowserTool { key: Option, ) -> Result { let Some(act_kind) = act_kind else { - return Err(BrowserError::new("act_kind is required for act action")); + return Err(BrowserError::new( + "act_kind is required for act action — must be one of: \ + click, type, press_key, hover, scroll_into_view, focus. \ + Example: {\"action\": \"act\", \"act_kind\": \"click\", \"element_ref\": \"e3\"}", + )); }; let state = self.state.lock().await; @@ -989,7 +993,10 @@ impl BrowserTool { } ActKind::Type => { let Some(text) = text else { - return Err(BrowserError::new("text is required for act:type")); + return Err(BrowserError::new( + "text is required for act_kind: \"type\" — \ + example: {\"action\": \"act\", \"act_kind\": \"type\", \"element_ref\": \"e5\", \"text\": \"hello\"}", + )); }; let element = self.resolve_element_ref(&state, page, element_ref).await?; element @@ -1007,7 +1014,10 @@ impl BrowserTool { } ActKind::PressKey => { let Some(key) = key else { - return Err(BrowserError::new("key is required for act:press_key")); + return Err(BrowserError::new( + "key is required for act_kind: \"press_key\" — \ + example: {\"action\": \"act\", \"act_kind\": \"press_key\", \"key\": \"Enter\"}", + )); }; if element_ref.is_some() { let element = self.resolve_element_ref(&state, page, element_ref).await?; @@ -1331,24 +1341,40 @@ impl BrowserTool { element_ref: Option, ) -> Result { let Some(ref_id) = element_ref else { - return Err(BrowserError::new("element_ref is required for this action")); + 
return Err(BrowserError::new( + "element_ref is required for this action — run snapshot first, \ + then use a ref like \"e0\", \"e1\" from the results", + )); }; let elem_ref = state.element_refs.get(&ref_id).ok_or_else(|| { BrowserError::new(format!( - "unknown element ref '{ref_id}' — run snapshot first to get element refs" + "unknown element ref '{ref_id}' — run snapshot first to get fresh element refs" )) })?; - // Use backend_node_id to find the element via CSS selector derived from role+name, - // or fall back to XPath with aria role and name attributes - let selector = build_selector_for_ref(elem_ref); + // Try multiple selector strategies. The accessibility tree role doesn't + // always map to an explicit [role] attribute in the DOM — native HTML + // elements (button, input, a) have implicit ARIA roles without the + // attribute being present. + let selectors = build_selectors_for_ref(elem_ref); - page.find_element(&selector).await.map_err(|error| { - BrowserError::new(format!( - "failed to find element for ref '{ref_id}' (selector: {selector}): {error}" - )) - }) + for selector in &selectors { + if let Ok(element) = page.find_element(selector).await { + return Ok(element); + } + } + + Err(BrowserError::new(format!( + "failed to find element for ref '{ref_id}' \ + (role: {}, name: {:?}) — the page may have changed since \ + the last snapshot. Run snapshot again to get fresh refs, \ + then retry with the updated ref. \ + Selectors tried: {}", + elem_ref.role, + elem_ref.name, + selectors.join(", "), + ))) } } @@ -1387,18 +1413,63 @@ fn extract_ax_value_string( .and_then(|v| v.as_str().map(|s| s.to_string())) } -/// Build a CSS selector from an ElementRef's role and name. -fn build_selector_for_ref(elem_ref: &ElementRef) -> String { - // Use ARIA role attribute as primary selector, with name for disambiguation - let role_selector = format!("[role='{}']", elem_ref.role); +/// Map an accessibility tree role to its native HTML tag equivalent. 
+/// Native elements don't have explicit `[role]` attributes — a `