From 5aab401c94f5268953a77611cbbeec1840b39e06 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Fri, 6 Mar 2026 22:51:19 -0800
Subject: [PATCH 1/2] fix: improve browser tool instructions and element
 resolution

- Rewrite browser section in worker prompt with parameter table,
  concrete JSON examples, and explicit act_kind emphasis
- Add multi-strategy element selector (native tags, aria-label,
  title attr) to handle implicit ARIA roles on native HTML elements
- Improve all error messages to include valid values and examples
- Update tool description to front-load act_kind requirement
---
 prompts/en/tools/browser_description.md.j2 |   2 +-
 prompts/en/worker.md.j2                    |  60 ++++++++---
 src/tools/browser.rs                       | 117 +++++++++++++++++----
 3 files changed, 143 insertions(+), 36 deletions(-)

diff --git a/prompts/en/tools/browser_description.md.j2 b/prompts/en/tools/browser_description.md.j2
index 26500c354..09b471411 100644
--- a/prompts/en/tools/browser_description.md.j2
+++ b/prompts/en/tools/browser_description.md.j2
@@ -1 +1 @@
-Browser automation tool. Launch a headless Chrome browser, navigate pages, interact with elements, take screenshots, and extract page content. Workflow: launch → navigate → snapshot (get element refs) → act (click/type by ref) → screenshot. Element refs like "e1", "e2" are assigned during snapshot and used in act calls.
\ No newline at end of file
+Browser automation tool. Workflow: launch → navigate → snapshot → act → close. The `act` action REQUIRES `act_kind` (click, type, press_key, hover, scroll_into_view, focus) and `element_ref` from the last snapshot. Example: {"action": "act", "act_kind": "click", "element_ref": "e3"}. Always snapshot before acting — refs reset on navigation. Use `navigate` to go to URLs (not `open`, which creates new tabs).
\ No newline at end of file
diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2
index 111f0b18f..f4579e10b 100644
--- a/prompts/en/worker.md.j2
+++ b/prompts/en/worker.md.j2
@@ -93,24 +93,60 @@ Run a subprocess with specific arguments. Use this for programs that need struct
 
 Automate a headless Chrome browser. Use this for web scraping, testing web interfaces, filling out forms, or any task requiring browser interaction.
 
-**Workflow:**
-
-1. `launch` — Start the browser
-2. `navigate` — Go to a URL
-3. `snapshot` — Get the page's accessibility tree with element refs (e1, e2, e3...)
-4. `act` — Interact with elements by ref: `click`, `type`, `press_key`, `hover`, `scroll_into_view`, `focus`
-5. `screenshot` — Capture the page or a specific element
+**Workflow:** launch → navigate → snapshot → act → (repeat snapshot/act as needed) → close
+
+**Actions:**
+
+| Action | Required params | Description |
+|--------|----------------|-------------|
+| `launch` | — | Start or reconnect to the browser. Always call first. |
+| `navigate` | `url` | Go to a URL in the active tab. |
+| `open` | `url` (optional) | Open a **new** tab. Don't use this to navigate — use `navigate` instead. |
+| `tabs` | — | List all open tabs. |
+| `focus` | `target_id` | Switch to a tab by target ID. |
+| `close_tab` | `target_id` (optional) | Close a tab (active tab if omitted). |
+| `snapshot` | — | Get the accessibility tree with interactive element refs. |
+| `act` | `act_kind`, `element_ref` (optional for `press_key`) | Interact with an element. **`act_kind` is mandatory.** |
+| `screenshot` | `full_page`, `element_ref` (both optional) | Capture the viewport, the full page, or a single element by ref. |
+| `evaluate` | `script` | Run JavaScript in the page. Last resort — prefer snapshot+act. |
+| `content` | — | Get page HTML (large, use sparingly). |
 {%- if browser_persist_session %}
-6. `close` — Detach from the browser when done (tabs and session are preserved for the next worker)
+| `close` | — | Detach from the browser. Tabs and session are preserved for the next worker. |
 {%- else %}
-6. `close` — Shut down the browser when done
+| `close` | — | Shut down the browser when done. |
 {%- endif %}
 
-**Multi-tab support:** Use `open` to create new tabs, `tabs` to list them, `focus` to switch between them, `close_tab` to close one.
+**The `act` action — IMPORTANT:**
+
+The `act_kind` parameter is **always required** when `action` is `act`. Valid values:
+
+- `click` — Click the element. Requires: `element_ref`.
+- `type` — Type text into the element. Requires: `element_ref`, `text`.
+- `press_key` — Press a key (e.g., "Enter", "Tab", "Escape"). Requires: `key`. Optional: `element_ref`.
+- `hover` — Hover over the element. Requires: `element_ref`.
+- `scroll_into_view` — Scroll element into viewport. Requires: `element_ref`.
+- `focus` — Focus the element. Requires: `element_ref`.
+
+Examples:
+```json
+{"action": "act", "act_kind": "click", "element_ref": "e3"}
+{"action": "act", "act_kind": "type", "element_ref": "e5", "text": "hello@example.com"}
+{"action": "act", "act_kind": "press_key", "key": "Enter"}
+```
+
+**Element refs:**
+
+- Refs like `e0`, `e1`, `e2` are assigned by `snapshot` and reset on each snapshot or navigation.
+- Always run `snapshot` before using `act` — stale refs will fail.
+- If an `act` call fails because the element can't be found (e.g. "Could not find node", "failed to find element", "Element not found"), run `snapshot` again to get fresh refs, then retry with the new ref.
+- Don't pass `url` or `text` when you mean `act_kind` — these are different parameters.
 
-**Element refs** are assigned during `snapshot` and look like "e1", "e2". Always snapshot before interacting — refs reset on each snapshot or navigation.
+**Common mistakes to avoid:**
 
-**Additional actions:** `content` (get page HTML), `evaluate` (run JavaScript, if enabled in config).
+- Calling `act` without `act_kind` — this will always error. Every `act` call needs an `act_kind`.
+- Using `open` to navigate — `open` creates a new tab. Use `navigate` to go to a URL in the current tab.
+- Using `evaluate` to click buttons — use `snapshot` + `act` with `act_kind: "click"` instead. Only use `evaluate` when the accessibility tree doesn't expose what you need.
+- Retrying the exact same failed call — read the error, fix the parameters, then retry.
 
 ### secret_set
 
diff --git a/src/tools/browser.rs b/src/tools/browser.rs
index 90186b262..6c310886d 100644
--- a/src/tools/browser.rs
+++ b/src/tools/browser.rs
@@ -972,7 +972,11 @@ impl BrowserTool {
         key: Option<String>,
     ) -> Result<BrowserOutput, BrowserError> {
         let Some(act_kind) = act_kind else {
-            return Err(BrowserError::new("act_kind is required for act action"));
+            return Err(BrowserError::new(
+                "act_kind is required for act action — must be one of: \
+                 click, type, press_key, hover, scroll_into_view, focus. \
+                 Example: {\"action\": \"act\", \"act_kind\": \"click\", \"element_ref\": \"e3\"}",
+            ));
         };
 
         let state = self.state.lock().await;
@@ -989,7 +993,10 @@ impl BrowserTool {
             }
             ActKind::Type => {
                 let Some(text) = text else {
-                    return Err(BrowserError::new("text is required for act:type"));
+                    return Err(BrowserError::new(
+                        "text is required for act_kind: \"type\" — \
+                         example: {\"action\": \"act\", \"act_kind\": \"type\", \"element_ref\": \"e5\", \"text\": \"hello\"}",
+                    ));
                 };
                 let element = self.resolve_element_ref(&state, page, element_ref).await?;
                 element
@@ -1007,7 +1014,10 @@ impl BrowserTool {
             }
             ActKind::PressKey => {
                 let Some(key) = key else {
-                    return Err(BrowserError::new("key is required for act:press_key"));
+                    return Err(BrowserError::new(
+                        "key is required for act_kind: \"press_key\" — \
+                         example: {\"action\": \"act\", \"act_kind\": \"press_key\", \"key\": \"Enter\"}",
+                    ));
                 };
                 if element_ref.is_some() {
                     let element = self.resolve_element_ref(&state, page, element_ref).await?;
@@ -1331,24 +1341,40 @@ impl BrowserTool {
         element_ref: Option<String>,
     ) -> Result<chromiumoxide::Element, BrowserError> {
         let Some(ref_id) = element_ref else {
-            return Err(BrowserError::new("element_ref is required for this action"));
+            return Err(BrowserError::new(
+                "element_ref is required for this action — run snapshot first, \
+                 then use a ref like \"e0\", \"e1\" from the results",
+            ));
         };
 
         let elem_ref = state.element_refs.get(&ref_id).ok_or_else(|| {
             BrowserError::new(format!(
-                "unknown element ref '{ref_id}' — run snapshot first to get element refs"
+                "unknown element ref '{ref_id}' — run snapshot first to get fresh element refs"
             ))
         })?;
 
-        // Use backend_node_id to find the element via CSS selector derived from role+name,
-        // or fall back to XPath with aria role and name attributes
-        let selector = build_selector_for_ref(elem_ref);
+        // Try multiple selector strategies. The accessibility tree role doesn't
+        // always map to an explicit [role] attribute in the DOM — native HTML
+        // elements (button, input, a) have implicit ARIA roles without the
+        // attribute being present.
+        let selectors = build_selectors_for_ref(elem_ref);
 
-        page.find_element(&selector).await.map_err(|error| {
-            BrowserError::new(format!(
-                "failed to find element for ref '{ref_id}' (selector: {selector}): {error}"
-            ))
-        })
+        for selector in &selectors {
+            if let Ok(element) = page.find_element(selector).await {
+                return Ok(element);
+            }
+        }
+
+        Err(BrowserError::new(format!(
+            "failed to find element for ref '{ref_id}' \
+             (role: {}, name: {:?}) — the page may have changed since \
+             the last snapshot. Run snapshot again to get fresh refs, \
+             then retry with the updated ref. \
+             Selectors tried: {}",
+            elem_ref.role,
+            elem_ref.name,
+            selectors.join(", "),
+        )))
     }
 }
 
@@ -1387,18 +1413,63 @@ fn extract_ax_value_string(
         .and_then(|v| v.as_str().map(|s| s.to_string()))
 }
 
-/// Build a CSS selector from an ElementRef's role and name.
-fn build_selector_for_ref(elem_ref: &ElementRef) -> String {
-    // Use ARIA role attribute as primary selector, with name for disambiguation
-    let role_selector = format!("[role='{}']", elem_ref.role);
+/// Map an accessibility tree role to a CSS selector for its native HTML element
+/// (`None` if unmapped). Native elements have implicit roles — `<button>` has no
+/// `role="button"` in the DOM. "tab"/"menuitem" map to explicit `[role]` selectors.
+fn role_to_native_tag(role: &str) -> Option<&'static str> {
+    match role {
+        "button" => Some("button"),
+        "link" => Some("a"),
+        "textbox" => Some("input"), // NOTE(review): <textarea> is also a textbox — confirm
+        "searchbox" => Some("input[type='search']"),
+        "checkbox" => Some("input[type='checkbox']"),
+        "radio" => Some("input[type='radio']"),
+        "slider" => Some("input[type='range']"),
+        "spinbutton" => Some("input[type='number']"),
+        "combobox" => Some("select"),
+        "switch" => Some("input[type='checkbox']"),
+        "tab" => Some("[role='tab']"),
+        "menuitem" => Some("[role='menuitem']"),
+        _ => None,
+    }
+}
+
+/// Build CSS selector candidates for an element ref, ordered most specific first.
+///
+/// The accessibility tree role doesn't always correspond to an explicit `[role]`
+/// attribute in the DOM — native HTML elements have implicit ARIA roles — so
+/// several strategies are emitted. Only `'` in the name is escaped.
+fn build_selectors_for_ref(elem_ref: &ElementRef) -> Vec<String> {
+    let mut selectors = Vec::with_capacity(4);
+    let escaped_name = elem_ref.name.as_ref().map(|n| n.replace('\'', "\\'"));
+
+    // Strategy 1: [role='X'][aria-label='Y'] — explicit ARIA attributes
+    if let Some(name) = &escaped_name {
+        selectors.push(format!("[role='{}'][aria-label='{name}']", elem_ref.role));
+    }
 
-    if let Some(name) = &elem_ref.name {
-        // Escape single quotes in the name for CSS selector safety
-        let escaped = name.replace('\'', "\\'");
-        format!("{role_selector}[aria-label='{escaped}']")
-    } else {
-        role_selector
+    // Strategy 2: native tag + aria-label (e.g., button[aria-label='Submit'])
+    if let Some(tag) = role_to_native_tag(&elem_ref.role) {
+        if let Some(name) = &escaped_name {
+            selectors.push(format!("{tag}[aria-label='{name}']"));
+        }
     }
+
+    // Strategy 3: native tag + title attribute (e.g., button[title='Submit']),
+    // then the bare tag. No text-content matching happens in these selectors.
+    if let Some(tag) = role_to_native_tag(&elem_ref.role) {
+        if let Some(name) = &escaped_name {
+            // Try title attribute as fallback
+            selectors.push(format!("{tag}[title='{name}']"));
+        }
+        // Tag-only fallback (least specific; can match an unrelated element of the same tag)
+        selectors.push(tag.to_string());
+    }
+
+    // Strategy 4: role-only fallback
+    selectors.push(format!("[role='{}']", elem_ref.role));
+
+    selectors
 }
 
 /// Extract target ID string from a Page.

From 4e5d92442db8748346026086dca22348da419eb9 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Sat, 7 Mar 2026 00:01:26 -0800
Subject: [PATCH 2/2] fix: replace chromiumoxide element interactions with JS
 injection

Element clicks/type/hover/focus now use Runtime.evaluate with DOM methods
instead of CDP node IDs that go stale between snapshot and interaction.

- Rewrite handle_act to use JS injection for all ActKind variants
- Add build_js_selector: tries CSS selectors then text-content fallback
- Add run_js_action: executes JS via page.evaluate, parses JSON result
- Update handle_screenshot to use JS + clip viewport instead of Element
- Remove old resolve_element_ref (replaced by JS injection)
- Merge role_to_native_tag entries from both branches
---
 src/tools/browser.rs | 309 +++++++++++++++++++++++++++++++------------
 1 file changed, 227 insertions(+), 82 deletions(-)

diff --git a/src/tools/browser.rs b/src/tools/browser.rs
index 6c310886d..deab3dd11 100644
--- a/src/tools/browser.rs
+++ b/src/tools/browser.rs
@@ -984,12 +984,31 @@ impl BrowserTool {
 
         match act_kind {
             ActKind::Click => {
-                let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                element
-                    .click()
-                    .await
-                    .map_err(|error| BrowserError::new(format!("click failed: {error}")))?;
-                Ok(BrowserOutput::success("Clicked element"))
+                let selector_js = self.build_js_selector(&state, element_ref)?;
+                let js = format!(
+                    r#"(() => {{
+                        {selector_js}
+                        el.scrollIntoView({{block: 'center'}});
+                        el.click();
+                        return JSON.stringify({{
+                            success: true,
+                            tag: el.tagName,
+                            text: el.textContent.substring(0, 100).trim()
+                        }});
+                    }})()"#
+                );
+                let result = self.run_js_action(page, &js).await?;
+                let tag = result
+                    .get("tag")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("element");
+                let text = result.get("text").and_then(|v| v.as_str()).unwrap_or("");
+                let display = if text.is_empty() {
+                    format!("Clicked {tag}")
+                } else {
+                    format!("Clicked {tag}: '{}'", truncate_for_display(text, 50))
+                };
+                Ok(BrowserOutput::success(display))
             }
             ActKind::Type => {
                 let Some(text) = text else {
@@ -998,15 +1017,20 @@ impl BrowserTool {
                          example: {\"action\": \"act\", \"act_kind\": \"type\", \"element_ref\": \"e5\", \"text\": \"hello\"}",
                     ));
                 };
-                let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                element
-                    .click()
-                    .await
-                    .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?;
-                element
-                    .type_str(&text)
-                    .await
-                    .map_err(|error| BrowserError::new(format!("type failed: {error}")))?;
+                let selector_js = self.build_js_selector(&state, element_ref)?;
+                let text_json = serde_json::to_string(&text).unwrap_or_default();
+                let js = format!(
+                    r#"(() => {{
+                        {selector_js}
+                        let txt = {text_json};
+                        el.focus();
+                        el.value = txt;
+                        el.dispatchEvent(new Event('input', {{bubbles: true}}));
+                        el.dispatchEvent(new Event('change', {{bubbles: true}}));
+                        return JSON.stringify({{success: true}});
+                    }})()"#
+                );
+                self.run_js_action(page, &js).await?;
                 Ok(BrowserOutput::success(format!(
                     "Typed '{}' into element",
                     truncate_for_display(&text, 50)
@@ -1019,43 +1043,167 @@ impl BrowserTool {
                          example: {\"action\": \"act\", \"act_kind\": \"press_key\", \"key\": \"Enter\"}",
                     ));
                 };
+                // press_key can work without an element ref (sends to page)
                 if element_ref.is_some() {
-                    let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                    element
-                        .press_key(&key)
-                        .await
-                        .map_err(|error| BrowserError::new(format!("press_key failed: {error}")))?;
+                    let selector_js = self.build_js_selector(&state, element_ref)?;
+                    let key_json = serde_json::to_string(&key).unwrap_or_default();
+                    let js = format!(
+                        r#"(() => {{
+                            {selector_js}
+                            el.focus();
+                            el.dispatchEvent(new KeyboardEvent('keydown', {{key: {key_json}, bubbles: true}}));
+                            el.dispatchEvent(new KeyboardEvent('keyup', {{key: {key_json}, bubbles: true}}));
+                            return JSON.stringify({{success: true}});
+                        }})()"#
+                    );
+                    self.run_js_action(page, &js).await?;
                 } else {
                     dispatch_key_press(page, &key).await?;
                 }
                 Ok(BrowserOutput::success(format!("Pressed key '{key}'")))
             }
             ActKind::Hover => {
-                let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                element
-                    .hover()
-                    .await
-                    .map_err(|error| BrowserError::new(format!("hover failed: {error}")))?;
+                let selector_js = self.build_js_selector(&state, element_ref)?;
+                let js = format!(
+                    r#"(() => {{
+                        {selector_js}
+                        el.scrollIntoView({{block: 'center'}});
+                        el.dispatchEvent(new MouseEvent('mouseover', {{bubbles: true}}));
+                        el.dispatchEvent(new MouseEvent('mouseenter', {{bubbles: true}}));
+                        return JSON.stringify({{success: true}});
+                    }})()"#
+                );
+                self.run_js_action(page, &js).await?;
                 Ok(BrowserOutput::success("Hovered over element"))
             }
             ActKind::ScrollIntoView => {
-                let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                element.scroll_into_view().await.map_err(|error| {
-                    BrowserError::new(format!("scroll_into_view failed: {error}"))
-                })?;
+                let selector_js = self.build_js_selector(&state, element_ref)?;
+                let js = format!(
+                    r#"(() => {{
+                        {selector_js}
+                        el.scrollIntoView({{block: 'center', behavior: 'smooth'}});
+                        return JSON.stringify({{success: true}});
+                    }})()"#
+                );
+                self.run_js_action(page, &js).await?;
                 Ok(BrowserOutput::success("Scrolled element into view"))
             }
             ActKind::Focus => {
-                let element = self.resolve_element_ref(&state, page, element_ref).await?;
-                element
-                    .focus()
-                    .await
-                    .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?;
+                let selector_js = self.build_js_selector(&state, element_ref)?;
+                let js = format!(
+                    r#"(() => {{
+                        {selector_js}
+                        el.focus();
+                        return JSON.stringify({{success: true}});
+                    }})()"#
+                );
+                self.run_js_action(page, &js).await?;
                 Ok(BrowserOutput::success("Focused element"))
             }
         }
     }
 
+    /// Build a JS snippet that resolves an element ref to a DOM element stored in `el`.
+    ///
+    /// Tries each selector from `build_selectors_for_ref` in order, then matches the ref's
+    /// name against the text, aria-label, or title of interactive elements. Splice it into an
+    /// IIFE: a miss `return`s a `success: false` JSON string. Errs on a missing/unknown ref.
+    fn build_js_selector(
+        &self,
+        state: &BrowserState,
+        element_ref: Option<String>,
+    ) -> Result<String, BrowserError> {
+        let Some(ref_id) = element_ref else {
+            return Err(BrowserError::new(
+                "element_ref is required for this action — run snapshot first, \
+                 then use a ref like \"e0\", \"e1\" from the results",
+            ));
+        };
+
+        let elem_ref = state.element_refs.get(&ref_id).ok_or_else(|| {
+            BrowserError::new(format!(
+                "unknown element ref '{ref_id}' — run snapshot first to get fresh element refs"
+            ))
+        })?;
+
+        // Build CSS selectors to try, plus a text-content fallback.
+        let selectors = build_selectors_for_ref(elem_ref);
+        let selectors_json = serde_json::to_string(&selectors).unwrap_or_default();
+        let name_json = serde_json::to_string(&elem_ref.name).unwrap_or("null".to_string());
+
+        // JS that tries each CSS selector (querySelector throws on an invalid one, hence
+        // the try/catch), then text-matches interactive elements. Sets `el` or returns an error.
+        Ok(format!(
+            r#"let el = null;
+            const selectors = {selectors_json};
+            for (const sel of selectors) {{
+                try {{ el = document.querySelector(sel); }} catch (_) {{}}
+                if (el) break;
+            }}
+            if (!el) {{
+                const name = {name_json};
+                if (name) {{
+                    const candidates = document.querySelectorAll(
+                        'a, button, [role="button"], input, select, textarea, [onclick], [tabindex]'
+                    );
+                    const lower = name.toLowerCase();
+                    for (const e of candidates) {{
+                        const text = (e.textContent || '').trim().toLowerCase();
+                        const label = (e.getAttribute('aria-label') || '').toLowerCase();
+                        const title = (e.getAttribute('title') || '').toLowerCase();
+                        if (text === lower || label === lower || title === lower) {{ el = e; break; }}
+                    }}
+                    if (!el) {{
+                        for (const e of candidates) {{
+                            const text = (e.textContent || '').trim().toLowerCase();
+                            if (text.includes(lower)) {{ el = e; break; }}
+                        }}
+                    }}
+                }}
+            }}
+            if (!el) return JSON.stringify({{
+                success: false,
+                error: 'Element not found for ref {ref_id}. Run snapshot again to get fresh refs.'
+            }});"#
+        ))
+    }
+
+    /// Evaluate `js` on `page`, parse its JSON result, and fail unless `success` is `true`.
+    async fn run_js_action(
+        &self,
+        page: &chromiumoxide::Page,
+        js: &str,
+    ) -> Result<serde_json::Value, BrowserError> {
+        let result = page
+            .evaluate(js)
+            .await
+            .map_err(|error| BrowserError::new(format!("JS execution failed: {error}")))?;
+
+        let value = result.value().cloned().unwrap_or(serde_json::Value::Null);
+
+        // Scripts return JSON.stringify(...) output — parse it; keep the raw value if parsing fails
+        let parsed = if let Some(json_str) = value.as_str() {
+            serde_json::from_str::<serde_json::Value>(json_str).unwrap_or(value)
+        } else {
+            value
+        };
+
+        // Missing or non-boolean `success` counts as failure; `error` (if a string) is the message
+        let success = parsed
+            .get("success")
+            .and_then(|v| v.as_bool())
+            .unwrap_or(false);
+        if !success {
+            let error = parsed
+                .get("error")
+                .and_then(|v| v.as_str())
+                .unwrap_or("action failed");
+            return Err(BrowserError::new(error.to_string()));
+        }
+
+        Ok(parsed)
+    }
+
     async fn handle_screenshot(
         &self,
         element_ref: Option<String>,
@@ -1065,9 +1213,48 @@ impl BrowserTool {
         let page = self.require_active_page(&state)?;
 
         let screenshot_data = if let Some(ref_id) = element_ref {
-            let element = self.resolve_element_ref(&state, page, Some(ref_id)).await?;
-            element
-                .screenshot(CaptureScreenshotFormat::Png)
+            // Use JS to find the element and get its bounding rect, then take
+            // a clipped page screenshot. This avoids stale CDP node IDs.
+            let selector_js = self.build_js_selector(&state, Some(ref_id))?;
+            let js = format!(
+                r#"(() => {{
+                    {selector_js}
+                    el.scrollIntoView({{block: 'center'}});
+                    const rect = el.getBoundingClientRect();
+                    return JSON.stringify({{
+                        success: true,
+                        x: rect.x + window.scrollX,
+                        y: rect.y + window.scrollY,
+                        width: rect.width,
+                        height: rect.height
+                    }});
+                }})()"#
+            );
+            let result = self.run_js_action(page, &js).await?;
+            let x = result.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0);
+            let y = result.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0);
+            let width = result
+                .get("width")
+                .and_then(|v| v.as_f64())
+                .unwrap_or(800.0);
+            let height = result
+                .get("height")
+                .and_then(|v| v.as_f64())
+                .unwrap_or(600.0);
+
+            use chromiumoxide_cdp::cdp::browser_protocol::page::Viewport;
+            let clip = Viewport {
+                x,
+                y,
+                width,
+                height,
+                scale: 1.0,
+            };
+            let params = ScreenshotParams::builder()
+                .format(CaptureScreenshotFormat::Png)
+                .clip(clip)
+                .build();
+            page.screenshot(params)
                 .await
                 .map_err(|error| BrowserError::new(format!("element screenshot failed: {error}")))?
         } else {
@@ -1332,50 +1519,6 @@ impl BrowserTool {
             .get(target)
             .ok_or_else(|| BrowserError::new("active tab no longer exists"))
     }
-
-    /// Resolve an element ref (like "e3") to a chromiumoxide Element on the page.
-    async fn resolve_element_ref(
-        &self,
-        state: &BrowserState,
-        page: &chromiumoxide::Page,
-        element_ref: Option<String>,
-    ) -> Result<chromiumoxide::Element, BrowserError> {
-        let Some(ref_id) = element_ref else {
-            return Err(BrowserError::new(
-                "element_ref is required for this action — run snapshot first, \
-                 then use a ref like \"e0\", \"e1\" from the results",
-            ));
-        };
-
-        let elem_ref = state.element_refs.get(&ref_id).ok_or_else(|| {
-            BrowserError::new(format!(
-                "unknown element ref '{ref_id}' — run snapshot first to get fresh element refs"
-            ))
-        })?;
-
-        // Try multiple selector strategies. The accessibility tree role doesn't
-        // always map to an explicit [role] attribute in the DOM — native HTML
-        // elements (button, input, a) have implicit ARIA roles without the
-        // attribute being present.
-        let selectors = build_selectors_for_ref(elem_ref);
-
-        for selector in &selectors {
-            if let Ok(element) = page.find_element(selector).await {
-                return Ok(element);
-            }
-        }
-
-        Err(BrowserError::new(format!(
-            "failed to find element for ref '{ref_id}' \
-             (role: {}, name: {:?}) — the page may have changed since \
-             the last snapshot. Run snapshot again to get fresh refs, \
-             then retry with the updated ref. \
-             Selectors tried: {}",
-            elem_ref.role,
-            elem_ref.name,
-            selectors.join(", "),
-        )))
-    }
 }
 
 /// Dispatch a key press event to the page via CDP Input domain.
@@ -1427,9 +1570,11 @@ fn role_to_native_tag(role: &str) -> Option<&'static str> {
         "slider" => Some("input[type='range']"),
         "spinbutton" => Some("input[type='number']"),
         "combobox" => Some("select"),
+        "option" => Some("option"),
+        "listbox" => Some("select"),
         "switch" => Some("input[type='checkbox']"),
-        "tab" => Some("[role='tab']"),
         "menuitem" => Some("[role='menuitem']"),
+        "tab" => Some("[role='tab']"),
         _ => None,
     }
 }