facebookresearch · marksibrahim · Nov 11, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/.github/workflows/static_.yml b/.github/workflows/static_.yml
diff --git a/.github/workflows/test_playground.yml b/.github/workflows/test_playground.yml
@@ -26,4 +26,4 @@ jobs:
           source .venv/bin/activate
       - name: Test with pytest
         run: |
-          uv run -m pytest tests/test_apps.py 
+          uv run -m pytest tests/
diff --git a/README.md b/README.md
@@ -10,6 +10,13 @@
 
 ## Install
 
+```
+uv pip install git+https://github.com/facebookresearch/openapps.git
+```
+
+
+### Manual Installation
+
 - Pre-requisite: install uv (a much faster pip): `pip install uv` (or from [source](https://docs.astral.sh/uv/getting-started/installation/))
 <!-- - [If using Conda] Create a fresh venv: `uv venv --python "$(which python)"` -->
 
@@ -81,6 +88,35 @@ To launch popups, set `apps/pop_ups=adversarial_descriptions`.
 
 You can see the specific variables for each defined in the individual apps. For example, `config/apps/maps/appearance/dark_theme.yaml`.
 
+## Launch Agent
+
+Launch an agent to perform a task:
+
+```
+uv run launch_agent.py
+```
+
+To see the agent solving the task live:
+```
+uv run launch_agent.py browsergym_env_args.headless=False
+```
+
+You can specify the agent of your choice with the `agent=` argument. For example, `agent=dummy` is a simple agent that clicks buttons at random, which is great for exploration!
+
+Learn more about launching with OpenAI, Claude, and vLLM models such as UI-TARS in our docs.
+
+## Launch Agent(s) Across Multiple Tasks
+> launch thousands of app variations to study agent behaviors in parallel
+
+To launch one (or multiple) agents to solve many tasks in parallel, each in an isolated deployment of OpenApps:
+
+```
+uv run launch_sweep.py
+```
+
+* Note that each deployment of OpenApps can have a different appearance and content.
+* Note that each task is launched in an isolated environment to ensure reproducible results.
+
 ## Testing
 
 Run all tests via:
@@ -105,7 +141,7 @@ Some icons are have been designed using resources from Flaticon.com
 # Development
 
 ```
-uv sync --dev
+uv sync --extra dev
 ```
 
 To build docs:
@@ -117,6 +153,7 @@ mkdocs serve
 
 this will launch docs available at https://facebookresearch.github.io/OpenApps/
 
+
 ## Legal
 
 Our work is licensed under CC-BY-NC, please refer to the [LICENSE](LICENSE) file in the top level directory.

diff --git a/config/agent/GPT-4o.yaml b/config/agent/GPT-4o.yaml
@@ -0,0 +1,15 @@
+defaults:  # Hydra defaults list: compose `default` first, then this file's keys (`_self_` is last, so they win)
+  - default
+  - _self_
+
+model_name: "GPT-4o"
+model_pretty_name: "GPT-4o"
+api_version: "2024-06-01"
+client_type: "openai"
+hostname: null
+# API key is interpolated from the OPENAI_API_KEY environment variable (OmegaConf `oc.env` resolver), so no secret is stored in this file
+api_key: ${oc.env:OPENAI_API_KEY}
+aws_access_key: null  # NOTE(review): the aws_* fields look unused for client_type "openai" — confirm they are only here to satisfy the schema
+aws_secret_key: null
+aws_session_token: null
+aws_region: us-west-2
diff --git a/config/agent/UI-TARS-1.5-7B.yaml b/config/agent/UI-TARS-1.5-7B.yaml
@@ -0,0 +1,68 @@
+# UI-TARS-1.5-7B agent (client_type: vllm); only the screenshot observation is enabled below.
+defaults:
+- default
+- _self_
+model_name: ByteDance-Seed/UI-TARS-1.5-7B
+model_pretty_name: UI-TARS-1.5-7B
+hostname: null
+host_name_updated_on: null
+client_type: vllm
+custom_actions:
+- go_back
+- go_forward
+- goto
+- mouse_click
+- mouse_dblclick
+- scroll
+- mouse_move
+- mouse_down
+- mouse_up
+- mouse_drag_and_drop
+- mouse_upload_file
+- keyboard_down
+- keyboard_up
+- keyboard_press
+- keyboard_type
+- keyboard_insert_text
+use_html: false
+use_axtree: false
+use_screenshot: true
+use_som: false
+extract_visible_tag: false
+extract_clickable_tag: false
+extract_coords: false
+filter_visible_elements_only: false
+use_focused_element: false
+# NOTE(review): the prompts below close tags with a backslash (<\action>, <\think>) rather than </action> — confirm the response parser expects this form.
+prompt_txt:
+  system_prompt: You are a GUI agent. You are given a task and your action history,
+    with screenshots. You need to perform the next action to complete the task.
+  output_format: '<action>
+
+    <\action>
+
+    <think>
+
+    <\think>
+
+    '
+  think_prompt: null
+  think_abstract_example: null
+  think_concrete_example: null
+  action_prompt: "## Action Space\n\nclick(point='<point>x1 y1</point>')\nleft_double(point='<point>x1\
+    \ y1</point>')\nright_single(point='<point>x1 y1</point>')\nhotkey(key='ctrl c')\
+    \ # Split keys with a space and use lowercase. Also, do not use more than 3 keys\
+    \ in one hotkey action.\ntype(content='xxx') # Use escape characters \\\\', \\\
+    \\\\\", and \\\\n in content part to ensure we can parse the content in normal\
+    \ python string format. If you want to submit your input, use \\\\n at the end\
+    \ of content. \nscroll(point='<point>x1 y1</point>', direction='down or up or\
+    \ right or left') # Show more information on the `direction` side.\nwait() #Sleep\
+    \ for 5s and take a screenshot to check for any changes.\n\n## Note\n- Use English\
+    \ in `Thought` part.\n- Write a small plan and finally summarize your next action\
+    \ (with its target element) in one sentence in `Thought` part.\n"
+  action_abstract_example: '<action>type(content='''')<\action>
+
+    '
+  action_concrete_example: '<action>click(point=''<point>200 300</point>'')<\action>
+
+    '
diff --git a/config/agent/axtree-only.yaml b/config/agent/axtree-only.yaml
@@ -0,0 +1,42 @@
+# AXTree-only observation settings: the screenshot, set-of-marks and coordinate flags below are all disabled.
+# Custom actions to be used by the agent
+# ax-tree options
+custom_actions: ["click", "fill", "dblclick", "clear", "select_option", "drag_and_drop", "hover", "go_back", "go_forward", "goto", "scroll"]
+
+# --- observation flags ---
+use_axtree: true # enable AXTREE observation
+use_screenshot: false # enable screenshot observation
+use_som: false  # Add a set of marks to the screenshot.
+extract_coords: false # Add the coordinates of the elements.
+
+# --- Prompt Flags ---
+prompt_txt:
+  system_prompt: null # takes default system prompt from dp lib
+  output_format: |
+    <action>
+    <\action>
+    <think>
+    <\think>
+  think_prompt: null # takes default system prompt from dp lib
+  think_abstract_example: null # takes default system prompt from dp lib
+  think_concrete_example: null # takes default system prompt from dp lib
+  action_prompt: null # if specified will directly define your action prompt and will ignore browsergym default of listing the API with descriptions
+  action_abstract_example: |
+    For links, checkboxes, buttons, etc. in the Axtree use the following action:
+
+    <action>
+    click("bid")
+    </action>
+
+    For textbox items use:
+
+    <action>
+    fill("bid", str)
+    </action>
+
+    Remember you can only use one action at a time. Check the history for more context about which action you already took.
+  action_concrete_example: |
+    if doing ax tree format, use:
+     <action>
+     fill("4", "my text")
+     </action>
diff --git a/config/agent/claude_4_sonnet.yaml b/config/agent/claude_4_sonnet.yaml
@@ -0,0 +1,15 @@
+defaults:  # Hydra defaults list: compose `default` first, then this file's keys (`_self_` is last, so they win)
+  - default
+  - _self_
+
+model_name: claude_4_sonnet
+api_version: null
+model_pretty_name: claude_4_sonnet
+client_type: aws
+# credentials are left null so no secret is committed to git; presumably they are read from env vars at runtime — TODO confirm where
+aws_access_key: null
+aws_secret_key: null
+aws_session_token: null
+aws_region: us-west-2
+max_tokens: 1000
+temperature: 0.5
diff --git a/config/agent/default.yaml b/config/agent/default.yaml
@@ -0,0 +1,113 @@
+# Base agent config: agent files that list `default` in their `defaults:` start from these values.
+_target_: open_apps.agent.AgentArgs
+
+model_name: null
+model_pretty_name: null
+
+# --- ChatModel Flags ---
+
+api_key: "AMI_RULZ"  # NOTE(review): looks like a placeholder value, not a real secret — confirm; never commit a real key here
+port: "8000"
+hostname: null
+temperature: null
+max_tokens: null
+host_name_updated_on: null
+
+# both ax tree options and vision options
+# see BrowserGym paper, first page of appendix for details on these options
+custom_actions:
+  [
+    "click",
+    "fill",
+    "dblclick",
+    "clear",
+    "select_option",
+    "drag_and_drop",
+    "hover",
+    "go_back",
+    "go_forward",
+    "goto",
+    "scroll",
+    "mouse_click",
+    "mouse_dblclick",
+    "mouse_move",
+    "mouse_down",
+    "mouse_up",
+    "mouse_drag_and_drop",
+    "mouse_upload_file",
+    "keyboard_down",
+    "keyboard_up",
+    "keyboard_press",
+    "keyboard_type",
+    "keyboard_insert_text",
+  ]
+
+# --- observation flags ---
+
+use_axtree: true # enable AXTREE observation
+use_screenshot: true # enable screenshot observation
+
+# ---- these are not really changed, but leaving it here for future reference ----
+# use_html: false # enable HTML observation
+use_som: false # Add a set of marks to the screenshot.
+# extract_visible_tag: false # Add a "visible" tag to visible elements in the AXTree.
+# extract_clickable_tag: false #  Add a "clickable" tag to clickable elements in the AXTree.
+extract_coords: false # Add the coordinates of the elements.
+# filter_visible_elements_only: false  # filter elements that are not visible
+# use_focused_element: false  # use focused element
+
+# --- history flags ---
+use_history: true # enable history
+use_action_history: true # enable action history, the use_history flag must be true
+use_think_history: true # enable think history,  the use_history flag must be true
+# --- Agent Flags ---
+use_thinking: true # enable thoughts
+use_concrete_example: true # if true, we will include a concrete example in the prompt
+use_abstract_example: true #if true, we will include an abstract example in the prompt
+# --- Prompt Flags ---
+# NOTE(review): output_format closes tags as <\action>/<\think>, while the examples below use </action> — confirm which form the parser expects.
+prompt_txt:
+  system_prompt: null # takes default system prompt from dp lib
+  output_format: |
+    <action>
+    <\action>
+    <think>
+    <\think>
+  think_prompt: null # takes default system prompt from dp lib
+  think_abstract_example: null # takes default system prompt from dp lib
+  think_concrete_example: null # takes default system prompt from dp lib
+  action_prompt: null # if specified will directly define your action prompt and will ignore browsergym default of listing the API with descriptions
+  action_abstract_example: |
+    For links, checkboxes, buttons, etc. in the Axtree use the following action:
+
+    <action>
+    click("bid")
+    </action>
+
+    For textbox items use:
+
+    <action>
+    fill("bid", str)
+    </action>
+
+    For links, checkboxes, buttons, etc. in screenshot format, use the following action:
+    <action>
+    mouse_click(x: float, y: float, button: Literal['left', 'middle', 'right'] = 'left')
+    </action>
+
+    To fill in a text box in screenshot format, use:
+    <action>
+    keyboard_type(text) 
+    </action>
+
+
+    Remember you can only use one action at a time. Check the history for more context about which action you already took.
+  action_concrete_example: |
+    if doing ax tree format, use:
+     <action>
+     fill("4", "my text")
+     </action>
+    if doing screenshot format, to click at coordinates (612, 455) on the screenshot, use:
+     <action>
+     mouse_click(x=612, y=455)
+     </action>
diff --git a/config/agent/dummy.yaml b/config/agent/dummy.yaml
@@ -0,0 +1,11 @@
+# Dummy agent that takes random click actions; intended for testing and debugging.
+_target_: open_apps.agent.dummy_agent.DummyAgentArgs
+model_name: dummy # don't change!
+model_pretty_name: dummy # for wandb-logging
+# --- observation flags ---
+# Changing these has no effect on the dummy agent, but they still need to be passed to browsergym.
+use_html: True
+use_axtree: True
+use_screenshot: True
+hostname: "no host name for dumb dumbs" # dummy agent does not use hostname
+client_type: dummy