diff --git a/.gitignore b/.gitignore index ba2ded4..8174652 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ frontend/.vitepress/dist/ frontend/.vitepress/cache/ frontend/.vitepress/.temp/ .claude/* +.worktrees/ # npm auth .npmrc diff --git a/frontend/get-started/configuration.md b/frontend/get-started/configuration.md index d8dfa7c..b8f9823 100644 --- a/frontend/get-started/configuration.md +++ b/frontend/get-started/configuration.md @@ -108,6 +108,39 @@ For human-auth relay: - shared relay hub launched by `openpocket human-auth-relay start` does not use separate per-agent relay state or per-agent hub API keys - in managed mode, agent-local request state still stays under the agent's own `state/` +### Aliyun UI Agent mobile backend + +OpenPocket now includes a first-class `aliyun-ui-agent/mobile` model profile. + +Key points: + +- the profile sets `models.<profile-key>.backend` to `aliyun_ui_agent_mobile` +- runtime routes this backend through the dedicated Aliyun GUI-agent client instead of the default OpenAI-compatible tool-calling path +- image input is delivered as a short-lived screenshot URL from the local relay stack, so the selected agent must have `humanAuth.useLocalRelay=true` +- if Aliyun must fetch screenshots from the public internet, use either: + - the shared relay hub from `openpocket human-auth-relay start` + - or per-agent ngrok via `humanAuth.tunnel.provider=ngrok` + +Minimal example: + +```json +{ + "defaultModel": "aliyun-ui-agent/mobile", + "models": { + "aliyun-ui-agent/mobile": { + "baseUrl": "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + "model": "pre-gui_owl_7b", + "apiKey": "", + "apiKeyEnv": "DASHSCOPE_API_KEY", + "maxTokens": 4096, + "reasoningEffort": null, + "temperature": null, + "backend": "aliyun_ui_agent_mobile" + } + } +} +``` + ## Backward Compatibility Keys Loader maps old keys automatically, including: diff --git a/frontend/ops/troubleshooting.md b/frontend/ops/troubleshooting.md index e1f9ca4..830e1e5
100644 --- a/frontend/ops/troubleshooting.md +++ b/frontend/ops/troubleshooting.md @@ -53,6 +53,21 @@ - verify model supports requested endpoint and multimodal input - switch model profile and retry +## Aliyun UI Agent cannot fetch screenshot URL + +- verify the selected model profile uses `backend: "aliyun_ui_agent_mobile"` +- ensure `humanAuth.useLocalRelay=true` +- if using managed agents, start the shared relay hub with `openpocket human-auth-relay start` +- if Aliyun must fetch over the public internet, verify ngrok/shared public relay URL is reachable from outside your LAN +- inspect logs for `[OpenPocket][human-auth]`, `[OpenPocket][relay-hub]`, and local relay startup failures + +## Aliyun UI Agent keeps returning `wait` or unsupported operations + +- inspect the selected agent session file for the raw `Operation` string returned by Aliyun +- confirm the task is in `device_type=mobile` scope and the current screen is an Android phone UI, not a secure/blank surface +- if the screen is `FLAG_SECURE` or blacked out, use the human-auth takeover path instead of retrying model calls +- retry after enabling a public screenshot URL path (shared relay hub or ngrok), because stale/unreachable image URLs often degrade action quality + ## Channel bot does not respond - validate token for the selected agent (`channels..*` or env) diff --git a/frontend/reference/cli-and-gateway.md b/frontend/reference/cli-and-gateway.md index 601c893..12afaf9 100644 --- a/frontend/reference/cli-and-gateway.md +++ b/frontend/reference/cli-and-gateway.md @@ -219,7 +219,9 @@ Examples: openpocket model show openpocket model list openpocket model set --name gpt-5.4 +openpocket model set --name aliyun-ui-agent/mobile openpocket --agent review-bot model set --provider google --model gemini-3.1-pro-preview +openpocket --agent review-bot model set --provider aliyun-ui-agent --model pre-gui_owl_7b ``` Notes: @@ -227,6 +229,8 @@ Notes: - `model set --name ` switches to an existing profile key - 
`model set --provider --model ` creates/updates a profile from provider presets and switches the selected agent's default model - model config is per agent after creation +- `Aliyun UI Agent (Mobile)` is a dedicated backend, not a normal OpenAI-compatible chat profile even though it uses DashScope +- when using `aliyun-ui-agent/mobile`, the selected agent must expose screenshots through the local relay stack; for public internet access, use the shared relay hub or per-agent ngrok ## Channel Commands diff --git a/frontend/reference/config-defaults.md b/frontend/reference/config-defaults.md index 987b4ba..964d4f3 100644 --- a/frontend/reference/config-defaults.md +++ b/frontend/reference/config-defaults.md @@ -295,11 +295,29 @@ Managed agents created later with `openpocket create agent ` start from the "maxTokens": 4096, "reasoningEffort": null, "temperature": null + }, + "aliyun-ui-agent/mobile": { + "baseUrl": "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + "model": "pre-gui_owl_7b", + "apiKey": "", + "apiKeyEnv": "DASHSCOPE_API_KEY", + "maxTokens": 4096, + "reasoningEffort": null, + "temperature": null, + "backend": "aliyun_ui_agent_mobile" } } } ``` +### Aliyun UI Agent mobile note + +- built-in profile key: `aliyun-ui-agent/mobile` +- backend discriminator: `models..backend = "aliyun_ui_agent_mobile"` +- screenshot delivery depends on the local relay stack, so public Aliyun fetches require either: + - the shared relay hub (`openpocket human-auth-relay start`) + - or per-agent ngrok (`humanAuth.tunnel.provider=ngrok`) + ## Managed Agent Overrides When you run `openpocket create agent `, OpenPocket clones a new agent config and rewrites these defaults: @@ -356,6 +374,7 @@ Notes: - `humanAuth.pollIntervalMs` is clamped to at least `500`. - `humanAuth.tunnel.provider` accepts only `none|ngrok`. - `humanAuth.tunnel.ngrok.startupTimeoutSec` is clamped to at least `3`. 
+- `models..backend` accepts only `default|aliyun_ui_agent_mobile`; other values fall back to `default`. - `allowedChatIds` is coerced to numeric array with non-finite values removed. - model `baseUrl` is normalized for known providers: - Google Generative Language bare host -> `/v1beta` diff --git a/src/agent/agent-runtime.ts b/src/agent/agent-runtime.ts index 006c2b1..3a95201 100644 --- a/src/agent/agent-runtime.ts +++ b/src/agent/agent-runtime.ts @@ -63,7 +63,10 @@ import { import { runRuntimeAttempt } from "./runtime/attempt.js"; import { runRuntimeTask } from "./runtime/run.js"; import type { RunTaskRequest } from "./runtime/types.js"; +import { AliyunUiAgentClient } from "./aliyun-ui-agent-client.js"; +import { AliyunGuiPlusClient } from "./aliyun-gui-plus-client.js"; import { createPiSessionBridge } from "./pi-session-bridge.js"; +import { LocalHumanAuthStack } from "../human-auth/local-stack.js"; import { scaleCoordinates, drawDebugMarker } from "../utils/image-scale.js"; import { PhoneUseCapabilityProbe, @@ -3850,6 +3853,9 @@ export class AgentRuntime { isPermissionDialogApp: (currentApp) => this.isPermissionDialogApp(currentApp), autoApprovePermissionDialog: (currentApp) => this.autoApprovePermissionDialog(currentApp), saveModelInputArtifacts: (params) => this.saveModelInputArtifacts(params), + aliyunUiAgentClientFactory: (options) => new AliyunUiAgentClient(options), + aliyunGuiPlusClientFactory: (options) => new AliyunGuiPlusClient(options), + localHumanAuthStackFactory: (config, log) => new LocalHumanAuthStack(config, log), }, attemptRequest, ), diff --git a/src/agent/aliyun-gui-plus-client.ts b/src/agent/aliyun-gui-plus-client.ts new file mode 100644 index 0000000..2fbe1c0 --- /dev/null +++ b/src/agent/aliyun-gui-plus-client.ts @@ -0,0 +1,372 @@ +import type { AgentAction, ModelStepOutput } from "../types.js"; +import { formatDetailedError } from "../utils/error-details.js"; +import { normalizeAction } from "./actions.js"; + +/** + * GUI-Plus 
/**
 * GUI-Plus smart_resize coordinate conversion.
 *
 * The GUI-Plus model internally scales images using the smart_resize algorithm
 * (factor=28, min_pixels=3136, max_pixels=1003520) and outputs coordinates
 * in that scaled image space. To convert back to device coordinates:
 *   device_x = model_x / scaled_width * original_width
 *
 * See: https://help.aliyun.com/zh/model-studio/gui-automation
 */

const SMART_RESIZE_FACTOR = 28;
const SMART_RESIZE_MIN_PIXELS = 56 * 56; // 3,136
const SMART_RESIZE_MAX_PIXELS = 14 * 14 * 4 * 1280; // 1,003,520
const SMART_RESIZE_MAX_LONG_SIDE = 8192;

/** Round `num` to the nearest multiple of `factor`. */
function roundByFactor(num: number, factor: number): number {
  return Math.round(num / factor) * factor;
}

/** Round `num` down to a multiple of `factor`. */
function floorByFactor(num: number, factor: number): number {
  return Math.floor(num / factor) * factor;
}

/** Round `num` up to a multiple of `factor`. */
function ceilByFactor(num: number, factor: number): number {
  return Math.ceil(num / factor) * factor;
}

/**
 * Calculate the scaled image dimensions that the GUI-Plus model uses internally.
 * Replicates the Qwen VL smart_resize algorithm.
 *
 * @param height    Original image height in pixels.
 * @param width     Original image width in pixels.
 * @param factor    Both output sides are multiples of this value.
 * @param minPixels Lower bound on `hBar * wBar`; smaller images are scaled up.
 * @param maxPixels Upper bound on `hBar * wBar`; larger images are scaled down.
 * @returns The scaled height (`hBar`) and width (`wBar`). Returns `{ hBar: 0, wBar: 0 }`
 *          when either input dimension is not a positive number, instead of NaN sizes.
 */
export function smartResize(
  height: number,
  width: number,
  factor: number = SMART_RESIZE_FACTOR,
  minPixels: number = SMART_RESIZE_MIN_PIXELS,
  maxPixels: number = SMART_RESIZE_MAX_PIXELS,
): { hBar: number; wBar: number } {
  // Non-positive (or NaN) dimensions would divide by zero below and yield NaN sizes.
  if (!(height > 0) || !(width > 0)) {
    return { hBar: 0, wBar: 0 };
  }

  let h = height;
  let w = width;

  // Clamp the long side first; every later step must work on the clamped size.
  const longSide = Math.max(h, w);
  if (longSide > SMART_RESIZE_MAX_LONG_SIDE) {
    const shrink = longSide / SMART_RESIZE_MAX_LONG_SIDE;
    h = Math.max(1, Math.floor(h / shrink));
    w = Math.max(1, Math.floor(w / shrink));
  }

  let hBar = roundByFactor(h, factor);
  let wBar = roundByFactor(w, factor);

  if (hBar * wBar > maxPixels) {
    // Too many pixels: shrink both sides by the same ratio, rounding down.
    const beta = Math.sqrt((h * w) / maxPixels);
    hBar = floorByFactor(h / beta, factor);
    wBar = floorByFactor(w / beta, factor);
  } else if (hBar * wBar < minPixels) {
    // Too few pixels: grow both sides by the same ratio, rounding up.
    const beta = Math.sqrt(minPixels / (h * w));
    hBar = ceilByFactor(h * beta, factor);
    wBar = ceilByFactor(w * beta, factor);
  }

  return { hBar, wBar };
}

/**
 * Convert one model-space coordinate back to a device pixel coordinate.
 *
 * @param modelCoord Coordinate reported by the model in the scaled image space.
 * @param scaledSize Scaled image size along the same axis (from `smartResize`).
 * @param deviceSize Device size along the same axis, in pixels.
 * @returns The coordinate clamped to `[0, deviceSize - 1]`; when `scaledSize` is not
 *          positive the model coordinate is passed through (rounded) unchanged.
 */
function rescaleGuiPlusCoord(modelCoord: number, scaledSize: number, deviceSize: number): number {
  if (scaledSize <= 0) return Math.round(modelCoord);
  return Math.max(0, Math.min(Math.round((modelCoord / scaledSize) * deviceSize), deviceSize - 1));
}
/**
 * Translate a GUI-Plus scroll amount keyword into a fraction of the viewport.
 * Unknown keywords are treated as "medium".
 */
function parseScrollAmount(amount: string): number {
  switch (amount.trim().toLowerCase()) {
    case "small": return 0.2;
    case "large": return 0.6;
    default: return 0.4; // medium
  }
}

/**
 * Map one parsed GUI-Plus action onto an OpenPocket agent action.
 *
 * @param params.parsed         Action name plus parameters as returned by the model.
 * @param params.thought        Model thought; used as the action reason when non-empty.
 * @param params.viewportWidth  Device width in pixels.
 * @param params.viewportHeight Device height in pixels.
 * @param params.scaledWidth    Width of the model's scaled image space.
 * @param params.scaledHeight   Height of the model's scaled image space.
 * @returns The normalized agent action. Unsupported or malformed actions become a
 *          short `wait` whose reason names the problem.
 * @throws Error when the model reports `FAIL`.
 */
export function mapGuiPlusActionToAgentAction(params: {
  parsed: GuiPlusAction;
  thought: string;
  viewportWidth: number;
  viewportHeight: number;
  scaledWidth: number;
  scaledHeight: number;
}): AgentAction {
  const { parsed, thought, viewportWidth, viewportHeight, scaledWidth, scaledHeight } = params;
  const reason = thought || undefined;

  switch (parsed.action) {
    case "CLICK": {
      const modelX = Number(parsed.parameters.x);
      const modelY = Number(parsed.parameters.y);
      // Missing or non-numeric coordinates would otherwise turn into a NaN tap.
      if (!Number.isFinite(modelX) || !Number.isFinite(modelY)) {
        return normalizeAction({ type: "wait", durationMs: 1000, reason: "invalid GUI-Plus click coordinates" });
      }
      return normalizeAction({
        type: "tap",
        x: rescaleGuiPlusCoord(modelX, scaledWidth, viewportWidth),
        y: rescaleGuiPlusCoord(modelY, scaledHeight, viewportHeight),
        reason,
      });
    }
    case "TYPE": {
      return normalizeAction({
        type: "type",
        text: String(parsed.parameters.text ?? ""),
        reason,
      });
    }
    case "SWIPE":
    case "SCROLL": {
      const dir = String(parsed.parameters.direction ?? "down").toLowerCase();
      const fraction = parseScrollAmount(String(parsed.parameters.amount ?? "medium"));
      const xCenter = Math.round(viewportWidth * 0.5);
      const yCenter = Math.round(viewportHeight * 0.5);
      const halfY = Math.round(Math.round(viewportHeight * fraction) / 2);
      const halfX = Math.round(Math.round(viewportWidth * fraction) / 2);
      // Scrolling content "down" means dragging the finger upward, and so on.
      const gestures: Record<string, { x1: number; y1: number; x2: number; y2: number }> = {
        down: { x1: xCenter, y1: yCenter + halfY, x2: xCenter, y2: yCenter - halfY },
        up: { x1: xCenter, y1: yCenter - halfY, x2: xCenter, y2: yCenter + halfY },
        left: { x1: xCenter + halfX, y1: yCenter, x2: xCenter - halfX, y2: yCenter },
        right: { x1: xCenter - halfX, y1: yCenter, x2: xCenter + halfX, y2: yCenter },
      };
      const gesture = Object.hasOwn(gestures, dir) ? gestures[dir] : undefined;
      if (!gesture) {
        return normalizeAction({ type: "wait", durationMs: 500, reason: `unsupported scroll direction: ${dir}` });
      }
      return normalizeAction({ type: "swipe", ...gesture, reason });
    }
    case "KEY_PRESS": {
      const raw = String(parsed.parameters.key ?? "").trim();
      // An empty key would otherwise become the invalid keycode "KEYCODE_".
      if (!raw) {
        return normalizeAction({ type: "wait", durationMs: 1000, reason: "empty GUI-Plus key" });
      }
      const upper = raw.toUpperCase();
      const keycode = upper.startsWith("KEYCODE_")
        ? upper
        : `KEYCODE_${raw.replace(/[\s-]+/g, "_").toUpperCase()}`;
      return normalizeAction({ type: "keyevent", keycode, reason });
    }
    case "FINISH": {
      return normalizeAction({ type: "finish", message: parsed.parameters.message || thought || "Task finished." });
    }
    case "FAIL": {
      throw new Error(`GUI-Plus reported failure: ${parsed.parameters.reason || thought || "unknown"}`);
    }
    default: {
      // Reachable at runtime: the model may emit an action outside the declared union.
      return normalizeAction({ type: "wait", durationMs: 1000, reason: `unsupported GUI-Plus action: ${(parsed as { action?: string }).action || "empty"}` });
    }
  }
}
[CRITICAL] JSON Schema & 绝对规则 +你的输出**必须**是一个严格符合以下规则的JSON对象。**任何偏差都将导致失败**。 + +- **[R1] 严格的JSON**: 你的回复**必须**是且**只能是**一个JSON对象。禁止在JSON代码块前后添加任何文本、注释或解释。 +- **[R2] 严格的Parameters结构**:\`thought\`对象的结构: "在这里用一句话简要描述你的思考过程。" +- **[R3] 精确的Action值**: \`action\`字段的值**必须**是\`## 3. 工具集\`中定义的一个大写字符串(例如 \`"CLICK"\`, \`"TYPE"\`),不允许有任何前导/后置空格或大小写变化。 +- **[R4] 严格的Parameters结构**: \`parameters\`对象的结构**必须**与所选Action在\`## 3. 工具集\`中定义的模板**完全一致**。键名、值类型都必须精确匹配。 + +## 3. 工具集 (Available Actions) + +### CLICK +- **功能**: 单击屏幕。 +- **Parameters模板**: {"x": , "y": , "description": ""} + +### TYPE +- **功能**: 输入文本。 +- **Parameters模板**: {"text": "", "needs_enter": } + +### SCROLL +- **功能**: 滚动屏幕。 +- **Parameters模板**: {"direction": "<'up', 'down', 'left', or 'right'>", "amount": "<'small', 'medium', or 'large'>"} +- 在手机主屏幕上,使用left/right翻页查找应用;在应用内,使用up/down滚动内容。 + +### KEY_PRESS +- **功能**: 按下功能键。 +- **Parameters模板**: {"key": ""} + +### FINISH +- **功能**: 任务成功完成。 +- **Parameters模板**: {"message": ""} + +### FAIL +- **功能**: 任务无法完成。 +- **Parameters模板**: {"reason": ""} + +## 4. 思维与决策框架 +在生成每一步操作前,请严格遵循以下思考-验证流程: +1. 目标分析: 用户的最终目标是什么? +2. 屏幕观察: 仔细分析截图。你的决策必须基于截图中存在的视觉证据。 +3. 行动决策: 基于目标和可见的元素,选择最合适的工具。 +4. 
最终验证: 我的回复是纯粹的JSON吗?action的值是否正确无误?parameters的结构是否与模板100%一致?`; + + const messages = [ + { + role: "system", + content: systemPrompt, + }, + ...this.conversationHistory, + { + role: "user", + content: userContent, + }, + ]; + + const payload = { + model: this.modelName, + messages, + max_tokens: 2048, + vl_high_resolution_images: true, + }; + + let response: Response; + const endpoint = `${this.baseUrl}/chat/completions`; + try { + response = await this.fetchImpl(endpoint, { + method: "POST", + headers: { + Authorization: `Bearer ${this.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(payload), + }); + } catch (error) { + throw new Error(`GUI-Plus request failed: ${formatDetailedError(error)}`); + } + + const rawBody = await response.text(); + if (!response.ok) { + throw new Error(`GUI-Plus request failed ${response.status}: ${rawBody.slice(0, 500)}`); + } + + let responseJson: { + choices?: Array<{ + message?: { content?: string; role?: string }; + }>; + }; + try { + responseJson = JSON.parse(rawBody); + } catch { + throw new Error(`GUI-Plus returned invalid JSON: ${rawBody.slice(0, 500)}`); + } + + const assistantContent = responseJson.choices?.[0]?.message?.content ?? 
""; + if (!assistantContent.trim()) { + throw new Error(`GUI-Plus returned empty response: ${rawBody.slice(0, 500)}`); + } + + // Maintain conversation history for multi-turn + this.conversationHistory.push({ role: "user", content: userContent }); + this.conversationHistory.push({ role: "assistant", content: assistantContent }); + // Keep history bounded + if (this.conversationHistory.length > 20) { + this.conversationHistory = this.conversationHistory.slice(-16); + } + + // Parse the structured JSON response + let parsed: { thought?: string; action?: string; parameters?: Record }; + try { + // Handle potential markdown code block wrapping + let jsonStr = assistantContent.trim(); + const codeBlockMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/); + if (codeBlockMatch) { + jsonStr = codeBlockMatch[1].trim(); + } + parsed = JSON.parse(jsonStr); + } catch { + throw new Error(`GUI-Plus returned non-JSON action: ${assistantContent.slice(0, 300)}`); + } + + const thought = String(parsed.thought ?? "").trim(); + const actionType = String(parsed.action ?? "").trim().toUpperCase(); + const parameters = parsed.parameters ?? 
/**
 * Parse a comma-separated list of numbers, e.g. the arguments of `click(120, 480)`.
 *
 * Segments that are empty or not finite numbers are dropped; the rest are rounded
 * to integers. Empty segments must be removed before conversion because
 * `Number("")` is `0`, which would otherwise invent a coordinate.
 */
function parseNumberList(input: string): number[] {
  return input
    .split(",")
    .map((part) => part.trim())
    .filter((part) => part.length > 0)
    .map((part) => Number(part))
    .filter((value) => Number.isFinite(value))
    .map((value) => Math.round(value));
}

/**
 * Trim an operation argument and strip one pair of matching surrounding quotes
 * (single or double) when present.
 */
function unwrapAliyunTextArgument(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length >= 2) {
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === "\"" && last === "\"") || (first === "'" && last === "'")) {
      return trimmed.slice(1, -1);
    }
  }
  return trimmed;
}

/**
 * Normalize a key name from the model into an Android keycode string.
 *
 * - empty input falls back to `KEYCODE_ENTER`
 * - purely numeric input is returned unchanged
 * - known aliases (e.g. `RETURN`, `RECENTS`, `BACKSPACE`) map to their keycode name
 * - anything else is upper-cased, with spaces/dashes turned into underscores and
 *   `KEYCODE_` prefixed
 */
function normalizeAliyunKeycode(input: string): string {
  const trimmed = input.trim();
  if (!trimmed) {
    return "KEYCODE_ENTER";
  }
  if (/^\d+$/.test(trimmed)) {
    return trimmed;
  }
  const normalized = trimmed.replace(/[\s-]+/g, "_").toUpperCase();
  const key = normalized.startsWith("KEYCODE_") ? normalized.slice("KEYCODE_".length) : normalized;
  const aliases: Record<string, string> = {
    HOME: "KEYCODE_HOME",
    BACK: "KEYCODE_BACK",
    ENTER: "KEYCODE_ENTER",
    RETURN: "KEYCODE_ENTER",
    MENU: "KEYCODE_MENU",
    POWER: "KEYCODE_POWER",
    VOLUME_UP: "KEYCODE_VOLUME_UP",
    VOLUME_DOWN: "KEYCODE_VOLUME_DOWN",
    APP_SWITCH: "KEYCODE_APP_SWITCH",
    RECENT: "KEYCODE_APP_SWITCH",
    RECENTS: "KEYCODE_APP_SWITCH",
    SEARCH: "KEYCODE_SEARCH",
    CAMERA: "KEYCODE_CAMERA",
    DELETE: "KEYCODE_DEL",
    DEL: "KEYCODE_DEL",
    BACKSPACE: "KEYCODE_DEL",
  };
  return aliases[key] ?? `KEYCODE_${key}`;
}
/**
 * Convert a raw Aliyun UI Agent `Operation` string into an OpenPocket action.
 *
 * Recognized forms, checked in this order: `click(x, y)`, `swipe(x1, y1, x2, y2)`,
 * `type(text)`, `key_press(key)`, `scroll(x1, y1, x2, y2)` or `scroll(direction)`,
 * and `done` / `done()`. Coordinates are rescaled from the model's coordinate space
 * to the viewport; when a viewport dimension is not supplied the model coordinate
 * space size is used in its place. Anything unrecognized becomes a one-second `wait`
 * whose reason echoes the operation.
 *
 * @param params.operation      Raw operation string from the model.
 * @param params.thought        Model thought; fallback for the action reason.
 * @param params.explanation    Model explanation; preferred action reason.
 * @param params.viewportWidth  Device width in pixels (optional).
 * @param params.viewportHeight Device height in pixels (optional).
 */
export function mapAliyunOperationToAction(params: {
  operation: string;
  thought?: string;
  explanation?: string;
  viewportWidth?: number;
  viewportHeight?: number;
}): AgentAction {
  const { thought, explanation, viewportWidth, viewportHeight } = params;
  const operation = String(params.operation || "").trim();
  const reason = String(explanation || thought || "").trim() || undefined;
  const deviceWidth = viewportWidth ?? ALIYUN_MODEL_COORD_SPACE;
  const deviceHeight = viewportHeight ?? ALIYUN_MODEL_COORD_SPACE;

  // Shared by `swipe(...)` and the four-number form of `scroll(...)`.
  const swipeFrom = (coords: number[]): AgentAction =>
    normalizeAction({
      type: "swipe",
      x1: rescaleAliyunCoord(coords[0], deviceWidth),
      y1: rescaleAliyunCoord(coords[1], deviceHeight),
      x2: rescaleAliyunCoord(coords[2], deviceWidth),
      y2: rescaleAliyunCoord(coords[3], deviceHeight),
      reason,
    });

  const clickArgs = /^click\s*\(([^)]+)\)$/i.exec(operation);
  if (clickArgs) {
    const coords = parseNumberList(clickArgs[1]);
    if (coords.length >= 2) {
      return normalizeAction({
        type: "tap",
        x: rescaleAliyunCoord(coords[0], deviceWidth),
        y: rescaleAliyunCoord(coords[1], deviceHeight),
        reason,
      });
    }
  }

  const swipeArgs = /^swipe\s*\(([^)]+)\)$/i.exec(operation);
  if (swipeArgs) {
    const coords = parseNumberList(swipeArgs[1]);
    if (coords.length >= 4) {
      return swipeFrom(coords);
    }
  }

  const typeArgs = /^type\s*\(([\s\S]*)\)$/i.exec(operation);
  if (typeArgs) {
    const text = unwrapAliyunTextArgument(typeArgs[1]);
    return normalizeAction({ type: "type", text, reason });
  }

  const keyArgs = /^key[_\s-]*press\s*\(([\s\S]*)\)$/i.exec(operation);
  if (keyArgs) {
    const keycode = normalizeAliyunKeycode(unwrapAliyunTextArgument(keyArgs[1]));
    return normalizeAction({ type: "keyevent", keycode, reason });
  }

  const scrollArgs = /^scroll\s*\(([\s\S]*)\)$/i.exec(operation);
  if (scrollArgs) {
    const argument = unwrapAliyunTextArgument(scrollArgs[1]);
    const coords = parseNumberList(argument);
    if (coords.length >= 4) {
      return swipeFrom(coords);
    }
    // Not four numbers: try to read the argument as a direction keyword.
    const directional = buildDirectionalScrollAction({
      direction: argument,
      viewportWidth,
      viewportHeight,
      reason,
    });
    if (directional) {
      return directional;
    }
  }

  if (/^done(?:\s*\(\s*\))?$/i.test(operation)) {
    return normalizeAction({
      type: "finish",
      message: String(explanation || thought || "Task finished."),
    });
  }

  return normalizeAction({
    type: "wait",
    durationMs: 1000,
    reason: `unsupported Aliyun UI Agent operation: ${operation || "empty"}`,
  });
}
""); + } + + async nextStep(params: AliyunUiAgentNextStepParams): Promise { + const payload = buildAliyunUiAgentPayload({ + screenshotUrl: params.screenshotUrl, + task: params.task, + sessionId: this.sessionId, + modelName: this.modelName, + thoughtLanguage: params.thoughtLanguage ?? this.thoughtLanguage, + addInfo: String(params.addInfo ?? ""), + }); + + let response: Response; + try { + response = await this.fetchImpl(this.baseUrl, { + method: "POST", + headers: { + Authorization: `Bearer ${this.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(payload), + }); + } catch (error) { + throw new Error(`Aliyun UI Agent request failed: ${formatDetailedError(error)}`); + } + + const rawBody = await response.text(); + if (!response.ok) { + throw new Error(`Aliyun UI Agent request failed ${response.status}: ${rawBody.slice(0, 500)}`); + } + + let parsed: AliyunUiAgentResponse; + try { + parsed = JSON.parse(rawBody) as AliyunUiAgentResponse; + } catch (error) { + throw new Error(`Aliyun UI Agent returned invalid JSON: ${formatDetailedError(error)}`); + } + + const responseSessionId = typeof parsed.session_id === "string" ? parsed.session_id.trim() : ""; + if (responseSessionId) { + this.sessionId = responseSessionId; + } + + const firstMessage = parsed.output?.[0]; + const data = firstMessage?.content?.[0]?.data; + if (!firstMessage || !data) { + throw new Error(`Aliyun UI Agent response missing action data: ${rawBody.slice(0, 500)}`); + } + + const thought = String(data.Thought ?? "").trim(); + const explanation = String(data.Explanation ?? "").trim(); + const operation = String(data.Operation ?? "").trim(); + const responseCode = typeof firstMessage.code === "string" ? 
firstMessage.code.trim() : ""; + if (responseCode && responseCode !== "200") { + throw new Error( + `Aliyun UI Agent returned error code ${responseCode}: ${explanation || thought || operation || rawBody.slice(0, 500)}`, + ); + } + if (/^fail(?:\s*\(|$)/i.test(operation)) { + throw new Error(`Aliyun UI Agent reported failure: ${explanation || thought || operation}`); + } + const action = mapAliyunOperationToAction({ + operation, + thought, + explanation, + viewportWidth: params.viewportWidth, + viewportHeight: params.viewportHeight, + }); + + return { + sessionId: this.getSessionId(), + explanation, + output: { + thought, + action, + raw: rawBody, + }, + }; + } +} diff --git a/src/agent/runtime/attempt.ts b/src/agent/runtime/attempt.ts index 5665230..ad2b7a4 100644 --- a/src/agent/runtime/attempt.ts +++ b/src/agent/runtime/attempt.ts @@ -18,6 +18,7 @@ import { } from "@mariozechner/pi-ai"; import type { + AgentAction, CronTaskPlan, HumanAuthDecision, OpenPocketConfig, @@ -400,15 +401,24 @@ export async function runRuntimeAttempt( launchablePackages, taskExecutionPlan: request.taskExecutionPlan ?? null, cronTaskPlan: request.cronTaskPlan ?? null, - runtimeModel: { - id: String((finalModel as { id?: unknown }).id ?? effectiveProfile.model), - provider: String((finalModel as { provider?: unknown }).provider ?? "unknown"), - api: String((finalModel as { api?: unknown }).api ?? "unknown"), - baseUrl: String((finalModel as { baseUrl?: unknown }).baseUrl ?? effectiveProfile.baseUrl), - authSource: auth.source, - }, + runtimeModel: profile.backend === "aliyun_ui_agent_mobile" || profile.backend === "aliyun_gui_plus" + ? { + id: effectiveProfile.model, + provider: profile.backend === "aliyun_gui_plus" ? "aliyun-gui-plus" : "aliyun-ui-agent", + api: profile.backend === "aliyun_gui_plus" ? "aliyun-gui-plus" : "aliyun-ui-agent-mobile", + baseUrl: effectiveProfile.baseUrl, + authSource: auth.source, + } + : { + id: String((finalModel as { id?: unknown }).id ?? 
effectiveProfile.model), + provider: String((finalModel as { provider?: unknown }).provider ?? "unknown"), + api: String((finalModel as { api?: unknown }).api ?? "unknown"), + baseUrl: String((finalModel as { baseUrl?: unknown }).baseUrl ?? effectiveProfile.baseUrl), + authSource: auth.source, + }, effectivePromptMode, systemPrompt, + aliyunSessionId: null, onHumanAuth: request.onHumanAuth, onChannelMedia: request.onChannelMedia, onUserDecision: request.onUserDecision, @@ -574,6 +584,401 @@ export async function runRuntimeAttempt( const thinkingLevel: ThinkingLevel = profile.reasoningEffort && profile.reasoningEffort in thinkingMap ? thinkingMap[profile.reasoningEffort] : "off"; + const captureAliyunSnapshot = async (): Promise => { + if (ctx.finishMessage || ctx.failMessage) { + return null; + } + if (ctx.stopRequested()) { + ctx.failMessage = "Task stopped by user."; + return null; + } + if (ctx.stepCount >= ctx.maxSteps) { + if (!completeBoundedCronRunIfNeeded(ctx)) { + ctx.failMessage = `Max steps reached (${ctx.maxSteps})`; + } + return null; + } + + ctx.lastScreenshotStartMs = Date.now(); + const snapshot = await deps.adb.captureScreenSnapshot(deps.config.agent.deviceId, profile.model); + snapshot.installedApps = launchableApps ?? snapshot.installedApps; + snapshot.installedPackages = launchablePackages; + ctx.latestSnapshot = snapshot; + ctx.lastScreenshotEndMs = Date.now(); + shouldReturnHome = true; + + if (deps.config.screenshots.saveStepScreenshots) { + try { + ctx.lastScreenshotPath = deps.screenshotStore.save( + Buffer.from(snapshot.screenshotBase64, "base64"), + { sessionId: session.id, step: ctx.stepCount + 1, currentApp: snapshot.currentApp }, + ); + } catch { + ctx.lastScreenshotPath = null; + } + try { + ctx.lastSomScreenshotPath = snapshot.somScreenshotBase64 + ? 
deps.screenshotStore.save( + Buffer.from(snapshot.somScreenshotBase64, "base64"), + { sessionId: session.id, step: ctx.stepCount + 1, currentApp: `${snapshot.currentApp}-som` }, + ) + : null; + } catch { + ctx.lastSomScreenshotPath = null; + } + const recentForSave = ctx.recentSnapshotWindow.slice(-2); + const recentPaths: string[] = []; + for (const recent of recentForSave) { + try { + const selected = selectObservationImage(recent); + if (!selected) { + continue; + } + const saved = deps.screenshotStore.save( + Buffer.from(selected.data, "base64"), + { + sessionId: session.id, + step: ctx.stepCount + 1, + currentApp: `${recent.currentApp}${selected.tag === "som" ? "-recent-som" : "-recent"}`, + }, + ); + recentPaths.push(saved); + } catch { + // Best-effort recent screenshot persistence. + } + } + ctx.lastRecentScreenshotPaths = recentPaths; + } + + if ( + deps.isPermissionDialogApp(snapshot.currentApp) && + Date.now() - ctx.lastAutoPermissionAllowAtMs >= 1_200 + ) { + const auto = await deps.autoApprovePermissionDialog(snapshot.currentApp); + if (auto?.action?.type === "tap") { + ctx.lastAutoPermissionAllowAtMs = Date.now(); + await sleep(300); + const refreshed = await deps.adb.captureScreenSnapshot(deps.config.agent.deviceId, profile.model); + refreshed.installedApps = launchableApps ?? 
refreshed.installedApps; + refreshed.installedPackages = launchablePackages; + ctx.latestSnapshot = refreshed; + } + } + + await maybeEscalateSecureSurfaceTakeover(); + if (ctx.failMessage || !ctx.latestSnapshot) { + return null; + } + + deps.saveModelInputArtifacts({ + sessionId: session.id, + step: ctx.stepCount + 1, + task: ctx.task, + profileModel: profile.model, + promptMode: ctx.effectivePromptMode, + systemPrompt: ctx.systemPrompt, + userPrompt: buildUserPrompt( + ctx.task, + ctx.stepCount + 1, + ctx.latestSnapshot, + ctx.history, + ctx.recentSnapshotWindow.slice(-2), + ), + snapshot: ctx.latestSnapshot, + history: ctx.history, + }); + + ctx.recentSnapshotWindow.push(ctx.latestSnapshot); + if (ctx.recentSnapshotWindow.length > 3) { + ctx.recentSnapshotWindow = ctx.recentSnapshotWindow.slice(-3); + } + return ctx.latestSnapshot; + }; + + const buildAliyunAddInfo = (snapshot: ScreenSnapshot): string => { + const lines = [ + `Current app: ${snapshot.currentApp}`, + `Step: ${ctx.stepCount + 1}/${ctx.maxSteps}`, + ]; + const recentHistory = ctx.history.slice(-4); + if (recentHistory.length > 0) { + lines.push("Recent history:"); + for (const item of recentHistory) { + lines.push(`- ${item}`); + } + } + return lines.join("\n"); + }; + + const executeAliyunAction = async (action: AgentAction): Promise => { + if (action.type === "finish") { + ctx.finishMessage = action.message; + return `FINISH: ${action.message}`; + } + if (action.type === "wait") { + const durationMs = Math.max(100, Number(action.durationMs ?? 
1000)); + await sleep(durationMs); + return `Waited ${durationMs}ms`; + } + shouldReturnHome = true; + return await deps.adb.executeAction(action, deps.config.agent.deviceId); + }; + + if (profile.backend === "aliyun_ui_agent_mobile") { + const screenshotStack = deps.localHumanAuthStackFactory(deps.config); + const aliyunClient = deps.aliyunUiAgentClientFactory({ + apiKey, + baseUrl: effectiveProfile.baseUrl, + modelName: effectiveProfile.model, + thoughtLanguage: "english", + sessionId: ctx.aliyunSessionId, + }); + + try { + await screenshotStack.start(); + + while (!ctx.finishMessage && !ctx.failMessage) { + const snapshot = await captureAliyunSnapshot(); + if (!snapshot) { + break; + } + + ctx.lastModelInferenceStartMs = Date.now(); + const signedScreenshot = await screenshotStack.createSignedScreenshotUrl({ ttlSec: 60 }); + const stepResult = await aliyunClient.nextStep({ + task: ctx.task, + screenshotUrl: signedScreenshot.url, + addInfo: buildAliyunAddInfo(snapshot), + viewportWidth: snapshot.width, + viewportHeight: snapshot.height, + }); + + ctx.aliyunSessionId = stepResult.sessionId; + aliyunClient.setSessionId(stepResult.sessionId); + + const stepNo = ctx.stepCount + 1; + ctx.stepCount = stepNo; + const executionResult = await executeAliyunAction(stepResult.output.action); + + deps.workspace.appendStep( + session, + stepNo, + stepResult.output.thought, + JSON.stringify(stepResult.output.action), + executionResult, + ); + + ctx.history.push( + `step ${stepNo}: action=${stepResult.output.action.type} result=${executionResult.replace(/\s+/g, " ").trim()}`, + ); + + if (ctx.onProgress) { + await ctx.onProgress({ + step: stepNo, + maxSteps: ctx.maxSteps, + actionType: stepResult.output.action.type, + thought: stepResult.output.thought, + message: executionResult, + currentApp: ctx.latestSnapshot?.currentApp ?? 
"unknown", + screenshotPath: ctx.lastScreenshotPath, + }); + } + + if (!ctx.finishMessage && !ctx.failMessage && ctx.stepCount >= ctx.maxSteps) { + if (!completeBoundedCronRunIfNeeded(ctx)) { + ctx.failMessage = `Max steps reached (${ctx.maxSteps})`; + } + } + } + } catch (error) { + ctx.failMessage = `Aliyun UI Agent error: ${(error as Error).message}`; + } finally { + await screenshotStack.stop().catch(() => {}); + } + + if (!ctx.finishMessage && !ctx.failMessage && ctx.stopRequested()) { + ctx.failMessage = "Task stopped by user."; + } + + if (ctx.finishMessage) { + deps.workspace.finalizeSession(session, true, ctx.finishMessage); + deps.workspace.appendDailyMemory(profileKey, request.task, true, ctx.finishMessage); + const artifacts = deps.autoArtifactBuilder.build({ + task: request.task, + sessionPath: session.path, + ok: true, + finalMessage: ctx.finishMessage, + traces: ctx.traces, + }); + if (artifacts.skillPath) { + // eslint-disable-next-line no-console + console.log(`[OpenPocket][artifact] auto skill: ${artifacts.skillPath}`); + } + if (artifacts.scriptPath) { + // eslint-disable-next-line no-console + console.log(`[OpenPocket][artifact] auto script: ${artifacts.scriptPath}`); + } + const finalSkillPath = resolveFinalSkillPath(artifacts.skillPath, ctx.finishMessage); + return { + result: { + ok: true, + message: ctx.finishMessage, + sessionPath: session.path, + skillPath: finalSkillPath, + scriptPath: artifacts.scriptPath, + }, + shouldReturnHome, + }; + } + + const failMsg = ctx.failMessage || "Aliyun UI Agent stopped without finishing."; + deps.workspace.finalizeSession(session, false, failMsg); + deps.workspace.appendDailyMemory(profileKey, request.task, false, failMsg); + return { + result: { ok: false, message: failMsg, sessionPath: session.path, skillPath: null, scriptPath: null }, + shouldReturnHome, + }; + } + + if (profile.backend === "aliyun_gui_plus") { + const guiPlusClient = deps.aliyunGuiPlusClientFactory({ + apiKey, + baseUrl: 
effectiveProfile.baseUrl, + modelName: effectiveProfile.model, + thoughtLanguage: "english", + }); + + const buildGuiPlusAddInfo = (snapshot: ScreenSnapshot): string => { + const lines = [ + `Current app: ${snapshot.currentApp}`, + `Step: ${ctx.stepCount + 1}/${ctx.maxSteps}`, + ]; + const recentHistory = ctx.history.slice(-4); + if (recentHistory.length > 0) { + lines.push("Recent history:"); + for (const item of recentHistory) { + lines.push(`- ${item}`); + } + } + return lines.join("\n"); + }; + + const executeGuiPlusAction = async (action: AgentAction): Promise => { + if (action.type === "finish") { + ctx.finishMessage = action.message; + return `FINISH: ${action.message}`; + } + if (action.type === "wait") { + const durationMs = Math.max(100, Number(action.durationMs ?? 1000)); + await sleep(durationMs); + return `Waited ${durationMs}ms`; + } + shouldReturnHome = true; + return await deps.adb.executeAction(action, deps.config.agent.deviceId); + }; + + try { + while (!ctx.finishMessage && !ctx.failMessage) { + if (ctx.stopRequested()) { + break; + } + + const snapshot = await captureAliyunSnapshot(); + if (!snapshot) { + break; + } + + ctx.lastModelInferenceStartMs = Date.now(); + const stepResult = await guiPlusClient.nextStep({ + task: ctx.task, + screenshotBase64: snapshot.screenshotBase64, + addInfo: buildGuiPlusAddInfo(snapshot), + viewportWidth: snapshot.width, + viewportHeight: snapshot.height, + }); + + const stepNo = ctx.stepCount + 1; + ctx.stepCount = stepNo; + const executionResult = await executeGuiPlusAction(stepResult.output.action); + + deps.workspace.appendStep( + session, + stepNo, + stepResult.output.thought, + JSON.stringify(stepResult.output.action), + executionResult, + ); + + ctx.history.push( + `step ${stepNo}: action=${stepResult.output.action.type} result=${executionResult.replace(/\s+/g, " ").trim()}`, + ); + + if (ctx.onProgress) { + await ctx.onProgress({ + step: stepNo, + maxSteps: ctx.maxSteps, + actionType: 
stepResult.output.action.type, + thought: stepResult.output.thought, + message: executionResult, + currentApp: ctx.latestSnapshot?.currentApp ?? "unknown", + screenshotPath: ctx.lastScreenshotPath, + }); + } + + if (!ctx.finishMessage && !ctx.failMessage && ctx.stepCount >= ctx.maxSteps) { + if (!completeBoundedCronRunIfNeeded(ctx)) { + ctx.failMessage = `Max steps reached (${ctx.maxSteps})`; + } + } + } + } catch (error) { + ctx.failMessage = `GUI-Plus error: ${(error as Error).message}`; + } + + if (!ctx.finishMessage && !ctx.failMessage && ctx.stopRequested()) { + ctx.failMessage = "Task stopped by user."; + } + + if (ctx.finishMessage) { + deps.workspace.finalizeSession(session, true, ctx.finishMessage); + deps.workspace.appendDailyMemory(profileKey, request.task, true, ctx.finishMessage); + const artifacts = deps.autoArtifactBuilder.build({ + task: request.task, + sessionPath: session.path, + ok: true, + finalMessage: ctx.finishMessage, + traces: ctx.traces, + }); + if (artifacts.skillPath) { + // eslint-disable-next-line no-console + console.log(`[OpenPocket][artifact] auto skill: ${artifacts.skillPath}`); + } + if (artifacts.scriptPath) { + // eslint-disable-next-line no-console + console.log(`[OpenPocket][artifact] auto script: ${artifacts.scriptPath}`); + } + const finalSkillPath = resolveFinalSkillPath(artifacts.skillPath, ctx.finishMessage); + return { + result: { + ok: true, + message: ctx.finishMessage, + sessionPath: session.path, + skillPath: finalSkillPath, + scriptPath: artifacts.scriptPath, + }, + shouldReturnHome, + }; + } + + const failMsg = ctx.failMessage || "GUI-Plus stopped without finishing."; + deps.workspace.finalizeSession(session, false, failMsg); + deps.workspace.appendDailyMemory(profileKey, request.task, false, failMsg); + return { + result: { ok: false, message: failMsg, sessionPath: session.path, skillPath: null, scriptPath: null }, + shouldReturnHome, + }; + } + if (usePiSessionBridge) { const appendSessionEvent = ( eventType: 
string, diff --git a/src/agent/runtime/types.ts b/src/agent/runtime/types.ts index aa81e01..277e4f5 100644 --- a/src/agent/runtime/types.ts +++ b/src/agent/runtime/types.ts @@ -12,6 +12,7 @@ import type { HumanAuthDecision, HumanAuthRequest, ModelProfile, + ModelStepOutput, OpenPocketConfig, ScreenSnapshot, TaskExecutionPlan, @@ -136,6 +137,52 @@ export interface RuntimeAttemptDependencies { isPermissionDialogApp: (currentApp: string) => boolean; autoApprovePermissionDialog: (currentApp: string) => Promise; saveModelInputArtifacts: (params: RuntimeModelInputArtifactsParams) => void; + aliyunUiAgentClientFactory: (options: { + apiKey: string; + baseUrl?: string; + modelName?: string; + thoughtLanguage?: string; + sessionId?: string | null; + }) => { + getSessionId(): string | null; + setSessionId(sessionId: string | null | undefined): void; + nextStep(params: { + task: string; + screenshotUrl: string; + addInfo?: string; + thoughtLanguage?: string; + viewportWidth?: number; + viewportHeight?: number; + }): Promise<{ + sessionId: string | null; + explanation: string; + output: ModelStepOutput; + }>; + }; + aliyunGuiPlusClientFactory: (options: { + apiKey: string; + baseUrl?: string; + modelName?: string; + thoughtLanguage?: string; + }) => { + resetConversation(): void; + nextStep(params: { + task: string; + screenshotBase64: string; + addInfo?: string; + thoughtLanguage?: string; + viewportWidth: number; + viewportHeight: number; + }): Promise<{ + explanation: string; + output: ModelStepOutput; + }>; + }; + localHumanAuthStackFactory: (config: OpenPocketConfig, log?: (line: string) => void) => { + start(): Promise<{ relayBaseUrl: string; publicBaseUrl: string }>; + stop(): Promise; + createSignedScreenshotUrl(options?: { ttlSec?: number }): Promise<{ url: string; expiresAt: string }>; + }; } /** Mutable state shared across tool execute closures during a single runTask invocation. 
*/ @@ -164,6 +211,7 @@ export interface PhoneAgentRunContext { runtimeModel: RuntimeModelMetadata; effectivePromptMode: SystemPromptMode; systemPrompt: string; + aliyunSessionId: string | null; onHumanAuth?: (request: HumanAuthRequest) => Promise | HumanAuthDecision; onChannelMedia?: (request: ChannelMediaRequest) => Promise | ChannelMediaDeliveryResult; onUserDecision?: (request: UserDecisionRequest) => Promise | UserDecisionResponse; diff --git a/src/cli.ts b/src/cli.ts index 767329f..fe0764f 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -602,6 +602,7 @@ function modelProviderLabel(baseUrl: string): string { if (lower.includes("api.kimi.com")) return "Kimi Code"; if (lower.includes("moonshot.cn") || lower.includes("moonshot.ai")) return "Moonshot AI"; if (lower.includes("api.deepseek.com")) return "DeepSeek"; + if (lower.includes("/api/v2/apps/gui-owl/gui_agent_server")) return "Aliyun UI Agent (Mobile)"; if (lower.includes("dashscope.aliyuncs.com")) return "Qwen (DashScope)"; if (lower.includes("api.minimax.io") || lower.includes("api.minimaxi.com")) return "MiniMax"; if (lower.includes("volces.com") || lower.includes("volcengine.com")) return "Volcano Engine"; diff --git a/src/config/index.ts b/src/config/index.ts index 7a579d1..8a30f98 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -385,6 +385,26 @@ function defaultConfigObject() { reasoningEffort: "medium" as const, temperature: null, }, + "aliyun-ui-agent/mobile": { + baseUrl: "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + model: "pre-gui_owl_7b", + apiKey: "", + apiKeyEnv: "DASHSCOPE_API_KEY", + maxTokens: 4096, + reasoningEffort: null, + temperature: null, + backend: "aliyun_ui_agent_mobile" as const, + }, + "aliyun-gui-plus/mobile": { + baseUrl: "https://dashscope.aliyuncs.com/compatible-mode/v1", + model: "gui-plus", + apiKey: "", + apiKeyEnv: "DASHSCOPE_API_KEY", + maxTokens: 4096, + reasoningEffort: null, + temperature: null, + backend: "aliyun_gui_plus" as const, 
+ }, "qwen-max": { baseUrl: "https://dashscope.aliyuncs.com/compatible-mode/v1", model: "qwen-max", @@ -923,6 +943,7 @@ function normalizeConfig(raw: Record, configPath: string): Open ? reasoningRaw : null; const tempRaw = model.temperature; + const backendRaw = String(model.backend ?? "").trim().toLowerCase(); const parsedBaseUrl = String(model.baseUrl ?? model.base_url ?? "https://api.openai.com/v1"); const baseUrl = normalizeAnthropicBaseUrl(normalizeGoogleBaseUrl(parsedBaseUrl)); models[key] = { @@ -936,6 +957,11 @@ function normalizeConfig(raw: Record, configPath: string): Open tempRaw === null || tempRaw === undefined || Number.isNaN(Number(tempRaw)) ? null : Number(tempRaw), + backend: backendRaw === "aliyun_ui_agent_mobile" + ? "aliyun_ui_agent_mobile" + : backendRaw === "aliyun_gui_plus" + ? "aliyun_gui_plus" + : "default", }; } const defaultModel = String(merged.defaultModel ?? "gpt-5.2-codex"); diff --git a/src/config/model-provider-presets.ts b/src/config/model-provider-presets.ts index e2614cd..fefda01 100644 --- a/src/config/model-provider-presets.ts +++ b/src/config/model-provider-presets.ts @@ -8,6 +8,9 @@ export type ModelProviderPreset = { defaultModelId: string; suggestedModelIds: string[]; matchHosts: string[]; + matchPaths?: string[]; + hostFallback?: boolean; + backend?: ModelProfile["backend"]; }; const MODEL_PROVIDER_PRESETS: ReadonlyArray = [ @@ -83,6 +86,28 @@ const MODEL_PROVIDER_PRESETS: ReadonlyArray = [ suggestedModelIds: ["deepseek-chat", "deepseek-reasoner"], matchHosts: ["api.deepseek.com"], }, + { + key: "aliyun-ui-agent", + label: "Aliyun UI Agent (Mobile)", + baseUrl: "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + apiKeyEnv: "DASHSCOPE_API_KEY", + defaultModelId: "pre-gui_owl_7b", + suggestedModelIds: ["pre-gui_owl_7b"], + matchHosts: ["dashscope.aliyuncs.com"], + matchPaths: ["/api/v2/apps/gui-owl/gui_agent_server"], + backend: "aliyun_ui_agent_mobile", + }, + { + key: "aliyun-gui-plus", + label: 
"Aliyun GUI-Plus (Mobile)", + baseUrl: "https://dashscope.aliyuncs.com/compatible-mode/v1", + apiKeyEnv: "DASHSCOPE_API_KEY", + defaultModelId: "gui-plus", + suggestedModelIds: ["gui-plus"], + matchHosts: ["dashscope.aliyuncs.com"], + matchPaths: ["/compatible-mode/v1/chat/completions"], + backend: "aliyun_gui_plus", + }, { key: "qwen", label: "Qwen (DashScope)", @@ -91,6 +116,8 @@ const MODEL_PROVIDER_PRESETS: ReadonlyArray = [ defaultModelId: "qwen-max", suggestedModelIds: ["qwen-max", "qwen-plus", "qwen-coder-plus"], matchHosts: ["dashscope.aliyuncs.com"], + matchPaths: ["/compatible-mode/v1"], + hostFallback: true, }, { key: "minimax", @@ -173,12 +200,31 @@ export function resolveModelProviderPreset(providerKey: string): ModelProviderPr export function resolveModelProviderPresetByBaseUrl(baseUrl: string): ModelProviderPreset | null { const lower = baseUrl.toLowerCase(); + let pathname = ""; + try { + pathname = new URL(baseUrl).pathname.toLowerCase(); + } catch { + pathname = ""; + } + for (const preset of MODEL_PROVIDER_PRESETS) { - if (preset.matchHosts.some((host) => lower.includes(host))) { + if (!preset.matchPaths || preset.matchPaths.length === 0) { + continue; + } + if (!preset.matchHosts.some((host) => lower.includes(host))) { + continue; + } + if (preset.matchPaths.some((candidate) => pathname.includes(candidate.toLowerCase()))) { return preset; } } - return null; + + const hostMatches = MODEL_PROVIDER_PRESETS.filter((preset) => preset.matchHosts.some((host) => lower.includes(host))); + const hostFallback = hostMatches.find((preset) => preset.hostFallback); + if (hostFallback) { + return hostFallback; + } + return hostMatches[0] ?? null; } export function listModelProviderPresetKeys(): string[] { @@ -217,5 +263,6 @@ export function buildModelProfileFromPreset( maxTokens: Number.isFinite(existing?.maxTokens) ? Number(existing?.maxTokens) : 4096, reasoningEffort: normalizeReasoningEffort(existing?.reasoningEffort) ?? 
inferReasoningEffort(normalizedModelId), temperature: Number.isFinite(existing?.temperature) ? Number(existing?.temperature) : null, + backend: existing?.backend ?? preset.backend ?? "default", }; } diff --git a/src/dashboard/control-store.ts b/src/dashboard/control-store.ts index 597949d..f429db3 100644 --- a/src/dashboard/control-store.ts +++ b/src/dashboard/control-store.ts @@ -97,6 +97,9 @@ export function providerLabel(baseUrl: string): string { if (lower.includes("api.deepseek.com")) { return "DeepSeek"; } + if (lower.includes("/api/v2/apps/gui-owl/gui_agent_server")) { + return "Aliyun UI Agent (Mobile)"; + } if (lower.includes("dashscope.aliyuncs.com")) { return "Qwen (DashScope)"; } diff --git a/src/dashboard/server.ts b/src/dashboard/server.ts index 0a93d41..5984edc 100644 --- a/src/dashboard/server.ts +++ b/src/dashboard/server.ts @@ -4124,6 +4124,8 @@ export class DashboardServer { if (text.includes("anthropic.com")) return "Anthropic"; if (text.includes("googleapis.com")) return "Google"; if (text.includes("api.deepseek.com")) return "DeepSeek"; + if (text.includes("/api/v2/apps/gui-owl/gui_agent_server")) return "Aliyun UI Agent (Mobile)"; + if (text.includes("gui-plus")) return "Aliyun GUI-Plus (Mobile)"; if (text.includes("dashscope.aliyuncs.com")) return "Qwen (DashScope)"; if (text.includes("api.minimax.io") || text.includes("api.minimaxi.com")) return "MiniMax"; if (text.includes("volces.com") || text.includes("volcengine.com")) return "Volcano Engine"; diff --git a/src/human-auth/local-stack.ts b/src/human-auth/local-stack.ts index 6e2ec01..d2c34e9 100644 --- a/src/human-auth/local-stack.ts +++ b/src/human-auth/local-stack.ts @@ -4,6 +4,7 @@ import { loadManagerPorts } from "../manager/ports.js"; import { HumanAuthRelayServer } from "./relay-server.js"; import { NgrokTunnel } from "./ngrok-tunnel.js"; import { LocalHumanAuthTakeoverRuntime } from "./takeover-runtime.js"; +import type { HumanAuthTakeoverRuntime } from "./takeover-runtime.js"; 
export interface LocalHumanAuthStackStartResult { relayBaseUrl: string; @@ -15,6 +16,15 @@ type HubRegistrationResponse = { publicBaseUrl: string; }; +export interface LocalHumanAuthStackOptions { + takeoverRuntime?: HumanAuthTakeoverRuntime; +} + +export interface LocalHumanAuthSignedScreenshot { + url: string; + expiresAt: string; +} + function stripTrailingSlash(value: string): string { return value.trim().replace(/\/+$/, ""); } @@ -22,14 +32,16 @@ function stripTrailingSlash(value: string): string { export class LocalHumanAuthStack { private readonly config: OpenPocketConfig; private readonly log: (line: string) => void; + private readonly options: LocalHumanAuthStackOptions; private relay: HumanAuthRelayServer | null = null; private ngrok: NgrokTunnel | null = null; private registeredAgentId: string | null = null; private resolvedRelayBaseUrl = ""; private resolvedPublicBaseUrl = ""; - constructor(config: OpenPocketConfig, log?: (line: string) => void) { + constructor(config: OpenPocketConfig, log?: (line: string) => void, options: LocalHumanAuthStackOptions = {}) { this.config = config; + this.options = options; this.log = log ?? 
((line: string) => { @@ -87,6 +99,17 @@ export class LocalHumanAuthStack { } } + async createSignedScreenshotUrl(options?: { ttlSec?: number }): Promise { + await this.start(); + if (!this.relay) { + throw new Error("Local human-auth relay is not running."); + } + return this.relay.createSignedScreenshotUrl({ + publicBaseUrl: this.resolvedPublicBaseUrl || this.resolvedRelayBaseUrl || this.requireRelayAddress(), + ttlSec: options?.ttlSec, + }); + } + private async startManagedRelay(agentId: string): Promise { await this.startPrivateRelay({ host: "127.0.0.1", port: 0 }); const directBaseUrl = this.requireRelayAddress(); @@ -156,7 +179,7 @@ export class LocalHumanAuthStack { apiKey: this.config.humanAuth.apiKey, apiKeyEnv: this.config.humanAuth.apiKeyEnv, stateFile: this.config.humanAuth.localRelayStateFile, - takeoverRuntime: new LocalHumanAuthTakeoverRuntime(this.config), + takeoverRuntime: this.options.takeoverRuntime ?? new LocalHumanAuthTakeoverRuntime(this.config), logger: this.log, }); await this.relay.start(); diff --git a/src/human-auth/relay-server.ts b/src/human-auth/relay-server.ts index 89c6c8a..3ae075f 100644 --- a/src/human-auth/relay-server.ts +++ b/src/human-auth/relay-server.ts @@ -521,6 +521,7 @@ export class HumanAuthRelayServer { private readonly options: HumanAuthRelayServerOptions; private readonly log: (line: string) => void; private readonly records = new Map(); + private readonly screenshotTokens = new Map(); private server: http.Server | null = null; /** Per-request rate limiting: track last takeover action timestamp. 
*/ private readonly takeoverActionTimestamps = new Map(); @@ -706,13 +707,16 @@ export class HumanAuthRelayServer { return authHeader.slice("Bearer ".length).trim() === apiKey; } - private makePublicBaseUrl(req: http.IncomingMessage, bodyPublicBaseUrl: string): string { + private makePublicBaseUrl(req: http.IncomingMessage | null, bodyPublicBaseUrl: string): string { if (bodyPublicBaseUrl.trim()) { return bodyPublicBaseUrl.trim().replace(/\/+$/, ""); } if (this.options.publicBaseUrl.trim()) { return this.options.publicBaseUrl.trim().replace(/\/+$/, ""); } + if (!req) { + return this.address.replace(/\/+$/, ""); + } const host = String(req.headers.host ?? `${this.options.host}:${this.options.port}`); const proto = String(req.headers["x-forwarded-proto"] ?? "http"); return `${proto}://${host}`.replace(/\/+$/, ""); @@ -767,6 +771,43 @@ export class HumanAuthRelayServer { return this.options.takeoverRuntime ?? null; } + private purgeExpiredScreenshotTokens(now = nowMs()): void { + for (const [tokenHash, expiresAtMs] of this.screenshotTokens.entries()) { + if (expiresAtMs <= now) { + this.screenshotTokens.delete(tokenHash); + } + } + } + + private verifyScreenshotToken(tokenRaw: unknown): { ok: true } | { ok: false; error: string; status: number } { + const token = String(tokenRaw ?? ""); + this.purgeExpiredScreenshotTokens(); + const expiresAtMs = this.screenshotTokens.get(hashToken(token)); + if (!token || !expiresAtMs || expiresAtMs <= nowMs()) { + return { ok: false, error: "Invalid or expired token.", status: 403 }; + } + return { ok: true }; + } + + createSignedScreenshotUrl(options?: { publicBaseUrl?: string; ttlSec?: number }): { url: string; expiresAt: string } { + if (!this.ensureTakeoverRuntime()) { + throw new Error("Remote takeover runtime is not configured."); + } + const ttlSecRaw = Number(options?.ttlSec ?? 60); + const ttlSec = Math.max(5, Math.min(300, Number.isFinite(ttlSecRaw) ? 
Math.round(ttlSecRaw) : 60)); + const expiresAtMs = nowMs() + ttlSec * 1000; + const token = randomToken(); + this.screenshotTokens.set(hashToken(token), expiresAtMs); + const baseUrl = this.makePublicBaseUrl( + null, + String(options?.publicBaseUrl ?? this.options.publicBaseUrl ?? ""), + ); + return { + url: `${baseUrl}/v1/human-auth/takeover/screenshot?token=${encodeURIComponent(token)}`, + expiresAt: new Date(expiresAtMs).toISOString(), + }; + } + private sanitizeHeaderValue(input: string): string { return String(input || "").replace(/[\r\n]+/g, " ").slice(0, 200); } @@ -3375,6 +3416,30 @@ export class HumanAuthRelayServer { return; } + if (method === "GET" && pathname === "/v1/human-auth/takeover/screenshot") { + const auth = this.verifyScreenshotToken(requestUrl.searchParams.get("token")); + if (!auth.ok) { + sendText(res, auth.status, auth.error); + return; + } + const runtime = this.ensureTakeoverRuntime(); + if (!runtime) { + sendText(res, 501, "Remote takeover runtime is not configured."); + return; + } + try { + const frame = await runtime.captureFrame(); + const imageBuffer = Buffer.from(frame.screenshotBase64, "base64"); + res.statusCode = 200; + res.setHeader("content-type", "image/png"); + res.setHeader("cache-control", "no-store, no-cache, must-revalidate, private"); + res.end(imageBuffer); + } catch (error) { + sendText(res, 500, `Failed to capture screenshot: ${(error as Error).message}`); + } + return; + } + if (method === "POST" && pathname === "/v1/human-auth/requests") { if (!this.isAuthorized(req)) { sendJson(res, 401, { error: "Unauthorized" }); diff --git a/src/onboarding/setup-wizard.ts b/src/onboarding/setup-wizard.ts index 8f01f34..0278298 100644 --- a/src/onboarding/setup-wizard.ts +++ b/src/onboarding/setup-wizard.ts @@ -232,6 +232,9 @@ function providerFromBaseUrl(baseUrl: string): string { if (lower.includes("api.deepseek.com")) { return "DeepSeek"; } + if (lower.includes("/api/v2/apps/gui-owl/gui_agent_server")) { + return "Aliyun UI 
Agent (Mobile)"; + } if (lower.includes("dashscope.aliyuncs.com")) { return "Qwen (DashScope)"; } @@ -277,6 +280,9 @@ function applyProviderApiKey(config: OpenPocketConfig, targetModelKey: string, a } function modelOptionLabel(profileKey: string, profile: ModelProfile): string { + if (profile.backend === "aliyun_ui_agent_mobile" || profileKey === "aliyun-ui-agent/mobile") { + return "Aliyun UI Agent (Mobile)"; + } if (profileKey === "gpt-5.2-codex") { return "GPT-5.2 Codex (OpenAI)"; } @@ -352,6 +358,11 @@ function modelOptionLabel(profileKey: string, profile: ModelProfile): string { return `${profile.model} (${providerFromBaseUrl(profile.baseUrl)})`; } +function currentModelRequiresReadyNgrok(config: OpenPocketConfig): boolean { + const profile = config.models[config.defaultModel]; + return profile?.backend === "aliyun_ui_agent_mobile"; +} + function isOpenAiLikeHost(baseUrl: string): boolean { const lower = baseUrl.toLowerCase(); return lower.includes("openai.com") || lower.includes("chatgpt.com"); @@ -2514,30 +2525,50 @@ async function runHumanAuthStep( prompter: SetupPrompter, state: SetupState, ): Promise { + const requiresReadyNgrok = currentModelRequiresReadyNgrok(config); + if (requiresReadyNgrok) { + await prompter.note( + "Aliyun UI Agent (Mobile)", + "This model requires a public screenshot URL. Onboarding will only continue with a ready ngrok setup.", + ); + } + const mode = await prompter.select( "Real-device authorization bridge mode", - [ - { - value: "ngrok", - label: "Enable local relay + ngrok tunnel (recommended)", - }, - { - value: "lan", - label: "Enable local relay only (same Wi-Fi / LAN access)", - }, - { - value: "disabled", - label: "Disable human-auth bridge for now", - }, - ], - config.humanAuth.enabled - ? config.humanAuth.tunnel.provider === "ngrok" && config.humanAuth.tunnel.ngrok.enabled - ? "ngrok" - : "lan" - : "disabled", + requiresReadyNgrok + ? 
[ + { + value: "ngrok", + label: "Enable local relay + ngrok tunnel (required)", + }, + ] + : [ + { + value: "ngrok", + label: "Enable local relay + ngrok tunnel (recommended)", + }, + { + value: "lan", + label: "Enable local relay only (same Wi-Fi / LAN access)", + }, + { + value: "disabled", + label: "Disable human-auth bridge for now", + }, + ], + requiresReadyNgrok + ? "ngrok" + : config.humanAuth.enabled + ? config.humanAuth.tunnel.provider === "ngrok" && config.humanAuth.tunnel.ngrok.enabled + ? "ngrok" + : "lan" + : "disabled", ); state.humanAuthMode = mode; + if (requiresReadyNgrok && mode !== "ngrok") { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok."); + } if (mode === "disabled") { config.humanAuth.enabled = false; saveConfig(config); @@ -2598,6 +2629,9 @@ async function runHumanAuthStep( ? `Detected ${config.humanAuth.tunnel.ngrok.executable}: ${ngrokVersion}` : buildNgrokSetupGuide(config.humanAuth.tunnel.ngrok.executable, envName), ); + if (requiresReadyNgrok && !ngrokVersion) { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok CLI."); + } const envToken = process.env[envName]?.trim() ?? ""; const configToken = config.humanAuth.tunnel.ngrok.authtoken.trim(); @@ -2608,33 +2642,41 @@ async function runHumanAuthStep( label: `Use environment variable ${envName}`, hint: envToken ? `Detected (length ${envToken.length})` : "Not detected", }, - { - value: "config", - label: "Paste token and save to local config.json", - hint: hasConfigToken ? `Current config token detected (length ${configToken.length})` : undefined, - }, - { - value: "skip", - label: "Skip for now", - }, ]; if (hasConfigToken) { - ngrokTokenOptions.splice(1, 0, { + ngrokTokenOptions.push({ value: "config-existing", label: "Use existing token from local config.json", hint: `Detected (length ${configToken.length})`, }); } + ngrokTokenOptions.push({ + value: "config", + label: "Paste token and save to local config.json", + hint: hasConfigToken ? 
`Current config token detected (length ${configToken.length})` : undefined, + }); + if (!requiresReadyNgrok) { + ngrokTokenOptions.push({ + value: "skip", + label: "Skip for now", + }); + } const tokenMethod = await prompter.select( "How should OpenPocket read ngrok authtoken?", ngrokTokenOptions, hasConfigToken ? "config-existing" : envToken ? "env" : "config", ); + if (requiresReadyNgrok && tokenMethod === "skip") { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok authtoken."); + } if (tokenMethod === "env") { config.humanAuth.tunnel.ngrok.authtoken = ""; if (!envToken) { + if (requiresReadyNgrok) { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok authtoken."); + } await prompter.note( "ngrok Setup", `${envName} is not set in the current shell. Gateway may fail to open ngrok tunnel until you export this env.`, @@ -2649,7 +2691,11 @@ async function runHumanAuthStep( ); if (confirmed) { config.humanAuth.tunnel.ngrok.authtoken = token; + } else if (requiresReadyNgrok) { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok authtoken."); } + } else if (requiresReadyNgrok) { + throw new Error("Aliyun UI Agent (Mobile) requires ngrok authtoken."); } } diff --git a/src/types.ts b/src/types.ts index 8ac1ecc..d0e25d6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -345,6 +345,7 @@ export interface ModelProfile { maxTokens: number; reasoningEffort: "low" | "medium" | "high" | "xhigh" | null; temperature: number | null; + backend: "default" | "aliyun_ui_agent_mobile" | "aliyun_gui_plus"; } export interface OpenPocketConfig { diff --git a/test/aliyun-ui-agent-client.test.mjs b/test/aliyun-ui-agent-client.test.mjs new file mode 100644 index 0000000..87543fb --- /dev/null +++ b/test/aliyun-ui-agent-client.test.mjs @@ -0,0 +1,279 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +const { AliyunUiAgentClient } = await import("../dist/agent/aliyun-ui-agent-client.js"); + +function makeResponse(body) { + return new 
Response(JSON.stringify(body), { + status: 200, + headers: { "content-type": "application/json" }, + }); +} + +test("AliyunUiAgentClient sends the mobile payload and reuses session ids", async () => { + const requests = []; + const responses = [ + makeResponse({ + session_id: "sess-1", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "The Weibo icon is visible on screen.", + Explanation: "Tap the Weibo icon to open the app.", + Operation: "Click (144, 248, 144, 248)", + }, + }, + ], + }, + ], + }), + makeResponse({ + session_id: "sess-1", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "The next screen requires a downward swipe.", + Explanation: "Swipe up so the page scrolls down.", + Operation: "Swipe (320, 900, 320, 300)", + }, + }, + ], + }, + ], + }), + ]; + + const client = new AliyunUiAgentClient({ + apiKey: "dashscope-test-key", + fetchImpl: async (url, init) => { + requests.push({ url, init }); + return responses.shift(); + }, + }); + + const first = await client.nextStep({ + task: "Open Weibo", + screenshotUrl: "https://example.com/screenshot-1.png", + addInfo: "Prefer the visible app icon.", + }); + assert.equal(first.sessionId, "sess-1"); + assert.equal(first.output.action.type, "tap"); + assert.equal(first.output.action.x, 144); + assert.equal(first.output.action.y, 248); + + const firstRequest = requests[0]; + assert.equal(firstRequest.url, "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server"); + assert.equal(firstRequest.init.method, "POST"); + assert.equal(firstRequest.init.headers.Authorization, "Bearer dashscope-test-key"); + assert.equal(firstRequest.init.headers["Content-Type"], "application/json"); + + const firstPayload = JSON.parse(firstRequest.init.body); + const firstMessages = firstPayload.input[0].content[0].data.messages; + assert.deepEqual(firstMessages[0], { image: "https://example.com/screenshot-1.png" }); + assert.deepEqual(firstMessages[1], { instruction: "Open Weibo" }); + 
assert.deepEqual(firstMessages[2], { session_id: "" }); + assert.deepEqual(firstMessages[3], { device_type: "mobile" }); + assert.deepEqual(firstMessages[4], { pipeline_type: "agent" }); + assert.deepEqual(firstMessages[5], { model_name: "pre-gui_owl_7b" }); + assert.deepEqual(firstMessages[6], { thought_language: "english" }); + assert.deepEqual(firstMessages[7], { param_list: [{ add_info: "Prefer the visible app icon." }] }); + + const second = await client.nextStep({ + task: "Scroll down", + screenshotUrl: "https://example.com/screenshot-2.png", + }); + assert.equal(second.sessionId, "sess-1"); + assert.equal(second.output.action.type, "swipe"); + assert.equal(second.output.action.x1, 320); + assert.equal(second.output.action.y1, 900); + assert.equal(second.output.action.x2, 320); + assert.equal(second.output.action.y2, 300); + + const secondPayload = JSON.parse(requests[1].init.body); + assert.deepEqual(secondPayload.input[0].content[0].data.messages[2], { session_id: "sess-1" }); +}); + +test("AliyunUiAgentClient falls back to wait for unsupported operations", async () => { + const client = new AliyunUiAgentClient({ + apiKey: "dashscope-test-key", + fetchImpl: async () => makeResponse({ + session_id: "sess-unsupported", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "This operation type is not supported yet.", + Explanation: "Hover over the control.", + Operation: "Hover (300, 500)", + }, + }, + ], + }, + ], + }), + }); + + const result = await client.nextStep({ + task: "Hover over a control", + screenshotUrl: "https://example.com/screenshot-unsupported.png", + }); + + assert.equal(result.sessionId, "sess-unsupported"); + assert.equal(result.output.action.type, "wait"); + assert.match(result.output.action.reason || "", /unsupported/i); +}); + +test("AliyunUiAgentClient maps type key_press and scroll operations", async () => { + const responses = [ + makeResponse({ + session_id: "sess-ops", + output: [ + { + code: "200", + content: [ + { + 
data: { + Thought: "The search box is focused.", + Explanation: "Type the query into the active input.", + Operation: "Type (OpenPocket)", + }, + }, + ], + }, + ], + }), + makeResponse({ + session_id: "sess-ops", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "Return to the launcher first.", + Explanation: "Press the home key.", + Operation: "Key_press (HOME)", + }, + }, + ], + }, + ], + }), + makeResponse({ + session_id: "sess-ops", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "More results are lower on the page.", + Explanation: "Scroll down once.", + Operation: "Scroll (down)", + }, + }, + ], + }, + ], + }), + ]; + + const client = new AliyunUiAgentClient({ + apiKey: "dashscope-test-key", + fetchImpl: async () => responses.shift(), + }); + + const typed = await client.nextStep({ + task: "Search for OpenPocket", + screenshotUrl: "https://example.com/typed.png", + }); + assert.equal(typed.output.action.type, "type"); + assert.equal(typed.output.action.text, "OpenPocket"); + + const keyPress = await client.nextStep({ + task: "Return home", + screenshotUrl: "https://example.com/home.png", + }); + assert.equal(keyPress.output.action.type, "keyevent"); + assert.equal(keyPress.output.action.keycode, "KEYCODE_HOME"); + + const scrolled = await client.nextStep({ + task: "Scroll down", + screenshotUrl: "https://example.com/scroll.png", + viewportWidth: 1000, + viewportHeight: 2000, + }); + assert.equal(scrolled.output.action.type, "swipe"); + assert.equal(scrolled.output.action.x1, 500); + assert.equal(scrolled.output.action.y1, 1500); + assert.equal(scrolled.output.action.x2, 500); + assert.equal(scrolled.output.action.y2, 500); +}); + +test("AliyunUiAgentClient rejects non-success payload codes and fail operations", async () => { + const responses = [ + makeResponse({ + session_id: "sess-error-code", + output: [ + { + code: "500", + content: [ + { + data: { + Thought: "The platform rejected the request.", + Explanation: 
"Temporary backend issue.", + Operation: "Wait ()", + }, + }, + ], + }, + ], + }), + makeResponse({ + session_id: "sess-fail", + output: [ + { + code: "200", + content: [ + { + data: { + Thought: "The requested screen is unavailable.", + Explanation: "Unable to continue because the app blocked automation.", + Operation: "Fail (app blocked automation)", + }, + }, + ], + }, + ], + }), + ]; + + const client = new AliyunUiAgentClient({ + apiKey: "dashscope-test-key", + fetchImpl: async () => responses.shift(), + }); + + await assert.rejects( + () => client.nextStep({ + task: "Open a blocked app", + screenshotUrl: "https://example.com/error-code.png", + }), + /error code 500/i, + ); + + await assert.rejects( + () => client.nextStep({ + task: "Open a blocked app", + screenshotUrl: "https://example.com/fail.png", + }), + /reported failure/i, + ); +}); diff --git a/test/cli.test.mjs b/test/cli.test.mjs index ac586eb..a2c0de7 100644 --- a/test/cli.test.mjs +++ b/test/cli.test.mjs @@ -557,6 +557,35 @@ test("model list prints configured profiles including Gemini 3.1", () => { assert.match(run.stdout, /gpt-5\.4/i); }); +test("model list and model set expose Aliyun UI Agent mobile distinctly", () => { + const home = makeHome("openpocket-ts-model-aliyun-ui-agent-"); + const init = runCli(["init"], { OPENPOCKET_HOME: home }); + assert.equal(init.status, 0, init.stderr || init.stdout); + + const listRun = runCli(["model", "list"], { + OPENPOCKET_HOME: home, + }); + assert.equal(listRun.status, 0, listRun.stderr || listRun.stdout); + assert.match(listRun.stdout, /aliyun-ui-agent\/mobile/i); + assert.match(listRun.stdout, /Aliyun UI Agent \(Mobile\)/i); + + const setRun = runCli( + ["model", "set", "--provider", "aliyun-ui-agent", "--model", "pre-gui_owl_7b"], + { + OPENPOCKET_HOME: home, + }, + ); + assert.equal(setRun.status, 0, setRun.stderr || setRun.stdout); + assert.match(setRun.stdout, /aliyun-ui-agent\/pre-gui_owl_7b/i); + assert.match(setRun.stdout, /Aliyun UI Agent 
\(Mobile\)/i); + + const cfgPath = path.join(home, "config.json"); + const cfg = JSON.parse(fs.readFileSync(cfgPath, "utf-8")); + assert.equal(cfg.defaultModel, "aliyun-ui-agent/pre-gui_owl_7b"); + assert.equal(cfg.models["aliyun-ui-agent/pre-gui_owl_7b"].apiKeyEnv, "DASHSCOPE_API_KEY"); + assert.equal(cfg.models["aliyun-ui-agent/pre-gui_owl_7b"].backend, "aliyun_ui_agent_mobile"); +}); + test("model set updates default model persistently", () => { const home = makeHome("openpocket-ts-model-set-"); const init = runCli(["init"], { OPENPOCKET_HOME: home }); diff --git a/test/dashboard-control-store.test.mjs b/test/dashboard-control-store.test.mjs index 2e0dc63..3e59253 100644 --- a/test/dashboard-control-store.test.mjs +++ b/test/dashboard-control-store.test.mjs @@ -9,6 +9,7 @@ const { dashboardPaths, defaultControlSettings, defaultOnboardingState, + providerLabel, loadControlSettings, loadOnboardingState, saveControlSettings, @@ -73,6 +74,17 @@ test("control store provides defaults and persists onboarding/control files", () }); }); +test("dashboard providerLabel distinguishes Aliyun UI Agent from DashScope compatible mode", () => { + assert.equal( + providerLabel("https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server"), + "Aliyun UI Agent (Mobile)", + ); + assert.equal( + providerLabel("https://dashscope.aliyuncs.com/compatible-mode/v1"), + "Qwen (DashScope)", + ); +}); + test("default prompt entries include core prompt files", () => { const entries = defaultPromptEntries("/tmp/openpocket-workspace"); assert.equal(entries.length, 11); diff --git a/test/local-human-auth-stack.test.mjs b/test/local-human-auth-stack.test.mjs index 221b049..416a7cc 100644 --- a/test/local-human-auth-stack.test.mjs +++ b/test/local-human-auth-stack.test.mjs @@ -7,6 +7,25 @@ import test from "node:test"; const { loadConfig } = await import("../dist/config/index.js"); const { LocalHumanAuthStack } = await import("../dist/human-auth/local-stack.js"); +const 
ONE_PIXEL_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0mQAAAAASUVORK5CYII="; + +class FakeTakeoverRuntime { + async captureFrame() { + return { + deviceId: "emulator-5554", + currentApp: "com.example.testapp", + width: 1, + height: 1, + screenshotBase64: ONE_PIXEL_PNG_BASE64, + capturedAt: new Date().toISOString(), + }; + } + + async execute() { + return "noop"; + } +} + async function withTempHome(prefix, fn) { const prevHome = process.env.OPENPOCKET_HOME; const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); @@ -42,3 +61,83 @@ test("LocalHumanAuthStack starts local relay without tunnel", async () => { } }); }); + +test("LocalHumanAuthStack creates a signed screenshot URL served by the relay", async () => { + await withTempHome("openpocket-human-auth-signed-screenshot-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.enabled = true; + cfg.humanAuth.useLocalRelay = true; + cfg.humanAuth.localRelayHost = "127.0.0.1"; + cfg.humanAuth.localRelayPort = 0; + cfg.humanAuth.tunnel.provider = "none"; + cfg.humanAuth.tunnel.ngrok.enabled = false; + + const stack = new LocalHumanAuthStack(cfg, undefined, { takeoverRuntime: new FakeTakeoverRuntime() }); + await stack.start(); + try { + const signed = await stack.createSignedScreenshotUrl({ ttlSec: 60 }); + const response = await fetch(signed.url); + const body = Buffer.from(await response.arrayBuffer()); + + assert.equal(response.status, 200); + assert.match(String(response.headers.get("content-type") || ""), /^image\/png/i); + assert.equal(body.equals(Buffer.from(ONE_PIXEL_PNG_BASE64, "base64")), true); + assert.match(signed.expiresAt, /\d{4}-\d{2}-\d{2}T/); + } finally { + await stack.stop(); + } + }); +}); + +test("LocalHumanAuthStack rejects tampered signed screenshot tokens", async () => { + await withTempHome("openpocket-human-auth-signed-screenshot-deny-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.enabled = true; + cfg.humanAuth.useLocalRelay = 
true; + cfg.humanAuth.localRelayHost = "127.0.0.1"; + cfg.humanAuth.localRelayPort = 0; + cfg.humanAuth.tunnel.provider = "none"; + cfg.humanAuth.tunnel.ngrok.enabled = false; + + const stack = new LocalHumanAuthStack(cfg, undefined, { takeoverRuntime: new FakeTakeoverRuntime() }); + await stack.start(); + try { + const signed = await stack.createSignedScreenshotUrl({ ttlSec: 60 }); + const tampered = new URL(signed.url); + tampered.searchParams.set("token", "bad-token"); + + const response = await fetch(tampered); + assert.equal(response.status, 403); + } finally { + await stack.stop(); + } + }); +}); + +test("LocalHumanAuthStack rejects expired signed screenshot tokens", async () => { + await withTempHome("openpocket-human-auth-signed-screenshot-expired-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.enabled = true; + cfg.humanAuth.useLocalRelay = true; + cfg.humanAuth.localRelayHost = "127.0.0.1"; + cfg.humanAuth.localRelayPort = 0; + cfg.humanAuth.tunnel.provider = "none"; + cfg.humanAuth.tunnel.ngrok.enabled = false; + + const stack = new LocalHumanAuthStack(cfg, undefined, { takeoverRuntime: new FakeTakeoverRuntime() }); + await stack.start(); + const originalNow = Date.now; + try { + const issuedAt = originalNow(); + Date.now = () => issuedAt; + const signed = await stack.createSignedScreenshotUrl({ ttlSec: 60 }); + Date.now = () => issuedAt + 61_000; + + const response = await fetch(signed.url); + assert.equal(response.status, 403); + } finally { + Date.now = originalNow; + await stack.stop(); + } + }); +}); diff --git a/test/model-provider-presets.test.mjs b/test/model-provider-presets.test.mjs new file mode 100644 index 0000000..ea9c60c --- /dev/null +++ b/test/model-provider-presets.test.mjs @@ -0,0 +1,61 @@ +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import test from "node:test"; + +const { + buildModelProfileFromPreset, + resolveModelProviderPreset, + 
resolveModelProviderPresetByBaseUrl, +} = await import("../dist/config/model-provider-presets.js"); +const { loadConfig } = await import("../dist/config/index.js"); + +test("Aliyun UI Agent preset builds a mobile backend profile", () => { + const preset = resolveModelProviderPreset("aliyun-ui-agent"); + assert.ok(preset); + assert.equal(preset.baseUrl, "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server"); + assert.equal(preset.apiKeyEnv, "DASHSCOPE_API_KEY"); + + const profile = buildModelProfileFromPreset(preset, "pre-gui_owl_7b"); + assert.equal(profile.backend, "aliyun_ui_agent_mobile"); + assert.equal(profile.baseUrl, preset.baseUrl); + assert.equal(profile.model, "pre-gui_owl_7b"); + assert.equal(profile.apiKeyEnv, "DASHSCOPE_API_KEY"); +}); + +test("DashScope preset resolution distinguishes UI Agent from compatible mode", () => { + const uiAgentPreset = resolveModelProviderPresetByBaseUrl( + "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + ); + const compatibleModePreset = resolveModelProviderPresetByBaseUrl( + "https://dashscope.aliyuncs.com/compatible-mode/v1", + ); + const bareHostPreset = resolveModelProviderPresetByBaseUrl( + "https://dashscope.aliyuncs.com", + ); + + assert.equal(uiAgentPreset?.key, "aliyun-ui-agent"); + assert.equal(compatibleModePreset?.key, "qwen"); + assert.equal(bareHostPreset?.key, "qwen"); +}); + +test("loadConfig preserves explicit Aliyun UI Agent backend hints", () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "openpocket-aliyun-ui-agent-config-")); + const configPath = path.join(tmpDir, "config.json"); + + fs.writeFileSync(configPath, JSON.stringify({ + defaultModel: "aliyun-ui-agent/pre-gui_owl_7b", + models: { + "aliyun-ui-agent/pre-gui_owl_7b": { + baseUrl: "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + model: "pre-gui_owl_7b", + apiKeyEnv: "DASHSCOPE_API_KEY", + backend: "aliyun_ui_agent_mobile", + }, + }, + }, null, 2)); + + const config = 
loadConfig(configPath); + assert.equal(config.models["aliyun-ui-agent/pre-gui_owl_7b"]?.backend, "aliyun_ui_agent_mobile"); +}); diff --git a/test/relay-hub.test.mjs b/test/relay-hub.test.mjs index 99a263a..6384773 100644 --- a/test/relay-hub.test.mjs +++ b/test/relay-hub.test.mjs @@ -14,6 +14,8 @@ const { RelayHubServer } = await import("../dist/manager/relay-hub.js"); const { loadManagerPorts, saveManagerPorts } = await import("../dist/manager/ports.js"); const { LocalHumanAuthStack } = await import("../dist/human-auth/local-stack.js"); +const ONE_PIXEL_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0mQAAAAASUVORK5CYII="; + function runCli(args, env = {}) { return spawnSync("node", [cliPath, ...args], { cwd: repoRoot, @@ -103,3 +105,56 @@ test("managed local human-auth stack registers through relay hub and returns pre } }); }); + +test("managed relay hub proxies signed screenshot URLs", async () => { + await withTempHome("openpocket-relay-hub-screenshot-", async (home) => { + const init = runCli(["init"], { OPENPOCKET_HOME: home }); + assert.equal(init.status, 0, init.stderr || init.stdout); + + const create = runCli( + ["create", "agent", "auth-bot", "--type", "physical-phone", "--device", "AUTH-DEVICE-1"], + { OPENPOCKET_HOME: home }, + ); + assert.equal(create.status, 0, create.stderr || create.stdout); + + const hub = new RelayHubServer({ host: "127.0.0.1", port: 0 }); + await hub.start(); + const ports = loadManagerPorts(); + saveManagerPorts({ ...ports, relayHubPort: Number(new URL(hub.address).port) }); + + const cfg = loadConfig(path.join(home, "agents", "auth-bot", "config.json")); + cfg.humanAuth.enabled = true; + cfg.humanAuth.useLocalRelay = true; + cfg.humanAuth.tunnel.provider = "none"; + cfg.humanAuth.tunnel.ngrok.enabled = false; + + const stack = new LocalHumanAuthStack(cfg, undefined, { + takeoverRuntime: { + captureFrame: async () => ({ + deviceId: "emulator-5554", + currentApp: "com.example.camera", + 
width: 1, + height: 1, + screenshotBase64: ONE_PIXEL_PNG_BASE64, + capturedAt: new Date().toISOString(), + }), + execute: async () => "noop", + }, + }); + + await stack.start(); + try { + const signed = await stack.createSignedScreenshotUrl({ ttlSec: 60 }); + assert.match(signed.url, /^http:\/\/127\.0\.0\.1:\d+\/a\/auth-bot\/v1\/human-auth\/takeover\/screenshot\?token=/); + + const response = await fetch(signed.url); + const body = Buffer.from(await response.arrayBuffer()); + assert.equal(response.status, 200); + assert.match(String(response.headers.get("content-type") || ""), /^image\/png/i); + assert.equal(body.equals(Buffer.from(ONE_PIXEL_PNG_BASE64, "base64")), true); + } finally { + await stack.stop(); + await hub.stop(); + } + }); +}); diff --git a/test/runtime-seams.test.mjs b/test/runtime-seams.test.mjs index bae5e29..f8678f6 100644 --- a/test/runtime-seams.test.mjs +++ b/test/runtime-seams.test.mjs @@ -83,6 +83,10 @@ function createAttemptDeps(runtime) { piSessionBridgeFactory: async () => { throw new Error("piSessionBridgeFactory not configured"); }, + aliyunGuiPlusClientFactory: () => ({ + resetConversation: () => {}, + nextStep: async () => { throw new Error("aliyunGuiPlusClientFactory not configured"); }, + }), }; } @@ -430,6 +434,101 @@ test("runRuntimeAttempt falls back to legacy backend when phone-only tools are r assert.equal(bridgeFactoryCalls, 0); }); +test("runRuntimeAttempt uses Aliyun UI Agent mobile backend when configured", async () => { + const runtime = createRuntimeWithApiKey(); + runtime.config.models[runtime.config.defaultModel] = { + baseUrl: "https://dashscope.aliyuncs.com/api/v2/apps/gui-owl/gui_agent_server", + model: "pre-gui_owl_7b", + apiKey: "dashscope-test-key", + apiKeyEnv: "DASHSCOPE_API_KEY", + maxTokens: 4096, + reasoningEffort: null, + temperature: null, + backend: "aliyun_ui_agent_mobile", + }; + + runtime.adb = { + queryLaunchablePackages: () => [], + resolveDeviceId: () => "emulator-5554", + captureScreenSnapshot: () => 
makeSnapshot({ currentApp: "com.android.launcher3" }), + executeAction: async (action) => { + runtime.__executedActions = runtime.__executedActions ?? []; + runtime.__executedActions.push(action); + return "ok"; + }, + }; + + let agentFactoryCalls = 0; + runtime.agentFactory = () => { + agentFactoryCalls += 1; + throw new Error("legacy agentFactory should not be used for Aliyun UI Agent backend"); + }; + + const deps = createAttemptDeps(runtime); + const screenshotUrls = []; + let stackStops = 0; + deps.localHumanAuthStackFactory = () => ({ + start: async () => ({ relayBaseUrl: "http://127.0.0.1:8787", publicBaseUrl: "https://public.example/a/default" }), + createSignedScreenshotUrl: async () => { + const url = `https://public.example/a/default/v1/human-auth/takeover/screenshot?token=${screenshotUrls.length + 1}`; + screenshotUrls.push(url); + return { url, expiresAt: new Date(Date.now() + 60_000).toISOString() }; + }, + stop: async () => { + stackStops += 1; + }, + }); + + let clientCalls = 0; + deps.aliyunUiAgentClientFactory = () => ({ + getSessionId: () => (clientCalls > 0 ? 
"aliyun-session-1" : null), + setSessionId: () => {}, + nextStep: async ({ screenshotUrl }) => { + clientCalls += 1; + if (clientCalls === 1) { + assert.match(screenshotUrl, /token=1$/); + return { + sessionId: "aliyun-session-1", + explanation: "Tap the icon", + output: { + thought: "The target icon is visible.", + action: { type: "tap", x: 64, y: 96, reason: "Tap the icon" }, + raw: "{\"step\":1}", + }, + }; + } + assert.match(screenshotUrl, /token=2$/); + return { + sessionId: "aliyun-session-1", + explanation: "Task completed", + output: { + thought: "The task is complete.", + action: { type: "finish", message: "aliyun-ui-agent-ok" }, + raw: "{\"step\":2}", + }, + }; + }, + }); + + const outcome = await runRuntimeAttempt(deps, { + task: "Open an app with Aliyun UI Agent", + }); + + assert.equal(outcome.result.ok, true); + assert.match(outcome.result.message, /aliyun-ui-agent-ok/); + assert.equal(agentFactoryCalls, 0); + assert.equal(clientCalls, 2); + assert.equal(stackStops, 1); + assert.equal(screenshotUrls.length, 2); + assert.equal(runtime.__executedActions.length, 1); + assert.deepEqual(runtime.__executedActions[0], { + type: "tap", + x: 64, + y: 96, + reason: "Tap the icon", + }); +}); + test("runRuntimeAttempt treats bounded cron step budget exhaustion as a normal completion", async () => { const runtime = createRuntimeWithApiKey(); runtime.adb = { diff --git a/test/setup-wizard.test.mjs b/test/setup-wizard.test.mjs index 9fe8a0d..e19527e 100644 --- a/test/setup-wizard.test.mjs +++ b/test/setup-wizard.test.mjs @@ -242,6 +242,115 @@ test("setup wizard supports custom provider+model profile in model selection", a }); }); +test("setup wizard supports Aliyun UI Agent mobile profile selection", async () => { + await withTempHome("openpocket-setup-aliyun-ui-agent-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.tunnel.ngrok.executable = "echo"; + const prevToken = process.env.NGROK_AUTHTOKEN; + process.env.NGROK_AUTHTOKEN = "ngrok-test-token"; + 
const prompter = new FakePrompter({ + confirms: [true, true, false, false], + selects: ["physical-phone", "usb", "aliyun-ui-agent/mobile", "skip", "skip", "pairing", "ngrok", "env"], + texts: [""], + pauseCount: 0, + }); + const emulator = new FakeEmulator(); + + try { + await runSetupWizard(cfg, { prompter, emulator, skipTtyCheck: true, printHeader: false }); + + const savedCfg = JSON.parse(fs.readFileSync(cfg.configPath, "utf-8")); + assert.equal(savedCfg.defaultModel, "aliyun-ui-agent/mobile"); + assert.equal(savedCfg.models["aliyun-ui-agent/mobile"].apiKeyEnv, "DASHSCOPE_API_KEY"); + assert.equal(savedCfg.models["aliyun-ui-agent/mobile"].backend, "aliyun_ui_agent_mobile"); + assert.equal(savedCfg.humanAuth.enabled, true); + assert.equal(savedCfg.humanAuth.useLocalRelay, true); + assert.equal(savedCfg.humanAuth.tunnel.provider, "ngrok"); + assert.equal(savedCfg.humanAuth.tunnel.ngrok.enabled, true); + + const statePath = path.join(cfg.stateDir, "onboarding.json"); + const state = JSON.parse(fs.readFileSync(statePath, "utf-8")); + assert.equal(state.modelProfile, "aliyun-ui-agent/mobile"); + assert.equal(state.modelProvider, "Aliyun UI Agent (Mobile)"); + assert.equal(state.apiKeyEnv, "DASHSCOPE_API_KEY"); + assert.equal(state.humanAuthMode, "ngrok"); + assert.equal(typeof state.ngrokConfiguredAt, "string"); + } finally { + if (prevToken === undefined) { + delete process.env.NGROK_AUTHTOKEN; + } else { + process.env.NGROK_AUTHTOKEN = prevToken; + } + } + }); +}); + +test("setup wizard rejects disabling human-auth bridge for Aliyun UI Agent mobile", async () => { + await withTempHome("openpocket-setup-aliyun-ui-agent-disabled-", async () => { + const cfg = loadConfig(); + const prompter = new FakePrompter({ + confirms: [true, true, false, false], + selects: ["physical-phone", "usb", "aliyun-ui-agent/mobile", "skip", "skip", "pairing", "disabled"], + texts: [""], + pauseCount: 0, + }); + const emulator = new FakeEmulator(); + + await assert.rejects( + () => 
runSetupWizard(cfg, { prompter, emulator, skipTtyCheck: true, printHeader: false }), + /Aliyun UI Agent \(Mobile\) requires ngrok/i, + ); + }); +}); + +test("setup wizard rejects missing ngrok CLI for Aliyun UI Agent mobile", async () => { + await withTempHome("openpocket-setup-aliyun-ui-agent-missing-ngrok-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.tunnel.ngrok.executable = "definitely-missing-ngrok-cli"; + const prompter = new FakePrompter({ + confirms: [true, true, false, false], + selects: ["physical-phone", "usb", "aliyun-ui-agent/mobile", "skip", "skip", "pairing", "ngrok", "skip"], + texts: [""], + pauseCount: 0, + }); + const emulator = new FakeEmulator(); + + await assert.rejects( + () => runSetupWizard(cfg, { prompter, emulator, skipTtyCheck: true, printHeader: false }), + /requires ngrok cli/i, + ); + }); +}); + +test("setup wizard rejects missing ngrok token for Aliyun UI Agent mobile", async () => { + await withTempHome("openpocket-setup-aliyun-ui-agent-missing-token-", async () => { + const cfg = loadConfig(); + cfg.humanAuth.tunnel.ngrok.executable = "echo"; + const prevToken = process.env.NGROK_AUTHTOKEN; + delete process.env.NGROK_AUTHTOKEN; + const prompter = new FakePrompter({ + confirms: [true, true, false, false], + selects: ["physical-phone", "usb", "aliyun-ui-agent/mobile", "skip", "skip", "pairing", "ngrok", "skip"], + texts: [""], + pauseCount: 0, + }); + const emulator = new FakeEmulator(); + + try { + await assert.rejects( + () => runSetupWizard(cfg, { prompter, emulator, skipTtyCheck: true, printHeader: false }), + /requires ngrok authtoken/i, + ); + } finally { + if (prevToken === undefined) { + delete process.env.NGROK_AUTHTOKEN; + } else { + process.env.NGROK_AUTHTOKEN = prevToken; + } + } + }); +}); + test("setup wizard can configure physical phone target and skip emulator onboarding", async () => { await withTempHome("openpocket-setup-physical-target-", async () => { const cfg = loadConfig();