Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/agent/actions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,14 @@ export function normalizeAction(input: unknown): AgentAction {
};
}

if (type === "generate_image") {
  // A missing prompt collapses to ""; a falsy reason is reported as undefined.
  const prompt = String(input.prompt ?? "");
  const reason = input.reason ? String(input.reason) : undefined;
  return { type, prompt, reason };
}

if (type === "wait") {
return {
type,
Expand Down
83 changes: 83 additions & 0 deletions src/agent/agent-runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import { buildPiAiModel } from "./model-client.js";
import { buildSystemPrompt, buildUserPrompt, type SystemPromptMode } from "./prompts.js";
import { CHAT_TOOLS, TOOL_METAS, toolNameToActionType, type ToolMeta } from "./tools.js";
import { normalizeAction } from "./actions.js";
import { createImageService, type ImageGenerationService } from "../services/image-generation/index.js";
import { runRuntimeAttempt } from "./runtime/attempt.js";
import { runRuntimeTask } from "./runtime/run.js";
import type { RunTaskRequest } from "./runtime/types.js";
Expand Down Expand Up @@ -307,6 +308,7 @@ export class AgentRuntime {
private readonly piCodingToolsExecutor: PiCodingToolsExecutor;
private readonly memoryExecutor: MemoryExecutor;
private readonly screenshotStore: ScreenshotStore;
private readonly imageGenerationService: ImageGenerationService | null;
private busy = false;
private stopRequested = false;
private currentTask: string | null = null;
Expand All @@ -331,9 +333,43 @@ export class AgentRuntime {
config.screenshots.directory,
config.screenshots.maxCount,
);
// Initialize image generation service if enabled and configured
this.imageGenerationService = this.initializeImageGenerationService(config);
this.agentFactory = options?.agentFactory ?? ((agentOptions: AgentOptions) => new Agent(agentOptions));
}

/**
 * Builds the image generation service described by `config.imageGeneration`.
 *
 * Returns `null` when the feature is disabled (silently), when no API key is
 * available (warning), when the provider is anything other than "fal"
 * (warning), or when creating the service throws (error is logged).
 *
 * @param config - Runtime configuration; only `imageGeneration` is read.
 * @returns The created service, or `null` when image generation is unavailable.
 */
private initializeImageGenerationService(config: OpenPocketConfig): ImageGenerationService | null {
  const settings = config.imageGeneration;
  if (!settings.enabled) {
    return null;
  }

  // An inline key wins; otherwise fall back to the environment variable named by apiKeyEnv.
  const apiKey = settings.apiKey || process.env[settings.apiKeyEnv];
  if (!apiKey) {
    // eslint-disable-next-line no-console
    console.warn("[OpenPocket] Image generation enabled but no API key configured");
    return null;
  }

  try {
    const provider = settings.provider;
    if (provider === "fal") {
      return createImageService({
        type: provider,
        apiKey,
        model: settings.model,
      });
    }

    // Only the fal provider is wired up for now.
    // eslint-disable-next-line no-console
    console.warn(`[OpenPocket] Image generation provider '${provider}' not yet supported`);
    return null;
  } catch (error) {
    // eslint-disable-next-line no-console
    console.error("[OpenPocket] Failed to initialize image generation service:", error);
    return null;
  }
}

isBusy(): boolean {
return this.busy;
}
Expand Down Expand Up @@ -2256,6 +2292,53 @@ export class AgentRuntime {
return { content: [{ type: "text" as const, text: resultText }], details: {} };
}

// ---- generate_image ----
if (action.type === "generate_image") {
  // Logs one outcome to the workspace step log and mirrors it into the in-memory traces.
  const recordOutcome = (outcome: string, status: "ok" | "error"): void => {
    const currentApp = snapshot?.currentApp ?? "unknown";
    runtime.workspace.appendStep(
      ctx.session,
      step,
      thought,
      JSON.stringify(action, null, 2),
      outcome,
      buildStepTrace(currentApp, status),
    );
    ctx.traces.push({ step, action, result: outcome, thought, currentApp });
  };

  // No service instance means image generation was disabled or could not be set up.
  const imageService = runtime.imageGenerationService;
  if (!imageService) {
    const msg = "Image generation requested, but service is not enabled or configured.";
    ctx.failMessage = msg;
    recordOutcome(msg, "error");
    return { content: [{ type: "text" as const, text: msg }], details: {} };
  }

  try {
    const result = await imageService.generate(action.prompt);
    const resultText = `Image generated successfully.\nURL: ${result.url}\nProvider: ${result.provider}`;
    recordOutcome(resultText, "ok");
    // Only successful generations are added to the running step history.
    ctx.history.push(`step ${step}: action=generate_image url=${result.url}`);
    return { content: [{ type: "text" as const, text: resultText }], details: { imageUrl: result.url } };
  } catch (error) {
    // Any failure inside the try (generation or bookkeeping) is reported back as text.
    const errorMsg = `Image generation failed: ${error instanceof Error ? error.message : String(error)}`;
    ctx.failMessage = errorMsg;
    recordOutcome(errorMsg, "error");
    return { content: [{ type: "text" as const, text: errorMsg }], details: {} };
  }
}

// ---- all other actions (tap, swipe, type, keyevent, launch_app, shell, run_script, read, write, edit, etc.) ----
const executionResult = await runtime.executePhoneAction(action, ctx);
const stepResult = ctx.lastScreenshotPath
Expand Down
16 changes: 16 additions & 0 deletions src/agent/model-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,22 @@ function extractThinking(msg: AssistantMessage): string {
* OpenAI-compatible endpoint).
*/
export function buildPiAiModel(profile: ModelProfile): Model<Api> {
// A profile that names both its api and provider is used as-is, without baseUrl detection.
if (profile.api && profile.provider) {
  const { model: modelId, baseUrl, maxTokens } = profile;
  const explicitModel: Model<Api> = {
    id: modelId,
    name: modelId,
    api: profile.api as Api,
    provider: profile.provider,
    baseUrl,
    // Reasoning is reported as on whenever reasoningEffort is anything other than null.
    reasoning: profile.reasoningEffort !== null,
    input: ["text", "image"],
    // Costs are fixed at zero and the context window at 128k for explicit profiles.
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 128_000,
    maxTokens,
  };
  return explicitModel;
}

// Detect provider / api from baseUrl.
const baseUrlLower = profile.baseUrl.toLowerCase();

Expand Down
2 changes: 2 additions & 0 deletions src/agent/prompts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const TOOL_CATALOG_ORDER = [
"process",
"memory_search",
"memory_get",
"generate_image",
"request_human_auth",
"request_user_decision",
"request_user_input",
Expand All @@ -44,6 +45,7 @@ const TOOL_CATALOG_LINES: Record<(typeof TOOL_CATALOG_ORDER)[number], string> =
process: "- process: process(action[, sessionId, input, offset, limit, timeoutMs, reason])",
memory_search: "- memory_search: memory_search(query[, maxResults, minScore, reason])",
memory_get: "- memory_get: memory_get(path[, from, lines, reason])",
generate_image: "- generate_image: generate_image(prompt[, reason]) - Generate an image from text description using AI image generation service",
request_human_auth: "- request_human_auth: request_human_auth(capability, instruction[, timeoutSec, reason])",
request_user_decision: "- request_user_decision: request_user_decision(question, options[, timeoutSec, reason])",
request_user_input: "- request_user_input: request_user_input(question[, placeholder, timeoutSec, reason])",
Expand Down
8 changes: 8 additions & 0 deletions src/agent/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ export const finishSchema = Type.Object({
message: Type.String({ description: "Summary of what was accomplished." }),
});

/**
 * Parameter schema for the `generate_image` tool.
 *
 * Combines the shared `thought` and `reason` parameters (ThoughtParam / ReasonParam,
 * defined elsewhere in this module) with a text `prompt` describing the image.
 */
export const generateImageSchema = Type.Object({
thought: ThoughtParam,
prompt: Type.String({ description: "Text description of the image to generate." }),
reason: ReasonParam,
});

// ---------------------------------------------------------------------------
// Exported types (Static inference from TypeBox schemas)
// ---------------------------------------------------------------------------
Expand All @@ -210,6 +216,7 @@ export type RequestUserDecisionParams = Static<typeof requestUserDecisionSchema>
export type RequestUserInputParams = Static<typeof requestUserInputSchema>;
export type WaitParams = Static<typeof waitSchema>;
export type FinishParams = Static<typeof finishSchema>;
export type GenerateImageParams = Static<typeof generateImageSchema>;

// ---------------------------------------------------------------------------
// Tool metadata list (name, description, schema) — used to build AgentTool[]
Expand Down Expand Up @@ -243,6 +250,7 @@ export const TOOL_METAS: ToolMeta[] = [
{ name: "request_user_input", description: "Ask user for a short non-sensitive text input needed to continue the task.", parameters: requestUserInputSchema },
{ name: "wait", description: "Wait / do nothing for a short period, e.g. while content is loading.", parameters: waitSchema },
{ name: "finish", description: "Signal that the user task is complete.", parameters: finishSchema },
{ name: "generate_image", description: "Generate an image from text description using an image generation service.", parameters: generateImageSchema },
];

// ---------------------------------------------------------------------------
Expand Down
37 changes: 29 additions & 8 deletions src/config/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,13 @@ function defaultConfigObject() {
},
},
},
imageGeneration: {
enabled: false,
provider: "fal" as const,
apiKey: "",
apiKeyEnv: "FAL_API_KEY",
model: "fal-ai/nano-banana",
},
models: {
"gpt-5.2-codex": {
baseUrl: "https://api.openai.com/v1",
Expand Down Expand Up @@ -613,9 +620,9 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
model.reasoningEffort ?? model.reasoning_effort ?? null;
const reasoningEffort =
reasoningRaw === "low" ||
reasoningRaw === "medium" ||
reasoningRaw === "high" ||
reasoningRaw === "xhigh"
reasoningRaw === "medium" ||
reasoningRaw === "high" ||
reasoningRaw === "xhigh"
? reasoningRaw
: null;
const tempRaw = model.temperature;
Expand Down Expand Up @@ -649,6 +656,7 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
const cron = (merged.cron ?? {}) as Record<string, unknown>;
const dashboard = (merged.dashboard ?? {}) as Record<string, unknown>;
const humanAuth = (merged.humanAuth ?? {}) as Record<string, unknown>;
const imageGeneration = (merged.imageGeneration ?? {}) as Record<string, unknown>;
const sessionStorage = (merged.sessionStorage ?? {}) as Record<string, unknown>;
const humanAuthTunnel = isObject(humanAuth.tunnel) ? humanAuth.tunnel : {};
const humanAuthNgrok = isObject(humanAuthTunnel.ngrok) ? humanAuthTunnel.ngrok : {};
Expand Down Expand Up @@ -792,7 +800,7 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
localRelayStateFile: resolvePath(
String(
humanAuth.localRelayStateFile ??
path.join(resolvedStateDir, "human-auth-relay", "requests.json"),
path.join(resolvedStateDir, "human-auth-relay", "requests.json"),
),
),
relayBaseUrl: String(humanAuth.relayBaseUrl ?? "").trim().replace(/\/+$/, ""),
Expand All @@ -813,30 +821,42 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
executable:
String(
humanAuthNgrok.executable ??
defaultConfigObject().humanAuth.tunnel.ngrok.executable,
defaultConfigObject().humanAuth.tunnel.ngrok.executable,
).trim() || "ngrok",
authtoken: String(humanAuthNgrok.authtoken ?? ""),
authtokenEnv:
String(
humanAuthNgrok.authtokenEnv ??
defaultConfigObject().humanAuth.tunnel.ngrok.authtokenEnv,
defaultConfigObject().humanAuth.tunnel.ngrok.authtokenEnv,
).trim() || "NGROK_AUTHTOKEN",
apiBaseUrl:
String(
humanAuthNgrok.apiBaseUrl ??
defaultConfigObject().humanAuth.tunnel.ngrok.apiBaseUrl,
defaultConfigObject().humanAuth.tunnel.ngrok.apiBaseUrl,
).trim().replace(/\/+$/, "") || "http://127.0.0.1:4040",
startupTimeoutSec: (() => {
const raw = Number(
humanAuthNgrok.startupTimeoutSec ??
defaultConfigObject().humanAuth.tunnel.ngrok.startupTimeoutSec,
defaultConfigObject().humanAuth.tunnel.ngrok.startupTimeoutSec,
);
const value = Number.isFinite(raw) ? raw : 20;
return Math.max(3, Math.round(value));
})(),
},
},
},
imageGeneration: {
enabled: Boolean(imageGeneration.enabled ?? false),
provider: (() => {
const provider = String(imageGeneration.provider ?? "fal");
return provider === "fal" || provider === "replicate" || provider === "huggingface"
? provider
: "fal";
})(),
apiKey: String(imageGeneration.apiKey ?? ""),
apiKeyEnv: String(imageGeneration.apiKeyEnv ?? "FAL_API_KEY"),
model: imageGeneration.model ? String(imageGeneration.model) : undefined,
},
models,
configPath,
};
Expand Down Expand Up @@ -882,6 +902,7 @@ export function saveConfig(config: OpenPocketConfig): void {
cron: config.cron,
dashboard: config.dashboard,
humanAuth: config.humanAuth,
imageGeneration: config.imageGeneration,
models: config.models,
};
fs.writeFileSync(config.configPath, `${JSON.stringify(payload, null, 2)}\n`, "utf-8");
Expand Down
3 changes: 3 additions & 0 deletions src/device/adb-runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,9 @@ export class AdbRuntime {
case "finish": {
return `Finish: ${action.message}`;
}
case "generate_image": {
return "generate_image is handled by AgentRuntime image generation service.";
}
default: {
const exhaust: never = action;
return `Unknown action: ${JSON.stringify(exhaust)}`;
Expand Down
43 changes: 35 additions & 8 deletions src/gateway/telegram-gateway.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2467,14 +2467,41 @@ export class TelegramGateway {
this.stripStepCounterTelemetry(finalMessage),
1800,
);
await this.bot.sendMessage(
chatId,
finalForChat,
{
disable_web_page_preview: true,
},
);
this.chat.appendExternalTurn(chatId, "assistant", finalMessage);
// Deliver the final message. When it contains a direct https link to an image
// file, try to send the image itself with the remaining text as its caption;
// otherwise (or if the photo upload fails) send plain text.
//
// The optional trailing group keeps a query string attached to the link, so
// signed URLs (e.g. ".../img.png?sig=...") reach sendPhoto intact and the query
// text does not leak into the caption. It stops at common closing delimiters
// so a markdown-style "(url)" wrapper is not swallowed.
const imageUrlMatch = finalMessage.match(
  /https:\/\/[^\s]+\.(?:png|jpe?g|gif|webp)(?:\?[^\s)\]>"']*)?/i,
);
let deliveredAsPhoto = false;
if (imageUrlMatch) {
  const imageUrl = imageUrlMatch[0];
  // Caption is the message without the link, cut to 1000 characters.
  const caption = this.sanitizeForChat(
    this.stripStepCounterTelemetry(finalMessage.replace(imageUrl, "").trim()),
    1000,
  );
  try {
    await this.bot.sendPhoto(chatId, imageUrl, {
      caption: caption || undefined,
    });
    deliveredAsPhoto = true;
  } catch (error) {
    // Narrow before reading .message so non-Error throws are still logged meaningfully.
    const detail = error instanceof Error ? error.message : String(error);
    this.log(`Failed to send photo, falling back to text: ${detail}`);
    // Fallback: full text including the link. No options are passed here,
    // exactly as before, so link-preview behaviour is left at its default.
    await this.bot.sendMessage(
      chatId,
      this.sanitizeForChat(
        this.stripStepCounterTelemetry(finalMessage),
        1800,
      ),
    );
  }
} else {
  await this.bot.sendMessage(
    chatId,
    finalForChat,
    {
      disable_web_page_preview: true,
    },
  );
}
// Reached only when one of the sends above succeeded (a failed text send throws
// past this point), so the chat history records exactly what was delivered.
// `deliveredAsPhoto` is kept for readability of the control flow above.
void deliveredAsPhoto;
this.chat.appendExternalTurn(chatId, "assistant", finalMessage);
}

return {
Expand Down
38 changes: 38 additions & 0 deletions src/services/image-generation/base.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* Base interface for image generation providers
*
* Implementations wrap specific third-party APIs (fal, replicate, etc.)
* while exposing a consistent business-level interface.
*/

import type {
ImageGenerationRequest,
ImageGenerationResult,
} from "./types.js";

/**
 * Contract that every image generation backend implements.
 *
 * The class declares abstract members only and carries no behaviour of its own;
 * concrete subclasses wrap a specific third-party API (fal, replicate, etc.).
 * NOTE(review): a factory is expected to pick the subclass from configuration,
 * but that factory is not part of this file.
 */
export abstract class ImageGenerationProvider {
/**
 * Identifier for this provider; intended to be unique across providers.
 */
abstract readonly providerId: string;

/**
 * Generates an image for the given request.
 *
 * @param request - Image generation request
 * @returns Promise resolving to the generation result
 * @throws ImageGenerationError if generation fails (the error type is not
 *   imported in this file — confirm where it is declared)
 */
abstract generate(request: ImageGenerationRequest): Promise<ImageGenerationResult>;

/**
 * Reports whether this provider is properly configured. What counts as
 * "configured" is decided by each subclass.
 */
abstract isConfigured(): boolean;
}
Loading
Loading