pockebot · flashclub · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/src/agent/actions.ts b/src/agent/actions.ts
@@ -271,6 +271,14 @@ export function normalizeAction(input: unknown): AgentAction {
     };
   }
 
+  if (type === "generate_image") {
+    // Coerce the prompt to a string; a missing prompt becomes "".
+    const prompt = String(input.prompt ?? "");
+    // Keep `reason` only when a truthy value was supplied.
+    const reason = input.reason ? String(input.reason) : undefined;
+    return { type, prompt, reason };
+  }
+
   if (type === "wait") {
     return {
       type,

diff --git a/src/agent/agent-runtime.ts b/src/agent/agent-runtime.ts
@@ -46,6 +46,7 @@ import { buildPiAiModel } from "./model-client.js";
 import { buildSystemPrompt, buildUserPrompt, type SystemPromptMode } from "./prompts.js";
 import { CHAT_TOOLS, TOOL_METAS, toolNameToActionType, type ToolMeta } from "./tools.js";
 import { normalizeAction } from "./actions.js";
+import { createImageService, type ImageGenerationService } from "../services/image-generation/index.js";
 import { runRuntimeAttempt } from "./runtime/attempt.js";
 import { runRuntimeTask } from "./runtime/run.js";
 import type { RunTaskRequest } from "./runtime/types.js";
@@ -307,6 +308,7 @@ export class AgentRuntime {
   private readonly piCodingToolsExecutor: PiCodingToolsExecutor;
   private readonly memoryExecutor: MemoryExecutor;
   private readonly screenshotStore: ScreenshotStore;
+  private readonly imageGenerationService: ImageGenerationService | null;
   private busy = false;
   private stopRequested = false;
   private currentTask: string | null = null;
@@ -331,9 +333,43 @@ export class AgentRuntime {
       config.screenshots.directory,
       config.screenshots.maxCount,
     );
+    // Initialize image generation service if enabled and configured
+    this.imageGenerationService = this.initializeImageGenerationService(config);
     this.agentFactory = options?.agentFactory ?? ((agentOptions: AgentOptions) => new Agent(agentOptions));
   }
 
+  /**
+   * Creates the image service from `config.imageGeneration`; returns null when it is
+   * disabled, lacks an API key, names a provider other than "fal", or fails to build.
+   */
+  private initializeImageGenerationService(config: OpenPocketConfig): ImageGenerationService | null {
+    const { enabled, apiKey: inlineKey, apiKeyEnv, provider, model } = config.imageGeneration;
+    if (!enabled) {
+      return null;
+    }
+
+    // An inline key takes precedence over the environment variable named in config.
+    const apiKey = inlineKey || process.env[apiKeyEnv];
+    if (!apiKey) {
+      // eslint-disable-next-line no-console
+      console.warn("[OpenPocket] Image generation enabled but no API key configured");
+      return null;
+    }
+
+    try {
+      if (provider !== "fal") {
+        // eslint-disable-next-line no-console
+        console.warn(`[OpenPocket] Image generation provider '${provider}' not yet supported`);
+        return null;
+      }
+      return createImageService({ type: provider, apiKey, model });
+    } catch (error) {
+      // eslint-disable-next-line no-console
+      console.error("[OpenPocket] Failed to initialize image generation service:", error);
+      return null;
+    }
+  }
+
   isBusy(): boolean {
     return this.busy;
   }
@@ -2256,6 +2292,53 @@ export class AgentRuntime {
             return { content: [{ type: "text" as const, text: resultText }], details: {} };
           }
 
+          // ---- generate_image ----
+          if (action.type === "generate_image") {
+            const currentApp = snapshot?.currentApp ?? "unknown";
+            // Writes one outcome to both the workspace step log and the trace list.
+            const recordOutcome = (result: string, status: "ok" | "error"): void => {
+              runtime.workspace.appendStep(
+                ctx.session,
+                step,
+                thought,
+                JSON.stringify(action, null, 2),
+                result,
+                buildStepTrace(currentApp, status),
+              );
+              ctx.traces.push({ step, action, result, thought, currentApp });
+            };
+            const imageService = runtime.imageGenerationService;
+            if (!imageService) {
+              const msg = "Image generation requested, but service is not enabled or configured.";
+              ctx.failMessage = msg;
+              recordOutcome(msg, "error");
+              return {
+                content: [{ type: "text" as const, text: msg }],
+                details: {},
+              };
+            }
+
+            try {
+              const result = await imageService.generate(action.prompt);
+              const resultText = `Image generated successfully.\nURL: ${result.url}\nProvider: ${result.provider}`;
+              recordOutcome(resultText, "ok");
+              // Only a successful generation is appended to the step history.
+              ctx.history.push(`step ${step}: action=generate_image url=${result.url}`);
+              return {
+                content: [{ type: "text" as const, text: resultText }],
+                details: { imageUrl: result.url },
+              };
+            } catch (error) {
+              const errorMsg = `Image generation failed: ${error instanceof Error ? error.message : String(error)}`;
+              ctx.failMessage = errorMsg;
+              recordOutcome(errorMsg, "error");
+              return {
+                content: [{ type: "text" as const, text: errorMsg }],
+                details: {},
+              };
+            }
+          }
+
           // ---- all other actions (tap, swipe, type, keyevent, launch_app, shell, run_script, read, write, edit, etc.) ----
           const executionResult = await runtime.executePhoneAction(action, ctx);
           const stepResult = ctx.lastScreenshotPath

diff --git a/src/agent/model-client.ts b/src/agent/model-client.ts
@@ -98,6 +98,22 @@ function extractThinking(msg: AssistantMessage): string {
  * OpenAI-compatible endpoint).
  */
 export function buildPiAiModel(profile: ModelProfile): Model<Api> {
+  // An explicit api + provider pair on the profile bypasses the baseUrl detection below.
+  if (profile.api && profile.provider) {
+    return {
+      id: profile.model,
+      name: profile.model,
+      api: profile.api as Api, // cast only: the value is not checked against Api here
+      provider: profile.provider,
+      baseUrl: profile.baseUrl,
+      reasoning: profile.reasoningEffort !== null, // true for any effort value other than null
+      input: ["text", "image"], // hard-coded; not derived from the profile
+      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, // all cost fields zeroed on this path
+      contextWindow: 128_000, // fixed value; not read from the profile
+      maxTokens: profile.maxTokens,
+    };
+  }
+
   // Detect provider / api from baseUrl.
   const baseUrlLower = profile.baseUrl.toLowerCase();
 

diff --git a/src/agent/prompts.ts b/src/agent/prompts.ts
@@ -20,6 +20,7 @@ const TOOL_CATALOG_ORDER = [
   "process",
   "memory_search",
   "memory_get",
+  "generate_image",
   "request_human_auth",
   "request_user_decision",
   "request_user_input",
@@ -44,6 +45,7 @@ const TOOL_CATALOG_LINES: Record<(typeof TOOL_CATALOG_ORDER)[number], string> =
   process: "- process: process(action[, sessionId, input, offset, limit, timeoutMs, reason])",
   memory_search: "- memory_search: memory_search(query[, maxResults, minScore, reason])",
   memory_get: "- memory_get: memory_get(path[, from, lines, reason])",
+  generate_image: "- generate_image: generate_image(prompt[, reason]) - Generate an image from text description using AI image generation service",
   request_human_auth: "- request_human_auth: request_human_auth(capability, instruction[, timeoutSec, reason])",
   request_user_decision: "- request_user_decision: request_user_decision(question, options[, timeoutSec, reason])",
   request_user_input: "- request_user_input: request_user_input(question[, placeholder, timeoutSec, reason])",

diff --git a/src/agent/tools.ts b/src/agent/tools.ts
@@ -185,6 +185,12 @@ export const finishSchema = Type.Object({
   message: Type.String({ description: "Summary of what was accomplished." }),
 });
 
+export const generateImageSchema = Type.Object({ // parameter schema bound to the "generate_image" tool
+  thought: ThoughtParam,
+  prompt: Type.String({ description: "Text description of the image to generate." }),
+  reason: ReasonParam,
+});
+
 // ---------------------------------------------------------------------------
 // Exported types (Static inference from TypeBox schemas)
 // ---------------------------------------------------------------------------
@@ -210,6 +216,7 @@ export type RequestUserDecisionParams = Static<typeof requestUserDecisionSchema>
 export type RequestUserInputParams = Static<typeof requestUserInputSchema>;
 export type WaitParams = Static<typeof waitSchema>;
 export type FinishParams = Static<typeof finishSchema>;
+export type GenerateImageParams = Static<typeof generateImageSchema>;
 
 // ---------------------------------------------------------------------------
 // Tool metadata list (name, description, schema) — used to build AgentTool[]
@@ -243,6 +250,7 @@ export const TOOL_METAS: ToolMeta[] = [
   { name: "request_user_input", description: "Ask user for a short non-sensitive text input needed to continue the task.", parameters: requestUserInputSchema },
   { name: "wait", description: "Wait / do nothing for a short period, e.g. while content is loading.", parameters: waitSchema },
   { name: "finish", description: "Signal that the user task is complete.", parameters: finishSchema },
+  { name: "generate_image", description: "Generate an image from text description using an image generation service.", parameters: generateImageSchema },
 ];
 
 // ---------------------------------------------------------------------------

diff --git a/src/config/index.ts b/src/config/index.ts
@@ -166,6 +166,13 @@ function defaultConfigObject() {
         },
       },
     },
+    imageGeneration: {
+      enabled: false,
+      provider: "fal" as const,
+      apiKey: "",
+      apiKeyEnv: "FAL_API_KEY",
+      model: "fal-ai/nano-banana",
+    },
     models: {
       "gpt-5.2-codex": {
         baseUrl: "https://api.openai.com/v1",
@@ -613,9 +620,9 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
       model.reasoningEffort ?? model.reasoning_effort ?? null;
     const reasoningEffort =
       reasoningRaw === "low" ||
-      reasoningRaw === "medium" ||
-      reasoningRaw === "high" ||
-      reasoningRaw === "xhigh"
+        reasoningRaw === "medium" ||
+        reasoningRaw === "high" ||
+        reasoningRaw === "xhigh"
         ? reasoningRaw
         : null;
     const tempRaw = model.temperature;
@@ -649,6 +656,7 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
   const cron = (merged.cron ?? {}) as Record<string, unknown>;
   const dashboard = (merged.dashboard ?? {}) as Record<string, unknown>;
   const humanAuth = (merged.humanAuth ?? {}) as Record<string, unknown>;
+  const imageGeneration = (merged.imageGeneration ?? {}) as Record<string, unknown>;
   const sessionStorage = (merged.sessionStorage ?? {}) as Record<string, unknown>;
   const humanAuthTunnel = isObject(humanAuth.tunnel) ? humanAuth.tunnel : {};
   const humanAuthNgrok = isObject(humanAuthTunnel.ngrok) ? humanAuthTunnel.ngrok : {};
@@ -792,7 +800,7 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
       localRelayStateFile: resolvePath(
         String(
           humanAuth.localRelayStateFile ??
-            path.join(resolvedStateDir, "human-auth-relay", "requests.json"),
+          path.join(resolvedStateDir, "human-auth-relay", "requests.json"),
         ),
       ),
       relayBaseUrl: String(humanAuth.relayBaseUrl ?? "").trim().replace(/\/+$/, ""),
@@ -813,30 +821,42 @@ function normalizeConfig(raw: Record<string, unknown>, configPath: string): Open
           executable:
             String(
               humanAuthNgrok.executable ??
-                defaultConfigObject().humanAuth.tunnel.ngrok.executable,
+              defaultConfigObject().humanAuth.tunnel.ngrok.executable,
             ).trim() || "ngrok",
           authtoken: String(humanAuthNgrok.authtoken ?? ""),
           authtokenEnv:
             String(
               humanAuthNgrok.authtokenEnv ??
-                defaultConfigObject().humanAuth.tunnel.ngrok.authtokenEnv,
+              defaultConfigObject().humanAuth.tunnel.ngrok.authtokenEnv,
             ).trim() || "NGROK_AUTHTOKEN",
           apiBaseUrl:
             String(
               humanAuthNgrok.apiBaseUrl ??
-                defaultConfigObject().humanAuth.tunnel.ngrok.apiBaseUrl,
+              defaultConfigObject().humanAuth.tunnel.ngrok.apiBaseUrl,
             ).trim().replace(/\/+$/, "") || "http://127.0.0.1:4040",
           startupTimeoutSec: (() => {
             const raw = Number(
               humanAuthNgrok.startupTimeoutSec ??
-                defaultConfigObject().humanAuth.tunnel.ngrok.startupTimeoutSec,
+              defaultConfigObject().humanAuth.tunnel.ngrok.startupTimeoutSec,
             );
             const value = Number.isFinite(raw) ? raw : 20;
             return Math.max(3, Math.round(value));
           })(),
         },
       },
     },
+    imageGeneration: {
+      enabled: Boolean(imageGeneration.enabled ?? false), // truthiness coercion: any non-empty string (even "false") enables it
+      provider: (() => {
+        const provider = String(imageGeneration.provider ?? "fal");
+        return provider === "fal" || provider === "replicate" || provider === "huggingface"
+          ? provider
+          : "fal"; // unrecognised provider names fall back to "fal"
+      })(),
+      apiKey: String(imageGeneration.apiKey ?? ""),
+      apiKeyEnv: String(imageGeneration.apiKeyEnv ?? "FAL_API_KEY"),
+      model: imageGeneration.model ? String(imageGeneration.model) : undefined, // falsy model (missing or "") stays undefined
+    },
     models,
     configPath,
   };
@@ -882,6 +902,7 @@ export function saveConfig(config: OpenPocketConfig): void {
     cron: config.cron,
     dashboard: config.dashboard,
     humanAuth: config.humanAuth,
+    imageGeneration: config.imageGeneration,
     models: config.models,
   };
   fs.writeFileSync(config.configPath, `${JSON.stringify(payload, null, 2)}\n`, "utf-8");

diff --git a/src/device/adb-runtime.ts b/src/device/adb-runtime.ts
@@ -879,6 +879,9 @@ export class AdbRuntime {
       case "finish": {
         return `Finish: ${action.message}`;
       }
+      case "generate_image": {
+        return "generate_image is handled by AgentRuntime image generation service.";
+      }
       default: {
         const exhaust: never = action;
         return `Unknown action: ${JSON.stringify(exhaust)}`;

diff --git a/src/gateway/telegram-gateway.ts b/src/gateway/telegram-gateway.ts
@@ -2467,14 +2467,41 @@ export class TelegramGateway {
               this.stripStepCounterTelemetry(finalMessage),
               1800,
             );
-            await this.bot.sendMessage(
-              chatId,
-              finalForChat,
-              {
-                disable_web_page_preview: true,
-              },
-            );
-            this.chat.appendExternalTurn(chatId, "assistant", finalMessage);
+            // Deliver a generated image as a photo when the reply links to one.
+            // The URL path must end in an image extension; an optional query string
+            // (e.g. a signed CDN link) is kept, while trailing punctuation or closing
+            // brackets are left out. URLs like ".../a.png.html" no longer match.
+            const imageUrlMatch = finalMessage.match(
+              /https:\/\/[^\s?#]+\.(?:png|jpe?g|gif|webp)(?:\?[^\s#]*?)?(?=[.,;:!)\]>"']*(?:\s|$))/i,
+            );
+            if (imageUrlMatch) {
+              const imageUrl = imageUrlMatch[0];
+              const caption = this.sanitizeForChat(
+                this.stripStepCounterTelemetry(finalMessage.replace(imageUrl, "").trim()),
+                1000,
+              );
+              try {
+                await this.bot.sendPhoto(chatId, imageUrl, {
+                  caption: caption || undefined,
+                });
+              } catch (error) {
+                // Fallback to text message if photo send fails
+                const detail = error instanceof Error ? error.message : String(error);
+                this.log(`Failed to send photo, falling back to text: ${detail}`);
+                await this.bot.sendMessage(
+                  chatId,
+                  this.sanitizeForChat(this.stripStepCounterTelemetry(finalMessage), 1800),
+                );
+              }
+            } else {
+              // No image link in the reply: send plain text with link previews disabled.
+              await this.bot.sendMessage(chatId, finalForChat, {
+                disable_web_page_preview: true,
+              });
+            }
+            // Record the turn once, outside the try block, so a history failure cannot
+            // be mistaken for a photo failure and trigger a duplicate text message.
+            this.chat.appendExternalTurn(chatId, "assistant", finalMessage);
           }
 
           return {

diff --git a/src/services/image-generation/base.ts b/src/services/image-generation/base.ts
@@ -0,0 +1,38 @@
+/**
+ * Abstract base class for image generation providers
+ *
+ * Implementations wrap specific third-party APIs (fal, replicate, etc.)
+ * while exposing a consistent business-level interface.
+ */
+
+import type {
+  ImageGenerationRequest,
+  ImageGenerationResult,
+} from "./types.js";
+
+/**
+ * Abstract base class for image generation providers
+ *
+ * Each provider (fal, replicate, etc.) extends this class and must supply every
+ * member below; nothing is implemented here. Instance creation is not in this file.
+ */
+export abstract class ImageGenerationProvider {
+  /**
+   * Unique identifier for this provider
+   */
+  abstract readonly providerId: string;
+
+  /**
+   * Generate an image for the given request
+   *
+   * @param request - Image generation request (type imported from ./types.js)
+   * @returns Promise resolving to generation result
+   * @throws presumably ImageGenerationError on failure; that type is not visible here (confirm)
+   */
+  abstract generate(request: ImageGenerationRequest): Promise<ImageGenerationResult>;
+
+  /**
+   * Report whether this provider is properly configured
+   */
+  abstract isConfigured(): boolean;
+}