From e5636806447cd64b0f56ee7766f2babc98cc3501 Mon Sep 17 00:00:00 2001
From: Hemanth0411
Date: Mon, 12 May 2025 12:58:33 +0530
Subject: [PATCH 1/2] Fixes #65: Added support for third AI model (Gemini)

---
 config.yaml                    |  7 +++++--
 scripts/and_controller.py      |  2 +-
 scripts/document_generation.py |  5 ++++-
 scripts/model.py               | 34 ++++++++++++++++++++++++++++++----
 scripts/self_explorer.py       |  5 ++++-
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/config.yaml b/config.yaml
index d0c1ad6..65366bb 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,8 +1,8 @@
-MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
+MODEL: "Gemini" # The type of multi-modal LLM you would like to use to power the AppAgent, must be OpenAI, Qwen, or Gemini
 
 OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
 OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
-OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
+OPENAI_API_MODEL: "gpt-4o" # The OpenAI model to use; it must accept visual input
 MAX_TOKENS: 300 # The max token limit for the response completion
 TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model
 REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests
@@ -10,6 +10,9 @@ REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests
 DASHSCOPE_API_KEY: "sk-" # The dashscope API key that gives you access to Qwen-VL model
 QWEN_MODEL: "qwen-vl-max"
 
+GEMINI_API_KEY: "AI" # Your Gemini API key from Google AI Studio
+GEMINI_MODEL: "gemini-1.5-flash" # The Gemini model to use
+
 ANDROID_SCREENSHOT_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!
 ANDROID_XML_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!
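The two new fields can be verified in isolation before running the agent. The sketch below is separate from the patch itself: it reads `config.yaml` and sends a minimal text-only request to the public Generative Language REST endpoint. The script name `sanity_check_gemini.py` and the prompt are illustrative only.

```python
# sanity_check_gemini.py -- illustrative helper, not part of this patch.
# Verifies GEMINI_API_KEY and GEMINI_MODEL with a minimal text-only request.
import requests
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
       f"{configs['GEMINI_MODEL']}:generateContent?key={configs['GEMINI_API_KEY']}")
payload = {"contents": [{"parts": [{"text": "Reply with the single word OK."}]}]}

resp = requests.post(url, json=payload)
resp.raise_for_status()  # fails fast on an invalid key or model name
print(resp.json()["candidates"][0]["content"]["parts"][0]["text"])
```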
diff --git a/scripts/and_controller.py b/scripts/and_controller.py
index 2cde0bb..1fee42c 100644
--- a/scripts/and_controller.py
+++ b/scripts/and_controller.py
@@ -175,6 +175,6 @@ def swipe(self, x, y, direction, dist="medium", quick=False):
     def swipe_precise(self, start, end, duration=400):
         start_x, start_y = start
         end_x, end_y = end
-        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
+        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
         ret = execute_adb(adb_command)
         return ret
diff --git a/scripts/document_generation.py b/scripts/document_generation.py
index 24db53b..258c8ac 100644
--- a/scripts/document_generation.py
+++ b/scripts/document_generation.py
@@ -8,7 +8,7 @@
 import prompts
 from config import load_config
-from model import OpenAIModel, QwenModel
+from model import OpenAIModel, QwenModel, GeminiModel
 from utils import print_with_color
 
 arg_desc = "AppAgent - Human Demonstration"
@@ -29,6 +29,9 @@
 elif configs["MODEL"] == "Qwen":
     mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                      model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
 else:
     print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
     sys.exit()
diff --git a/scripts/model.py b/scripts/model.py
index bf632db..98c294e 100644
--- a/scripts/model.py
+++ b/scripts/model.py
@@ -1,6 +1,6 @@
 import re
 from abc import abstractmethod
-from typing import List
+from typing import List, Tuple
 from http import HTTPStatus
 
 import requests
@@ -14,7 +14,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         pass
 
 
@@ -27,7 +27,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
         self.temperature = temperature
         self.max_tokens = max_tokens
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         content = [
             {
                 "type": "text",
@@ -76,7 +76,7 @@ def __init__(self, api_key: str, model: str):
         self.model = model
         dashscope.api_key = api_key
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         content = [{
             "text": prompt
         }]
@@ -97,6 +97,32 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
         else:
             return False, response.message
 
+
+class GeminiModel(BaseModel):
+    def __init__(self, api_key: str, model: str):
+        super().__init__()
+        self.api_key = api_key
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
+        # Build a multimodal request for the Google Generative Language REST API;
+        # each screenshot is sent inline as base64 data alongside the text prompt.
+        parts = [{"text": prompt}]
+        for img in images:
+            # encode_image is the base64 helper that OpenAIModel already uses
+            parts.append({"inline_data": {"mime_type": "image/png", "data": encode_image(img)}})
+        payload = {"contents": [{"parts": parts}]}
+        url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
+               f"{self.model}:generateContent?key={self.api_key}")
+        try:
+            response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            return True, response_data["candidates"][0]["content"]["parts"][0]["text"]
+        except Exception as e:
+            # Report request or parsing failures to the caller instead of crashing
+            return False, str(e)
+
 
 def parse_explore_rsp(rsp):
     try:
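A short way to exercise the new class end to end, mirroring the dispatch this patch adds in `document_generation.py` and `self_explorer.py`. This is a sketch, not part of the patch: the screenshot path is hypothetical, and `load_config` is the existing loader in `scripts/config.py`.

```python
# Usage sketch, run from the repo root like the agent scripts themselves.
import sys
sys.path.append("./scripts")  # make the repo's modules importable from the root

from config import load_config
from model import GeminiModel

configs = load_config()  # reads ./config.yaml, same loader the agent scripts use
mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"], model=configs["GEMINI_MODEL"])

# get_model_response takes the text prompt plus a list of screenshot paths and
# returns (status, text); status is False when the request or parsing failed.
status, rsp = mllm.get_model_response("Describe this screen.", ["./screenshot_1.png"])
print(status, rsp)
```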
diff --git a/scripts/self_explorer.py b/scripts/self_explorer.py
index b0e16bb..d1e96aa 100644
--- a/scripts/self_explorer.py
+++ b/scripts/self_explorer.py
@@ -10,7 +10,7 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel, GeminiModel
 from utils import print_with_color, draw_bbox_multi
 
 arg_desc = "AppAgent - Autonomous Exploration"
@@ -30,6 +30,9 @@
 elif configs["MODEL"] == "Qwen":
     mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                      model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
 else:
     print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
     sys.exit()

From 832cde7e05dc4b63baf1f1c8e9896945ce1b6eee Mon Sep 17 00:00:00 2001
From: Hemanth0411
Date: Mon, 12 May 2025 13:08:22 +0530
Subject: [PATCH 2/2] docs: update README to include Gemini as supported model

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 149f85f..d3d5baa 100644
--- a/README.md
+++ b/README.md
@@ -93,9 +93,9 @@ pip install -r requirements.txt
 ### 🤖 Step 2. Configure the Agent
 
 AppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment
-, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.
+, we used `gpt-4o` as the model to make decisions on how to take actions to complete a task on the smartphone.
 
-To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.
+To configure your requests to GPT-4o, you should modify `config.yaml` in the root directory.
 There are two key parameters that must be configured to try AppAgent:
 1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.
 2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency
@@ -111,6 +111,8 @@
 free to use but its performance in the context of AppAgent is poorer compared with GPT-4V. To use it, you should
 create an Alibaba Cloud account and [create a Dashscope API key](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.i1)
 to fill in the `DASHSCOPE_API_KEY` field in the `config.yaml` file. Change the `MODEL` field from `OpenAI` to `Qwen` as well.
 
+You can also try Google's `Gemini` as an alternative multi-modal model to power AppAgent. To use it, you should create an API key in [Google AI Studio](https://aistudio.google.com/) and fill it in the `GEMINI_API_KEY` field in the `config.yaml` file. Change the `MODEL` field to `Gemini` as well.
+
 If you want to test AppAgent using your own models, you should write a new model class in `scripts/model.py` accordingly.
 
 ### 🔍 Step 3. Exploration Phase
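Concretely, switching AppAgent to Gemini amounts to setting these three values in `config.yaml`; the key shown is the patch's placeholder, not a working credential:

```yaml
MODEL: "Gemini"                   # select the Gemini backend
GEMINI_API_KEY: "AI"              # placeholder; paste your full key from Google AI Studio
GEMINI_MODEL: "gemini-1.5-flash"  # any Gemini model with vision support should work
```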