From e5636806447cd64b0f56ee7766f2babc98cc3501 Mon Sep 17 00:00:00 2001
From: Hemanth0411
Date: Mon, 12 May 2025 12:58:33 +0530
Subject: [PATCH 1/2] Fixes #65: Added support for third AI model (Gemini)

---
 config.yaml                    |  7 +++++--
 scripts/and_controller.py      |  2 +-
 scripts/document_generation.py |  5 ++++-
 scripts/model.py               | 34 ++++++++++++++++++++++++++++++----
 scripts/self_explorer.py       |  5 ++++-
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/config.yaml b/config.yaml
index d0c1ad6..65366bb 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,8 +1,8 @@
-MODEL: "OpenAI" # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
+MODEL: "Gemini" # The type of multi-modal LLM you would like to use to power the AppAgent, must be OpenAI, Qwen, or Gemini
 
 OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
 OPENAI_API_KEY: "sk-" # Set the value to sk-xxx if you host the openai interface for open llm model
-OPENAI_API_MODEL: "gpt-4-vision-preview" # The only OpenAI model by now that accepts visual input
+OPENAI_API_MODEL: "gpt-4o" # The OpenAI model to use; it must accept visual input
 MAX_TOKENS: 300 # The max token limit for the response completion
 TEMPERATURE: 0.0 # The temperature of the model: the lower the value, the more consistent the output of the model
 REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests
@@ -10,6 +10,9 @@ REQUEST_INTERVAL: 10 # Time in seconds between consecutive GPT-4V requests
 DASHSCOPE_API_KEY: "sk-" # The dashscope API key that gives you access to Qwen-VL model
 QWEN_MODEL: "qwen-vl-max"
 
+GEMINI_API_KEY: "AI" # Your Gemini API key from Google AI Studio
+GEMINI_MODEL: "gemini-1.5-flash" # The Gemini model to use
+
 ANDROID_SCREENSHOT_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate screenshots. Make sure the directory EXISTS on your phone!
 ANDROID_XML_DIR: "/sdcard" # Set the directory on your Android device to store the intermediate XML files used for determining locations of UI elements on your screen. Make sure the directory EXISTS on your phone!
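The two new fields can be verified in isolation before running the agent. The sketch below is separate from the patch itself: it reads `config.yaml` and sends a minimal text-only request to the public Generative Language REST endpoint. The script name `sanity_check_gemini.py` and the prompt are illustrative only.

```python
# sanity_check_gemini.py -- illustrative helper, not part of this patch.
# Verifies GEMINI_API_KEY and GEMINI_MODEL with a minimal text-only request.
import requests
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
       f"{configs['GEMINI_MODEL']}:generateContent?key={configs['GEMINI_API_KEY']}")
payload = {"contents": [{"parts": [{"text": "Reply with the single word OK."}]}]}

resp = requests.post(url, json=payload)
resp.raise_for_status()  # fails fast on an invalid key or model name
print(resp.json()["candidates"][0]["content"]["parts"][0]["text"])
```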
diff --git a/scripts/and_controller.py b/scripts/and_controller.py
index 2cde0bb..1fee42c 100644
--- a/scripts/and_controller.py
+++ b/scripts/and_controller.py
@@ -175,6 +175,6 @@ def swipe(self, x, y, direction, dist="medium", quick=False):
     def swipe_precise(self, start, end, duration=400):
         start_x, start_y = start
         end_x, end_y = end
-        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_x} {end_x} {end_y} {duration}"
+        adb_command = f"adb -s {self.device} shell input swipe {start_x} {start_y} {end_x} {end_y} {duration}"
         ret = execute_adb(adb_command)
         return ret
diff --git a/scripts/document_generation.py b/scripts/document_generation.py
index 24db53b..258c8ac 100644
--- a/scripts/document_generation.py
+++ b/scripts/document_generation.py
@@ -8,7 +8,7 @@
 import prompts
 from config import load_config
-from model import OpenAIModel, QwenModel
+from model import OpenAIModel, QwenModel, GeminiModel
 from utils import print_with_color
 
 arg_desc = "AppAgent - Human Demonstration"
@@ -29,6 +29,9 @@
 elif configs["MODEL"] == "Qwen":
     mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                      model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
 else:
     print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
     sys.exit()
diff --git a/scripts/model.py b/scripts/model.py
index bf632db..98c294e 100644
--- a/scripts/model.py
+++ b/scripts/model.py
@@ -1,6 +1,6 @@
 import re
 from abc import abstractmethod
-from typing import List
+from typing import List, Tuple
 from http import HTTPStatus
 
 import requests
@@ -14,7 +14,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         pass
 
 
@@ -27,7 +27,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float,
         self.temperature = temperature
         self.max_tokens = max_tokens
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         content = [
             {
                 "type": "text",
@@ -76,7 +76,7 @@ def __init__(self, api_key: str, model: str):
         self.model = model
         dashscope.api_key = api_key
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
         content = [{
             "text": prompt
         }]
@@ -97,6 +97,32 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
         else:
             return False, response.message
 
+
+class GeminiModel(BaseModel):
+    def __init__(self, api_key: str, model: str):
+        super().__init__()
+        self.api_key = api_key
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, str]:
+        # Build a multimodal request for the Google Generative Language REST API;
+        # each screenshot is sent inline as base64 data alongside the text prompt.
+        parts = [{"text": prompt}]
+        for img in images:
+            # encode_image is the base64 helper that OpenAIModel already uses
+            parts.append({"inline_data": {"mime_type": "image/png", "data": encode_image(img)}})
+        payload = {"contents": [{"parts": parts}]}
+        url = (f"https://generativelanguage.googleapis.com/v1beta/models/"
+               f"{self.model}:generateContent?key={self.api_key}")
+        try:
+            response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            return True, response_data["candidates"][0]["content"]["parts"][0]["text"]
+        except Exception as e:
+            # Report request or parsing failures to the caller instead of crashing
+            return False, str(e)
+
 
 def parse_explore_rsp(rsp):
     try:
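A short way to exercise the new class end to end, mirroring the dispatch this patch adds in `document_generation.py` and `self_explorer.py`. This is a sketch, not part of the patch: the screenshot path is hypothetical, and `load_config` is the existing loader in `scripts/config.py`.

```python
# Usage sketch, run from the repo root like the agent scripts themselves.
import sys
sys.path.append("./scripts")  # make the repo's modules importable from the root

from config import load_config
from model import GeminiModel

configs = load_config()  # reads ./config.yaml, same loader the agent scripts use
mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"], model=configs["GEMINI_MODEL"])

# get_model_response takes the text prompt plus a list of screenshot paths and
# returns (status, text); status is False when the request or parsing failed.
status, rsp = mllm.get_model_response("Describe this screen.", ["./screenshot_1.png"])
print(status, rsp)
```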
diff --git a/scripts/self_explorer.py b/scripts/self_explorer.py
index b0e16bb..d1e96aa 100644
--- a/scripts/self_explorer.py
+++ b/scripts/self_explorer.py
@@ -10,7 +10,7 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel, GeminiModel
 from utils import print_with_color, draw_bbox_multi
 
 arg_desc = "AppAgent - Autonomous Exploration"
@@ -30,6 +30,9 @@
 elif configs["MODEL"] == "Qwen":
     mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                      model=configs["QWEN_MODEL"])
+elif configs["MODEL"] == "Gemini":
+    mllm = GeminiModel(api_key=configs["GEMINI_API_KEY"],
+                       model=configs["GEMINI_MODEL"])
 else:
     print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
     sys.exit()

From 832cde7e05dc4b63baf1f1c8e9896945ce1b6eee Mon Sep 17 00:00:00 2001
From: Hemanth0411
Date: Mon, 12 May 2025 13:08:22 +0530
Subject: [PATCH 2/2] docs: update README to include Gemini as supported model

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 149f85f..d3d5baa 100644
--- a/README.md
+++ b/README.md
@@ -93,9 +93,9 @@ pip install -r requirements.txt
 ### 🤖 Step 2. Configure the Agent
 
 AppAgent needs to be powered by a multi-modal model which can receive both text and visual inputs. During our experiment
-, we used `gpt-4-vision-preview` as the model to make decisions on how to take actions to complete a task on the smartphone.
+, we used `gpt-4o` as the model to make decisions on how to take actions to complete a task on the smartphone.
 
-To configure your requests to GPT-4V, you should modify `config.yaml` in the root directory.
+To configure your requests to GPT-4o, you should modify `config.yaml` in the root directory.
 There are two key parameters that must be configured to try AppAgent:
 1. OpenAI API key: you must purchase an eligible API key from OpenAI so that you can have access to GPT-4V.
 2. Request interval: this is the time interval in seconds between consecutive GPT-4V requests to control the frequency
@@ -111,6 +111,8 @@
 free to use but its performance in the context of AppAgent is poorer compared with GPT-4V. To use it, you should
 create an Alibaba Cloud account and [create a Dashscope API key](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.i1)
 to fill in the `DASHSCOPE_API_KEY` field in the `config.yaml` file. Change the `MODEL` field from `OpenAI` to `Qwen` as well.
 
+You can also try Google's `Gemini` as an alternative multi-modal model to power AppAgent. To use it, you should create an API key in [Google AI Studio](https://aistudio.google.com/) and fill it in the `GEMINI_API_KEY` field in the `config.yaml` file. Change the `MODEL` field to `Gemini` as well.
+
 If you want to test AppAgent using your own models, you should write a new model class in `scripts/model.py` accordingly.
 
 ### 🔍 Step 3. Exploration Phase
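Concretely, switching AppAgent to Gemini amounts to setting these three values in `config.yaml`; the key shown is the patch's placeholder, not a working credential:

```yaml
MODEL: "Gemini"                   # select the Gemini backend
GEMINI_API_KEY: "AI"              # placeholder; paste your full key from Google AI Studio
GEMINI_MODEL: "gemini-1.5-flash"  # any Gemini model with vision support should work
```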