diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9c6d4e9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/.idea
+/apps
+/venv
+__pycache__
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index d0c1ad6..f84d81f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,5 +1,10 @@
-MODEL: "OpenAI"  # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
+MODEL: "Ollama"  # The type of multi-modal LLM you would like to use to power the AppAgent, must be OpenAI, Qwen or Ollama
 
+# OLLAMA
+OLLAMA_API_BASE: "http://localhost:11434/api/chat"
+OLLAMA_API_MODEL: "llama3.2-vision:11b"
+
+# OPENAI
 OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
 OPENAI_API_KEY: "sk-"  # Set the value to sk-xxx if you host the openai interface for open llm model
 OPENAI_API_MODEL: "gpt-4-vision-preview"  # The only OpenAI model by now that accepts visual input
diff --git a/learn.py b/learn.py
index c922200..a73feea 100644
--- a/learn.py
+++ b/learn.py
@@ -7,7 +7,9 @@
 
 arg_desc = "AppAgent - exploration phase"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
-parser.add_argument("--app")
+parser.add_argument("--app", default="Chrome")
+parser.add_argument("--user_input", default="1")
+parser.add_argument("--task_desc", default="open the baidu.com website with a browser")
 parser.add_argument("--root_dir", default="./")
 args = vars(parser.parse_args())
 
@@ -26,7 +28,7 @@
                  "main interface of the app on your phone.", "yellow")
 print_with_color("Choose from the following modes:\n1. autonomous exploration\n2. human demonstration\n"
                  "Type 1 or 2.", "blue")
-user_input = ""
+user_input = args["user_input"]
 while user_input != "1" and user_input != "2":
     user_input = input()
 
@@ -36,7 +38,8 @@
     app = app.replace(" ", "")
 
 if user_input == "1":
-    os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir}")
+    task_desc = args["task_desc"]
+    os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir} --task_desc '{task_desc}'")
 else:
     demo_timestamp = int(time.time())
     demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
diff --git a/scripts/cv_example.py b/scripts/cv_example.py
new file mode 100644
index 0000000..edb57af
--- /dev/null
+++ b/scripts/cv_example.py
@@ -0,0 +1,48 @@
+import json
+from config import load_config
+from model_parser import parse as model_parse
+
+configs = load_config('../config.yaml')
+mllm = model_parse(configs)
+
+form = {
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "object",
+    "properties": {
+        "objects": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "description": "The type of the object."
+                    },
+                    "color": {
+                        "type": "string",
+                        "description": "The color of the object."
+                    }
+                },
+                "required": ["type", "color"]
+            }
+        },
+        "count": {
+            "type": "integer",
+            "description": "The number of objects present in this image."
+        }
+    },
+    "required": ["objects", "count"]
+}
+
+prompt = """
+Please count the objects present in this picture. Describe the type and color of each object.
+Please respond with the following json format:
+%s
+""" % json.dumps(form, indent=2)
+
+prompt = """What is inside this image?
+"""
+
+status, rsp = mllm.get_model_response(prompt, ['./image.jpg'], '')
+print(status)
+print(json.dumps(rsp, indent=2))
diff --git a/scripts/document_generation.py b/scripts/document_generation.py
index 24db53b..79659c3 100644
--- a/scripts/document_generation.py
+++ b/scripts/document_generation.py
@@ -8,9 +8,10 @@
 
 import prompts
 from config import load_config
-from model import OpenAIModel, QwenModel
 from utils import print_with_color
 
+from model_parser import parse as model_parse
+
 arg_desc = "AppAgent - Human Demonstration"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
 parser.add_argument("--app", required=True)
@@ -20,18 +21,7 @@
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 root_dir = args["root_dir"]
 work_dir = os.path.join(root_dir, "apps")
diff --git a/scripts/image.jpg b/scripts/image.jpg
new file mode 100644
index 0000000..5c560fe
Binary files /dev/null and b/scripts/image.jpg differ
diff --git a/scripts/model.py b/scripts/model.py
index bf632db..de0255d 100644
--- a/scripts/model.py
+++ b/scripts/model.py
@@ -1,3 +1,4 @@
+import json
 import re
 from abc import abstractmethod
 from typing import List
@@ -14,7 +15,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         pass
 
 
@@ -27,7 +28,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float, max_tokens: int):
         self.temperature = temperature
         self.max_tokens = max_tokens
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         content = [
             {
                 "type": "text",
@@ -76,7 +77,7 @@ def __init__(self, api_key: str, model: str):
         self.model = model
         dashscope.api_key = api_key
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         content = [{
             "text": prompt
         }]
@@ -100,10 +101,10 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
 
 def parse_explore_rsp(rsp):
     try:
-        observation = re.findall(r"Observation: (.*?)$", rsp, re.MULTILINE)[0]
-        think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
-        act = re.findall(r"Action: (.*?)$", rsp, re.MULTILINE)[0]
-        last_act = re.findall(r"Summary: (.*?)$", rsp, re.MULTILINE)[0]
+        observation = rsp['Observation']
+        think = rsp['Thought']
+        act = rsp['Action']
+        last_act = rsp['Summary']
         print_with_color("Observation:", "yellow")
         print_with_color(observation, "magenta")
         print_with_color("Thought:", "yellow")
@@ -137,7 +138,7 @@
             print_with_color(f"ERROR: Undefined act {act_name}!", "red")
             return ["ERROR"]
     except Exception as e:
-        print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
+        print_with_color(f"ERROR: an exception occurs while parsing the model response: {e!r}", "red")
         print_with_color(rsp, "red")
         return ["ERROR"]
@@ -189,8 +190,8 @@ def parse_grid_rsp(rsp):
 
 def parse_reflect_rsp(rsp):
     try:
-        decision = re.findall(r"Decision: (.*?)$", rsp, re.MULTILINE)[0]
-        think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
+        decision = rsp['Decision']
+        think = rsp['Thought']
         print_with_color("Decision:", "yellow")
         print_with_color(decision, "magenta")
         print_with_color("Thought:", "yellow")
@@ -198,7 +199,7 @@
         print_with_color(think, "magenta")
         if decision == "INEFFECTIVE":
             return [decision, think]
         elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
-            doc = re.findall(r"Documentation: (.*?)$", rsp, re.MULTILINE)[0]
+            doc = rsp['Documentation']
             print_with_color("Documentation:", "yellow")
             print_with_color(doc, "magenta")
             return [decision, think, doc]
@@ -209,3 +210,46 @@
         print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
         print_with_color(rsp, "red")
         return ["ERROR"]
+
+
+class OllamaModel(BaseModel):
+    def __init__(self, base_url: str, model: str):
+        super().__init__()
+        self.base_url = base_url
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
+        for idx, img in enumerate(images):
+            base64_img = encode_image(img)
+            images[idx] = base64_img
+        headers = {
+            "Content-Type": "application/json"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": prompt,
+                    "images": images
+                }
+            ],
+            "stream": False,
+            "format": form,
+        }
+        if len(form) == 0:
+            del payload['format']
+        # print('get_model_request:\n', prompt)
+        response = requests.post(self.base_url, headers=headers, json=payload).json()
+        print('get_model_response:\n', json.dumps(response, indent=2))
+        if "error" not in response:
+            total_duration = response["total_duration"]
+            print_with_color(f"Request duration is "
+                             f"{'{0:.2f}'.format(total_duration / 10 ** 9)}s",
+                             "yellow")
+        else:
+            return False, response['error']
+        content = response["message"]["content"]
+        if form:
+            content = json.loads(content)
+        return True, content
diff --git a/scripts/model_parser.py b/scripts/model_parser.py
new file mode 100644
index 0000000..cc79449
--- /dev/null
+++ b/scripts/model_parser.py
@@ -0,0 +1,24 @@
+import sys
+from typing import Optional
+from model import BaseModel, OpenAIModel, QwenModel, OllamaModel
+from utils import print_with_color
+
+
+def parse(configs: dict) -> BaseModel:
+    mllm: Optional[BaseModel] = None
+    if configs["MODEL"] == "OpenAI":
+        mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
+                           api_key=configs["OPENAI_API_KEY"],
+                           model=configs["OPENAI_API_MODEL"],
+                           temperature=configs["TEMPERATURE"],
+                           max_tokens=configs["MAX_TOKENS"])
+    elif configs["MODEL"] == "Qwen":
+        mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
+                         model=configs["QWEN_MODEL"])
+    elif configs["MODEL"] == "Ollama":
+        mllm = OllamaModel(base_url=configs["OLLAMA_API_BASE"],
+                           model=configs["OLLAMA_API_MODEL"])
+    else:
+        print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
+        sys.exit()
+    return mllm
\ No newline at end of file
diff --git a/scripts/prompts.py b/scripts/prompts.py
index 5f15ec5..9b67006 100644
--- a/scripts/prompts.py
+++ b/scripts/prompts.py
@@ -139,7 +139,7 @@
 given a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags
 starting from 1.
 
-You can call the following functions to interact with those labeled elements to control the smartphone:
+You can call the following functions (in python syntax) to interact with those labeled elements to control the smartphone:
 
 1. tap(element: int)
 This function is used to tap an UI element shown on the smartphone screen.
@@ -147,10 +147,10 @@
 A simple use case can be tap(5), which taps the UI element labeled with the number 5.
 
 2. text(text_input: str)
-This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
-be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
-"Hello, world!" into the input area on the smartphone screen. This function is only callable when you see a keyboard
-showing in the lower half of the screen.
+This function is used to insert text input in an input field/box when a keyboard shows up below the screen. text_input
+is the string you want to insert and must be wrapped with double quotation marks. A simple use case can be
+text("Hello, world!"), which inserts the string "Hello, world!" into the input area on the smartphone screen. This
+function is ONLY callable when you see a KEYBOARD showing in the lower half of the screen.
 
 3. long_press(element: int)
 This function is used to long press an UI element shown on the smartphone screen.
@@ -168,16 +168,24 @@
 The task you need to complete is to . Your past actions to proceed with this task are summarized as follows:
+
 Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
-Your output should include three parts in the given format:
+Your output should include the exact four parts (Observation, Thought, Action and Summary) in the following JSON format:
+{
+  "Observation": "your observation",
+  "Thought": "your thought",
+  "Action": "text('text')",
+  "Summary": "your summary"
+}
 Observation:
 Thought:
-Action:
+Action:
 Summary:
-You can only take one action at a time, so please directly call the function."""
+"""
 
 self_explore_reflect_template = """I will give you screenshots of a mobile app before and after the UI element
 labeled with the number '' on the first screenshot. The numeric tag of each element is located at
@@ -185,20 +193,29 @@
 The action was also an attempt to proceed with a larger task, which is to . Your job is to carefully analyze the
 difference between the two screenshots to determine if the action is in accord with the description above and at
-the same time effectively moved the task forward. Your output should be determined based on the following situations:
+the same time effectively moved the task forward. Your output should be determined based on the following situations:
 1. BACK
 If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to
 the previous interface. At the same time, describe the functionality of the UI element concisely in one or two
 sentences by observing the difference between the two screenshots. Notice that your description of the UI element
 should focus on the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
-"the UI element" to refer to the element. Your output should be in the following format:
+"the UI element" to refer to the element. Your output should be in the following JSON format:
+{
+  "Decision": "BACK",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: BACK
 Thought:
 Documentation:
 2. INEFFECTIVE
 If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
 should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
-changed between the two screenshots, then they are not identical. Your output should be in the following format:
+changed between the two screenshots, then they are not identical. Your output should be in the following JSON format:
+{
+  "Decision": "INEFFECTIVE",
+  "Thought": "your thought"
+}
 Decision: INEFFECTIVE
 Thought:
 3. CONTINUE
@@ -207,7 +224,12 @@
 describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
 two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
 numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
-element. Your output should be in the following format:
+element. Your output should be in the following JSON format:
+{
+  "Decision": "CONTINUE",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: CONTINUE
 Thought:
@@ -216,7 +238,12 @@
 If you think the action successfully moved the task forward (even though it did not completed the task), you should
 describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
 element should focus on the general function. Never include the numeric tag of the UI element in your description. You
-can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
+can use pronouns such as "the UI element" to refer to the element. Your output should be in the following JSON format:
+{
+  "Decision": "SUCCESS",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: SUCCESS
 Thought:
 Documentation:
diff --git a/scripts/self_explorer.py b/scripts/self_explorer.py
index b0e16bb..afa5a28 100644
--- a/scripts/self_explorer.py
+++ b/scripts/self_explorer.py
@@ -10,29 +10,20 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp
 from utils import print_with_color, draw_bbox_multi
+from model_parser import parse as model_parse
 
 arg_desc = "AppAgent - Autonomous Exploration"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
 parser.add_argument("--app")
+parser.add_argument("--task_desc")
 parser.add_argument("--root_dir", default="./")
 args = vars(parser.parse_args())
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 app = args["app"]
 root_dir = args["root_dir"]
@@ -79,8 +70,10 @@
     sys.exit()
 print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
 
-print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
-task_desc = input()
+task_desc = args["task_desc"]
+if not task_desc:
+    print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
+    task_desc = input()
 
 round_count = 0
 doc_count = 0
diff --git a/scripts/task_executor.py b/scripts/task_executor.py
index e092a15..4ac0e4e 100644
--- a/scripts/task_executor.py
+++ b/scripts/task_executor.py
@@ -10,8 +10,9 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_grid_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_grid_rsp
 from utils import print_with_color, draw_bbox_multi, draw_grid
+from model_parser import parse as model_parse
 
 arg_desc = "AppAgent Executor"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
@@ -21,18 +22,7 @@
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 app = args["app"]
 root_dir = args["root_dir"]
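
Note (outside the patch): a minimal sketch of how the Ollama code path added here is exercised, mirroring the new scripts/cv_example.py. It assumes config.yaml selects MODEL: "Ollama", that a local Ollama server is reachable at OLLAMA_API_BASE with the llama3.2-vision:11b model pulled, and that it runs from the scripts/ directory; the image path is illustrative.

    from config import load_config
    from model_parser import parse as model_parse

    configs = load_config('../config.yaml')   # same call as scripts/cv_example.py
    mllm = model_parse(configs)               # returns an OllamaModel when MODEL is "Ollama"

    # With form='json' the /api/chat payload carries "format": "json" and the message
    # content is json.loads()-ed, so rsp is a dict; passing form='' returns the raw text.
    status, rsp = mllm.get_model_response("What is inside this image?", ['./image.jpg'], form='json')
    if status:
        print(rsp)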