diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9c6d4e9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/.idea
+/apps
+/venv
+__pycache__
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index d0c1ad6..f84d81f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,5 +1,10 @@
-MODEL: "OpenAI"  # The type of multi-modal LLM you would like to use to power the AppAgent, must be either OpenAI or Qwen
+MODEL: "Ollama"  # The type of multi-modal LLM you would like to use to power the AppAgent, must be OpenAI, Qwen or Ollama
 
+# OLLAMA
+OLLAMA_API_BASE: "http://localhost:11434/api/chat"
+OLLAMA_API_MODEL: "llama3.2-vision:11b"
+
+# OPENAI
 OPENAI_API_BASE: "https://api.openai.com/v1/chat/completions"
 OPENAI_API_KEY: "sk-"  # Set the value to sk-xxx if you host the openai interface for open llm model
 OPENAI_API_MODEL: "gpt-4-vision-preview"  # The only OpenAI model by now that accepts visual input
diff --git a/learn.py b/learn.py
index c922200..a73feea 100644
--- a/learn.py
+++ b/learn.py
@@ -7,7 +7,9 @@
 
 arg_desc = "AppAgent - exploration phase"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
-parser.add_argument("--app")
+parser.add_argument("--app", default="Chrome")
+parser.add_argument("--user_input", default="1")
+parser.add_argument("--task_desc", default="open the baidu.com website with a browser")
 parser.add_argument("--root_dir", default="./")
 args = vars(parser.parse_args())
 
@@ -26,7 +28,7 @@
                  "main interface of the app on your phone.", "yellow")
 print_with_color("Choose from the following modes:\n1. autonomous exploration\n2. human demonstration\n"
                  "Type 1 or 2.", "blue")
-user_input = ""
+user_input = args["user_input"]
 while user_input != "1" and user_input != "2":
     user_input = input()
 
@@ -36,7 +38,8 @@
     app = app.replace(" ", "")
 
 if user_input == "1":
-    os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir}")
+    task_desc = args["task_desc"]
+    os.system(f"python scripts/self_explorer.py --app {app} --root_dir {root_dir} --task_desc '{task_desc}'")
 else:
     demo_timestamp = int(time.time())
     demo_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime(f"demo_{app}_%Y-%m-%d_%H-%M-%S")
diff --git a/scripts/cv_example.py b/scripts/cv_example.py
new file mode 100644
index 0000000..edb57af
--- /dev/null
+++ b/scripts/cv_example.py
@@ -0,0 +1,48 @@
+import json
+from config import load_config
+from model_parser import parse as model_parse
+
+configs = load_config('../config.yaml')
+mllm = model_parse(configs)
+
+form = {
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "object",
+    "properties": {
+        "objects": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "description": "The type of the object."
+                    },
+                    "color": {
+                        "type": "string",
+                        "description": "The color of the object."
+                    }
+                },
+                "required": ["type", "color"]
+            }
+        },
+        "count": {
+            "type": "integer",
+            "description": "The number of objects present in this image."
+        }
+    },
+    "required": ["objects", "count"]
+}
+
+prompt = """
+Please count the objects present in this picture. Describe the type and color of each object.
+Please respond with the following json format:
+%s
+""" % json.dumps(form, indent=2)
+
+prompt = """What is inside this image?
+"""
+
+status, rsp = mllm.get_model_response(prompt, ['./image.jpg'], '')
+print(status)
+print(json.dumps(rsp, indent=2))
diff --git a/scripts/document_generation.py b/scripts/document_generation.py
index 24db53b..79659c3 100644
--- a/scripts/document_generation.py
+++ b/scripts/document_generation.py
@@ -8,9 +8,10 @@
 
 import prompts
 from config import load_config
-from model import OpenAIModel, QwenModel
 from utils import print_with_color
 
+from model_parser import parse as model_parse
+
 arg_desc = "AppAgent - Human Demonstration"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
 parser.add_argument("--app", required=True)
@@ -20,18 +21,7 @@
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 root_dir = args["root_dir"]
 work_dir = os.path.join(root_dir, "apps")
diff --git a/scripts/image.jpg b/scripts/image.jpg
new file mode 100644
index 0000000..5c560fe
Binary files /dev/null and b/scripts/image.jpg differ
diff --git a/scripts/model.py b/scripts/model.py
index bf632db..de0255d 100644
--- a/scripts/model.py
+++ b/scripts/model.py
@@ -1,3 +1,4 @@
+import json
 import re
 from abc import abstractmethod
 from typing import List
@@ -14,7 +15,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         pass
 
 
@@ -27,7 +28,7 @@ def __init__(self, base_url: str, api_key: str, model: str, temperature: float, max_tokens: int):
         self.temperature = temperature
         self.max_tokens = max_tokens
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         content = [
             {
                 "type": "text",
@@ -76,7 +77,7 @@ def __init__(self, api_key: str, model: str):
         self.model = model
         dashscope.api_key = api_key
 
-    def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
         content = [{
             "text": prompt
         }]
@@ -100,10 +101,10 @@ def get_model_response(self, prompt: str, images: List[str]) -> (bool, str):
 
 def parse_explore_rsp(rsp):
     try:
-        observation = re.findall(r"Observation: (.*?)$", rsp, re.MULTILINE)[0]
-        think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
-        act = re.findall(r"Action: (.*?)$", rsp, re.MULTILINE)[0]
-        last_act = re.findall(r"Summary: (.*?)$", rsp, re.MULTILINE)[0]
+        observation = rsp['Observation']
+        think = rsp['Thought']
+        act = rsp['Action']
+        last_act = rsp['Summary']
         print_with_color("Observation:", "yellow")
         print_with_color(observation, "magenta")
         print_with_color("Thought:", "yellow")
@@ -137,7 +138,7 @@
             print_with_color(f"ERROR: Undefined act {act_name}!", "red")
             return ["ERROR"]
     except Exception as e:
-        print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
+        print_with_color(f"ERROR: an exception occurs while parsing the model response: {e!r}", "red")
         print_with_color(rsp, "red")
         return ["ERROR"]
@@ -189,8 +190,8 @@ def parse_grid_rsp(rsp):
 
 def parse_reflect_rsp(rsp):
     try:
-        decision = re.findall(r"Decision: (.*?)$", rsp, re.MULTILINE)[0]
-        think = re.findall(r"Thought: (.*?)$", rsp, re.MULTILINE)[0]
+        decision = rsp['Decision']
+        think = rsp['Thought']
         print_with_color("Decision:", "yellow")
         print_with_color(decision, "magenta")
         print_with_color("Thought:", "yellow")
@@ -198,7 +199,7 @@
         print_with_color(think, "magenta")
         if decision == "INEFFECTIVE":
             return [decision, think]
         elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
-            doc = re.findall(r"Documentation: (.*?)$", rsp, re.MULTILINE)[0]
+            doc = rsp['Documentation']
             print_with_color("Documentation:", "yellow")
             print_with_color(doc, "magenta")
             return [decision, think, doc]
@@ -209,3 +210,46 @@
         print_with_color(f"ERROR: an exception occurs while parsing the model response: {e}", "red")
         print_with_color(rsp, "red")
         return ["ERROR"]
+
+
+class OllamaModel(BaseModel):
+    def __init__(self, base_url: str, model: str):
+        super().__init__()
+        self.base_url = base_url
+        self.model = model
+
+    def get_model_response(self, prompt: str, images: List[str], form='json') -> (bool, str):
+        for idx, img in enumerate(images):
+            base64_img = encode_image(img)
+            images[idx] = base64_img
+        headers = {
+            "Content-Type": "application/json"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": prompt,
+                    "images": images
+                }
+            ],
+            "stream": False,
+            "format": form,
+        }
+        if len(form) == 0:
+            del payload['format']
+        # print('get_model_request:\n', prompt)
+        response = requests.post(self.base_url, headers=headers, json=payload).json()
+        print('get_model_response:\n', json.dumps(response, indent=2))
+        if "error" not in response:
+            total_duration = response["total_duration"]
+            print_with_color(f"Request duration is "
+                             f"{'{0:.2f}'.format(total_duration / 10 ** 9)}s",
+                             "yellow")
+        else:
+            return False, response['error']
+        content = response["message"]["content"]
+        if form:
+            content = json.loads(content)
+        return True, content
diff --git a/scripts/model_parser.py b/scripts/model_parser.py
new file mode 100644
index 0000000..cc79449
--- /dev/null
+++ b/scripts/model_parser.py
@@ -0,0 +1,24 @@
+import sys
+from typing import Optional
+from model import BaseModel, OpenAIModel, QwenModel, OllamaModel
+from utils import print_with_color
+
+
+def parse(configs: dict) -> BaseModel:
+    mllm: Optional[BaseModel] = None
+    if configs["MODEL"] == "OpenAI":
+        mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
+                           api_key=configs["OPENAI_API_KEY"],
+                           model=configs["OPENAI_API_MODEL"],
+                           temperature=configs["TEMPERATURE"],
+                           max_tokens=configs["MAX_TOKENS"])
+    elif configs["MODEL"] == "Qwen":
+        mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
+                         model=configs["QWEN_MODEL"])
+    elif configs["MODEL"] == "Ollama":
+        mllm = OllamaModel(base_url=configs["OLLAMA_API_BASE"],
+                           model=configs["OLLAMA_API_MODEL"])
+    else:
+        print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
+        sys.exit()
+    return mllm
\ No newline at end of file
diff --git a/scripts/prompts.py b/scripts/prompts.py
index 5f15ec5..9b67006 100644
--- a/scripts/prompts.py
+++ b/scripts/prompts.py
@@ -139,7 +139,7 @@
 given a screenshot of a smartphone app. The interactive UI elements on the screenshot are labeled with numeric tags
 starting from 1.
 
-You can call the following functions to interact with those labeled elements to control the smartphone:
+You can call the following functions (in python syntax) to interact with those labeled elements to control the smartphone:
 
 1. tap(element: int)
 This function is used to tap an UI element shown on the smartphone screen.
@@ -147,10 +147,10 @@
 A simple use case can be tap(5), which taps the UI element labeled with the number 5.
 
 2. text(text_input: str)
-This function is used to insert text input in an input field/box. text_input is the string you want to insert and must
-be wrapped with double quotation marks. A simple use case can be text("Hello, world!"), which inserts the string
-"Hello, world!" into the input area on the smartphone screen. This function is only callable when you see a keyboard
-showing in the lower half of the screen.
+This function is used to insert text input in an input field/box when a keyboard shows up below the screen. text_input
+is the string you want to insert and must be wrapped with double quotation marks. A simple use case can be
+text("Hello, world!"), which inserts the string "Hello, world!" into the input area on the smartphone screen. This
+function is ONLY callable when you see a KEYBOARD showing in the lower half of the screen.
 
 3. long_press(element: int)
 This function is used to long press an UI element shown on the smartphone screen.
@@ -168,16 +168,24 @@
 The task you need to complete is to . Your past actions to proceed with this task are summarized as follows:
+
 Now, given the following labeled screenshot, you need to think and call the function needed to proceed with the task.
-Your output should include three parts in the given format:
+Your output should include the exact four parts (Observation, Thought, Action and Summary) in the following JSON format:
+{
+  "Observation": "your observation",
+  "Thought": "your thought",
+  "Action": "text('text')",
+  "Summary": "your summary"
+}
 Observation:
 Thought:
-Action:
+Action:
 Summary:
-You can only take one action at a time, so please directly call the function."""
+"""
 
 self_explore_reflect_template = """I will give you screenshots of a mobile app before and after the UI element
 labeled with the number '' on the first screenshot. The numeric tag of each element is located at
@@ -185,20 +193,29 @@
 The action was also an attempt to proceed with a larger task, which is to . Your job is to carefully analyze the
 difference between the two screenshots to determine if the action is in accord with the description above and at
-the same time effectively moved the task forward. Your output should be determined based on the following situations:
+the same time effectively moved the task forward. Your output should be determined based on the following situations:
 1. BACK
 If you think the action navigated you to a page where you cannot proceed with the given task, you should go back to
 the previous interface. At the same time, describe the functionality of the UI element concisely in one or two
 sentences by observing the difference between the two screenshots. Notice that your description of the UI element
 should focus on the general function. Never include the numeric tag of the UI element in your description. You can use pronouns such as
-"the UI element" to refer to the element. Your output should be in the following format:
+"the UI element" to refer to the element. Your output should be in the following JSON format:
+{
+  "Decision": "BACK",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: BACK
 Thought:
 Documentation:
 2. INEFFECTIVE
 If you find the action changed nothing on the screen (screenshots before and after the action are identical), you
 should continue to interact with other elements on the screen. Notice that if you find the location of the cursor
-changed between the two screenshots, then they are not identical. Your output should be in the following format:
+changed between the two screenshots, then they are not identical. Your output should be in the following JSON format:
+{
+  "Decision": "INEFFECTIVE",
+  "Thought": "your thought"
+}
 Decision: INEFFECTIVE
 Thought:
 3. CONTINUE
@@ -207,7 +224,12 @@
 describe the functionality of the UI element concisely in one or two sentences by observing the difference between the
 two screenshots. Notice that your description of the UI element should focus on the general function. Never include the
 numeric tag of the UI element in your description. You can use pronouns such as "the UI element" to refer to the
-element. Your output should be in the following format:
+element. Your output should be in the following JSON format:
+{
+  "Decision": "CONTINUE",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: CONTINUE
 Thought:
@@ -216,7 +238,12 @@
 If you think the action successfully moved the task forward (even though it did not completed the task), you should
 describe the functionality of the UI element concisely in one or two sentences. Notice that your description of the UI
 element should focus on the general function. Never include the numeric tag of the UI element in your description. You
-can use pronouns such as "the UI element" to refer to the element. Your output should be in the following format:
+can use pronouns such as "the UI element" to refer to the element. Your output should be in the following JSON format:
+{
+  "Decision": "SUCCESS",
+  "Thought": "your thought",
+  "Documentation": "your documentation"
+}
 Decision: SUCCESS
 Thought:
 Documentation:
diff --git a/scripts/self_explorer.py b/scripts/self_explorer.py
index b0e16bb..afa5a28 100644
--- a/scripts/self_explorer.py
+++ b/scripts/self_explorer.py
@@ -10,29 +10,20 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_reflect_rsp
 from utils import print_with_color, draw_bbox_multi
+from model_parser import parse as model_parse
 
 arg_desc = "AppAgent - Autonomous Exploration"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
 parser.add_argument("--app")
+parser.add_argument("--task_desc")
 parser.add_argument("--root_dir", default="./")
 args = vars(parser.parse_args())
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 app = args["app"]
 root_dir = args["root_dir"]
@@ -79,8 +70,10 @@
     sys.exit()
 print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
 
-print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
-task_desc = input()
+task_desc = args["task_desc"]
+if not task_desc:
+    print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
+    task_desc = input()
 
 round_count = 0
 doc_count = 0
diff --git a/scripts/task_executor.py b/scripts/task_executor.py
index e092a15..4ac0e4e 100644
--- a/scripts/task_executor.py
+++ b/scripts/task_executor.py
@@ -10,8 +10,9 @@
 import prompts
 from config import load_config
 from and_controller import list_all_devices, AndroidController, traverse_tree
-from model import parse_explore_rsp, parse_grid_rsp, OpenAIModel, QwenModel
+from model import parse_explore_rsp, parse_grid_rsp
 from utils import print_with_color, draw_bbox_multi, draw_grid
+from model_parser import parse as model_parse
 
 arg_desc = "AppAgent Executor"
 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
@@ -21,18 +22,7 @@
 
 configs = load_config()
 
-if configs["MODEL"] == "OpenAI":
-    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
-                       api_key=configs["OPENAI_API_KEY"],
-                       model=configs["OPENAI_API_MODEL"],
-                       temperature=configs["TEMPERATURE"],
-                       max_tokens=configs["MAX_TOKENS"])
-elif configs["MODEL"] == "Qwen":
-    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
-                     model=configs["QWEN_MODEL"])
-else:
-    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
-    sys.exit()
+mllm = model_parse(configs)
 
 app = args["app"]
 root_dir = args["root_dir"]
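
Note (outside the patch): a minimal sketch of how the Ollama code path added here is exercised, mirroring the new scripts/cv_example.py. It assumes config.yaml selects MODEL: "Ollama", that a local Ollama server is reachable at OLLAMA_API_BASE with the llama3.2-vision:11b model pulled, and that it runs from the scripts/ directory; the image path is illustrative.

    from config import load_config
    from model_parser import parse as model_parse

    configs = load_config('../config.yaml')   # same call as scripts/cv_example.py
    mllm = model_parse(configs)               # returns an OllamaModel when MODEL is "Ollama"

    # With form='json' the /api/chat payload carries "format": "json" and the message
    # content is json.loads()-ed, so rsp is a dict; passing form='' returns the raw text.
    status, rsp = mllm.get_model_response("What is inside this image?", ['./image.jpg'], form='json')
    if status:
        print(rsp)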