microsoft
diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py
Lines changed: 5 additions & 5 deletions
diff --git a/debug_gym/agents/debug_agent.py b/debug_gym/agents/debug_agent.py
Lines changed: 1 addition & 2 deletions
diff --git a/debug_gym/agents/history_tracker.py b/debug_gym/agents/history_tracker.py
Lines changed: 1 addition & 6 deletions
diff --git a/debug_gym/agents/swe_agent.py b/debug_gym/agents/swe_agent.py
Lines changed: 7 additions & 0 deletions
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
Lines changed: 26 additions & 24 deletions
@@ -1,3 +1,4 @@
 from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
 from debug_gym.agents.rewrite_agent import RewriteAgent
 from debug_gym.agents.solution_agent import AgentSolution
+from debug_gym.agents.swe_agent import SWEAgent
@@ -248,10 +248,10 @@ def run(self, task_name=None, debug=False):
             highscore = info.score
             for step in range(max_steps):
                 self.logger.info(f"\n{'='*20} STEP {step+1} {'='*20}\n")
-                # highscore = max(highscore, info.score)
-                # self.logger.info(
-                #     f"[{task_name[:10]:<10}] | Step: {step:<4} | Score: {info.score:>4}/{info.max_score:<4} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
-                # )
+                highscore = max(highscore, info.score)
+                self.logger.info(
+                    f"[{task_name[:10]:<10}] | Step: {step:<4} | Score: {info.score:>4}/{info.max_score:<4} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
+                )
 
                 messages = self.build_prompt(info)
                 llm_response = self.llm(messages, info.tools)
@@ -282,7 +282,7 @@ def run(self, task_name=None, debug=False):
                           submit cool called
                     """
                     )
-                    break
+                    # break
 
                 if (
                     info.done
 
@@ -4,8 +4,7 @@
 @register_agent
 class DebugAgent(BaseAgent):
     name = "debug_agent"
-    # system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
-    system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
+    system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
 
 
 @register_agent
 
@@ -49,12 +49,7 @@ def json(self, game_step=None):
                 "reasoning": None,
                 "content": None,
                 "action": None,  # env reset
-                # bash agent will not have an initial eval observation
-                "obs": (
-                    self.memory[0].step_observation.observation
-                    if self.memory[0].step_observation
-                    else None
-                ),
+                "obs": self.memory[0].step_observation.observation,
                 "rewrite_consumed": 0,
                 "prompt_response_pairs": None,
             }
 
@@ -0,0 +1,7 @@
+from debug_gym.agents.base_agent import BaseAgent, register_agent
+
+
+@register_agent
+class SWEAgent(BaseAgent):
+    name = "swe_agent"
+    system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
@@ -218,6 +218,7 @@ def __init__(
         dir_tree_depth: int = 1,
         persistent_breakpoints: bool = True,  # TODO: remove
         auto_list: bool = True,  # TODO: remove
+        debug_mode: bool = True,
         terminal: Terminal | None = None,
         logger: DebugGymLogger | None = None,
         problems: str | list[str] | None = None,
@@ -226,7 +227,7 @@ def __init__(
         super().__init__()
 
         self.path = path
-        self.max_score = max_score
+        self.max_score = max_score or 1
         self.auto_eval_on_rewrite = auto_eval_on_rewrite
         self.run_timeout = run_timeout
         self.dir_tree_depth = dir_tree_depth
@@ -235,6 +236,7 @@ def __init__(
         self._debug_entrypoint = debug_entrypoint
         self.persistent_breakpoints = persistent_breakpoints
         self.auto_list = auto_list
+        self.debug_mode = debug_mode
         self.logger = logger or DebugGymLogger("debug-gym")
         self.infos: EnvInfo | None = None
         self.rng = None
@@ -338,23 +340,25 @@ def reset(self, *, options: dict = None):
         self.queue_event(Event.ENV_RESET, source="env")
         self.all_observations = self.process_events()
 
-        # REMOVE EVAL LOGIC
-        # # Gets eval (initial observation) from cache or by running env.eval
-        # if self.last_eval:  # if eval tool was triggered by Event.ENV_RESET
-        #     self.step_observation = Observation("env", self.last_eval.output)
-        # else:  # if eval tool was not triggered by Event.ENV_RESET
-        #     self.last_eval = self.eval()
-        #     self.step_observation = Observation("env", self.last_eval.output)
-        #     self.all_observations.insert(0, self.step_observation)
+        # Gets eval (initial observation) by running env.eval if debug_mode is on.
+        eval_observation = None
+        self.step_observation = Observation("env", self.instructions)
+        if self.debug_mode:
+            self.last_eval = self.eval()
+            eval_observation = Observation("env", self.last_eval.output)
+            self.step_observation = eval_observation
 
-        self.max_score = self.calculate_max_score(self.last_eval)
-        self.score = self.calculate_score(self.last_eval)
-        self.done = self.calculate_done(self.last_eval)
+        self.all_observations.insert(0, self.step_observation)
+
+        if self.debug_mode:
+            self.max_score = self.calculate_max_score(self.last_eval)
+            self.score = self.calculate_score(self.last_eval)
+            self.done = self.calculate_done(self.last_eval)
 
         self.infos = EnvInfo(
-            step_observation=None,
-            all_observations=None,
-            eval_observation=Observation("env", None),
+            step_observation=self.step_observation,
+            all_observations=self.all_observations,
+            eval_observation=eval_observation,
             dir_tree=self.workspace.display_files(self.dir_tree_depth),
             current_breakpoints=self.current_breakpoints(),
             action_reasoning=None,
@@ -382,11 +386,7 @@ def calculate_max_score(self, eval_output: EvalOutput) -> int:
     def calculate_score(self, eval_output: EvalOutput) -> int:
         """Calculate the score from the eval output.
         Override in subclasses for different behavior."""
-        if eval_output is not None:
-            return eval_output.success
-        else:
-            return 0
-        # return eval_output.success
+        return eval_output.success
 
     def calculate_done(self, eval_output: EvalOutput) -> bool:
         """Determine if the task is done.
@@ -469,14 +469,16 @@ def step(
         self.all_observations.insert(0, self.step_observation)
 
         # Calculate score and done based on the last eval output
-        self.score = self.calculate_score(self.last_eval)
-        self.done = self.calculate_done(self.last_eval)
+        eval_observation = None
+        if self.debug_mode:
+            self.score = self.calculate_score(self.last_eval)
+            self.done = self.calculate_done(self.last_eval)
+            eval_observation = Observation("env", self.last_eval.output)
 
         self.infos = EnvInfo(
             step_observation=self.step_observation,
             all_observations=self.all_observations,
-            # eval_observation=Observation("env", self.last_eval.output),
-            eval_observation=None,
+            eval_observation=eval_observation,
             dir_tree=self.workspace.display_files(self.dir_tree_depth),
             current_breakpoints=self.current_breakpoints(),
             action_reasoning=action_reasoning,