microsoft · fverac · Oct 13, 2025 · Oct 14, 2025 · Oct 17, 2025 · Oct 22, 2025
diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py
@@ -1,3 +1,4 @@
 from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
 from debug_gym.agents.rewrite_agent import RewriteAgent
 from debug_gym.agents.solution_agent import AgentSolution
+from debug_gym.agents.swe_agent import SWEAgent
diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py
@@ -227,7 +227,7 @@ def run(self, task_name=None, debug=False):
             # initial state does not have prompt and response
             self.history.step(info, None)
 
-            if info.done is True:
+            if info.resolved is True:
                 self.logger.report_progress(
                     problem_id=task_name,
                     step=1,
@@ -265,10 +265,12 @@ def run(self, task_name=None, debug=False):
                 self.history.step(info, llm_response)
 
                 if (
-                    info.done
+                    info.terminated
                     or info.rewrite_counter >= self.config["max_rewrite_steps"]
                 ):
-                    reason = "done" if info.done else "max_rewrite_steps reached"
+                    reason = (
+                        "terminated" if info.terminated else "max_rewrite_steps reached"
+                    )
                     self.logger.info(
                         f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}"
                     )
@@ -279,7 +281,7 @@ def run(self, task_name=None, debug=False):
                         total_steps=step + 1,
                         score=info.score,
                         max_score=info.max_score,
-                        status="resolved" if info.done else "unresolved",
+                        status="resolved" if info.resolved else "unresolved",
                     )
                     break
                 # keep progress bar running until max_steps is reached
@@ -298,9 +300,9 @@ def run(self, task_name=None, debug=False):
                 total_steps=step + 1,
                 score=info.score,
                 max_score=info.max_score,
-                status="resolved" if info.done else "unresolved",
+                status="resolved" if info.resolved else "unresolved",
             )
-            return info.done
+            return info.resolved
         except Exception:
             # report any error that happens during the run
             self.logger.report_progress(
@@ -353,7 +355,7 @@ def save_trajectory(self, task_name="custom"):
             "config": self.config,
             "tools": self.llm.define_tools(self.env.tools) if self.llm else tools,
             "uuid": self._uuid,
-            "success": self.env.done,
+            "success": self.env.resolved,
             "log": [],
             "agent_type": self.__class__.__name__,
             "logger": str(self.logger.log_file),

diff --git a/debug_gym/agents/debug_agent.py b/debug_gym/agents/debug_agent.py
@@ -23,7 +23,7 @@ def run(self, task_name=None, debug=False):
             # initial state does not have prompt and response
             self.history.step(info, None)
 
-            if info.done is True:
+            if info.resolved is True:
                 # msg = "Environment started with entrypoint passing without errors."
                 self.logger.report_progress(
                     problem_id=task_name,
@@ -69,10 +69,10 @@ def run(self, task_name=None, debug=False):
                 self.history.step(info, llm_response)
 
                 if (
-                    info.done
+                    info.terminated
                     or info.rewrite_counter >= self.config["max_rewrite_steps"]
                 ):
-                    reason = "done" if info.done else "max_rewrite_steps reached"
+                    reason = "done" if info.terminated else "max_rewrite_steps reached"
                     self.logger.info(
                         f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}"
                     )
@@ -83,7 +83,7 @@ def run(self, task_name=None, debug=False):
                         total_steps=step + 1,
                         score=info.score,
                         max_score=info.max_score,
-                        status="resolved" if info.done else "unresolved",
+                        status="resolved" if info.resolved else "unresolved",
                     )
                     break
                 # keep progress bar running until max_steps is reached
@@ -102,9 +102,9 @@ def run(self, task_name=None, debug=False):
                 total_steps=step + 1,
                 score=info.score,
                 max_score=info.max_score,
-                status="resolved" if info.done else "unresolved",
+                status="resolved" if info.resolved else "unresolved",
             )
-            return info.done
+            return info.resolved
         except Exception:
             # report any error that happens during the run
             self.logger.report_progress(

diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
@@ -33,7 +33,7 @@ def run(self, task_name=None, debug=False):
             info = self.env.reset(options={"task_name": task_name})
             self.history.step(info)
 
-            if info.done is True:
+            if info.resolved is True:
                 self._report_progress(task_name, info, "resolved")
                 return True
 
@@ -76,13 +76,12 @@ def run(self, task_name=None, debug=False):
             self.logger.info(
                 f"Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%})"
             )
-            assert info.done, (
+            assert info.resolved, (
                 "The task is not done after applying the gold patch.\n"
                 f"{info.step_observation.observation}"
             )
-            status = "resolved" if info.done else "unresolved"
-            self._report_progress(task_name, info, status)
+            self._report_progress(task_name, info, "resolved")
         except Exception:
             self._report_progress(task_name, info, "error")
             raise
-        return info.done
+        return info.resolved
diff --git a/debug_gym/agents/swe_agent.py b/debug_gym/agents/swe_agent.py
@@ -0,0 +1,7 @@
+from debug_gym.agents.base_agent import BaseAgent, register_agent
+
+
+@register_agent
+class SWEAgent(BaseAgent):
+    name = "swe_agent"
+    system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the ability to run bash commands to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the tools available to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the available tools to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response."
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
@@ -8,15 +8,15 @@
 from debug_gym.gym.terminals.terminal import Terminal
 from debug_gym.gym.tools.tool import EnvironmentTool, ToolCall
 from debug_gym.gym.workspace import Workspace
-from debug_gym.logger import DebugGymLogger, log_with_color
+from debug_gym.logger import DebugGymLogger
 
 
 @dataclass
 class EnvInfo:
     # obs from tool triggered by `env.step` or eval if `env.reset`
     step_observation: Observation
     all_observations: list[Observation]  #  env.step + triggered tools obs
-    eval_observation: Observation  # last eval observation
+    eval_observation: Observation | None  # last eval observation
     dir_tree: str
     current_breakpoints: str
     action_reasoning: str | None
@@ -25,7 +25,8 @@ class EnvInfo:
     instructions: dict
     score: int
     max_score: int
-    done: bool
+    terminated: bool
+    resolved: bool
     rewrite_counter: int
     tools: list[EnvironmentTool]
 
@@ -38,7 +39,7 @@ def __str__(self) -> str:
 
         # Status section
         lines.append(
-            f"📊 Status: {'✅ (DONE)' if self.done else '🔄 (IN PROGRESS)'}\t"
+            f"📊 Status: {('✅' if self.resolved else '❌') + ' (TERMINATED)' if self.terminated else '🔄 (IN PROGRESS)'}\t"
             f"🎯 Score: {self.score}/{self.max_score}\t"
             f"✏️ Rewrites: {self.rewrite_counter}"
         )
@@ -251,19 +252,20 @@ def _reset_env_state(self):
         self.rewrite_counter = 0
         self.last_eval: EvalOutput = None
         self.score = 0
-        self.done = False
+        self.terminated = False
+        self.resolved = False
         # clear all observations and event queue (queue should be empty already)
         self.clear_all_observations()
         self.empty_event_queue()
 
     def set_entrypoints(self, entrypoint: str, debug_entrypoint: str | None = None):
-        debug_entrypoint = debug_entrypoint or entrypoint.replace(
-            "python ", "python -m pdb "
-        )
         self.entrypoint = self._prepare_entrypoint(entrypoint)
-        self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint)
-        # self.entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.entrypoint
-        # self.debug_entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.debug_entrypoint
+        self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint or entrypoint)
+
+        if "python -m pdb" not in self.debug_entrypoint:
+            self.debug_entrypoint = self.debug_entrypoint.replace(
+                "python ", "python -m pdb "
+            )
 
     @staticmethod
     def _prepare_entrypoint(entrypoint):
@@ -313,7 +315,7 @@ def setup_terminal(self) -> None:
         """Setup the terminal.
         Override in subclasses for different behavior. Called once at reset."""
 
-        log_with_color(self.logger, f"Configuring {self.terminal}...", "blue")
+        self.logger.debug(f"Configuring {self.terminal}...")
 
         self.terminal.run("git init -b main")
         self.terminal.run("git config user.name 'debug-gym'")
@@ -328,7 +330,7 @@ def setup_terminal(self) -> None:
     def reset(self, *, options: dict = None):
         """Resets the environment and returns eval as the initial observation."""
         options = options or {}
-        self.logger.info("Resetting environment")
+        self.logger.debug("Resetting environment")
         self.setup_task(task_name=options.get("task_name"), options=options)
         self.setup_workspace()
         self.setup_terminal()
@@ -338,30 +340,31 @@ def reset(self, *, options: dict = None):
         self.queue_event(Event.ENV_RESET, source="env")
         self.all_observations = self.process_events()
 
-        # Gets eval (initial observation) from cache or by running env.eval
-        if self.last_eval:  # if eval tool was triggered by Event.ENV_RESET
-            self.step_observation = Observation("env", self.last_eval.output)
-        else:  # if eval tool was not triggered by Event.ENV_RESET
-            self.last_eval = self.eval()
-            self.step_observation = Observation("env", self.last_eval.output)
-            self.all_observations.insert(0, self.step_observation)
+        # First observation always includes the task instructions.
+        self.step_observation = Observation("env", self.instructions)
+        self.all_observations.insert(0, self.step_observation)
 
-        self.max_score = self.calculate_max_score(self.last_eval)
-        self.score = self.calculate_score(self.last_eval)
-        self.done = self.calculate_done(self.last_eval)
+        if self.last_eval:
+            self.max_score = self.calculate_max_score(self.last_eval)
+            self.score = self.calculate_score(self.last_eval)
+            self.resolved = self.calculate_resolved(self.last_eval)
+            self.terminated = self.calculate_terminated(self.last_eval)
 
         self.infos = EnvInfo(
             step_observation=self.step_observation,
             all_observations=self.all_observations,
-            eval_observation=Observation("env", self.last_eval.output),
+            eval_observation=(
+                Observation("env", self.last_eval.output) if self.last_eval else None
+            ),
             dir_tree=self.workspace.display_files(self.dir_tree_depth),
             current_breakpoints=self.current_breakpoints(),
             action_reasoning=None,
             action_content=None,
             action_tool_call=None,
-            done=self.done,
+            terminated=self.terminated,
+            resolved=self.resolved,
             score=self.score,
-            max_score=self.max_score,
+            max_score=self.max_score or 1,
             instructions=self.instructions,
             rewrite_counter=self.rewrite_counter,
             tools=self.tools,
@@ -383,11 +386,16 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
         Override in subclasses for different behavior."""
         return eval_output.success
 
-    def calculate_done(self, eval_output: EvalOutput) -> bool:
-        """Determine if the task is done.
+    def calculate_resolved(self, eval_output: EvalOutput) -> bool:
+        """Determine if the task has been resolved.
         Override in subclasses for different behavior."""
         return self.score == self.max_score
 
+    def calculate_terminated(self, eval_output: EvalOutput) -> bool:
+        """Determine if the task is terminated.
+        Override in subclasses for different behavior."""
+        return self.calculate_resolved(eval_output)
+
     def eval(self, **kwargs) -> EvalOutput:
         """Evaluates the current code using the provided entrypoint.
         Sets the last_eval and returns it.
@@ -464,22 +472,28 @@ def step(
         self.all_observations.insert(0, self.step_observation)
 
         # Calculate score and done based on the last eval output
-        self.score = self.calculate_score(self.last_eval)
-        self.done = self.calculate_done(self.last_eval)
+        if self.last_eval:
+            self.max_score = self.max_score or self.calculate_max_score(self.last_eval)
+            self.score = self.calculate_score(self.last_eval)
+            self.terminated = self.calculate_terminated(self.last_eval)
+            self.resolved = self.calculate_resolved(self.last_eval)
 
         self.infos = EnvInfo(
             step_observation=self.step_observation,
             all_observations=self.all_observations,
-            eval_observation=Observation("env", self.last_eval.output),
+            eval_observation=(
+                Observation("env", self.last_eval.output) if self.last_eval else None
+            ),
             dir_tree=self.workspace.display_files(self.dir_tree_depth),
             current_breakpoints=self.current_breakpoints(),
             action_reasoning=action_reasoning,
             action_content=action_content,
             action_tool_call=action_tool_call,
             instructions=self.instructions,
             score=self.score,
-            max_score=self.max_score,
-            done=self.done,
+            max_score=self.max_score or 1,
+            terminated=self.terminated,
+            resolved=self.resolved,
             rewrite_counter=self.rewrite_counter,
             tools=self.tools,
         )

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
@@ -204,7 +204,7 @@ def setup_workspace(self):
         self.set_entrypoints(self.entrypoint, self.debug_entrypoint)
 
     def setup_terminal(self):
-        self.logger.info(f"Configuring {self.terminal}...")
+        self.logger.debug(f"Configuring {self.terminal}...")
 
         # Install tree for listdir.
         self.terminal.run("apt update && apt install -y tree")
@@ -268,10 +268,10 @@ def setup_terminal(self):
         self.terminal.run("git remote remove origin")
 
     def apply_gold_patch(self):
-        self.logger.info(f"Applying gold patch to {self.working_dir}.")
+        self.logger.debug(f"Applying gold patch to {self.working_dir}.")
         command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF"
         self.terminal.run(command, raises=True)
-        self.logger.info("Patch applied successfully.")
+        self.logger.debug("Patch applied successfully.")
 
     def eval(self, **kwargs) -> EvalOutput:
         """Evaluates the current code using the provided entrypoint.

diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
@@ -143,7 +143,7 @@ def setup_workspace(self):
         self.set_entrypoints(self.entrypoint, self.debug_entrypoint)
 
     def setup_terminal(self):
-        self.logger.info(f"Configuring {self.terminal}...")
+        self.logger.debug(f"Configuring {self.terminal}...")
 
         # Install tree for listdir.
         self.terminal.run("apt update && apt install -y tree")
@@ -169,22 +169,29 @@ def setup_terminal(self):
         # Apply any changes needed to the install commands.
         self.terminal.run("git config user.name 'debug-gym'")
         self.terminal.run("git config user.email '<>'")
-        self.terminal.run(
-            f"git commit -am 'Changes needed for setting up {self.task_name}'"
-        )
-
-        # Apply the test patch directly.
-        self.terminal.run(f"git apply - <<'EOF'\n{self.test_patch}\nEOF")
-        self.terminal.run(f"git commit -am 'Applying test patch for {self.task_name}'")
+        self.terminal.run(f"git commit -am 'Setting up {self.task_name}'")
 
         # Remove the remote so the agent won't see newer commits.
         self.terminal.run("git remote remove origin")
 
     def apply_gold_patch(self):
-        self.logger.info(f"Applying gold patch to {self.working_dir}.")
+        self.logger.debug(f"Applying gold patch to {self.working_dir}.")
         command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF"
         self.terminal.run(command, raises=True)
-        self.logger.info("Patch applied successfully.")
+        self.logger.debug("Patch applied successfully.")
+
+    def eval(self, **kwargs) -> EvalOutput:
+        # We need to apply the test patch before running any evaluation.
+        # Reset any changes made to test_directives files.
+        self.terminal.run(f"git checkout -- {' '.join(self.test_directives)}")
+
+        # Apply official test patch (hidden until now)
+        self.terminal.run(f"git apply - <<'EOF'\n{self.test_patch}\nEOF")
+
+        success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout)
+        self.last_eval = EvalOutput(success, output)
+
+        return self.last_eval
 
     def calculate_max_score(self, eval_output: EvalOutput) -> int:
         return len(self.fail_to_pass)