Skip to content
Open
1 change: 1 addition & 0 deletions debug_gym/agents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
from debug_gym.agents.rewrite_agent import RewriteAgent
from debug_gym.agents.solution_agent import AgentSolution
from debug_gym.agents.swe_agent import SWEAgent
16 changes: 9 additions & 7 deletions debug_gym/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def run(self, task_name=None, debug=False):
# initial state does not have prompt and response
self.history.step(info, None)

if info.done is True:
if info.resolved is True:
self.logger.report_progress(
problem_id=task_name,
step=1,
Expand Down Expand Up @@ -265,10 +265,12 @@ def run(self, task_name=None, debug=False):
self.history.step(info, llm_response)

if (
info.done
info.terminated
or info.rewrite_counter >= self.config["max_rewrite_steps"]
):
reason = "done" if info.done else "max_rewrite_steps reached"
reason = (
"terminated" if info.resolved else "max_rewrite_steps reached"
)
self.logger.info(
f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}"
)
Expand All @@ -279,7 +281,7 @@ def run(self, task_name=None, debug=False):
total_steps=step + 1,
score=info.score,
max_score=info.max_score,
status="resolved" if info.done else "unresolved",
status="resolved" if info.resolved else "unresolved",
)
break
# keep progress bar running until max_steps is reached
Expand All @@ -298,9 +300,9 @@ def run(self, task_name=None, debug=False):
total_steps=step + 1,
score=info.score,
max_score=info.max_score,
status="resolved" if info.done else "unresolved",
status="resolved" if info.resolved else "unresolved",
)
return info.done
return info.resolved
except Exception:
# report any error that happens during the run
self.logger.report_progress(
Expand Down Expand Up @@ -353,7 +355,7 @@ def save_trajectory(self, task_name="custom"):
"config": self.config,
"tools": self.llm.define_tools(self.env.tools) if self.llm else tools,
"uuid": self._uuid,
"success": self.env.done,
"success": self.env.resolved,
"log": [],
"agent_type": self.__class__.__name__,
"logger": str(self.logger.log_file),
Expand Down
12 changes: 6 additions & 6 deletions debug_gym/agents/debug_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def run(self, task_name=None, debug=False):
# initial state does not have prompt and response
self.history.step(info, None)

if info.done is True:
if info.resolved is True:
# msg = "Environment started with entrypoint passing without errors."
self.logger.report_progress(
problem_id=task_name,
Expand Down Expand Up @@ -69,10 +69,10 @@ def run(self, task_name=None, debug=False):
self.history.step(info, llm_response)

if (
info.done
info.terminated
or info.rewrite_counter >= self.config["max_rewrite_steps"]
):
reason = "done" if info.done else "max_rewrite_steps reached"
reason = "done" if info.resolved else "max_rewrite_steps reached"
self.logger.info(
f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}"
)
Expand All @@ -83,7 +83,7 @@ def run(self, task_name=None, debug=False):
total_steps=step + 1,
score=info.score,
max_score=info.max_score,
status="resolved" if info.done else "unresolved",
status="resolved" if info.resolved else "unresolved",
)
break
# keep progress bar running until max_steps is reached
Expand All @@ -102,9 +102,9 @@ def run(self, task_name=None, debug=False):
total_steps=step + 1,
score=info.score,
max_score=info.max_score,
status="resolved" if info.done else "unresolved",
status="resolved" if info.resolved else "unresolved",
)
return info.done
return info.resolved
except Exception:
# report any error that happens during the run
self.logger.report_progress(
Expand Down
9 changes: 4 additions & 5 deletions debug_gym/agents/solution_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def run(self, task_name=None, debug=False):
info = self.env.reset(options={"task_name": task_name})
self.history.step(info)

if info.done is True:
if info.resolved is True:
self._report_progress(task_name, info, "resolved")
return True

Expand Down Expand Up @@ -76,13 +76,12 @@ def run(self, task_name=None, debug=False):
self.logger.info(
f"Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%})"
)
assert info.done, (
assert info.resolved, (
"The task is not done after applying the gold patch.\n"
f"{info.step_observation.observation}"
)
status = "resolved" if info.done else "unresolved"
self._report_progress(task_name, info, status)
self._report_progress(task_name, info, "resolved")
except Exception:
self._report_progress(task_name, info, "error")
raise
return info.done
return info.resolved
7 changes: 7 additions & 0 deletions debug_gym/agents/swe_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from debug_gym.agents.base_agent import BaseAgent, register_agent


# SWE-style agent: subclasses BaseAgent and overrides only the agent name and
# the system prompt; everything else is inherited from BaseAgent.
# NOTE(review): `register_agent` presumably registers the class for lookup by
# `name` -- confirm in base_agent.py.
@register_agent
class SWEAgent(BaseAgent):
name = "swe_agent"
# Prompt text: investigate with tools (including bash commands), reproduce the
# issue with a script, patch via the rewrite tool, re-run the script to verify,
# and finish by calling the submit tool. One tool call per step.
system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the ability to run bash commands to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the tools available to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the available tools to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response."
80 changes: 47 additions & 33 deletions debug_gym/gym/envs/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
from debug_gym.gym.terminals.terminal import Terminal
from debug_gym.gym.tools.tool import EnvironmentTool, ToolCall
from debug_gym.gym.workspace import Workspace
from debug_gym.logger import DebugGymLogger, log_with_color
from debug_gym.logger import DebugGymLogger


@dataclass
class EnvInfo:
# obs from tool triggered by `env.step` or eval if `env.reset`
step_observation: Observation
all_observations: list[Observation] # env.step + triggered tools obs
eval_observation: Observation # last eval observation
eval_observation: Observation | None # last eval observation
dir_tree: str
current_breakpoints: str
action_reasoning: str | None
Expand All @@ -25,7 +25,8 @@ class EnvInfo:
instructions: dict
score: int
max_score: int
done: bool
terminated: bool
resolved: bool
rewrite_counter: int
tools: list[EnvironmentTool]

Expand All @@ -38,7 +39,7 @@ def __str__(self) -> str:

# Status section
lines.append(
f"📊 Status: {'✅ (DONE)' if self.done else '🔄 (IN PROGRESS)'}\t"
f"📊 Status: {('✅' if self.resolved else '❌') + ' (TERMINATED)' if self.terminated else '🔄 (IN PROGRESS)'}\t"
f"🎯 Score: {self.score}/{self.max_score}\t"
f"✏️ Rewrites: {self.rewrite_counter}"
)
Expand Down Expand Up @@ -251,19 +252,20 @@ def _reset_env_state(self):
self.rewrite_counter = 0
self.last_eval: EvalOutput = None
self.score = 0
self.done = False
self.terminated = False
self.resolved = False
# clear all observations and event queue (queue should be empty already)
self.clear_all_observations()
self.empty_event_queue()

def set_entrypoints(self, entrypoint: str, debug_entrypoint: str | None = None):
debug_entrypoint = debug_entrypoint or entrypoint.replace(
"python ", "python -m pdb "
)
self.entrypoint = self._prepare_entrypoint(entrypoint)
self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint)
# self.entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.entrypoint
# self.debug_entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.debug_entrypoint
self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint or entrypoint)

if "python -m pdb" not in self.debug_entrypoint:
self.debug_entrypoint = self.debug_entrypoint.replace(
"python ", "python -m pdb "
)

@staticmethod
def _prepare_entrypoint(entrypoint):
Expand Down Expand Up @@ -313,7 +315,7 @@ def setup_terminal(self) -> None:
"""Setup the terminal.
Override in subclasses for different behavior. Called once at reset."""

log_with_color(self.logger, f"Configuring {self.terminal}...", "blue")
self.logger.debug(f"Configuring {self.terminal}...")

self.terminal.run("git init -b main")
self.terminal.run("git config user.name 'debug-gym'")
Expand All @@ -328,7 +330,7 @@ def setup_terminal(self) -> None:
def reset(self, *, options: dict = None):
"""Resets the environment and returns eval as the initial observation."""
options = options or {}
self.logger.info("Resetting environment")
self.logger.debug("Resetting environment")
self.setup_task(task_name=options.get("task_name"), options=options)
self.setup_workspace()
self.setup_terminal()
Expand All @@ -338,30 +340,31 @@ def reset(self, *, options: dict = None):
self.queue_event(Event.ENV_RESET, source="env")
self.all_observations = self.process_events()

# Gets eval (initial observation) from cache or by running env.eval
if self.last_eval: # if eval tool was triggered by Event.ENV_RESET
self.step_observation = Observation("env", self.last_eval.output)
else: # if eval tool was not triggered by Event.ENV_RESET
self.last_eval = self.eval()
self.step_observation = Observation("env", self.last_eval.output)
self.all_observations.insert(0, self.step_observation)
# First observation always includes the task instructions.
self.step_observation = Observation("env", self.instructions)
self.all_observations.insert(0, self.step_observation)

self.max_score = self.calculate_max_score(self.last_eval)
self.score = self.calculate_score(self.last_eval)
self.done = self.calculate_done(self.last_eval)
if self.last_eval:
self.max_score = self.calculate_max_score(self.last_eval)
self.score = self.calculate_score(self.last_eval)
self.resolved = self.calculate_resolved(self.last_eval)
self.terminated = self.calculate_terminated(self.last_eval)

self.infos = EnvInfo(
step_observation=self.step_observation,
all_observations=self.all_observations,
eval_observation=Observation("env", self.last_eval.output),
eval_observation=(
Observation("env", self.last_eval.output) if self.last_eval else None
),
dir_tree=self.workspace.display_files(self.dir_tree_depth),
current_breakpoints=self.current_breakpoints(),
action_reasoning=None,
action_content=None,
action_tool_call=None,
done=self.done,
terminated=self.terminated,
resolved=self.resolved,
score=self.score,
max_score=self.max_score,
max_score=self.max_score or 1,
instructions=self.instructions,
rewrite_counter=self.rewrite_counter,
tools=self.tools,
Expand All @@ -383,11 +386,16 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
Override in subclasses for different behavior."""
return eval_output.success

def calculate_done(self, eval_output: EvalOutput) -> bool:
"""Determine if the task is done.
def calculate_resolved(self, eval_output: EvalOutput) -> bool:
    """Report whether the task counts as resolved.

    The base implementation does not inspect `eval_output`; it compares the
    environment's current `score` with its `max_score`. Subclasses may
    override this to apply a different criterion.
    """
    reached_max_score = self.score == self.max_score
    return reached_max_score

def calculate_terminated(self, eval_output: EvalOutput) -> bool:
    """Report whether the episode should stop.

    The base implementation terminates exactly when the task is resolved, by
    delegating to `calculate_resolved` with the same `eval_output`.
    Subclasses may override this to terminate under other conditions.
    """
    is_resolved = self.calculate_resolved(eval_output)
    return is_resolved

def eval(self, **kwargs) -> EvalOutput:
"""Evaluates the current code using the provided entrypoint.
Sets the last_eval and returns it.
Expand Down Expand Up @@ -464,22 +472,28 @@ def step(
self.all_observations.insert(0, self.step_observation)

# Calculate score, terminated and resolved based on the last eval output (if any)
self.score = self.calculate_score(self.last_eval)
self.done = self.calculate_done(self.last_eval)
if self.last_eval:
self.max_score = self.max_score or self.calculate_max_score(self.last_eval)
self.score = self.calculate_score(self.last_eval)
self.terminated = self.calculate_terminated(self.last_eval)
self.resolved = self.calculate_resolved(self.last_eval)

self.infos = EnvInfo(
step_observation=self.step_observation,
all_observations=self.all_observations,
eval_observation=Observation("env", self.last_eval.output),
eval_observation=(
Observation("env", self.last_eval.output) if self.last_eval else None
),
dir_tree=self.workspace.display_files(self.dir_tree_depth),
current_breakpoints=self.current_breakpoints(),
action_reasoning=action_reasoning,
action_content=action_content,
action_tool_call=action_tool_call,
instructions=self.instructions,
score=self.score,
max_score=self.max_score,
done=self.done,
max_score=self.max_score or 1,
terminated=self.terminated,
resolved=self.resolved,
rewrite_counter=self.rewrite_counter,
tools=self.tools,
)
Expand Down
6 changes: 3 additions & 3 deletions debug_gym/gym/envs/r2egym.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def setup_workspace(self):
self.set_entrypoints(self.entrypoint, self.debug_entrypoint)

def setup_terminal(self):
self.logger.info(f"Configuring {self.terminal}...")
self.logger.debug(f"Configuring {self.terminal}...")

# Install tree for listdir.
self.terminal.run("apt update && apt install -y tree")
Expand Down Expand Up @@ -268,10 +268,10 @@ def setup_terminal(self):
self.terminal.run("git remote remove origin")

def apply_gold_patch(self):
self.logger.info(f"Applying gold patch to {self.working_dir}.")
self.logger.debug(f"Applying gold patch to {self.working_dir}.")
command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF"
self.terminal.run(command, raises=True)
self.logger.info("Patch applied successfully.")
self.logger.debug("Patch applied successfully.")

def eval(self, **kwargs) -> EvalOutput:
"""Evaluates the current code using the provided entrypoint.
Expand Down
27 changes: 17 additions & 10 deletions debug_gym/gym/envs/swe_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def setup_workspace(self):
self.set_entrypoints(self.entrypoint, self.debug_entrypoint)

def setup_terminal(self):
self.logger.info(f"Configuring {self.terminal}...")
self.logger.debug(f"Configuring {self.terminal}...")

# Install tree for listdir.
self.terminal.run("apt update && apt install -y tree")
Expand All @@ -169,22 +169,29 @@ def setup_terminal(self):
# Apply any changes needed to the install commands.
self.terminal.run("git config user.name 'debug-gym'")
self.terminal.run("git config user.email '<>'")
self.terminal.run(
f"git commit -am 'Changes needed for setting up {self.task_name}'"
)

# Apply the test patch directly.
self.terminal.run(f"git apply - <<'EOF'\n{self.test_patch}\nEOF")
self.terminal.run(f"git commit -am 'Applying test patch for {self.task_name}'")
self.terminal.run(f"git commit -am 'Setting up {self.task_name}'")

# Remove the remote so the agent won't see newer commits.
self.terminal.run("git remote remove origin")

def apply_gold_patch(self):
self.logger.info(f"Applying gold patch to {self.working_dir}.")
self.logger.debug(f"Applying gold patch to {self.working_dir}.")
command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF"
self.terminal.run(command, raises=True)
self.logger.info("Patch applied successfully.")
self.logger.debug("Patch applied successfully.")

def eval(self, **kwargs) -> EvalOutput:
    """Run the entrypoint against the official tests and record the result.

    Before running, the files listed in `self.test_directives` are restored
    with `git checkout` and `self.test_patch` is applied with `git apply`.
    The entrypoint is then run with `self.run_timeout`, and the outcome is
    stored in `self.last_eval` and returned. `kwargs` is accepted but not
    used here.
    """
    # The test patch must be in place before any evaluation, so first
    # discard any changes made to the test_directives files.
    directive_paths = " ".join(self.test_directives)
    self.terminal.run(f"git checkout -- {directive_paths}")

    # Apply the official test patch (hidden until now).
    apply_test_patch = f"git apply - <<'EOF'\n{self.test_patch}\nEOF"
    self.terminal.run(apply_test_patch)

    success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout)
    evaluation = EvalOutput(success, output)
    self.last_eval = evaluation
    return self.last_eval

def calculate_max_score(self, eval_output: EvalOutput) -> int:
    """Return the maximum score: the number of entries in `self.fail_to_pass`.

    `eval_output` is not consulted.
    """
    fail_to_pass_tests = self.fail_to_pass
    return len(fail_to_pass_tests)
Expand Down
Loading