Skip to content

Commit 82062fe

Browse files
committed
Adding debug_mode and simplifying changes.
1 parent 63ce0f1 commit 82062fe

File tree

10 files changed

+112
-123
lines changed

10 files changed

+112
-123
lines changed

debug_gym/agents/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
22
from debug_gym.agents.rewrite_agent import RewriteAgent
33
from debug_gym.agents.solution_agent import AgentSolution
4+
from debug_gym.agents.swe_agent import SWEAgent

debug_gym/agents/base_agent.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -248,10 +248,10 @@ def run(self, task_name=None, debug=False):
248248
highscore = info.score
249249
for step in range(max_steps):
250250
self.logger.info(f"\n{'='*20} STEP {step+1} {'='*20}\n")
251-
# highscore = max(highscore, info.score)
252-
# self.logger.info(
253-
# f"[{task_name[:10]:<10}] | Step: {step:<4} | Score: {info.score:>4}/{info.max_score:<4} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
254-
# )
251+
highscore = max(highscore, info.score)
252+
self.logger.info(
253+
f"[{task_name[:10]:<10}] | Step: {step:<4} | Score: {info.score:>4}/{info.max_score:<4} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
254+
)
255255

256256
messages = self.build_prompt(info)
257257
llm_response = self.llm(messages, info.tools)
@@ -282,7 +282,7 @@ def run(self, task_name=None, debug=False):
282282
submit cool called
283283
"""
284284
)
285-
break
285+
# break
286286

287287
if (
288288
info.done

debug_gym/agents/debug_agent.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
@register_agent
55
class DebugAgent(BaseAgent):
66
name = "debug_agent"
7-
# system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
8-
system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
7+
system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "
98

109

1110
@register_agent

debug_gym/agents/history_tracker.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,7 @@ def json(self, game_step=None):
4949
"reasoning": None,
5050
"content": None,
5151
"action": None, # env reset
52-
# bash agent will not have an initial eval observation
53-
"obs": (
54-
self.memory[0].step_observation.observation
55-
if self.memory[0].step_observation
56-
else None
57-
),
52+
"obs": self.memory[0].step_observation.observation,
5853
"rewrite_consumed": 0,
5954
"prompt_response_pairs": None,
6055
}

debug_gym/agents/swe_agent.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from debug_gym.agents.base_agent import BaseAgent, register_agent
2+
3+
4+
@register_agent
5+
class SWEAgent(BaseAgent):
6+
name = "swe_agent"
7+
system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. "

debug_gym/gym/envs/env.py

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ def __init__(
218218
dir_tree_depth: int = 1,
219219
persistent_breakpoints: bool = True, # TODO: remove
220220
auto_list: bool = True, # TODO: remove
221+
debug_mode: bool = True,
221222
terminal: Terminal | None = None,
222223
logger: DebugGymLogger | None = None,
223224
problems: str | list[str] | None = None,
@@ -226,7 +227,7 @@ def __init__(
226227
super().__init__()
227228

228229
self.path = path
229-
self.max_score = max_score
230+
self.max_score = max_score or 1
230231
self.auto_eval_on_rewrite = auto_eval_on_rewrite
231232
self.run_timeout = run_timeout
232233
self.dir_tree_depth = dir_tree_depth
@@ -235,6 +236,7 @@ def __init__(
235236
self._debug_entrypoint = debug_entrypoint
236237
self.persistent_breakpoints = persistent_breakpoints
237238
self.auto_list = auto_list
239+
self.debug_mode = debug_mode
238240
self.logger = logger or DebugGymLogger("debug-gym")
239241
self.infos: EnvInfo | None = None
240242
self.rng = None
@@ -338,23 +340,25 @@ def reset(self, *, options: dict = None):
338340
self.queue_event(Event.ENV_RESET, source="env")
339341
self.all_observations = self.process_events()
340342

341-
# REMOVE EVAL LOGIC
342-
# # Gets eval (initial observation) from cache or by running env.eval
343-
# if self.last_eval: # if eval tool was triggered by Event.ENV_RESET
344-
# self.step_observation = Observation("env", self.last_eval.output)
345-
# else: # if eval tool was not triggered by Event.ENV_RESET
346-
# self.last_eval = self.eval()
347-
# self.step_observation = Observation("env", self.last_eval.output)
348-
# self.all_observations.insert(0, self.step_observation)
343+
# Initial observation defaults to the task instructions; when debug_mode is on, run env.eval and use its output instead.
344+
eval_observation = None
345+
self.step_observation = Observation("env", self.instructions)
346+
if self.debug_mode:
347+
self.last_eval = self.eval()
348+
eval_observation = Observation("env", self.last_eval.output)
349+
self.step_observation = eval_observation
349350

350-
self.max_score = self.calculate_max_score(self.last_eval)
351-
self.score = self.calculate_score(self.last_eval)
352-
self.done = self.calculate_done(self.last_eval)
351+
self.all_observations.insert(0, self.step_observation)
352+
353+
if self.debug_mode:
354+
self.max_score = self.calculate_max_score(self.last_eval)
355+
self.score = self.calculate_score(self.last_eval)
356+
self.done = self.calculate_done(self.last_eval)
353357

354358
self.infos = EnvInfo(
355-
step_observation=None,
356-
all_observations=None,
357-
eval_observation=Observation("env", None),
359+
step_observation=self.step_observation,
360+
all_observations=self.all_observations,
361+
eval_observation=eval_observation,
358362
dir_tree=self.workspace.display_files(self.dir_tree_depth),
359363
current_breakpoints=self.current_breakpoints(),
360364
action_reasoning=None,
@@ -382,11 +386,7 @@ def calculate_max_score(self, eval_output: EvalOutput) -> int:
382386
def calculate_score(self, eval_output: EvalOutput) -> int:
383387
"""Calculate the score from the eval output.
384388
Override in subclasses for different behavior."""
385-
if eval_output is not None:
386-
return eval_output.success
387-
else:
388-
return 0
389-
# return eval_output.success
389+
return eval_output.success
390390

391391
def calculate_done(self, eval_output: EvalOutput) -> bool:
392392
"""Determine if the task is done.
@@ -469,14 +469,16 @@ def step(
469469
self.all_observations.insert(0, self.step_observation)
470470

471471
# Calculate score and done based on the last eval output (only when debug_mode is on)
472-
self.score = self.calculate_score(self.last_eval)
473-
self.done = self.calculate_done(self.last_eval)
472+
eval_observation = None
473+
if self.debug_mode:
474+
self.score = self.calculate_score(self.last_eval)
475+
self.done = self.calculate_done(self.last_eval)
476+
eval_observation = Observation("env", self.last_eval.output)
474477

475478
self.infos = EnvInfo(
476479
step_observation=self.step_observation,
477480
all_observations=self.all_observations,
478-
# eval_observation=Observation("env", self.last_eval.output),
479-
eval_observation=None,
481+
eval_observation=eval_observation,
480482
dir_tree=self.workspace.display_files(self.dir_tree_depth),
481483
current_breakpoints=self.current_breakpoints(),
482484
action_reasoning=action_reasoning,

0 commit comments

Comments
 (0)