diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py index 83161b49..424ccc9c 100644 --- a/debug_gym/agents/__init__.py +++ b/debug_gym/agents/__init__.py @@ -1,3 +1,4 @@ from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent from debug_gym.agents.rewrite_agent import RewriteAgent from debug_gym.agents.solution_agent import AgentSolution +from debug_gym.agents.swe_agent import SWEAgent diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py index c9fe3d6d..f4f6747c 100644 --- a/debug_gym/agents/base_agent.py +++ b/debug_gym/agents/base_agent.py @@ -227,7 +227,7 @@ def run(self, task_name=None, debug=False): # initial state does not have prompt and response self.history.step(info, None) - if info.done is True: + if info.resolved is True: self.logger.report_progress( problem_id=task_name, step=1, @@ -265,10 +265,12 @@ def run(self, task_name=None, debug=False): self.history.step(info, llm_response) if ( - info.done + info.terminated or info.rewrite_counter >= self.config["max_rewrite_steps"] ): - reason = "done" if info.done else "max_rewrite_steps reached" + reason = ( + "terminated" if info.resolved else "max_rewrite_steps reached" + ) self.logger.info( f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}" ) @@ -279,7 +281,7 @@ def run(self, task_name=None, debug=False): total_steps=step + 1, score=info.score, max_score=info.max_score, - status="resolved" if info.done else "unresolved", + status="resolved" if info.resolved else "unresolved", ) break # keep progress bar running until max_steps is reached @@ -298,9 +300,9 @@ def run(self, task_name=None, debug=False): total_steps=step + 1, score=info.score, max_score=info.max_score, - status="resolved" if info.done else "unresolved", + status="resolved" if info.resolved else "unresolved", ) - return info.done + return info.resolved except Exception: # report any error that happens during the run self.logger.report_progress( @@ -353,7 +355,7 @@ def save_trajectory(self, task_name="custom"): "config": self.config, "tools": self.llm.define_tools(self.env.tools) if self.llm else tools, "uuid": self._uuid, - "success": self.env.done, + "success": self.env.resolved, "log": [], "agent_type": self.__class__.__name__, "logger": str(self.logger.log_file), diff --git a/debug_gym/agents/debug_agent.py b/debug_gym/agents/debug_agent.py index 11acab3e..0008abac 100644 --- a/debug_gym/agents/debug_agent.py +++ b/debug_gym/agents/debug_agent.py @@ -23,7 +23,7 @@ def run(self, task_name=None, debug=False): # initial state does not have prompt and response self.history.step(info, None) - if info.done is True: + if info.resolved is True: # msg = "Environment started with entrypoint passing without errors." 
self.logger.report_progress( problem_id=task_name, @@ -69,10 +69,10 @@ def run(self, task_name=None, debug=False): self.history.step(info, llm_response) if ( - info.done + info.terminated or info.rewrite_counter >= self.config["max_rewrite_steps"] ): - reason = "done" if info.done else "max_rewrite_steps reached" + reason = "done" if info.resolved else "max_rewrite_steps reached" self.logger.info( f"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}" ) @@ -83,7 +83,7 @@ def run(self, task_name=None, debug=False): total_steps=step + 1, score=info.score, max_score=info.max_score, - status="resolved" if info.done else "unresolved", + status="resolved" if info.resolved else "unresolved", ) break # keep progress bar running until max_steps is reached @@ -102,9 +102,9 @@ def run(self, task_name=None, debug=False): total_steps=step + 1, score=info.score, max_score=info.max_score, - status="resolved" if info.done else "unresolved", + status="resolved" if info.resolved else "unresolved", ) - return info.done + return info.resolved except Exception: # report any error that happens during the run self.logger.report_progress( diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py index 99b8e3de..02213f6a 100644 --- a/debug_gym/agents/solution_agent.py +++ b/debug_gym/agents/solution_agent.py @@ -33,7 +33,7 @@ def run(self, task_name=None, debug=False): info = self.env.reset(options={"task_name": task_name}) self.history.step(info) - if info.done is True: + if info.resolved is True: self._report_progress(task_name, info, "resolved") return True @@ -76,13 +76,12 @@ def run(self, task_name=None, debug=False): self.logger.info( f"Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%})" ) - assert info.done, ( + assert info.resolved, ( "The task is not done after applying the gold patch.\n" f"{info.step_observation.observation}" ) - status = "resolved" if info.done else "unresolved" - self._report_progress(task_name, info, status) + self._report_progress(task_name, info, "resolved") except Exception: self._report_progress(task_name, info, "error") raise - return info.done + return info.resolved diff --git a/debug_gym/agents/swe_agent.py b/debug_gym/agents/swe_agent.py new file mode 100644 index 00000000..04d216ec --- /dev/null +++ b/debug_gym/agents/swe_agent.py @@ -0,0 +1,7 @@ +from debug_gym.agents.base_agent import BaseAgent, register_agent + + +@register_agent +class SWEAgent(BaseAgent): + name = "swe_agent" + system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the ability to run bash commands to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the tools available to investigate the code and understand the potential bugs. A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid rewriting the entire code, focus on the bugs only. 
You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the rewrite tool. If you are not sure, continue using the available tools to gather more information before proposing a patch. After every rewrite, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response." diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index a297082f..1c13c808 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -8,7 +8,7 @@ from debug_gym.gym.terminals.terminal import Terminal from debug_gym.gym.tools.tool import EnvironmentTool, ToolCall from debug_gym.gym.workspace import Workspace -from debug_gym.logger import DebugGymLogger, log_with_color +from debug_gym.logger import DebugGymLogger @dataclass @@ -16,7 +16,7 @@ class EnvInfo: # obs from tool triggered by `env.step` or eval if `env.reset` step_observation: Observation all_observations: list[Observation] # env.step + triggered tools obs - eval_observation: Observation # last eval observation + eval_observation: Observation | None # last eval observation dir_tree: str current_breakpoints: str action_reasoning: str | None @@ -25,7 +25,8 @@ class EnvInfo: instructions: dict score: int max_score: int - done: bool + terminated: bool # Whether the task has finished running + resolved: bool # Whether the task was successfully solved rewrite_counter: int tools: list[EnvironmentTool] @@ -38,7 +39,7 @@ def __str__(self) -> str: # Status section lines.append( - f"📊 Status: {'✅ (DONE)' if self.done else '🔄 (IN PROGRESS)'}\t" + f"📊 Status: {('✅' if self.resolved else '❌') + ' (TERMINATED)' if self.terminated else '🔄 (IN PROGRESS)'}\t" f"🎯 Score: {self.score}/{self.max_score}\t" f"✏️ Rewrites: {self.rewrite_counter}" ) @@ -226,7 +227,7 @@ def __init__( super().__init__() self.path = path - self.max_score = max_score + self._max_score = max_score self.auto_eval_on_rewrite = auto_eval_on_rewrite self.run_timeout = run_timeout self.dir_tree_depth = dir_tree_depth @@ -251,19 +252,20 @@ def _reset_env_state(self): self.rewrite_counter = 0 self.last_eval: EvalOutput = None self.score = 0 - self.done = False + self.terminated = False + self.resolved = False # clear all observations and event queue (queue should be empty already) self.clear_all_observations() self.empty_event_queue() def set_entrypoints(self, entrypoint: str, debug_entrypoint: str | None = None): - debug_entrypoint = debug_entrypoint or entrypoint.replace( - "python ", "python -m pdb " - ) self.entrypoint = self._prepare_entrypoint(entrypoint) - self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint) - # self.entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.entrypoint - # self.debug_entrypoint = "PYTHONPATH=$PYTHONPATH:$PWD " + self.debug_entrypoint + self.debug_entrypoint = self._prepare_entrypoint(debug_entrypoint 
or entrypoint) + + if "python -m pdb" not in self.debug_entrypoint: + self.debug_entrypoint = self.debug_entrypoint.replace( + "python ", "python -m pdb " + ) @staticmethod def _prepare_entrypoint(entrypoint): @@ -287,6 +289,10 @@ def _prepare_entrypoint(entrypoint): entrypoint = " ".join(entrypoint_list) return entrypoint + @property + def max_score(self): + return self._max_score or 1 + @property def working_dir(self) -> Path: return self.workspace.working_dir @@ -313,7 +319,7 @@ def setup_terminal(self) -> None: """Setup the terminal. Override in subclasses for different behavior. Called once at reset.""" - log_with_color(self.logger, f"Configuring {self.terminal}...", "blue") + self.logger.debug(f"Configuring {self.terminal}...") self.terminal.run("git init -b main") self.terminal.run("git config user.name 'debug-gym'") @@ -328,7 +334,7 @@ def setup_terminal(self) -> None: def reset(self, *, options: dict = None): """Resets the environment and returns eval as the initial observation.""" options = options or {} - self.logger.info("Resetting environment") + self.logger.debug("Resetting environment") self.setup_task(task_name=options.get("task_name"), options=options) self.setup_workspace() self.setup_terminal() @@ -338,28 +344,29 @@ def reset(self, *, options: dict = None): self.queue_event(Event.ENV_RESET, source="env") self.all_observations = self.process_events() - # Gets eval (initial observation) from cache or by running env.eval - if self.last_eval: # if eval tool was triggered by Event.ENV_RESET - self.step_observation = Observation("env", self.last_eval.output) - else: # if eval tool was not triggered by Event.ENV_RESET - self.last_eval = self.eval() - self.step_observation = Observation("env", self.last_eval.output) - self.all_observations.insert(0, self.step_observation) + # First observation always include the task instructions. + self.step_observation = Observation("env", self.instructions) + self.all_observations.insert(0, self.step_observation) - self.max_score = self.calculate_max_score(self.last_eval) - self.score = self.calculate_score(self.last_eval) - self.done = self.calculate_done(self.last_eval) + if self.last_eval: + self._max_score = self.calculate_max_score(self.last_eval) + self.score = self.calculate_score(self.last_eval) + self.resolved = self.calculate_resolved(self.last_eval) + self.terminated = self.calculate_terminated(self.last_eval) self.infos = EnvInfo( step_observation=self.step_observation, all_observations=self.all_observations, - eval_observation=Observation("env", self.last_eval.output), + eval_observation=( + Observation("env", self.last_eval.output) if self.last_eval else None + ), dir_tree=self.workspace.display_files(self.dir_tree_depth), current_breakpoints=self.current_breakpoints(), action_reasoning=None, action_content=None, action_tool_call=None, - done=self.done, + terminated=self.terminated, + resolved=self.resolved, score=self.score, max_score=self.max_score, instructions=self.instructions, @@ -383,11 +390,16 @@ def calculate_score(self, eval_output: EvalOutput) -> int: Override in subclasses for different behavior.""" return eval_output.success - def calculate_done(self, eval_output: EvalOutput) -> bool: - """Determine if the task is done. + def calculate_resolved(self, eval_output: EvalOutput) -> bool: + """Determine if the task has been resolved. 
Override in subclasses for different behavior.""" return self.score == self.max_score + def calculate_terminated(self, eval_output: EvalOutput) -> bool: + """Determine if the task is terminated. + Override in subclasses for different behavior.""" + return self.calculate_resolved(eval_output) + def eval(self, **kwargs) -> EvalOutput: """Evaluates the current code using the provided entrypoint. Sets the last_eval and returns it. @@ -464,13 +476,18 @@ def step( self.all_observations.insert(0, self.step_observation) # Calculate score and done based on the last eval output - self.score = self.calculate_score(self.last_eval) - self.done = self.calculate_done(self.last_eval) + if self.last_eval: + self._max_score = self.max_score or self.calculate_max_score(self.last_eval) + self.score = self.calculate_score(self.last_eval) + self.terminated = self.calculate_terminated(self.last_eval) + self.resolved = self.calculate_resolved(self.last_eval) self.infos = EnvInfo( step_observation=self.step_observation, all_observations=self.all_observations, - eval_observation=Observation("env", self.last_eval.output), + eval_observation=( + Observation("env", self.last_eval.output) if self.last_eval else None + ), dir_tree=self.workspace.display_files(self.dir_tree_depth), current_breakpoints=self.current_breakpoints(), action_reasoning=action_reasoning, @@ -479,7 +496,8 @@ def step( instructions=self.instructions, score=self.score, max_score=self.max_score, - done=self.done, + terminated=self.terminated, + resolved=self.resolved, rewrite_counter=self.rewrite_counter, tools=self.tools, ) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 3e9673cd..cfbbb665 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -204,7 +204,7 @@ def setup_workspace(self): self.set_entrypoints(self.entrypoint, self.debug_entrypoint) def setup_terminal(self): - self.logger.info(f"Configuring {self.terminal}...") + self.logger.debug(f"Configuring {self.terminal}...") # Install tree for listdir. self.terminal.run("apt update && apt install -y tree") @@ -268,10 +268,10 @@ def setup_terminal(self): self.terminal.run("git remote remove origin") def apply_gold_patch(self): - self.logger.info(f"Applying gold patch to {self.working_dir}.") + self.logger.debug(f"Applying gold patch to {self.working_dir}.") command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF" self.terminal.run(command, raises=True) - self.logger.info("Patch applied successfully.") + self.logger.debug("Patch applied successfully.") def eval(self, **kwargs) -> EvalOutput: """Evaluates the current code using the provided entrypoint. diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 26be87b1..5c6572e0 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -143,7 +143,7 @@ def setup_workspace(self): self.set_entrypoints(self.entrypoint, self.debug_entrypoint) def setup_terminal(self): - self.logger.info(f"Configuring {self.terminal}...") + self.logger.debug(f"Configuring {self.terminal}...") # Install tree for listdir. self.terminal.run("apt update && apt install -y tree") @@ -169,22 +169,29 @@ def setup_terminal(self): # Apply any changes needed to the install commands. self.terminal.run("git config user.name 'debug-gym'") self.terminal.run("git config user.email '<>'") - self.terminal.run( - f"git commit -am 'Changes needed for setting up {self.task_name}'" - ) - - # Apply the test patch directly. 
- self.terminal.run(f"git apply - <<'EOF'\n{self.test_patch}\nEOF") - self.terminal.run(f"git commit -am 'Applying test patch for {self.task_name}'") + self.terminal.run(f"git commit -am 'Setting up {self.task_name}'") # Remove the remote so the agent won't see newer commits. self.terminal.run("git remote remove origin") def apply_gold_patch(self): - self.logger.info(f"Applying gold patch to {self.working_dir}.") + self.logger.debug(f"Applying gold patch to {self.working_dir}.") command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF" self.terminal.run(command, raises=True) - self.logger.info("Patch applied successfully.") + self.logger.debug("Patch applied successfully.") + + def eval(self, **kwargs) -> EvalOutput: + # We need to apply the test patch before running any evaluation. + # Reset any changes made to test_directives files. + self.terminal.run(f"git checkout -- {' '.join(self.test_directives)}") + + # Apply official test patch (hidden until now) + self.terminal.run(f"git apply - <<'EOF'\n{self.test_patch}\nEOF") + + success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout) + self.last_eval = EvalOutput(success, output) + + return self.last_eval def calculate_max_score(self, eval_output: EvalOutput) -> int: return len(self.fail_to_pass) diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 1730c5ba..ebe500e1 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -108,7 +108,7 @@ def setup_task(self, task_name: str, options: dict = None): self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main" ) self.branch_name = self.ds_row["instance_id"] - self.test_patch = self.ds_row["patch"] + self.bug_patch = self.ds_row["patch"] self.image_name = self.ds_row["image_name"] self.repo, self.commit = get_repo_commit_from_image_name(self.image_name) self.install_configs = MAP_REPO_TO_SPECS[self.repo][self.commit] @@ -179,7 +179,14 @@ def setup_task(self, task_name: str, options: dict = None): # Note that the `gold_patch` is the same as the `test_patch` but will # be used in conjunction with --reverse. self.git_apply_cmd = f"git apply --reverse -" - self.gold_patch = self.test_patch + self.gold_patch = self.bug_patch + + def setup_terminal(self): + super().setup_terminal() + + # Apply bug patch. 
+ self.terminal.run(f"git apply - <<'EOF'\n{self.bug_patch}\nEOF", raises=True) + self.terminal.run(f"git commit -am 'Applying bug patch for {self.task_name}'") def calculate_score(self, eval_output: EvalOutput) -> int: test_status_map = self.log_parser(eval_output.output) @@ -206,3 +213,8 @@ def calculate_score(self, eval_output: EvalOutput) -> int: f"Score: {score}/{self.max_score} ({score/self.max_score:.1%})" ) return score + + def eval(self, **kwargs) -> EvalOutput: + success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout) + self.last_eval = EvalOutput(success, output) + return self.last_eval diff --git a/debug_gym/gym/terminals/shell_session.py b/debug_gym/gym/terminals/shell_session.py index 7526e856..76aa2cf0 100644 --- a/debug_gym/gym/terminals/shell_session.py +++ b/debug_gym/gym/terminals/shell_session.py @@ -25,7 +25,12 @@ class ProcessNotRunningError(Exception): """Raised when the shell process is not running after initialization.""" - pass + def __init__(self, command: str, output: str): + self.output = output + self.command = command + super().__init__( + f"Process not running after command: {self.command}\nOutput:\n{self.output}" + ) class ShellSession: @@ -115,7 +120,8 @@ def start(self, command=None, read_until=None): if not self.is_running: self.close() - raise ProcessNotRunningError(f"{self} failed to start. Output:\n{output}") + self.logger.debug(f"{self} failed to start {entrypoint}. stderr:\n{output}") + raise ProcessNotRunningError(command=command, output=output) # Run session commands after starting the session if command was not provided if not command and self.session_commands: diff --git a/debug_gym/gym/terminals/terminal.py b/debug_gym/gym/terminals/terminal.py index 1bef3361..eaaa7264 100644 --- a/debug_gym/gym/terminals/terminal.py +++ b/debug_gym/gym/terminals/terminal.py @@ -27,6 +27,7 @@ def __init__( self.env_vars["PYTHONSTARTUP"] = "" # prevent Python from loading startup files # use a sentinel to know when to stop reading self.env_vars["PS1"] = DEFAULT_PS1 + self.env_vars["PYTHONDONTWRITEBYTECODE"] = "1" # prevent creation of .pyc files self._working_dir = working_dir self.sessions = [] diff --git a/debug_gym/gym/tools/__init__.py b/debug_gym/gym/tools/__init__.py index b0bda554..92776806 100644 --- a/debug_gym/gym/tools/__init__.py +++ b/debug_gym/gym/tools/__init__.py @@ -4,6 +4,7 @@ from debug_gym.gym.tools.listdir import ListdirTool from debug_gym.gym.tools.pdb import PDBTool from debug_gym.gym.tools.rewrite import RewriteTool +from debug_gym.gym.tools.submit import SubmitTool from debug_gym.gym.tools.tool import EnvironmentTool from debug_gym.gym.tools.toolbox import Toolbox from debug_gym.gym.tools.view import ViewTool diff --git a/debug_gym/gym/tools/eval.py b/debug_gym/gym/tools/eval.py index 2c43d7f0..2e9e2ad4 100644 --- a/debug_gym/gym/tools/eval.py +++ b/debug_gym/gym/tools/eval.py @@ -1,3 +1,5 @@ +import time + from debug_gym.gym.entities import Observation from debug_gym.gym.tools.tool import EnvironmentTool from debug_gym.gym.tools.toolbox import Toolbox @@ -18,6 +20,7 @@ def on_env_reset(self, environment, **kwargs): return self(environment) def on_rewrite_success(self, environment, **kwargs): + # TODO: Make this behavior configurable via tool arguments if environment.auto_eval_on_rewrite: return self(environment) return None diff --git a/debug_gym/gym/tools/pdb.py b/debug_gym/gym/tools/pdb.py index 16798fd3..c52840d4 100644 --- a/debug_gym/gym/tools/pdb.py +++ b/debug_gym/gym/tools/pdb.py @@ -2,7 +2,7 @@ import re 
from debug_gym.gym.entities import Observation -from debug_gym.gym.terminals.shell_session import ShellSession +from debug_gym.gym.terminals.shell_session import ProcessNotRunningError, ShellSession from debug_gym.gym.tools.tool import EnvironmentTool from debug_gym.gym.tools.toolbox import Toolbox @@ -15,10 +15,12 @@ class PDBTool(EnvironmentTool): """pdb(command="c") to continue the execution until the next breakpoint.""", """pdb(command="p x") to print the value of the variable x in the current context.""", """pdb(command="cl src/code.py:26") to clear the breakpoint at line 26 in the file 'src/code.py'.""", + """pdb(command="l", entrypoint="python -m pdb src/app.py") to list the source around the current frame after starting the PDB session for 'src/app.py'.""", ] description = ( "An interface to the Python debugger PDB. Send a command to the PDB terminal. The command should be a valid PDB command." + "\nWhen using the breakpoint command (e.g., 'b', 'break', 'cl', 'clear'), make sure you specify the file path and line number in the format `file_path:line_number`." + + "\nPDB sessions are restarted upon successful rewrite, or if the entrypoint changes. Breakpoints are persistent across PDB sessions and will be restored automatically." + "\nExamples (for demonstration purposes only, you need to adjust the tool calling format according to your specific syntax):" + "\n".join(examples) ) @@ -27,12 +29,32 @@ class PDBTool(EnvironmentTool): "type": ["string"], "description": "The command to be sent to the PDB terminal. The command should be a valid PDB command. See https://docs.python.org/3/library/pdb.html for more information.", }, + "entrypoint": { + "type": ["string", "null"], + "description": "The entrypoint command to start the pdb session. If null, the last provided entrypoint or the environment's debug_entrypoint will be used, in priority order.", + }, } - def __init__(self): + def __init__(self, set_default_entrypoint: bool = True): super().__init__() self.current_frame_file = None self._session: ShellSession = None + self.set_default_entrypoint = set_default_entrypoint + self.entrypoint = None + if not self.set_default_entrypoint: + # Force the agent to provide an entrypoint when using the tool. + self.arguments = copy.deepcopy( + self.arguments + ) # Avoid modifying the class variable. + self.arguments["entrypoint"]["type"].remove("null") + self.arguments["entrypoint"][ + "description" + ] = "The entrypoint command to start the pdb session. Must be provided when using the pdb tool." + self.description += ( + "\nNote: When using the pdb tool, an entrypoint must be provided." + ) + else: + self.description += "\nNote: You can optionally specify an 'entrypoint' argument to control how the PDB session is started. If not provided, the environment's default debug entrypoint will be used." 
def __getstate__(self): """Handles serialisation of the PDBTool instance (for pickle) without un-picklable attributes""" @@ -76,19 +98,19 @@ def interact_with_pdb(self, command: str, timeout: int | None = None) -> str: return output.replace("(Pdb)", "").strip() # remove the prompt - def close_pdb(self): - self._session.close() + def stop_pdb(self): self.current_frame_file = None + if self._session is not None: + self._session.close() def start_pdb(self, environment) -> str: self._session = environment.terminal.new_shell_session() # init pdb and wait for the prompt - initial_output = self._session.start( - environment.debug_entrypoint, read_until="(Pdb)" - ) + self.entrypoint = self.entrypoint or environment.debug_entrypoint + initial_output = self._session.start(self.entrypoint, read_until="(Pdb)") if "The program finished and will be restarted" in initial_output: - self.close_pdb() + self.stop_pdb() if self.pdb_is_running: if environment.persistent_breakpoints: @@ -119,15 +141,38 @@ def on_rewrite_success( def restart_pdb(self, environment) -> str: """Restart the pdb session and restore the breakpoints.""" - self.close_pdb() + self.stop_pdb() return self.start_pdb(environment) - def use(self, environment, command: str) -> Observation: + def use( + self, environment, command: str, entrypoint: str | None = None + ) -> Observation: if command == "": return Observation( self.name, "Failure calling pdb:\nEmpty commands are not allowed." ) + if entrypoint is None and not self.set_default_entrypoint: + return Observation( + self.name, + "Failure calling pdb:\nAn entrypoint must be provided when using the pdb tool.", + ) + + # Set the entrypoint. Priority: tool argument > last entrypoint > default entrypoint. + entrypoint = entrypoint or self.entrypoint or environment.debug_entrypoint + + # Check if we need to restart pdb due to a different entrypoint. + if entrypoint != self.entrypoint: + try: + # TODO: allow entrypoint to simply be a file to call with 'python -m pdb ' + self.entrypoint = entrypoint + self.restart_pdb(environment) + except ProcessNotRunningError as e: + return Observation( + self.name, + f"Provided entrypoint failed to start a pdb session:\n{e.output}", + ) + _warning = "" # if print, it's OK to have ";" or "\n" in the command # otherwise, only the first command will be executed diff --git a/debug_gym/gym/tools/submit.py b/debug_gym/gym/tools/submit.py new file mode 100644 index 00000000..eb08e578 --- /dev/null +++ b/debug_gym/gym/tools/submit.py @@ -0,0 +1,15 @@ +from debug_gym.gym.entities import Observation +from debug_gym.gym.tools.tool import EnvironmentTool +from debug_gym.gym.tools.toolbox import Toolbox + + +@Toolbox.register() +class SubmitTool(EnvironmentTool): + name = "submit" + description = "Submit your changes once the task is complete." 
+ arguments = {} + + def use(self, environment, **kwargs) -> Observation: + eval_output = environment.eval() + environment.terminated = True + return Observation(self.name, eval_output.output) diff --git a/notebooks/tutorial.ipynb b/notebooks/tutorial.ipynb index 5a632df7..df93130c 100644 --- a/notebooks/tutorial.ipynb +++ b/notebooks/tutorial.ipynb @@ -299,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "5645e9fa", "metadata": {}, "outputs": [ @@ -531,10 +531,10 @@ " info = env.step(llm_response.tool, llm_response.response)\n", " history.step(info, llm_response)\n", "\n", - " if info.done:\n", + " if info.terminated:\n", " break\n", "\n", - " reason = \"bug fixed\" if info.done else \"max steps reached\"\n", + " reason = \"bug fixed\" if info.resolved else \"max steps reached\"\n", " print(f\"Step: {step} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) | Reason: {reason}\")\n", "\n", " return history\n", @@ -786,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "88634798", "metadata": {}, "outputs": [], @@ -796,7 +796,7 @@ " \"config\": {},\n", " \"tools\": llm.define_tools(env.tools),\n", " \"uuid\": \"N/A\",\n", - " \"success\": env.done,\n", + " \"success\": env.resolved,\n", " \"log\": [history.json(i) for i in range(len(history))],\n", " \"agent_type\": \"custom\",\n", " \"logger\": \"N/A\",\n", diff --git a/scripts/config_swebench.yaml b/scripts/config_swebench.yaml index 9a9e369f..a7a56fb1 100644 --- a/scripts/config_swebench.yaml +++ b/scripts/config_swebench.yaml @@ -45,3 +45,11 @@ debug_5_agent: solution_agent: llm_name: null # No need for an LLM. tools: ["eval", "pdb"] + +swe_agent: + max_steps: 100 + max_rewrite_steps: 20 + tools: + - bash + - rewrite + - submit diff --git a/scripts/run.py b/scripts/run.py index 80edb50f..3d8ee732 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -191,7 +191,12 @@ def create_env(config: dict, logger: DebugGymLogger): def add_tools(env, config: dict, logger: DebugGymLogger): """Add tools to the environment""" for tool in config["tools"]: - tool_instantiated = Toolbox.get_tool(tool) + tool_config = {} + if isinstance(tool, dict): + assert len(tool) == 1, "Tool dict must have exactly one key" + tool, tool_config = list(tool.items())[0] + + tool_instantiated = Toolbox.get_tool(tool, **tool_config) env.add_tool(tool_instantiated) logger.debug(f"Adding tool to toolbox: {tool_instantiated.__class__.__name__}") diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py index 55b48eeb..e97a95dd 100644 --- a/tests/agents/test_agents.py +++ b/tests/agents/test_agents.py @@ -168,7 +168,8 @@ def test_build_prompt(agent_setup, build_env_info): def test_run(agent_setup, build_env_info): agent, env, llm = next(agent_setup(DebugAgent)) env.reset.return_value = build_env_info( - done=False, + terminated=False, + resolved=False, score=0, max_score=10, instructions="Test instructions", @@ -177,7 +178,8 @@ def test_run(agent_setup, build_env_info): step_observation="Test last run obs", ) env.step.return_value = build_env_info( - done=True, + terminated=True, + resolved=True, score=10, max_score=10, instructions="Test instructions", @@ -206,7 +208,8 @@ def test_build_system_prompt_rewrite_agent(agent_setup, build_env_info): def test_run_debug_5_agent(agent_setup, build_env_info): agent, env, llm = next(agent_setup(Debug_5_Agent)) env.reset.return_value = build_env_info( - done=False, + terminated=False, + resolved=False, score=0, 
max_score=10, rewrite_counter=0, @@ -216,7 +219,8 @@ def test_run_debug_5_agent(agent_setup, build_env_info): step_observation="Test last run obs", ) env.step.return_value = build_env_info( - done=True, + terminated=True, + resolved=True, score=10, max_score=10, rewrite_counter=0, @@ -460,7 +464,8 @@ def test_run_early_completion(agent_setup, build_env_info): # Mock environment to return completed task immediately env.reset.return_value = build_env_info( - done=True, + terminated=True, + resolved=True, score=10, max_score=10, instructions="Test instructions", @@ -480,7 +485,8 @@ def test_run_max_rewrite_steps(agent_setup, build_env_info): agent.config["max_rewrite_steps"] = 2 env.reset.return_value = build_env_info( - done=False, + terminated=False, + resolved=False, score=0, max_score=10, rewrite_counter=0, @@ -492,7 +498,8 @@ def test_run_max_rewrite_steps(agent_setup, build_env_info): # First step - increase rewrite counter to max env.step.return_value = build_env_info( - done=False, + terminated=False, + resolved=False, score=5, max_score=10, rewrite_counter=2, # Reaches max_rewrite_steps @@ -513,7 +520,8 @@ def test_run_exception_handling(agent_setup, build_env_info): agent, env, llm = next(agent_setup(DebugAgent)) env.reset.return_value = build_env_info( - done=False, + terminated=False, + resolved=False, score=0, max_score=10, instructions="Test instructions", @@ -578,7 +586,8 @@ def test_save_trajectory(agent_setup, tmp_path): """Test trajectory saving functionality""" agent, env, llm = next(agent_setup(DebugAgent)) agent._output_path = str(tmp_path) - env.done = True + env.terminated = True + env.resolved = True # Make all fields JSON serializable agent.config = {"output_path": str(tmp_path), "random_seed": 42} @@ -614,7 +623,7 @@ def json(self, step_id): "config": agent.config, "tools": [{"name": "test_tool", "args": "test_args"}], "uuid": agent._uuid, - "success": env.done, + "success": env.resolved, "log": [ {"step": 0, "action": "test_action_0"}, {"step": 1, "action": "test_action_1"}, @@ -646,7 +655,7 @@ def json(self, step_id): "config": agent.config, "tools": ["test_tool(test_args)"], # String format when no LLM "uuid": agent._uuid, - "success": env.done, + "success": env.resolved, "log": [ {"step": 0, "action": "test_action_0"}, {"step": 1, "action": "test_action_1"}, diff --git a/tests/conftest.py b/tests/conftest.py index 17ee3fdd..869e0156 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -107,7 +107,8 @@ def _env_info( instructions=None, score=5, max_score=10, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ): @@ -123,7 +124,8 @@ def _env_info( instructions=instructions if instructions is not None else {}, score=score, max_score=max_score, - done=done, + terminated=terminated, + resolved=resolved, rewrite_counter=rewrite_counter, tools=tools if tools is not None else [], ) diff --git a/tests/gym/envs/test_aider.py b/tests/gym/envs/test_aider.py index 43257eb8..1ad2ad1a 100644 --- a/tests/gym/envs/test_aider.py +++ b/tests/gym/envs/test_aider.py @@ -87,6 +87,7 @@ def test_steps(env): "The file `clock.py` has been updated successfully." 
) assert env.auto_eval_on_rewrite is True + assert infos.all_observations[-1].source == "eval" assert infos.score == 1 infos = env.step(eval_call) @@ -109,10 +110,13 @@ def test_build_docker_image(mock_build_docker_image): @pytest.if_docker_running def test_reset_with_docker_terminal(setup_aider_repo): env = AiderBenchmarkEnv() + env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) infos = env.reset(options={"task_name": "clock"}) - assert "1 failed" in infos.step_observation.observation + assert env.instructions == infos.step_observation.observation + assert "1 failed" in infos.eval_observation.observation assert infos.max_score == 1 assert infos.score == 0 - assert not infos.done + assert not infos.terminated + assert not infos.resolved diff --git a/tests/gym/envs/test_env.py b/tests/gym/envs/test_env.py index b6179756..58ee7dfb 100644 --- a/tests/gym/envs/test_env.py +++ b/tests/gym/envs/test_env.py @@ -201,7 +201,8 @@ def test_step( mock_pdb_tool.assert_called_once_with(env, command="b 10") assert infos.step_observation == observation assert infos.score == 0 - assert not infos.done + assert not infos.terminated + assert not infos.resolved assert isinstance(infos, EnvInfo) @@ -212,13 +213,13 @@ def test_reset(tmp_path): env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") infos = env.reset() + assert env.last_eval is None assert env.current_breakpoints_state == {} assert env.rewrite_counter == 0 - assert "FAILED test.py::test_1 - assert False" in env.last_eval.output assert infos == EnvInfo( - step_observation=Observation(source="env", observation=env.last_eval.output), - all_observations=[Observation(source="env", observation=env.last_eval.output)], - eval_observation=Observation(source="env", observation=env.last_eval.output), + step_observation=Observation(source="env", observation=env.instructions), + all_observations=[Observation(source="env", observation=env.instructions)], + eval_observation=None, dir_tree=( "Listing files in the current working directory. (read-only) indicates read-only files. 
Max depth: 1.\n" f"{env.working_dir}/\n" @@ -231,7 +232,8 @@ def test_reset(tmp_path): instructions="", score=0, max_score=1, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ) @@ -275,6 +277,16 @@ def test_rewrite_counter(env): assert f.read() == "print('Hello')" +def test_eval(tmp_path): + (tmp_path / "test.py").write_text("def test_1():\n assert False\n") + (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n") + + env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") + env.reset() + env.eval() + assert "FAILED test.py::test_1 - assert False" in env.last_eval.output + + def test_eval_success(tmp_path): working_dir = str(tmp_path) # create a dummy file diff --git a/tests/gym/envs/test_mini_nightmare.py b/tests/gym/envs/test_mini_nightmare.py index c9160bc1..eee46ee4 100644 --- a/tests/gym/envs/test_mini_nightmare.py +++ b/tests/gym/envs/test_mini_nightmare.py @@ -5,6 +5,7 @@ from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.tools.toolbox import Toolbox @pytest.fixture @@ -12,6 +13,7 @@ def mini_nightmare_env(): # Initialize the MiniNightmareEnv with LocalTerminal terminal = LocalTerminal() env = MiniNightmareEnv(terminal=terminal) + env.add_tool(Toolbox.get_tool("eval")) return env @@ -41,19 +43,24 @@ def test_instructions(mini_nightmare_env): def test_reset(mini_nightmare_env): infos = mini_nightmare_env.reset(options={"task_name": "config"}) - assert "2 failed" in infos.step_observation.observation + assert mini_nightmare_env.instructions == infos.step_observation.observation + assert "2 failed" in infos.eval_observation.observation assert infos.max_score == 2 assert infos.score == 0 - assert not infos.done + assert not infos.terminated + assert not infos.resolved @pytest.if_docker_running def test_reset_with_docker_terminal(): env = MiniNightmareEnv() + env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) infos = env.reset(options={"task_name": "config"}) - assert "2 failed" in infos.step_observation.observation + assert env.instructions == infos.step_observation.observation + assert "2 failed" in infos.eval_observation.observation assert infos.max_score == 2 assert infos.score == 0 - assert not infos.done + assert not infos.terminated + assert not infos.resolved diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index e735d04d..0487f782 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -2,6 +2,7 @@ import pytest +from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -69,15 +70,19 @@ def test_setup_terminal(get_r2egym_env): @pytest.if_docker_running def test_reset_and_step(get_r2egym_env): env = get_r2egym_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset( options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} ) - assert "short test summary info" in env_info.step_observation.observation + assert env.instructions == env_info.step_observation.observation + assert "short test summary info" in env_info.eval_observation.observation assert env_info.score == env.score == 0 assert env_info.max_score == 1 - assert not env_info.done - assert not env.done + assert not env_info.terminated + assert not 
env_info.resolved + assert not env.terminated + assert not env.resolved tool_call = ToolCall(id="listdir_id", name="listdir", arguments={}) env_info = env.step(tool_call) @@ -154,15 +159,47 @@ def test_readonly_file(get_r2egym_env): @pytest.if_docker_running def test_apply_gold_patch(get_r2egym_env): env = get_r2egym_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset( options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} ) - assert not env_info.done + assert not env_info.terminated + assert not env_info.resolved assert env_info.score == env.score == 0 env.apply_gold_patch() - eval_output = env.eval() - score = env.calculate_score(eval_output) + env_info = env.step(ToolCall(id="eval_id", name="eval", arguments={})) + assert env_info.step_observation.source == "eval" + assert env_info.score == env_info.max_score - assert score == env.max_score + +@pytest.if_docker_running +def test_running_solution_agent(get_r2egym_env, tmp_path): + """End-to-end SolutionAgent run for R2E-Gym environment, asserting successful resolution after gold patch.""" + env = get_r2egym_env() + task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + config = { + "output_path": str(tmp_path), + "random_seed": 0, + "memory_size": 8, + "max_steps": 1, + "env_kwargs": {}, + } + for tool_name in ["pdb", "eval"]: + env.add_tool(Toolbox.get_tool(tool_name)) + agent = AgentSolution(config=config, env=env, llm=None, logger=env.logger) + success = agent.run(task_name=task_name) + assert success + + +@pytest.if_docker_running +def test_debug_entrypoint_contains_pdb(get_r2egym_env): + """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" + env = get_r2egym_env() + env.reset( + options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} + ) + assert ( + "python -m pdb" in env.debug_entrypoint + ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/envs/test_swe_bench.py b/tests/gym/envs/test_swe_bench.py index 8ab76f86..9a5d01a2 100644 --- a/tests/gym/envs/test_swe_bench.py +++ b/tests/gym/envs/test_swe_bench.py @@ -1,6 +1,7 @@ import pytest from anyio import Path +from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -17,13 +18,17 @@ def test_instructions(get_swe_bench_env): @pytest.if_docker_running def test_reset_and_step(get_swe_bench_env): env = get_swe_bench_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) - assert "short test summary info" in env_info.step_observation.observation + assert env.instructions == env_info.step_observation.observation + assert "short test summary info" in env_info.eval_observation.observation assert env_info.score == env.score == 0 assert env_info.max_score == env.max_score == len(env.fail_to_pass) == 1 - assert not env_info.done - assert not env.done + assert not env_info.terminated + assert not env_info.resolved + assert not env.terminated + assert not env.resolved tool_call = ToolCall(id="listdir_id", name="listdir", arguments={}) env_info = env.step(tool_call) @@ -136,10 +141,17 @@ def test_setup_terminal(get_swe_bench_env): env.reset(options={"task_name": task_name}) _, git_logs = env.terminal.run("git log -n 4") assert env.base_commit in git_logs - assert f"Applying test patch for {task_name}" in git_logs + assert 
f"Applying test patch for {task_name}" not in git_logs - _, git_diff = env.terminal.run("git show HEAD", strip_output=False) - git_diff = git_diff[git_diff.index("diff --git") :] + # Check that the gold test patch has not been applied. + _, code_diff = env.terminal.run("git diff") + for test_directive in env.test_directives: + assert test_directive not in code_diff + + # After calling eval, the test patch should be applied. + env.eval() + + _, git_diff = env.terminal.run("git diff", strip_output=False) git_diff = [l for l in git_diff.split("\n") if not l.startswith("index ")] assert git_diff == env.test_patch.split("\n") @@ -200,13 +212,44 @@ def new_function(): @pytest.if_docker_running def test_apply_gold_patch(get_swe_bench_env): env = get_swe_bench_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) - assert not env_info.done + assert not env_info.terminated + assert not env_info.resolved assert env_info.score == env.score == 0 env.apply_gold_patch() - eval_output = env.eval() - score = env.calculate_score(eval_output) + env_info = env.step(ToolCall(id="eval_id", name="eval", arguments={})) + assert env_info.step_observation.source == "eval" + assert env_info.score == env_info.max_score + + +@pytest.if_docker_running +def test_running_solution_agent(get_swe_bench_env, tmp_path): + env = get_swe_bench_env() + # BaseAgent requires a config dict with at least: output_path, random_seed, memory_size. + # Provide a minimal config for the SolutionAgent run. + config = { + "output_path": str(tmp_path), + "random_seed": 0, + "memory_size": 8, + # Optional values that BaseAgent.run would use; harmless to include here. + "max_steps": 1, + "env_kwargs": {}, + } + for tool_name in ["pdb", "eval"]: + env.add_tool(Toolbox.get_tool(tool_name)) + agent = AgentSolution(config=config, env=env, llm=None, logger=env.logger) + success = agent.run(task_name="astropy__astropy-14096") + assert success - assert score == env.max_score + +@pytest.if_docker_running +def test_debug_entrypoint_contains_pdb(get_swe_bench_env): + """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" + env = get_swe_bench_env() + env.reset(options={"task_name": "astropy__astropy-14096"}) + assert ( + "python -m pdb" in env.debug_entrypoint + ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py index 6d7cc2d6..2374e4b3 100644 --- a/tests/gym/envs/test_swe_smith.py +++ b/tests/gym/envs/test_swe_smith.py @@ -2,6 +2,7 @@ import pytest +from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -54,27 +55,31 @@ def test_setup_terminal(get_swe_smith_env): _, git_logs = env.terminal.run("git log -n 4") # For SWE-Smith the base commit is found in the branch associated to the # instance id and is different from the one in the main branch. 
- assert f"Applying test patch for {task_name}" in git_logs + assert f"Applying bug patch for {task_name}" in git_logs _, git_diff = env.terminal.run("git show HEAD", strip_output=False) git_diff = git_diff[git_diff.index("diff --git") :] - assert git_diff == env.test_patch + assert git_diff == env.bug_patch @pytest.if_docker_running def test_reset_and_step(get_swe_smith_env): env = get_swe_smith_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset( options={ "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" } ) - assert "short test summary info" in env_info.step_observation.observation + assert env.instructions == env_info.step_observation.observation + assert "short test summary info" in env_info.eval_observation.observation assert env_info.score == env.score == 0 assert env_info.max_score == env.max_score == len(env.fail_to_pass) == 39 - assert not env_info.done - assert not env.done + assert not env_info.terminated + assert not env_info.resolved + assert not env.terminated + assert not env.resolved tool_call = ToolCall(id="listdir_id", name="listdir", arguments={}) env_info = env.step(tool_call) @@ -147,26 +152,28 @@ def test_readonly_file(get_swe_smith_env): @pytest.if_docker_running def test_apply_gold_patch(get_swe_smith_env): env = get_swe_smith_env() + env.add_tool(Toolbox.get_tool("eval")) env_info = env.reset( options={ "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" } ) - assert not env_info.done + assert not env_info.terminated + assert not env_info.resolved assert env_info.score == env.score == 0 env.apply_gold_patch() - eval_output = env.eval() - score = env.calculate_score(eval_output) - - assert score == env.max_score + env_info = env.step(ToolCall(id="eval_id", name="eval", arguments={})) + assert env_info.step_observation.source == "eval" + assert env_info.score == env_info.max_score @pytest.if_docker_running def test_calculate_score_with_pytest_error(get_swe_smith_env): """Test that the indentation error in pytest is handled correctly.""" env = get_swe_smith_env() + env.add_tool(Toolbox.get_tool("eval")) task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" env.reset(options={"task_name": task_name}) @@ -194,3 +201,36 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env): score = env.calculate_score(eval_output) assert score == 0 + + +@pytest.if_docker_running +def test_running_solution_agent(get_swe_smith_env, tmp_path): + """Analogous to SWE Bench solution agent test: run SolutionAgent end-to-end and assert success.""" + env = get_swe_smith_env() + task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" + config = { + "output_path": str(tmp_path), + "random_seed": 0, + "memory_size": 8, + "max_steps": 1, + "env_kwargs": {}, + } + for tool_name in ["pdb", "eval"]: + env.add_tool(Toolbox.get_tool(tool_name)) + agent = AgentSolution(config=config, env=env, llm=None, logger=env.logger) + success = agent.run(task_name=task_name) + assert success + + +@pytest.if_docker_running +def test_debug_entrypoint_contains_pdb(get_swe_smith_env): + """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" + env = get_swe_smith_env() + env.reset( + options={ + "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" + } + ) + assert ( + "python -m pdb" in env.debug_entrypoint + ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/terminals/test_docker.py 
b/tests/gym/terminals/test_docker.py index 8540fcc0..074eefdb 100644 --- a/tests/gym/terminals/test_docker.py +++ b/tests/gym/terminals/test_docker.py @@ -18,6 +18,7 @@ def test_docker_terminal_init(): "NO_COLOR": "1", "PS1": DEFAULT_PS1, "PYTHONSTARTUP": "", + "PYTHONDONTWRITEBYTECODE": "1", } assert os.path.basename(terminal.working_dir).startswith("Terminal-") assert terminal.base_image == "ubuntu:latest" diff --git a/tests/gym/terminals/test_kubernetes.py b/tests/gym/terminals/test_kubernetes.py index acf9eb29..0161fbcc 100644 --- a/tests/gym/terminals/test_kubernetes.py +++ b/tests/gym/terminals/test_kubernetes.py @@ -39,6 +39,7 @@ def test_kubernetes_terminal_init(): "NO_COLOR": "1", "PS1": DEFAULT_PS1, "PYTHONSTARTUP": "", + "PYTHONDONTWRITEBYTECODE": "1", } for key, value in expected_base_env.items(): assert terminal.env_vars[key] == value diff --git a/tests/gym/terminals/test_terminal.py b/tests/gym/terminals/test_terminal.py index 00b88056..3867ea18 100644 --- a/tests/gym/terminals/test_terminal.py +++ b/tests/gym/terminals/test_terminal.py @@ -84,6 +84,7 @@ def test_terminal_init_no_os_env_vars(): "NO_COLOR": "1", "PS1": DEFAULT_PS1, "PYTHONSTARTUP": "", + "PYTHONDONTWRITEBYTECODE": "1", } diff --git a/tests/gym/tools/test_eval.py b/tests/gym/tools/test_eval.py index 13c35e2b..47a3173a 100644 --- a/tests/gym/tools/test_eval.py +++ b/tests/gym/tools/test_eval.py @@ -40,8 +40,6 @@ def test_eval(env): @pytest.mark.parametrize( "method,env_auto_eval_on_rewrite,expected", [ - ("on_env_reset", False, "1 passed in "), - ("on_env_reset", True, "1 passed in "), ("on_rewrite_success", True, "1 passed in "), ("on_rewrite_success", False, "FAILED test_1.py::test_1"), ], @@ -56,7 +54,7 @@ def test_eval_on_event(env, method, env_auto_eval_on_rewrite, expected): assert env_info.step_observation.source == "eval" assert "FAILED test_1.py::test_1" in env_info.step_observation.observation - # Edit test file to pass. If eval is called, env.done is set to True + # Edit test file to pass. 
If eval is called, env.terminated is set to True with open(env.working_dir / "test_1.py", "w") as f: f.write("def test_1():\n assert True\n") diff --git a/tests/gym/tools/test_listdir.py b/tests/gym/tools/test_listdir.py index 6e60ce60..4198266a 100644 --- a/tests/gym/tools/test_listdir.py +++ b/tests/gym/tools/test_listdir.py @@ -24,8 +24,6 @@ def test_listdir_default(tmp_path, setup_listdir_repo_env): assert obs.observation == ( f"{env.working_dir}/\n" "|-- .git/\n" - "|-- .pytest_cache/\n" - "|-- __pycache__/\n" "|-- file1.py\n" "|-- file2.py\n" "|-- test_fail.py\n" diff --git a/tests/gym/tools/test_pdb.py b/tests/gym/tools/test_pdb.py index 72aa61bd..a0276d1c 100644 --- a/tests/gym/tools/test_pdb.py +++ b/tests/gym/tools/test_pdb.py @@ -10,6 +10,7 @@ from debug_gym.gym.envs.env import RepoEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.shell_session import ProcessNotRunningError from debug_gym.gym.tools.pdb import PDBTool @@ -403,11 +404,11 @@ def test_pdb_timeout(tmp_path, setup_test_repo): assert not pdb.pdb_is_running -def test_close_pdb_start_and_close_session(tmp_path, setup_pdb_repo_env): +def test_stop_pdb_start_and_close_session(tmp_path, setup_pdb_repo_env): pdb_tool, env = setup_pdb_repo_env(tmp_path) # setup_pdb_repo_env starts the pdb session assert pdb_tool.pdb_is_running - pdb_tool.close_pdb() + pdb_tool.stop_pdb() assert not pdb_tool.pdb_is_running pdb_tool.start_pdb(env) assert pdb_tool.pdb_is_running @@ -457,7 +458,7 @@ def test_on_rewrite_success_calls_breakpoint_modify_and_restart_pdb( def test_restart_pdb_calls_close_and_start(tmp_path, setup_pdb_repo_env): pdb_tool, env = setup_pdb_repo_env(tmp_path) - pdb_tool.close_pdb = lambda: setattr(pdb_tool, "closed", True) + pdb_tool.stop_pdb = lambda: setattr(pdb_tool, "closed", True) pdb_tool.start_pdb = lambda e: "started" out = pdb_tool.restart_pdb(env) assert pdb_tool.closed @@ -693,9 +694,8 @@ def test_pdb_list_output_indentation(tmp_path, setup_pdb_repo_env): f.write("\n".join(f" 'Line {i+1}'" for i in range(1, 2000))) f.write("\n\nif __name__ == '__main__':\n") f.write(" dummy_function()\n") - env.set_entrypoints("python large_file.py", "python -m pdb large_file.py") - pdb_tool.start_pdb(env) - pdb_obs = pdb_tool.use(env, "b large_file.py:100") + debug_entrypoint = "python -m pdb large_file.py" + pdb_obs = pdb_tool.use(env, "b large_file.py:100", debug_entrypoint) assert ( f"Pdb command output:\nBreakpoint 5 at {wd}/large_file.py:100" ) in pdb_obs.observation @@ -809,3 +809,114 @@ def test_pdbtool_pickle_roundtrip(tmp_path, setup_pdb_repo_env): assert rehydrated.name == pdb_tool.name assert rehydrated.examples == pdb_tool.examples + + +def test_pdb_entrypoint_priority_order(tmp_path, setup_pdb_repo_env): + pdb, env = setup_pdb_repo_env(tmp_path) + + # 1. First use with custom entrypoint - should use provided entrypoint + custom1 = "python -m pdb -m pytest -sq ." + pdb.use(env, command="l", entrypoint=custom1) + assert pdb.entrypoint == custom1 + + # 2. Second use without entrypoint - should use last entrypoint (custom1) + pdb.stop_pdb() # Stop to test entrypoint selection on restart + pdb.use(env, command="l") + assert pdb.entrypoint == custom1 + + # 3. Third use with different entrypoint - should use new entrypoint + custom2 = "python -m pdb -m pytest -v ." 
+ pdb.use(env, command="l", entrypoint=custom2) + assert pdb.entrypoint == custom2 + + +def test_pdb_set_default_entrypoint_false_requires_entrypoint( + tmp_path, setup_pdb_repo_env +): + _, env = setup_pdb_repo_env(tmp_path) + pdb = PDBTool(set_default_entrypoint=False) + + # Should fail when no entrypoint is provided + output = pdb.use(env, command="l") + assert "Failure calling pdb:" in output.observation + assert ( + "An entrypoint must be provided when using the pdb tool." in output.observation + ) + + # Should work when entrypoint is provided + output = pdb.use(env, command="l", entrypoint="python -m pdb -m pytest -sv .") + assert """The pytest entry point.""" in output.observation + + +def test_pdb_set_default_entrypoint_false_arguments_validation(): + """Test that when set_default_entrypoint=False, arguments schema is updated.""" + pdb_no_default = PDBTool(set_default_entrypoint=False) + pdb_with_default = PDBTool(set_default_entrypoint=True) + + # When set_default_entrypoint=False, "null" should be removed from entrypoint type + assert "null" not in pdb_no_default.arguments["entrypoint"]["type"] + assert "string" in pdb_no_default.arguments["entrypoint"]["type"] + assert "an entrypoint must be provided" in pdb_no_default.description + + # When set_default_entrypoint=True, "null" should be present in entrypoint type + assert "null" in pdb_with_default.arguments["entrypoint"]["type"] + assert "string" in pdb_with_default.arguments["entrypoint"]["type"] + assert "optionally specify an 'entrypoint'" in pdb_with_default.description + + +def test_pdb_invalid_entrypoint_handling(tmp_path, setup_pdb_repo_env): + pdb, env = setup_pdb_repo_env(tmp_path) + + # Try with an invalid entrypoint that should fail to start pdb + invalid_entrypoint = "nonexistent-command-that-should-fail" + output = pdb.use(env, command="l", entrypoint=invalid_entrypoint) + + # Should contain failure message + assert "entrypoint failed to start a pdb session" in output.observation + assert not pdb.pdb_is_running + + +def test_pdb_changing_entrypoint(tmp_path, setup_pdb_repo_env): + pdb, env = setup_pdb_repo_env(tmp_path) + wd = env.working_dir + + # Create a simple Python script to debug + with (wd / "simple_script.py").open("w") as f: + f.write( + """ +def main(): + x = 42 + print(f"Value is {x}") + return x + +if __name__ == "__main__": + main() +""" + ) + + # Use entrypoint to debug the simple script instead of pytest + script_entrypoint = "python -m pdb simple_script.py" + output = pdb.use(env, command="l", entrypoint=script_entrypoint) + initial_session = pdb._session + + # Should see the script content + assert "def main():" in output.observation + assert pdb.entrypoint == script_entrypoint + + # Subsequent commands should retain the entrypoint and session + pdb.use(env, command="b") + assert pdb.entrypoint == script_entrypoint + assert pdb._session == initial_session + + pdb.use(env, command="where") + assert pdb.entrypoint == script_entrypoint + assert pdb._session == initial_session + + # Switch back to pytest + pytest_entrypoint = "python -m pdb -m pytest -sv ." 
+ output = pdb.use(env, command="l", entrypoint=pytest_entrypoint) + + # Should see pytest content and a new session + assert """The pytest entry point.""" in output.observation + assert pdb.entrypoint == pytest_entrypoint + assert pdb._session != initial_session diff --git a/tests/llms/conftest.py b/tests/llms/conftest.py index 9aeb0009..35cb5d32 100644 --- a/tests/llms/conftest.py +++ b/tests/llms/conftest.py @@ -18,7 +18,8 @@ def _env_info( instructions=None, score=5, max_score=10, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ): @@ -34,7 +35,8 @@ def _env_info( instructions=instructions if instructions is not None else {}, score=score, max_score=max_score, - done=done, + terminated=terminated, + resolved=resolved, rewrite_counter=rewrite_counter, tools=tools if tools is not None else [], ) diff --git a/tests/llms/test_anthropic.py b/tests/llms/test_anthropic.py index 6d953fe4..bce97b08 100644 --- a/tests/llms/test_anthropic.py +++ b/tests/llms/test_anthropic.py @@ -330,7 +330,8 @@ def test_format_tool_call_history_initial_state(mock_llm_config, logger_mock): instructions={}, score=0, max_score=100, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ) @@ -374,7 +375,8 @@ def test_format_tool_call_history_with_action(mock_llm_config, logger_mock): instructions={}, score=0, max_score=100, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ) diff --git a/tests/llms/test_openai.py b/tests/llms/test_openai.py index abd90751..dc1741b9 100644 --- a/tests/llms/test_openai.py +++ b/tests/llms/test_openai.py @@ -217,7 +217,8 @@ def test_format_tool_call_history_initial_state(mock_llm_config, logger_mock): instructions={}, score=0, max_score=100, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ) @@ -272,7 +273,8 @@ def test_format_tool_call_history_with_action(mock_llm_config, logger_mock): instructions={}, score=0, max_score=100, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], ) @@ -351,7 +353,8 @@ def test_format_tool_call_history_complex_arguments(mock_llm_config, logger_mock instructions={}, score=0, max_score=100, - done=False, + terminated=False, + resolved=False, rewrite_counter=0, tools=[], )
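
The patch splits the old `done` flag into `terminated` (the episode has finished running) and `resolved` (the task was actually solved). Below is a minimal, illustrative sketch of the driver-loop logic this implies; the `StepInfo` dataclass and the `env` / `pick_tool_call` objects are stand-ins for illustration only, not debug-gym APIs, and the control flow is a condensed paraphrase of `BaseAgent.run`, not the method itself.

from dataclasses import dataclass

@dataclass
class StepInfo:
    terminated: bool      # the episode has finished running (e.g. submit tool, or eval says so)
    resolved: bool        # the underlying task was actually solved
    rewrite_counter: int
    score: int
    max_score: int

def run_episode(env, pick_tool_call, max_steps: int, max_rewrite_steps: int) -> bool:
    info: StepInfo = env.reset()
    if info.resolved:     # solved before the agent even acts
        return True
    for _ in range(max_steps):
        info = env.step(pick_tool_call(info))
        # Stop when the episode terminates or the rewrite budget is exhausted,
        # mirroring the break condition in BaseAgent.run.
        if info.terminated or info.rewrite_counter >= max_rewrite_steps:
            break
    status = "resolved" if info.resolved else "unresolved"
    print(f"{status}: {info.score}/{info.max_score}")
    return info.resolved  # success means resolution, not mere termination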
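
The pdb tool also gains a per-call `entrypoint` argument and a `set_default_entrypoint` constructor flag, and `scripts/run.py` now accepts single-key dict entries in the tools config to pass such constructor kwargs. A small sketch of that usage follows, assuming an already-configured environment object `env`; the exact observation text depends on the target program.

from debug_gym.gym.tools.toolbox import Toolbox

def pdb_entrypoint_demo(env):
    # Default behavior: falls back to the last entrypoint seen, then to env.debug_entrypoint.
    pdb_tool = Toolbox.get_tool("pdb")
    obs = pdb_tool.use(
        env,
        command="b src/code.py:26",
        entrypoint="python -m pdb -m pytest -sq .",
    )
    print(obs.observation)

    # Strict variant, as a config entry would request it (e.g. `- pdb: {set_default_entrypoint: false}`):
    # the tool now refuses to run unless an entrypoint is provided with the call.
    strict_pdb = Toolbox.get_tool("pdb", set_default_entrypoint=False)
    obs = strict_pdb.use(env, command="l")  # rejected: an entrypoint is required
    print(obs.observation)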