From 1cd2d2b56965aff1c27744cae465ae3b49761b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Fri, 12 Apr 2024 09:30:53 +0200 Subject: [PATCH 1/6] Create Docker image with SWE-bench installed --- .dockerignore | 8 ++++++++ Dockerfile | 22 +++++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..78e31d0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +** + +!README.md +!setup.py +!pyproject.toml +!setup.cfg +!environment.yml +!/swebench/ diff --git a/Dockerfile b/Dockerfile index 2378da1..0bd08b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -FROM ubuntu:20.04 +FROM ubuntu:jammy # https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192 RUN apt-get update && \ - apt-get install -y bash gcc git jq wget && \ + apt-get install -y bash gcc git jq wget g++ make libffi-dev python3.11 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -15,6 +15,7 @@ RUN apt update && apt install -y build-essential RUN useradd -ms /bin/bash swe-bench USER swe-bench WORKDIR /home/swe-bench +RUN chown -R swe-bench:swe-bench /home/swe-bench # Setup Conda ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}" @@ -23,13 +24,28 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - && mkdir ~/.conda \ && bash miniconda.sh -b \ && rm -f miniconda.sh -RUN conda --version +RUN conda --version \ + && conda init bash \ + && conda config --append channels conda-forge # Setup SWE-Bench Env COPY environment.yml . RUN conda env create -f environment.yml +RUN conda --version \ + && conda init \ + && conda config --append channels conda-forge + # Some missing packages RUN pip install datasets python-dotenv gitpython +# Install SWE-Bench +COPY . . +RUN pip install -e . 
+ +# RUN echo "source activate swe-bench" > ~/.bashrc +# SHELL ["/bin/bash", "--login", "-c"] + +# RUN conda activate swe-bench && pip install -e . + CMD ["/bin/bash"] From 2870f603df544eb2d5d0beac7cf922aa6a7b8d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Sat, 13 Apr 2024 16:49:21 +0200 Subject: [PATCH 2/6] Dockerfile --- Dockerfile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0bd08b8..14caee3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,8 @@ RUN git config --global user.name "swebench" RUN apt update && apt install -y build-essential +RUN ln -sfn /bin/bash /bin/sh + # Create new user RUN useradd -ms /bin/bash swe-bench USER swe-bench @@ -37,15 +39,10 @@ RUN conda --version \ && conda config --append channels conda-forge # Some missing packages -RUN pip install datasets python-dotenv gitpython +RUN pip install datasets python-dotenv gitpython unidiff rich # Install SWE-Bench COPY . . RUN pip install -e . -# RUN echo "source activate swe-bench" > ~/.bashrc -# SHELL ["/bin/bash", "--login", "-c"] - -# RUN conda activate swe-bench && pip install -e . - CMD ["/bin/bash"] From e883c189d2a3c683de2217ce02da900a6901eef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Sun, 14 Apr 2024 17:48:27 +0200 Subject: [PATCH 3/6] Updated Dockerfile and environment.yml to make evaluation work in Docker --- Dockerfile | 6 +----- environment.yml | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 14caee3..5d99862 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,12 +34,8 @@ RUN conda --version \ COPY environment.yml . 
RUN conda env create -f environment.yml -RUN conda --version \ - && conda init \ - && conda config --append channels conda-forge - # Some missing packages -RUN pip install datasets python-dotenv gitpython unidiff rich +RUN pip install datasets python-dotenv gitpython unidiff rich importlib # Install SWE-Bench COPY . . diff --git a/environment.yml b/environment.yml index a64aa6c..3eb4697 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ name: swe-bench dependencies: - - python=3.9 + - python=3.11 - pip - pip: - datasets From a64d47aab9d9cee255dbfab8e0706f681b405f30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Fri, 19 Apr 2024 10:51:21 +0200 Subject: [PATCH 4/6] Ignore warnings, might result in other test failures --- swebench/harness/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swebench/harness/constants.py b/swebench/harness/constants.py index 6d2367f..6d7479e 100644 --- a/swebench/harness/constants.py +++ b/swebench/harness/constants.py @@ -495,7 +495,7 @@ MAP_REPO_TO_INSTALL = {} # Constants - Task Instance Test Frameworks -TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider" +TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider -Wignore" MAP_REPO_TO_TEST_FRAMEWORK = { "astropy/astropy": TEST_PYTEST, "dbt-labs/dbt-core": TEST_PYTEST, From 19212694b833aedb0f889544b5a0ada1686780b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Fri, 19 Apr 2024 10:53:07 +0200 Subject: [PATCH 5/6] Support swe-bench dataset --- swebench/harness/engine_testbed.py | 54 +++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/swebench/harness/engine_testbed.py b/swebench/harness/engine_testbed.py index d35adc0..9df986f 100644 --- a/swebench/harness/engine_testbed.py +++ b/swebench/harness/engine_testbed.py @@ -1,11 +1,20 @@ import argparse import json +import logging + from context_manager import TestbedContextManager, 
TaskEnvContextManager from typing import Dict + from utils import DotDict import os.path as osp from multiprocessing import Pool +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger("engine_testbed") + + def is_json(myjson: str): try: json_object = json.loads(myjson) @@ -82,16 +91,41 @@ def setup_testbed(data: Dict): return def main(args): - devin_output = json.load(open(args.devin_output_path, "r")) - devin_instance_ids = [_["instance_id"] for _ in devin_output] - with open(args.instances_path, 'r', encoding='utf-8') as f: - instances_list = json.load(f) - for item in instances_list: + if args.devin_output_path: + devin_output = json.load(open(args.devin_output_path, "r")) + devin_instance_ids = [_["instance_id"] for _ in devin_output] + else: + devin_instance_ids = [] + + instances_list = None + if args.instances_path: + with open(args.instances_path, 'r', encoding='utf-8') as f: + instances_list = json.load(f) + elif args.swe_bench_tasks: + from swebench import get_eval_refs + instances_list = list(get_eval_refs(args.swe_bench_tasks).values()) + + if not instances_list: + raise ValueError("No task instances found") + + for idx, item in enumerate(instances_list): # if (args.instance_id != item["instance_id"]): # continue - if item["instance_id"] not in devin_instance_ids or \ - osp.exists(osp.join(args.log_dir, item["instance_id"] + ".log")): + log_file = osp.join(args.log_dir, item["instance_id"] + ".log") + + if devin_instance_ids and item["instance_id"] not in devin_instance_ids: + print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} as it is not in devin's output") continue + elif osp.exists(log_file): + with open(log_file, 'r', encoding='utf-8') as f: + log_content = f.read() + + if "Init Succeeded" in log_content: + print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} it's already initiated.") + continue + else: + print(f"[{idx}/{len(instances_list)}] 
Processing {item['instance_id']}") + task_instance = item data_group = { "task_instances": [task_instance], @@ -103,10 +137,11 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--instances_path", type=str, help="task instances path", required=True) + parser.add_argument("--instances_path", type=str, help="task instances path", required=False) + parser.add_argument("--swe_bench_tasks", type=str, help="Path to dataset file or HF datasets name", required=False) # parser.add_argument("--instance_id", type=str, help="JSON String for an individual task instance", required=True) parser.add_argument("--log_dir", type=str, help="Path to log directory", required=True) - parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=True) + parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=False) parser.add_argument("--conda_path", type=str, help="(Optional) Path to miniconda3 or anaconda installation") parser.add_argument("--testbed", type=str, help="(Optional) Path to testbed directory") parser.add_argument("--venv", type=str, help="(Optional) Virtual environment for the test") @@ -114,4 +149,5 @@ def main(args): parser.add_argument("--verbose", action="store_true", help="(Optional) Verbose mode") args = parser.parse_args() validate_args(args) + logger.propagate = args.verbose main(args) \ No newline at end of file From 185f4165044c26440e9f2dcf8e4adb05654e727d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Sun, 21 Apr 2024 10:53:51 +0200 Subject: [PATCH 6/6] Generate report --- swebench/harness/report.py | 227 ++++++++++++++++++++++++++++++++++++ swebench/metrics/getters.py | 3 +- swebench/metrics/report.py | 14 ++- 3 files changed, 240 insertions(+), 4 deletions(-) create mode 100644 swebench/harness/report.py diff --git a/swebench/harness/report.py b/swebench/harness/report.py new file mode 100644 index 0000000..af011c0 --- 
/dev/null +++ b/swebench/harness/report.py @@ -0,0 +1,227 @@ +import argparse +import json +import os +import traceback + +from collections import Counter +from rich import print +from swebench import ( + KEY_INSTANCE_ID, + KEY_MODEL, + KEY_PREDICTION, + get_eval_report, + get_logs_eval, + get_model_report, + get_resolution_status, + run_evaluation, + get_eval_refs, +) +from swebench.harness.constants import ( + INSTALL_FAIL, +) +from unidiff import PatchSet + + +def main(predictions_path, log_dir, swe_bench_tasks): + # Check if paths exist + if not os.path.exists(predictions_path): + raise FileNotFoundError(f"Predictions path {predictions_path} does not exist") + eval_refs = get_eval_refs(swe_bench_tasks) + for k, v in eval_refs.items(): + eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]} + + # Change model_name_or_patch field to directory name for all predictions + directory = os.path.dirname(predictions_path) + directory_name = directory.rsplit("/", 1)[-1] + pred_path_orig = predictions_path + if pred_path_orig.endswith(".jsonl"): + pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl") + else: + pred_path_temp = predictions_path.replace(".json", "_filtered.json") + + + if any([pred_path_orig.endswith(x) for x in [".jsonl", ".jsonl.all"]]): + predictions = list() + with open(pred_path_orig) as f: + for line in f.readlines(): + predictions.append(json.loads(line)) + else: + with open(pred_path_orig) as f: + predictions = json.load(f) + + pred_total, pred_will_eval = 0, 0 + for p in predictions: + pred_total += 1 + # Exclude predictions w/ empty strings + # if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "": + pred_will_eval += 1 + print( + f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)" + ) + print(f"Log directory for evaluation run: {log_dir}") + + # Iterate through predictions + scorecards = [] + for p in predictions: + 
scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}} + + # Add trajectory statistics if traj_path exists + traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj") + if os.path.exists(traj_path): + traj_data = json.load(open(traj_path, "r")) + scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"]) + scorecard["stats"]["traj_action_dist"] = dict( + Counter( + [ + entry["action"].strip().split()[0] + if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0 + else None + for entry in traj_data["history"] + ] + ) + ) + scorecard["exit_status"] = ( + traj_data["info"]["exit_status"] + if "exit_status" in traj_data["info"] + else "n/a" + ) + + # Check that a prediction was generated + #if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "": + # scorecard["statuses"].append("not_generated") + # scorecards.append(scorecard) + # continue + scorecard["statuses"].append("generated") + + # Get log file + log_path = os.path.join( + log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log" + ) + if not os.path.exists(log_path): + scorecard["statuses"].append("build_failure") + scorecards.append(scorecard) + continue + + # Get evaluation logs + eval_sm, found = get_logs_eval(log_path) + + # Check that the prediction generated + if not found: + scorecards.append(scorecard) + continue + scorecard["statuses"].append("applied") + + with open(log_path, "r") as f: + log_contents = f.read() + if INSTALL_FAIL in log_contents: + scorecard["statuses"].append("install_fail") + + # Get resolution status + report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]]) + scorecard["test_results"] = { + "failure": { + "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"], + "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"], + }, + "success": { + "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"], + "PASS_TO_PASS": report["PASS_TO_PASS"]["success"], + } + } + resolution_status = get_resolution_status(report) + 
scorecard["statuses"].append(resolution_status) + + diff_obj = PatchSet(p[KEY_PREDICTION]) + scorecard["patch_files"] = [ + x.path + for x in diff_obj.modified_files + + diff_obj.added_files + + diff_obj.removed_files + ] + scorecard["patch_lines_add"] = sum([f.added for f in diff_obj]) + scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj]) + scorecards.append(scorecard) + + # Calculate cumulative results + get_ids_with_status = lambda x: [ + s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"] + ] + report = { + "# Not Generated": len(get_ids_with_status("not_generated")), + "# Generated": len(get_ids_with_status("generated")), + "# Applied": len(get_ids_with_status("applied")), + "# Resolved": len(get_ids_with_status("RESOLVED_FULL")), + "# Install Fail": len(get_ids_with_status("install_fail")), + } + print(f"== Evaluation Report ==\n{report}") + + report_exits = dict( + Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards]) + ) + + # Save to summary, scorecard json + path_scorecards = os.path.join(directory, "scorecards.json") + with open(path_scorecards, "w") as f: + json.dump(scorecards, fp=f, indent=2) + print(f"- Wrote per-instance scorecards to {path_scorecards}") + + path_results = os.path.join(directory, "results.json") + with open(path_results, "w") as f: + json.dump( + { + "report": report, + "report_exits": report_exits, + "not_generated": get_ids_with_status("not_generated"), + "generated": get_ids_with_status("generated"), + "applied": get_ids_with_status("applied"), + "resolved": get_ids_with_status("RESOLVED_FULL"), + "not_resolved": get_ids_with_status("RESOLVED_NO"), + "install_fail": get_ids_with_status("install_fail"), + }, + fp=f, + indent=2, + ) + print(f"- Wrote summary of run to {path_results}") + + # Sanity check against get_model_report + report = get_model_report( + directory_name, pred_path_orig, swe_bench_tasks, log_dir + ) + by_outcome = {} + by_outcome_func = lambda status: len( + [ + 
instance_id + for _, v in report.items() + if isinstance(v, dict) + for instance_id in v[status] + ] + ) + by_outcome["# Not Generated"] = by_outcome_func("none") + by_outcome["# Generated"] = by_outcome_func("generated") + by_outcome["# Applied"] = by_outcome_func("applied") + by_outcome["# Resolved"] = by_outcome_func("resolved") + by_outcome["# Install Fail"] = by_outcome_func("install_fail") + print(f"Reference Report:\n{by_outcome}") + + +if __name__ == "__main__": + # Parse arguments + parser = argparse.ArgumentParser() + parser.add_argument( + "--predictions_path", + type=str, + help="Path to predictions file (.jsonl)", + required=True, + ) + parser.add_argument( + "--log_dir", type=str, help="Path to log directory", required=True + ) + parser.add_argument( + "--swe_bench_tasks", + type=str, + help="Path to SWE-bench task instances file", + required=True, + ) + + args = parser.parse_args() + main(**vars(args)) diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py index 4505ee5..f4db804 100644 --- a/swebench/metrics/getters.py +++ b/swebench/metrics/getters.py @@ -121,11 +121,12 @@ def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]: test_passed = lambda case, sm: case in sm and sm[case] == TestStatus.PASSED.value +test_missed = lambda case, sm: case not in sm + test_failed = lambda case, sm: case not in sm or any( [sm[case] == status for status in [TestStatus.FAILED.value, TestStatus.ERROR.value]] ) - def get_eval_refs(data_path_or_name): decode_keys = False if os.path.isfile(data_path_or_name): diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py index 62a677f..100dbe1 100644 --- a/swebench/metrics/report.py +++ b/swebench/metrics/report.py @@ -17,7 +17,7 @@ get_id_from_lp, test_failed, test_passed, - get_eval_refs, + get_eval_refs, test_missed, ) from swebench.metrics.metrics import ( compute_fail_to_pass_unweighted, @@ -63,8 +63,11 @@ def get_eval_report( # Calculate resolution metrics f2p_success = [] 
f2p_failure = [] + f2p_missed = [] for test_case in gold_results[FAIL_TO_PASS]: - if test_passed(test_case, eval_sm): + if test_missed(test_case, eval_sm): + f2p_missed.append(test_case) + elif test_passed(test_case, eval_sm): # Assume silent success for now (test case not in eval_sm) f2p_success.append(test_case) elif test_failed(test_case, eval_sm): @@ -73,8 +76,11 @@ def get_eval_report( # Calculate maintenance metrics p2p_success = [] p2p_failure = [] + p2p_missed = [] for test_case in gold_results[PASS_TO_PASS]: - if test_passed(test_case, eval_sm): + if test_missed(test_case, eval_sm): + p2p_missed.append(test_case) + elif test_passed(test_case, eval_sm): p2p_success.append(test_case) elif test_failed(test_case, eval_sm): p2p_failure.append(test_case) @@ -83,10 +89,12 @@ def get_eval_report( FAIL_TO_PASS: { "success": f2p_success, "failure": f2p_failure, + "missed": f2p_missed }, PASS_TO_PASS: { "success": p2p_success, "failure": p2p_failure, + "missed": p2p_missed } }