8 changes: 8 additions & 0 deletions .dockerignore
@@ -0,0 +1,8 @@
**

!README.md
!setup.py
!pyproject.toml
!setup.cfg
!environment.yml
!/swebench/
17 changes: 13 additions & 4 deletions Dockerfile
@@ -1,8 +1,8 @@
FROM ubuntu:20.04
FROM ubuntu:jammy

# https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192
RUN apt-get update && \
    apt-get install -y bash gcc git jq wget && \
    apt-get install -y bash gcc git jq wget g++ make libffi-dev python3.11 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

@@ -11,10 +11,13 @@ RUN git config --global user.name "swebench"

RUN apt update && apt install -y build-essential

RUN ln -sfn /bin/bash /bin/sh

# Create new user
RUN useradd -ms /bin/bash swe-bench
USER swe-bench
WORKDIR /home/swe-bench
RUN chown -R swe-bench:swe-bench /home/swe-bench

# Setup Conda
ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}"
@@ -23,13 +26,19 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -
    && mkdir ~/.conda \
    && bash miniconda.sh -b \
    && rm -f miniconda.sh
RUN conda --version
RUN conda --version \
    && conda init bash \
    && conda config --append channels conda-forge

# Setup SWE-Bench Env
COPY environment.yml .
RUN conda env create -f environment.yml

# Some missing packages
RUN pip install datasets python-dotenv gitpython
RUN pip install datasets python-dotenv gitpython unidiff rich importlib

# Install SWE-Bench
COPY . .
RUN pip install -e .

CMD ["/bin/bash"]
2 changes: 1 addition & 1 deletion environment.yml
@@ -1,6 +1,6 @@
name: swe-bench
dependencies:
  - python=3.9
  - python=3.11
  - pip
  - pip:
    - datasets
2 changes: 1 addition & 1 deletion swebench/harness/constants.py
@@ -495,7 +495,7 @@
MAP_REPO_TO_INSTALL = {}

# Constants - Task Instance Test Frameworks
TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider -Wignore"
MAP_REPO_TO_TEST_FRAMEWORK = {
"astropy/astropy": TEST_PYTEST,
"dbt-labs/dbt-core": TEST_PYTEST,
54 changes: 45 additions & 9 deletions swebench/harness/engine_testbed.py
@@ -1,11 +1,20 @@
import argparse
import json
import logging

from context_manager import TestbedContextManager, TaskEnvContextManager
from typing import Dict

from utils import DotDict
import os.path as osp
from multiprocessing import Pool

logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("engine_testbed")


def is_json(myjson: str):
    try:
        json_object = json.loads(myjson)
@@ -82,16 +91,41 @@ def setup_testbed(data: Dict):
    return

def main(args):
    devin_output = json.load(open(args.devin_output_path, "r"))
    devin_instance_ids = [_["instance_id"] for _ in devin_output]
    with open(args.instances_path, 'r', encoding='utf-8') as f:
        instances_list = json.load(f)
    for item in instances_list:
    if args.devin_output_path:
        devin_output = json.load(open(args.devin_output_path, "r"))
        devin_instance_ids = [_["instance_id"] for _ in devin_output]
    else:
        devin_instance_ids = []

    instances_list = None
    if args.instances_path:
        with open(args.instances_path, 'r', encoding='utf-8') as f:
            instances_list = json.load(f)
    elif args.swe_bench_tasks:
        from swebench import get_eval_refs
        instances_list = list(get_eval_refs(args.swe_bench_tasks).values())

    if not instances_list:
        raise ValueError("No task instances found")

    for idx, item in enumerate(instances_list):
        # if (args.instance_id != item["instance_id"]):
        # continue
        if item["instance_id"] not in devin_instance_ids or \
                osp.exists(osp.join(args.log_dir, item["instance_id"] + ".log")):
        log_file = osp.join(args.log_dir, item["instance_id"] + ".log")

        if devin_instance_ids and item["instance_id"] not in devin_instance_ids:
            print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} as it is not in devin's output")
            continue
        elif osp.exists(log_file):
            with open(log_file, 'r', encoding='utf-8') as f:
                log_content = f.read()

            if "Init Succeeded" in log_content:
                print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} it's already initiated.")
                continue
        else:
            print(f"[{idx}/{len(instances_list)}] Processing {item['instance_id']}")

        task_instance = item
        data_group = {
            "task_instances": [task_instance],
@@ -103,15 +137,17 @@ def main(args):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--instances_path", type=str, help="task instances path", required=True)
    parser.add_argument("--instances_path", type=str, help="task instances path", required=False)
    parser.add_argument("--swe_bench_tasks", type=str, help="Path to dataset file or HF datasets name", required=False)
    # parser.add_argument("--instance_id", type=str, help="JSON String for an individual task instance", required=True)
    parser.add_argument("--log_dir", type=str, help="Path to log directory", required=True)
    parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=True)
    parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=False)
    parser.add_argument("--conda_path", type=str, help="(Optional) Path to miniconda3 or anaconda installation")
    parser.add_argument("--testbed", type=str, help="(Optional) Path to testbed directory")
    parser.add_argument("--venv", type=str, help="(Optional) Virtual environment for the test")
    parser.add_argument("--timeout", type=int, default=None, help="(Optional) Timeout (seconds) for testing script execution")
    parser.add_argument("--verbose", action="store_true", help="(Optional) Verbose mode")
    args = parser.parse_args()
    validate_args(args)
    logger.propagate = args.verbose
    main(args)
227 changes: 227 additions & 0 deletions swebench/harness/report.py
@@ -0,0 +1,227 @@
import argparse
import json
import os
import traceback

from collections import Counter
from rich import print
from swebench import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    get_eval_report,
    get_logs_eval,
    get_model_report,
    get_resolution_status,
    run_evaluation,
    get_eval_refs,
)
from swebench.harness.constants import (
    INSTALL_FAIL,
)
from unidiff import PatchSet


def main(predictions_path, log_dir, swe_bench_tasks):
    # Check if paths exist
    if not os.path.exists(predictions_path):
        raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")
    eval_refs = get_eval_refs(swe_bench_tasks)
    for k, v in eval_refs.items():
        eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}

    # Change model_name_or_patch field to directory name for all predictions
    directory = os.path.dirname(predictions_path)
    directory_name = directory.rsplit("/", 1)[-1]
    pred_path_orig = predictions_path
    if pred_path_orig.endswith(".jsonl"):
        pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl")
    else:
        pred_path_temp = predictions_path.replace(".json", "_filtered.json")


    if any([pred_path_orig.endswith(x) for x in [".jsonl", ".jsonl.all"]]):
        predictions = list()
        with open(pred_path_orig) as f:
            for line in f.readlines():
                predictions.append(json.loads(line))
    else:
        with open(pred_path_orig) as f:
            predictions = json.load(f)

    pred_total, pred_will_eval = 0, 0
    for p in predictions:
        pred_total += 1
        # Exclude predictions w/ empty strings
        # if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "":
        pred_will_eval += 1
    print(
        f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
    )
    print(f"Log directory for evaluation run: {log_dir}")

    # Iterate through predictions
    scorecards = []
    for p in predictions:
        scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}}

        # Add trajectory statistics if traj_path exists
        traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj")
        if os.path.exists(traj_path):
            traj_data = json.load(open(traj_path, "r"))
            scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"])
            scorecard["stats"]["traj_action_dist"] = dict(
                Counter(
                    [
                        entry["action"].strip().split()[0]
                        if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
                        else None
                        for entry in traj_data["history"]
                    ]
                )
            )
            scorecard["exit_status"] = (
                traj_data["info"]["exit_status"]
                if "exit_status" in traj_data["info"]
                else "n/a"
            )

        # Check that a prediction was generated
        #if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
        # scorecard["statuses"].append("not_generated")
        # scorecards.append(scorecard)
        # continue
        scorecard["statuses"].append("generated")

        # Get log file
        log_path = os.path.join(
            log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log"
        )
        if not os.path.exists(log_path):
            scorecard["statuses"].append("build_failure")
            scorecards.append(scorecard)
            continue

        # Get evaluation logs
        eval_sm, found = get_logs_eval(log_path)

        # Check that the prediction generated
        if not found:
            scorecards.append(scorecard)
            continue
        scorecard["statuses"].append("applied")

        with open(log_path, "r") as f:
            log_contents = f.read()
        if INSTALL_FAIL in log_contents:
            scorecard["statuses"].append("install_fail")

        # Get resolution status
        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
        scorecard["test_results"] = {
            "failure": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"],
            },
            "success": {
                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
                "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
            }
        }
        resolution_status = get_resolution_status(report)
        scorecard["statuses"].append(resolution_status)

        diff_obj = PatchSet(p[KEY_PREDICTION])
        scorecard["patch_files"] = [
            x.path
            for x in diff_obj.modified_files
            + diff_obj.added_files
            + diff_obj.removed_files
        ]
        scorecard["patch_lines_add"] = sum([f.added for f in diff_obj])
        scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj])
        scorecards.append(scorecard)

    # Calculate cumulative results
    get_ids_with_status = lambda x: [
        s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"]
    ]
    report = {
        "# Not Generated": len(get_ids_with_status("not_generated")),
        "# Generated": len(get_ids_with_status("generated")),
        "# Applied": len(get_ids_with_status("applied")),
        "# Resolved": len(get_ids_with_status("RESOLVED_FULL")),
        "# Install Fail": len(get_ids_with_status("install_fail")),
    }
    print(f"== Evaluation Report ==\n{report}")

    report_exits = dict(
        Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards])
    )

    # Save to summary, scorecard json
    path_scorecards = os.path.join(directory, "scorecards.json")
    with open(path_scorecards, "w") as f:
        json.dump(scorecards, fp=f, indent=2)
    print(f"- Wrote per-instance scorecards to {path_scorecards}")

    path_results = os.path.join(directory, "results.json")
    with open(path_results, "w") as f:
        json.dump(
            {
                "report": report,
                "report_exits": report_exits,
                "not_generated": get_ids_with_status("not_generated"),
                "generated": get_ids_with_status("generated"),
                "applied": get_ids_with_status("applied"),
                "resolved": get_ids_with_status("RESOLVED_FULL"),
                "not_resolved": get_ids_with_status("RESOLVED_NO"),
                "install_fail": get_ids_with_status("install_fail"),
            },
            fp=f,
            indent=2,
        )
    print(f"- Wrote summary of run to {path_results}")

    # Sanity check against get_model_report
    report = get_model_report(
        directory_name, pred_path_orig, swe_bench_tasks, log_dir
    )
    by_outcome = {}
    by_outcome_func = lambda status: len(
        [
            instance_id
            for _, v in report.items()
            if isinstance(v, dict)
            for instance_id in v[status]
        ]
    )
    by_outcome["# Not Generated"] = by_outcome_func("none")
    by_outcome["# Generated"] = by_outcome_func("generated")
    by_outcome["# Applied"] = by_outcome_func("applied")
    by_outcome["# Resolved"] = by_outcome_func("resolved")
    by_outcome["# Install Fail"] = by_outcome_func("install_fail")
    print(f"Reference Report:\n{by_outcome}")


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--predictions_path",
        type=str,
        help="Path to predictions file (.jsonl)",
        required=True,
    )
    parser.add_argument(
        "--log_dir", type=str, help="Path to log directory", required=True
    )
    parser.add_argument(
        "--swe_bench_tasks",
        type=str,
        help="Path to SWE-bench task instances file",
        required=True,
    )

    args = parser.parse_args()
    main(**vars(args))