From 1cd2d2b56965aff1c27744cae465ae3b49761b1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Fri, 12 Apr 2024 09:30:53 +0200
Subject: [PATCH 1/6] Create Docker image with SWE-bench installed

---
 .dockerignore |  8 ++++++++
 Dockerfile    | 22 +++++++++++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..78e31d0
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+**
+
+!README.md
+!setup.py
+!pyproject.toml
+!setup.cfg
+!environment.yml
+!/swebench/
diff --git a/Dockerfile b/Dockerfile
index 2378da1..0bd08b8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
-FROM ubuntu:20.04
+FROM ubuntu:jammy
 
 # https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192
 RUN apt-get update && \
-    apt-get install -y bash gcc git jq wget && \
+    apt-get install -y bash gcc git jq wget g++ make libffi-dev python3.11 && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
@@ -15,6 +15,7 @@ RUN apt update && apt install -y build-essential
 RUN useradd -ms /bin/bash swe-bench
 USER swe-bench
 WORKDIR /home/swe-bench
+RUN chown -R swe-bench:swe-bench /home/swe-bench
 
 # Setup Conda
 ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}"
@@ -23,13 +24,28 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -
     && mkdir ~/.conda \
     && bash miniconda.sh -b \
     && rm -f miniconda.sh
-RUN conda --version
+RUN conda --version \
+    && conda init bash \
+    && conda config --append channels conda-forge
 
 # Setup SWE-Bench Env
 COPY environment.yml .
 RUN conda env create -f environment.yml
 
+RUN conda --version \
+    && conda init \
+    && conda config --append channels conda-forge
+
 # Some missing packages
 RUN pip install datasets python-dotenv gitpython
 
+# Install SWE-Bench
+COPY . .
+RUN pip install -e .
+
+# RUN echo "source activate swe-bench" > ~/.bashrc
+# SHELL ["/bin/bash", "--login", "-c"]
+
+# RUN conda activate swe-bench && pip install -e .
+
 CMD ["/bin/bash"]

From 2870f603df544eb2d5d0beac7cf922aa6a7b8d84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Sat, 13 Apr 2024 16:49:21 +0200
Subject: [PATCH 2/6] Dockerfile

---
 Dockerfile | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0bd08b8..14caee3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,8 @@ RUN git config --global user.name "swebench"
 
 RUN apt update && apt install -y build-essential
 
+RUN ln -sfn /bin/bash /bin/sh
+
 # Create new user
 RUN useradd -ms /bin/bash swe-bench
 USER swe-bench
@@ -37,15 +39,10 @@ RUN conda --version \
     && conda config --append channels conda-forge
 
 # Some missing packages
-RUN pip install datasets python-dotenv gitpython
+RUN pip install datasets python-dotenv gitpython unidiff rich
 
 # Install SWE-Bench
 COPY . .
 RUN pip install -e .
 
-# RUN echo "source activate swe-bench" > ~/.bashrc
-# SHELL ["/bin/bash", "--login", "-c"]
-
-# RUN conda activate swe-bench && pip install -e .
-
 CMD ["/bin/bash"]

From e883c189d2a3c683de2217ce02da900a6901eef0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Sun, 14 Apr 2024 17:48:27 +0200
Subject: [PATCH 3/6] Updated Dockerfile and environment.yml to make evaluation
 work in Docker

---
 Dockerfile      | 6 +-----
 environment.yml | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 14caee3..5d99862 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,12 +34,8 @@ RUN conda --version \
 COPY environment.yml .
 RUN conda env create -f environment.yml
 
-RUN conda --version \
-    && conda init \
-    && conda config --append channels conda-forge
-
 # Some missing packages
-RUN pip install datasets python-dotenv gitpython unidiff rich
+RUN pip install datasets python-dotenv gitpython unidiff rich importlib
 
 # Install SWE-Bench
 COPY . .
diff --git a/environment.yml b/environment.yml
index a64aa6c..3eb4697 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
 name: swe-bench
 dependencies:
-  - python=3.9
+  - python=3.11
   - pip
   - pip:
     - datasets

From a64d47aab9d9cee255dbfab8e0706f681b405f30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Fri, 19 Apr 2024 10:51:21 +0200
Subject: [PATCH 4/6] Ignore warnings, might result in other test failures

---
 swebench/harness/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swebench/harness/constants.py b/swebench/harness/constants.py
index 6d2367f..6d7479e 100644
--- a/swebench/harness/constants.py
+++ b/swebench/harness/constants.py
@@ -495,7 +495,7 @@
 MAP_REPO_TO_INSTALL = {}
 
 # Constants - Task Instance Test Frameworks
-TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider"
+TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider -Wignore"
 MAP_REPO_TO_TEST_FRAMEWORK = {
     "astropy/astropy": TEST_PYTEST,
     "dbt-labs/dbt-core": TEST_PYTEST,

From 19212694b833aedb0f889544b5a0ada1686780b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Fri, 19 Apr 2024 10:53:07 +0200
Subject: [PATCH 5/6] Support swe-bench dataset

---
 swebench/harness/engine_testbed.py | 54 +++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/swebench/harness/engine_testbed.py b/swebench/harness/engine_testbed.py
index d35adc0..9df986f 100644
--- a/swebench/harness/engine_testbed.py
+++ b/swebench/harness/engine_testbed.py
@@ -1,11 +1,20 @@
 import argparse
 import json
+import logging
+
 from context_manager import TestbedContextManager, TaskEnvContextManager
 from typing import Dict
+
 from utils import DotDict
 import os.path as osp
 from multiprocessing import Pool
 
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger("engine_testbed")
+
+
 def is_json(myjson: str):
     try:
         json_object = json.loads(myjson)
@@ -82,16 +91,41 @@ def setup_testbed(data: Dict):
     return
 
 def main(args):
-    devin_output = json.load(open(args.devin_output_path, "r"))
-    devin_instance_ids = [_["instance_id"] for _ in devin_output]
-    with open(args.instances_path, 'r', encoding='utf-8') as f:
-        instances_list =  json.load(f)
-    for item in instances_list:
+    if args.devin_output_path:
+        devin_output = json.load(open(args.devin_output_path, "r"))
+        devin_instance_ids = [_["instance_id"] for _ in devin_output]
+    else:
+        devin_instance_ids = []
+
+    instances_list = None
+    if args.instances_path:
+        with open(args.instances_path, 'r', encoding='utf-8') as f:
+            instances_list = json.load(f)
+    elif args.swe_bench_tasks:
+        from swebench import get_eval_refs
+        instances_list = list(get_eval_refs(args.swe_bench_tasks).values())
+
+    if not instances_list:
+        raise ValueError("No task instances found")
+
+    for idx, item in enumerate(instances_list):
         # if (args.instance_id != item["instance_id"]):
         #     continue
-        if item["instance_id"] not in devin_instance_ids or \
-            osp.exists(osp.join(args.log_dir, item["instance_id"] + ".log")):
+        log_file = osp.join(args.log_dir, item["instance_id"] + ".log")
+
+        if devin_instance_ids and item["instance_id"] not in devin_instance_ids:
+            print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} as it is not in devin's output")
             continue
+        elif osp.exists(log_file):
+            with open(log_file, 'r', encoding='utf-8') as f:
+                log_content = f.read()
+
+            if "Init Succeeded" in log_content:
+                print(f"[{idx}/{len(instances_list)}] Skipping {item['instance_id']} it's already initiated.")
+                continue
+        else:
+            print(f"[{idx}/{len(instances_list)}] Processing {item['instance_id']}")
+
         task_instance = item
         data_group = {
                 "task_instances": [task_instance],
@@ -103,10 +137,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--instances_path", type=str, help="task instances path", required=True)
+    parser.add_argument("--instances_path", type=str, help="task instances path", required=False)
+    parser.add_argument("--swe_bench_tasks", type=str, help="Path to dataset file or HF datasets name", required=False)
     # parser.add_argument("--instance_id", type=str, help="JSON String for an individual task instance", required=True)
     parser.add_argument("--log_dir", type=str, help="Path to log directory", required=True)
-    parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=True)
+    parser.add_argument("--devin_output_path", type=str, help="Path to devin's output", required=False)
     parser.add_argument("--conda_path", type=str, help="(Optional) Path to miniconda3 or anaconda installation")
     parser.add_argument("--testbed", type=str, help="(Optional) Path to testbed directory")
     parser.add_argument("--venv", type=str, help="(Optional) Virtual environment for the test")
@@ -114,4 +149,5 @@ def main(args):
     parser.add_argument("--verbose", action="store_true", help="(Optional) Verbose mode")
     args = parser.parse_args()
     validate_args(args)
+    logger.propagate = args.verbose
     main(args)
\ No newline at end of file

From 185f4165044c26440e9f2dcf8e4adb05654e727d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Albert=20=C3=96rwall?= <albert@platform24.com>
Date: Sun, 21 Apr 2024 10:53:51 +0200
Subject: [PATCH 6/6] Generate report

---
 swebench/harness/report.py  | 227 ++++++++++++++++++++++++++++++++++++
 swebench/metrics/getters.py |   3 +-
 swebench/metrics/report.py  |  14 ++-
 3 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 swebench/harness/report.py

diff --git a/swebench/harness/report.py b/swebench/harness/report.py
new file mode 100644
index 0000000..af011c0
--- /dev/null
+++ b/swebench/harness/report.py
@@ -0,0 +1,227 @@
+import argparse
+import json
+import os
+import traceback
+
+from collections import Counter
+from rich import print
+from swebench import (
+    KEY_INSTANCE_ID,
+    KEY_MODEL,
+    KEY_PREDICTION,
+    get_eval_report,
+    get_logs_eval,
+    get_model_report,
+    get_resolution_status,
+    run_evaluation,
+    get_eval_refs,
+)
+from swebench.harness.constants import (
+    INSTALL_FAIL,
+)
+from unidiff import PatchSet
+
+
+def main(predictions_path, log_dir, swe_bench_tasks):
+    # Check that the predictions file exists
+    if not os.path.exists(predictions_path):
+        raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")
+    eval_refs = get_eval_refs(swe_bench_tasks)
+    for k, v in eval_refs.items():
+        eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}
+
+    # Use the predictions file's parent directory name as the run name (it appears in eval log file names)
+    directory = os.path.dirname(predictions_path)
+    directory_name = directory.rsplit("/", 1)[-1]
+    pred_path_orig = predictions_path
+    if pred_path_orig.endswith(".jsonl"):
+        pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl")
+    else:
+        pred_path_temp = predictions_path.replace(".json", "_filtered.json")
+
+
+    if any([pred_path_orig.endswith(x) for x in [".jsonl", ".jsonl.all"]]):
+        predictions = list()
+        with open(pred_path_orig) as f:
+            for line in f.readlines():
+                predictions.append(json.loads(line))
+    else:
+        with open(pred_path_orig) as f:
+            predictions = json.load(f)
+
+    pred_total, pred_will_eval = 0, 0
+    for p in predictions:
+        pred_total += 1
+        # Exclude predictions w/ empty strings
+        # if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "":
+        pred_will_eval += 1
+    print(
+        f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
+    )
+    print(f"Log directory for evaluation run: {log_dir}")
+
+    # Iterate through predictions
+    scorecards = []
+    for p in predictions:
+        scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}}
+
+        # Add trajectory statistics if traj_path exists
+        traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj")
+        if os.path.exists(traj_path):
+            traj_data = json.load(open(traj_path, "r"))
+            scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"])
+            scorecard["stats"]["traj_action_dist"] = dict(
+                Counter(
+                    [
+                        entry["action"].strip().split()[0]
+                        if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
+                        else None
+                        for entry in traj_data["history"]
+                    ]
+                )
+            )
+            scorecard["exit_status"] = (
+                traj_data["info"]["exit_status"]
+                if "exit_status" in traj_data["info"]
+                else "n/a"
+            )
+
+        # Check that a prediction was generated
+        #if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
+        #    scorecard["statuses"].append("not_generated")
+        #    scorecards.append(scorecard)
+        #    continue
+        scorecard["statuses"].append("generated")
+
+        # Get log file
+        log_path = os.path.join(
+            log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log"
+        )
+        if not os.path.exists(log_path):
+            scorecard["statuses"].append("build_failure")
+            scorecards.append(scorecard)
+            continue
+
+        # Get evaluation logs
+        eval_sm, found = get_logs_eval(log_path)
+
+        # Skip the remaining checks when no evaluation results were found in the log
+        if not found:
+            scorecards.append(scorecard)
+            continue
+        scorecard["statuses"].append("applied")
+
+        with open(log_path, "r") as f:
+            log_contents = f.read()
+            if INSTALL_FAIL in log_contents:
+                scorecard["statuses"].append("install_fail")
+
+        # Get resolution status
+        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
+        scorecard["test_results"] = {
+            "failure": {
+                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"],
+                "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"],
+            },
+            "success": {
+                "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
+                "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
+            }
+        }
+        resolution_status = get_resolution_status(report)
+        scorecard["statuses"].append(resolution_status)
+
+        diff_obj = PatchSet(p[KEY_PREDICTION])
+        scorecard["patch_files"] = [
+            x.path
+            for x in diff_obj.modified_files
+            + diff_obj.added_files
+            + diff_obj.removed_files
+        ]
+        scorecard["patch_lines_add"] = sum([f.added for f in diff_obj])
+        scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj])
+        scorecards.append(scorecard)
+
+    # Calculate cumulative results
+    get_ids_with_status = lambda x: [
+        s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"]
+    ]
+    report = {
+        "# Not Generated": len(get_ids_with_status("not_generated")),
+        "# Generated": len(get_ids_with_status("generated")),
+        "# Applied": len(get_ids_with_status("applied")),
+        "# Resolved": len(get_ids_with_status("RESOLVED_FULL")),
+        "# Install Fail": len(get_ids_with_status("install_fail")),
+    }
+    print(f"== Evaluation Report ==\n{report}")
+
+    report_exits = dict(
+        Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards])
+    )
+
+    # Save to summary, scorecard json
+    path_scorecards = os.path.join(directory, "scorecards.json")
+    with open(path_scorecards, "w") as f:
+        json.dump(scorecards, fp=f, indent=2)
+    print(f"- Wrote per-instance scorecards to {path_scorecards}")
+
+    path_results = os.path.join(directory, "results.json")
+    with open(path_results, "w") as f:
+        json.dump(
+            {
+                "report": report,
+                "report_exits": report_exits,
+                "not_generated": get_ids_with_status("not_generated"),
+                "generated": get_ids_with_status("generated"),
+                "applied": get_ids_with_status("applied"),
+                "resolved": get_ids_with_status("RESOLVED_FULL"),
+                "not_resolved": get_ids_with_status("RESOLVED_NO"),
+                "install_fail": get_ids_with_status("install_fail"),
+            },
+            fp=f,
+            indent=2,
+        )
+    print(f"- Wrote summary of run to {path_results}")
+
+    # Sanity check against get_model_report
+    report = get_model_report(
+        directory_name, pred_path_orig, swe_bench_tasks, log_dir
+    )
+    by_outcome = {}
+    by_outcome_func = lambda status: len(
+        [
+            instance_id
+            for _, v in report.items()
+            if isinstance(v, dict)
+            for instance_id in v[status]
+        ]
+    )
+    by_outcome["# Not Generated"] = by_outcome_func("none")
+    by_outcome["# Generated"] = by_outcome_func("generated")
+    by_outcome["# Applied"] = by_outcome_func("applied")
+    by_outcome["# Resolved"] = by_outcome_func("resolved")
+    by_outcome["# Install Fail"] = by_outcome_func("install_fail")
+    print(f"Reference Report:\n{by_outcome}")
+
+
+if __name__ == "__main__":
+    # Parse arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--predictions_path",
+        type=str,
+        help="Path to predictions file (.jsonl)",
+        required=True,
+    )
+    parser.add_argument(
+        "--log_dir", type=str, help="Path to log directory", required=True
+    )
+    parser.add_argument(
+        "--swe_bench_tasks",
+        type=str,
+        help="Path to SWE-bench task instances file",
+        required=True,
+    )
+
+    args = parser.parse_args()
+    main(**vars(args))
diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py
index 4505ee5..f4db804 100644
--- a/swebench/metrics/getters.py
+++ b/swebench/metrics/getters.py
@@ -121,11 +121,12 @@ def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]:
 
 test_passed = lambda case, sm: case in sm and sm[case] == TestStatus.PASSED.value
 
+test_missed = lambda case, sm: case not in sm
+
 test_failed = lambda case, sm: case not in sm or any(
     [sm[case] == status for status in [TestStatus.FAILED.value, TestStatus.ERROR.value]]
 )
 
-
 def get_eval_refs(data_path_or_name):
     decode_keys = False
     if os.path.isfile(data_path_or_name):
diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py
index 62a677f..100dbe1 100644
--- a/swebench/metrics/report.py
+++ b/swebench/metrics/report.py
@@ -17,7 +17,7 @@
     get_id_from_lp,
     test_failed,
     test_passed,
-    get_eval_refs,
+    get_eval_refs, test_missed,
 )
 from swebench.metrics.metrics import (
     compute_fail_to_pass_unweighted,
@@ -63,8 +63,11 @@ def get_eval_report(
     # Calculate resolution metrics
     f2p_success = []
     f2p_failure = []
+    f2p_missed = []
     for test_case in gold_results[FAIL_TO_PASS]:
-        if test_passed(test_case, eval_sm):
+        if test_missed(test_case, eval_sm):
+            f2p_missed.append(test_case)
+        elif test_passed(test_case, eval_sm):
             # Assume silent success for now (test case not in eval_sm)
             f2p_success.append(test_case)
         elif test_failed(test_case, eval_sm):
@@ -73,8 +76,11 @@ def get_eval_report(
     # Calculate maintenance metrics
     p2p_success = []
     p2p_failure = []
+    p2p_missed = []
     for test_case in gold_results[PASS_TO_PASS]:
-        if test_passed(test_case, eval_sm):
+        if test_missed(test_case, eval_sm):
+            p2p_missed.append(test_case)
+        elif test_passed(test_case, eval_sm):
             p2p_success.append(test_case)
         elif test_failed(test_case, eval_sm):
             p2p_failure.append(test_case)
@@ -83,10 +89,12 @@ def get_eval_report(
         FAIL_TO_PASS: {
             "success": f2p_success,
             "failure": f2p_failure,
+            "missed": f2p_missed
         },
         PASS_TO_PASS: {
             "success": p2p_success,
             "failure": p2p_failure,
+            "missed": p2p_missed
         }
     }