sourcegraph
diff --git a/agents/harnesses/openhands/agent.py b/agents/harnesses/openhands/agent.py
Lines changed: 13 additions & 0 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-051/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-051/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-115/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-115/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-118/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-118/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-185/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-185/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-186/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-186/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-193/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-193/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance/ccx-compliance-194/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance/ccx-compliance-194/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-057/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-057/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
diff --git a/benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-188/tests/oracle_checks.py b/benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-188/tests/oracle_checks.py
Lines changed: 6 additions & 2 deletions
@@ -136,6 +136,11 @@ def create_run_agent_commands(self, instruction: str):
             else:
                 env.pop("PYTHONPATH", None)
         env["PYTHONSAFEPATH"] = "1"
+        # Core-level env vars control LocalRuntime plugin loading.
+        # AGENT_ENABLE_JUPYTER only sets the agent config — the runtime
+        # reads the core-level config to decide whether to start jupyter.
+        env["ENABLE_JUPYTER"] = "false"
+        env["ENABLE_BROWSING"] = "false"
         env["AGENT_ENABLE_JUPYTER"] = "false"
         env["AGENT_ENABLE_BROWSING"] = "false"
 
@@ -294,6 +299,14 @@ def _build_workspace_guidance(self, workdir: str) -> str:
 
     def _build_config_toml(self, mcp_url: str | None = None) -> str:
         lines = [
+            # Core-level settings control plugin loading in LocalRuntime.
+            # Without [core], enable_jupyter under [agent] is ignored and
+            # the jupyter-kernelgateway plugin starts, fails to bind, and
+            # crashes with tenacity.RetryError in _wait_until_alive.
+            "[core]",
+            "enable_jupyter = false",
+            "enable_browsing = false",
+            "",
             "[agent]",
             f"enable_mcp = {'true' if mcp_url else 'false'}",
             "enable_jupyter = false",
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1
 
@@ -515,9 +515,13 @@ def check_test_ratio(
             elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
                 failed += 1
 
-        # Fallback: if no structured output, use exit code
+        # Fallback: if no structured output, use exit code — but only when
+        # there is actual test output.  An empty/trivial exit-0 (e.g. the
+        # agent made no changes and the test script returned cleanly) must
+        # NOT be scored as a passing test.
         if passed == 0 and failed == 0:
-            if proc.returncode == 0:
+            has_output = len(output.strip()) > 0
+            if proc.returncode == 0 and has_output:
                 passed = 1
             else:
                 failed = 1