Skip to content

Commit 49362b4

Browse files
sjarmak and claude
committed
Fix OpenHands Daytona crash + add no-changes guard to verifiers (ki9)
OpenHands: move enable_jupyter/enable_browsing to [core] TOML section and add core-level env vars so LocalRuntime doesn't start jupyter plugin. Verifiers: add git-based no-changes guard to 6 test_ratio test.sh files, 38 repo_state_heuristic test.sh files, and fix exit-code fallback in 273 oracle_checks.py copies. Prevents false-positive scores when agent makes no changes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 13be28f commit 49362b4

File tree

318 files changed

+2305
-546
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

318 files changed

+2305
-546
lines changed

agents/harnesses/openhands/agent.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,11 @@ def create_run_agent_commands(self, instruction: str):
136136
else:
137137
env.pop("PYTHONPATH", None)
138138
env["PYTHONSAFEPATH"] = "1"
139+
# Core-level env vars control LocalRuntime plugin loading.
140+
# AGENT_ENABLE_JUPYTER only sets the agent config — the runtime
141+
# reads the core-level config to decide whether to start jupyter.
142+
env["ENABLE_JUPYTER"] = "false"
143+
env["ENABLE_BROWSING"] = "false"
139144
env["AGENT_ENABLE_JUPYTER"] = "false"
140145
env["AGENT_ENABLE_BROWSING"] = "false"
141146

@@ -294,6 +299,14 @@ def _build_workspace_guidance(self, workdir: str) -> str:
294299

295300
def _build_config_toml(self, mcp_url: str | None = None) -> str:
296301
lines = [
302+
# Core-level settings control plugin loading in LocalRuntime.
303+
# Without [core], enable_jupyter under [agent] is ignored and
304+
# the jupyter-kernelgateway plugin starts, fails to bind, and
305+
# crashes with tenacity.RetryError in _wait_until_alive.
306+
"[core]",
307+
"enable_jupyter = false",
308+
"enable_browsing = false",
309+
"",
297310
"[agent]",
298311
f"enable_mcp = {'true' if mcp_url else 'false'}",
299312
"enable_jupyter = false",

benchmarks/backups/csb_org_compliance/ccx-compliance-051/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-115/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-118/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-185/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-186/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-193/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance/ccx-compliance-194/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-057/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

benchmarks/backups/csb_org_compliance_doe_trim/ccx-compliance-188/tests/oracle_checks.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -515,9 +515,13 @@ def check_test_ratio(
515515
elif line_stripped.startswith("FAIL") or line_stripped.startswith("--- FAIL"):
516516
failed += 1
517517

518-
# Fallback: if no structured output, use exit code
518+
# Fallback: if no structured output, use exit code — but only when
519+
# there is actual test output. An empty/trivial exit-0 (e.g. the
520+
# agent made no changes and the test script returned cleanly) must
521+
# NOT be scored as a passing test.
519522
if passed == 0 and failed == 0:
520-
if proc.returncode == 0:
523+
has_output = len(output.strip()) > 0
524+
if proc.returncode == 0 and has_output:
521525
passed = 1
522526
else:
523527
failed = 1

0 commit comments

Comments
 (0)