Skip to content

Commit c9279ab

Browse files
sjarmak and claude committed
fix: 3 verifier bugs — k8s-cri expected_changes mismatch, llamacpp RocketChat reset, PEP 508 regex
BUG 1: k8s-cri-containerd-reason-001 expected_changes.json was checking Pod storage/validation keywords, but the instruction asks to trace CRI RuntimeService gRPC to containerd. Aligned task_id, expected_files, expected_content, and added expected_patterns to match the CRI instruction.

BUG 2: llamacpp-file-modify-search-001 and llamacpp-context-window-search-001 test.sh verifiers called init.sh, which resets RocketChat services, destroying the agent's chat messages before eval.py could read them. Both configs scored 0.0. Fix: skip init.sh during verification (services already running from agent session).

BUG 3: 8 ccb_build validators.py files shared a regex that rejected PEP 508 environment markers (e.g., 'boto3==1.26.113 ; python_version >= "3.9"') and extras (e.g., "urllib3[socks]>=1.25.4"). The sg_only verifier wrapper converts Poetry pyproject.toml to requirements.txt with these markers. Fix: strip markers after ";" and extras in brackets before regex validation.

12 affected invalid SG_full result directories archived to runs/archive/verifier_bugs_20260220/.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 63a4770 commit c9279ab

File tree

11 files changed

+76
-15
lines changed

11 files changed

+76
-15
lines changed

benchmarks/ccb_build/cgen-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/codecoverage-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/dotenv-expand-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/dotnetkoans-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/eslint-markdown-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/iamactionhunter-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/pcap-parser-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_build/similar-asserts-deps-install-001/tests/validators.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ def _validate_pyproject_toml(self, path: Path) -> List[str]:
118118
@staticmethod
119119
def _is_valid_requirement(line: str) -> bool:
120120
"""Check if line is a valid requirement."""
121+
# Strip PEP 508 environment markers (everything after ";")
122+
# e.g., 'boto3==1.26.113 ; python_version >= "3.9"' -> 'boto3==1.26.113'
123+
line = line.split(";")[0].strip()
124+
# Strip extras specifiers e.g., "package[extra1,extra2]>=1.0" -> "package>=1.0"
125+
line = re.sub(r"\[.*?\]", "", line)
121126
# Pattern: package_name with optional version specifiers
122127
# e.g., "requests", "requests==2.28.0", "requests>=2.28.0,<3.0"
123128
pattern = r"^[a-zA-Z0-9\-_.]+(\s*[><=!~]+\s*[0-9\.\*]+)?(\s*,\s*[><=!~]+\s*[0-9\.\*]+)*$"

benchmarks/ccb_test/llamacpp-context-window-search-001/tests/test.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@ fi
1616
echo "Running TAC evaluator for sde-find-answer-in-codebase-1..."
1717
cd /utils
1818

19-
# Initialize TAC environment (needed for RocketChat)
20-
if [ -f "/utils/init.sh" ]; then
21-
SERVER_HOSTNAME="${TAC_SERVER_HOSTNAME:-localhost}" bash /utils/init.sh || true
22-
fi
19+
# NOTE: init.sh intentionally SKIPPED during verification.
20+
# init.sh resets RocketChat services, which destroys the agent's chat messages
21+
# before eval.py can read them. The services are already running from the agent's
22+
# session, so no re-initialization is needed for the verifier.
23+
# if [ -f "/utils/init.sh" ]; then
24+
# SERVER_HOSTNAME="${TAC_SERVER_HOSTNAME:-localhost}" bash /utils/init.sh || true
25+
# fi
2326

2427
DECRYPTION_KEY="${DECRYPTION_KEY:-theagentcompany is all you need}" \
2528
python_default /utils/eval.py \

benchmarks/ccb_test/llamacpp-file-modify-search-001/tests/test.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@ fi
1616
echo "Running TAC evaluator for sde-find-answer-in-codebase-2..."
1717
cd /utils
1818

19-
if [ -f "/utils/init.sh" ]; then
20-
SERVER_HOSTNAME="${TAC_SERVER_HOSTNAME:-localhost}" bash /utils/init.sh || true
21-
fi
19+
# NOTE: init.sh intentionally SKIPPED during verification.
20+
# init.sh resets RocketChat services, which destroys the agent's chat messages
21+
# before eval.py can read them. The services are already running from the agent's
22+
# session, so no re-initialization is needed for the verifier.
23+
# if [ -f "/utils/init.sh" ]; then
24+
# SERVER_HOSTNAME="${TAC_SERVER_HOSTNAME:-localhost}" bash /utils/init.sh || true
25+
# fi
2226

2327
DECRYPTION_KEY="${DECRYPTION_KEY:-theagentcompany is all you need}" \
2428
python_default /utils/eval.py \

0 commit comments

Comments
 (0)