bd: backup 2026-03-09 13:11

sjarmak · sjarmak · commit b2729266b7c8 · 2026-03-09T13:11:58.000Z
diff --git a/.beads/backup/backup_state.json b/.beads/backup/backup_state.json
@@ -1,10 +1,10 @@
 {
-  "last_dolt_commit": "fj0jnab8p06ihk8as5sdvq1jl0eariu0",
+  "last_dolt_commit": "67q9ierl2g4ba1t4vul82u8co2pilg9o",
   "last_event_id": 0,
-  "timestamp": "2026-03-09T12:51:48.351460445Z",
+  "timestamp": "2026-03-09T13:11:58.45256211Z",
   "counts": {
-    "issues": 7,
-    "events": 19,
+    "issues": 9,
+    "events": 22,
     "comments": 0,
     "dependencies": 5,
     "labels": 0,
diff --git a/.beads/backup/events.jsonl b/.beads/backup/events.jsonl
@@ -17,3 +17,6 @@
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:26:58Z","event_type":"status_changed","id":17,"issue_id":"CodeScaleBench-utv","new_value":"{\"status\":\"in_progress\"}","old_value":"{\"id\":\"CodeScaleBench-utv\",\"title\":\"Rebuild unified manifest with power-optimized task-type balance\",\"description\":\"Rebuild the core benchmark manifest as a single unified set (no SDLC vs Org split). Optimize selection for: (1) 80% power for overall retrieval effect, (2) balanced task-type representation (comprehension/implementation/quality), (3) multi-repo coverage in every task type, (4) LOC band diversity with emphasis on large codebases (2M+ LOC). Target ~280-300 tasks based on power analysis. Every task has both deterministic reward and IR retrieval scoring.\",\"status\":\"open\",\"priority\":3,\"issue_type\":\"task\",\"owner\":\"sjarmak@users.noreply.github.com\",\"created_at\":\"2026-03-07T22:56:46Z\",\"created_by\":\"sjarmak\",\"updated_at\":\"2026-03-07T22:56:46Z\"}"}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:33:04Z","event_type":"closed","id":18,"issue_id":"CodeScaleBench-utv","new_value":"Built unified 280-task manifest (schema v2.0). comprehension=100, implementation=90, quality=90. Overall power=84.1% at sigma=0.20. Large codebase 58.6%, multi-repo 31.8%, 20 suites, 11 languages. LOC fallback chain eliminates all unknowns.","old_value":""}
 {"actor":"sjarmak","comment":null,"created_at":"2026-03-07T23:33:07Z","event_type":"closed","id":19,"issue_id":"CodeScaleBench-xjg","new_value":"Epic complete: (1) IR scoring added to SDLC tasks (ggy), (2) 67 Org tasks got deterministic verifiers (c17), (3) unified 280-task manifest built (utv). No more SDLC/Org split.","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T12:51:57Z","event_type":"created","id":20,"issue_id":"CodeScaleBench-03c","new_value":"","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T13:05:06Z","event_type":"closed","id":21,"issue_id":"CodeScaleBench-03c","new_value":"Done","old_value":""}
+{"actor":"sjarmak","comment":null,"created_at":"2026-03-09T13:11:58Z","event_type":"created","id":22,"issue_id":"CodeScaleBench-2kz","new_value":"","old_value":""}
diff --git a/.beads/backup/issues.jsonl b/.beads/backup/issues.jsonl
@@ -1,3 +1,5 @@
+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Done","closed_at":"2026-03-09T13:05:07Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"fd09967d3c4f6c86a02ac44968cff45a310e9ceb4d1626ef3f2592c790aaa6bf","created_at":"2026-03-09T12:51:57Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Investigate whether clickhouse-mergetree-arch-understand-001 truly needs \u003e10G storage and, if possible, replace the static exception with measured repo-size/routing metadata. Also expand the registry smoke matrix if future harness regressions show uncovered task families.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-03c","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Audit ClickHouse storage exception and expand smoke coverage","updated_at":"2026-03-09T13:05:07Z","waiters":"","wisp_type":"","work_type":""}
+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e71c16398c3f1178357f507e5f0abb54c84894426f6821e51b482bb84b4a1910","created_at":"2026-03-09T13:11:58Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Run a focused verification batch to prove the current task-contract and harness hardening eliminates the earlier random patch churn.\n\nScope:\n- Claude Code regression sentinels:\n  - mcp_ccx-onboard-search-207\n  - mcp_ccx-onboard-search-208\n  - mcp_ccx-onboard-search-210\n  - mcp_bustub-hyperloglog-impl-001\n  - mcp_django-sensitive-file-exclusion-001\n  - mcp_flink-window-late-data-fix-001\n  - mcp_element-web-unread-indicators-diverge-fix-001\n  - clickhouse-mergetree-arch-understand-001 (confirm Daytona/local routing now that storage metadata was corrected)\n- OpenHands regression sentinel:\n  - ccx-onboard-search-212\n\nAcceptance criteria:\n- Produce a small rerun manifest or manifests for the tasks above.\n- Execute the reruns once accounts are ready.\n- Confirm whether each task now completes as a valid run without ad hoc task-specific patches.\n- Record any remaining failures as either harness bugs, task bugs, or infra issues with exact root cause.\n- If clean, note which tasks should remain in the smoke/verification matrix as permanent regression sentinels.\n","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-2kz","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":1,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"Verify harness fixes by rerunning historical Claude/OpenHands failures","updated_at":"2026-03-09T13:11:58Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e57ed0ffb8999cc5708e3fbe9fa45f6a2e6461b45004c38ec33b54abfd14e753","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Analyze current SDLC coverage gaps: multi-repo (only 15/171), large codebases (only 2 tasks in 8M-40M, 0 in \u003e40M), and task-type balance. Select ~60-80 Org tasks for promotion that maximize: (1) multi-repo representation across all SDLC suites, (2) large codebase coverage (prioritize 2M+ LOC), (3) task-type balance across comprehension/implementation/quality. Produce a promotion manifest with target suite, verifier approach, and priority ranking.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-5p1","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Select Org→SDLC promotion candidates optimized for coverage gaps","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"d581391bafd28d416539191f5b91d255b0832d75fccc535e206157b820ddbeec","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Select Org tasks that naturally map to SDLC phases and add deterministic verifiers. Priority: multi-repo tasks from large codebases that fill gaps in SDLC coverage. Natural mappings: incident→debug, security/compliance→secure, migration→refactor, onboarding/domain→understand, crossrepo/crossrepo_tracing→design. For each promoted task, identify the most straightforward deterministic verifier approach matching the target SDLC suite's pattern. Focus on tasks where the oracle_checks.py already does structured validation that can be made deterministic.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-aav","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Map and promote Org tasks to SDLC categories","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
 {"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Taxonomy defined: comprehension/implementation/quality. Mapped all 20 suites and 477 tasks. Manifest: 40/37/23% split. configs/task_type_taxonomy.json + task_type field on all tasks.","closed_at":"2026-03-07T22:59:37Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"cac4323aa5802e3e8dca37694c0f3c50c9dacf7ab21a04cf5e65a0bd3b7712a2","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Formalize the three task-type buckets that cut across suites: Comprehension (understand, design, document, onboarding, domain), Implementation (feature, fix, refactor, migration), Quality (test, debug, secure, compliance, incident). Add task_type field to selected_benchmark_tasks.json. Map existing SUITE_TO_PROFILE curator profiles to these three buckets. This taxonomy enables power analysis and balanced selection across task types, not just suites.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-abl","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Define task-type taxonomy: comprehension / implementation / quality","updated_at":"2026-03-07T22:59:37Z","waiters":"","wisp_type":"","work_type":""}

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Done","closed_at":"2026-03-09T13:05:07Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"fd09967d3c4f6c86a02ac44968cff45a310e9ceb4d1626ef3f2592c790aaa6bf","created_at":"2026-03-09T12:51:57Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Investigate whether clickhouse-mergetree-arch-understand-001 truly needs \u003e10G storage and, if possible, replace the static exception with measured repo-size/routing metadata. Also expand the registry smoke matrix if future harness regressions show uncovered task families.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-03c","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Audit ClickHouse storage exception and expand smoke coverage","updated_at":"2026-03-09T13:05:07Z","waiters":"","wisp_type":"","work_type":""}
	`2`	+{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"","closed_at":null,"closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e71c16398c3f1178357f507e5f0abb54c84894426f6821e51b482bb84b4a1910","created_at":"2026-03-09T13:11:58Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Run a focused verification batch to prove the current task-contract and harness hardening eliminates the earlier random patch churn.\n\nScope:\n- Claude Code regression sentinels:\n - mcp_ccx-onboard-search-207\n - mcp_ccx-onboard-search-208\n - mcp_ccx-onboard-search-210\n - mcp_bustub-hyperloglog-impl-001\n - mcp_django-sensitive-file-exclusion-001\n - mcp_flink-window-late-data-fix-001\n - mcp_element-web-unread-indicators-diverge-fix-001\n - clickhouse-mergetree-arch-understand-001 (confirm Daytona/local routing now that storage metadata was corrected)\n- OpenHands regression sentinel:\n - ccx-onboard-search-212\n\nAcceptance criteria:\n- Produce a small rerun manifest or manifests for the tasks above.\n- Execute the reruns once accounts are ready.\n- Confirm whether each task now completes as a valid run without ad hoc task-specific patches.\n- Record any remaining failures as either harness bugs, task bugs, or infra issues with exact root cause.\n- If clean, note which tasks should remain in the smoke/verification matrix as permanent regression sentinels.\n","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-2kz","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":1,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"open","target":"","timeout_ns":0,"title":"Verify harness fixes by rerunning historical Claude/OpenHands failures","updated_at":"2026-03-09T13:11:58Z","waiters":"","wisp_type":"","work_type":""}
`1`	`3`	{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"e57ed0ffb8999cc5708e3fbe9fa45f6a2e6461b45004c38ec33b54abfd14e753","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Analyze current SDLC coverage gaps: multi-repo (only 15/171), large codebases (only 2 tasks in 8M-40M, 0 in \u003e40M), and task-type balance. Select ~60-80 Org tasks for promotion that maximize: (1) multi-repo representation across all SDLC suites, (2) large codebase coverage (prioritize 2M+ LOC), (3) task-type balance across comprehension/implementation/quality. Produce a promotion manifest with target suite, verifier approach, and priority ranking.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-5p1","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Select Org→SDLC promotion candidates optimized for coverage gaps","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
`2`	`4`	{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Defined Org→SDLC mapping for all 11 org suites. Selected 67 promotion candidates: all multi-repo, 84% 2M+ LOC, balanced across 6 target SDLC suites. configs/org_promotion_manifest.json.","closed_at":"2026-03-07T23:01:01Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"d581391bafd28d416539191f5b91d255b0832d75fccc535e206157b820ddbeec","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Select Org tasks that naturally map to SDLC phases and add deterministic verifiers. Priority: multi-repo tasks from large codebases that fill gaps in SDLC coverage. Natural mappings: incident→debug, security/compliance→secure, migration→refactor, onboarding/domain→understand, crossrepo/crossrepo_tracing→design. For each promoted task, identify the most straightforward deterministic verifier approach matching the target SDLC suite's pattern. Focus on tasks where the oracle_checks.py already does structured validation that can be made deterministic.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-aav","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Map and promote Org tasks to SDLC categories","updated_at":"2026-03-07T23:01:01Z","waiters":"","wisp_type":"","work_type":""}
`3`	`5`	{"acceptance_criteria":"","actor":"","agent_state":"","assignee":null,"await_id":"","await_type":"","close_reason":"Taxonomy defined: comprehension/implementation/quality. Mapped all 20 suites and 477 tasks. Manifest: 40/37/23% split. configs/task_type_taxonomy.json + task_type field on all tasks.","closed_at":"2026-03-07T22:59:37Z","closed_by_session":"","compacted_at":null,"compacted_at_commit":null,"compaction_level":0,"content_hash":"cac4323aa5802e3e8dca37694c0f3c50c9dacf7ab21a04cf5e65a0bd3b7712a2","created_at":"2026-03-07T22:56:46Z","created_by":"sjarmak","crystallizes":0,"defer_until":null,"description":"Formalize the three task-type buckets that cut across suites: Comprehension (understand, design, document, onboarding, domain), Implementation (feature, fix, refactor, migration), Quality (test, debug, secure, compliance, incident). Add task_type field to selected_benchmark_tasks.json. Map existing SUITE_TO_PROFILE curator profiles to these three buckets. This taxonomy enables power analysis and balanced selection across task types, not just suites.","design":"","due_at":null,"ephemeral":0,"estimated_minutes":null,"event_kind":"","external_ref":null,"hook_bead":"","id":"CodeScaleBench-abl","is_template":0,"issue_type":"task","last_activity":null,"metadata":"{}","mol_type":"","notes":"","original_size":null,"owner":"sjarmak@users.noreply.github.com","payload":"","pinned":0,"priority":2,"quality_score":null,"rig":"","role_bead":"","role_type":"","sender":"","source_repo":"","source_system":"","spec_id":"","status":"closed","target":"","timeout_ns":0,"title":"Define task-type taxonomy: comprehension / implementation / quality","updated_at":"2026-03-07T22:59:37Z","waiters":"","wisp_type":"","work_type":""}