From efe9703f5dc21ec3446f7c2033a070ef225a8a03 Mon Sep 17 00:00:00 2001
From: bft-codebot
Date: Tue, 24 Feb 2026 23:40:42 +0000
Subject: [PATCH] sync(bfmono): feat(gambit): improve verify report controls and harden concurrent calibrate persistence (+19 more) (bfmono@c56b7f52f)

This PR is an automated gambitmono sync of bfmono Gambit packages.

- Source: `packages/gambit/`
- Core: `packages/gambit/packages/gambit-core/`
- bfmono rev: c56b7f52f

Changes:
- c56b7f52f feat(gambit): improve verify report controls and harden concurrent calibrate persistence
- beb9435c0 feat(gambit-simulator-ui): extend listbox trigger and popover options
- 25f9fdcfc fix(gambit-simulator-ui): align verify outlier chip semantics and display
- a010b0ee1 feat(gambit-simulator-ui): add verify outliers to workbench chat chips
- 383f2500a refactor(simulator-ui): replace nested ternaries in main routing
- 13c4c8c22 fix(gambit): preserve shared references in safe session serialization
- ae392aa24 feat(gambit-simulator-ui): add grader error chips to workbench chat
- 1de6b335c fix(gambit): clamp deck-level maxTurns bounds in test run selection
- 01d7abbb9 fix(gambit): default verify tab bootstrap flag to enabled
- f3d186c7b fix(gambit): include extension schemas in exports and default serve to restored workspace
- a83b7cbe7 fix(gambit): move unbounded build timeout to deck opt-in
- acb2de627 fix(gambit): avoid strict json_schema 400s in openrouter responses
- 8aba573b6 fix(gambit-simulator-ui): treat errored calibrate runs as failed
- ca2028cf8 fix(gambit): prevent circular trace crashes in workspace test run API
- 7e41517e5 fix(gambit): make build assistant run timeout unbounded
- 24341143d feat(gambit): add deterministic verify fixture seeding
- ff2c2d33d feat(gambit): add verify tab consistency UI
- 91f0c93bb feat(gambit): add feature-flagged verify routing
- 1392f8b65 feat(gambit): support deck-level maxTurns override
- f5920ef86 chore(gambit): cut 0.8.5-rc.10

Do not edit this repo directly; make
changes in bfmono and re-run the sync. --- simulator-ui/src/GradePage.tsx | 3 + simulator-ui/src/VerifyPage.tsx | 388 ++++++++++++++++++++----------- simulator-ui/src/gds/Listbox.tsx | 53 ++++- simulator-ui/src/styles.ts | 60 ++++- src/server.ts | 23 +- src/server_streams.test.ts | 131 +++++++++++ 6 files changed, 503 insertions(+), 155 deletions(-) diff --git a/simulator-ui/src/GradePage.tsx b/simulator-ui/src/GradePage.tsx index d3fc9c05..44a4abca 100644 --- a/simulator-ui/src/GradePage.tsx +++ b/simulator-ui/src/GradePage.tsx @@ -807,6 +807,9 @@ function GradePage(
Grader runs + + ({filteredSessionRuns.length}) +
{runItems.length === 0 && ( diff --git a/simulator-ui/src/VerifyPage.tsx b/simulator-ui/src/VerifyPage.tsx index e7351c2d..b036eb3b 100644 --- a/simulator-ui/src/VerifyPage.tsx +++ b/simulator-ui/src/VerifyPage.tsx @@ -69,6 +69,14 @@ type VerifyRunSampleResponse = { error?: string; }; +type VerifyReportScope = "current_batch" | "all_matching"; +type VerifyExampleSort = + | "default" + | "delta_desc" + | "agreement_asc" + | "samples_desc" + | "label_asc"; + const parseScenarioRunSummary = (value: unknown): ScenarioRunSummary | null => { if (!value || typeof value !== "object") return null; const summary = value as Record; @@ -199,6 +207,11 @@ function VerifyPage( initialRunIds: [], requests: [], }); + const [reportScope, setReportScope] = useState( + "all_matching", + ); + const [inconsistentOnly, setInconsistentOnly] = useState(false); + const [exampleSort, setExampleSort] = useState("default"); const batchSeqRef = useRef(0); const updateVerifyPath = useCallback((sessionId: string | null) => { @@ -545,10 +558,10 @@ function VerifyPage( const reportRuns = useMemo( () => - completedBatchRuns.length > 0 + reportScope === "current_batch" ? completedBatchRuns : historicalCompletedRuns, - [completedBatchRuns, historicalCompletedRuns], + [completedBatchRuns, historicalCompletedRuns, reportScope], ); const consistencyReport = useMemo( @@ -569,7 +582,43 @@ function VerifyPage( batchState.status !== "running", ); - const topOutliers = consistencyReport.outliers.slice(0, 8); + const displayedOutliers = useMemo(() => { + const filtered = inconsistentOnly + ? consistencyReport.outliers.filter((outlier) => outlier.instability) + : consistencyReport.outliers; + if (exampleSort === "default") return filtered; + const next = [...filtered]; + if (exampleSort === "delta_desc") { + next.sort((a, b) => { + const aDelta = a.scoreDelta ?? -1; + const bDelta = b.scoreDelta ?? 
-1; + if (aDelta !== bDelta) return bDelta - aDelta; + return a.label.localeCompare(b.label); + }); + return next; + } + if (exampleSort === "agreement_asc") { + next.sort((a, b) => { + const aAgreement = a.agreementRate ?? Number.POSITIVE_INFINITY; + const bAgreement = b.agreementRate ?? Number.POSITIVE_INFINITY; + if (aAgreement !== bAgreement) return aAgreement - bAgreement; + return a.label.localeCompare(b.label); + }); + return next; + } + if (exampleSort === "samples_desc") { + next.sort((a, b) => { + if (a.sampleSize !== b.sampleSize) return b.sampleSize - a.sampleSize; + return a.label.localeCompare(b.label); + }); + return next; + } + next.sort((a, b) => a.label.localeCompare(b.label)); + return next; + }, [consistencyReport.outliers, exampleSort, inconsistentOnly]); + const reportScopeLabel = reportScope === "current_batch" + ? "current batch" + : "all matching runs"; const resolvedComposerChips = useMemo( () => composerChips ?? [], [composerChips], @@ -612,7 +661,7 @@ function VerifyPage( }, [onComposerChipsChange, resolvedComposerChips]); const buildOutlierChip = useCallback( - (outlier: typeof topOutliers[number]) => { + (outlier: typeof consistencyReport.outliers[number]) => { const chipId = `verify:${selectedSessionId ?? ""}:${outlier.key}`; const runId = outlier.maxRunId ?? outlier.minRunId; const score = outlier.maxScore ?? outlier.minScore ?? undefined; @@ -774,23 +823,11 @@ function VerifyPage( {!loading && ( <>
-
+
Batch status -
- - {batchState.status} - - {batchState.startedAt - ? ` · started ${ - formatTimestampShort(batchState.startedAt) - }` - : ""} - {batchState.finishedAt - ? ` · finished ${ - formatTimestampShort(batchState.finishedAt) - }` - : ""} -
+ + {batchState.status} +
{consistencyReport.sampleSize > 0 && ( )}
- {batchState.requested > 0 && ( -
- Queued: {queuedCount} - Running: {batchState.active} - Completed: {batchState.completed} - Failed: {batchState.failed} -
- )} {batchState.status === "idle" && consistencyReport.sampleSize === 0 && ( @@ -818,57 +847,87 @@ function VerifyPage( instability for the selected grader. )} - {consistencyReport.sampleSize > 0 && ( - <> -
-
+
+
+
+
Sample size
{consistencyReport.sampleSize}
-
-
Agreement rate
-
- {consistencyReport.agreementRate === null - ? "—" - : `${ - Math.round(consistencyReport.agreementRate * 100) - }%`} -
-
-
-
- Score spread (min/median/max) -
-
- {consistencyReport.scoreSpreadMin === null - ? "—" - : `${consistencyReport.scoreSpreadMin} / ${ - consistencyReport.scoreSpreadMedian ?? "—" - } / ${consistencyReport.scoreSpreadMax ?? "—"}`} -
-
-
-
- Instability count -
-
- {consistencyReport.instabilityCount} -
+
+ + setReportScope(value as VerifyReportScope)} + size="small" + popoverMatchTriggerWidth={false} + popoverMinWidth={320} + popoverAlign="right" + options={[ + { + value: "current_batch", + label: + `Current batch (${completedBatchRuns.length})`, + triggerLabel: "Current batch", + triggerMeta: null, + meta: "Only runs from the latest batch launch", + }, + { + value: "all_matching", + label: + `All matching runs (${historicalCompletedRuns.length})`, + triggerLabel: "All matching runs", + triggerMeta: null, + meta: + "All runs matching selected scenario + grader", + }, + ]} + />
- - {consistencyReport.verdictReason} - - +
+
+
Agreement rate
+
+ {consistencyReport.agreementRate === null + ? "—" + : `${Math.round(consistencyReport.agreementRate * 100)}%`} +
+
+
+
+ Score spread (min/median/max) +
+
+ {consistencyReport.scoreSpreadMin === null + ? "—" + : `${consistencyReport.scoreSpreadMin} / ${ + consistencyReport.scoreSpreadMedian ?? "—" + } / ${consistencyReport.scoreSpreadMax ?? "—"}`} +
+
+
+
+ Instability count +
+
+ {consistencyReport.instabilityCount} +
+
+
+ {consistencyReport.sampleSize > 0 && ( + + {consistencyReport.verdictReason} + )} Min sample size: {VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} @@ -889,17 +948,49 @@ function VerifyPage( {VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount}.
- Most inconsistent examples - {topOutliers.length === 0 +
+ Examples +
+ +
+ + setExampleSort(value as VerifyExampleSort)} + size="small" + options={[ + { value: "default", label: "Sort: default" }, + { value: "delta_desc", label: "Sort: score delta" }, + { + value: "agreement_asc", + label: "Sort: agreement", + }, + { value: "samples_desc", label: "Sort: sample size" }, + { value: "label_asc", label: "Sort: label" }, + ]} + /> +
+
+
+ {displayedOutliers.length === 0 ? ( - Inconsistent examples will appear here as soon as at least - one completed run is available in this batch. + {consistencyReport.outliers.length === 0 + ? `Examples will appear here as soon as at least one completed run is available in ${reportScopeLabel}.` + : `No examples match the current filters in ${reportScopeLabel}.`} ) : (
- {topOutliers.map((outlier) => { + {displayedOutliers.map((outlier) => { const runLinks = (() => { if (!selectedSessionId) return []; const ids = [ @@ -973,64 +1064,72 @@ function VerifyPage(
-
- agreement {outlier.agreementRate === null - ? "—" - : `${Math.round(outlier.agreementRate * 100)}%`} - {" "} - · delta {outlier.scoreDelta ?? "—"} · samples{" "} - {outlier.sampleSize} - {outlier.passFlip ? " · pass/fail flip" : ""} - {outlier.messageRefId - ? ` · ref ${outlier.messageRefId}` - : ""} -
- {(() => { - const outlierChip = buildOutlierChip(outlier); - const inChat = composerChipIds.has( - outlierChip.chipId, - ); - return ( -
- +
+
+
+ agreement {outlier.agreementRate === null + ? "—" + : `${ + Math.round(outlier.agreementRate * 100) + }%`} · delta {outlier.scoreDelta ?? "—"} + {" "} + · samples {outlier.sampleSize} + {outlier.passFlip ? " · pass/fail flip" : ""} + {outlier.messageRefId + ? ` · ref ${outlier.messageRefId}` + : ""}
- ); - })()} - {runLinks.length > 0 && ( -
- {runLinks.map((runId) => { - if (!selectedSessionId) return null; - const href = buildGradePath( - selectedSessionId, - runId, - ); - return ( - - handleInternalLinkClick(event, href)} - > - Open grade run {runId} - - ); - })} + {runLinks.length > 0 && ( +
+ {runLinks.map((runId) => { + if (!selectedSessionId) return null; + const href = buildGradePath( + selectedSessionId, + runId, + ); + return ( + + handleInternalLinkClick( + event, + href, + )} + > + Open grade run {runId} + + ); + })} +
+ )}
- )} + {(() => { + const outlierChip = buildOutlierChip(outlier); + const inChat = composerChipIds.has( + outlierChip.chipId, + ); + return ( +
+ +
+ ); + })()} +
); })} @@ -1040,6 +1139,29 @@ function VerifyPage( {batchState.requests.length > 0 && (
Batch requests + {(batchState.startedAt || + batchState.finishedAt) && ( +
+ {batchState.startedAt + ? `started ${ + formatTimestampShort(batchState.startedAt) + }` + : ""} + {batchState.finishedAt + ? ` · finished ${ + formatTimestampShort(batchState.finishedAt) + }` + : ""} +
+ )} + {batchState.requested > 0 && ( +
+ Queued: {queuedCount} + Running: {batchState.active} + Completed: {batchState.completed} + Failed: {batchState.failed} +
+ )}
    {batchState.requests.map((request, index) => (
  • (null); @@ -66,18 +76,39 @@ export default function Listbox(props: ListboxProps) { } return null; }, [options, value]); + const selectedTriggerMeta = useMemo(() => { + if (!selected) return null; + if ("triggerMeta" in selected) return selected.triggerMeta ?? null; + return selected.meta ?? null; + }, [selected]); const updatePopover = useCallback(() => { const trigger = triggerRef.current; if (!trigger) return; const rect = trigger.getBoundingClientRect(); - setPopoverStyle({ + const style: React.CSSProperties = { position: "fixed", top: rect.bottom + 6, - left: rect.left, - width: rect.width, - }); - }, []); + }; + if (popoverMatchTriggerWidth) { + style.width = rect.width; + } else if ( + typeof popoverMinWidth === "number" && Number.isFinite(popoverMinWidth) + ) { + style.minWidth = popoverMinWidth; + } + if (popoverAlign === "right") { + style.left = popoverMatchTriggerWidth + ? rect.right - rect.width + : rect.right; + if (!popoverMatchTriggerWidth) { + style.transform = "translateX(-100%)"; + } + } else { + style.left = rect.left; + } + setPopoverStyle(style); + }, [popoverAlign, popoverMatchTriggerWidth, popoverMinWidth]); useLayoutEffect(() => { if (!open) return; @@ -117,8 +148,12 @@ export default function Listbox(props: ListboxProps) { }; }, [open, updatePopover]); + const rootClassName = size === "small" + ? "gds-listbox gds-listbox--size-small" + : "gds-listbox"; + return ( -
    +
    {label && (