From efe9703f5dc21ec3446f7c2033a070ef225a8a03 Mon Sep 17 00:00:00 2001
From: bft-codebot <bft-codebot@users.noreply.github.com>
Date: Tue, 24 Feb 2026 23:40:42 +0000
Subject: [PATCH] sync(bfmono): feat(gambit): improve verify report controls
 and harden concurrent calibrate persistence (+19 more) (bfmono@c56b7f52f)

This PR is an automated gambitmono sync of bfmono Gambit packages.

- Source: `packages/gambit/`
- Core: `packages/gambit/packages/gambit-core/`
- bfmono rev: c56b7f52f

Changes:
- c56b7f52f feat(gambit): improve verify report controls and harden concurrent calibrate persistence
- beb9435c0 feat(gambit-simulator-ui): extend listbox trigger and popover options
- 25f9fdcfc fix(gambit-simulator-ui): align verify outlier chip semantics and display
- a010b0ee1 feat(gambit-simulator-ui): add verify outliers to workbench chat chips
- 383f2500a refactor(simulator-ui): replace nested ternaries in main routing
- 13c4c8c22 fix(gambit): preserve shared references in safe session serialization
- ae392aa24 feat(gambit-simulator-ui): add grader error chips to workbench chat
- 1de6b335c fix(gambit): clamp deck-level maxTurns bounds in test run selection
- 01d7abbb9 fix(gambit): default verify tab bootstrap flag to enabled
- f3d186c7b fix(gambit): include extension schemas in exports and default serve to restored workspace
- a83b7cbe7 fix(gambit): move unbounded build timeout to deck opt-in
- acb2de627 fix(gambit): avoid strict json_schema 400s in openrouter responses
- 8aba573b6 fix(gambit-simulator-ui): treat errored calibrate runs as failed
- ca2028cf8 fix(gambit): prevent circular trace crashes in workspace test run API
- 7e41517e5 fix(gambit): make build assistant run timeout unbounded
- 24341143d feat(gambit): add deterministic verify fixture seeding
- ff2c2d33d feat(gambit): add verify tab consistency UI
- 91f0c93bb feat(gambit): add feature-flagged verify routing
- 1392f8b65 feat(gambit): support deck-level maxTurns override
- f5920ef86 chore(gambit): cut 0.8.5-rc.10

Do not edit this repo directly; make changes in bfmono and re-run the sync.
---
 simulator-ui/src/GradePage.tsx   |   3 +
 simulator-ui/src/VerifyPage.tsx  | 388 ++++++++++++++++++++-----------
 simulator-ui/src/gds/Listbox.tsx |  53 ++++-
 simulator-ui/src/styles.ts       |  60 ++++-
 src/server.ts                    |  23 +-
 src/server_streams.test.ts       | 131 +++++++++++
 6 files changed, 503 insertions(+), 155 deletions(-)
diff --git a/simulator-ui/src/GradePage.tsx b/simulator-ui/src/GradePage.tsx
index d3fc9c05..44a4abca 100644
--- a/simulator-ui/src/GradePage.tsx
+++ b/simulator-ui/src/GradePage.tsx
@@ -807,6 +807,9 @@ function GradePage(
               <div className="flex-column gap-4">
                 <div className="flex-row items-center gap-8">
                   <strong>Grader runs</strong>
+                  <span className="secondary-note">
+                    ({filteredSessionRuns.length})
+                  </span>
                 </div>
               </div>
               {runItems.length === 0 && (
diff --git a/simulator-ui/src/VerifyPage.tsx b/simulator-ui/src/VerifyPage.tsx
index e7351c2d..b036eb3b 100644
--- a/simulator-ui/src/VerifyPage.tsx
+++ b/simulator-ui/src/VerifyPage.tsx
@@ -69,6 +69,14 @@ type VerifyRunSampleResponse = {
   error?: string;
 };
 
+type VerifyReportScope = "current_batch" | "all_matching";
+type VerifyExampleSort =
+  | "default"
+  | "delta_desc"
+  | "agreement_asc"
+  | "samples_desc"
+  | "label_asc";
+
 const parseScenarioRunSummary = (value: unknown): ScenarioRunSummary | null => {
   if (!value || typeof value !== "object") return null;
   const summary = value as Record<string, unknown>;
@@ -199,6 +207,11 @@ function VerifyPage(
     initialRunIds: [],
     requests: [],
   });
+  const [reportScope, setReportScope] = useState<VerifyReportScope>(
+    "all_matching",
+  );
+  const [inconsistentOnly, setInconsistentOnly] = useState(false);
+  const [exampleSort, setExampleSort] = useState<VerifyExampleSort>("default");
   const batchSeqRef = useRef(0);
 
   const updateVerifyPath = useCallback((sessionId: string | null) => {
@@ -545,10 +558,10 @@ function VerifyPage(
 
   const reportRuns = useMemo(
     () =>
-      completedBatchRuns.length > 0
+      reportScope === "current_batch"
         ? completedBatchRuns
         : historicalCompletedRuns,
-    [completedBatchRuns, historicalCompletedRuns],
+    [completedBatchRuns, historicalCompletedRuns, reportScope],
   );
 
   const consistencyReport = useMemo(
@@ -569,7 +582,43 @@ function VerifyPage(
       batchState.status !== "running",
   );
 
-  const topOutliers = consistencyReport.outliers.slice(0, 8);
+  const displayedOutliers = useMemo(() => {
+    const filtered = inconsistentOnly
+      ? consistencyReport.outliers.filter((outlier) => outlier.instability)
+      : consistencyReport.outliers;
+    if (exampleSort === "default") return filtered;
+    const next = [...filtered];
+    if (exampleSort === "delta_desc") {
+      next.sort((a, b) => {
+        const aDelta = a.scoreDelta ?? -1;
+        const bDelta = b.scoreDelta ?? -1;
+        if (aDelta !== bDelta) return bDelta - aDelta;
+        return a.label.localeCompare(b.label);
+      });
+      return next;
+    }
+    if (exampleSort === "agreement_asc") {
+      next.sort((a, b) => {
+        const aAgreement = a.agreementRate ?? Number.POSITIVE_INFINITY;
+        const bAgreement = b.agreementRate ?? Number.POSITIVE_INFINITY;
+        if (aAgreement !== bAgreement) return aAgreement - bAgreement;
+        return a.label.localeCompare(b.label);
+      });
+      return next;
+    }
+    if (exampleSort === "samples_desc") {
+      next.sort((a, b) => {
+        if (a.sampleSize !== b.sampleSize) return b.sampleSize - a.sampleSize;
+        return a.label.localeCompare(b.label);
+      });
+      return next;
+    }
+    next.sort((a, b) => a.label.localeCompare(b.label));
+    return next;
+  }, [consistencyReport.outliers, exampleSort, inconsistentOnly]);
+  const reportScopeLabel = reportScope === "current_batch"
+    ? "current batch"
+    : "all matching runs";
   const resolvedComposerChips = useMemo(
     () => composerChips ?? [],
     [composerChips],
@@ -612,7 +661,7 @@ function VerifyPage(
   }, [onComposerChipsChange, resolvedComposerChips]);
 
   const buildOutlierChip = useCallback(
-    (outlier: typeof topOutliers[number]) => {
+    (outlier: typeof consistencyReport.outliers[number]) => {
       const chipId = `verify:${selectedSessionId ?? ""}:${outlier.key}`;
       const runId = outlier.maxRunId ?? outlier.minRunId;
       const score = outlier.maxScore ?? outlier.minScore ?? undefined;
@@ -774,23 +823,11 @@ function VerifyPage(
           {!loading && (
             <>
               <div className="verify-status-row">
-                <div className="verify-status-main">
+                <div className="verify-status-main flex-row items-center gap-8">
                   <strong>Batch status</strong>
-                  <div className="verify-status-meta">
-                    <Badge status={batchState.status}>
-                      {batchState.status}
-                    </Badge>
-                    {batchState.startedAt
-                      ? ` · started ${
-                        formatTimestampShort(batchState.startedAt)
-                      }`
-                      : ""}
-                    {batchState.finishedAt
-                      ? ` · finished ${
-                        formatTimestampShort(batchState.finishedAt)
-                      }`
-                      : ""}
-                  </div>
+                  <Badge status={batchState.status}>
+                    {batchState.status}
+                  </Badge>
                 </div>
                 {consistencyReport.sampleSize > 0 && (
                   <span
@@ -803,14 +840,6 @@ function VerifyPage(
                   </span>
                 )}
               </div>
-              {batchState.requested > 0 && (
-                <div className="verify-progress-row">
-                  <span>Queued: {queuedCount}</span>
-                  <span>Running: {batchState.active}</span>
-                  <span>Completed: {batchState.completed}</span>
-                  <span>Failed: {batchState.failed}</span>
-                </div>
-              )}
               {batchState.status === "idle" &&
                 consistencyReport.sampleSize === 0 && (
                 <Callout>
@@ -818,57 +847,87 @@ function VerifyPage(
                   instability for the selected grader.
                 </Callout>
               )}
-              {consistencyReport.sampleSize > 0 && (
-                <>
-                  <div className="verify-metric-grid">
-                    <div className="verify-metric-card">
+              <div className="verify-metric-grid">
+                <div className="verify-metric-card">
+                  <div className="verify-sample-size-row">
+                    <div className="verify-sample-size-copy">
                       <div className="verify-metric-label">Sample size</div>
                       <div className="verify-metric-value">
                         {consistencyReport.sampleSize}
                       </div>
                     </div>
-                    <div className="verify-metric-card">
-                      <div className="verify-metric-label">Agreement rate</div>
-                      <div className="verify-metric-value">
-                        {consistencyReport.agreementRate === null
-                          ? "—"
-                          : `${
-                            Math.round(consistencyReport.agreementRate * 100)
-                          }%`}
-                      </div>
-                    </div>
-                    <div className="verify-metric-card">
-                      <div className="verify-metric-label">
-                        Score spread (min/median/max)
-                      </div>
-                      <div className="verify-metric-value verify-metric-value--compact">
-                        {consistencyReport.scoreSpreadMin === null
-                          ? "—"
-                          : `${consistencyReport.scoreSpreadMin} / ${
-                            consistencyReport.scoreSpreadMedian ?? "—"
-                          } / ${consistencyReport.scoreSpreadMax ?? "—"}`}
-                      </div>
-                    </div>
-                    <div className="verify-metric-card">
-                      <div className="verify-metric-label">
-                        Instability count
-                      </div>
-                      <div className="verify-metric-value">
-                        {consistencyReport.instabilityCount}
-                      </div>
+                    <div className="verify-sample-scope-select">
+                      <Listbox
+                        value={reportScope}
+                        onChange={(value) =>
+                          setReportScope(value as VerifyReportScope)}
+                        size="small"
+                        popoverMatchTriggerWidth={false}
+                        popoverMinWidth={320}
+                        popoverAlign="right"
+                        options={[
+                          {
+                            value: "current_batch",
+                            label:
+                              `Current batch (${completedBatchRuns.length})`,
+                            triggerLabel: "Current batch",
+                            triggerMeta: null,
+                            meta: "Only runs from the latest batch launch",
+                          },
+                          {
+                            value: "all_matching",
+                            label:
+                              `All matching runs (${historicalCompletedRuns.length})`,
+                            triggerLabel: "All matching runs",
+                            triggerMeta: null,
+                            meta:
+                              "All runs matching selected scenario + grader",
+                          },
+                        ]}
+                      />
                     </div>
                   </div>
-                  <Callout
-                    variant={consistencyReport.verdict === "FAIL"
-                      ? "danger"
-                      : consistencyReport.verdict === "WARN"
-                      ? "emphasis"
-                      : "muted"}
-                    title={`Verdict: ${consistencyReport.verdict}`}
-                  >
-                    {consistencyReport.verdictReason}
-                  </Callout>
-                </>
+                </div>
+                <div className="verify-metric-card">
+                  <div className="verify-metric-label">Agreement rate</div>
+                  <div className="verify-metric-value">
+                    {consistencyReport.agreementRate === null
+                      ? "—"
+                      : `${Math.round(consistencyReport.agreementRate * 100)}%`}
+                  </div>
+                </div>
+                <div className="verify-metric-card">
+                  <div className="verify-metric-label">
+                    Score spread (min/median/max)
+                  </div>
+                  <div className="verify-metric-value verify-metric-value--compact">
+                    {consistencyReport.scoreSpreadMin === null
+                      ? "—"
+                      : `${consistencyReport.scoreSpreadMin} / ${
+                        consistencyReport.scoreSpreadMedian ?? "—"
+                      } / ${consistencyReport.scoreSpreadMax ?? "—"}`}
+                  </div>
+                </div>
+                <div className="verify-metric-card">
+                  <div className="verify-metric-label">
+                    Instability count
+                  </div>
+                  <div className="verify-metric-value">
+                    {consistencyReport.instabilityCount}
+                  </div>
+                </div>
+              </div>
+              {consistencyReport.sampleSize > 0 && (
+                <Callout
+                  variant={consistencyReport.verdict === "FAIL"
+                    ? "danger"
+                    : consistencyReport.verdict === "WARN"
+                    ? "emphasis"
+                    : "muted"}
+                  title={`Verdict: ${consistencyReport.verdict}`}
+                >
+                  {consistencyReport.verdictReason}
+                </Callout>
               )}
               <Callout title="Thresholds in code">
                 Min sample size: {VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize}
@@ -889,17 +948,49 @@ function VerifyPage(
                 {VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount}.
               </Callout>
               <div className="verify-section">
-                <strong>Most inconsistent examples</strong>
-                {topOutliers.length === 0
+                <div className="verify-section-header">
+                  <strong>Examples</strong>
+                  <div className="verify-section-controls">
+                    <Button
+                      variant={inconsistentOnly
+                        ? "primary-deemph"
+                        : "secondary"}
+                      size="small"
+                      onClick={() => setInconsistentOnly((prev) => !prev)}
+                    >
+                      Inconsistent
+                    </Button>
+                    <div className="verify-section-sort">
+                      <Listbox
+                        value={exampleSort}
+                        onChange={(value) =>
+                          setExampleSort(value as VerifyExampleSort)}
+                        size="small"
+                        options={[
+                          { value: "default", label: "Sort: default" },
+                          { value: "delta_desc", label: "Sort: score delta" },
+                          {
+                            value: "agreement_asc",
+                            label: "Sort: agreement",
+                          },
+                          { value: "samples_desc", label: "Sort: sample size" },
+                          { value: "label_asc", label: "Sort: label" },
+                        ]}
+                      />
+                    </div>
+                  </div>
+                </div>
+                {displayedOutliers.length === 0
                   ? (
                     <Callout>
-                      Inconsistent examples will appear here as soon as at least
-                      one completed run is available in this batch.
+                      {consistencyReport.outliers.length === 0
+                        ? `Examples will appear here as soon as at least one completed run is available in ${reportScopeLabel}.`
+                        : `No examples match the current filters in ${reportScopeLabel}.`}
                     </Callout>
                   )
                   : (
                     <div className="verify-outlier-list">
-                      {topOutliers.map((outlier) => {
+                      {displayedOutliers.map((outlier) => {
                         const runLinks = (() => {
                           if (!selectedSessionId) return [];
                           const ids = [
@@ -973,64 +1064,72 @@ function VerifyPage(
                                 </Badge>
                               </div>
                             </div>
-                            <div className="verify-outlier-meta">
-                              agreement {outlier.agreementRate === null
-                                ? "—"
-                                : `${Math.round(outlier.agreementRate * 100)}%`}
-                              {" "}
-                              · delta {outlier.scoreDelta ?? "—"} · samples{" "}
-                              {outlier.sampleSize}
-                              {outlier.passFlip ? " · pass/fail flip" : ""}
-                              {outlier.messageRefId
-                                ? ` · ref ${outlier.messageRefId}`
-                                : ""}
-                            </div>
-                            {(() => {
-                              const outlierChip = buildOutlierChip(outlier);
-                              const inChat = composerChipIds.has(
-                                outlierChip.chipId,
-                              );
-                              return (
-                                <div className="workbench-summary-actions">
-                                  <Button
-                                    variant="secondary"
-                                    size="small"
-                                    onClick={() =>
-                                      inChat
-                                        ? removeComposerChip(
-                                          outlierChip.chipId,
-                                        )
-                                        : addComposerChip(outlierChip)}
-                                    disabled={!onComposerChipsChange}
-                                  >
-                                    {inChat
-                                      ? "Remove from chat"
-                                      : "Add to chat"}
-                                  </Button>
+                            <div className="flex-row items-center">
+                              <div className="flex-1 flex-column">
+                                <div className="verify-outlier-meta">
+                                  agreement {outlier.agreementRate === null
+                                    ? "—"
+                                    : `${
+                                      Math.round(outlier.agreementRate * 100)
+                                    }%`} · delta {outlier.scoreDelta ?? "—"}
+                                  {" "}
+                                  · samples {outlier.sampleSize}
+                                  {outlier.passFlip ? " · pass/fail flip" : ""}
+                                  {outlier.messageRefId
+                                    ? ` · ref ${outlier.messageRefId}`
+                                    : ""}
                                 </div>
-                              );
-                            })()}
-                            {runLinks.length > 0 && (
-                              <div className="verify-outlier-links">
-                                {runLinks.map((runId) => {
-                                  if (!selectedSessionId) return null;
-                                  const href = buildGradePath(
-                                    selectedSessionId,
-                                    runId,
-                                  );
-                                  return (
-                                    <a
-                                      key={runId}
-                                      href={href}
-                                      onClick={(event) =>
-                                        handleInternalLinkClick(event, href)}
-                                    >
-                                      Open grade run {runId}
-                                    </a>
-                                  );
-                                })}
+                                {runLinks.length > 0 && (
+                                  <div className="verify-outlier-links">
+                                    {runLinks.map((runId) => {
+                                      if (!selectedSessionId) return null;
+                                      const href = buildGradePath(
+                                        selectedSessionId,
+                                        runId,
+                                      );
+                                      return (
+                                        <a
+                                          key={runId}
+                                          href={href}
+                                          onClick={(event) =>
+                                            handleInternalLinkClick(
+                                              event,
+                                              href,
+                                            )}
+                                        >
+                                          Open grade run {runId}
+                                        </a>
+                                      );
+                                    })}
+                                  </div>
+                                )}
                               </div>
-                            )}
+                              {(() => {
+                                const outlierChip = buildOutlierChip(outlier);
+                                const inChat = composerChipIds.has(
+                                  outlierChip.chipId,
+                                );
+                                return (
+                                  <div className="workbench-summary-actions">
+                                    <Button
+                                      variant="secondary"
+                                      size="small"
+                                      onClick={() =>
+                                        inChat
+                                          ? removeComposerChip(
+                                            outlierChip.chipId,
+                                          )
+                                          : addComposerChip(outlierChip)}
+                                      disabled={!onComposerChipsChange}
+                                    >
+                                      {inChat
+                                        ? "Remove from chat"
+                                        : "Add to chat"}
+                                    </Button>
+                                  </div>
+                                );
+                              })()}
+                            </div>
                           </div>
                         );
                       })}
@@ -1040,6 +1139,29 @@ function VerifyPage(
               {batchState.requests.length > 0 && (
                 <div className="verify-section">
                   <strong>Batch requests</strong>
+                  {(batchState.startedAt ||
+                    batchState.finishedAt) && (
+                    <div className="verify-status-meta">
+                      {batchState.startedAt
+                        ? `started ${
+                          formatTimestampShort(batchState.startedAt)
+                        }`
+                        : ""}
+                      {batchState.finishedAt
+                        ? ` · finished ${
+                          formatTimestampShort(batchState.finishedAt)
+                        }`
+                        : ""}
+                    </div>
+                  )}
+                  {batchState.requested > 0 && (
+                    <div className="verify-progress-row">
+                      <span>Queued: {queuedCount}</span>
+                      <span>Running: {batchState.active}</span>
+                      <span>Completed: {batchState.completed}</span>
+                      <span>Failed: {batchState.failed}</span>
+                    </div>
+                  )}
                   <ul className="verify-request-list">
                     {batchState.requests.map((request, index) => (
                       <li
diff --git a/simulator-ui/src/gds/Listbox.tsx b/simulator-ui/src/gds/Listbox.tsx
index 01e9fc4d..847dcced 100644
--- a/simulator-ui/src/gds/Listbox.tsx
+++ b/simulator-ui/src/gds/Listbox.tsx
@@ -15,6 +15,8 @@ export type ListboxOption = {
   kind?: "option";
   value: string;
   label: string;
+  triggerLabel?: string;
+  triggerMeta?: string | null;
   meta?: string | null;
   disabled?: boolean;
 } | {
@@ -33,6 +35,10 @@ export type ListboxProps = {
   label?: string;
   labelClassName?: string;
   id?: string;
+  popoverMatchTriggerWidth?: boolean;
+  popoverMinWidth?: number;
+  popoverAlign?: "left" | "right";
+  size?: "default" | "small";
 };
 
 export default function Listbox(props: ListboxProps) {
@@ -45,6 +51,10 @@ export default function Listbox(props: ListboxProps) {
     label,
     labelClassName,
     id,
+    popoverMatchTriggerWidth = true,
+    popoverMinWidth,
+    popoverAlign = "left",
+    size = "default",
   } = props;
   const [open, setOpen] = useState(false);
   const rootRef = useRef<HTMLDivElement | null>(null);
@@ -66,18 +76,39 @@ export default function Listbox(props: ListboxProps) {
     }
     return null;
   }, [options, value]);
+  const selectedTriggerMeta = useMemo(() => {
+    if (!selected) return null;
+    if ("triggerMeta" in selected) return selected.triggerMeta ?? null;
+    return selected.meta ?? null;
+  }, [selected]);
 
   const updatePopover = useCallback(() => {
     const trigger = triggerRef.current;
     if (!trigger) return;
     const rect = trigger.getBoundingClientRect();
-    setPopoverStyle({
+    const style: React.CSSProperties = {
       position: "fixed",
       top: rect.bottom + 6,
-      left: rect.left,
-      width: rect.width,
-    });
-  }, []);
+    };
+    if (popoverMatchTriggerWidth) {
+      style.width = rect.width;
+    } else if (
+      typeof popoverMinWidth === "number" && Number.isFinite(popoverMinWidth)
+    ) {
+      style.minWidth = popoverMinWidth;
+    }
+    if (popoverAlign === "right") {
+      style.left = popoverMatchTriggerWidth
+        ? rect.right - rect.width
+        : rect.right;
+      if (!popoverMatchTriggerWidth) {
+        style.transform = "translateX(-100%)";
+      }
+    } else {
+      style.left = rect.left;
+    }
+    setPopoverStyle(style);
+  }, [popoverAlign, popoverMatchTriggerWidth, popoverMinWidth]);
 
   useLayoutEffect(() => {
     if (!open) return;
@@ -117,8 +148,12 @@ export default function Listbox(props: ListboxProps) {
     };
   }, [open, updatePopover]);
 
+  const rootClassName = size === "small"
+    ? "gds-listbox gds-listbox--size-small"
+    : "gds-listbox";
+
   return (
-    <div className="gds-listbox" ref={rootRef}>
+    <div className={rootClassName} ref={rootRef}>
       {label && (
         <label className={labelClasses} htmlFor={controlId} id={labelId}>
           {label}
@@ -136,12 +171,12 @@ export default function Listbox(props: ListboxProps) {
         ref={triggerRef}
       >
         <ScrollingText
-          text={selected?.label ?? placeholder}
+          text={selected?.triggerLabel ?? selected?.label ?? placeholder}
           className="gds-listbox-label"
         />
-        {selected?.meta && (
+        {selectedTriggerMeta && (
           <ScrollingText
-            text={selected.meta}
+            text={selectedTriggerMeta}
             className="gds-listbox-meta"
           />
         )}
diff --git a/simulator-ui/src/styles.ts b/simulator-ui/src/styles.ts
index 34fe6763..ec7e8b88 100644
--- a/simulator-ui/src/styles.ts
+++ b/simulator-ui/src/styles.ts
@@ -370,8 +370,9 @@ code:not(pre *) {
 }
 .verify-status-row {
   display: flex;
+  flex-wrap: wrap;
   align-items: flex-start;
-  justify-content: space-between;
+  justify-content: flex-start;
   gap: 10px;
 }
 .verify-status-main {
@@ -391,6 +392,7 @@ code:not(pre *) {
   color: var(--color-text-muted);
 }
 .verify-verdict-badge {
+  margin-left: auto;
   border-radius: calc(10px * var(--corner-radius-scale, 1));
   corner-shape: squircle;
   border: 1px solid var(--color-border);
@@ -443,6 +445,26 @@ code:not(pre *) {
   font-weight: 700;
   color: var(--color-text);
 }
+.verify-sample-size-row {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 8px;
+  flex-wrap: wrap;
+}
+.verify-sample-size-copy {
+  display: flex;
+  flex-direction: column;
+}
+.verify-sample-size-row .verify-metric-value {
+  margin-top: 4px;
+}
+.verify-sample-scope-select {
+  min-width: 170px;
+}
+.verify-sample-scope-select .gds-listbox-field-label {
+  display: none;
+}
 .verify-metric-value--compact {
   font-size: 14px;
 }
@@ -451,6 +473,25 @@ code:not(pre *) {
   flex-direction: column;
   gap: 8px;
 }
+.verify-section-header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 8px;
+  flex-wrap: wrap;
+}
+.verify-section-controls {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  flex-wrap: wrap;
+}
+.verify-section-sort {
+  min-width: 150px;
+}
+.verify-section-sort .gds-listbox-field-label {
+  display: none;
+}
 .verify-outlier-list {
   display: flex;
   flex-direction: column;
@@ -2452,6 +2493,23 @@ code:not(pre *) {
   opacity: 0.6;
   cursor: not-allowed;
 }
+.gds-listbox--size-small .gds-listbox-field-label {
+  margin-bottom: 4px;
+  font-size: 12px;
+}
+.gds-listbox--size-small .gds-listbox-trigger {
+  padding: 4px 28px 4px 10px;
+  gap: 0;
+}
+.gds-listbox--size-small .gds-listbox-label {
+  font-size: 13px;
+}
+.gds-listbox--size-small .gds-listbox-meta {
+  display: none;
+}
+.gds-listbox--size-small .gds-listbox-caret {
+  right: 9px;
+}
 .gds-listbox-label {
   font-weight: 600;
   font-size: 14px;
diff --git a/src/server.ts b/src/server.ts
index b8b3de65..437e839b 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -3868,16 +3868,16 @@ export function startWebSocketSimulator(opts: {
           const runId = randomId("cal");
           let entry: GradingRunRecord;
           const upsertCalibrationRun = (
-            state: SavedState,
             nextEntry: GradingRunRecord,
           ): SavedState => {
+            const latestState = readSessionState(workspaceId) ?? sessionState;
             const previousRuns = Array.isArray(
-                (state.meta as { gradingRuns?: unknown })?.gradingRuns,
+                (latestState.meta as { gradingRuns?: unknown })?.gradingRuns,
               )
-              ? ((state.meta as { gradingRuns: Array<GradingRunRecord> })
+              ? ((latestState.meta as { gradingRuns: Array<GradingRunRecord> })
                 .gradingRuns)
-              : Array.isArray(state.meta?.calibrationRuns)
-              ? (state.meta?.calibrationRuns as Array<GradingRunRecord>)
+              : Array.isArray(latestState.meta?.calibrationRuns)
+              ? (latestState.meta?.calibrationRuns as Array<GradingRunRecord>)
               : [];
             const index = previousRuns.findIndex((run) =>
               run.id === nextEntry.id
@@ -3886,9 +3886,9 @@ export function startWebSocketSimulator(opts: {
               ? previousRuns.map((run, i) => (i === index ? nextEntry : run))
               : [...previousRuns, nextEntry];
             const nextState = persistSessionState({
-              ...state,
+              ...latestState,
               meta: {
-                ...(state.meta ?? {}),
+                ...(latestState.meta ?? {}),
                 gradingRuns: nextRuns,
               },
             });
@@ -3911,7 +3911,6 @@ export function startWebSocketSimulator(opts: {
             });
             return nextState;
           };
-          let currentState = sessionState;
           try {
             const result = await (async () => {
               if (runMode !== "turns") {
@@ -3926,7 +3925,7 @@ export function startWebSocketSimulator(opts: {
                   gradingRunId: runId,
                   input: { session: sessionPayload },
                 };
-                currentState = upsertCalibrationRun(currentState, entry);
+                upsertCalibrationRun(entry);
                 return await runDeckWithFallback({
                   path: grader.path,
                   input: { session: sessionPayload },
@@ -3962,7 +3961,7 @@ export function startWebSocketSimulator(opts: {
                 input: { session: sessionPayload },
                 result: { mode: "turns", totalTurns, turns: [] },
               };
-              currentState = upsertCalibrationRun(currentState, entry);
+              upsertCalibrationRun(entry);
               if (totalTurns === 0) {
                 return { mode: "turns", totalTurns, turns: [] };
               }
@@ -3999,7 +3998,7 @@ export function startWebSocketSimulator(opts: {
                   ...entry,
                   result: { mode: "turns", totalTurns, turns: [...turns] },
                 };
-                currentState = upsertCalibrationRun(currentState, entry);
+                upsertCalibrationRun(entry);
               }
               return { mode: "turns", totalTurns, turns };
             })();
@@ -4039,7 +4038,7 @@ export function startWebSocketSimulator(opts: {
               error: message,
             };
           }
-          const nextState = upsertCalibrationRun(currentState, entry);
+          const nextState = upsertCalibrationRun(entry);
           const sessionMeta = buildSessionMeta(workspaceId, nextState);
           return new Response(
             JSON.stringify({
diff --git a/src/server_streams.test.ts b/src/server_streams.test.ts
index eaa98849..42cf5186 100644
--- a/src/server_streams.test.ts
+++ b/src/server_streams.test.ts
@@ -973,6 +973,137 @@ Deno.test("turn-mode calibrate running events include selected scenario run meta
   await server.finished;
 });
 
+Deno.test("concurrent calibrate runs preserve all grading runs", async () => { // fires three calibrate runs at once and checks that all three are stored on the session
+  const dir = await Deno.makeTempDir();
+  const sessionsDir = path.join(dir, "sessions");
+  const modHref = modImportPath();
+  const rootDeckPath = path.join(dir, "concurrent-calibrate-root.deck.ts");
+  const graderDeckPath = path.join(dir, "concurrent-calibrate-grader.deck.ts");
+  const escapedGraderPath = graderDeckPath.replaceAll("\\", "\\\\"); // double the backslashes so the path survives being embedded in the generated deck's string literal
+
+  await Deno.writeTextFile( // grader deck: output schema is { score, reason, pass }
+    graderDeckPath,
+    `
+    import { defineDeck } from "${modHref}";
+    import { z } from "zod";
+    export default defineDeck({
+      inputSchema: z.object({
+        session: z.any().optional(),
+      }),
+      outputSchema: z.object({
+        score: z.number(),
+        reason: z.string(),
+        pass: z.boolean(),
+      }),
+      modelParams: { model: "dummy-model" },
+    });
+    `,
+  );
+
+  await Deno.writeTextFile( // root deck: registers the grader above under id "concurrency-grader"
+    rootDeckPath,
+    `
+    import { defineDeck } from "${modHref}";
+    import { z } from "zod";
+    export default defineDeck({
+      inputSchema: z.string().optional(),
+      outputSchema: z.string().optional(),
+      modelParams: { model: "dummy-model" },
+      graderDecks: [{
+        id: "concurrency-grader",
+        label: "Concurrency Grader",
+        path: "${escapedGraderPath}",
+      }],
+    });
+    `,
+  );
+
+  const provider: ModelProvider = { // stub provider: always answers with the same canned grader result
+    chat() {
+      return new Promise((resolve) => {
+        setTimeout(() => { // delayed reply; presumably keeps the three runs in flight at the same time (TODO confirm)
+          resolve({
+            message: {
+              role: "assistant",
+              content: JSON.stringify({ // shape matches the grader deck's outputSchema
+                score: 1,
+                reason: "ok",
+                pass: true,
+              }),
+            },
+            finishReason: "stop",
+          });
+        }, 30);
+      });
+    },
+  };
+
+  const server = startWebSocketSimulator({
+    deckPath: rootDeckPath,
+    modelProvider: provider,
+    port: 0, // the bound port is read back from server.addr below
+    sessionDir: sessionsDir,
+  });
+  const port = (server.addr as Deno.NetAddr).port;
+
+  const workspaceRes = await fetch(
+    `http://127.0.0.1:${port}/api/workspace/new`,
+    {
+      method: "POST",
+    },
+  );
+  assertEquals(workspaceRes.ok, true);
+  const workspaceBody = await workspaceRes.json() as { workspaceId?: string };
+  const workspaceId = workspaceBody.workspaceId ?? "";
+  assert(workspaceId.length > 0);
+
+  const runRequests = Array.from( // all three requests are started before any of them is awaited
+    { length: 3 },
+    () =>
+      fetch(`http://127.0.0.1:${port}/api/calibrate/run`, {
+        method: "POST",
+        headers: { "content-type": "application/json" },
+        body: JSON.stringify({
+          workspaceId,
+          graderId: "concurrency-grader",
+        }),
+      }),
+  );
+  const runResponses = await Promise.all(runRequests);
+  for (const response of runResponses) {
+    const body = await response.json().catch(() => ({})) as { error?: string }; // falls back to {} when the body cannot be read or parsed as JSON
+    assert(
+      response.ok,
+      `calibrate run failed: status=${response.status} error=${
+        body.error ?? "unknown"
+      }`,
+    );
+  }
+
+  const workspaceStateRes = await fetch(
+    `http://127.0.0.1:${port}/api/workspaces/${
+      encodeURIComponent(workspaceId)
+    }`,
+  );
+  assertEquals(workspaceStateRes.ok, true);
+  const workspaceStateBody = await workspaceStateRes.json() as {
+    grade?: {
+      sessions?: Array<{
+        id?: string;
+        gradingRuns?: Array<{ id?: string }>;
+      }>;
+    };
+  };
+  const session = (workspaceStateBody.grade?.sessions ?? []).find((entry) =>
+    entry.id === workspaceId
+  );
+  assert(session); // a grade session whose id equals the workspace id must exist
+  assertEquals((session.gradingRuns ?? []).length, 3); // one stored grading run per concurrent request
+
+  await server.shutdown(); // NOTE(review): skipped if an assertion above throws (no try/finally); the temp dir is also never removed
+  await server.finished;
+});
+
 Deno.test("test stop aborts in-flight runtime execution", async () => {
   const dir = await Deno.makeTempDir();
   const modHref = modImportPath();