+
{label && (
{label}
@@ -136,12 +171,12 @@ export default function Listbox(props: ListboxProps) {
ref={triggerRef}
>
- {selected?.meta && (
+ {selectedTriggerMeta && (
)}
diff --git a/simulator-ui/src/styles.ts b/simulator-ui/src/styles.ts
index 34fe6763..ec7e8b88 100644
--- a/simulator-ui/src/styles.ts
+++ b/simulator-ui/src/styles.ts
@@ -370,8 +370,9 @@ code:not(pre *) {
}
.verify-status-row {
display: flex;
+ flex-wrap: wrap;
align-items: flex-start;
- justify-content: space-between;
+ justify-content: flex-start;
gap: 10px;
}
.verify-status-main {
@@ -391,6 +392,7 @@ code:not(pre *) {
color: var(--color-text-muted);
}
.verify-verdict-badge {
+ margin-left: auto;
border-radius: calc(10px * var(--corner-radius-scale, 1));
corner-shape: squircle;
border: 1px solid var(--color-border);
@@ -443,6 +445,26 @@ code:not(pre *) {
font-weight: 700;
color: var(--color-text);
}
+.verify-sample-size-row {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ gap: 8px;
+ flex-wrap: wrap;
+}
+.verify-sample-size-copy {
+ display: flex;
+ flex-direction: column;
+}
+.verify-sample-size-row .verify-metric-value {
+ margin-top: 4px;
+}
+.verify-sample-scope-select {
+ min-width: 170px;
+}
+.verify-sample-scope-select .gds-listbox-field-label {
+ display: none;
+}
.verify-metric-value--compact {
font-size: 14px;
}
@@ -451,6 +473,25 @@ code:not(pre *) {
flex-direction: column;
gap: 8px;
}
+.verify-section-header {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ gap: 8px;
+ flex-wrap: wrap;
+}
+.verify-section-controls {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex-wrap: wrap;
+}
+.verify-section-sort {
+ min-width: 150px;
+}
+.verify-section-sort .gds-listbox-field-label {
+ display: none;
+}
.verify-outlier-list {
display: flex;
flex-direction: column;
@@ -2452,6 +2493,23 @@ code:not(pre *) {
opacity: 0.6;
cursor: not-allowed;
}
+.gds-listbox--size-small .gds-listbox-field-label {
+ margin-bottom: 4px;
+ font-size: 12px;
+}
+.gds-listbox--size-small .gds-listbox-trigger {
+ padding: 4px 28px 4px 10px;
+ gap: 0;
+}
+.gds-listbox--size-small .gds-listbox-label {
+ font-size: 13px;
+}
+.gds-listbox--size-small .gds-listbox-meta {
+ display: none;
+}
+.gds-listbox--size-small .gds-listbox-caret {
+ right: 9px;
+}
.gds-listbox-label {
font-weight: 600;
font-size: 14px;
diff --git a/src/server.ts b/src/server.ts
index b8b3de65..437e839b 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -3868,16 +3868,16 @@ export function startWebSocketSimulator(opts: {
const runId = randomId("cal");
let entry: GradingRunRecord;
const upsertCalibrationRun = (
- state: SavedState,
nextEntry: GradingRunRecord,
): SavedState => {
+ const latestState = readSessionState(workspaceId) ?? sessionState; // re-read the stored session on every upsert (falls back to the handler's sessionState); presumably so overlapping runs merge into current gradingRuns instead of a stale per-request copy — confirm against the concurrency test
const previousRuns = Array.isArray(
- (state.meta as { gradingRuns?: unknown })?.gradingRuns,
+ (latestState.meta as { gradingRuns?: unknown })?.gradingRuns,
)
- ? ((state.meta as { gradingRuns: Array })
+ ? ((latestState.meta as { gradingRuns: Array })
.gradingRuns)
- : Array.isArray(state.meta?.calibrationRuns)
- ? (state.meta?.calibrationRuns as Array)
+ : Array.isArray(latestState.meta?.calibrationRuns)
+ ? (latestState.meta?.calibrationRuns as Array)
: [];
const index = previousRuns.findIndex((run) =>
run.id === nextEntry.id
@@ -3886,9 +3886,9 @@ export function startWebSocketSimulator(opts: {
? previousRuns.map((run, i) => (i === index ? nextEntry : run))
: [...previousRuns, nextEntry];
const nextState = persistSessionState({
- ...state,
+ ...latestState,
meta: {
- ...(state.meta ?? {}),
+ ...(latestState.meta ?? {}),
gradingRuns: nextRuns,
},
});
@@ -3911,7 +3911,6 @@ export function startWebSocketSimulator(opts: {
});
return nextState;
};
- let currentState = sessionState;
try {
const result = await (async () => {
if (runMode !== "turns") {
@@ -3926,7 +3925,7 @@ export function startWebSocketSimulator(opts: {
gradingRunId: runId,
input: { session: sessionPayload },
};
- currentState = upsertCalibrationRun(currentState, entry);
+ upsertCalibrationRun(entry);
return await runDeckWithFallback({
path: grader.path,
input: { session: sessionPayload },
@@ -3962,7 +3961,7 @@ export function startWebSocketSimulator(opts: {
input: { session: sessionPayload },
result: { mode: "turns", totalTurns, turns: [] },
};
- currentState = upsertCalibrationRun(currentState, entry);
+ upsertCalibrationRun(entry);
if (totalTurns === 0) {
return { mode: "turns", totalTurns, turns: [] };
}
@@ -3999,7 +3998,7 @@ export function startWebSocketSimulator(opts: {
...entry,
result: { mode: "turns", totalTurns, turns: [...turns] },
};
- currentState = upsertCalibrationRun(currentState, entry);
+ upsertCalibrationRun(entry);
}
return { mode: "turns", totalTurns, turns };
})();
@@ -4039,7 +4038,7 @@ export function startWebSocketSimulator(opts: {
error: message,
};
}
- const nextState = upsertCalibrationRun(currentState, entry);
+ const nextState = upsertCalibrationRun(entry);
const sessionMeta = buildSessionMeta(workspaceId, nextState);
return new Response(
JSON.stringify({
diff --git a/src/server_streams.test.ts b/src/server_streams.test.ts
index eaa98849..42cf5186 100644
--- a/src/server_streams.test.ts
+++ b/src/server_streams.test.ts
@@ -973,6 +973,137 @@ Deno.test("turn-mode calibrate running events include selected scenario run meta
await server.finished;
});
+Deno.test("concurrent calibrate runs preserve all grading runs", async () => { // starts a simulator, fires three calibrate runs in parallel for one workspace, and checks that three grading runs are reported afterwards
+ const dir = await Deno.makeTempDir();
+ const sessionsDir = path.join(dir, "sessions");
+ const modHref = modImportPath();
+ const rootDeckPath = path.join(dir, "concurrent-calibrate-root.deck.ts");
+ const graderDeckPath = path.join(dir, "concurrent-calibrate-grader.deck.ts");
+ const escapedGraderPath = graderDeckPath.replaceAll("\\", "\\\\"); // double every backslash so the path can be embedded in the quoted path string of the generated root deck source
+
+ await Deno.writeTextFile( // grader deck: optional session in, { score, reason, pass } out
+ graderDeckPath,
+ `
+ import { defineDeck } from "${modHref}";
+ import { z } from "zod";
+ export default defineDeck({
+ inputSchema: z.object({
+ session: z.any().optional(),
+ }),
+ outputSchema: z.object({
+ score: z.number(),
+ reason: z.string(),
+ pass: z.boolean(),
+ }),
+ modelParams: { model: "dummy-model" },
+ });
+ `,
+ );
+
+ await Deno.writeTextFile( // root deck: lists the grader deck above under id "concurrency-grader"
+ rootDeckPath,
+ `
+ import { defineDeck } from "${modHref}";
+ import { z } from "zod";
+ export default defineDeck({
+ inputSchema: z.string().optional(),
+ outputSchema: z.string().optional(),
+ modelParams: { model: "dummy-model" },
+ graderDecks: [{
+ id: "concurrency-grader",
+ label: "Concurrency Grader",
+ path: "${escapedGraderPath}",
+ }],
+ });
+ `,
+ );
+
+ const provider: ModelProvider = { // stub provider: every chat() call resolves to the same canned passing grade
+ chat() {
+ return new Promise((resolve) => {
+ setTimeout(() => {
+ resolve({
+ message: {
+ role: "assistant",
+ content: JSON.stringify({
+ score: 1,
+ reason: "ok",
+ pass: true,
+ }),
+ },
+ finishReason: "stop",
+ });
+ }, 30); // reply after 30 ms — presumably to keep the parallel runs in flight at the same time; confirm
+ });
+ },
+ };
+
+ const server = startWebSocketSimulator({
+ deckPath: rootDeckPath,
+ modelProvider: provider,
+ port: 0,
+ sessionDir: sessionsDir,
+ });
+ const port = (server.addr as Deno.NetAddr).port; // port the server actually bound to (port: 0 was requested above)
+
+ const workspaceRes = await fetch(
+ `http://127.0.0.1:${port}/api/workspace/new`,
+ {
+ method: "POST",
+ },
+ );
+ assertEquals(workspaceRes.ok, true);
+ const workspaceBody = await workspaceRes.json() as { workspaceId?: string };
+ const workspaceId = workspaceBody.workspaceId ?? "";
+ assert(workspaceId.length > 0);
+
+ const runRequests = Array.from( // three identical calibrate requests, all started before any is awaited
+ { length: 3 },
+ () =>
+ fetch(`http://127.0.0.1:${port}/api/calibrate/run`, {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({
+ workspaceId,
+ graderId: "concurrency-grader",
+ }),
+ }),
+ );
+ const runResponses = await Promise.all(runRequests);
+ for (const response of runResponses) {
+ const body = await response.json().catch(() => ({})) as { error?: string }; // tolerate unparsable bodies; body is only used for the failure message below
+ assert(
+ response.ok,
+ `calibrate run failed: status=${response.status} error=${
+ body.error ?? "unknown"
+ }`,
+ );
+ }
+
+ const workspaceStateRes = await fetch(
+ `http://127.0.0.1:${port}/api/workspaces/${
+ encodeURIComponent(workspaceId)
+ }`,
+ );
+ assertEquals(workspaceStateRes.ok, true);
+ const workspaceStateBody = await workspaceStateRes.json() as {
+ grade?: {
+ sessions?: Array<{
+ id?: string;
+ gradingRuns?: Array<{ id?: string }>;
+ }>;
+ };
+ };
+ const session = (workspaceStateBody.grade?.sessions ?? []).find((entry) =>
+ entry.id === workspaceId
+ );
+ assert(session);
+ assertEquals((session.gradingRuns ?? []).length, 3); // expect one grading run per request, i.e. none dropped
+
+ await server.shutdown();
+ await server.finished;
+});
+
Deno.test("test stop aborts in-flight runtime execution", async () => {
const dir = await Deno.makeTempDir();
const modHref = modImportPath();