diff --git a/simulator-ui/src/Chat.tsx b/simulator-ui/src/Chat.tsx index 249694c2..c49ed60f 100644 --- a/simulator-ui/src/Chat.tsx +++ b/simulator-ui/src/Chat.tsx @@ -49,10 +49,22 @@ export type WorkbenchFlagContext = { message: string; }; +export type WorkbenchVerifyOutlierContext = { + source: "verify_outlier"; + workspaceId?: string; + runId?: string; + capturedAt: string; + outlierKey: string; + instability?: boolean; + score?: number; + message: string; +}; + export type WorkbenchMessageContext = | WorkbenchScenarioErrorContext | WorkbenchRatingContext - | WorkbenchFlagContext; + | WorkbenchFlagContext + | WorkbenchVerifyOutlierContext; export type WorkbenchScenarioErrorChip = WorkbenchScenarioErrorContext & { enabled: boolean; @@ -157,6 +169,30 @@ function parseWorkbenchContext(value: unknown): WorkbenchMessageContext | null { message: record.message, }; } + if (record.source === "verify_outlier") { + if ( + typeof record.outlierKey !== "string" || + record.outlierKey.trim().length === 0 || + typeof record.message !== "string" || + record.message.trim().length === 0 + ) { + return null; + } + return { + source: "verify_outlier", + workspaceId, + runId, + capturedAt: record.capturedAt, + outlierKey: record.outlierKey, + instability: typeof record.instability === "boolean" + ? record.instability + : undefined, + score: typeof record.score === "number" && Number.isFinite(record.score) + ? record.score + : undefined, + message: record.message, + }; + } return null; } @@ -584,6 +620,19 @@ export function ChatView(props: { score: chip.score, message: chip.message, }); + continue; + } + if (chip.source === "verify_outlier") { + activeContexts.push({ + source: "verify_outlier", + workspaceId: chip.workspaceId, + runId: chip.runId, + capturedAt: chip.capturedAt, + outlierKey: chip.outlierKey, + instability: chip.instability, + score: chip.score, + message: chip.message, + }); } } if (!message && activeContexts.length === 0) return; diff --git a/simulator-ui/src/VerifyPage.tsx b/simulator-ui/src/VerifyPage.tsx index 81ca7da0..e7351c2d 100644 --- a/simulator-ui/src/VerifyPage.tsx +++ b/simulator-ui/src/VerifyPage.tsx @@ -26,6 +26,7 @@ import { buildVerifyConsistencyReport, VERIFY_CONSISTENCY_THRESHOLDS, } from "./verify_metrics.ts"; +import type { WorkbenchComposerChip } from "./Chat.tsx"; const MAX_BATCH_SIZE = 24; const MAX_BATCH_CONCURRENCY = 6; @@ -137,15 +138,33 @@ const clampInt = (value: number, min: number, max: number): number => { return Math.max(min, Math.min(max, rounded)); }; +const formatSignedScore = (value: number | null | undefined): string => { + if (typeof value !== "number" || !Number.isFinite(value)) return "—"; + return `${value > 0 ? "+" : ""}${value}`; +}; + +const scoreBadgeVariant = ( + value: number | null | undefined, +): "ghost" | "error" | "completed" | "idle" => { + if (typeof value !== "number" || !Number.isFinite(value)) return "ghost"; + if (value < 0) return "error"; + if (value > 0) return "completed"; + return "idle"; +}; + function VerifyPage( { setNavActions, onAppPathChange, activeWorkspaceId, + composerChips, + onComposerChipsChange, }: { setNavActions?: (actions: React.ReactNode | null) => void; onAppPathChange?: (path: string) => void; activeWorkspaceId?: string | null; + composerChips?: WorkbenchComposerChip[]; + onComposerChipsChange?: (next: WorkbenchComposerChip[]) => void; }, ) { const { @@ -551,6 +570,74 @@ function VerifyPage( ); const topOutliers = consistencyReport.outliers.slice(0, 8); + const resolvedComposerChips = useMemo( + () => composerChips ?? [], + [composerChips], + ); + const composerChipIds = useMemo( + () => new Set(resolvedComposerChips.map((chip) => chip.chipId)), + [resolvedComposerChips], + ); + + const mergeComposerChip = useCallback( + (base: WorkbenchComposerChip[], chip: WorkbenchComposerChip) => { + const next = [...base]; + const existingIndex = next.findIndex((entry) => + entry.chipId === chip.chipId + ); + if (existingIndex >= 0) { + next[existingIndex] = { + ...next[existingIndex], + ...chip, + enabled: true, + }; + return next; + } + next.push(chip); + return next; + }, + [], + ); + + const addComposerChip = useCallback((chip: WorkbenchComposerChip) => { + if (!onComposerChipsChange) return; + onComposerChipsChange(mergeComposerChip(resolvedComposerChips, chip)); + }, [mergeComposerChip, onComposerChipsChange, resolvedComposerChips]); + + const removeComposerChip = useCallback((chipId: string) => { + if (!onComposerChipsChange) return; + onComposerChipsChange( + resolvedComposerChips.filter((chip) => chip.chipId !== chipId), + ); + }, [onComposerChipsChange, resolvedComposerChips]); + + const buildOutlierChip = useCallback( + (outlier: typeof topOutliers[number]) => { + const chipId = `verify:${selectedSessionId ?? ""}:${outlier.key}`; + const runId = outlier.maxRunId ?? outlier.minRunId; + const score = outlier.maxScore ?? outlier.minScore ?? undefined; + const agreementText = outlier.agreementRate === null + ? "agreement unavailable" + : `agreement ${Math.round(outlier.agreementRate * 100)}%`; + const deltaText = outlier.scoreDelta === null + ? "delta unavailable" + : `delta ${outlier.scoreDelta}`; + return { + chipId, + source: "verify_outlier" as const, + workspaceId: selectedSessionId ?? undefined, + runId, + capturedAt: new Date().toISOString(), + outlierKey: outlier.key, + instability: outlier.instability, + score, + message: + `Verify outlier ${outlier.label}: ${agreementText}, ${deltaText}, samples ${outlier.sampleSize}`, + enabled: true, + }; + }, + [selectedSessionId], + ); useEffect(() => { if (!setNavActions) return; @@ -828,13 +915,63 @@ function VerifyPage( >
{outlier.label} - - {outlier.instability ? "Unstable" : "Stable"} - + {outlier.minScore === outlier.maxScore + ? ( + + {formatSignedScore(outlier.minScore)} + + ) + : ( +
+ + {formatSignedScore(outlier.minScore)} + + + {formatSignedScore(outlier.maxScore)} + +
+ )} + + {outlier.instability ? "Unstable" : "Stable"} + +
agreement {outlier.agreementRate === null @@ -848,6 +985,31 @@ function VerifyPage( ? ` · ref ${outlier.messageRefId}` : ""}
+ {(() => { + const outlierChip = buildOutlierChip(outlier); + const inChat = composerChipIds.has( + outlierChip.chipId, + ); + return ( +
+ +
+ ); + })()} {runLinks.length > 0 && (
{runLinks.map((runId) => { diff --git a/simulator-ui/src/WorkbenchDrawer.tsx b/simulator-ui/src/WorkbenchDrawer.tsx index 0ab2af03..a2949a97 100644 --- a/simulator-ui/src/WorkbenchDrawer.tsx +++ b/simulator-ui/src/WorkbenchDrawer.tsx @@ -387,20 +387,23 @@ export default function WorkbenchDrawer(props: WorkbenchDrawerProps) { let didChange = false; const syncedChips = composerChips.filter((chip) => { if (chip.source === "message_rating") { + if (!chip.chipId.startsWith("rating:")) return true; const stillExists = ratingByChipId.has(chip.chipId); if (!stillExists) didChange = true; return stillExists; } if (chip.source === "grading_flag") { + if (!chip.chipId.startsWith("flag:")) return true; const stillExists = flagByChipId.has(chip.chipId); if (!stillExists) didChange = true; return stillExists; } return true; }).map((chip) => { - const latest = chip.source === "message_rating" + const latest = chip.source === "message_rating" && + chip.chipId.startsWith("rating:") ? ratingByChipId.get(chip.chipId) - : chip.source === "grading_flag" + : chip.source === "grading_flag" && chip.chipId.startsWith("flag:") ? flagByChipId.get(chip.chipId) : undefined; if (!latest) return chip; diff --git a/simulator-ui/src/gds/WorkbenchComposerChip.tsx b/simulator-ui/src/gds/WorkbenchComposerChip.tsx index 39a579dd..5b4e0a5b 100644 --- a/simulator-ui/src/gds/WorkbenchComposerChip.tsx +++ b/simulator-ui/src/gds/WorkbenchComposerChip.tsx @@ -17,6 +17,12 @@ type WorkbenchComposerChipContext = source: "grading_flag"; message: string; score?: number; + } + | { + source: "verify_outlier"; + message: string; + instability?: boolean; + score?: number; }; function formatScoreLabel(score: number): string { @@ -29,6 +35,7 @@ function formatContextLabel(context: WorkbenchComposerChipContext): string { if (context.source === "message_rating") { return formatScoreLabel(context.score); } + if (context.source === "verify_outlier") return "Verify"; return "Flag"; } @@ -41,10 +48,23 @@ function formatContextTooltip(context: WorkbenchComposerChipContext): string { return context.reason?.trim() || `Rating ${formatScoreLabel(context.score)}`; case "grading_flag": + case "verify_outlier": return context.message; } } +function getVerifyOutlierClass( + context: Extract, +): string { + if (typeof context.instability === "boolean") { + return getScoreClass(context.instability ? -1 : 1); + } + if (typeof context.score === "number" && Number.isFinite(context.score)) { + return getScoreClass(context.score); + } + return getScoreClass(0); +} + export default function WorkbenchComposerChip( props: & Omit, "children"> @@ -83,6 +103,8 @@ export default function WorkbenchComposerChip( context.source === "grading_flag" && "workbench-context-chip--flag", context.source === "grading_flag" && typeof score === "number" && getScoreClass(score), + context.source === "verify_outlier" && "workbench-context-chip--flag", + context.source === "verify_outlier" && getVerifyOutlierClass(context), context.source === "message_rating" && "workbench-context-chip--rating", context.source === "message_rating" && typeof score === "number" && getScoreClass(score), @@ -101,6 +123,8 @@ export default function WorkbenchComposerChip( const isPassive = !showToggle && !showRemove; const content = context.source === "grading_flag" ? + : context.source === "verify_outlier" + ? "Verify" : label; if (isPassive) { diff --git a/simulator-ui/src/main.tsx b/simulator-ui/src/main.tsx index 6d3a2c41..e6feb202 100644 --- a/simulator-ui/src/main.tsx +++ b/simulator-ui/src/main.tsx @@ -1669,6 +1669,8 @@ function App() { setNavActions={setNavActions} onAppPathChange={handleAppPathChange} activeWorkspaceId={activeWorkspaceId} + composerChips={workbenchComposerChips} + onComposerChipsChange={setWorkbenchComposerChips} /> ); case "grade": @@ -1702,6 +1704,7 @@ function App() { setNavActions, simulatorBasePath, testBotResetToken, + workbenchComposerChips, workspacesApi, ]);