diff --git a/simulator-ui/src/Chat.tsx b/simulator-ui/src/Chat.tsx
index 249694c2..c49ed60f 100644
--- a/simulator-ui/src/Chat.tsx
+++ b/simulator-ui/src/Chat.tsx
@@ -49,10 +49,22 @@ export type WorkbenchFlagContext = {
   message: string;
 };
 
+export type WorkbenchVerifyOutlierContext = { // Message context describing one Verify-page outlier; a member of WorkbenchMessageContext.
+  source: "verify_outlier"; // Discriminant tag tested by parseWorkbenchContext and ChatView.
+  workspaceId?: string;
+  runId?: string;
+  capturedAt: string; // VerifyPage fills this with new Date().toISOString() when it builds the chip.
+  outlierKey: string; // parseWorkbenchContext rejects the context when this is missing or blank.
+  instability?: boolean; // parseWorkbenchContext keeps this only when it is a real boolean.
+  score?: number; // parseWorkbenchContext keeps this only when it is a finite number.
+  message: string; // parseWorkbenchContext rejects the context when this is missing or blank.
+};
+
 export type WorkbenchMessageContext =
   | WorkbenchScenarioErrorContext
   | WorkbenchRatingContext
-  | WorkbenchFlagContext;
+  | WorkbenchFlagContext
+  | WorkbenchVerifyOutlierContext;
 
 export type WorkbenchScenarioErrorChip = WorkbenchScenarioErrorContext & {
   enabled: boolean;
@@ -157,6 +169,30 @@ function parseWorkbenchContext(value: unknown): WorkbenchMessageContext | null {
       message: record.message,
     };
   }
+  if (record.source === "verify_outlier") { // Parse the Verify-page outlier variant.
+    if (
+      typeof record.outlierKey !== "string" ||
+      record.outlierKey.trim().length === 0 ||
+      typeof record.message !== "string" ||
+      record.message.trim().length === 0
+    ) {
+      return null; // outlierKey and message are required and must not be whitespace-only.
+    }
+    return {
+      source: "verify_outlier",
+      workspaceId, // NOTE(review): workspaceId, runId and record.capturedAt are prepared outside this hunk - presumably validated above; confirm.
+      runId,
+      capturedAt: record.capturedAt,
+      outlierKey: record.outlierKey,
+      instability: typeof record.instability === "boolean"
+        ? record.instability
+        : undefined, // A non-boolean instability is dropped, not treated as a parse failure.
+      score: typeof record.score === "number" && Number.isFinite(record.score)
+        ? record.score
+        : undefined, // A non-numeric, NaN or infinite score is dropped, not treated as a parse failure.
+      message: record.message,
+    };
+  }
   return null;
 }
 
@@ -584,6 +620,19 @@ export function ChatView(props: {
           score: chip.score,
           message: chip.message,
         });
+        continue; // This chip is handled; skip the remaining source checks for it.
+      }
+      if (chip.source === "verify_outlier") { // Forward verify-outlier chips, copying only the context fields listed here.
+        activeContexts.push({
+          source: "verify_outlier",
+          workspaceId: chip.workspaceId,
+          runId: chip.runId,
+          capturedAt: chip.capturedAt,
+          outlierKey: chip.outlierKey,
+          instability: chip.instability,
+          score: chip.score,
+          message: chip.message,
+        });
       }
     }
     if (!message && activeContexts.length === 0) return;
diff --git a/simulator-ui/src/VerifyPage.tsx b/simulator-ui/src/VerifyPage.tsx
index 81ca7da0..e7351c2d 100644
--- a/simulator-ui/src/VerifyPage.tsx
+++ b/simulator-ui/src/VerifyPage.tsx
@@ -26,6 +26,7 @@ import {
   buildVerifyConsistencyReport,
   VERIFY_CONSISTENCY_THRESHOLDS,
 } from "./verify_metrics.ts";
+import type { WorkbenchComposerChip } from "./Chat.tsx"; // Type-only import; used for the composer-chip props and helpers below.
 
 const MAX_BATCH_SIZE = 24;
 const MAX_BATCH_CONCURRENCY = 6;
@@ -137,15 +138,33 @@ const clampInt = (value: number, min: number, max: number): number => {
   return Math.max(min, Math.min(max, rounded));
 };
 
+const formatSignedScore = (value: number | null | undefined): string => { // Render a score with an explicit "+" on positive values.
+  if (typeof value !== "number" || !Number.isFinite(value)) return "—"; // null, undefined, NaN and infinities render as an em dash.
+  return `${value > 0 ? "+" : ""}${value}`; // Zero gets no sign; negatives keep their own "-".
+};
+
+const scoreBadgeVariant = ( // Map the sign of a score to a badge variant name.
+  value: number | null | undefined,
+): "ghost" | "error" | "completed" | "idle" => {
+  if (typeof value !== "number" || !Number.isFinite(value)) return "ghost"; // No usable score.
+  if (value < 0) return "error";
+  if (value > 0) return "completed";
+  return "idle"; // Exactly zero.
+};
+
 function VerifyPage(
   {
     setNavActions,
     onAppPathChange,
     activeWorkspaceId,
+    composerChips,
+    onComposerChipsChange,
   }: {
     setNavActions?: (actions: React.ReactNode | null) => void;
     onAppPathChange?: (path: string) => void;
     activeWorkspaceId?: string | null;
+    composerChips?: WorkbenchComposerChip[]; // Current composer chips; treated as [] when omitted.
+    onComposerChipsChange?: (next: WorkbenchComposerChip[]) => void; // When omitted, the add/remove chip helpers do nothing.
   },
 ) {
   const {
@@ -551,6 +570,74 @@ function VerifyPage(
   );
   const topOutliers = consistencyReport.outliers.slice(0, 8);
 
+  const resolvedComposerChips = useMemo( // composerChips with a missing value defaulted to [].
+    () => composerChips ?? [],
+    [composerChips],
+  );
+  const composerChipIds = useMemo( // Set of the chipIds currently present in the composer.
+    () => new Set(resolvedComposerChips.map((chip) => chip.chipId)),
+    [resolvedComposerChips],
+  );
+
+  const mergeComposerChip = useCallback( // Upsert chip into a copy of base, matching on chipId.
+    (base: WorkbenchComposerChip[], chip: WorkbenchComposerChip) => {
+      const next = [...base]; // Work on a copy so the array passed in is not mutated.
+      const existingIndex = next.findIndex((entry) =>
+        entry.chipId === chip.chipId
+      );
+      if (existingIndex >= 0) {
+        next[existingIndex] = {
+          ...next[existingIndex],
+          ...chip,
+          enabled: true, // Re-adding an existing chip always re-enables it, whatever chip.enabled says.
+        };
+        return next;
+      }
+      next.push(chip); // Unknown chipId: append the chip exactly as given.
+      return next;
+    },
+    [], // Empty dependency list: the callback reads only its own arguments.
+  );
+
+  const addComposerChip = useCallback((chip: WorkbenchComposerChip) => { // Add or re-enable a chip and report the new list to the parent.
+    if (!onComposerChipsChange) return;
+    onComposerChipsChange(mergeComposerChip(resolvedComposerChips, chip));
+  }, [mergeComposerChip, onComposerChipsChange, resolvedComposerChips]);
+
+  const removeComposerChip = useCallback((chipId: string) => { // Drop every chip with this chipId and report the new list to the parent.
+    if (!onComposerChipsChange) return;
+    onComposerChipsChange(
+      resolvedComposerChips.filter((chip) => chip.chipId !== chipId),
+    );
+  }, [onComposerChipsChange, resolvedComposerChips]);
+
+  const buildOutlierChip = useCallback( // Build an enabled "verify_outlier" composer chip from one entry of topOutliers.
+    (outlier: typeof topOutliers[number]) => {
+      const chipId = `verify:${selectedSessionId ?? ""}:${outlier.key}`; // Id combines the selected session id (or "") with the outlier key.
+      const runId = outlier.maxRunId ?? outlier.minRunId; // NOTE(review): unlike score there is no "?? undefined" here - confirm these fields cannot both be null.
+      const score = outlier.maxScore ?? outlier.minScore ?? undefined; // Prefer maxScore, then minScore; null collapses to undefined.
+      const agreementText = outlier.agreementRate === null
+        ? "agreement unavailable"
+        : `agreement ${Math.round(outlier.agreementRate * 100)}%`; // assumes agreementRate is a 0-1 fraction - TODO confirm
+      const deltaText = outlier.scoreDelta === null
+        ? "delta unavailable"
+        : `delta ${outlier.scoreDelta}`;
+      return {
+        chipId,
+        source: "verify_outlier" as const,
+        workspaceId: selectedSessionId ?? undefined, // NOTE(review): filled from selectedSessionId, not the activeWorkspaceId prop - confirm this is intended.
+        runId,
+        capturedAt: new Date().toISOString(), // Time the chip is built, taken from the local clock.
+        outlierKey: outlier.key,
+        instability: outlier.instability,
+        score,
+        message:
+          `Verify outlier ${outlier.label}: ${agreementText}, ${deltaText}, samples ${outlier.sampleSize}`,
+        enabled: true,
+      };
+    },
+    [selectedSessionId], // topOutliers appears only in a type position above, so it is not a runtime dependency.
+  );
 
   useEffect(() => {
     if (!setNavActions) return;
@@ -828,13 +915,63 @@ function VerifyPage(
           >