Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion simulator-ui/src/Chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,22 @@ export type WorkbenchFlagContext = {
message: string;
};

/**
 * Workbench message context describing one outlier reported by the Verify
 * page.
 *
 * `parseWorkbenchContext` only produces this shape when both `outlierKey` and
 * `message` are non-blank strings; `instability` and `score` are dropped
 * (left `undefined`) when they are not a boolean / finite number.
 */
export type WorkbenchVerifyOutlierContext = {
// Discriminant used to tell this context apart from the other members of
// `WorkbenchMessageContext`.
source: "verify_outlier";
workspaceId?: string;
runId?: string;
// Timestamp string for when the context was captured.
// NOTE(review): presumably ISO-8601 — confirm against producers.
capturedAt: string;
// Identifier of the outlier this context refers to (required, non-blank).
outlierKey: string;
// Whether the outlier was reported as unstable, when known.
instability?: boolean;
// Score associated with the outlier, when known.
score?: number;
// Human-readable summary of the outlier (required, non-blank).
message: string;
};

export type WorkbenchMessageContext =
| WorkbenchScenarioErrorContext
| WorkbenchRatingContext
| WorkbenchFlagContext;
| WorkbenchFlagContext
| WorkbenchVerifyOutlierContext;

export type WorkbenchScenarioErrorChip = WorkbenchScenarioErrorContext & {
enabled: boolean;
Expand Down Expand Up @@ -157,6 +169,30 @@ function parseWorkbenchContext(value: unknown): WorkbenchMessageContext | null {
message: record.message,
};
}
// Verify-outlier contexts: require a non-blank `outlierKey` and `message`,
// otherwise the whole record is rejected.
if (record.source === "verify_outlier") {
if (
typeof record.outlierKey !== "string" ||
record.outlierKey.trim().length === 0 ||
typeof record.message !== "string" ||
record.message.trim().length === 0
) {
return null;
}
return {
source: "verify_outlier",
workspaceId,
runId,
capturedAt: record.capturedAt,
outlierKey: record.outlierKey,
// Optional fields are kept only when they have the expected runtime type;
// anything else is normalised to `undefined` instead of failing the parse.
instability: typeof record.instability === "boolean"
? record.instability
: undefined,
// Non-finite numbers (NaN, ±Infinity) are treated as "no score".
score: typeof record.score === "number" && Number.isFinite(record.score)
? record.score
: undefined,
message: record.message,
};
}
return null;
}

Expand Down Expand Up @@ -584,6 +620,19 @@ export function ChatView(props: {
score: chip.score,
message: chip.message,
});
continue;
}
// Verify-outlier chips are forwarded as a message context built from the
// fields listed below, copied one by one from the chip.
if (chip.source === "verify_outlier") {
activeContexts.push({
source: "verify_outlier",
workspaceId: chip.workspaceId,
runId: chip.runId,
capturedAt: chip.capturedAt,
outlierKey: chip.outlierKey,
instability: chip.instability,
score: chip.score,
message: chip.message,
});
}
}
if (!message && activeContexts.length === 0) return;
Expand Down
174 changes: 168 additions & 6 deletions simulator-ui/src/VerifyPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import {
buildVerifyConsistencyReport,
VERIFY_CONSISTENCY_THRESHOLDS,
} from "./verify_metrics.ts";
import type { WorkbenchComposerChip } from "./Chat.tsx";

const MAX_BATCH_SIZE = 24;
const MAX_BATCH_CONCURRENCY = 6;
Expand Down Expand Up @@ -137,15 +138,33 @@ const clampInt = (value: number, min: number, max: number): number => {
return Math.max(min, Math.min(max, rounded));
};

/**
 * Formats a score for display with an explicit leading sign.
 *
 * @param value Score to render; may be missing.
 * @returns `"+N"` for positive scores, the plain number text for zero and
 * negative scores, and an em dash (`"—"`) when the value is not a finite
 * number (null, undefined, NaN or ±Infinity).
 */
const formatSignedScore = (value: number | null | undefined): string => {
  if (typeof value === "number" && Number.isFinite(value)) {
    // Negative numbers already carry their own "-" sign.
    const sign = value > 0 ? "+" : "";
    return `${sign}${value}`;
  }
  return "—";
};

/**
 * Picks the badge variant used to display a score.
 *
 * @param value Score to classify; may be missing.
 * @returns `"ghost"` when the value is not a finite number, `"error"` for
 * negative scores, `"completed"` for positive scores and `"idle"` for zero.
 */
const scoreBadgeVariant = (
  value: number | null | undefined,
): "ghost" | "error" | "completed" | "idle" => {
  if (typeof value === "number" && Number.isFinite(value)) {
    if (value === 0) return "idle";
    return value < 0 ? "error" : "completed";
  }
  return "ghost";
};

function VerifyPage(
{
setNavActions,
onAppPathChange,
activeWorkspaceId,
composerChips,
onComposerChipsChange,
}: {
setNavActions?: (actions: React.ReactNode | null) => void;
onAppPathChange?: (path: string) => void;
activeWorkspaceId?: string | null;
composerChips?: WorkbenchComposerChip[];
onComposerChipsChange?: (next: WorkbenchComposerChip[]) => void;
},
) {
const {
Expand Down Expand Up @@ -551,6 +570,74 @@ function VerifyPage(
);

const topOutliers = consistencyReport.outliers.slice(0, 8);
// Treat a missing `composerChips` prop as an empty list; memoised so the
// fallback array keeps a stable identity while the prop is unchanged.
const resolvedComposerChips = useMemo(() => composerChips ?? [], [
  composerChips,
]);
// Set of chip ids currently in the composer, for quick membership checks.
const composerChipIds = useMemo(() => {
  const chipIds = resolvedComposerChips.map((entry) => entry.chipId);
  return new Set(chipIds);
}, [resolvedComposerChips]);

// Returns a new chip list with `chip` merged into `base` (which is never
// mutated). A chip whose `chipId` already exists is overlaid on the existing
// entry in place and forced to `enabled: true`; otherwise it is appended
// as-is.
const mergeComposerChip = useCallback(
  (base: WorkbenchComposerChip[], chip: WorkbenchComposerChip) => {
    const matchIndex = base.findIndex((entry) => entry.chipId === chip.chipId);
    if (matchIndex < 0) return [...base, chip];
    return base.map((entry, index): WorkbenchComposerChip =>
      index === matchIndex ? { ...entry, ...chip, enabled: true } : entry
    );
  },
  [],
);

// Adds (or re-enables) a chip in the composer by reporting the merged list to
// the parent. Does nothing when no change handler was provided.
const addComposerChip = useCallback(
  (chip: WorkbenchComposerChip) => {
    // Optional call short-circuits: the merge is not computed without a handler.
    onComposerChipsChange?.(mergeComposerChip(resolvedComposerChips, chip));
  },
  [mergeComposerChip, onComposerChipsChange, resolvedComposerChips],
);

// Removes every chip with the given id and reports the remaining list to the
// parent. Does nothing when no change handler was provided.
const removeComposerChip = useCallback(
  (chipId: string) => {
    // Optional call short-circuits: the filter is not run without a handler.
    onComposerChipsChange?.(
      resolvedComposerChips.filter((entry) => entry.chipId !== chipId),
    );
  },
  [onComposerChipsChange, resolvedComposerChips],
);

// Builds the "verify_outlier" composer chip for one outlier row. The chip id
// combines the selected session id (empty when none) with the outlier key, and
// `capturedAt` is stamped with the current time on every call.
const buildOutlierChip = useCallback(
  (outlier: typeof topOutliers[number]) => {
    const { agreementRate, scoreDelta } = outlier;

    let agreementText = "agreement unavailable";
    if (agreementRate !== null) {
      agreementText = `agreement ${Math.round(agreementRate * 100)}%`;
    }

    let deltaText = "delta unavailable";
    if (scoreDelta !== null) {
      deltaText = `delta ${scoreDelta}`;
    }

    const sessionPart = selectedSessionId ?? "";
    return {
      chipId: `verify:${sessionPart}:${outlier.key}`,
      source: "verify_outlier" as const,
      workspaceId: selectedSessionId ?? undefined,
      // Prefer the max-side run/score and fall back to the min-side one.
      runId: outlier.maxRunId ?? outlier.minRunId,
      capturedAt: new Date().toISOString(),
      outlierKey: outlier.key,
      instability: outlier.instability,
      score: outlier.maxScore ?? outlier.minScore ?? undefined,
      message:
        `Verify outlier ${outlier.label}: ${agreementText}, ${deltaText}, samples ${outlier.sampleSize}`,
      enabled: true,
    };
  },
  [selectedSessionId],
);

useEffect(() => {
if (!setNavActions) return;
Expand Down Expand Up @@ -828,13 +915,63 @@ function VerifyPage(
>
<div className="verify-outlier-header">
<strong>{outlier.label}</strong>
<Badge
variant={outlier.instability
? "error"
: "completed"}
<div
style={{
display: "flex",
alignItems: "center",
gap: "8px",
}}
>
{outlier.instability ? "Unstable" : "Stable"}
</Badge>
{outlier.minScore === outlier.maxScore
? (
<Badge
variant={scoreBadgeVariant(
outlier.minScore,
)}
>
{formatSignedScore(outlier.minScore)}
</Badge>
)
: (
<div
style={{
display: "flex",
alignItems: "center",
}}
>
<Badge
variant={scoreBadgeVariant(
outlier.minScore,
)}
style={{
borderTopRightRadius: 0,
borderBottomRightRadius: 0,
}}
>
{formatSignedScore(outlier.minScore)}
</Badge>
<Badge
variant={scoreBadgeVariant(
outlier.maxScore,
)}
style={{
marginLeft: "-1px",
borderTopLeftRadius: 0,
borderBottomLeftRadius: 0,
}}
>
{formatSignedScore(outlier.maxScore)}
</Badge>
</div>
)}
<Badge
variant={outlier.instability
? "error"
: "completed"}
>
{outlier.instability ? "Unstable" : "Stable"}
</Badge>
</div>
</div>
<div className="verify-outlier-meta">
agreement {outlier.agreementRate === null
Expand All @@ -848,6 +985,31 @@ function VerifyPage(
? ` · ref ${outlier.messageRefId}`
: ""}
</div>
{(() => {
const outlierChip = buildOutlierChip(outlier);
const inChat = composerChipIds.has(
outlierChip.chipId,
);
return (
<div className="workbench-summary-actions">
<Button
variant="secondary"
size="small"
onClick={() =>
inChat
? removeComposerChip(
outlierChip.chipId,
)
: addComposerChip(outlierChip)}
disabled={!onComposerChipsChange}
>
{inChat
? "Remove from chat"
: "Add to chat"}
</Button>
</div>
);
})()}
{runLinks.length > 0 && (
<div className="verify-outlier-links">
{runLinks.map((runId) => {
Expand Down
7 changes: 5 additions & 2 deletions simulator-ui/src/WorkbenchDrawer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -387,20 +387,23 @@ export default function WorkbenchDrawer(props: WorkbenchDrawerProps) {
let didChange = false;
const syncedChips = composerChips.filter((chip) => {
if (chip.source === "message_rating") {
if (!chip.chipId.startsWith("rating:")) return true;
const stillExists = ratingByChipId.has(chip.chipId);
if (!stillExists) didChange = true;
return stillExists;
}
if (chip.source === "grading_flag") {
if (!chip.chipId.startsWith("flag:")) return true;
const stillExists = flagByChipId.has(chip.chipId);
if (!stillExists) didChange = true;
return stillExists;
}
return true;
}).map((chip) => {
const latest = chip.source === "message_rating"
const latest = chip.source === "message_rating" &&
chip.chipId.startsWith("rating:")
? ratingByChipId.get(chip.chipId)
: chip.source === "grading_flag"
: chip.source === "grading_flag" && chip.chipId.startsWith("flag:")
? flagByChipId.get(chip.chipId)
: undefined;
if (!latest) return chip;
Expand Down
24 changes: 24 additions & 0 deletions simulator-ui/src/gds/WorkbenchComposerChip.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ type WorkbenchComposerChipContext =
source: "grading_flag";
message: string;
score?: number;
}
| {
source: "verify_outlier";
message: string;
instability?: boolean;
score?: number;
};

function formatScoreLabel(score: number): string {
Expand All @@ -29,6 +35,7 @@ function formatContextLabel(context: WorkbenchComposerChipContext): string {
if (context.source === "message_rating") {
return formatScoreLabel(context.score);
}
if (context.source === "verify_outlier") return "Verify";
return "Flag";
}

Expand All @@ -41,10 +48,23 @@ function formatContextTooltip(context: WorkbenchComposerChipContext): string {
return context.reason?.trim() ||
`Rating ${formatScoreLabel(context.score)}`;
case "grading_flag":
case "verify_outlier":
return context.message;
}
}

/**
 * Chooses the score styling class for a verify-outlier chip by delegating to
 * `getScoreClass`.
 *
 * Precedence: a known `instability` flag wins (unstable is styled like a
 * score of -1, stable like +1); otherwise a finite `score` is used; otherwise
 * the chip is styled like a score of 0.
 */
function getVerifyOutlierClass(
  context: Extract<WorkbenchComposerChipContext, { source: "verify_outlier" }>,
): string {
  const { instability, score } = context;
  if (instability === true) return getScoreClass(-1);
  if (instability === false) return getScoreClass(1);
  return getScoreClass(
    typeof score === "number" && Number.isFinite(score) ? score : 0,
  );
}

export default function WorkbenchComposerChip(
props:
& Omit<React.HTMLAttributes<HTMLDivElement>, "children">
Expand Down Expand Up @@ -83,6 +103,8 @@ export default function WorkbenchComposerChip(
context.source === "grading_flag" && "workbench-context-chip--flag",
context.source === "grading_flag" && typeof score === "number" &&
getScoreClass(score),
context.source === "verify_outlier" && "workbench-context-chip--flag",
context.source === "verify_outlier" && getVerifyOutlierClass(context),
context.source === "message_rating" && "workbench-context-chip--rating",
context.source === "message_rating" && typeof score === "number" &&
getScoreClass(score),
Expand All @@ -101,6 +123,8 @@ export default function WorkbenchComposerChip(
const isPassive = !showToggle && !showRemove;
const content = context.source === "grading_flag"
? <Icon name="flag" size={10} />
: context.source === "verify_outlier"
? "Verify"
: label;

if (isPassive) {
Expand Down
3 changes: 3 additions & 0 deletions simulator-ui/src/main.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,8 @@ function App() {
setNavActions={setNavActions}
onAppPathChange={handleAppPathChange}
activeWorkspaceId={activeWorkspaceId}
composerChips={workbenchComposerChips}
onComposerChipsChange={setWorkbenchComposerChips}
/>
);
case "grade":
Expand Down Expand Up @@ -1702,6 +1704,7 @@ function App() {
setNavActions,
simulatorBasePath,
testBotResetToken,
workbenchComposerChips,
workspacesApi,
]);

Expand Down