diff --git a/simulator-ui/src/GradePage.tsx b/simulator-ui/src/GradePage.tsx index d3fc9c05..44a4abca 100644 --- a/simulator-ui/src/GradePage.tsx +++ b/simulator-ui/src/GradePage.tsx @@ -807,6 +807,9 @@ function GradePage(
Grader runs + + ({filteredSessionRuns.length}) +
{runItems.length === 0 && ( diff --git a/simulator-ui/src/VerifyPage.tsx b/simulator-ui/src/VerifyPage.tsx index e7351c2d..b036eb3b 100644 --- a/simulator-ui/src/VerifyPage.tsx +++ b/simulator-ui/src/VerifyPage.tsx @@ -69,6 +69,14 @@ type VerifyRunSampleResponse = { error?: string; }; +type VerifyReportScope = "current_batch" | "all_matching"; +type VerifyExampleSort = + | "default" + | "delta_desc" + | "agreement_asc" + | "samples_desc" + | "label_asc"; + const parseScenarioRunSummary = (value: unknown): ScenarioRunSummary | null => { if (!value || typeof value !== "object") return null; const summary = value as Record; @@ -199,6 +207,11 @@ function VerifyPage( initialRunIds: [], requests: [], }); + const [reportScope, setReportScope] = useState( + "all_matching", + ); + const [inconsistentOnly, setInconsistentOnly] = useState(false); + const [exampleSort, setExampleSort] = useState("default"); const batchSeqRef = useRef(0); const updateVerifyPath = useCallback((sessionId: string | null) => { @@ -545,10 +558,10 @@ function VerifyPage( const reportRuns = useMemo( () => - completedBatchRuns.length > 0 + reportScope === "current_batch" ? completedBatchRuns : historicalCompletedRuns, - [completedBatchRuns, historicalCompletedRuns], + [completedBatchRuns, historicalCompletedRuns, reportScope], ); const consistencyReport = useMemo( @@ -569,7 +582,43 @@ function VerifyPage( batchState.status !== "running", ); - const topOutliers = consistencyReport.outliers.slice(0, 8); + const displayedOutliers = useMemo(() => { + const filtered = inconsistentOnly + ? consistencyReport.outliers.filter((outlier) => outlier.instability) + : consistencyReport.outliers; + if (exampleSort === "default") return filtered; + const next = [...filtered]; + if (exampleSort === "delta_desc") { + next.sort((a, b) => { + const aDelta = a.scoreDelta ?? -1; + const bDelta = b.scoreDelta ?? -1; + if (aDelta !== bDelta) return bDelta - aDelta; + return a.label.localeCompare(b.label); + }); + return next; + } + if (exampleSort === "agreement_asc") { + next.sort((a, b) => { + const aAgreement = a.agreementRate ?? Number.POSITIVE_INFINITY; + const bAgreement = b.agreementRate ?? Number.POSITIVE_INFINITY; + if (aAgreement !== bAgreement) return aAgreement - bAgreement; + return a.label.localeCompare(b.label); + }); + return next; + } + if (exampleSort === "samples_desc") { + next.sort((a, b) => { + if (a.sampleSize !== b.sampleSize) return b.sampleSize - a.sampleSize; + return a.label.localeCompare(b.label); + }); + return next; + } + next.sort((a, b) => a.label.localeCompare(b.label)); + return next; + }, [consistencyReport.outliers, exampleSort, inconsistentOnly]); + const reportScopeLabel = reportScope === "current_batch" + ? "current batch" + : "all matching runs"; const resolvedComposerChips = useMemo( () => composerChips ?? [], [composerChips], @@ -612,7 +661,7 @@ function VerifyPage( }, [onComposerChipsChange, resolvedComposerChips]); const buildOutlierChip = useCallback( - (outlier: typeof topOutliers[number]) => { + (outlier: typeof consistencyReport.outliers[number]) => { const chipId = `verify:${selectedSessionId ?? ""}:${outlier.key}`; const runId = outlier.maxRunId ?? outlier.minRunId; const score = outlier.maxScore ?? outlier.minScore ?? undefined; @@ -774,23 +823,11 @@ function VerifyPage( {!loading && ( <>
-
+
Batch status -
- - {batchState.status} - - {batchState.startedAt - ? ` · started ${ - formatTimestampShort(batchState.startedAt) - }` - : ""} - {batchState.finishedAt - ? ` · finished ${ - formatTimestampShort(batchState.finishedAt) - }` - : ""} -
+ + {batchState.status} +
{consistencyReport.sampleSize > 0 && ( )}
- {batchState.requested > 0 && ( -
- Queued: {queuedCount} - Running: {batchState.active} - Completed: {batchState.completed} - Failed: {batchState.failed} -
- )} {batchState.status === "idle" && consistencyReport.sampleSize === 0 && ( @@ -818,57 +847,87 @@ function VerifyPage( instability for the selected grader. )} - {consistencyReport.sampleSize > 0 && ( - <> -
-
+
+
+
+
Sample size
{consistencyReport.sampleSize}
-
-
Agreement rate
-
- {consistencyReport.agreementRate === null - ? "—" - : `${ - Math.round(consistencyReport.agreementRate * 100) - }%`} -
-
-
-
- Score spread (min/median/max) -
-
- {consistencyReport.scoreSpreadMin === null - ? "—" - : `${consistencyReport.scoreSpreadMin} / ${ - consistencyReport.scoreSpreadMedian ?? "—" - } / ${consistencyReport.scoreSpreadMax ?? "—"}`} -
-
-
-
- Instability count -
-
- {consistencyReport.instabilityCount} -
+
+ + setReportScope(value as VerifyReportScope)} + size="small" + popoverMatchTriggerWidth={false} + popoverMinWidth={320} + popoverAlign="right" + options={[ + { + value: "current_batch", + label: + `Current batch (${completedBatchRuns.length})`, + triggerLabel: "Current batch", + triggerMeta: null, + meta: "Only runs from the latest batch launch", + }, + { + value: "all_matching", + label: + `All matching runs (${historicalCompletedRuns.length})`, + triggerLabel: "All matching runs", + triggerMeta: null, + meta: + "All runs matching selected scenario + grader", + }, + ]} + />
- - {consistencyReport.verdictReason} - - +
+
+
Agreement rate
+
+ {consistencyReport.agreementRate === null + ? "—" + : `${Math.round(consistencyReport.agreementRate * 100)}%`} +
+
+
+
+ Score spread (min/median/max) +
+
+ {consistencyReport.scoreSpreadMin === null + ? "—" + : `${consistencyReport.scoreSpreadMin} / ${ + consistencyReport.scoreSpreadMedian ?? "—" + } / ${consistencyReport.scoreSpreadMax ?? "—"}`} +
+
+
+
+ Instability count +
+
+ {consistencyReport.instabilityCount} +
+
+
+ {consistencyReport.sampleSize > 0 && ( + + {consistencyReport.verdictReason} + )} Min sample size: {VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} @@ -889,17 +948,49 @@ function VerifyPage( {VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount}.
- Most inconsistent examples - {topOutliers.length === 0 +
+ Examples +
+ +
+ + setExampleSort(value as VerifyExampleSort)} + size="small" + options={[ + { value: "default", label: "Sort: default" }, + { value: "delta_desc", label: "Sort: score delta" }, + { + value: "agreement_asc", + label: "Sort: agreement", + }, + { value: "samples_desc", label: "Sort: sample size" }, + { value: "label_asc", label: "Sort: label" }, + ]} + /> +
+
+
+ {displayedOutliers.length === 0 ? ( - Inconsistent examples will appear here as soon as at least - one completed run is available in this batch. + {consistencyReport.outliers.length === 0 + ? `Examples will appear here as soon as at least one completed run is available in ${reportScopeLabel}.` + : `No examples match the current filters in ${reportScopeLabel}.`} ) : (
- {topOutliers.map((outlier) => { + {displayedOutliers.map((outlier) => { const runLinks = (() => { if (!selectedSessionId) return []; const ids = [ @@ -973,64 +1064,72 @@ function VerifyPage(
-
- agreement {outlier.agreementRate === null - ? "—" - : `${Math.round(outlier.agreementRate * 100)}%`} - {" "} - · delta {outlier.scoreDelta ?? "—"} · samples{" "} - {outlier.sampleSize} - {outlier.passFlip ? " · pass/fail flip" : ""} - {outlier.messageRefId - ? ` · ref ${outlier.messageRefId}` - : ""} -
- {(() => { - const outlierChip = buildOutlierChip(outlier); - const inChat = composerChipIds.has( - outlierChip.chipId, - ); - return ( -
- +
+
+
+ agreement {outlier.agreementRate === null + ? "—" + : `${ + Math.round(outlier.agreementRate * 100) + }%`} · delta {outlier.scoreDelta ?? "—"} + {" "} + · samples {outlier.sampleSize} + {outlier.passFlip ? " · pass/fail flip" : ""} + {outlier.messageRefId + ? ` · ref ${outlier.messageRefId}` + : ""}
- ); - })()} - {runLinks.length > 0 && ( -
- {runLinks.map((runId) => { - if (!selectedSessionId) return null; - const href = buildGradePath( - selectedSessionId, - runId, - ); - return ( - - handleInternalLinkClick(event, href)} - > - Open grade run {runId} - - ); - })} + {runLinks.length > 0 && ( +
+ {runLinks.map((runId) => { + if (!selectedSessionId) return null; + const href = buildGradePath( + selectedSessionId, + runId, + ); + return ( + + handleInternalLinkClick( + event, + href, + )} + > + Open grade run {runId} + + ); + })} +
+ )}
- )} + {(() => { + const outlierChip = buildOutlierChip(outlier); + const inChat = composerChipIds.has( + outlierChip.chipId, + ); + return ( +
+ +
+ ); + })()} +
); })} @@ -1040,6 +1139,29 @@ function VerifyPage( {batchState.requests.length > 0 && (
Batch requests + {(batchState.startedAt || + batchState.finishedAt) && ( +
+ {batchState.startedAt + ? `started ${ + formatTimestampShort(batchState.startedAt) + }` + : ""} + {batchState.finishedAt + ? ` · finished ${ + formatTimestampShort(batchState.finishedAt) + }` + : ""} +
+ )} + {batchState.requested > 0 && ( +
+ Queued: {queuedCount} + Running: {batchState.active} + Completed: {batchState.completed} + Failed: {batchState.failed} +
+ )}
    {batchState.requests.map((request, index) => (
  • (null); @@ -66,18 +76,39 @@ export default function Listbox(props: ListboxProps) { } return null; }, [options, value]); + const selectedTriggerMeta = useMemo(() => { + if (!selected) return null; + if ("triggerMeta" in selected) return selected.triggerMeta ?? null; + return selected.meta ?? null; + }, [selected]); const updatePopover = useCallback(() => { const trigger = triggerRef.current; if (!trigger) return; const rect = trigger.getBoundingClientRect(); - setPopoverStyle({ + const style: React.CSSProperties = { position: "fixed", top: rect.bottom + 6, - left: rect.left, - width: rect.width, - }); - }, []); + }; + if (popoverMatchTriggerWidth) { + style.width = rect.width; + } else if ( + typeof popoverMinWidth === "number" && Number.isFinite(popoverMinWidth) + ) { + style.minWidth = popoverMinWidth; + } + if (popoverAlign === "right") { + style.left = popoverMatchTriggerWidth + ? rect.right - rect.width + : rect.right; + if (!popoverMatchTriggerWidth) { + style.transform = "translateX(-100%)"; + } + } else { + style.left = rect.left; + } + setPopoverStyle(style); + }, [popoverAlign, popoverMatchTriggerWidth, popoverMinWidth]); useLayoutEffect(() => { if (!open) return; @@ -117,8 +148,12 @@ export default function Listbox(props: ListboxProps) { }; }, [open, updatePopover]); + const rootClassName = size === "small" + ? "gds-listbox gds-listbox--size-small" + : "gds-listbox"; + return ( -
    +
    {label && (