Skip to content

Commit de45e24

Browse files
authored
fix: use idle-based CPU formula in dashboard charts and alert evaluator (#54)
* fix: handle BigInt serialization in REST API v1 detail endpoints

  GET /api/v1/pipelines/:id and /api/v1/nodes/:id return 500 because NodePipelineStatus contains BigInt fields (eventsIn, eventsOut, etc.) that JSON.stringify cannot serialize. Adds a jsonResponse() helper using a custom replacer to convert BigInts to numbers.

* fix: use idle-based CPU formula in dashboard charts and alert evaluator

  The fleet detail page correctly uses (total - idle) / total for CPU%, but the dashboard charts and alert evaluator still used the old cpuDelta / wallClockSeconds formula, which gives per-core percentages (e.g. 787% on an 8-core machine). This aligns all CPU calculations to the same idle-based formula clamped to 0-100%.

  Affected code paths:
  - dashboard.chartMetrics: chart CPU time-series
  - dashboard.nodeCards: sparkline CPU values
  - alert-evaluator getCpuUsage: alert threshold checks
1 parent bbcb3a7 commit de45e24

2 files changed

Lines changed: 29 additions & 17 deletions

File tree

src/server/routers/dashboard.ts

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ export const dashboardRouter = router({
149149
memoryUsedBytes: true,
150150
memoryTotalBytes: true,
151151
cpuSecondsTotal: true,
152+
cpuSecondsIdle: true,
152153
},
153154
})
154155
: [];
@@ -227,11 +228,22 @@ export const dashboardRouter = router({
227228
unhealthyPipelines,
228229
rates: { eventsIn: eventsInRate, eventsOut: eventsOutRate, bytesIn: bytesInRate, bytesOut: bytesOutRate, errors: errorsRate },
229230
totals: { eventsIn: totalEventsIn, eventsOut: totalEventsOut, bytesIn: totalBytesIn, bytesOut: totalBytesOut, errors: totalErrors },
230-
sparkline: (metricsByNode.get(node.id) ?? []).map((m) => ({
231-
t: m.timestamp.getTime(),
232-
mem: m.memoryTotalBytes ? Number(m.memoryUsedBytes) / Number(m.memoryTotalBytes) * 100 : 0,
233-
cpu: Number(m.cpuSecondsTotal ?? 0),
234-
})),
231+
sparkline: (metricsByNode.get(node.id) ?? []).map((m, i, arr) => {
232+
let cpu = 0;
233+
if (i > 0) {
234+
const prev = arr[i - 1];
235+
const totalDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal;
236+
const idleDelta = m.cpuSecondsIdle - prev.cpuSecondsIdle;
237+
if (totalDelta > 0) {
238+
cpu = Math.max(0, Math.min(100, ((totalDelta - idleDelta) / totalDelta) * 100));
239+
}
240+
}
241+
return {
242+
t: m.timestamp.getTime(),
243+
mem: m.memoryTotalBytes ? Number(m.memoryUsedBytes) / Number(m.memoryTotalBytes) * 100 : 0,
244+
cpu,
245+
};
246+
}),
235247
};
236248
});
237249
}),
@@ -683,6 +695,7 @@ export const dashboardRouter = router({
683695
nodeId: true,
684696
timestamp: true,
685697
cpuSecondsTotal: true,
698+
cpuSecondsIdle: true,
686699
memoryUsedBytes: true,
687700
memoryTotalBytes: true,
688701
diskReadBytes: true,
@@ -803,6 +816,7 @@ export const dashboardRouter = router({
803816
nodeId: string;
804817
timestamp: Date;
805818
cpuSecondsTotal: number;
819+
cpuSecondsIdle: number;
806820
memoryUsedBytes: bigint;
807821
memoryTotalBytes: bigint;
808822
diskReadBytes: bigint;
@@ -826,8 +840,11 @@ export const dashboardRouter = router({
826840
const dtSec = (t - new Date(prev.timestamp).getTime()) / 1000;
827841
if (dtSec <= 0) continue;
828842

829-
const cpuDelta = curr.cpuSecondsTotal - prev.cpuSecondsTotal;
830-
const cpuPct = Math.max(0, Math.min(100, (cpuDelta / dtSec) * 100));
843+
const cpuTotalDelta = curr.cpuSecondsTotal - prev.cpuSecondsTotal;
844+
const cpuIdleDelta = curr.cpuSecondsIdle - prev.cpuSecondsIdle;
845+
const cpuPct = cpuTotalDelta > 0
846+
? Math.max(0, Math.min(100, ((cpuTotalDelta - cpuIdleDelta) / cpuTotalDelta) * 100))
847+
: 0;
831848
addPoint(cpu, label, t, cpuPct);
832849

833850
const memTotal = Number(curr.memoryTotalBytes);

src/server/services/alert-evaluator.ts

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,22 +43,17 @@ async function getCpuUsage(nodeId: string): Promise<number | null> {
4343
where: { nodeId },
4444
orderBy: { timestamp: "desc" },
4545
take: 2,
46-
select: { cpuSecondsTotal: true, timestamp: true },
46+
select: { cpuSecondsTotal: true, cpuSecondsIdle: true },
4747
});
4848

4949
if (rows.length < 2) return null;
5050

5151
const [newer, older] = rows;
52-
const dtSeconds =
53-
(newer.timestamp.getTime() - older.timestamp.getTime()) / 1000;
54-
if (dtSeconds <= 0) return null;
52+
const totalDelta = newer.cpuSecondsTotal - older.cpuSecondsTotal;
53+
if (totalDelta <= 0) return null; // counter reset or no change
5554

56-
// cpuSecondsTotal is cumulative; the delta / wall-clock-delta gives
57-
// fraction of one core used. Multiply by 100 for a percentage.
58-
const cpuDelta = newer.cpuSecondsTotal - older.cpuSecondsTotal;
59-
if (cpuDelta < 0) return null; // counter reset
60-
61-
return (cpuDelta / dtSeconds) * 100;
55+
const idleDelta = newer.cpuSecondsIdle - older.cpuSecondsIdle;
56+
return Math.max(0, Math.min(100, ((totalDelta - idleDelta) / totalDelta) * 100));
6257
}
6358

6459
/** Compute memory usage percentage from the latest NodeMetric row. */

0 commit comments

Comments
 (0)