fix: use idle-based CPU formula in dashboard charts and alert evaluator (#54)

TerrifiedBug · web-flow · commit de45e246c72a · 2026-03-07T19:01:27.000Z
* fix: handle BigInt serialization in REST API v1 detail endpoints

GET /api/v1/pipelines/:id and /api/v1/nodes/:id return 500 because
NodePipelineStatus contains BigInt fields (eventsIn, eventsOut, etc.)
that JSON.stringify cannot serialize. Adds a jsonResponse() helper
using a custom replacer to convert BigInts to numbers.

* fix: use idle-based CPU formula in dashboard charts and alert evaluator

The fleet detail page correctly uses (total - idle) / total for CPU%,
but the dashboard charts and alert evaluator still used the old
cpuDelta / wallClockSeconds formula, which yields a percentage of a
single core and can therefore exceed 100% on multi-core machines
(e.g. 787% on an 8-core machine). This aligns all CPU calculations on
the same idle-based formula, clamped to 0-100%.

Affected code paths:
- dashboard.chartMetrics: chart CPU time-series
- dashboard.nodeCards: sparkline CPU values
- alert-evaluator getCpuUsage: alert threshold checks
diff --git a/src/server/routers/dashboard.ts b/src/server/routers/dashboard.ts
@@ -149,6 +149,7 @@ export const dashboardRouter = router({
             memoryUsedBytes: true,
             memoryTotalBytes: true,
             cpuSecondsTotal: true,
+            cpuSecondsIdle: true,
           },
         })
       : [];
@@ -227,11 +228,22 @@ export const dashboardRouter = router({
         unhealthyPipelines,
         rates: { eventsIn: eventsInRate, eventsOut: eventsOutRate, bytesIn: bytesInRate, bytesOut: bytesOutRate, errors: errorsRate },
         totals: { eventsIn: totalEventsIn, eventsOut: totalEventsOut, bytesIn: totalBytesIn, bytesOut: totalBytesOut, errors: totalErrors },
-        sparkline: (metricsByNode.get(node.id) ?? []).map((m) => ({
-          t: m.timestamp.getTime(),
-          mem: m.memoryTotalBytes ? Number(m.memoryUsedBytes) / Number(m.memoryTotalBytes) * 100 : 0,
-          cpu: Number(m.cpuSecondsTotal ?? 0),
-        })),
+        sparkline: (metricsByNode.get(node.id) ?? []).map((m, i, arr) => {
+          let cpu = 0;
+          if (i > 0) {
+            const prev = arr[i - 1];
+            const totalDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal;
+            const idleDelta = m.cpuSecondsIdle - prev.cpuSecondsIdle;
+            if (totalDelta > 0) {
+              cpu = Math.max(0, Math.min(100, ((totalDelta - idleDelta) / totalDelta) * 100));
+            }
+          }
+          return {
+            t: m.timestamp.getTime(),
+            mem: m.memoryTotalBytes ? Number(m.memoryUsedBytes) / Number(m.memoryTotalBytes) * 100 : 0,
+            cpu,
+          };
+        }),
       };
     });
   }),
@@ -683,6 +695,7 @@ export const dashboardRouter = router({
                 nodeId: true,
                 timestamp: true,
                 cpuSecondsTotal: true,
+                cpuSecondsIdle: true,
                 memoryUsedBytes: true,
                 memoryTotalBytes: true,
                 diskReadBytes: true,
@@ -803,6 +816,7 @@ export const dashboardRouter = router({
         nodeId: string;
         timestamp: Date;
         cpuSecondsTotal: number;
+        cpuSecondsIdle: number;
         memoryUsedBytes: bigint;
         memoryTotalBytes: bigint;
         diskReadBytes: bigint;
@@ -826,8 +840,11 @@ export const dashboardRouter = router({
           const dtSec = (t - new Date(prev.timestamp).getTime()) / 1000;
           if (dtSec <= 0) continue;
 
-          const cpuDelta = curr.cpuSecondsTotal - prev.cpuSecondsTotal;
-          const cpuPct = Math.max(0, Math.min(100, (cpuDelta / dtSec) * 100));
+          const cpuTotalDelta = curr.cpuSecondsTotal - prev.cpuSecondsTotal;
+          const cpuIdleDelta = curr.cpuSecondsIdle - prev.cpuSecondsIdle;
+          const cpuPct = cpuTotalDelta > 0
+            ? Math.max(0, Math.min(100, ((cpuTotalDelta - cpuIdleDelta) / cpuTotalDelta) * 100))
+            : 0;
           addPoint(cpu, label, t, cpuPct);
 
           const memTotal = Number(curr.memoryTotalBytes);
diff --git a/src/server/services/alert-evaluator.ts b/src/server/services/alert-evaluator.ts
@@ -43,22 +43,17 @@ async function getCpuUsage(nodeId: string): Promise<number | null> {
     where: { nodeId },
     orderBy: { timestamp: "desc" },
     take: 2,
-    select: { cpuSecondsTotal: true, timestamp: true },
+    select: { cpuSecondsTotal: true, cpuSecondsIdle: true },
   });
 
   if (rows.length < 2) return null;
 
   const [newer, older] = rows;
-  const dtSeconds =
-    (newer.timestamp.getTime() - older.timestamp.getTime()) / 1000;
-  if (dtSeconds <= 0) return null;
+  const totalDelta = newer.cpuSecondsTotal - older.cpuSecondsTotal;
+  if (totalDelta <= 0) return null; // counter reset or no change
 
-  // cpuSecondsTotal is cumulative; the delta / wall-clock-delta gives
-  // fraction of one core used. Multiply by 100 for a percentage.
-  const cpuDelta = newer.cpuSecondsTotal - older.cpuSecondsTotal;
-  if (cpuDelta < 0) return null; // counter reset
-
-  return (cpuDelta / dtSeconds) * 100;
+  const idleDelta = newer.cpuSecondsIdle - older.cpuSecondsIdle;
+  return Math.max(0, Math.min(100, ((totalDelta - idleDelta) / totalDelta) * 100));
 }
 
 /** Compute memory usage percentage from the latest NodeMetric row. */