Skip to content

Commit 57b16eb

Browse files
committed
Add latency percentiles calculation
1 parent 3816d53 commit 57b16eb

File tree

3 files changed

+97
-5
lines changed

3 files changed

+97
-5
lines changed

ci/tests/run_perf_test.js

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,12 @@ async function main() {
9090

9191
if (concurrentRun) {
9292
console.log(`Running ${modeLabel} Producer/Consumer test (concurrently)...`);
93-
const INITIAL_DELAY_MS = 2000;
93+
const INITIAL_DELAY_MS = 10000;
9494
const TERMINATE_TIMEOUT_MS = process.env.TERMINATE_TIMEOUT_MS ? +process.env.TERMINATE_TIMEOUT_MS : 600000;
9595
// Wait INITIAL_DELAY_MS more to see if all lag is caught up, start earlier than the producer to check
9696
// E2E latencies more accurately.
97-
const TERMINATE_TIMEOUT_MS_CONSUMERS = TERMINATE_TIMEOUT_MS + INITIAL_DELAY_MS * 2;
97+
const TERMINATE_TIMEOUT_MS_CONSUMERS = TERMINATE_TIMEOUT_MS + INITIAL_DELAY_MS + 2000;
98+
const TERMINATE_TIMEOUT_MS_LAG_MONITORING = TERMINATE_TIMEOUT_MS + 1000;
9899

99100
await runCommand(`MODE=${mode} node performance-consolidated.js --create-topics`);
100101
const allPromises = [];
@@ -106,10 +107,10 @@ async function main() {
106107
allPromises.push(runCommand(`MODE=${mode} MESSAGE_COUNT=${messageCount} INITIAL_DELAY_MS=0 TERMINATE_TIMEOUT_MS=${TERMINATE_TIMEOUT_MS_CONSUMERS} GROUPID_BATCH=${groupIdEachBatch} node performance-consolidated.js --consumer-each-batch ${produceToSecondTopicParam}`));
107108
}
108109
if (consumerModeAll || consumerModeEachMessage) {
109-
allPromises.push(runCommand(`MODE=${mode} INITIAL_DELAY_MS=0 TERMINATE_TIMEOUT_MS=${TERMINATE_TIMEOUT_MS_CONSUMERS} GROUPID_MONITOR=${groupIdEachMessage} node performance-consolidated.js --monitor-lag`));
110+
allPromises.push(runCommand(`MODE=${mode} INITIAL_DELAY_MS=${INITIAL_DELAY_MS} TERMINATE_TIMEOUT_MS=${TERMINATE_TIMEOUT_MS_LAG_MONITORING} GROUPID_MONITOR=${groupIdEachMessage} node performance-consolidated.js --monitor-lag`));
110111
}
111112
if (consumerModeAll || consumerModeEachBatch) {
112-
allPromises.push(runCommand(`MODE=${mode} INITIAL_DELAY_MS=0 TERMINATE_TIMEOUT_MS=${TERMINATE_TIMEOUT_MS_CONSUMERS} GROUPID_MONITOR=${groupIdEachBatch} node performance-consolidated.js --monitor-lag`));
113+
allPromises.push(runCommand(`MODE=${mode} INITIAL_DELAY_MS=${INITIAL_DELAY_MS} TERMINATE_TIMEOUT_MS=${TERMINATE_TIMEOUT_MS_LAG_MONITORING} GROUPID_MONITOR=${groupIdEachBatch} node performance-consolidated.js --monitor-lag`));
113114
}
114115
const results = await Promise.allSettled(allPromises);
115116
return results.map(r => r.status === 'fulfilled' ? r.value : '').join('\n');

examples/performance/performance-consolidated.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ function logParameters(parameters) {
5353
}
5454
}
5555

56+
/**
 * Print one log line per latency percentile.
 *
 * Accepts either an array of `{ percentile, value }` entries — the shape
 * produced by runConsumer's stats (`stats.percentilesTOT1/TOT2` are built with
 * `.map((value, index) => ({ percentile, value }))`) — or a plain object
 * mapping percentile -> value, for backward compatibility.
 *
 * Note: iterating the stats array with `Object.entries` would yield the array
 * *indexes* as "percentiles" and the entry objects as values, so
 * `value.toFixed(2)` would throw a TypeError; hence the explicit array branch.
 *
 * @param {Array<{percentile: number, value: number}>|Object<string, number>} percentiles
 * @param {string} type - label appended to the log line, e.g. 'T0-T1 (eachMessage)'
 */
function printPercentiles(percentiles, type) {
    const entries = Array.isArray(percentiles)
        ? percentiles
        : Object.entries(percentiles).map(([percentile, value]) => ({ percentile, value }));
    for (const { percentile, value } of entries) {
        // Right-align labels like "P50" / "P99.99" into a 6-character column.
        const percentileStr = `P${percentile}`.padStart(6, ' ');
        console.log(`=== Consumer ${percentileStr} E2E latency ${type}: ${value.toFixed(2)} ms`);
    }
}
62+
5663
(async function () {
5764
const producer = process.argv.includes('--producer');
5865
const consumer = process.argv.includes('--consumer');
@@ -169,10 +176,11 @@ function logParameters(parameters) {
169176
endTrackingMemory('consumer-each-message', `consumer-memory-message-${mode}.json`);
170177
console.log("=== Consumer Rate MB/s (eachMessage): ", consumerRate);
171178
console.log("=== Consumer Rate msg/s (eachMessage): ", stats.messageRate);
172-
console.log("=== Consumer average E2E latency T0-T1 (eachMessage): ", stats.avgLatencyT0T1);
179+
printPercentiles(stats.percentilesTOT1, 'T0-T1 (eachMessage)');
173180
console.log("=== Consumer max E2E latency T0-T1 (eachMessage): ", stats.maxLatencyT0T1);
174181
if (produceToSecondTopic) {
175182
console.log("=== Consumer average E2E latency T0-T2 (eachMessage): ", stats.avgLatencyT0T2);
183+
printPercentiles(stats.percentilesTOT2, 'T0-T2 (eachMessage)');
176184
console.log("=== Consumer max E2E latency T0-T2 (eachMessage): ", stats.maxLatencyT0T2);
177185
}
178186
console.log("=== Consumption time (eachMessage): ", stats.durationSeconds);
@@ -197,9 +205,11 @@ function logParameters(parameters) {
197205
console.log("=== Max eachBatch lag: ", stats.maxOffsetLag);
198206
console.log("=== Average eachBatch size: ", stats.averageBatchSize);
199207
console.log("=== Consumer average E2E latency T0-T1 (eachBatch): ", stats.avgLatencyT0T1);
208+
printPercentiles(stats.percentilesTOT1, 'T0-T1 (eachBatch)');
200209
console.log("=== Consumer max E2E latency T0-T1 (eachBatch): ", stats.maxLatencyT0T1);
201210
if (produceToSecondTopic) {
202211
console.log("=== Consumer average E2E latency T0-T2 (eachBatch): ", stats.avgLatencyT0T2);
212+
printPercentiles(stats.percentilesTOT2, 'T0-T2 (eachBatch)');
203213
console.log("=== Consumer max E2E latency T0-T2 (eachBatch): ", stats.maxLatencyT0T2);
204214
}
205215
console.log("=== Consumption time (eachBatch): ", stats.durationSeconds);

examples/performance/performance-primitives-common.js

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
const { hrtime } = require('process');
22
const { randomBytes } = require('crypto');
3+
const PERCENTILES = [50, 75, 90, 95, 99, 99.9, 99.99, 100];
34

45
const TERMINATE_TIMEOUT_MS = process.env.TERMINATE_TIMEOUT_MS ? +process.env.TERMINATE_TIMEOUT_MS : 600000;
56
const AUTO_COMMIT = process.env.AUTO_COMMIT || 'false';
@@ -58,8 +59,78 @@ function genericProduceToTopic(producer, topic, messages) {
5859
});
5960
}
6061

62+
63+
// We use a simple count-sketch for latency percentiles to avoid storing all
// latencies in memory, because we're also measuring the memory usage of the
// consumer as part of the performance tests.
//
// Values are mapped to exponentially-spaced buckets so that each bucket spans
// [x * (1 - error), x * (1 + error)); reported percentiles therefore carry at
// most ~`error` relative error while using O(numBuckets) memory.
class LatencyCountSketch {
    #numBuckets;
    #maxValue;
    #buckets;       // bucket boundary values, ascending; [0] = 0, [last] = +Infinity
    #counts;        // per-bucket observation counts, parallel to #buckets
    #findNextMinimum;       // multiplicative step between adjacent bucket boundaries
    #changeBaseLogarithm;   // ln(step), used to change log base when locating a bucket
    #totalCount = 0;

    /**
     * @param {object} [options]
     * @param {number} [options.numBuckets=600] - number of exponential buckets.
     * @param {number} [options.error=0.01] - target relative error (1%).
     * @param {number} [options.maxValue=60000] - largest expected latency (ms);
     *   values above it land in a catch-all overflow bucket.
     */
    constructor({
        numBuckets = 600,
        error = 0.01, // 1% error
        maxValue = 60000, // max 60s latency
    } = {}) {
        // Each bucket represents [x * (1 - error), x * (1 + error))
        this.#numBuckets = numBuckets;
        this.#findNextMinimum = 1 / (1 + error) * (1 - error);
        // Change base from natural log to log base findNextMinimum
        this.#changeBaseLogarithm = Math.log(this.#findNextMinimum);

        this.#maxValue = maxValue;
        // Two extra slots: index 0 anchors at 0, index numBuckets+1 is the
        // +Infinity sentinel so overflow values still have an upper boundary.
        this.#buckets = new Array(this.#numBuckets + 2).fill(0);
        this.#buckets[this.#numBuckets + 1] = Number.POSITIVE_INFINITY;
        this.#buckets[this.#numBuckets] = this.#maxValue;
        this.#buckets[0] = 0;
        // Fill boundaries downward from maxValue, shrinking by the step factor.
        let i = this.#numBuckets - 1;
        let currentValue = maxValue;
        while (i >= 1) {
            const nextMinimum = currentValue * this.#findNextMinimum;
            this.#buckets[i] = nextMinimum;
            currentValue = nextMinimum;
            i--;
        }
        this.#counts = new Array(this.#numBuckets + 2).fill(0);
    }

    /**
     * Record one latency observation (O(1), no allocation).
     * @param {number} latency - latency in milliseconds; must be > 0.
     */
    add(latency) {
        // log_base(step) of (latency / maxValue) gives the distance, in
        // buckets, below the top bucket; clamp into [0, length - 2].
        let idx = Math.floor(this.#numBuckets - Math.log(latency / this.#maxValue) / this.#changeBaseLogarithm);
        idx = idx < 0 ? 0 :
            idx > this.#buckets.length - 2 ? this.#buckets.length - 2 :
            idx;

        this.#counts[idx]++;
        this.#totalCount++;
    }

    /**
     * Compute approximate latency values for the requested percentiles.
     * @param {number[]} percentilesArray - percentiles in ascending order,
     *   e.g. [50, 99, 100]; values in (0, 100].
     * @returns {number[]} bucket-boundary latency for each percentile, in the
     *   same order (within the configured relative error).
     */
    percentiles(percentilesArray) {
        const percentileCounts = percentilesArray.map(p => Math.ceil(this.#totalCount * p / 100));
        const percentileResults = new Array(percentilesArray.length);
        let totalCountSoFar = 0;
        // Single forward sweep over the buckets: works because the requested
        // percentiles are ascending, so `j` never needs to move backward.
        let j = 0;
        for (let i = 0; i < percentileCounts.length; i++) {
            while (totalCountSoFar < percentileCounts[i] && j < this.#counts.length - 1) {
                totalCountSoFar += this.#counts[j];
                j++;
            }
            const bucketIndex = j < this.#counts.length - 1 ? j : this.#counts.length - 2;
            percentileResults[i] = this.#buckets[bucketIndex];
        }
        return percentileResults;
    }
}
127+
61128
async function runConsumer(consumer, topic, warmupMessages, totalMessageCnt, eachBatch, partitionsConsumedConcurrently, stats, actionOnMessages) {
62129
const handlers = installHandlers(totalMessageCnt === -1);
130+
if (stats) {
131+
stats.percentilesTOT1 = new LatencyCountSketch({});
132+
stats.percentilesTOT2 = new LatencyCountSketch({});
133+
}
63134
while (true) {
64135
try {
65136
await consumer.connect();
@@ -106,6 +177,7 @@ async function runConsumer(consumer, topic, warmupMessages, totalMessageCnt, eac
106177
stats.maxLatencyT0T1 = Math.max(stats.maxLatencyT0T1, latency);
107178
stats.avgLatencyT0T1 = ((stats.avgLatencyT0T1 * (numMessages - 1)) + latency) / numMessages;
108179
}
180+
stats.percentilesTOT1.add(latency);
109181
} else {
110182
if (!stats.maxLatencyT0T2) {
111183
stats.maxLatencyT0T2 = latency;
@@ -114,6 +186,7 @@ async function runConsumer(consumer, topic, warmupMessages, totalMessageCnt, eac
114186
stats.maxLatencyT0T2 = Math.max(stats.maxLatencyT0T2, latency);
115187
stats.avgLatencyT0T2 = ((stats.avgLatencyT0T2 * (numMessages - 1)) + latency) / numMessages;
116188
}
189+
stats.percentilesTOT2.add(latency);
117190
}
118191
};
119192

@@ -257,6 +330,14 @@ async function runConsumer(consumer, topic, warmupMessages, totalMessageCnt, eac
257330
stats.messageRate = durationSeconds > 0 ?
258331
(messagesMeasured / durationSeconds) : Infinity;
259332
stats.durationSeconds = durationSeconds;
333+
stats.percentilesTOT1 = stats.percentilesTOT1.percentiles(PERCENTILES).map((value, index) => ({
334+
percentile: PERCENTILES[index],
335+
value,
336+
}));
337+
stats.percentilesTOT2 = stats.percentilesTOT2.percentiles(PERCENTILES).map((value, index) => ({
338+
percentile: PERCENTILES[index],
339+
value,
340+
}));
260341
}
261342
removeHandlers(handlers);
262343
return rate;

0 commit comments

Comments
 (0)