Skip to content

Commit 805a3b5

Browse files
authored
Merge branch 'main' into arekay/add_concurrency_sweep
2 parents 828582d + 24272d0 commit 805a3b5

5 files changed

Lines changed: 2140 additions & 32 deletions

File tree

docs/CLIENT_PERFORMANCE_TUNING.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,49 @@ cpu_affinity: -1 # Auto: physical core isolation with SMT siblings
4646
- **Linux only**: Uses `os.sched_setaffinity()` and sysfs for topology detection
4747
- **Non-Linux**: Affinity settings are skipped with a warning
4848
- **Performance ranking**: Uses ACPI CPPC `highest_perf`, ARM `cpu_capacity`, or `cpuinfo_max_freq` (in order of preference)
49+
50+
## Finding Optimal Worker Count
51+
52+
Optimal worker count depends on your workload — prompt size, streaming mode, and connection count all affect throughput. Use the benchmark script to sweep worker counts against your expected prompt lengths and pick the configuration that maximizes recv rate.
53+
54+
### Full sweep
55+
56+
```bash
57+
python -m inference_endpoint.utils.benchmark_httpclient --full -d 5
58+
python -m inference_endpoint.utils.benchmark_httpclient --full -d 5 --stream
59+
```
60+
61+
Runs all common worker counts against a range of prompt lengths (CPU pinning is on by default). Produces a plot at `/tmp/sweep_*.png` showing send/recv rate per configuration, with shaded variation bands and a stall% overlay.
62+
63+
With `--stream`, the full sweep also varies stream interval (0%, 50%, 100% of prompt length) and adds an SSE-pkts/s subplot. Streaming typically requires more workers to sustain the same recv rate because each response involves many SSE events that must be parsed individually.
64+
65+
### Targeted sweeps
66+
67+
```bash
68+
# Sweep workers for a specific prompt length
69+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 -d 10
70+
71+
# Sweep workers with explicit values
72+
python -m inference_endpoint.utils.benchmark_httpclient -w 1,2,4,8,12,16 -l 4096 -d 10
73+
74+
# Cartesian product: workers x prompt lengths
75+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16::8 -l 128,1024,8192 -d 5
76+
77+
# Streaming: sweep workers with a fixed stream interval (chars per SSE event)
78+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 --stream --stream-interval 100 -d 5
79+
80+
# Streaming: sweep stream intervals (total events = ceil(output_length / interval))
81+
python -m inference_endpoint.utils.benchmark_httpclient -w 8 --stream --stream-interval 1,50,500 -d 5
82+
```
83+
84+
### Reading the results
85+
86+
- **Send Rate**: requests/s the client can issue. Higher is better.
87+
- **Recv Rate**: responses/s received. This is the effective throughput.
88+
- **SSE-pkts/s**: SSE events received per second (streaming mode only). Derived from `recv_rate * events_per_response`. Use this to gauge how the client handles high packet rates at different stream intervals.
89+
- **Stall%**: fraction of send time spent blocked on back-pressure (inflight limit). High stall% indicates client-side overhead — the client can't process responses fast enough to make room for new sends. The target server (MaxThroughputServer) returns pre-built responses with no compute, so stall is purely client overhead.
90+
- **Variation bands**: shaded region shows min/max per-second rate during each run. Wide bands indicate instability.
91+
92+
Pick the worker count where recv rate peaks and stall% is low.
93+
94+
For streaming workloads, also watch **SSE-pkts/s** — a small stream interval (fine-grained events) dramatically increases packet rate and may require more workers to keep up. If SSE-pkts/s plateaus while recv rate drops, the client is bottlenecked on SSE parsing overhead.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ test = [
8484
"scipy==1.16.3",
8585
# HTTP server and client for mock server fixture
8686
"aiohttp==3.13.3",
87+
# Plotting for benchmark sweep mode
88+
"matplotlib>=3.8.0",
8789
]
8890
performance = [
8991
"pytest-benchmark>=4.0.0",

src/inference_endpoint/endpoint_client/utils.py

Lines changed: 29 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,60 +15,61 @@
1515

1616
"""Utility functions for the endpoint client module."""
1717

18+
import subprocess
1819
import sys
1920

2021
# Platform detection
2122
_IS_LINUX = sys.platform.startswith("linux")
23+
_IS_DARWIN = sys.platform == "darwin"
2224

2325

2426
def get_ephemeral_port_range() -> tuple[int, int]:
    """Get the ephemeral port range from system config.

    On Linux, reads from /proc/sys/net/ipv4/ip_local_port_range.
    On macOS, reads from sysctl net.inet.ip.portrange.first/last.

    Returns:
        Tuple of (low, high) port numbers.

    Raises:
        OSError: On platforms other than Linux and macOS.
    """
    if sys.platform.startswith("linux"):
        # The kernel exposes the range as two whitespace-separated integers.
        try:
            with open("/proc/sys/net/ipv4/ip_local_port_range") as src:
                first, last = (int(tok) for tok in src.read().split())
        except (OSError, ValueError):
            # Fall back to the typical Linux default range.
            return 32768, 60999
        return first, last

    if sys.platform == "darwin":

        def _sysctl_int(key: str) -> int:
            # Query a single sysctl value and parse it as an integer.
            raw = subprocess.check_output(["sysctl", "-n", key], text=True)
            return int(raw.strip())

        try:
            return (
                _sysctl_int("net.inet.ip.portrange.first"),
                _sysctl_int("net.inet.ip.portrange.last"),
            )
        except (OSError, ValueError, subprocess.CalledProcessError):
            # Fall back to the standard macOS ephemeral range.
            return 49152, 65535

    raise OSError(f"Ephemeral port range detection is not supported on {sys.platform}.")
4760

4861

4962
def get_used_port_count() -> int:
5063
"""Count TCP sockets using ephemeral ports.
51-
52-
Only counts sockets whose local port is in the ephemeral range.
53-
Excludes LISTEN sockets (servers) and sockets on well-known ports.
5464
On Linux, reads from /proc/net/tcp and /proc/net/tcp6.
5565
56-
/proc/net/tcp format (hex values):
57-
sl local_address rem_address st tx_queue rx_queue ...
58-
0: 0100007F:1F90 00000000:0000 0A ...
59-
^^^^^^^^:^^^^
60-
IP addr port (hex)
61-
6266
Returns:
6367
Number of TCP sockets using ephemeral ports.
64-
65-
Raises:
66-
OSError: If not running on Linux.
6768
"""
69+
if _IS_DARWIN:
70+
return 0
6871
if not _IS_LINUX:
69-
raise OSError(
70-
f"TCP socket counting is only supported on Linux, not {sys.platform}."
71-
)
72+
raise OSError(f"TCP socket counting is not supported on {sys.platform}.")
7273

7374
low, high = get_ephemeral_port_range()
7475
count = 0
@@ -83,7 +84,6 @@ def get_used_port_count() -> int:
8384
if len(parts) < 2:
8485
continue
8586
local_addr = parts[1] # e.g., "0100007F:1F90"
86-
# Port is after the colon, in hex
8787
port_hex = local_addr.split(":")[1]
8888
port = int(port_hex, 16)
8989
if low <= port <= high:
@@ -102,9 +102,6 @@ def get_ephemeral_port_limit() -> int:
102102
103103
Returns:
104104
Number of available ephemeral ports.
105-
106-
Raises:
107-
OSError: If not running on Linux.
108105
"""
109106
low, high = get_ephemeral_port_range()
110107
total_range = high - low + 1

0 commit comments

Comments
 (0)