Skip to content

Commit 805a3b5

Browse files
authored
Merge branch 'main' into arekay/add_concurrency_sweep
2 parents 828582d + 24272d0 commit 805a3b5

5 files changed

Lines changed: 2140 additions & 32 deletions

File tree

docs/CLIENT_PERFORMANCE_TUNING.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,49 @@ cpu_affinity: -1 # Auto: physical core isolation with SMT siblings
4646
- **Linux only**: Uses `os.sched_setaffinity()` and sysfs for topology detection
4747
- **Non-Linux**: Affinity settings are skipped with a warning
4848
- **Performance ranking**: Uses ACPI CPPC `highest_perf`, ARM `cpu_capacity`, or `cpuinfo_max_freq` (in order of preference)
49+
50+
## Finding Optimal Worker Count
51+
52+
Optimal worker count depends on your workload — prompt size, streaming mode, and connection count all affect throughput. Use the benchmark script to sweep worker counts against your expected prompt lengths and pick the configuration that maximizes recv rate.
53+
54+
### Full sweep
55+
56+
```bash
57+
python -m inference_endpoint.utils.benchmark_httpclient --full -d 5
58+
python -m inference_endpoint.utils.benchmark_httpclient --full -d 5 --stream
59+
```
60+
61+
Runs all common worker counts against a range of prompt lengths (CPU pinning is on by default). Produces a plot at `/tmp/sweep_*.png` showing send/recv rate per configuration, with shaded variation bands and a stall% overlay.
62+
63+
With `--stream`, the full sweep also varies stream interval (0%, 50%, 100% of prompt length) and adds an SSE-pkts/s subplot. Streaming typically requires more workers to sustain the same recv rate because each response involves many SSE events that must be parsed individually.
64+
65+
### Targeted sweeps
66+
67+
```bash
68+
# Sweep workers for a specific prompt length
69+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 -d 10
70+
71+
# Sweep workers with explicit values
72+
python -m inference_endpoint.utils.benchmark_httpclient -w 1,2,4,8,12,16 -l 4096 -d 10
73+
74+
# Cartesian product: workers x prompt lengths
75+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16::8 -l 128,1024,8192 -d 5
76+
77+
# Streaming: sweep workers with a fixed stream interval (chars per SSE event)
78+
python -m inference_endpoint.utils.benchmark_httpclient -w 1:16 -l 4096 --stream --stream-interval 100 -d 5
79+
80+
# Streaming: sweep stream intervals (total events = ceil(output_length / interval))
81+
python -m inference_endpoint.utils.benchmark_httpclient -w 8 --stream --stream-interval 1,50,500 -d 5
82+
```
83+
84+
### Reading the results
85+
86+
- **Send Rate**: requests/s the client can issue. Higher is better.
87+
- **Recv Rate**: responses/s received. This is the effective throughput.
88+
- **SSE-pkts/s**: SSE events received per second (streaming mode only). Derived from `recv_rate * events_per_response`. Use this to gauge how the client handles high packet rates at different stream intervals.
89+
- **Stall%**: fraction of send time spent blocked on back-pressure (inflight limit). High stall% indicates client-side overhead — the client can't process responses fast enough to make room for new sends. The target server (MaxThroughputServer) returns pre-built responses with no compute, so stall is purely client overhead.
90+
- **Variation bands**: shaded region shows min/max per-second rate during each run. Wide bands indicate instability.
91+
92+
Pick the worker count where recv rate peaks and stall% is low.
93+
94+
For streaming workloads, also watch **SSE-pkts/s** — a small stream interval (fine-grained events) dramatically increases packet rate and may require more workers to keep up. If SSE-pkts/s plateaus while recv rate drops, the client is bottlenecked on SSE parsing overhead.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ test = [
8484
"scipy==1.16.3",
8585
# HTTP server and client for mock server fixture
8686
"aiohttp==3.13.3",
87+
# Plotting for benchmark sweep mode
88+
"matplotlib>=3.8.0",
8789
]
8890
performance = [
8991
"pytest-benchmark>=4.0.0",

src/inference_endpoint/endpoint_client/utils.py

Lines changed: 29 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,60 +15,61 @@
1515

1616
"""Utility functions for the endpoint client module."""
1717

18+
import subprocess
1819
import sys
1920

2021
# Platform detection
2122
_IS_LINUX = sys.platform.startswith("linux")
23+
_IS_DARWIN = sys.platform == "darwin"
2224

2325

2426
def get_ephemeral_port_range() -> tuple[int, int]:
    """Get the ephemeral port range from system config.

    On Linux, reads from /proc/sys/net/ipv4/ip_local_port_range.
    On macOS, reads from sysctl net.inet.ip.portrange.first/last.

    Returns:
        Tuple of (low, high) port numbers.

    Raises:
        OSError: On platforms other than Linux and macOS.
    """
    if sys.platform.startswith("linux"):
        # The kernel exposes the range as two whitespace-separated integers.
        try:
            with open("/proc/sys/net/ipv4/ip_local_port_range") as src:
                first, last = (int(tok) for tok in src.read().split())
        except (OSError, ValueError):
            # Fall back to the typical Linux default range.
            return 32768, 60999
        return first, last

    if sys.platform == "darwin":

        def _sysctl_int(key: str) -> int:
            # Query a single sysctl value and parse it as an integer.
            raw = subprocess.check_output(["sysctl", "-n", key], text=True)
            return int(raw.strip())

        try:
            return (
                _sysctl_int("net.inet.ip.portrange.first"),
                _sysctl_int("net.inet.ip.portrange.last"),
            )
        except (OSError, ValueError, subprocess.CalledProcessError):
            # Fall back to the standard macOS ephemeral range.
            return 49152, 65535

    raise OSError(f"Ephemeral port range detection is not supported on {sys.platform}.")
4760

4861

4962
def get_used_port_count() -> int:
5063
"""Count TCP sockets using ephemeral ports.
51-
52-
Only counts sockets whose local port is in the ephemeral range.
53-
Excludes LISTEN sockets (servers) and sockets on well-known ports.
5464
On Linux, reads from /proc/net/tcp and /proc/net/tcp6.
5565
56-
/proc/net/tcp format (hex values):
57-
sl local_address rem_address st tx_queue rx_queue ...
58-
0: 0100007F:1F90 00000000:0000 0A ...
59-
^^^^^^^^:^^^^
60-
IP addr port (hex)
61-
6266
Returns:
6367
Number of TCP sockets using ephemeral ports.
64-
65-
Raises:
66-
OSError: If not running on Linux.
6768
"""
69+
if _IS_DARWIN:
70+
return 0
6871
if not _IS_LINUX:
69-
raise OSError(
70-
f"TCP socket counting is only supported on Linux, not {sys.platform}."
71-
)
72+
raise OSError(f"TCP socket counting is not supported on {sys.platform}.")
7273

7374
low, high = get_ephemeral_port_range()
7475
count = 0
@@ -83,7 +84,6 @@ def get_used_port_count() -> int:
8384
if len(parts) < 2:
8485
continue
8586
local_addr = parts[1] # e.g., "0100007F:1F90"
86-
# Port is after the colon, in hex
8787
port_hex = local_addr.split(":")[1]
8888
port = int(port_hex, 16)
8989
if low <= port <= high:
@@ -102,9 +102,6 @@ def get_ephemeral_port_limit() -> int:
102102
103103
Returns:
104104
Number of available ephemeral ports.
105-
106-
Raises:
107-
OSError: If not running on Linux.
108105
"""
109106
low, high = get_ephemeral_port_range()
110107
total_range = high - low + 1

0 commit comments

Comments
 (0)