diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md new file mode 100644 index 00000000..287abe8e --- /dev/null +++ b/docs/host_side_dispatcher_design_gemini.md @@ -0,0 +1,514 @@ +# Host-Side Spin-Polling Dispatcher with Dynamic Worker Pool + +## Design Specification + +**Component**: `cudaq-qec` Realtime Decoding Subsystem +**Status**: Implemented +**Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher +**Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) +**Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` +**Last Updated**: 2026-03-17 + +--- + +## 1. System Context & Motivation + +### 1.1 The Pipeline +The system performs real-time quantum error correction (QEC). An FPGA streams syndrome measurements into a host-device shared ring buffer continuously (~104 µs cadence for d=13, T=104). +1. **Predecoding (GPU)**: TensorRT neural network inference (~88 µs pure GPU compute for d=13/T=104 with FP16; ~146 µs p50 in pipeline with DMA and dispatch overhead). +2. **Global Decoding (CPU)**: PyMatching (MWPM) (~224 µs average for d=13/T=104 with full 17,472-detector parity check matrix). + +### 1.2 The Problem +The legacy architecture used a persistent GPU kernel to launch child CUDA graphs using `cudaStreamGraphFireAndForget`. This hit a hardcoded CUDA runtime limit of 128 cumulative launches, causing fatal crashes. A naive host-side port mapping FPGA slots 1:1 to GPU streams caused **Head-of-Line (HOL) blocking**: a single slow PyMatching decode would stall the sequential dispatcher, backing up the ring buffer and violating strict quantum coherence latency budgets. + +### 1.3 The Solution +This document defines a **Host-Side Dispatcher with a Dynamic Worker Pool**. +* The dispatcher runs on a dedicated CPU core. +* Predecoder streams and CPU workers act as an interchangeable pool. 
+* Inflight jobs are tagged with their origin slot, allowing out-of-order execution and completion. +* Synchronization relies exclusively on Grace Blackwell's NVLink-C2C hardware using libcu++ system-scope atomics. +* **Decoupled architecture**: PyMatching decode runs in a separate thread pool from the predecoder workers, allowing GPU streams to be released immediately after inference completion rather than blocking on CPU decode. + +--- + +## 2. Core Architecture: Dynamic Worker Pool + +Instead of mapping predecoder streams statically to incoming data, the host dispatcher maintains a bitmask of available workers (`idle_mask`). + +1. **Allocate**: When `rx_flags[slot]` indicates new data, the dispatcher finds the first available worker stream using a hardware bit-scan (`__builtin_ffsll`). +2. **Tag**: The dispatcher records the original `slot` in a tracking array (`inflight_slot_tags[worker_id]`) so the response can be routed correctly. +3. **Pre-launch DMA**: If a `pre_launch_fn` callback is registered on the worker, the dispatcher calls it to issue a `cudaMemcpyAsync` (DMA engine copy) of the input payload from the ring buffer to the TRT input buffer before graph launch. +4. **Dispatch**: The dispatcher launches the CUDA graph on the assigned worker's stream and clears its availability bit. +5. **Free**: The predecoder worker thread (not the PyMatching thread) restores the worker's availability bit in the `idle_mask` after copying inference output and enqueuing the PyMatching job. Slot completion is deferred to the PyMatching thread pool. + +--- + +## 3. Memory & Synchronization Model + +**CRITICAL DIRECTIVE**: The ARM Neoverse architecture (Grace) is **weakly ordered**. Code generated from this document MUST NOT use `volatile`, `__threadfence_system()`, or `std::atomic_thread_fence`. + +All shared state must use **libcu++ system-scope atomics** allocated in mapped pinned memory (`cudaHostAllocMapped`). 
+ +### 3.1 Shared State Variables + +| Variable | Type | Memory Location | Purpose | +| :--- | :--- | :--- | :--- | +| `rx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | FPGA writes data ptr; CPU polls (Acquire). | +| `tx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | CPU writes response; FPGA polls (Release). | +| `ready_flags[1]` | `atomic` | Mapped Pinned | GPU signals TRT done; CPU polls (Release/Acquire). Queue depth = 1. | +| `idle_mask` | `atomic` | Host CPU Mem | Bitmask of free workers. 1 = free, 0 = busy. | +| `inflight_slot_tags[NUM_WORKERS]`| `int` (Plain array) | Host CPU Mem | Maps `worker_id` -> original FPGA `slot`. | +| `mailbox_bank[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Dispatcher writes device ptr for pre-launch callback. | +| `h_ring_ptrs[1]` | `void*` (Mapped Pinned) | Mapped Pinned | Pre-launch callback writes slot device ptr for CPU worker readback. | +| `h_predecoder_outputs_[1]` | `void*` (Mapped Pinned) | Mapped Pinned | GPU output copied here via DMA; CPU worker reads inference results. | + +**NUM_SLOTS**: 16 (ring buffer capacity). +**NUM_WORKERS**: 8 (predecoder streams, each with a dedicated CPU poller thread). +**Queue depth**: 1 per predecoder (single in-flight inference per stream). + +--- + +## 4. Host Dispatcher Thread (Producer) + +The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. It is implemented in `realtime/lib/daemon/dispatcher/host_dispatcher.cu` as `cudaq_host_dispatcher_loop()`. 
+ +### 4.1 cudaq_host_dispatch_worker_t Structure + +Each worker in the pool has the following fields: + +```cpp +typedef struct { + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t function_id; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* pre_launch_data; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* post_launch_data; +} cudaq_host_dispatch_worker_t; +``` + +The `pre_launch_fn` callback enables the dispatcher to issue a `cudaMemcpyAsync` (using the DMA copy engine) for the input payload before each graph launch, without baking application-specific logic into the generic dispatcher. The `post_launch_fn` callback is used in GPU-only mode to enqueue a `cudaLaunchHostFunc` that signals slot completion without CPU worker threads. + +### 4.2 Dispatcher Logic (Pseudocode) +```cpp +void cudaq_host_dispatcher_loop(const cudaq_host_dispatcher_config_t *config) { + size_t current_slot = 0; + + while (config.shutdown_flag->load(acquire) == 0) { + uint64_t rx_value = config.rx_flags[current_slot].load(acquire); + if (rx_value == 0) { QEC_CPU_RELAX(); continue; } + + void* slot_host = reinterpret_cast(rx_value); + + // Optional: parse RPC header and lookup function table + if (use_function_table) { + ParsedSlot parsed = parse_slot_with_function_table(slot_host, config); + if (parsed.drop) { clear_and_advance(); continue; } + } + + // Wait for an available worker (spin if all busy) + int worker_id = acquire_graph_worker(config, ...); + if (worker_id < 0) { QEC_CPU_RELAX(); continue; } + + // Mark worker busy, tag with origin slot + config.idle_mask->fetch_and(~(1ULL << worker_id), release); + config.inflight_slot_tags[worker_id] = current_slot; + + // Translate host ptr to device ptr, write to mailbox + ptrdiff_t offset = (uint8_t*)slot_host - config.rx_data_host; + void* data_dev = config.rx_data_dev + offset; + config.h_mailbox_bank[worker_id] = data_dev; + __sync_synchronize(); + 
+ // Pre-launch callback: DMA copy input to TRT buffer + if (worker.pre_launch_fn) + worker.pre_launch_fn(worker.pre_launch_data, data_dev, worker.stream); + + // Launch graph + cudaError_t err = cudaGraphLaunch(worker.graph_exec, worker.stream); + if (err != cudaSuccess) { + tx_flags[current_slot].store(0xDEAD|err, release); + idle_mask->fetch_or(1ULL << worker_id, release); + } else { + tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release); + } + + // Post-launch callback (GPU-only mode: enqueue cudaLaunchHostFunc) + if (worker.post_launch_fn) + worker.post_launch_fn(worker.post_launch_data, data_dev, worker.stream); + + // Consume slot and advance + rx_flags[current_slot].store(0, release); + current_slot = (current_slot + 1) % num_slots; + } + for (auto& w : config.workers) cudaStreamSynchronize(w.stream); +} +``` + +--- + +## 5. GPU Graph Composition & Data Transfer + +### 5.1 DMA-Based Data Movement + +Data copies between the ring buffer and TRT inference buffers use the GPU's DMA copy engine rather than SM-based kernels, freeing compute resources for inference. + +**Input copy (ring buffer -> TRT input)**: Issued by the host dispatcher via `pre_launch_fn` callback as a `cudaMemcpyAsync(DeviceToDevice)` on the worker's stream *before* `cudaGraphLaunch`. The source address is dynamic (determined at dispatch time from the ring buffer slot at offset `CUDAQ_RPC_HEADER_SIZE` = 24 bytes), so it cannot be baked into the captured graph. + +**Output copy (TRT output -> host-mapped outputs)**: Captured inside the CUDA graph as a `cudaMemcpyAsync(DeviceToDevice)`. Both source (`d_trt_output_`) and destination (`d_predecoder_outputs_`) are fixed addresses, so this is captured at graph instantiation time. + +### 5.2 Captured CUDA Graph Contents + +The CUDA graph for each predecoder contains (in order): + +1. **TRT inference** (`context_->enqueueV3(stream)`) -- or `passthrough_copy_kernel` if `SKIP_TRT` is set. +2. 
**Output DMA copy** (`cudaMemcpyAsync` D2D) -- copies TRT output to host-mapped predecoder output buffer (`h_predecoder_outputs_`). +3. **Signal kernel** (`predecoder_signal_ready_kernel<<<1,1>>>`) -- a single-thread kernel that performs `d_ready_flags[0].store(1, release)` to notify the CPU worker. + +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. Input data arrives exclusively via the pre-launch DMA copy callback; no input-copy kernel exists in the graph. + +### 5.3 Source Files + +The `ai_predecoder_service.cu` implementation contains only two device kernels: + +- `predecoder_signal_ready_kernel` -- single-thread kernel that atomically stores `1` to the ready flag with system-scope release semantics. +- `passthrough_copy_kernel` -- vectorized identity copy (`uint4` 16-byte loads/stores, 256 threads) used when `SKIP_TRT` is set, substituting for TRT inference. + +### 5.4 Passthrough Copy Kernel (SKIP_TRT mode) + +When `SKIP_TRT` is set, the `passthrough_copy_kernel` substitutes for TRT inference, providing a deterministic identity function for testing and benchmarking the infrastructure overhead. In SKIP_TRT mode, the `AIDecoderService` constructor sets `input_size_ = output_size_ = 1600 * sizeof(float)` (6400 bytes) without loading any model file. + +--- + +## 6. Decoupled Worker Architecture + +The CPU-side processing uses a **two-tier decoupled architecture** that separates GPU polling from CPU-intensive decode: + +### 6.1 Tier 1: Predecoder Workers (GPU Polling + Copy) + +Each predecoder has a dedicated worker thread in the `RealtimePipeline`. These threads: + +1. **Poll** `ready_flags[0]` via `compare_exchange_strong(1, 2, acquire, relaxed)` (CAS claiming). +2. **Copy** inference output from `h_predecoder_outputs_` to a per-slot buffer (`deferred_outputs[origin_slot]`). +3. **Compute** syndrome density metrics (input vs. output nonzero detector counts). +4. 
**Release** the GPU predecoder slot via `release_job(slot_idx)` → `ready_flags[0].store(0, release)`. +5. **Enqueue** a `PyMatchJob` to the `PyMatchQueue`. +6. **Return** `DEFERRED_COMPLETION` to the pipeline, which releases `idle_mask` but does NOT set `tx_flags`. + +### 6.2 Tier 2: PyMatching Workers (CPU Decode + Completion) + +A separate thread pool (16 workers for d13_r104) dequeues from `PyMatchQueue` and: + +1. **Decode** using a per-thread PyMatching decoder instance (lock-free `thread_local` acquisition). +2. **Write** the RPC response (`DecodeResponse`) directly into the ring buffer slot. +3. **Signal** slot completion via `pipeline.complete_deferred(origin_slot)`, which stores the slot host address into `tx_flags[origin_slot]`. + +### 6.3 Why Decouple? + +With the coupled architecture, a single slow PyMatching decode (up to 139 ms tail latency) would hold the predecoder worker busy, preventing the GPU stream from being dispatched new work. This caused: +- Severe head-of-line blocking on `idle_mask` +- ~41M backpressure stalls +- Tail latencies: p90 = 970 µs, p99 = 1,767 µs + +The decoupled architecture reduces predecoder worker hold time from ~214 µs to ~10 µs, dropping: +- Backpressure stalls from 41M to 6.2M (85% reduction) +- p90 from 970 µs to 515 µs (47% reduction) +- p99 from 1,767 µs to 1,249 µs (29% reduction) + +### 6.4 DEFERRED_COMPLETION Protocol + +``` +Pipeline Worker Thread: PyMatching Thread: + poll_next_job() → CAS 1→2 (blocked on queue) + copy output to deferred_buf | + release_job() → store 0 | + enqueue PyMatchJob ──────────► pop PyMatchJob + return DEFERRED_COMPLETION decode with PyMatching + pipeline sets idle_mask ✓ write RPC response + pipeline skips tx_flags ✗ complete_deferred(slot) + └──► tx_flags[slot].store(addr) +``` + +### 6.5 PyMatchQueue + +Thread-safe MPSC queue using `std::mutex` + `std::condition_variable`: + +```cpp +struct PyMatchJob { + int origin_slot; + uint64_t request_id; + void *ring_buffer_ptr; +}; + +class 
PyMatchQueue { + std::mutex mtx_; + std::condition_variable cv_; + std::queue<PyMatchJob> jobs_; + bool stop_ = false; +public: + void push(PyMatchJob &&j); + bool pop(PyMatchJob &out); // blocks until job available or shutdown + void shutdown(); +}; +``` + +### 6.6 Ready-Flag State Machine (Atomic Claiming) + +With queue depth 1, the poller must **claim** each completion exactly once. + +**States** (per-worker ready flag): + +| Value | State | Meaning | +| :--- | :--- | :--- | +| 0 | Idle | Waiting for GPU, or worker has called `release_job`. | +| 1 | Ready | GPU finished; signal kernel stored 1. | +| 2 | Processing | CPU poller claimed the job; copying output. | + +**Poller**: Use `compare_exchange_strong(expected=1, desired=2, memory_order_acquire, memory_order_relaxed)`. Only the thread that wins the CAS proceeds. Use **relaxed on failure** so spin-polling does not add barriers that delay seeing the GPU's store(1). + +**Worker**: When output is copied and job is enqueued, call `release_job(slot_idx)` which does `ready_flags[0].store(0, release)` so the slot is Idle for the next launch. + +--- + +## 7. Out-of-Order Consumer + +The consumer thread harvests completions **out-of-order** by scanning all active slots on every iteration, rather than waiting for a sequential `next_harvest` counter. This eliminates head-of-line blocking where a slow request in slot N would prevent harvesting faster completions in slot N+1. 
+ +### 7.1 Consumer Logic (Pseudocode) +```cpp +while (!consumer_stop) { + bool found_any = false; + for (uint32_t s = 0; s < NUM_SLOTS; ++s) { + if (!slot_occupied[s]) continue; + + cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag(&rb, s, &err); + + if (status == CUDAQ_TX_READY) { + int rid = slot_request[s]; + complete_ts[rid] = now(); + completed[rid] = true; + total_completed++; + + slot_occupied[s] = 0; // Reset occupancy FIRST + __sync_synchronize(); // ARM memory fence + cudaq_host_ringbuffer_clear_slot(&rb, s); // Then clear tx_flags + found_any = true; + } + } + if (!found_any) QEC_CPU_RELAX(); +} +``` + +### 7.2 Consumer-Producer Race Fix + +On ARM's weakly ordered memory model, the consumer must reset `slot_occupied[s] = 0` **before** clearing `tx_flags[s]` (via `cudaq_host_ringbuffer_clear_slot`), with a `__sync_synchronize()` fence between them. Without this ordering: +1. Consumer clears `tx_flags[s]` (slot appears free to producer) +2. Producer writes new `slot_occupied[s] = 1` +3. Consumer's delayed `slot_occupied[s] = 0` clobbers the producer's write + +This race caused exactly one request to get "stuck" indefinitely, eventually stalling the entire pipeline through backpressure. + +--- + +## 8. RealtimePipeline Scaffolding + +The low-level dispatcher, consumer, and worker threads are wrapped by a higher-level `RealtimePipeline` class (`libs/qec/include/cudaq/qec/realtime/pipeline.h`) that hides all ring buffer management, atomics, and thread lifecycle. Application code provides three callbacks: + +1. **GPU stage factory** (`GpuStageFactory`): Called once per worker during `start()`. Returns the `cudaGraphExec_t`, `cudaStream_t`, `pre_launch_fn`, `post_launch_fn`, `function_id`, and an opaque `user_context` for each worker. +2. **CPU stage callback** (`CpuStageCallback`): Called by each worker thread when GPU inference completes. 
Receives `CpuStageContext` containing `gpu_output`, `gpu_output_size`, `response_buffer`, and the `user_context`. Returns the number of bytes written, `0` if no result ready (poll again), or `DEFERRED_COMPLETION` to release the worker without signaling slot completion. +3. **Completion callback** (`CompletionCallback`): Called by the consumer thread for each completed (or errored) request with a `Completion` struct. + +```cpp +RealtimePipeline pipeline(config); +pipeline.set_gpu_stage([&](int worker_id) -> GpuWorkerResources { ... }); +pipeline.set_cpu_stage([&](const CpuStageContext& ctx) -> size_t { ... }); +pipeline.set_completion_handler([&](const Completion& c) { ... }); +auto injector = pipeline.create_injector(); +pipeline.start(); +injector.submit(function_id, payload, payload_size, request_id); +// ... +pipeline.stop(); +``` + +### 8.1 DEFERRED_COMPLETION + +When the CPU stage callback returns `DEFERRED_COMPLETION` (= `SIZE_MAX`), the pipeline: +- Sets the worker's bit in `idle_mask` (worker is free for next dispatch) +- Does NOT write to `tx_flags[origin_slot]` (slot stays IN_FLIGHT) + +The caller is responsible for eventually calling `pipeline.complete_deferred(slot)`, which stores the slot host address into `tx_flags[slot]` with release semantics, making the completion visible to the consumer. + +### 8.2 GPU-Only Mode + +If no `CpuStageCallback` is registered, the pipeline operates in **GPU-only mode**: no CPU worker threads are spawned. Instead, the dispatcher's `post_launch_fn` enqueues a `cudaLaunchHostFunc` on each worker stream. When the GPU finishes, the CUDA runtime calls the host function, which stores into `tx_flags` and restores the `idle_mask` bit — all from the CUDA callback thread. + +### 8.3 RingBufferInjector + +The `RingBufferInjector` class (created via `pipeline.create_injector()`) encapsulates the host-side submission logic for testing without FPGA hardware. 
It provides: + +- `try_submit()`: Non-blocking, returns false on backpressure. +- `submit()`: Blocking spin-wait until a slot becomes available. +- `backpressure_stalls()`: Counter of spin iterations during backpressure. + +The injector uses a round-robin slot selection with atomic CAS for thread safety. + +The `PipelineStageConfig` allows configuring `num_workers`, `num_slots`, `slot_size`, and optional `CorePinning` for dispatcher, consumer, and worker threads. + +--- + +## 9. Step-by-Step Data Flow Trace + +1. **Producer** writes uint8 measurements into `payload_buf` from Stim test data. +2. **Producer** calls `injector.submit(fid, payload, size, request_id)`. +3. **RingBufferInjector** writes RPC header (`RPCHeader`: magic, function_id, arg_len, request_id, ptp_timestamp = 24 bytes) + payload into `rx_data[slot]`. +4. **RingBufferInjector** sets `rx_flags[slot] = host_ptr` (release). +5. **Host Dispatcher** reads `rx_flags[slot]`, sees data. +6. **Host Dispatcher** parses RPC header, looks up function in the function table. +7. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. +8. **Host Dispatcher** marks bit 2 busy in `idle_mask`. +9. **Host Dispatcher** saves `inflight_slot_tags[2] = slot`. +10. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. +11. **Host Dispatcher** calls `pre_launch_fn`: writes `h_ring_ptrs[0] = dev_ptr`, issues `cudaMemcpyAsync(d_trt_input, dev_ptr + 24, input_size, D2D, stream[2])`. +12. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. +13. **Host Dispatcher** sets `tx_flags[slot] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[slot] = 0` and advances to next slot. +14. **GPU DMA engine** copies input payload from ring buffer to TRT input buffer. +15. **GPU** executes TRT inference (or passthrough copy in SKIP_TRT mode). +16. **GPU DMA engine** copies TRT output to host-mapped `h_predecoder_outputs_`. +17. 
**GPU signal kernel** sets `ready_flags[0] = 1` (system-scope atomic release). +18. **Predecoder Worker** CAS(1, 2) on `ready_flags[0]`, wins, reads inference output. +19. **Predecoder Worker** copies output to `deferred_outputs[origin_slot]`. +20. **Predecoder Worker** computes syndrome density metrics. +21. **Predecoder Worker** calls `release_job(0)` → `ready_flags[0].store(0, release)`. +22. **Predecoder Worker** extracts `request_id` from RPC header, enqueues `PyMatchJob`. +23. **Predecoder Worker** returns `DEFERRED_COMPLETION`. +24. **Pipeline** restores bit 2 in `idle_mask` (worker free for next dispatch). Does NOT touch `tx_flags`. +25. **PyMatching Worker** pops `PyMatchJob` from queue, acquires per-thread decoder. +26. **PyMatching Worker** runs PyMatching MWPM decode over full parity check matrix. +27. **PyMatching Worker** writes `RPCResponse + DecodeResponse` into ring buffer slot. +28. **PyMatching Worker** calls `pipeline.complete_deferred(slot)` → `tx_flags[slot].store(host_addr, release)`. +29. **Consumer** scans all slots, sees `tx_flags[slot] != 0` and `!= 0xEEEE`, harvests. +30. **Consumer** calls `completion_handler(request_id, slot, success)`. +31. **Consumer** sets `slot_occupied[slot] = 0`, `__sync_synchronize()`, then clears `tx_flags[slot] = 0`. Producer may now reuse slot. + +--- + +## 10. RPC Protocol & Ring Buffer + +### 10.1 RPC Header + +```cpp +struct RPCHeader { + uint32_t magic; // RPC_MAGIC_REQUEST + uint32_t function_id; // FNV-1a hash of function name + uint32_t arg_len; // payload length in bytes + uint32_t request_id; // unique request identifier + uint64_t ptp_timestamp; // PTP timestamp (optional) +}; +// sizeof(RPCHeader) == 24 +#define CUDAQ_RPC_HEADER_SIZE 24u +``` + +### 10.2 IN_FLIGHT Sentinel + +Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot]` immediately after launch. 
Without a hold, the **producer** (FPGA sim or test) would see `rx_flags[slot]==0` and `tx_flags[slot]==0` (response not written yet) and reuse the slot, overwriting data while the GPU is still reading. + +**Fix: IN_FLIGHT tag** + +1. **Dispatcher**: On successful launch, write `tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release)` **before** clearing `rx_flags[current_slot]`. On launch failure, write the 0xDEAD|err value and restore the worker bit; do not write 0xEEEE. Setting `tx_data_host = nullptr` and `tx_data_dev = nullptr` in the config forces the dispatcher to use the `0xEEEE` sentinel rather than a real data address. +2. **Producer**: Reuse a slot only when **both** `rx_flags[slot]==0` **and** `tx_flags[slot]==0`. Thus the producer blocks until the consumer has harvested (tx cleared). +3. **Consumer**: When harvesting, treat only real responses: `tx_flags[slot] != 0` **and** `tx_flags[slot] != 0xEEEEEEEEEEEEEEEEULL`. Ignore 0xEEEE (in-flight). On harvest, clear `tx_flags[slot] = 0`. + +**Slot lifecycle**: Idle (rx=0, tx=0) -> Written (rx=ptr, tx=0) -> In-flight (rx=0, tx=0xEEEE) -> Completed (rx=0, tx=response) -> Consumer harvests, tx=0 -> Idle. + +--- + +## 11. Dynamic Batch Handling for ONNX Models + +When building a TensorRT engine from an ONNX model with dynamic batch dimensions (dim 0 <= 0), `ai_decoder_service.cu` automatically creates an optimization profile that pins all dynamic dimensions to 1. This enables building engines from models like `predecoder_memory_d13_T13_X.onnx` which use a symbolic `batch` dimension. + +--- + +## 12. Test Suite + +A GTest-based test suite (`libs/qec/unittests/test_realtime_pipeline.cu`) validates the pipeline using `SKIP_TRT` passthrough mode (no TensorRT dependency at runtime). The tests are organized into three categories: + +### 12.1 Unit Tests (8 tests) +- **AIDecoderService**: Verify SKIP_TRT buffer sizes (1600 floats = 6400 bytes), allocation, and graph capture. 
+- **AIPreDecoderService**: Verify mapped pinned memory allocation, `poll_next_job` / `release_job` state machine, and host-launchable graph. + +### 12.2 Correctness Tests (5 tests) +Data-integrity tests that verify known payloads survive the full CUDA graph round-trip bitwise-identical (memcmp, not epsilon): +- **Zeros, Known Pattern, Random Data, Extreme Float Values**: Single-request verification with different payload patterns (including `FLT_MAX`, `NaN`, `INFINITY`). +- **Multiple Requests (5,000 iterations)**: Pushes 5,000 random 6.4 KB payloads through the pipeline and verifies bitwise identity on every one. Confirms no cross-contamination or data corruption over sustained use. + +### 12.3 Integration Tests (8 tests) +- **Dispatcher lifecycle**: Shutdown semantics, stats counter accuracy, invalid RPC magic rejection, slot wraparound. +- **Single Request Round-Trip**: Full dispatcher -> graph -> poll -> verify data path. +- **Multi-Predecoder Concurrency**: 4 predecoders on 4 streams, simultaneous dispatch, per-predecoder data verification. +- **Sustained Throughput (200 requests)**: Regression test for the 128-launch-limit fix. Proves indefinite stability of the host-side dispatcher. + +### 12.4 End-to-End Benchmark (test_realtime_predecoder_w_pymatching) +- Configurable surface code distance and round count: d7, d13, d13_r104, d21, d31. +- Loads Stim-generated test data (detectors, observables, parity check matrix, priors). +- Streams syndrome data at configurable rate with correctness verification (LER). +- Reports latency percentiles, throughput, backpressure stalls, syndrome density reduction. + +--- + +## 13. Shutdown and Grace Period + +- **Grace period**: After the producer stops submitting, the pipeline waits up to 5 seconds for `total_completed >= total_submitted`. +- **Consumer exit**: The consumer thread normally exits when `producer_stop && total_completed >= total_submitted`. 
To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. +- **Dispatcher shutdown**: Set `shutdown_flag = 1` after the consumer exits, then join the dispatcher thread. The dispatcher synchronizes all worker streams before returning. +- **PyMatching thread pool**: Call `pymatch_queue.shutdown()` to unblock all waiting threads, then join all PyMatching worker threads. + +--- + +## 14. Performance Results (d=13, T=104, 104 µs rate, 20s) + +### 14.1 Decoupled Architecture (current) + +Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T104_X.onnx` (FP16), 8 predecoders, 16 PyMatching workers, 16 slots: + +| Metric | Value | +| :--- | :--- | +| Throughput | 9,610 req/s | +| Submitted / Completed | 192,309 / 192,309 | +| Backpressure stalls | 6,193,418 | +| p50 latency | 352.3 µs | +| Mean latency | 393.4 µs | +| p90 latency | 514.7 µs | +| p95 latency | 606.3 µs | +| p99 latency | 1,248.8 µs | +| Max latency | 3,930.0 µs | +| PyMatching decode (avg) | 224.4 µs | +| Syndrome density reduction | 98.3% | +| Pipeline LER | 0.0020 | + +### 14.2 Raw TRT Baseline (trtexec) + +| Mode | GPU Compute | Total Host Latency | +| :--- | :--- | :--- | +| Default | 107 µs | 119 µs | +| CUDA Graph + SpinWait | 90 µs | 99 µs | +| CUDA Graph + SpinWait + No Transfers | 88 µs | 88 µs | + +--- + +## 15. LLM Implementation Directives (Constraints Checklist) + +When generating code from this specification, the LLM **MUST** strictly adhere to the following constraints: + +- [ ] **NO CUDA STREAM QUERYING**: Do not use `cudaStreamQuery()` for backpressure or completion checking. It incurs severe driver latency. Rely strictly on `idle_mask` and `ready_flags`. +- [ ] **NO WEAK ORDERING BUGS**: Do not use `volatile`. Do not use `__threadfence_system()`. 
You must use `cuda::std::atomic` (or `cuda::atomic` with `thread_scope_system`) for all cross-device synchronization. +- [ ] **NO HEAD OF LINE BLOCKING**: The host dispatcher MUST NOT statically map slots to predecoders. It must dynamically allocate via `idle_mask`. The consumer MUST harvest out-of-order by scanning all active slots. +- [ ] **NO DATA LOSS**: If `idle_mask == 0` (all workers busy), the dispatcher MUST spin on the current slot (`QEC_CPU_RELAX()`). It MUST NOT advance `current_slot` until a worker is allocated and the graph is launched. +- [ ] **NO RACE CONDITIONS ON TAGS**: `inflight_slot_tags` does not need to be atomic because index `[worker_id]` is exclusively owned by the active flow once the dispatcher clears the bit in `idle_mask`, until the worker thread restores the bit. +- [ ] **READY FLAG CLAIMING**: The CPU poller MUST claim each completion exactly once using compare_exchange_strong(1, 2) on the ready flag; use relaxed memory order on CAS failure. The worker MUST clear the flag (store 0) in `release_job`. +- [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. Set `tx_data_host = nullptr` and `tx_data_dev = nullptr` to force the 0xEEEE path. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). +- [ ] **CONSUMER MEMORY ORDERING**: The consumer MUST set `slot_occupied[s] = 0` BEFORE calling `cudaq_host_ringbuffer_clear_slot`, with a `__sync_synchronize()` fence between them, to prevent the producer-consumer race on ARM. +- [ ] **DMA DATA MOVEMENT**: Use `cudaMemcpyAsync` (DMA engine) for data copies. Input copy is issued via `pre_launch_fn` callback before graph launch at offset `CUDAQ_RPC_HEADER_SIZE` (24 bytes). Output copy is captured inside the graph. Do not use SM-based byte-copy kernels for fixed-address transfers. 
- [ ] **NO INPUT KERNEL IN GRAPH**: The captured CUDA graph must NOT contain an input-copy kernel. All input data movement is handled by the `pre_launch_fn` DMA callback issued on the worker stream before `cudaGraphLaunch`. +- [ ] **DEFERRED COMPLETION**: When the CPU stage returns `DEFERRED_COMPLETION`, the pipeline MUST release `idle_mask` but MUST NOT write `tx_flags`. The external caller MUST call `complete_deferred(slot)` to signal completion. +- [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. Shut down the PyMatching queue before stopping the pipeline. diff --git a/docs/hybrid_ai_predecoder_pipeline.md b/docs/hybrid_ai_predecoder_pipeline.md new file mode 100644 index 00000000..dbafa482 --- /dev/null +++ b/docs/hybrid_ai_predecoder_pipeline.md @@ -0,0 +1,785 @@ +# Hybrid AI Predecoder + PyMatching Global Decoder Pipeline + +## Design Document + +**Component**: `cudaq-qec` Realtime Decoding Subsystem +**Status**: Implementation Complete (Test-Validated) +**Last Updated**: 2026-03-17 + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Problem Statement](#2-problem-statement) +3. [Architecture](#3-architecture) +4. [Component Deep-Dive](#4-component-deep-dive) + - 4.1 [Ring Buffer & RPC Protocol](#41-ring-buffer--rpc-protocol) + - 4.2 [Host-Side Dispatcher](#42-host-side-dispatcher) + - 4.3 [AIDecoderService (Base Class)](#43-aidecoderservice-base-class) + - 4.4 [AIPreDecoderService (Predecoder + CPU Handoff)](#44-aipredecoderservice-predecoder--cpu-handoff) + - 4.5 [Decoupled CPU Worker Architecture](#45-decoupled-cpu-worker-architecture) +5. [Data Flow](#5-data-flow) +6. [Memory Architecture](#6-memory-architecture) +7. [Backpressure Protocol](#7-backpressure-protocol) +8. [Memory Ordering & Synchronization](#8-memory-ordering--synchronization) +9. 
[CUDA Graph Structure](#9-cuda-graph-structure) +10. [Pipeline Configurations](#10-pipeline-configurations) +11. [File Inventory](#11-file-inventory) +12. [Configuration Parameters](#12-configuration-parameters) +13. [Performance Benchmarking](#13-performance-benchmarking) +14. [Portability](#14-portability) +15. [Limitations & Future Work](#15-limitations--future-work) + +--- + +## 1. Overview + +This system implements a **realtime hybrid GPU/CPU pipeline** for quantum error correction (QEC) decoding on the surface code. The pipeline splits the decoding workload into two stages: + +| Stage | Location | Algorithm | Data Type | +|-------|----------|-----------|-----------| +| **Predecoding** | GPU | Neural network (TensorRT, from ONNX) | uint8 | +| **Global Decoding** | CPU | PyMatching (MWPM) | float64 | + +A **host-side spin-polling dispatcher** monitors a shared ring buffer for incoming syndrome data. When data arrives, the dispatcher finds a free GPU worker from a dynamic bitmask pool (`idle_mask`), copies the input via DMA, and launches a pre-captured CUDA graph containing a TensorRT inference pass. The neural network accepts raw detector values as uint8 tensors and produces residual detectors and a logical frame prediction. The residual detectors are handed off to a **decoupled PyMatching thread pool** via a lock-free queue, where per-thread MWPM decoders compute corrections. Results are written back to the ring buffer and acknowledged. + +The system supports multiple surface code distances and round counts via a configurable `PipelineConfig` struct: d=7/T=7, d=13/T=13, d=13/T=104, d=21/T=21, and d=31/T=31. ONNX models are compiled to TensorRT engines on first use and cached to disk as `.engine` files for fast reloading on subsequent runs. + +--- + +## 2. Problem Statement + +Surface code QEC requires decoding syndrome measurements within the coherence time of the quantum system. A pure CPU decoder cannot meet this budget at scale. 
A pure GPU decoder lacks the flexibility to run algorithms like Minimum Weight Perfect Matching (MWPM) efficiently on GPU. + +The hybrid approach exploits the strengths of each: + +- **GPU**: Massively parallel neural network inference provides fast soft-decision outputs (residual detectors) that reduce the problem complexity for the global decoder. The predecoder achieves **98.3% syndrome density reduction** for d=13/T=104. +- **CPU**: PyMatching solves the residual MWPM problem on the simplified output from the predecoder. + +The critical constraint is **zero-copy, zero-allocation** on the hot path. Every buffer is pre-allocated, every kernel is pre-captured into a CUDA Graph, and every transfer uses mapped pinned memory or DMA. + +--- + +## 3. Architecture + +### System Diagram + +``` + Test Harness (or FPGA DMA) + │ + │ syndrome data (uint8 detectors) + ▼ + ┌─────────────────────────────────────────────────────┐ + │ Ring Buffer (Mapped Pinned Memory) │ + │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ + │ │Slot 0│ │Slot 1│ │Slot 2│ ... │Slot15│ │ + │ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ │ + │ │ │ │ │ │ + │ rx_flags[0] rx_flags[1] ... rx_flags[15] │ + └─────┼────────┼────────┼───────────────┼────────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + ┌─────────────────────────────────────────────────────┐ + │ Host-Side Dispatcher Thread │ + │ │ + │ Polls rx_flags[] ──► Finds free worker (idle_mask)│ + │ ──► DMA copy (pre_launch_fn) ──► cudaGraphLaunch │ + └──────────┬──────────┬──────────┬──────────┬─────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ PreDecoder 0 │ │PreDec. 1 │ │ ... │ │PreDec. 7 │ + │ (CUDA Graph) │ │(CUDAGraph│ │ │ │(CUDAGraph│ + │ │ │ │ │ │ │ │ + │ TRT Infer │ │ ... │ │ ... │ │ ... 
│ + │ DMA Output │ │ │ │ │ │ │ + │ Signal Kern │ │ │ │ │ │ │ + └──────┬───────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ │ + │ (mapped pinned memory: ready_flags, h_predecoder_outputs_) + ▼ ▼ ▼ ▼ + ┌─────────────────────────────────────────────────────┐ + │ Predecoder Workers (1:1 with GPU streams) │ + │ CAS(1,2) on ready_flags → copy output → enqueue │ + │ Release predecoder → return DEFERRED_COMPLETION │ + └──────────┬──────────────────────────────────────────┘ + │ PyMatchQueue (mutex + condvar) + ▼ + ┌──────────────┐ ┌──────────┐ ┌──────────────┐ + │ PyMatch 0 │ │PyMatch 1 │ ... │ PyMatch 15 │ + │ (thread pool)│ │(thd pool)│ │ (thread pool) │ + │ │ │ │ │ │ + │ PyMatching │ │PyMatch │ │ PyMatching │ + │ (own decoder)│ │(own dec) │ │ (own decoder) │ + │ Write RPC │ │Write RPC │ │ Write RPC │ + │ complete_ │ │complete_ │ │ complete_ │ + │ deferred() │ deferred() │ │ deferred() │ + └──────┬───────┘ └────┬─────┘ └────┬──────────┘ + │ │ │ + └──────────────┼──────────────────┘ + ▼ + ┌─────────────────────────────────────────────────────┐ + │ Consumer Thread │ + │ Scans tx_flags[] ──► completion_handler ──► clear │ + └─────────────────────────────────────────────────────┘ + tx_flags[slot] ──► Producer can reuse slot +``` + +### Key Design Decisions + +1. **Host-side dispatcher with dynamic worker pool** -- The dispatcher runs as a dedicated CPU thread, polling `rx_flags` and dynamically allocating GPU workers via an atomic `idle_mask` bitmask. This replaced a device-side persistent kernel that hit a CUDA 128-launch limit. + +2. **CUDA Graphs for inference** -- Each predecoder instance has a pre-captured CUDA graph containing TRT inference, output DMA copy, and a signal kernel. Input data is injected via a `pre_launch_fn` DMA callback before graph launch (since the source address is dynamic). + +3. **Mapped pinned memory for GPU→CPU handoff** -- `cudaHostAllocMapped` provides a single address space visible to both CPU and GPU without explicit copies. 
GPU writes are made visible via libcu++ system-scope atomics with release semantics; CPU reads use acquire semantics. + +4. **Queue depth 1 per predecoder** -- Each `AIPreDecoderService` has a single in-flight inference slot. Deeper queues were found to add complexity without measurable throughput benefit, since 8 parallel streams already exceed the GPU's throughput capacity. + +5. **Decoupled predecoder and PyMatching workers** -- GPU polling threads release the predecoder stream immediately after copying output (~10 µs), then hand off to a separate PyMatching thread pool via `PyMatchQueue`. This prevents slow CPU decodes (~224 µs) from blocking GPU dispatch. + +6. **ONNX model support with engine caching** -- The `AIDecoderService` accepts either a pre-built `.engine` file or an `.onnx` model. When given an ONNX file, it builds a TensorRT engine at runtime and optionally saves it to disk via the `engine_save_path` parameter. + +7. **Per-worker PyMatching decoder pool** -- Each PyMatching thread gets its own pre-allocated decoder instance via `thread_local` assignment. This eliminates mutex contention on the decode path. + +8. **Type-agnostic I/O buffers** -- All TRT I/O buffers use `void*` rather than `float*`, supporting uint8 and INT32 models natively without type casting. + +9. **Stim-derived parity check matrix** -- The PyMatching decoders are initialized from a full parity check matrix (`H`) and observable matrix (`O`) exported from Stim, rather than the `cudaq-qec` surface code's per-slice `H_z`. This enables full-H decoding with proper edge weighting via priors. + +--- + +## 4. Component Deep-Dive + +### 4.1 Ring Buffer & RPC Protocol + +**Files**: `dispatch_kernel_launch.h` (protocol), `cudaq_realtime.h` (C API), `realtime_pipeline.cu` (RingBufferManager) + +The ring buffer is the communication channel between the producer (FPGA or test harness) and the GPU. 
It consists of: + +| Buffer | Type | Size | Purpose | +|--------|------|------|---------| +| `rx_flags[N]` | `cuda::atomic` | N slots | Non-zero = data ready; value is pointer to slot data | +| `tx_flags[N]` | `cuda::atomic` | N slots | Non-zero = response ready; acknowledges to consumer | +| `rx_data` | `uint8_t*` | N x SLOT_SIZE | Slot payload area (mapped pinned) | + +Each slot carries an **RPC message** in a packed wire format: + +``` +Request: [RPCHeader: magic(4) | function_id(4) | arg_len(4) | request_id(4) | ptp_timestamp(8)] + [payload: arg_len bytes] + Total header: 24 bytes (CUDAQ_RPC_HEADER_SIZE) + +Response: [RPCResponse: magic(4) | status(4) | result_len(4)] + [payload: result_len bytes] +``` + +The `function_id` is an FNV-1a hash of the target function name, enabling the dispatcher to route requests to different predecoder instances. + +The response payload for the PyMatching pipeline is a packed `DecodeResponse`: + +```c +struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; +}; +``` + +### 4.2 Host-Side Dispatcher + +**File**: `realtime/lib/daemon/dispatcher/host_dispatcher.cu` + +The dispatcher is a **spin-polling host thread** running on a dedicated CPU core. It monitors the ring buffer's `rx_flags` and dispatches work to GPU streams. + +#### Worker Pool + +The dispatcher manages a pool of `num_workers` GPU streams. 
Each worker is described by a `cudaq_host_dispatch_worker_t`: + +```c +typedef struct { + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t function_id; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* pre_launch_data; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* post_launch_data; +} cudaq_host_dispatch_worker_t; +``` + +#### Dispatch Loop + +``` +while (!shutdown): + rx_value = rx_flags[current_slot].load(acquire) + if rx_value == 0: QEC_CPU_RELAX(); continue + + // Find free worker via idle_mask bitmask + worker_id = ffsll(idle_mask.load(acquire)) - 1 + if worker_id < 0: QEC_CPU_RELAX(); continue + + // Claim worker, tag origin slot + idle_mask.fetch_and(~(1ULL << worker_id), release) + inflight_slot_tags[worker_id] = current_slot + + // Pre-launch: DMA input to TRT buffer + if pre_launch_fn: pre_launch_fn(data, dev_ptr, stream) + + // Launch CUDA graph + cudaGraphLaunch(graph_exec, stream) + + // Mark in-flight, consume slot + tx_flags[current_slot].store(0xEEEE..., release) + rx_flags[current_slot].store(0, release) + + // Post-launch callback (GPU-only mode) + if post_launch_fn: post_launch_fn(...) + + current_slot = (current_slot + 1) % num_slots +``` + +### 4.3 AIDecoderService (Base Class) + +**Files**: `ai_decoder_service.h`, `ai_decoder_service.cu` + +The base class manages the TensorRT lifecycle. + +#### Constructor + +```cpp +AIDecoderService(const std::string& model_path, void** device_mailbox_slot, + const std::string& engine_save_path = ""); +``` + +The constructor accepts either a `.engine` file (fast deserialization) or an `.onnx` file (builds TRT engine via autotuner). When `engine_save_path` is non-empty and the model is ONNX, the built engine is serialized to disk for caching. + +#### Responsibilities + +- **Engine loading**: Deserializes a TensorRT `.engine` file or builds from `.onnx` via `NvOnnxParser`. 
+- **Engine caching**: Saves built engines to disk via `engine_save_path` for fast reload. +- **Dynamic tensor binding**: Enumerates all I/O tensors from the engine, storing metadata in `TensorBinding` structs. Supports models with multiple outputs. +- **Buffer allocation**: Allocates persistent device buffers sized to the engine's static tensor shapes. Uses `void*` for type-agnostic I/O. +- **Dynamic batch handling**: Automatically pins dynamic dimensions to 1 via optimization profiles. + +#### Dynamic Tensor Binding + +```cpp +struct TensorBinding { + std::string name; + void* d_buffer = nullptr; + size_t size_bytes = 0; + bool is_input = false; +}; +std::vector<TensorBinding> all_bindings_; +``` + +### 4.4 AIPreDecoderService (Predecoder + CPU Handoff) + +**Files**: `ai_predecoder_service.h`, `ai_predecoder_service.cu` + +This derived class replaces the base class's autonomous graph with one that hands inference results off to the CPU. + +#### Constructor + +```cpp +AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, + int queue_depth = 1, const std::string& engine_save_path = ""); +``` + +#### CUDA Graph Structure + +``` +[Pre-launch DMA: ring buffer → d_trt_input (host-side callback)] + ↓ +TRT enqueueV3 (AI predecoder inference) + ↓ +cudaMemcpyAsync D2D (d_trt_output_ → h_predecoder_outputs_) + ↓ +predecoder_signal_ready_kernel (ready_flags.store(1, release)) +``` + +The input DMA copy is NOT in the graph — it's issued by the `pre_launch_fn` callback on the worker stream before `cudaGraphLaunch`, because the source address (ring buffer slot) changes each invocation. 
+ +#### Per-Predecoder Buffers (queue_depth=1) + +| Buffer | Host Pointer | Device Pointer | Purpose | +|--------|-------------|---------------|---------| +| `h_ready_flags_` | CPU reads/writes | `d_ready_flags_` GPU writes | 1 = job ready, 0 = slot free | +| `h_ring_ptrs_` | CPU reads | `d_ring_ptrs_` GPU writes | Original ring buffer address per job | +| `h_predecoder_outputs_` | CPU reads | `d_predecoder_outputs_` GPU writes | TRT inference output (`void*`, uint8) | + +All buffers are allocated with `cudaHostAllocMapped` and mapped to device pointers via `cudaHostGetDevicePointer`. + +#### CPU Interface + +```cpp +bool poll_next_job(PreDecoderJob& out_job); +void release_job(int slot_idx); +``` + +`poll_next_job` performs CAS(expected=1, desired=2) on `ready_flags[0]`. If successful, it populates the `PreDecoderJob` struct with the slot index, ring buffer pointer, and inference output pointer. + +`release_job` stores 0 to the ready flag with release semantics, allowing the GPU to reuse the slot. + +### 4.5 Decoupled CPU Worker Architecture + +**File**: `test_realtime_predecoder_w_pymatching.cpp` + +The CPU-side processing uses a **two-tier decoupled architecture**: + +#### Tier 1: Predecoder Workers (GPU Polling) + +Pipeline worker threads (1:1 with GPU streams) run in the `RealtimePipeline::worker_loop`. Each iteration: + +1. Polls `poll_next_job()` (CAS on ready_flags). +2. Copies inference output to `deferred_outputs[origin_slot]` (per-slot buffer). +3. Computes syndrome density metrics. +4. Releases predecoder via `release_job(0)`. +5. Enqueues `PyMatchJob{origin_slot, request_id, ring_buffer_ptr}` to `PyMatchQueue`. +6. Returns `DEFERRED_COMPLETION` → pipeline releases `idle_mask`, skips `tx_flags`. + +**Hold time**: ~10 µs (copy + release + enqueue). + +#### Tier 2: PyMatching Workers (CPU Decode) + +A separate thread pool (16 workers for d13_r104) processes `PyMatchJob`s: + +1. Pops job from `PyMatchQueue` (blocks if empty). +2. 
Acquires per-thread PyMatching decoder via `thread_local` lock-free assignment. +3. Runs PyMatching MWPM decode over the full parity check matrix. +4. Writes `RPCResponse + DecodeResponse` into the ring buffer slot. +5. Calls `pipeline.complete_deferred(origin_slot)` → stores host address into `tx_flags`. + +**Decode time**: ~224 µs average. + +#### PyMatching Decoder Pool + +```cpp +struct DecoderContext { + std::vector> decoders; + std::atomic next_decoder_idx{0}; + + cudaq::qec::decoder* acquire_decoder() { + thread_local int my_idx = next_decoder_idx.fetch_add(1); + return decoders[my_idx % decoders.size()].get(); + } +}; +``` + +Decoders are constructed at startup from the Stim-derived parity check matrix (`H`) with edge priors: + +```cpp +auto H_full = stim_data.H.to_dense(); +pm_params.insert("error_rate_vec", stim_data.priors); +for (int i = 0; i < num_decode_workers; ++i) + decoders.push_back(cudaq::qec::decoder::get("pymatching", H_full, pm_params)); +``` + +#### Observable Projection + +When the observable matrix (`O`) is available, corrections are projected onto the logical observable: + +```cpp +int obs_parity = 0; +for (size_t e = 0; e < result.result.size(); ++e) + if (result.result[e] > 0.5 && obs_row[e]) + obs_parity ^= 1; +total_corrections += obs_parity; +``` + +The total corrections include both the predecoder's logical prediction (`output[0]`) and PyMatching's correction parity. + +--- + +## 5. Data Flow + +The following traces a single syndrome packet through the entire pipeline: + +``` +Step Location Action +──── ──────── ────────────────────────────────────────────────── + 1. Producer Writes RPCHeader (24 bytes) + uint8 detectors into rx_data[slot] + 2. Injector Sets rx_flags[slot] = host_ptr (release) + ── release fence ── + 3. Dispatcher Reads rx_flags[slot] (acquire), sees data + 4. Dispatcher Parses RPCHeader, extracts function_id + 5. Dispatcher Scans idle_mask via ffsll → finds free worker W + 6. 
Dispatcher Marks bit W busy, saves inflight_slot_tags[W] = slot + 7. Dispatcher Writes dev_ptr to h_mailbox_bank[W], __sync_synchronize() + 8. Dispatcher pre_launch_fn: h_ring_ptrs[0] = dev_ptr, + cudaMemcpyAsync(d_trt_input, dev_ptr+24, input_size, D2D, stream[W]) + 9. Dispatcher cudaGraphLaunch(graph_exec[W], stream[W]) +10. Dispatcher tx_flags[slot].store(0xEEEE..., release) [IN_FLIGHT] +11. Dispatcher rx_flags[slot].store(0, release), advance slot + ── slot consumed ── + + ── Inside CUDA Graph ── +12. GPU TRT enqueueV3: AI predecoder inference (uint8 → uint8) +13. GPU cudaMemcpyAsync D2D: d_trt_output_ → h_predecoder_outputs_ +14. GPU predecoder_signal_ready_kernel: ready_flags.store(1, release) + ── Graph complete ── + +15. PreDec Worker CAS(1, 2) on ready_flags[0] (acquire), wins +16. PreDec Worker Copies h_predecoder_outputs_ → deferred_outputs[slot] +17. PreDec Worker Computes syndrome density (input vs output nonzero counts) +18. PreDec Worker release_job(0): ready_flags.store(0, release) +19. PreDec Worker Extracts request_id from RPCHeader +20. PreDec Worker Enqueues PyMatchJob{slot, request_id, ring_buffer_ptr} +21. PreDec Worker Returns DEFERRED_COMPLETION +22. Pipeline idle_mask.fetch_or(1<>> │ + │ ready_flags.store(1, release) │ + └──────────────────────────────────────────────────────┘ +``` + +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. No device-side graph launch is used. + +--- + +## 10. Pipeline Configurations + +The test supports multiple surface code distances via the `PipelineConfig` struct. 
Model dimensions are derived automatically from TRT engine bindings: + +| Config | Distance | Rounds | ONNX Model | Input (uint8) | Output (uint8) | Predecoders | PyMatch Workers | Slot Size | +|--------|----------|--------|------------|--------------|----------------|-------------|-----------------|-----------| +| `d7_r7` | 7 | 7 | `model1_d7_r7_unified_Z_batch1.onnx` | 504 | 505 | 16 | 32 | 1,024 | +| `d13_r13` | 13 | 13 | `predecoder_memory_d13_T13_X.onnx` | 3,276 | 3,277 | 16 | 32 | 4,096 | +| `d13_r104` | 13 | 104 | `predecoder_memory_d13_T104_X.onnx` | 17,472 | 17,473 | 8 | 16 | 32,768 | +| `d21_r21` | 21 | 21 | `model1_d21_r21_unified_X_batch1.onnx` | 13,860 | 13,861 | 16 | 32 | 16,384 | +| `d31_r31` | 31 | 31 | `model1_d31_r31_unified_Z_batch1.onnx` | 44,640 | 44,641 | 16 | 32 | 65,536 | + +All models use **uint8** tensors for both input (detectors) and output (logical prediction + residual detectors). + +The `slot_size` is computed as `round_up_pow2(CUDAQ_RPC_HEADER_SIZE + model_input_bytes)`. + +Usage: + +```bash +./test_realtime_predecoder_w_pymatching d7 # default +./test_realtime_predecoder_w_pymatching d13 +./test_realtime_predecoder_w_pymatching d13_r104 104 20 # 104 µs rate, 20 sec +./test_realtime_predecoder_w_pymatching d21 +./test_realtime_predecoder_w_pymatching d31 +``` + +Optional flags: +- `--data-dir /path/to/stim/data`: Load real test data for correctness verification. + +### Engine Caching + +On first run with a given configuration, the ONNX model is compiled to a TensorRT engine and saved alongside the ONNX file (e.g., `predecoder_memory_d13_T104_X.engine`). Subsequent runs detect the cached engine and skip the build phase. + +--- + +## 11. 
File Inventory + +| File | Layer | Purpose | +|------|-------|---------| +| `realtime/include/.../cudaq_realtime.h` | API | C API header: structs, enums, ring buffer helpers, `CUDAQ_RPC_HEADER_SIZE` | +| `realtime/include/.../dispatch_kernel_launch.h` | API | RPC protocol structs (RPCHeader, RPCResponse), FNV-1a hash | +| `realtime/include/.../host_dispatcher.h` | API | Host dispatcher C API: `cudaq_host_dispatcher_config_t`, `cudaq_host_dispatch_worker_t` | +| `realtime/lib/.../host_dispatcher.cu` | Runtime | Host-side dispatcher loop implementation | +| `realtime/lib/.../cudaq_realtime_api.cpp` | Runtime | Ring buffer C API implementation | +| `libs/qec/include/.../pipeline.h` | Pipeline | `RealtimePipeline`, `RingBufferInjector`, callbacks, `DEFERRED_COMPLETION` | +| `libs/qec/lib/.../realtime_pipeline.cu` | Pipeline | Pipeline implementation: `RingBufferManager`, worker/consumer loops, injector | +| `libs/qec/include/.../ai_decoder_service.h` | QEC | Base class header: TRT lifecycle, dynamic tensor bindings, engine caching | +| `libs/qec/lib/.../ai_decoder_service.cu` | QEC | Base class impl: ONNX build, engine save/load, graph capture | +| `libs/qec/include/.../ai_predecoder_service.h` | QEC | Derived class header: CPU handoff, `poll_next_job`/`release_job` | +| `libs/qec/lib/.../ai_predecoder_service.cu` | QEC | Derived class impl: signal kernel, output DMA, graph capture | +| `libs/qec/include/.../nvtx_helpers.h` | Util | NVTX profiling macros (`NVTX_PUSH`, `NVTX_POP`) | +| `libs/qec/lib/.../test_realtime_predecoder_w_pymatching.cpp` | Test | End-to-end benchmark with real ONNX + PyMatching + correctness verification | +| `libs/qec/unittests/test_realtime_pipeline.cu` | Test | GTest unit/integration tests (21 tests, SKIP_TRT mode) | + +--- + +## 12. 
Configuration Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `NUM_SLOTS` | 16 | Ring buffer slot count | +| `slot_size` | Per-config (1024 - 65536) | Max payload per slot (derived from model input size) | +| `num_predecoders` | 8 (d13_r104) | Parallel predecoder instances = pipeline worker threads | +| `queue_depth` | 1 | Single in-flight inference per predecoder | +| `num_decode_workers` | 16 (d13_r104) | PyMatching thread pool size | +| `rate_us` | 104 | Inter-arrival time in microseconds | +| `duration_s` | 20 | Test duration in seconds | +| `warmup_count` | 20 | Requests excluded from latency stats | +| `max_requests` | 500,000 | Maximum requests per run | + +### Capacity Analysis + +- **Ring buffer**: 16 slots, each up to 32 KB for d13_r104. +- **GPU throughput**: 8 parallel streams × ~88 µs compute = ~90k req/s theoretical (far exceeds demand). +- **CPU throughput**: 16 PyMatching workers × ~224 µs decode = ~71k req/s theoretical. +- **Bottleneck**: PyMatching at 224 µs average, but 16 workers provide sufficient aggregate throughput for the 9.6k req/s demand at 104 µs inter-arrival. +- **Backpressure**: ~6.2M stalls over 20 s (noise floor of sub-microsecond spins when next round-robin slot is briefly busy). + +--- + +## 13. Performance Benchmarking + +### Pipeline Results (d=13, T=104, 104 µs rate, 20s) + +Configuration: 16 slots, 8 predecoders, 16 PyMatching workers, Stim test data. 
+ +| Metric | Value | +|--------|-------| +| Throughput | 9,610 req/s | +| Submitted / Completed | 192,309 / 192,309 | +| Backpressure stalls | 6,193,418 | +| p50 latency | 352.3 µs | +| Mean latency | 393.4 µs | +| p90 latency | 514.7 µs | +| p95 latency | 606.3 µs | +| p99 latency | 1,248.8 µs | +| Max latency | 3,930.0 µs | +| stddev | 179.0 µs | + +### Worker Timing Breakdown + +| Component | Avg Time | +|-----------|----------| +| PyMatching decode | 224.4 µs | +| Total worker (PyMatch thread) | 224.5 µs | +| Worker overhead | 0.1 µs | + +### Syndrome Density + +| | Avg nonzero / total | Density | +|-|---------------------|---------| +| Input detectors | 932.7 / 17,472 | 5.34% | +| Output residual detectors | 16.1 / 17,472 | 0.09% | +| **Reduction** | | **98.3%** | + +### Logical Error Rate + +| Mode | LER | Mismatches | +|------|-----|------------| +| Full pipeline (predecoder + PyMatching) | **0.0020** | 384 / 192,309 | +| Predecoder only (output[0]) | 0.3980 | 76,537 / 192,309 | + +### Raw TRT Baseline (trtexec) + +| Mode | GPU Compute | Total Host Latency | +|------|-------------|-------------------| +| Default (single stream) | 107 µs | 119 µs | +| CUDA Graph + SpinWait | 90 µs | 99 µs | +| CUDA Graph + SpinWait + No Transfers | 88 µs | 88 µs | + +### NVTX Profiling (per-stage timing) + +| Stage | Avg (µs) | Median (µs) | +|-------|----------|-------------| +| PyMatchDecode | 277 | 223 | +| PreLaunchCopy | 8.8 | 8.3 | +| ConsumerComplete | 3.3 | 3.2 | +| Submit | 2.8 | 2.7 | +| PollJob | 2.3 | 1.9 | +| ReleaseJob | 2.0 | 1.9 | + +Infrastructure overhead (ring buffer + dispatch + poll + consumer): **~18 µs per request**. + +--- + +## 14. 
Portability + +### Architecture Support + +| Feature | x86_64 | aarch64 (Grace Blackwell) | +|---------|--------|----------------------| +| `QEC_CPU_RELAX()` | `_mm_pause()` | `asm volatile("yield")` | +| Cross-device atomics | libcu++ system-scope | libcu++ system-scope | +| Memory model | TSO (strong) | Weakly ordered (requires fences) | +| Interconnect | PCIe | NVLink-C2C | + +The `QEC_CPU_RELAX()` macro is defined in both `ai_predecoder_service.h` and `host_dispatcher.h` and should be used by all polling code. + +### CUDA Compute Capability + +| Feature | Minimum | +|---------|---------| +| `cudaHostAllocMapped` | All CUDA devices | +| CUDA Graphs (host launch) | sm_50+ | +| libcu++ system-scope atomics | sm_70+ | + +--- + +## 15. Limitations & Future Work + +1. **PyMatching is the bottleneck**: At 224 µs average, PyMatching consumes 93% of CPU-stage time. A faster MWPM decoder (e.g., Fusion Blossom, GPU-accelerated matching) would directly reduce pipeline latency. + +2. **Round-robin slot injection**: The `RingBufferInjector` uses strict round-robin slot assignment. If slot N is busy, the producer stalls even if slot N+1 is free. Out-of-order slot allocation would reduce backpressure but sacrifice FIFO ordering. + +3. **Single data type**: The current test assumes uint8 detectors matching the predecoder model. Support for INT32 models would require element-size-aware input packing. + +4. **Static TRT shapes only**: The current implementation assumes static input/output tensor shapes. Dynamic shapes would require per-invocation shape metadata in the RPC payload and runtime TRT profile switching. + +5. **No queue drain on shutdown**: The PyMatching queue is shut down immediately; jobs that were enqueued but not yet decoded are silently dropped. A production system should drain the queue before stopping. + +6. **Core pinning is advisory**: The pipeline pins threads to cores via `sched_setaffinity`, but does not isolate cores from the OS scheduler. 
A production deployment should use `isolcpus` or cgroups. + +7. **INT8 quantization**: The predecoder model runs in FP16. INT8 quantization could reduce GPU compute from 88 µs to ~50 µs, though the GPU is not currently the bottleneck. + +8. **Sparse PyMatching input**: The predecoder reduces syndrome density to 0.09%. Representing the sparse residual as a list of nonzero indices (rather than a dense vector) could speed up PyMatching's graph traversal. diff --git a/docs/realtime_pipeline_architecture.md b/docs/realtime_pipeline_architecture.md new file mode 100644 index 00000000..b01055f1 --- /dev/null +++ b/docs/realtime_pipeline_architecture.md @@ -0,0 +1,452 @@ +# Realtime Pipeline Architecture + +## 1. Component Overview + +```mermaid +classDiagram + class RealtimePipeline { + -impl_ : Impl~ptr~ + +set_gpu_stage(GpuStageFactory) + +set_cpu_stage(CpuStageCallback) + +set_completion_handler(CompletionCallback) + +start() + +stop() + +create_injector() RingBufferInjector + +complete_deferred(slot) + +stats() Stats + } + + class RingBufferInjector { + -state_ : State~ptr~ + +try_submit(fid, payload, size, rid) bool + +submit(fid, payload, size, rid) + +backpressure_stalls() uint64_t + } + + class RingBufferManager { + -rx_flags_ : atomic_uint64~N~ + -tx_flags_ : atomic_uint64~N~ + -rx_data_host_ : uint8_t~ptr~ + +slot_available(slot) bool + +write_and_signal(slot, fid, payload, len) + +poll_tx(slot, err) cudaq_tx_status_t + +clear_slot(slot) + } + + class cudaq_host_dispatcher_config_t { + +rx_flags : atomic_uint64~ptr~ + +tx_flags : atomic_uint64~ptr~ + +idle_mask : atomic_uint64~ptr~ + +inflight_slot_tags : int~ptr~ + +h_mailbox_bank : void~ptrptr~ + +workers : cudaq_host_dispatch_worker_t* + +num_workers : size_t + +function_table : cudaq_function_entry_t~ptr~ + +shutdown_flag : atomic_int~ptr~ + } + + class AIPreDecoderService { + -h_ready_flags_ : atomic_int~ptr~ + -h_predecoder_outputs_ : void~ptr~ + -graph_exec_ : cudaGraphExec_t + +capture_graph(stream, 
device_launch) + +poll_next_job(job) bool + +release_job(slot) + } + + class PyMatchQueue { + -mtx_ : mutex + -cv_ : condition_variable + -jobs_ : queue~PyMatchJob~ + +push(PyMatchJob) + +pop(PyMatchJob) bool + +shutdown() + } + + RealtimePipeline *-- RingBufferManager : owns + RealtimePipeline *-- cudaq_host_dispatcher_config_t : builds + RealtimePipeline --> RingBufferInjector : creates + RingBufferInjector --> RingBufferManager : writes to + cudaq_host_dispatcher_config_t --> AIPreDecoderService : launches graph + RealtimePipeline --> PyMatchQueue : deferred jobs flow through +``` + +## 2. Thread Model + +The pipeline spawns four categories of threads, each pinnable to a specific CPU core: + +```mermaid +flowchart LR + subgraph "Producer (main thread)" + P["RingBufferInjector::submit()"] + end + + subgraph "Dispatcher Thread (core 2)" + D["cudaq_host_dispatcher_loop()"] + end + + subgraph "Predecoder Workers (cores 10..10+N)" + W0["worker_loop(0)
polls GPU stream 0"] + W1["worker_loop(1)
polls GPU stream 1"] + Wn["worker_loop(N-1)
polls GPU stream N-1"] + end + + subgraph "PyMatching Workers (no pinning)" + PM0["pymatch_thread(0)"] + PM1["pymatch_thread(1)"] + PMn["pymatch_thread(M-1)"] + end + + subgraph "Consumer Thread (core 4)" + C["consumer_loop()"] + end + + subgraph "GPU Streams" + G0["stream 0: CUDA Graph"] + G1["stream 1: CUDA Graph"] + Gn["stream N-1: CUDA Graph"] + end + + P -->|"rx_flags signal"| D + D -->|"cudaGraphLaunch"| G0 + D -->|"cudaGraphLaunch"| G1 + D -->|"cudaGraphLaunch"| Gn + G0 -->|"ready_flags = 1"| W0 + G1 -->|"ready_flags = 1"| W1 + Gn -->|"ready_flags = 1"| Wn + W0 -->|"DEFERRED_COMPLETION
idle_mask restored"| D + W1 -->|"DEFERRED_COMPLETION
idle_mask restored"| D + Wn -->|"DEFERRED_COMPLETION
idle_mask restored"| D + W0 -->|"PyMatchJob"| PM0 + W1 -->|"PyMatchJob"| PM1 + Wn -->|"PyMatchJob"| PMn + PM0 -->|"complete_deferred
tx_flags signal"| C + PM1 -->|"complete_deferred
tx_flags signal"| C + PMn -->|"complete_deferred
tx_flags signal"| C + C -->|"clear_slot"| P +``` + +**Thread counts (d13_r104 configuration):** +- Dispatcher: 1 thread (core 2) +- Predecoder workers: 8 threads (cores 10-17) +- PyMatching workers: 16 threads (unpinned) +- Consumer: 1 thread (core 4) +- Total: 26 threads + +## 3. Sequence Diagram: Single Syndrome Through the Pipeline + +This traces one syndrome request from submission to completion, showing every +atomic operation and the thread/device boundary crossings. + +```mermaid +sequenceDiagram + participant Prod as Producer
(main thread) + participant RB as Ring Buffer
(shared memory) + participant Disp as Dispatcher
(dedicated thread) + participant GPU as GPU Stream w
(CUDA Graph) + participant PDW as Predecoder Worker w
(CPU) + participant PMQ as PyMatchQueue + participant PMW as PyMatching Worker
(CPU) + participant Cons as Consumer
(dedicated thread) + participant App as Application
(completion handler) + + Note over Prod,App: === PHASE 1: Injection === + + Prod->>Prod: CAS next_slot acq_rel, claim slot S + Prod->>RB: memcpy RPCHeader (24 bytes) + payload to rx_data S + Prod->>RB: rx_flags S .store host_ptr, release + Prod->>Prod: slot_occupied S = 1, slot_request S = request_id + Prod->>Prod: total_submitted.fetch_add 1, release + + Note over Prod,App: === PHASE 2: Dispatch === + + Disp->>RB: rx_flags S .load acquire, sees non-zero slot S ready + Disp->>Disp: parse RPCHeader to function_id + Disp->>Disp: idle_mask.load acquire, find worker W via ffsll + Disp->>Disp: idle_mask.fetch_and ~1 shl W, release, mark W busy + Disp->>Disp: inflight_slot_tags W = S + Disp->>RB: h_mailbox_bank W = dev_ptr + Disp->>Disp: __sync_synchronize + + opt pre_launch_fn configured + Disp->>GPU: pre_launch_fn cudaMemcpyAsync DMA syndrome to TRT input (offset 24) + end + + Disp->>GPU: cudaGraphLaunch graph_exec W, stream W + Disp->>RB: tx_flags S .store 0xEEEE, release, IN_FLIGHT sentinel + Disp->>RB: rx_flags S .store 0, release, free rx slot + + Note over Prod,App: === PHASE 3: GPU Inference === + + GPU->>GPU: TRT enqueueV3: AI predecoder inference (uint8 → uint8) + GPU->>GPU: cudaMemcpyAsync D2D: TRT output to h_predecoder_outputs + GPU->>GPU: predecoder_signal_ready_kernel: ready_flags.store 1, release + + Note over Prod,App: === PHASE 4: Predecoder Worker (fast path, ~10 µs) === + + PDW->>PDW: poll_next_job: ready_flags CAS 1 to 2, acquire + PDW->>PDW: memcpy h_predecoder_outputs to deferred_outputs[S] + PDW->>PDW: compute syndrome density metrics + PDW->>PDW: release_job: ready_flags.store 0, release + PDW->>PDW: extract request_id from RPCHeader + PDW->>PMQ: push PyMatchJob(S, request_id, ring_buffer_ptr) + PDW->>PDW: return DEFERRED_COMPLETION + PDW->>Disp: idle_mask.fetch_or 1 shl W, release, worker W free + + Note over Prod,App: === PHASE 5: PyMatching Decode (~224 µs) === + + PMW->>PMQ: pop PyMatchJob + PMW->>PMW: acquire per-thread decoder 
(thread_local) + PMW->>PMW: read deferred_outputs[S]: logical_pred + residual detectors + PMW->>PMW: PyMatching MWPM decode over full H matrix + PMW->>PMW: project corrections onto observable O + PMW->>RB: write RPCResponse + DecodeResponse to ring buffer slot + PMW->>RB: complete_deferred(S): tx_flags S .store slot_host_addr, release + + Note over Prod,App: === PHASE 6: Completion === + + Cons->>RB: poll_tx S: tx_flags S .load acquire, sees valid addr READY + Cons->>App: completion_handler request_id, slot, success + Cons->>Cons: total_completed.fetch_add 1, relaxed + Cons->>Cons: slot_occupied S = 0 + Cons->>Cons: __sync_synchronize + Cons->>RB: clear_slot S: rx_flags = 0, tx_flags = 0 + Note over Prod: Slot S now available for next submission +``` + +## 4. Atomic Variables Reference + +Every atomic used in the pipeline, its scope, who writes it, who reads it, +and the memory ordering used. + +### Ring Buffer Flags + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `rx_flags[slot]` | `cuda::atomic` | Producer ↔ Dispatcher | Producer (signal), Dispatcher (clear), Consumer (clear) | Dispatcher (poll) | store: `release`, load: `acquire` | +| `tx_flags[slot]` | `cuda::atomic` | Dispatcher ↔ PyMatch Worker ↔ Consumer | Dispatcher (IN_FLIGHT), PyMatch Worker (READY/addr via `complete_deferred`) | Consumer (poll) | store: `release`, load: `acquire` | + +### Worker Pool Scheduling + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `idle_mask` | `cuda::atomic` | Dispatcher ↔ Pipeline Workers | Dispatcher (clear bit), Pipeline (set bit after DEFERRED_COMPLETION) | Dispatcher (find free worker) | fetch_and/fetch_or: `release`, load: `acquire` | + +### GPU ↔ CPU Handoff (per AIPreDecoderService) + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| 
`ready_flags[0]` | `cuda::atomic` | GPU kernel ↔ Predecoder worker | GPU kernel (0→1), Worker (CAS 1→2), Worker (2→0 via release_job) | Worker (CAS poll) | store: `release`, CAS success: `acquire`, CAS fail: `relaxed` | + +### Pipeline Lifecycle + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `shutdown_flag` | `cuda::atomic` | Main ↔ Dispatcher | Main thread | Dispatcher loop | store: `release`, load: `acquire` | +| `producer_stop` | `std::atomic` | Main ↔ Consumer/Injector | Main thread | Consumer, Injector | store: `release`, load: `acquire` | +| `consumer_stop` | `std::atomic` | Main ↔ Consumer/Workers | Main thread | Consumer, Workers | store: `release`, load: `acquire` | +| `total_submitted` | `std::atomic` | Injector ↔ Consumer | Injector | Consumer | fetch_add: `release`, load: `acquire` | +| `total_completed` | `std::atomic` | Consumer ↔ Main | Consumer | Main (stats) | fetch_add: `relaxed`, load: `relaxed` | +| `backpressure_stalls` | `std::atomic` | Injector ↔ Main | Injector | Main (stats) | fetch_add: `relaxed`, load: `relaxed` | +| `started` | `std::atomic` | Main thread | start()/stop() | destructor, start() | implicit seq_cst | + +### Injector Slot Claiming + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `next_slot` | `std::atomic` | Injector-internal | try_submit (CAS) | try_submit | CAS: `acq_rel` / `relaxed` | + +## 5. Ring Buffer Slot State Machine + +Each of the N ring buffer slots transitions through these states. The +transitions are driven by atomic flag writes from different threads. 
+ +```mermaid +stateDiagram-v2 + [*] --> FREE : initialization + + FREE --> RX_SIGNALED : Producer writes rx_flags[S] = host_ptr + note right of RX_SIGNALED + rx_flags != 0, tx_flags = 0 + RPCHeader (24B) + payload in rx_data + end note + + RX_SIGNALED --> IN_FLIGHT : Dispatcher reads rx_flags, launches graph, sets tx_flags IN_FLIGHT, clears rx_flags + note right of IN_FLIGHT + rx_flags = 0, tx_flags = 0xEEEE + GPU processing + predecoder worker + PyMatch queue + end note + + IN_FLIGHT --> TX_READY : PyMatch worker calls complete_deferred → tx_flags = slot_host_addr + note right of TX_READY + rx_flags = 0, tx_flags = valid addr + Result available for consumer + end note + + TX_READY --> FREE : Consumer reads result, calls clear_slot + + IN_FLIGHT --> TX_ERROR : cudaGraphLaunch failed, tx_flags = 0xDEAD | err + TX_ERROR --> FREE : Consumer reads error, calls clear_slot +``` + +**`tx_flags` value encoding:** + +| Value | Meaning | +|-------|---------| +| `0` | Slot is free (no pending result) | +| `0xEEEEEEEEEEEEEEEE` | IN_FLIGHT — graph launched, result not yet ready | +| `0xDEAD____XXXXXXXX` | ERROR — upper 16 bits = `0xDEAD`, lower 32 = cudaError_t | +| Any other non-zero | READY — value is host pointer to slot data containing result | + +## 6. CUDA Graph Structure (per Worker) + +Each worker has a pre-captured CUDA graph that executes on its dedicated stream. +The graph is instantiated once at startup and replayed for every syndrome. + +```mermaid +flowchart TD + subgraph "CUDA Graph (AIPreDecoderService)" + A["TRT enqueueV3
(AI predecoder inference)"] --> B["cudaMemcpyAsync D2D
TRT output → h_predecoder_outputs
(host-mapped)"] + B --> C["predecoder_signal_ready_kernel
ready_flags.store(1, release)"] + end + + subgraph "Pre-Launch Callback (host-side, before graph)" + P["pre_launch_fn:
cudaMemcpyAsync D2D
ring buffer slot+24 → TRT input
(DMA copy engine)"] + end + + subgraph "Predecoder Worker (fast path, ~10 µs)" + D["poll_next_job():
ready_flags CAS 1 → 2"] + E["memcpy output → deferred_outputs[slot]"] + F["syndrome density metrics"] + G["release_job():
ready_flags store 0"] + H["enqueue PyMatchJob"] + I["return DEFERRED_COMPLETION
→ idle_mask restored"] + D --> E --> F --> G --> H --> I + end + + subgraph "PyMatching Worker (~224 µs)" + J["pop PyMatchJob from queue"] + K["PyMatching MWPM decode"] + L["Write RPC response"] + M["complete_deferred(slot):
tx_flags.store(addr, release)"] + J --> K --> L --> M + end + + P --> A + C -.->|"GPU signals ready_flags = 1"| D + I -.->|"PyMatchQueue"| J +``` + +## 7. Backpressure and Flow Control + +The pipeline uses implicit backpressure through slot availability: + +```mermaid +flowchart TD + subgraph "Flow Control" + Submit["Injector::try_submit()"] + Check{"slot_available(S)?
rx_flags=0 AND tx_flags=0"} + CAS{"CAS next_slot
cur to cur+1"} + Write["Write RPCHeader + payload + signal"] + Stall["backpressure_stalls++
QEC_CPU_RELAX()"] + Retry["Retry"] + + Submit --> Check + Check -->|yes| CAS + Check -->|no| Stall + CAS -->|success| Write + CAS -->|"fail contention"| Stall + Stall --> Retry --> Submit + end +``` + +**Capacity:** With `num_slots = 16` and `num_workers = 8` (predecoder) + `16` (PyMatching), +up to 16 syndromes can be in various stages of processing simultaneously. When all 16 +slots are occupied (either waiting for dispatch, in-flight on GPU, being decoded by +PyMatching, or awaiting consumer pickup), the injector stalls until the consumer frees a +slot. + +**Round-robin limitation:** The injector uses strict round-robin slot selection. If slot N +is busy but slot N+1 is free, the producer still stalls on slot N. This preserves FIFO +ordering but contributes to the ~6.2M backpressure stalls observed at 104 µs injection rate. + +## 8. ARM Memory Ordering Considerations + +The pipeline runs on NVIDIA Grace (ARM aarch64) which has a weakly-ordered +memory model. Key ordering guarantees: + +1. **Producer → Dispatcher:** `rx_flags[S].store(release)` pairs with + `rx_flags[S].load(acquire)`. The dispatcher sees all payload bytes written + before the flag. + +2. **Dispatcher → Worker (via GPU):** The CUDA graph launch is ordered by + `cudaGraphLaunch` semantics. The `ready_flags` store inside the GPU kernel + uses `cuda::thread_scope_system` + `memory_order_release`, paired with the + worker's `compare_exchange_strong(acquire)`. + +3. **Predecoder Worker → PyMatch Worker:** The `PyMatchQueue` uses `std::mutex` + + `std::condition_variable`, which provide implicit acquire/release semantics. + The `deferred_outputs[slot]` buffer is written by the predecoder worker before + `push()` and read by the PyMatch worker after `pop()`, so the mutex guarantees + visibility. + +4. **PyMatch Worker → Consumer:** `tx_flags[S].store(release)` in + `complete_deferred()` pairs with `tx_flags[S].load(acquire)` in `poll_tx_flag()`. 
+ Consumer sees the full RPC response before the ready flag. + +5. **Consumer → Producer (slot recycling):** `slot_occupied[S] = 0` followed + by `__sync_synchronize()` (full barrier) before `clear_slot()` ensures the + producer cannot see a free slot while the consumer is still accessing + slot metadata. + +```mermaid +flowchart LR + subgraph "Release/Acquire Pairs" + A["rx_flags store
(release)"] -->|"paired with"| B["rx_flags load
(acquire)"] + C["tx_flags store
(release, complete_deferred)"] -->|"paired with"| D["tx_flags load
(acquire, poll_tx)"] + E["ready_flags store(1)
(release, system scope)"] -->|"paired with"| F["ready_flags CAS
(acquire)"] + G["idle_mask fetch_or
(release)"] -->|"paired with"| H["idle_mask load
(acquire)"] + end + + subgraph "Mutex-Based Ordering" + I["PyMatchQueue::push()
mutex lock/unlock"] -->|"happens-before"| J["PyMatchQueue::pop()
mutex lock/unlock"] + end + + subgraph "Full Barriers" + K["__sync_synchronize()
between slot_occupied=0
and clear_slot()"] + L["__sync_synchronize()
between mailbox_bank write
and cudaGraphLaunch"]
+  end
+```
+
+## 9. DEFERRED_COMPLETION Protocol
+
+The `DEFERRED_COMPLETION` mechanism allows predecoder workers to release their
+GPU stream immediately while deferring ring buffer slot completion to a later
+thread (the PyMatching worker pool).
+
+```mermaid
+sequenceDiagram
+    participant PW as Predecoder Worker
+    participant Pipeline as RealtimePipeline
+    participant PMQ as PyMatchQueue
+    participant PMW as PyMatch Worker
+
+    PW->>PW: poll_next_job() succeeds
+    PW->>PW: copy output, release GPU slot
+    PW->>PMQ: push(PyMatchJob)
+    PW->>Pipeline: return DEFERRED_COMPLETION
+    Pipeline->>Pipeline: idle_mask.fetch_or(1<<W, release)
+    Note over Pipeline: tx_flags NOT touched
+
+    PMW->>PMQ: pop(PyMatchJob)
+    PMW->>PMW: PyMatching MWPM decode
+    PMW->>PMW: Write RPC response to ring buffer
+    PMW->>Pipeline: complete_deferred(slot)
+    Pipeline->>Pipeline: tx_flags[slot].store(host_addr, release)
+    Note over Pipeline: Slot S now READY
Consumer can harvest +``` + +**Key invariant:** Between `DEFERRED_COMPLETION` and `complete_deferred()`, the ring +buffer slot remains in the IN_FLIGHT state (`tx_flags = 0xEEEE`). The slot's data area +is safe to read/write because the consumer only harvests when `tx_flags` transitions to +a valid address, and the producer cannot reuse the slot while `tx_flags != 0`. diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h new file mode 100644 index 00000000..ee3e075d --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -0,0 +1,80 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cudaq::qec { + +class AIDecoderService { +public: + class Logger : public nvinfer1::ILogger { + void log(Severity severity, const char *msg) noexcept override; + } static gLogger; + + /// @brief Constructor. Accepts a serialized TRT engine (.engine/.plan) or + /// an ONNX model (.onnx) which will be compiled to a TRT engine. 
+ /// @param model_path Path to the model file + /// @param device_mailbox_slot Pointer to the specific slot in the global + /// mailbox bank + /// @param engine_save_path If non-empty and model_path is .onnx, save the + /// built engine to this path for fast reloading on subsequent runs + AIDecoderService(const std::string &model_path, void **device_mailbox_slot, + const std::string &engine_save_path = ""); + + virtual ~AIDecoderService(); + + virtual void capture_graph(cudaStream_t stream); + + cudaGraphExec_t get_executable_graph() const { return graph_exec_; } + + /// @brief Size of the primary input tensor in bytes (payload from RPC) + size_t get_input_size() const { return input_size_; } + + /// @brief Size of the primary output tensor in bytes (forwarded to CPU) + size_t get_output_size() const { return output_size_; } + + void *get_trt_input_ptr() const { return d_trt_input_; } + +protected: + void load_engine(const std::string &path); + void build_engine_from_onnx(const std::string &onnx_path, + const std::string &engine_save_path = ""); + void setup_bindings(); + void allocate_resources(); + + std::unique_ptr runtime_; + std::unique_ptr engine_; + std::unique_ptr context_; + + cudaGraphExec_t graph_exec_ = nullptr; + + void **device_mailbox_slot_; + void *d_trt_input_ = nullptr; // Primary input buffer + void *d_trt_output_ = nullptr; // Primary output buffer (residual_detectors) + std::vector d_aux_buffers_; // Additional I/O buffers TRT needs + + struct TensorBinding { + std::string name; + void *d_buffer = nullptr; + size_t size_bytes = 0; + bool is_input = false; + }; + std::vector all_bindings_; + + size_t input_size_ = 0; + size_t output_size_ = 0; +}; + +} // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h new file mode 100644 index 00000000..db5638dd --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -0,0 
+1,84 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include +#include + +// Portable CPU Yield Macro for busy-polling (skip if already defined by realtime API) +#ifndef QEC_CPU_RELAX +#if defined(__x86_64__) +#include +#define QEC_CPU_RELAX() _mm_pause() +#elif defined(__aarch64__) +#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") +#else +#define QEC_CPU_RELAX() std::atomic_thread_fence(std::memory_order_seq_cst) +#endif +#endif + +namespace cudaq::qec { + +struct PreDecoderJob { + int slot_idx; ///< Worker/slot index (for release_job; always 0) + int origin_slot; ///< FPGA ring slot for tx_flags routing (dynamic pool) + void *ring_buffer_ptr; + void *inference_data; ///< Points into the pinned output (single slot) + + // Performance Tracking + uint64_t submit_ts_ns; + uint64_t dispatch_ts_ns; + uint64_t poll_ts_ns; +}; + +class AIPreDecoderService : public AIDecoderService { +public: + AIPreDecoderService(const std::string &engine_path, + void **device_mailbox_slot, int queue_depth = 1, + const std::string &engine_save_path = ""); + virtual ~AIPreDecoderService(); + + void capture_graph(cudaStream_t stream, bool device_launch); + void capture_graph(cudaStream_t stream) override { + capture_graph(stream, true); + } + + bool poll_next_job(PreDecoderJob &out_job); + void release_job(int slot_idx); + + /// Stub for device-dispatcher batch path (returns nullptr; streaming uses + /// host dispatcher) + int *get_device_queue_idx() const { return nullptr; } + cuda::atomic *get_device_ready_flags() const { + return 
d_ready_flags_; + } + int *get_device_inflight_flag() const { return nullptr; } + + cuda::atomic *get_host_ready_flags() const { + return h_ready_flags_; + } + volatile int *get_host_queue_idx() const { return nullptr; } + int get_queue_depth() const { return queue_depth_; } + + void **get_host_ring_ptrs() const { return h_ring_ptrs_; } + +private: + int queue_depth_; // Always 1 + + cuda::atomic *h_ready_flags_ = nullptr; + void **h_ring_ptrs_ = nullptr; + void *h_predecoder_outputs_ = nullptr; + + cuda::atomic *d_ready_flags_ = nullptr; + void **d_ring_ptrs_ = nullptr; + void *d_predecoder_outputs_ = nullptr; +}; + +} // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h b/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h new file mode 100644 index 00000000..d20568b6 --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h @@ -0,0 +1,32 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. 
+ ******************************************************************************/ + +#pragma once + +#ifdef ENABLE_NVTX + +#include + +struct NvtxRange { + explicit NvtxRange(const char *name) { nvtxRangePushA(name); } + ~NvtxRange() { nvtxRangePop(); } + NvtxRange(const NvtxRange &) = delete; + NvtxRange &operator=(const NvtxRange &) = delete; +}; + +#define NVTX_RANGE(name) NvtxRange _nvtx_range_##__LINE__(name) +#define NVTX_PUSH(name) nvtxRangePushA(name) +#define NVTX_POP() nvtxRangePop() + +#else + +#define NVTX_RANGE(name) (void)0 +#define NVTX_PUSH(name) (void)0 +#define NVTX_POP() (void)0 + +#endif diff --git a/libs/qec/include/cudaq/qec/realtime/pipeline.h b/libs/qec/include/cudaq/qec/realtime/pipeline.h new file mode 100644 index 00000000..57c96b37 --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/pipeline.h @@ -0,0 +1,187 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cudaq::realtime { + +// --------------------------------------------------------------------------- +// Configuration +// --------------------------------------------------------------------------- + +struct CorePinning { + int dispatcher = -1; // -1 = no pinning + int consumer = -1; + int worker_base = -1; // workers pin to base, base+1, ... 
+}; + +struct PipelineStageConfig { + int num_workers = 8; + int num_slots = 32; + size_t slot_size = 16384; + CorePinning cores; +}; + +// --------------------------------------------------------------------------- +// GPU Stage Factory +// --------------------------------------------------------------------------- + +struct GpuWorkerResources { + cudaGraphExec_t graph_exec = nullptr; + cudaStream_t stream = nullptr; + void (*pre_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *pre_launch_data = nullptr; + void (*post_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *post_launch_data = nullptr; + uint32_t function_id = 0; + void *user_context = nullptr; +}; + +/// Called once per worker during start(). Returns GPU resources for that +/// worker. +using GpuStageFactory = std::function; + +// --------------------------------------------------------------------------- +// CPU Stage Callback +// --------------------------------------------------------------------------- + +/// Passed to the user's CPU stage callback on each completed GPU workload. +/// The user reads gpu_output, does post-processing, and writes the +/// result into response_buffer. No atomics are exposed. +struct CpuStageContext { + int worker_id; + int origin_slot; + const void *gpu_output; + size_t gpu_output_size; + void *response_buffer; + size_t max_response_size; + void *user_context; +}; + +/// Returns the number of bytes written into response_buffer. +/// Return 0 if no GPU result is ready yet (poll again). +/// Return DEFERRED_COMPLETION to release the worker immediately while +/// deferring slot completion to a later complete_deferred() call. +using CpuStageCallback = std::function; + +/// Sentinel return value from CpuStageCallback: release the worker +/// (idle_mask) but do NOT signal slot completion (tx_flags). 
The caller +/// is responsible for calling RealtimePipeline::complete_deferred(slot) +/// once the deferred work (e.g. a separate decode thread) finishes. +static constexpr size_t DEFERRED_COMPLETION = SIZE_MAX; + +// --------------------------------------------------------------------------- +// Completion Callback +// --------------------------------------------------------------------------- + +struct Completion { + uint64_t request_id; + int slot; + bool success; + int cuda_error; // 0 on success +}; + +/// Called by the consumer thread for each completed (or errored) request. +using CompletionCallback = std::function; + +// --------------------------------------------------------------------------- +// Ring Buffer Injector (software-only test/replay data source) +// --------------------------------------------------------------------------- + +/// Writes RPC-framed requests into the pipeline's ring buffer, simulating +/// FPGA DMA deposits. Created via RealtimePipeline::create_injector(). +/// The parent RealtimePipeline must outlive the injector. +class RingBufferInjector { +public: + ~RingBufferInjector(); + RingBufferInjector(RingBufferInjector &&) noexcept; + RingBufferInjector &operator=(RingBufferInjector &&) noexcept; + + RingBufferInjector(const RingBufferInjector &) = delete; + RingBufferInjector &operator=(const RingBufferInjector &) = delete; + + /// Try to submit a request. Returns true if accepted, false if + /// backpressure (all slots busy). Non-blocking. Thread-safe. + bool try_submit(uint32_t function_id, const void *payload, + size_t payload_size, uint64_t request_id); + + /// Blocking submit: spins until a slot becomes available. 
+ void submit(uint32_t function_id, const void *payload, size_t payload_size, + uint64_t request_id); + + uint64_t backpressure_stalls() const; + +private: + friend class RealtimePipeline; + struct State; + std::unique_ptr state_; + explicit RingBufferInjector(std::unique_ptr s); +}; + +// --------------------------------------------------------------------------- +// Pipeline +// --------------------------------------------------------------------------- + +class RealtimePipeline { +public: + explicit RealtimePipeline(const PipelineStageConfig &config); + ~RealtimePipeline(); + + RealtimePipeline(const RealtimePipeline &) = delete; + RealtimePipeline &operator=(const RealtimePipeline &) = delete; + + /// Register the GPU stage factory (called before start). + void set_gpu_stage(GpuStageFactory factory); + + /// Register the CPU worker callback (called before start). + void set_cpu_stage(CpuStageCallback callback); + + /// Register the completion callback (called before start). + void set_completion_handler(CompletionCallback handler); + + /// Allocate resources, build dispatcher config, spawn all threads. + void start(); + + /// Signal shutdown, join all threads, free resources. + void stop(); + + /// Create a software injector for testing without FPGA hardware. + /// The pipeline must be constructed but need not be started yet. + RingBufferInjector create_injector(); + + struct Stats { + uint64_t submitted; + uint64_t completed; + uint64_t dispatched; + uint64_t backpressure_stalls; + }; + + /// Thread-safe, lock-free stats snapshot. + Stats stats() const; + + /// Signal that deferred processing for a slot is complete. + /// Call this from any thread after the cpu_stage callback returned + /// DEFERRED_COMPLETION and the deferred work has finished writing the + /// response into the slot's ring buffer area. 
+ void complete_deferred(int slot); + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace cudaq::realtime diff --git a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h new file mode 100644 index 00000000..7075f5d4 --- /dev/null +++ b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h @@ -0,0 +1,210 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudaq::qec::utils { + +/// Reusable latency / throughput tracker for realtime decoding pipelines. +/// +/// Usage: +/// PipelineBenchmark bench("my test", num_requests); +/// bench.start(); +/// for (int i = 0; i < n; ++i) { +/// bench.mark_submit(i); +/// // ... submit request ... +/// // ... wait for response ... 
+/// bench.mark_complete(i); // only if successful +/// } +/// bench.stop(); +/// bench.report(); +/// +class PipelineBenchmark { +public: + using clock = std::chrono::high_resolution_clock; + using time_point = clock::time_point; + using duration_us = std::chrono::duration; + + explicit PipelineBenchmark(const std::string &label = "Pipeline", + size_t expected_requests = 0) + : label_(label), total_submitted_(0) { + if (expected_requests > 0) { + submit_times_.resize(expected_requests); + complete_times_.resize(expected_requests); + completed_.resize(expected_requests, false); + } + } + + void start() { run_start_ = clock::now(); } + void stop() { run_end_ = clock::now(); } + + void mark_submit(int request_id) { + ensure_capacity(request_id); + submit_times_[request_id] = clock::now(); + total_submitted_++; + } + + void mark_complete(int request_id) { + ensure_capacity(request_id); + complete_times_[request_id] = clock::now(); + completed_[request_id] = true; + } + + struct Stats { + size_t submitted = 0; + size_t completed = 0; + double min_us = 0, max_us = 0, mean_us = 0; + double p50_us = 0, p90_us = 0, p95_us = 0, p99_us = 0; + double stddev_us = 0; + double total_wall_us = 0; + double throughput_rps = 0; + }; + + /// Return per-request latencies in microseconds (completed requests only). + std::vector latencies_us() const { + size_t n = std::min( + {submit_times_.size(), complete_times_.size(), completed_.size()}); + std::vector lats; + lats.reserve(n); + for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; + auto dt = std::chrono::duration_cast(complete_times_[i] - + submit_times_[i]); + lats.push_back(dt.count()); + } + return lats; + } + + /// Return per-request latency or -1.0 for incomplete (preserves indices). 
+ std::vector all_latencies_us() const { + size_t n = std::min( + {submit_times_.size(), complete_times_.size(), completed_.size()}); + std::vector lats(n, -1.0); + for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; + auto dt = std::chrono::duration_cast(complete_times_[i] - + submit_times_[i]); + lats[i] = dt.count(); + } + return lats; + } + + Stats compute_stats() const { + auto lats = latencies_us(); + Stats s; + s.submitted = total_submitted_; + s.completed = lats.size(); + if (s.completed == 0) + return s; + + std::sort(lats.begin(), lats.end()); + + s.min_us = lats.front(); + s.max_us = lats.back(); + s.mean_us = std::accumulate(lats.begin(), lats.end(), 0.0) / s.completed; + s.p50_us = percentile(lats, 50.0); + s.p90_us = percentile(lats, 90.0); + s.p95_us = percentile(lats, 95.0); + s.p99_us = percentile(lats, 99.0); + + double sum_sq = 0; + for (auto v : lats) + sum_sq += (v - s.mean_us) * (v - s.mean_us); + s.stddev_us = std::sqrt(sum_sq / s.completed); + + auto wall = std::chrono::duration_cast(run_end_ - run_start_); + s.total_wall_us = wall.count(); + s.throughput_rps = + (s.total_wall_us > 0) ? 
(s.completed * 1e6 / s.total_wall_us) : 0; + + return s; + } + + void report(std::ostream &os = std::cout) const { + auto s = compute_stats(); + auto all = all_latencies_us(); + + os << "\n"; + os << "================================================================\n"; + os << " Benchmark: " << label_ << "\n"; + os << "================================================================\n"; + os << std::fixed; + os << " Submitted: " << s.submitted << "\n"; + os << " Completed: " << s.completed << "\n"; + if (s.submitted > s.completed) + os << " Timed out: " << (s.submitted - s.completed) << "\n"; + os << std::setprecision(1); + os << " Wall time: " << s.total_wall_us / 1000.0 << " ms\n"; + os << " Throughput: " << s.throughput_rps << " req/s\n"; + os << " ---------------------------------------------------------------\n"; + os << " Latency (us) [completed requests only]\n"; + os << std::setprecision(1); + os << " min = " << std::setw(10) << s.min_us << "\n"; + os << " p50 = " << std::setw(10) << s.p50_us << "\n"; + os << " mean = " << std::setw(10) << s.mean_us << "\n"; + os << " p90 = " << std::setw(10) << s.p90_us << "\n"; + os << " p95 = " << std::setw(10) << s.p95_us << "\n"; + os << " p99 = " << std::setw(10) << s.p99_us << "\n"; + os << " max = " << std::setw(10) << s.max_us << "\n"; + os << " stddev = " << std::setw(10) << s.stddev_us << "\n"; + os << " ---------------------------------------------------------------\n"; + + // Per-request breakdown: only show for small runs (<=50 requests) + if (!all.empty() && all.size() <= 50) { + os << " Per-request latencies (us):\n"; + for (size_t i = 0; i < all.size(); ++i) { + os << " [" << std::setw(4) << i << "] "; + if (all[i] < 0) + os << " TIMEOUT\n"; + else + os << std::setprecision(1) << std::setw(10) << all[i] << "\n"; + } + } + os << "================================================================\n"; + } + +private: + std::string label_; + size_t total_submitted_; + time_point run_start_{}, run_end_{}; + 
std::vector submit_times_; + std::vector complete_times_; + std::vector completed_; + + void ensure_capacity(int id) { + size_t needed = static_cast(id) + 1; + if (submit_times_.size() < needed) + submit_times_.resize(needed); + if (complete_times_.size() < needed) + complete_times_.resize(needed); + if (completed_.size() < needed) + completed_.resize(needed, false); + } + + static double percentile(const std::vector &sorted, double p) { + if (sorted.empty()) + return 0; + double idx = (p / 100.0) * (sorted.size() - 1); + size_t lo = static_cast(idx); + size_t hi = std::min(lo + 1, sorted.size() - 1); + double frac = idx - lo; + return sorted[lo] * (1.0 - frac) + sorted[hi] * frac; + } +}; + +} // namespace cudaq::qec::utils diff --git a/libs/qec/include/cudaq/qec/utils/thread_pool.h b/libs/qec/include/cudaq/qec/utils/thread_pool.h new file mode 100644 index 00000000..8fe3b67e --- /dev/null +++ b/libs/qec/include/cudaq/qec/utils/thread_pool.h @@ -0,0 +1,145 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#include +#endif + +namespace cudaq::qec::utils { + +class ThreadPool { +public: + // Option 1: Standard unpinned thread pool + explicit ThreadPool(size_t threads); + + // Option 2: Pinned thread pool (1 thread per specified core ID) + explicit ThreadPool(const std::vector &core_ids); + + ~ThreadPool(); + + // Enqueue a job into the pool. 
+ template + auto enqueue(F &&f, Args &&...args) + -> std::future::type>; + +private: + void worker_loop(); + + std::vector workers; + std::queue> tasks; + + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// --- Implementation --- + +inline void ThreadPool::worker_loop() { + while (true) { + std::function task; + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait( + lock, [this] { return this->stop || !this->tasks.empty(); }); + + if (this->stop && this->tasks.empty()) { + return; + } + + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + task(); + } +} + +// Constructor 1: Unpinned +inline ThreadPool::ThreadPool(size_t threads) : stop(false) { + for (size_t i = 0; i < threads; ++i) { + workers.emplace_back([this] { this->worker_loop(); }); + } +} + +// Constructor 2: Pinned to specific cores +inline ThreadPool::ThreadPool(const std::vector &core_ids) : stop(false) { + for (size_t i = 0; i < core_ids.size(); ++i) { + int core_id = core_ids[i]; + + workers.emplace_back([this, core_id] { + // Apply Thread Affinity (Linux Only) +#if defined(__linux__) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + int rc = + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cerr << "[ThreadPool] Warning: Failed to pin thread to core " + << core_id << " (Error " << rc << ")\n"; + } +#else + // Silent fallback for non-Linux platforms + (void)core_id; +#endif + + // Enter the standard execution loop + this->worker_loop(); + }); + } +} + +template +auto ThreadPool::enqueue(F &&f, Args &&...args) + -> std::future::type> { + using return_type = typename std::invoke_result::type; + + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + if (stop) { + throw std::runtime_error("enqueue on stopped ThreadPool"); + } + tasks.emplace([task]() { 
(*task)(); }); + } + condition.notify_one(); + return res; +} + +inline ThreadPool::~ThreadPool() { + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread &worker : workers) { + if (worker.joinable()) { + worker.join(); + } + } +} + +} // namespace cudaq::qec::utils diff --git a/libs/qec/lib/realtime/CMakeLists.txt b/libs/qec/lib/realtime/CMakeLists.txt index 2c459e1c..3d25e3dd 100644 --- a/libs/qec/lib/realtime/CMakeLists.txt +++ b/libs/qec/lib/realtime/CMakeLists.txt @@ -25,9 +25,24 @@ if(CMAKE_CUDA_COMPILER) find_path(CUDAQ_REALTIME_INCLUDE_DIR NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h - PATHS ${_cudaq_realtime_prefixes} - PATH_SUFFIXES include ../include + HINTS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + NO_DEFAULT_PATH ) + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + ) + endif() + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + HINTS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include ../include + ) + endif() if(CUDAQ_REALTIME_INCLUDE_DIR) message(STATUS "Found cuda-quantum realtime headers at ${CUDAQ_REALTIME_INCLUDE_DIR}") @@ -113,5 +128,66 @@ install(TARGETS cudaq-qec-realtime-decoding LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# --------------------------------------------------------------------------- +# RealtimePipeline shared library +# Requires pre-installed cudaq-realtime (set CUDAQ_REALTIME_ROOT) +# --------------------------------------------------------------------------- +if(CMAKE_CUDA_COMPILER AND CUDAQ_REALTIME_INCLUDE_DIR) + find_library(_CUDAQ_RT_LIB cudaq-realtime + PATHS ${_cudaq_realtime_prefixes} PATH_SUFFIXES lib) + find_library(_CUDAQ_RT_HD_LIB cudaq-realtime-host-dispatch + PATHS ${_cudaq_realtime_prefixes} PATH_SUFFIXES lib) + 
+ if(_CUDAQ_RT_LIB AND _CUDAQ_RT_HD_LIB) + message(STATUS "RealtimePipeline: building with CUDAQ_REALTIME_INCLUDE_DIR=${CUDAQ_REALTIME_INCLUDE_DIR}") + + add_library(cudaq-realtime-pipeline SHARED + realtime_pipeline.cu + ) + + get_filename_component(_cuda_bin_pl "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root_pl "${_cuda_bin_pl}" DIRECTORY) + set(_cuda_cccl_include_pl "${_cuda_root_pl}/include/cccl") + + target_include_directories(cudaq-realtime-pipeline + PUBLIC + $ + $ + $ + $ + ) + + target_link_libraries(cudaq-realtime-pipeline + PUBLIC CUDA::cudart_static + PRIVATE ${_CUDAQ_RT_LIB} ${_CUDAQ_RT_HD_LIB} + ) + + option(ENABLE_NVTX "Enable NVTX profiling ranges" OFF) + if(ENABLE_NVTX) + target_compile_definitions(cudaq-realtime-pipeline PRIVATE ENABLE_NVTX) + message(STATUS "NVTX profiling enabled for cudaq-realtime-pipeline") + endif() + + get_filename_component(_CUDAQ_RT_LIB_DIR "${_CUDAQ_RT_LIB}" DIRECTORY) + set_target_properties(cudaq-realtime-pipeline PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + BUILD_RPATH "${_CUDAQ_RT_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) + + install(TARGETS cudaq-realtime-pipeline + COMPONENT qec-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + message(STATUS "RealtimePipeline: skipping (cudaq-realtime or cudaq-realtime-host-dispatch not found)") + endif() +else() + if(CMAKE_CUDA_COMPILER) + message(STATUS "RealtimePipeline: skipping (CUDAQ_REALTIME_INCLUDE_DIR not set)") + endif() +endif() + add_subdirectory(quantinuum) add_subdirectory(simulation) diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu new file mode 100644 index 00000000..4694a477 --- /dev/null +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -0,0 +1,366 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. 
* + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include +#include +#include +#include +#include +#include +#include + +#define DECODER_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error( \ + std::string("CUDA Error in AIDecoderService: ") + \ + cudaGetErrorString(err)); \ + } \ + } while (0) + +namespace cudaq::qec { + +// ============================================================================= +// Gateway Kernels +// ============================================================================= + +__global__ void gateway_input_kernel(void **mailbox_slot_ptr, + void *trt_fixed_input, + size_t copy_size_bytes) { + void *ring_buffer_data = *mailbox_slot_ptr; + if (ring_buffer_data == nullptr) + return; + + const char *src = + (const char *)ring_buffer_data + sizeof(cudaq::realtime::RPCHeader); + char *dst = (char *)trt_fixed_input; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; + i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } +} + +__global__ void gateway_output_kernel(void **mailbox_slot_ptr, + const void *trt_fixed_output, + size_t result_size_bytes) { + void *ring_buffer_data = *mailbox_slot_ptr; + if (ring_buffer_data == nullptr) + return; + + char *dst = (char *)ring_buffer_data + sizeof(cudaq::realtime::RPCHeader); + const char *src = (const char *)trt_fixed_output; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < result_size_bytes; + i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0 && blockIdx.x == 0) { + auto *response = (cudaq::realtime::RPCResponse 
*)ring_buffer_data; + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = static_cast(result_size_bytes); + __threadfence_system(); + } +} + +// ============================================================================= +// Helpers +// ============================================================================= + +static size_t trt_dtype_size(nvinfer1::DataType dtype) { + switch (dtype) { + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + case nvinfer1::DataType::kINT8: + return 1; + case nvinfer1::DataType::kUINT8: + return 1; + case nvinfer1::DataType::kINT32: + return 4; + case nvinfer1::DataType::kINT64: + return 8; + case nvinfer1::DataType::kBOOL: + return 1; + default: + return 4; + } +} + +static size_t tensor_volume(const nvinfer1::Dims &d) { + size_t v = 1; + for (int i = 0; i < d.nbDims; ++i) + v *= (d.d[i] > 0) ? static_cast(d.d[i]) : 1; + return v; +} + +// ============================================================================= +// Class Implementation +// ============================================================================= + +AIDecoderService::Logger AIDecoderService::gLogger; + +void AIDecoderService::Logger::log(Severity severity, + const char *msg) noexcept { + if (severity <= Severity::kWARNING) { + std::printf("[TensorRT] %s\n", msg); + } +} + +AIDecoderService::AIDecoderService(const std::string &model_path, + void **device_mailbox_slot, + const std::string &engine_save_path) + : device_mailbox_slot_(device_mailbox_slot) { + + if (std::getenv("SKIP_TRT")) { + input_size_ = 1600 * sizeof(float); + output_size_ = 1600 * sizeof(float); + allocate_resources(); + } else { + std::string ext = model_path.substr(model_path.find_last_of('.')); + if (ext == ".onnx") { + build_engine_from_onnx(model_path, engine_save_path); + } else { + load_engine(model_path); + } + setup_bindings(); + allocate_resources(); + } +} + 
+AIDecoderService::~AIDecoderService() { + if (graph_exec_) + cudaGraphExecDestroy(graph_exec_); + if (d_trt_input_) + cudaFree(d_trt_input_); + if (d_trt_output_) + cudaFree(d_trt_output_); + for (auto *buf : d_aux_buffers_) + cudaFree(buf); +} + +void AIDecoderService::load_engine(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.good()) + throw std::runtime_error("Error opening engine file: " + path); + + file.seekg(0, file.end); + size_t size = file.tellg(); + file.seekg(0, file.beg); + + std::vector engine_data(size); + file.read(engine_data.data(), size); + + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + engine_.reset(runtime_->deserializeCudaEngine(engine_data.data(), size)); + context_.reset(engine_->createExecutionContext()); +} + +void AIDecoderService::build_engine_from_onnx( + const std::string &onnx_path, const std::string &engine_save_path) { + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + + auto builder = std::unique_ptr( + nvinfer1::createInferBuilder(gLogger)); + auto network = std::unique_ptr( + builder->createNetworkV2(0)); + auto config = + std::unique_ptr(builder->createBuilderConfig()); + + // Enable FP16 optimization for Grace Blackwell / Hopper + if (builder->platformHasFastFp16()) { + config->setFlag(nvinfer1::BuilderFlag::kFP16); + std::printf("[TensorRT] FP16 precision enabled.\n"); + } else { + std::printf("[TensorRT] Warning: Platform does not support fast FP16. 
" + "Using FP32.\n"); + } + + auto parser = std::unique_ptr( + nvonnxparser::createParser(*network, gLogger)); + + if (!parser->parseFromFile( + onnx_path.c_str(), + static_cast(nvinfer1::ILogger::Severity::kWARNING))) { + throw std::runtime_error("Failed to parse ONNX file: " + onnx_path); + } + + bool has_dynamic = false; + for (int i = 0; i < network->getNbInputs(); ++i) { + auto *input = network->getInput(i); + auto dims = input->getDimensions(); + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] <= 0) { + has_dynamic = true; + break; + } + } + if (has_dynamic) + break; + } + + if (has_dynamic) { + auto *profile = builder->createOptimizationProfile(); + for (int i = 0; i < network->getNbInputs(); ++i) { + auto *input = network->getInput(i); + auto dims = input->getDimensions(); + nvinfer1::Dims fixed = dims; + for (int d = 0; d < fixed.nbDims; ++d) { + if (fixed.d[d] <= 0) + fixed.d[d] = 1; + } + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kMIN, fixed); + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kOPT, fixed); + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kMAX, fixed); + std::printf("[TensorRT] Set dynamic input \"%s\" to batch=1\n", + input->getName()); + } + config->addOptimizationProfile(profile); + } + + auto plan = std::unique_ptr( + builder->buildSerializedNetwork(*network, *config)); + if (!plan) + throw std::runtime_error("Failed to build TRT engine from ONNX"); + + if (!engine_save_path.empty()) { + std::ofstream out(engine_save_path, std::ios::binary); + if (out.good()) { + out.write(static_cast(plan->data()), plan->size()); + std::printf("[TensorRT] Saved engine to: %s\n", engine_save_path.c_str()); + } else { + std::fprintf(stderr, "[TensorRT] Warning: could not save engine to %s\n", + engine_save_path.c_str()); + } + } + + engine_.reset(runtime_->deserializeCudaEngine(plan->data(), plan->size())); + if (!engine_) + throw std::runtime_error("Failed to 
deserialize built engine"); + + context_.reset(engine_->createExecutionContext()); + + std::printf("[TensorRT] Built engine from ONNX: %s\n", onnx_path.c_str()); +} + +void AIDecoderService::setup_bindings() { + int num_io = engine_->getNbIOTensors(); + bool found_input = false; + bool found_output = false; + + for (int i = 0; i < num_io; ++i) { + const char *name = engine_->getIOTensorName(i); + auto mode = engine_->getTensorIOMode(name); + auto dims = engine_->getTensorShape(name); + auto dtype = engine_->getTensorDataType(name); + size_t size_bytes = tensor_volume(dims) * trt_dtype_size(dtype); + + bool is_input = (mode == nvinfer1::TensorIOMode::kINPUT); + + std::printf("[TensorRT] Binding %d: \"%s\" %s, dtype=%d, elem_size=%zu, " + "volume=%zu, %zu bytes\n", + i, name, is_input ? "INPUT" : "OUTPUT", + static_cast(dtype), trt_dtype_size(dtype), + tensor_volume(dims), size_bytes); + + TensorBinding binding{name, nullptr, size_bytes, is_input}; + + if (is_input && !found_input) { + input_size_ = size_bytes; + found_input = true; + } else if (!is_input && !found_output) { + output_size_ = size_bytes; + found_output = true; + } + + all_bindings_.push_back(std::move(binding)); + } +} + +void AIDecoderService::allocate_resources() { + if (all_bindings_.empty()) { + // SKIP_TRT fallback path + if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Input"); + if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Output"); + return; + } + + bool assigned_input = false; + bool assigned_output = false; + + for (auto &b : all_bindings_) { + void *buf = nullptr; + if (cudaMalloc(&buf, b.size_bytes) != cudaSuccess) + throw std::runtime_error("Failed to allocate buffer for " + b.name); + cudaMemset(buf, 0, b.size_bytes); + b.d_buffer = buf; + + if (b.is_input && !assigned_input) { + d_trt_input_ = buf; + assigned_input = true; + } else if (!b.is_input && 
!assigned_output) { + d_trt_output_ = buf; + assigned_output = true; + } else { + d_aux_buffers_.push_back(buf); + } + } +} + +void AIDecoderService::capture_graph(cudaStream_t stream) { + for (auto &b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); + } + + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 warmup failed in AIDecoderService"); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); + + cudaGraph_t graph; + DECODER_CUDA_CHECK( + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + + gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, + d_trt_input_, input_size_); + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 failed during graph capture in AIDecoderService"); + gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, + d_trt_output_, output_size_); + + DECODER_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string( + "cudaGraphInstantiateWithFlags failed in AIDecoderService: ") + + cudaGetErrorString(inst_err)); + } + + DECODER_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + cudaGraphDestroy(graph); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu new file mode 100644 index 00000000..c1cf3b8b --- /dev/null +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -0,0 +1,186 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/realtime/nvtx_helpers.h" +#include +#include +#include +#include + +#define SERVICE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error( \ + std::string("CUDA Error in AIPreDecoderService: ") + \ + cudaGetErrorString(err)); \ + } \ + } while (0) + +namespace cudaq::qec { + +// System scope for NVLink/PCIe visibility to host (design: no +// __threadfence_system) +using atomic_int_sys = cuda::atomic; + +// ============================================================================= +// Kernels (single slot 0 only; queue removed for host-side dynamic pool) +// ============================================================================= + +__global__ void predecoder_signal_ready_kernel(atomic_int_sys *d_ready_flags) { + if (threadIdx.x == 0) + d_ready_flags[0].store(1, cuda::std::memory_order_release); +} + +__global__ void passthrough_copy_kernel(void *dst, const void *src, + size_t num_bytes) { + const uint4 *src4 = (const uint4 *)src; + uint4 *dst4 = (uint4 *)dst; + size_t n4 = num_bytes / sizeof(uint4); + for (size_t i = threadIdx.x; i < n4; i += blockDim.x) + dst4[i] = src4[i]; + + size_t done = n4 * sizeof(uint4); + for (size_t i = done + threadIdx.x; i < num_bytes; i += blockDim.x) + ((char *)dst)[i] = ((const char *)src)[i]; +} + +// ============================================================================= +// Class Implementation +// ============================================================================= + +AIPreDecoderService::AIPreDecoderService( + const std::string &path, void **mailbox, + int /* queue_depth (ignored; always 1) */, + const std::string 
&engine_save_path) + : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(1) { + void *buf = nullptr; + + SERVICE_CUDA_CHECK( + cudaHostAlloc(&buf, sizeof(atomic_int_sys), cudaHostAllocMapped)); + h_ready_flags_ = static_cast(buf); + new (h_ready_flags_) atomic_int_sys(0); + + SERVICE_CUDA_CHECK( + cudaHostAlloc(&h_ring_ptrs_, sizeof(void *), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_predecoder_outputs_, get_output_size(), + cudaHostAllocMapped)); + + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_ready_flags_, + (void *)h_ready_flags_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_ring_ptrs_, + (void *)h_ring_ptrs_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer( + (void **)&d_predecoder_outputs_, (void *)h_predecoder_outputs_, 0)); +} + +AIPreDecoderService::~AIPreDecoderService() { + if (h_ready_flags_) { + h_ready_flags_[0].~atomic_int_sys(); + cudaFreeHost((void *)h_ready_flags_); + h_ready_flags_ = nullptr; + d_ready_flags_ = nullptr; + } + if (h_ring_ptrs_) { + cudaFreeHost(h_ring_ptrs_); + h_ring_ptrs_ = nullptr; + } + if (h_predecoder_outputs_) { + cudaFreeHost(h_predecoder_outputs_); + h_predecoder_outputs_ = nullptr; + } +} + +void AIPreDecoderService::capture_graph(cudaStream_t stream, + bool device_launch) { + bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); + + if (!skip_trt) { + for (auto &b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); + } + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 warmup failed in AIPreDecoderService"); + } + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + cudaGraph_t graph; + SERVICE_CUDA_CHECK( + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + + if (skip_trt) { + passthrough_copy_kernel<<<1, 256, 0, stream>>>(d_trt_output_, d_trt_input_, + get_input_size()); + } else { + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 failed during graph 
capture in AIPreDecoderService"); + } + + SERVICE_CUDA_CHECK(cudaMemcpyAsync(d_predecoder_outputs_, d_trt_output_, + get_output_size(), + cudaMemcpyDeviceToDevice, stream)); + + predecoder_signal_ready_kernel<<<1, 1, 0, stream>>>( + static_cast(d_ready_flags_)); + + SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + if (device_launch) { + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiateWithFlags (DeviceLaunch) FAILED: ") + + cudaGetErrorString(inst_err)); + } + SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + } else { + cudaError_t inst_err = cudaGraphInstantiate(&graph_exec_, graph, 0); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error(std::string("cudaGraphInstantiate FAILED: ") + + cudaGetErrorString(inst_err)); + } + } + + cudaGraphDestroy(graph); + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool AIPreDecoderService::poll_next_job(PreDecoderJob &out_job) { + auto *sys_flags = static_cast(h_ready_flags_); + int expected = 1; + // Atomically claim: 1 (Ready) -> 2 (Processing) so we enqueue the job exactly + // once. Use relaxed on failure so spinning doesn't add barriers that delay + // seeing GPU's store(1). 
+ if (sys_flags[0].compare_exchange_strong(expected, 2, + cuda::std::memory_order_acquire, + cuda::std::memory_order_relaxed)) { + NVTX_PUSH("PollJob"); + out_job.slot_idx = 0; + out_job.ring_buffer_ptr = h_ring_ptrs_[0]; + out_job.inference_data = h_predecoder_outputs_; + NVTX_POP(); + return true; + } + return false; +} + +void AIPreDecoderService::release_job(int /* slot_idx */) { + NVTX_PUSH("ReleaseJob"); + auto *sys_flags = static_cast(h_ready_flags_); + // PyMatching done: 2 (Processing) -> 0 (Idle) + sys_flags[0].store(0, cuda::std::memory_order_release); + NVTX_POP(); +} + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu new file mode 100644 index 00000000..80339bc0 --- /dev/null +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -0,0 +1,678 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. 
+ ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" +#include "cudaq/qec/realtime/pipeline.h" +#include "cudaq/qec/realtime/nvtx_helpers.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudaq::realtime { + +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +#define PIPELINE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "RealtimePipeline CUDA error: " << cudaGetErrorString(err) \ + << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::abort(); \ + } \ + } while (0) + +static void pin_thread(std::thread &t, int core) { + if (core < 0) + return; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); +} + +// --------------------------------------------------------------------------- +// GPU-only mode: completion signaling via cudaLaunchHostFunc +// --------------------------------------------------------------------------- + +struct GpuOnlyWorkerCtx { + atomic_uint64_sys *tx_flags; + atomic_uint64_sys *idle_mask; + int *inflight_slot_tags; + uint8_t *rx_data_host; + size_t slot_size; + int worker_id; + void (*user_post_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream); + void *user_post_launch_data; + int origin_slot; + uint64_t tx_value; +}; + +static void gpu_only_host_callback(void *user_data) { + auto *ctx = static_cast(user_data); + ctx->tx_flags[ctx->origin_slot].store(ctx->tx_value, + 
cuda::std::memory_order_release); + ctx->idle_mask->fetch_or(1ULL << ctx->worker_id, + cuda::std::memory_order_release); +} + +static void gpu_only_post_launch(void *user_data, void *slot_dev, + cudaStream_t stream) { + NVTX_PUSH("GPUPostLaunch"); + auto *ctx = static_cast(user_data); + + if (ctx->user_post_launch_fn) + ctx->user_post_launch_fn(ctx->user_post_launch_data, slot_dev, stream); + + ctx->origin_slot = ctx->inflight_slot_tags[ctx->worker_id]; + uint8_t *slot_host = ctx->rx_data_host + + static_cast(ctx->origin_slot) * ctx->slot_size; + ctx->tx_value = reinterpret_cast(slot_host); + + cudaLaunchHostFunc(stream, gpu_only_host_callback, ctx); + NVTX_POP(); +} + +// --------------------------------------------------------------------------- +// RingBufferManager +// --------------------------------------------------------------------------- + +class RingBufferManager { +public: + RingBufferManager(size_t num_slots, size_t slot_size) + : num_slots_(num_slots), slot_size_(slot_size) { + PIPELINE_CUDA_CHECK(cudaHostAlloc( + &buf_rx_, num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + rx_flags_ = static_cast(buf_rx_); + for (size_t i = 0; i < num_slots; ++i) + new (rx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostAlloc( + &buf_tx_, num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + tx_flags_ = static_cast(buf_tx_); + for (size_t i = 0; i < num_slots; ++i) + new (tx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_flags_dev_), buf_rx_, 0)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&tx_flags_dev_), buf_tx_, 0)); + + PIPELINE_CUDA_CHECK(cudaHostAlloc(reinterpret_cast(&rx_data_host_), + num_slots * slot_size, + cudaHostAllocMapped)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_data_dev_), rx_data_host_, 0)); + + rb_.rx_flags = reinterpret_cast(rx_flags_); + rb_.tx_flags = reinterpret_cast(tx_flags_); + 
rb_.rx_data = rx_data_dev_; + rb_.tx_data = rx_data_dev_; + rb_.rx_stride_sz = slot_size; + rb_.tx_stride_sz = slot_size; + rb_.rx_flags_host = reinterpret_cast(rx_flags_); + rb_.tx_flags_host = reinterpret_cast(tx_flags_); + rb_.rx_data_host = rx_data_host_; + rb_.tx_data_host = rx_data_host_; + } + + ~RingBufferManager() { + for (size_t i = 0; i < num_slots_; ++i) { + rx_flags_[i].~atomic_uint64_sys(); + tx_flags_[i].~atomic_uint64_sys(); + } + cudaFreeHost(buf_rx_); + cudaFreeHost(buf_tx_); + cudaFreeHost(rx_data_host_); + } + + bool slot_available(uint32_t slot) const { + return cudaq_host_ringbuffer_slot_available(&rb_, slot) != 0; + } + + void write_and_signal(uint32_t slot, uint32_t function_id, + const void *payload, uint32_t payload_len, + uint32_t request_id = 0, + uint64_t ptp_timestamp = 0) { + cudaq_host_ringbuffer_write_rpc_request(&rb_, slot, function_id, payload, + payload_len, request_id, + ptp_timestamp); + cudaq_host_ringbuffer_signal_slot(&rb_, slot); + } + + cudaq_tx_status_t poll_tx(uint32_t slot, int *cuda_error) const { + return cudaq_host_ringbuffer_poll_tx_flag(&rb_, slot, cuda_error); + } + + void clear_slot(uint32_t slot) { + cudaq_host_ringbuffer_clear_slot(&rb_, slot); + } + + size_t num_slots() const { return num_slots_; } + size_t slot_size() const { return slot_size_; } + + atomic_uint64_sys *rx_flags() { return rx_flags_; } + atomic_uint64_sys *tx_flags() { return tx_flags_; } + uint8_t *rx_data_host() { return rx_data_host_; } + uint8_t *rx_data_dev() { return rx_data_dev_; } + const cudaq_ringbuffer_t &ringbuffer() const { return rb_; } + +private: + size_t num_slots_; + size_t slot_size_; + void *buf_rx_ = nullptr; + void *buf_tx_ = nullptr; + atomic_uint64_sys *rx_flags_ = nullptr; + atomic_uint64_sys *tx_flags_ = nullptr; + uint64_t *rx_flags_dev_ = nullptr; + uint64_t *tx_flags_dev_ = nullptr; + uint8_t *rx_data_host_ = nullptr; + uint8_t *rx_data_dev_ = nullptr; + cudaq_ringbuffer_t rb_{}; +}; + +// 
// ---------------------------------------------------------------------------
// Impl
// ---------------------------------------------------------------------------

// Pimpl for RealtimePipeline. Owns the ring buffer, the host-side dispatcher
// configuration, the dynamic-worker-pool atomics (idle_mask, shutdown_flag,
// live_dispatched), and all threads (dispatcher, consumer, per-worker CPU
// stage threads). All shared host/device state uses libcu++ system-scope
// atomics per the design doc; std::atomic is used only for host-host
// coordination.
// NOTE(review): template arguments appear stripped in this extraction
// (e.g. `std::unique_ptr ring;`) — element types below must be confirmed
// against the original source.
struct RealtimePipeline::Impl {
  PipelineStageConfig config;

  GpuStageFactory gpu_factory;
  CpuStageCallback cpu_stage;
  CompletionCallback completion_handler;

  // Owned infrastructure
  std::unique_ptr ring;
  void **h_mailbox_bank = nullptr; // mapped pinned host array, one slot per worker
  void **d_mailbox_bank = nullptr; // device alias of h_mailbox_bank

  // Dispatcher state (hidden atomics)
  atomic_int_sys shutdown_flag{0};
  uint64_t dispatcher_stats = 0;
  atomic_uint64_sys live_dispatched{0};
  // Bit i set => worker i is free to accept a new job (max 64 workers).
  atomic_uint64_sys idle_mask{0};
  std::vector inflight_slot_tags;

  // Function table
  std::vector function_table;

  // Per-worker GPU resources (from factory)
  std::vector worker_resources;

  // GPU-only mode state
  bool gpu_only = false;
  std::vector gpu_only_ctxs;

  // Slot-to-request mapping (consumer-owned)
  std::vector slot_request;
  std::vector slot_occupied;

  // Stats (atomic counters)
  std::atomic total_submitted{0};
  std::atomic total_completed{0};
  std::atomic backpressure_stalls{0};

  // Thread coordination
  std::atomic producer_stop{false};
  std::atomic consumer_stop{false};

  // Threads
  std::thread dispatcher_thread;
  std::thread consumer_thread;
  std::vector worker_threads;

  std::atomic started{false};

  // -----------------------------------------------------------------------
  // Lifecycle
  // -----------------------------------------------------------------------

  // Validate the config and allocate shared infrastructure: the ring buffer,
  // the mapped-pinned mailbox bank (one void* per worker), and the
  // slot-tracking vectors. Throws std::invalid_argument if num_workers
  // exceeds the 64-bit idle_mask capacity.
  void allocate(const PipelineStageConfig &cfg) {
    if (cfg.num_workers > 64) {
      throw std::invalid_argument("num_workers (" +
                                  std::to_string(cfg.num_workers) +
                                  ") exceeds idle_mask capacity of 64");
    }

    config = cfg;

    ring = std::make_unique(
        static_cast(cfg.num_slots), cfg.slot_size);

    // Mapped pinned allocation so the GPU can read the mailbox bank directly.
    PIPELINE_CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank,
                                      cfg.num_workers * sizeof(void *),
                                      cudaHostAllocMapped));
    std::memset(h_mailbox_bank, 0,
                cfg.num_workers * sizeof(void *));
    PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer(
        reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0));

    inflight_slot_tags.resize(cfg.num_workers, 0);
    slot_request.resize(cfg.num_slots, 0);
    slot_occupied.resize(cfg.num_slots, 0);
  }

  // Build per-worker GPU resources via the user factory, populate the RPC
  // function table and dispatcher config, then spawn (in order) the
  // dispatcher thread, the per-worker CPU-stage threads (unless GPU-only),
  // and the consumer thread, pinning each to its configured core.
  void start_threads() {
    if (!gpu_factory) {
      throw std::logic_error("gpu_factory must be set before calling start()");
    }

    const int nw = config.num_workers;
    // GPU-only mode: no CPU stage registered; completion is signaled from a
    // host-func callback chained onto the worker stream instead.
    gpu_only = !cpu_stage;

    // Build GPU resources via user factory
    worker_resources.resize(nw);
    function_table.resize(nw);
    for (int i = 0; i < nw; ++i) {
      worker_resources[i] = gpu_factory(i);
      function_table[i].function_id = worker_resources[i].function_id;
      function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;
      function_table[i].handler.graph_exec = worker_resources[i].graph_exec;
      std::memset(&function_table[i].schema, 0,
                  sizeof(function_table[i].schema));
    }

    // In GPU-only mode, set up per-worker contexts for cudaLaunchHostFunc
    // completion signaling (chains user's post_launch_fn if provided).
    if (gpu_only) {
      gpu_only_ctxs.resize(nw);
      for (int i = 0; i < nw; ++i) {
        auto &c = gpu_only_ctxs[i];
        c.tx_flags = ring->tx_flags();
        c.idle_mask = &idle_mask;
        c.inflight_slot_tags = inflight_slot_tags.data();
        c.rx_data_host = ring->rx_data_host();
        c.slot_size = config.slot_size;
        c.worker_id = i;
        c.user_post_launch_fn = worker_resources[i].post_launch_fn;
        c.user_post_launch_data = worker_resources[i].post_launch_data;
        c.origin_slot = 0;
        c.tx_value = 0;
      }
    }

    // Initialize idle_mask with all workers free.
    // (nw >= 64) guard avoids UB from shifting 1ULL by 64.
    uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1);
    idle_mask.store(initial_idle, cuda::std::memory_order_release);

    // Build cudaq_host_dispatcher_config_t
    std::vector disp_workers(nw);
    for (int i = 0; i < nw; ++i) {
      disp_workers[i].graph_exec = worker_resources[i].graph_exec;
      disp_workers[i].stream = worker_resources[i].stream;
      disp_workers[i].function_id = worker_resources[i].function_id;
      disp_workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn;
      disp_workers[i].pre_launch_data = worker_resources[i].pre_launch_data;

      if (gpu_only) {
        // Our completion-signaling hook wraps (and chains) the user's.
        disp_workers[i].post_launch_fn = gpu_only_post_launch;
        disp_workers[i].post_launch_data = &gpu_only_ctxs[i];
      } else {
        disp_workers[i].post_launch_fn = worker_resources[i].post_launch_fn;
        disp_workers[i].post_launch_data =
            worker_resources[i].post_launch_data;
      }
    }

    cudaq_host_dispatcher_config_t disp_cfg;
    std::memset(&disp_cfg, 0, sizeof(disp_cfg));
    disp_cfg.rx_flags = static_cast(ring->rx_flags());
    disp_cfg.tx_flags = static_cast(ring->tx_flags());
    disp_cfg.rx_data_host = ring->rx_data_host();
    disp_cfg.rx_data_dev = ring->rx_data_dev();
    disp_cfg.tx_data_host = nullptr;
    disp_cfg.tx_data_dev = nullptr;
    disp_cfg.tx_stride_sz = config.slot_size;
    disp_cfg.h_mailbox_bank = h_mailbox_bank;
    disp_cfg.num_slots = static_cast(config.num_slots);
    disp_cfg.slot_size = config.slot_size;
    disp_cfg.workers = disp_workers.data();
    disp_cfg.num_workers = static_cast(nw);
    disp_cfg.function_table = function_table.data();
    disp_cfg.function_table_count = static_cast(nw);
    disp_cfg.shutdown_flag = static_cast(&shutdown_flag);
    disp_cfg.stats_counter = &dispatcher_stats;
    disp_cfg.live_dispatched = static_cast(&live_dispatched);
    disp_cfg.idle_mask = static_cast(&idle_mask);
    disp_cfg.inflight_slot_tags = inflight_slot_tags.data();

    // --- Dispatcher thread ---
    // Copy workers vector into the lambda so it outlives this scope.
    dispatcher_thread = std::thread(
        [cfg = disp_cfg, workers = std::move(disp_workers)]() mutable {
          // Re-point cfg.workers at the moved-into-closure vector: the
          // data() captured above referred to the (now moved-from) local.
          cfg.workers = workers.data();
          cudaq_host_dispatcher_loop(&cfg);
        });
    pin_thread(dispatcher_thread, config.cores.dispatcher);

    // --- Worker threads (skipped in GPU-only mode) ---
    if (!gpu_only) {
      worker_threads.resize(nw);
      for (int i = 0; i < nw; ++i) {
        worker_threads[i] = std::thread([this, i]() { worker_loop(i); });
        // worker_base < 0 disables pinning for worker threads.
        int core =
            (config.cores.worker_base >= 0) ? config.cores.worker_base + i : -1;
        pin_thread(worker_threads[i], core);
      }
    }

    // --- Consumer thread ---
    consumer_thread = std::thread([this]() { consumer_loop(); });
    pin_thread(consumer_thread, config.cores.consumer);

    started = true;
  }

  // Orderly shutdown: stop the producer side, wait up to 5 s for in-flight
  // requests to drain, then stop the consumer, the dispatcher, and finally
  // join the worker threads. Idempotent via the `started` flag.
  void stop_all() {
    if (!started)
      return;

    // Signal consumer to finish pending work
    producer_stop.store(true, std::memory_order_release);

    // Grace period for in-flight requests
    auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5);
    while (total_completed.load(std::memory_order_relaxed) <
               total_submitted.load(std::memory_order_relaxed) &&
           std::chrono::steady_clock::now() < deadline) {
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    consumer_stop.store(true, std::memory_order_release);

    // Shut down dispatcher
    shutdown_flag.store(1, cuda::std::memory_order_release);
    dispatcher_thread.join();

    // Consumer
    consumer_thread.join();

    // Workers check shutdown via consumer_stop (they spin on ready_flags,
    // which will never fire after dispatcher is gone, so we need to break
    // them out). We set consumer_stop which doubles as system_stop for
    // workers; the user's poll_next_job must eventually return false.
    for (auto &t : worker_threads) {
      if (t.joinable())
        t.join();
    }

    started = false;
  }

  // Release owned resources. Safe to call after stop_all(); the mailbox
  // pointer is nulled so a second call is a no-op.
  void free_resources() {
    ring.reset();
    if (h_mailbox_bank) {
      cudaFreeHost(h_mailbox_bank);
      h_mailbox_bank = nullptr;
    }
  }

  // -----------------------------------------------------------------------
  // Worker loop (one per worker thread)
  // -----------------------------------------------------------------------

  // Spin-polls the user's CPU stage for worker `worker_id` until shutdown.
  // On a normal completion it signals the origin slot's tx flag and returns
  // the worker to the idle pool; on DEFERRED_COMPLETION it frees the worker
  // but leaves the slot open (completed later via complete_deferred()).
  void worker_loop(int worker_id) {
    auto *wr = &worker_resources[worker_id];

    // The cpu_stage callback is called in "poll mode"
    // (gpu_output == nullptr). It polls its own GPU-ready
    // mechanism and, if a result is available, processes it and
    // writes the RPC response. Returns 0 when nothing was ready,
    // >0 when a job was completed. The pipeline then handles all
    // atomic signaling (tx_flags, idle_mask).

    while (!consumer_stop.load(std::memory_order_relaxed)) {
      CpuStageContext ctx;
      ctx.worker_id = worker_id;
      // NOTE(review): inflight_slot_tags is written by the dispatcher and
      // read here; element type is stripped in this extraction — confirm it
      // is an atomic (or otherwise synchronized) type.
      ctx.origin_slot = inflight_slot_tags[worker_id];
      ctx.gpu_output = nullptr;
      ctx.gpu_output_size = 0;
      ctx.response_buffer = nullptr;
      ctx.max_response_size = 0;
      ctx.user_context = wr->user_context;

      NVTX_PUSH("WorkerPoll");
      size_t written = cpu_stage(ctx);
      NVTX_POP();
      if (written == 0) {
        // Nothing ready on this worker; back off politely and re-poll.
        QEC_CPU_RELAX();
        continue;
      }

      if (written == DEFERRED_COMPLETION) {
        // Free the worker for re-dispatch but do NOT signal the slot —
        // a downstream stage (e.g. PyMatching pool) completes it later.
        idle_mask.fetch_or(1ULL << worker_id, cuda::std::memory_order_release);
        continue;
      }

      int origin_slot = inflight_slot_tags[worker_id];

      // Completion token: the host address of the slot's rx payload.
      uint8_t *slot_host = ring->rx_data_host() +
                           static_cast(origin_slot) * config.slot_size;
      uint64_t rx_value = reinterpret_cast(slot_host);

      ring->tx_flags()[origin_slot].store(rx_value,
                                          cuda::std::memory_order_release);

      // Return this worker to the idle pool only after tx is signaled.
      idle_mask.fetch_or(1ULL << worker_id, cuda::std::memory_order_release);
    }
  }

  // -----------------------------------------------------------------------
  // Consumer loop
  // -----------------------------------------------------------------------

  // Scans occupied slots for tx completion (or error), invokes the user's
  // completion handler, and recycles the slot. Exits when consumer_stop is
  // set, or when the producer has stopped and all submissions completed.
  void consumer_loop() {
    const uint32_t ns = static_cast(config.num_slots);

    while (true) {
      if (consumer_stop.load(std::memory_order_acquire))
        break;

      bool pdone = producer_stop.load(std::memory_order_acquire);
      uint64_t nsub = total_submitted.load(std::memory_order_acquire);
      uint64_t ncomp = total_completed.load(std::memory_order_relaxed);

      if (pdone && ncomp >= nsub)
        break;

      bool found_any = false;
      for (uint32_t s = 0; s < ns; ++s) {
        if (!slot_occupied[s])
          continue;

        int cuda_error = 0;
        cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error);

        if (status == CUDAQ_TX_READY) {
          NVTX_PUSH("ConsumerComplete");
          if (completion_handler) {
            Completion c;
            c.request_id = slot_request[s];
            c.slot = static_cast(s);
            c.success = true;
            c.cuda_error = 0;
            completion_handler(c);
          }
          total_completed.fetch_add(1, std::memory_order_relaxed);

          // ARM memory ordering: clear occupancy BEFORE
          // clearing ring buffer flags, with a fence between.
          // NOTE(review): __sync_synchronize() is an explicit full fence;
          // the design doc directs that explicit fences not be used on
          // Grace (system-scope atomics only) — confirm this is intentional.
          slot_occupied[s] = 0;
          __sync_synchronize();
          ring->clear_slot(s);
          found_any = true;
          NVTX_POP();

        } else if (status == CUDAQ_TX_ERROR) {
          // Error path mirrors the success path but reports cuda_error.
          if (completion_handler) {
            Completion c;
            c.request_id = slot_request[s];
            c.slot = static_cast(s);
            c.success = false;
            c.cuda_error = cuda_error;
            completion_handler(c);
          }
          total_completed.fetch_add(1, std::memory_order_relaxed);
          slot_occupied[s] = 0;
          __sync_synchronize();
          ring->clear_slot(s);
          found_any = true;
        }
      }

      if (!found_any)
        QEC_CPU_RELAX();
    }
  }
};

// ---------------------------------------------------------------------------
// RealtimePipeline public API
// ---------------------------------------------------------------------------

RealtimePipeline::RealtimePipeline(const PipelineStageConfig &config)
    : impl_(std::make_unique()) {
  impl_->allocate(config);
}

// Destructor stops all threads (if running) and frees resources.
RealtimePipeline::~RealtimePipeline() {
  if (impl_->started)
    impl_->stop_all();
  impl_->free_resources();
}

void RealtimePipeline::set_gpu_stage(GpuStageFactory factory) {
  impl_->gpu_factory = std::move(factory);
}

void RealtimePipeline::set_cpu_stage(CpuStageCallback callback) {
  impl_->cpu_stage = std::move(callback);
}

void RealtimePipeline::set_completion_handler(CompletionCallback handler) {
  impl_->completion_handler = std::move(handler);
}

// Idempotent start: spawns dispatcher/worker/consumer threads once.
void RealtimePipeline::start() {
  if (impl_->started)
    return;
  impl_->start_threads();
}

void RealtimePipeline::stop() { impl_->stop_all(); }

// Snapshot of pipeline counters. Relaxed loads: values may be slightly stale
// relative to each other; intended for monitoring, not synchronization.
RealtimePipeline::Stats RealtimePipeline::stats() const {
  return {impl_->total_submitted.load(std::memory_order_relaxed),
          impl_->total_completed.load(std::memory_order_relaxed),
          impl_->live_dispatched.load(cuda::std::memory_order_relaxed),
          impl_->backpressure_stalls.load(std::memory_order_relaxed)};
}

// Completes a slot whose worker returned DEFERRED_COMPLETION: signals the
// slot's tx flag with the same rx-address token worker_loop would have used.
// Called by the application's downstream stage (e.g. a PyMatching pool).
void RealtimePipeline::complete_deferred(int slot) {
  uint8_t *slot_host = impl_->ring->rx_data_host() +
                       static_cast(slot) * impl_->config.slot_size;
  uint64_t rx_value = reinterpret_cast(slot_host);
  impl_->ring->tx_flags()[slot].store(rx_value,
                                      cuda::std::memory_order_release);
}

// ---------------------------------------------------------------------------
// RingBufferInjector
// ---------------------------------------------------------------------------

// Non-owning view into the pipeline's submission-side state; all pointers
// reference Impl members and must not outlive the pipeline.
struct RingBufferInjector::State {
  RingBufferManager *ring = nullptr;
  std::vector *slot_request = nullptr;
  std::vector *slot_occupied = nullptr;
  std::atomic *total_submitted = nullptr;
  std::atomic *backpressure_stalls = nullptr;
  std::atomic *producer_stop = nullptr;
  int num_slots = 0;
  std::atomic next_slot{0};
};

RingBufferInjector RealtimePipeline::create_injector() {
  auto s = std::make_unique();
  s->ring = impl_->ring.get();
  s->slot_request = &impl_->slot_request;
  s->slot_occupied = &impl_->slot_occupied;
  s->total_submitted = &impl_->total_submitted;
  s->backpressure_stalls = &impl_->backpressure_stalls;
  s->producer_stop = &impl_->producer_stop;
  s->num_slots = impl_->config.num_slots;
  return RingBufferInjector(std::move(s));
}

RingBufferInjector::RingBufferInjector(std::unique_ptr s)
    : state_(std::move(s)) {}

RingBufferInjector::~RingBufferInjector() = default;
RingBufferInjector::RingBufferInjector(RingBufferInjector &&) noexcept =
    default;
RingBufferInjector &
RingBufferInjector::operator=(RingBufferInjector &&) noexcept = default;

// Attempts one submission into the next ring slot. Returns false (caller
// retries) when the slot is busy or the slot-counter CAS loses a race
// (compare_exchange_weak may also fail spuriously; submit() retries).
bool RingBufferInjector::try_submit(uint32_t function_id, const void *payload,
                                    size_t payload_size, uint64_t request_id) {
  uint32_t cur = state_->next_slot.load(std::memory_order_relaxed);
  uint32_t slot = cur % static_cast(state_->num_slots);
  if (!state_->ring->slot_available(slot))
    return false;

  if (!state_->next_slot.compare_exchange_weak(
          cur, cur + 1, std::memory_order_acq_rel, std::memory_order_relaxed))
    return false;

  NVTX_PUSH("Submit");
  // NOTE(review): write_and_signal publishes the slot to the dispatcher
  // BEFORE slot_request/slot_occupied are written below. If the round trip
  // completes before those stores become visible to the consumer, it could
  // read a stale request_id or skip the slot — confirm the ordering is safe.
  state_->ring->write_and_signal(slot, function_id, payload,
                                 static_cast(payload_size),
                                 static_cast(request_id));

  (*state_->slot_request)[slot] = request_id;
  (*state_->slot_occupied)[slot] = 1;
  state_->total_submitted->fetch_add(1, std::memory_order_release);
  NVTX_POP();
  return true;
}

// Blocking submit: spins on try_submit, counting backpressure stalls, and
// bails out early if the producer has been asked to stop.
void RingBufferInjector::submit(uint32_t function_id, const void *payload,
                                size_t payload_size, uint64_t request_id) {
  while (!try_submit(function_id, payload, payload_size, request_id)) {
    if (state_->producer_stop &&
        state_->producer_stop->load(std::memory_order_acquire))
      return;
    state_->backpressure_stalls->fetch_add(1, std::memory_order_relaxed);
    QEC_CPU_RELAX();
  }
}

uint64_t RingBufferInjector::backpressure_stalls() const {
  return state_->backpressure_stalls->load(std::memory_order_relaxed);
}

} // namespace cudaq::realtime
diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp
new file mode 100644
index 00000000..63bfe668
--- /dev/null
+++
b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp
@@ -0,0 +1,1155 @@
/****************************************************************-*- C++ -*-****
 * Copyright (c) 2026 NVIDIA Corporation & Affiliates.                         *
 * All rights reserved.                                                        *
 *                                                                             *
 * This source code and the accompanying materials are made available under   *
 * the terms of the Apache License 2.0 which accompanies this distribution.   *
 ******************************************************************************/

/*******************************************************************************
 * Hybrid Realtime Pipeline Benchmark with AI Pre-Decoder + PyMatching
 *
 * Uses the RealtimePipeline scaffolding to hide all ring buffer, atomics,
 * and thread management. Application code only provides:
 *   1. GPU stage factory (AIPreDecoderService instances)
 *   2. CPU stage callback (PyMatching decode)
 *   3. Completion callback (timestamp recording)
 *
 * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d13_r104|d21|d31]
 *        [rate_us] [duration_s]
 ******************************************************************************/

// NOTE(review): the angle-bracket header names were stripped by extraction
// (`#include <...>` lost its contents) — restore from the original source.
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 

#ifndef CUDA_VERSION
#define CUDA_VERSION 13000
#endif

#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h"
#include "cudaq/qec/realtime/pipeline.h"

#include "cudaq/qec/code.h"
#include "cudaq/qec/decoder.h"
#include "cudaq/qec/realtime/ai_decoder_service.h"
#include "cudaq/qec/realtime/ai_predecoder_service.h"
#include "cudaq/qec/realtime/nvtx_helpers.h"

using namespace cudaq::qec;
namespace realtime_ns = cudaq::realtime;

// Portable CPU Yield: pause/yield hint for spin-wait loops, no-op fallback.
#ifndef QEC_CPU_RELAX
#if defined(__x86_64__)
#include 
#define QEC_CPU_RELAX() _mm_pause()
#elif defined(__aarch64__)
#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory")
#else
#define QEC_CPU_RELAX()                                                        \
  do {                                                                         \
  } while (0)
#endif
#endif

// Abort-on-error CUDA call wrapper (benchmark-grade: prints and exit(1)).
#define CUDA_CHECK(call)                                                       \
  do {                                                                         \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line "    \
                << __LINE__ << std::endl;                                      \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

// =============================================================================
// Pipeline Configuration (application-level, no atomics)
// =============================================================================

constexpr size_t NUM_SLOTS = 16;

// Static description of one benchmark scenario: surface-code distance,
// measurement rounds, the ONNX predecoder model, and pool sizes
// (GPU predecoders, pipeline workers, PyMatching decode threads).
struct PipelineConfig {
  std::string label;
  int distance;
  int num_rounds;
  std::string onnx_filename;
  int num_predecoders;
  int num_workers;
  int num_decode_workers;

  // Absolute path to the ONNX model (ONNX_MODEL_DIR is a build-time macro).
  std::string onnx_path() const {
    return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename;
  }

  // Path of the cached TensorRT engine: same basename with ".engine".
  std::string engine_path() const {
    std::string name = onnx_filename;
    auto dot = name.rfind('.');
    if (dot != std::string::npos)
      name = name.substr(0, dot);
    return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine";
  }

  // Named presets selected by the CLI argument in main().
  static PipelineConfig d7_r7() {
    return {"d7_r7_Z", 7, 7, "model1_d7_r7_unified_Z_batch1.onnx", 16, 16, 32};
  }

  static PipelineConfig d13_r13() {
    return {"d13_r13_X", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16, 32};
  }

  static PipelineConfig d13_r104() {
    return {"d13_r104_X", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8, 16};
  }

  static PipelineConfig d21_r21() {
    return {"d21_r21_Z", 21, 21, "model1_d21_r21_unified_X_batch1.onnx", 16,
            16, 32};
  }

  static PipelineConfig d31_r31() {
    return {"d31_r31_Z", 31, 31, "model1_d31_r31_unified_Z_batch1.onnx", 16,
            16, 32};
  }
};

// Rounds v up to the next power of two (bit-smearing trick).
// NOTE(review): `v >> 32` assumes 64-bit size_t (UB on 32-bit targets), and
// v == 0 wraps to 0 — both fine for the slot sizes used here, but worth
// noting if reused.
static size_t round_up_pow2(size_t v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v |= v >> 32;
  return v + 1;
}

//
// =============================================================================
// Decoder Context (application-level)
// =============================================================================

// Shared state for the PyMatching decode pool: a round-robin pool of decoder
// instances plus aggregate statistics (all counters are relaxed atomics).
// NOTE(review): container element types were stripped by extraction
// (e.g. `std::vector> decoders;`) — confirm against the original source.
struct DecoderContext {
  std::vector> decoders;
  std::atomic next_decoder_idx{0};
  int z_stabilizers = 0;
  int spatial_slices = 0;
  int num_residual_detectors = 0;
  bool use_full_H = false;

  // Each calling thread claims one decoder index (thread_local, assigned
  // once) so a decoder instance is never shared between threads.
  cudaq::qec::decoder *acquire_decoder() {
    thread_local int my_idx =
        next_decoder_idx.fetch_add(1, std::memory_order_relaxed);
    return decoders[my_idx % decoders.size()].get();
  }

  std::atomic total_decode_us{0};
  std::atomic total_worker_us{0};
  std::atomic decode_count{0};

  int num_input_detectors = 0;
  std::atomic total_input_nonzero{0};
  std::atomic total_output_nonzero{0};
};

// =============================================================================
// Pre-launch DMA copy callback
// =============================================================================

// Per-predecoder context for pre_launch_input_copy.
struct PreLaunchCopyCtx {
  void *d_trt_input;   // TRT input buffer (device)
  size_t input_size;   // payload bytes to copy
  void **h_ring_ptrs;  // predecoder's ring-pointer mailbox (slot 0 used)
};

// Called by the dispatcher just before the graph launch: records the slot's
// device pointer in the predecoder mailbox and DMA-copies the RPC payload
// (past the header) from the ring slot into the TRT input buffer.
// NOTE(review): the cudaMemcpyAsync return value is ignored here — errors
// would surface later on the stream; confirm that is acceptable.
static void pre_launch_input_copy(void *user_data, void *slot_dev,
                                  cudaStream_t stream) {
  NVTX_PUSH("PreLaunchCopy");
  auto *ctx = static_cast(user_data);
  ctx->h_ring_ptrs[0] = slot_dev;
  cudaMemcpyAsync(ctx->d_trt_input,
                  static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE,
                  ctx->input_size, cudaMemcpyDeviceToDevice, stream);
  NVTX_POP();
}

// =============================================================================
// Worker context (passed through user_context)
// =============================================================================

// Everything a pipeline worker's CPU stage needs: its predecoder, the shared
// decoder context, and optional result-recording arrays.
struct WorkerCtx {
  AIPreDecoderService *predecoder;
  DecoderContext *decoder_ctx;
  int32_t *decode_corrections = nullptr;
  int32_t *decode_logical_pred = nullptr;
  int max_requests = 0;
  const uint8_t *obs_row = nullptr;
  size_t obs_row_size = 0;
};

// Wire-format RPC response written back to the ring buffer (packed).
struct __attribute__((packed)) DecodeResponse {
  int32_t total_corrections;
  int32_t converged;
};

// =============================================================================
// PyMatching work queue (decoupled from predecoder workers)
// =============================================================================

// A completed inference awaiting CPU decode; carries the origin ring slot so
// completion can be routed back correctly.
struct PyMatchJob {
  int origin_slot;
  uint64_t request_id;
  void *ring_buffer_ptr;
};

// Unbounded MPMC queue (mutex + condition_variable) feeding the PyMatching
// decode thread pool. pop() blocks until a job arrives or shutdown() is
// called; after shutdown it drains remaining jobs, then returns false.
class PyMatchQueue {
public:
  void push(PyMatchJob &&j) {
    {
      std::lock_guard lk(mtx_);
      jobs_.push(std::move(j));
    }
    // Notify outside the lock to avoid waking a waiter into a held mutex.
    cv_.notify_one();
  }

  // Blocking pop; returns false only when stopped AND empty.
  bool pop(PyMatchJob &out) {
    std::unique_lock lk(mtx_);
    cv_.wait(lk, [&] { return !jobs_.empty() || stop_; });
    if (stop_ && jobs_.empty())
      return false;
    out = std::move(jobs_.front());
    jobs_.pop();
    return true;
  }

  void shutdown() {
    {
      std::lock_guard lk(mtx_);
      stop_ = true;
    }
    cv_.notify_all();
  }

private:
  std::mutex mtx_;
  std::condition_variable cv_;
  std::queue jobs_;
  bool stop_ = false;
};

// =============================================================================
// Test data (pre-generated from Stim, or random)
// =============================================================================

// Row-major detector/observable samples loaded from binary files; indices
// wrap modulo num_samples so the benchmark can cycle a finite dataset.
struct TestData {
  std::vector detectors; // (num_samples × num_detectors) row-major
  std::vector observables; // (num_samples × num_observables) row-major
  uint32_t num_samples = 0;
  uint32_t num_detectors = 0;
  uint32_t num_observables = 0;

  bool loaded() const { return num_samples > 0 && num_detectors > 0; }

  // Pointer to sample idx's detector row (idx wraps modulo num_samples).
  const int32_t *sample(int idx) const {
    return detectors.data() +
           (static_cast(idx % num_samples) * num_detectors);
  }

  int32_t observable(int idx, int obs = 0) const {
    return observables[static_cast(idx % num_samples) *
                           num_observables +
                       obs];
  }
};

// Reads a [rows:u32][cols:u32][rows*cols int32 payload] binary file.
// Returns false on any I/O failure; no sanity check on rows/cols, so a
// corrupt header can trigger a huge allocation — trusted local data only.
static bool load_binary_file(const std::string &path, uint32_t &out_rows,
                             uint32_t &out_cols, std::vector &data) {
  std::ifstream f(path, std::ios::binary);
  if (!f.good())
    return false;
  f.read(reinterpret_cast(&out_rows), sizeof(uint32_t));
  f.read(reinterpret_cast(&out_cols), sizeof(uint32_t));
  size_t count = static_cast(out_rows) * out_cols;
  data.resize(count);
  f.read(reinterpret_cast(data.data()), count * sizeof(int32_t));
  return f.good();
}

// Loads detectors.bin + observables.bin from data_dir and cross-checks the
// sample counts; on any failure returns a TestData with num_samples == 0
// (so td.loaded() is false).
static TestData load_test_data(const std::string &data_dir) {
  TestData td;
  std::string det_path = data_dir + "/detectors.bin";
  std::string obs_path = data_dir + "/observables.bin";

  if (!load_binary_file(det_path, td.num_samples, td.num_detectors,
                        td.detectors)) {
    std::cerr << "ERROR: Failed to load " << det_path << "\n";
    return td;
  }
  uint32_t obs_samples = 0;
  if (!load_binary_file(obs_path, obs_samples, td.num_observables,
                        td.observables)) {
    std::cerr << "ERROR: Failed to load " << obs_path << "\n";
    td.num_samples = 0;
    return td;
  }
  if (obs_samples != td.num_samples) {
    std::cerr << "ERROR: sample count mismatch: detectors=" << td.num_samples
              << " observables=" << obs_samples << "\n";
    td.num_samples = 0;
    return td;
  }
  std::cout << "[Data] Loaded " << td.num_samples << " samples, "
            << td.num_detectors << " detectors, " << td.num_observables
            << " observables from " << data_dir << "\n";
  return td;
}

// =============================================================================
// Stim-derived parity check matrix loader (CSR sparse → dense tensor)
// =============================================================================

// CSR sparse binary matrix (0/1 entries implied by presence in `indices`).
struct SparseCSR {
  uint32_t nrows = 0, ncols = 0, nnz = 0;
  std::vector indptr;
  std::vector indices;

  bool loaded() const { return nrows > 0 && ncols > 0; }

  // Expand to a dense nrows×ncols tensor of 0/1 entries.
  cudaqx::tensor to_dense() const {
    cudaqx::tensor T;
    std::vector data(static_cast(nrows) * ncols, 0);
    for (uint32_t r = 0; r < nrows; ++r)
      for (int32_t j = indptr[r]; j < indptr[r + 1]; ++j)
        data[static_cast(r) * ncols + indices[j]] = 1;
    T.copy(data.data(),
           {static_cast(nrows), static_cast(ncols)});
    return T;
  }

  // Expand a single row r to a dense 0/1 vector of length ncols.
  std::vector row_dense(uint32_t r) const {
    std::vector row(ncols, 0);
    for (int32_t j = indptr[r]; j < indptr[r + 1]; ++j)
      row[indices[j]] = 1;
    return row;
  }
};

// Optional Stim export bundle: parity check matrix H, observable matrix O,
// and per-edge error priors.
struct StimData {
  SparseCSR H;
  SparseCSR O;
  std::vector priors;
};

// Reads a CSR file: [nrows:u32][ncols:u32][nnz:u32][indptr][indices].
// Same trust model as load_binary_file — no header sanity checks.
static bool load_csr(const std::string &path, SparseCSR &out) {
  std::ifstream f(path, std::ios::binary);
  if (!f.good())
    return false;
  f.read(reinterpret_cast(&out.nrows), sizeof(uint32_t));
  f.read(reinterpret_cast(&out.ncols), sizeof(uint32_t));
  f.read(reinterpret_cast(&out.nnz), sizeof(uint32_t));
  out.indptr.resize(out.nrows + 1);
  out.indices.resize(out.nnz);
  f.read(reinterpret_cast(out.indptr.data()),
         (out.nrows + 1) * sizeof(int32_t));
  f.read(reinterpret_cast(out.indices.data()),
         out.nnz * sizeof(int32_t));
  return f.good();
}

// Loads H_csr.bin (required), O_csr.bin and priors.bin (optional) from
// data_dir; missing optional files are silently skipped.
static StimData load_stim_data(const std::string &data_dir) {
  StimData sd;

  if (!load_csr(data_dir + "/H_csr.bin", sd.H)) {
    std::cerr << "[Data] No H_csr.bin found in " << data_dir << "\n";
    return sd;
  }
  std::cout << "[Data] Loaded H_csr " << sd.H.nrows << "x" << sd.H.ncols
            << " (" << sd.H.nnz << " nnz)\n";

  if (load_csr(data_dir + "/O_csr.bin", sd.O))
    std::cout << "[Data] Loaded O_csr " << sd.O.nrows << "x" << sd.O.ncols
              << " (" << sd.O.nnz << " nnz)\n";

  std::string priors_path = data_dir + "/priors.bin";
  std::ifstream pf(priors_path, std::ios::binary);
  if (pf.good()) {
    uint32_t nedges = 0;
    pf.read(reinterpret_cast(&nedges), sizeof(uint32_t));
    sd.priors.resize(nedges);
    pf.read(reinterpret_cast(sd.priors.data()),
            nedges * sizeof(double));
    std::cout << "[Data] Loaded " << sd.priors.size() << " priors\n";
  }
  return sd;
}

// =============================================================================
// Streaming Config
// =============================================================================

// Runtime benchmark parameters parsed from the CLI in main().
struct StreamingConfig {
  int rate_us = 0;       // inter-arrival time in µs; 0 = open-loop
  int duration_s = 5;    // test duration in seconds
  int warmup_count = 20; // requests excluded from statistics
  std::string data_dir;  // optional Stim dataset directory
};

//
============================================================================= +// Main +// ============================================================================= + +int main(int argc, char *argv[]) { + using hrclock = std::chrono::high_resolution_clock; + + // --- Parse arguments --- + std::string config_name = "d7"; + StreamingConfig scfg; + + // Scan for --data-dir first (can appear anywhere) + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == "--data-dir" && i + 1 < argc) { + scfg.data_dir = argv[i + 1]; + break; + } + } + // Positional: config_name [rate_us] [duration_s] + if (argc > 1 && std::string(argv[1]).substr(0, 2) != "--") + config_name = argv[1]; + if (argc > 2 && std::isdigit(argv[2][0])) + scfg.rate_us = std::stoi(argv[2]); + if (argc > 3 && std::isdigit(argv[3][0])) + scfg.duration_s = std::stoi(argv[3]); + + PipelineConfig config; + if (config_name == "d7") { + config = PipelineConfig::d7_r7(); + } else if (config_name == "d13") { + config = PipelineConfig::d13_r13(); + } else if (config_name == "d13_r104") { + config = PipelineConfig::d13_r104(); + } else if (config_name == "d21") { + config = PipelineConfig::d21_r21(); + } else if (config_name == "d31") { + config = PipelineConfig::d31_r31(); + } else { + std::cerr << "Usage: " << argv[0] + << " [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d13_r104 - distance 13, 104 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n" + << " rate_us - inter-arrival time in us (0 = open-loop)\n" + << " duration_s - test duration in seconds (default: 5)\n"; + return 1; + } + + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" << config.label + << ") ---\n"; + + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + // --- Model path --- + std::string engine_file = config.engine_path(); + std::string onnx_file = config.onnx_path(); + std::string 
model_path; + + std::ifstream engine_probe(engine_file, std::ios::binary); + if (engine_probe.good()) { + engine_probe.close(); + model_path = engine_file; + std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; + } else { + model_path = onnx_file; + std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file + << "\n"; + } + + // --- Create GPU resources (predecoders, streams, mailbox) --- + void **h_mailbox_bank = nullptr; + void **d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, + config.num_predecoders * sizeof(void *), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void *)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); + + std::vector predecoder_streams; + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); + } + + std::cout << "[Setup] Capturing " << config.num_predecoders + << "x AIPreDecoder Graphs...\n"; + cudaStream_t capture_stream; + CUDA_CHECK(cudaStreamCreate(&capture_stream)); + + std::vector> predecoders; + bool need_save = (model_path == onnx_file); + for (int i = 0; i < config.num_predecoders; ++i) { + std::string save_path = (need_save && i == 0) ? 
engine_file : ""; + auto pd = std::make_unique( + model_path, d_mailbox_bank + i, 1, save_path); + std::cout << "[Setup] Predecoder " << i + << ": input_size=" << pd->get_input_size() + << " output_size=" << pd->get_output_size() << "\n"; + pd->capture_graph(capture_stream, false); + predecoders.push_back(std::move(pd)); + } + + // --- Derive dimensions from TRT model bindings --- + const size_t model_input_bytes = predecoders[0]->get_input_size(); + const size_t model_output_bytes = predecoders[0]->get_output_size(); + const size_t slot_size = + round_up_pow2(CUDAQ_RPC_HEADER_SIZE + model_input_bytes); + + // Model I/O element count: for uint8 models, 1 byte per element; + // for int32, 4 bytes per element. Detect by comparing against expected + // detector count from the ONNX model shape. + const size_t model_input_elements = model_input_bytes; + const size_t model_output_elements_total = model_output_bytes; + // If model_input_bytes equals num_detectors (uint8), elem_size is 1. + // If model_input_bytes equals num_detectors*4 (int32), elem_size is 4. + // We detect this by checking if model_output_bytes == model_input_bytes + 1 + // (uint8: one extra L element) vs model_input_bytes + 4 (int32). + const size_t model_elem_size = + (model_output_bytes == model_input_bytes + 1) ? 1 : sizeof(int32_t); + const size_t num_input_detectors = model_input_bytes / model_elem_size; + const size_t num_output_elements = model_output_bytes / model_elem_size; + + std::cout << "[Setup] Model I/O element size: " << model_elem_size + << " bytes (" << (model_elem_size == 1 ? 
"uint8" : "int32") << ")\n"; + std::cout << "[Setup] Input detectors: " << num_input_detectors + << ", Output elements: " << num_output_elements << "\n"; + + const int residual_detectors = static_cast(num_output_elements) - 1; + + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " residual_detectors=" << residual_detectors + << " model_input=" << model_input_bytes + << " model_output=" << model_output_bytes + << " slot_size=" << slot_size << "\n"; + + // --- Load test data (optional) --- + TestData test_data; + StimData stim; + if (!scfg.data_dir.empty()) { + test_data = load_test_data(scfg.data_dir); + if (!test_data.loaded()) { + std::cerr << "ERROR: Failed to load test data from " << scfg.data_dir + << "\n"; + return 1; + } + if (test_data.num_detectors != num_input_detectors) { + std::cerr << "ERROR: detector count mismatch: data has " + << test_data.num_detectors << " but model expects " + << num_input_detectors << "\n"; + return 1; + } + stim = load_stim_data(scfg.data_dir); + } + + // --- Build PyMatching decoder --- + DecoderContext decoder_ctx; + decoder_ctx.num_residual_detectors = residual_detectors; + decoder_ctx.num_input_detectors = static_cast(num_input_detectors); + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + + // Observable row from O matrix (for projecting edge corrections → logical) + std::vector obs_row; + + if (stim.H.loaded() && + static_cast(stim.H.nrows) == residual_detectors) { + decoder_ctx.use_full_H = true; + std::cout << "[Setup] Converting sparse H (" << stim.H.nrows << "x" + << stim.H.ncols << ") to dense tensor...\n"; + auto H_full = stim.H.to_dense(); + std::cout << "[Setup] H tensor: [" << H_full.shape()[0] << " x " + << H_full.shape()[1] << "]\n"; + + if (!stim.priors.empty() && stim.priors.size() == stim.H.ncols) + pm_params.insert("error_rate_vec", stim.priors); + + if (stim.O.loaded()) + obs_row = stim.O.row_dense(0); 
+ + std::cout << "[Setup] Creating " << config.num_decode_workers + << " PyMatching decoders (full H)...\n"; + for (int i = 0; i < config.num_decode_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_full, pm_params)); + } else { + // Fallback: per-slice decode with CUDA-Q surface code H_z + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = + cudaq::qec::get_code("surface_code", {{"distance", config.distance}}); + auto H_z = surface_code->get_parity_z(); + + const int z_stabilizers = static_cast(H_z.shape()[0]); + if (residual_detectors > 0 && residual_detectors % z_stabilizers == 0) + decoder_ctx.spatial_slices = residual_detectors / z_stabilizers; + decoder_ctx.z_stabilizers = z_stabilizers; + + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "], spatial_slices=" + << decoder_ctx.spatial_slices << "\n"; + + std::cout << "[Setup] Creating " << config.num_decode_workers + << " PyMatching decoders (per-slice)...\n"; + for (int i = 0; i < config.num_decode_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + } + std::cout << "[Setup] PyMatching decoder pool ready.\n"; + + // Pre-launch DMA contexts + std::vector pre_launch_ctxs(config.num_predecoders); + for (int i = 0; i < config.num_predecoders; ++i) { + pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); + pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); + pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); + } + + if (config.num_workers != config.num_predecoders) { + std::cerr << "[WARN] num_workers (" << config.num_workers + << ") != num_predecoders (" << config.num_predecoders + << "); pipeline workers should match predecoders for 1:1 poll\n"; + } + + // Worker contexts (per-worker, application-specific) + std::vector 
worker_ctxs(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + worker_ctxs[i].predecoder = predecoders[i].get(); + worker_ctxs[i].decoder_ctx = &decoder_ctx; + } + + // Build function table for RPC dispatch + std::vector function_ids(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + std::string func = "predecode_target_" + std::to_string(i); + function_ids[i] = realtime_ns::fnv1a_hash(func.c_str()); + } + + // ========================================================================= + // Per-slot output buffers (predecoder output copied here before release) + // ========================================================================= + + std::vector> deferred_outputs( + NUM_SLOTS, std::vector(model_output_bytes)); + + PyMatchQueue pymatch_queue; + + // ========================================================================= + // Create pipeline (all atomics hidden inside) + // ========================================================================= + + realtime_ns::PipelineStageConfig stage_cfg; + stage_cfg.num_workers = config.num_workers; + stage_cfg.num_slots = NUM_SLOTS; + stage_cfg.slot_size = slot_size; + stage_cfg.cores = {.dispatcher = 2, .consumer = 4, .worker_base = 10}; + + realtime_ns::RealtimePipeline pipeline(stage_cfg); + + // --- GPU stage factory --- + pipeline.set_gpu_stage([&](int w) -> realtime_ns::GpuWorkerResources { + return {.graph_exec = predecoders[w]->get_executable_graph(), + .stream = predecoder_streams[w], + .pre_launch_fn = pre_launch_input_copy, + .pre_launch_data = &pre_launch_ctxs[w], + .function_id = function_ids[w], + .user_context = &worker_ctxs[w]}; + }); + + // --- CPU stage callback (poll GPU + copy + enqueue to PyMatch queue) --- + // Predecoder workers only poll GPU completion, copy the output to a + // per-slot buffer, release the predecoder, and enqueue a PyMatchJob. 
+ // Returns DEFERRED_COMPLETION so the pipeline releases the worker + // (idle_mask) without signaling slot completion (tx_flags). + pipeline.set_cpu_stage( + [&deferred_outputs, &pymatch_queue, + out_sz = model_output_bytes](const realtime_ns::CpuStageContext &ctx) -> size_t { + auto *wctx = static_cast(ctx.user_context); + auto *pd = wctx->predecoder; + auto *dctx = wctx->decoder_ctx; + + PreDecoderJob job; + if (!pd->poll_next_job(job)) + return 0; + + NVTX_PUSH("PredecoderPoll"); + + int origin_slot = ctx.origin_slot; + + std::memcpy(deferred_outputs[origin_slot].data(), job.inference_data, + out_sz); + + // Syndrome density: count nonzero in input and output residuals + const uint8_t *input_u8 = + static_cast(job.ring_buffer_ptr) + + CUDAQ_RPC_HEADER_SIZE; + int input_nz = 0; + for (int k = 0; k < dctx->num_input_detectors; ++k) + input_nz += (input_u8[k] != 0); + const uint8_t *out_buf = deferred_outputs[origin_slot].data(); + int output_nz = 0; + for (int k = 0; k < dctx->num_residual_detectors; ++k) + output_nz += (out_buf[1 + k] != 0); + dctx->total_input_nonzero.fetch_add(input_nz, + std::memory_order_relaxed); + dctx->total_output_nonzero.fetch_add(output_nz, + std::memory_order_relaxed); + + pd->release_job(job.slot_idx); + + auto *rpc_hdr = + static_cast(job.ring_buffer_ptr); + uint32_t rid = rpc_hdr->request_id; + + pymatch_queue.push({origin_slot, rid, job.ring_buffer_ptr}); + + NVTX_POP(); // PredecoderPoll + return realtime_ns::DEFERRED_COMPLETION; + }); + + // --- Completion callback (record timestamps) --- + const int max_requests = 500000; + std::vector submit_ts(max_requests); + std::vector complete_ts(max_requests); + std::vector completed(max_requests, 0); + std::vector decode_corrections(max_requests, -1); + std::vector decode_logical_pred(max_requests, -1); + + pipeline.set_completion_handler([&](const realtime_ns::Completion &c) { + if (c.request_id < static_cast(max_requests)) { + complete_ts[c.request_id] = hrclock::now(); + 
completed[c.request_id] = c.success; + } + }); + + // ========================================================================= + // Start pipeline and run producer + // ========================================================================= + + for (int i = 0; i < config.num_workers; ++i) { + worker_ctxs[i].decode_corrections = decode_corrections.data(); + worker_ctxs[i].decode_logical_pred = decode_logical_pred.data(); + worker_ctxs[i].max_requests = max_requests; + if (!obs_row.empty()) { + worker_ctxs[i].obs_row = obs_row.data(); + worker_ctxs[i].obs_row_size = obs_row.size(); + } + } + + // ========================================================================= + // PyMatching thread pool (decoupled from predecoder workers) + // ========================================================================= + + std::vector pymatch_threads(config.num_decode_workers); + for (int t = 0; t < config.num_decode_workers; ++t) { + pymatch_threads[t] = std::thread( + [&pipeline, &pymatch_queue, &deferred_outputs, &decoder_ctx, + &decode_corrections, &decode_logical_pred, &obs_row, + max_requests]() { + PyMatchJob job; + while (pymatch_queue.pop(job)) { + NVTX_PUSH("PyMatchDecode"); + using hrclock = std::chrono::high_resolution_clock; + auto decode_start = hrclock::now(); + + const uint8_t *output_u8 = + deferred_outputs[job.origin_slot].data(); + const int32_t logical_pred = output_u8[0]; + int total_corrections = 0; + bool all_converged = true; + +#if !defined(DISABLE_PYMATCHING) + const uint8_t *residual_u8 = output_u8 + 1; + auto *my_decoder = decoder_ctx.acquire_decoder(); + + if (decoder_ctx.use_full_H) { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)decoder_ctx.num_residual_detectors}); + std::memcpy(syndrome_tensor.data(), residual_u8, + decoder_ctx.num_residual_detectors); + auto result = my_decoder->decode(syndrome_tensor); + all_converged = result.converged; + if (!obs_row.empty() && obs_row.size() == result.result.size()) { + int obs_parity = 0; + 
for (size_t e = 0; e < result.result.size(); ++e) + if (result.result[e] > 0.5 && obs_row[e]) + obs_parity ^= 1; + total_corrections += obs_parity; + } else { + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } + } else { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)decoder_ctx.z_stabilizers}); + uint8_t *syn_data = syndrome_tensor.data(); + for (int s = 0; s < decoder_ctx.spatial_slices; ++s) { + const uint8_t *slice = + residual_u8 + s * decoder_ctx.z_stabilizers; + std::memcpy(syn_data, slice, decoder_ctx.z_stabilizers); + auto result = my_decoder->decode(syndrome_tensor); + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } + } + total_corrections += logical_pred; +#endif + + auto decode_end = hrclock::now(); + NVTX_POP(); // PyMatchDecode + + // Write RPC response into ring buffer slot + DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; + char *response_payload = (char *)job.ring_buffer_ptr + + sizeof(realtime_ns::RPCResponse); + std::memcpy(response_payload, &resp, sizeof(resp)); + + auto *header = static_cast( + job.ring_buffer_ptr); + header->magic = realtime_ns::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp); + + pipeline.complete_deferred(job.origin_slot); + + auto worker_end = hrclock::now(); + auto decode_us = + std::chrono::duration_cast( + decode_end - decode_start) + .count(); + auto worker_us = + std::chrono::duration_cast( + worker_end - decode_start) + .count(); + decoder_ctx.total_decode_us.fetch_add(decode_us, + std::memory_order_relaxed); + decoder_ctx.total_worker_us.fetch_add(worker_us, + std::memory_order_relaxed); + decoder_ctx.decode_count.fetch_add(1, std::memory_order_relaxed); + + uint32_t rid = static_cast(job.request_id); + if (rid < static_cast(max_requests)) { + decode_corrections[rid] = total_corrections; + decode_logical_pred[rid] = logical_pred; + } + } + }); + } + std::cout << "[Setup] Started " 
<< config.num_decode_workers + << " PyMatching decode workers.\n"; + + std::cout << "[Setup] Starting pipeline...\n"; + auto injector = pipeline.create_injector(); + pipeline.start(); + + auto run_deadline = + std::chrono::steady_clock::now() + std::chrono::seconds(scfg.duration_s); + + std::string rate_label = + (scfg.rate_us > 0) ? std::to_string(scfg.rate_us) + " us" : "open-loop"; + + std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" + << " Rate: " << rate_label << "\n" + << " Duration: " << scfg.duration_s << " s\n" + << " Warmup: " << scfg.warmup_count << " requests\n" + << " Predecoders:" << config.num_predecoders + << " (dedicated streams)\n" + << " Decode workers:" << config.num_decode_workers << "\n" + << " Max reqs: " << max_requests << "\n\n" + << std::flush; + + // --- Producer loop (runs on main thread) --- + std::mt19937 rng(42); + const size_t payload_bytes = + std::min(model_input_bytes, + slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); + std::vector payload_buf(CUDAQ_RPC_HEADER_SIZE + payload_bytes); + int req_id = 0; + int target = 0; + + auto next_submit_time = hrclock::now(); + + while (std::chrono::steady_clock::now() < run_deadline && + req_id < max_requests) { + + if (scfg.rate_us > 0) { + while (hrclock::now() < next_submit_time) + QEC_CPU_RELAX(); + } + + uint8_t *payload = payload_buf.data() + CUDAQ_RPC_HEADER_SIZE; + if (test_data.loaded()) { + const int32_t *src = test_data.sample(req_id); + for (size_t d = 0; d < num_input_detectors; ++d) + payload[d] = static_cast(src[d]); + } else { + std::bernoulli_distribution err_dist(0.01); + for (size_t d = 0; d < num_input_detectors; ++d) + payload[d] = err_dist(rng) ? 
1 : 0; + } + + std::string func = "predecode_target_" + std::to_string(target); + uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); + + submit_ts[req_id] = hrclock::now(); + NVTX_PUSH("ProducerSubmit"); + injector.submit(fid, payload, static_cast(payload_bytes), + static_cast(req_id)); + NVTX_POP(); + + target = (target + 1) % config.num_predecoders; + req_id++; + + if (scfg.rate_us > 0) + next_submit_time += std::chrono::microseconds(scfg.rate_us); + } + + // --- Shutdown --- + pipeline.stop(); + + pymatch_queue.shutdown(); + for (auto &t : pymatch_threads) + if (t.joinable()) + t.join(); + + // ========================================================================= + // Report + // ========================================================================= + + auto final_stats = pipeline.stats(); + uint64_t nsub = final_stats.submitted; + uint64_t ncomp = final_stats.completed; + + if (ncomp < nsub) + std::cerr << " [WARN] " << (nsub - ncomp) + << " requests did not complete.\n"; + + int warmup = std::min(scfg.warmup_count, static_cast(nsub)); + std::vector latencies; + latencies.reserve(nsub - warmup); + + for (uint64_t i = warmup; i < nsub; ++i) { + if (!completed[i]) + continue; + auto dt = + std::chrono::duration_cast>( + complete_ts[i] - submit_ts[i]); + latencies.push_back(dt.count()); + } + + std::sort(latencies.begin(), latencies.end()); + + auto pct = [&](double p) -> double { + if (latencies.empty()) + return 0; + double idx = (p / 100.0) * (latencies.size() - 1); + size_t lo = (size_t)idx; + size_t hi = std::min(lo + 1, latencies.size() - 1); + double frac = idx - lo; + return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; + }; + + double mean = 0; + for (auto v : latencies) + mean += v; + mean = latencies.empty() ? 0 : mean / latencies.size(); + + double stddev = 0; + for (auto v : latencies) + stddev += (v - mean) * (v - mean); + stddev = latencies.empty() ? 
0 : std::sqrt(stddev / latencies.size()); + + auto wall_us = + std::chrono::duration_cast>( + std::chrono::steady_clock::now() - + (run_deadline - std::chrono::seconds(scfg.duration_s))) + .count(); + double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; + + double actual_rate = (nsub > 1) + ? std::chrono::duration_cast< + std::chrono::duration>( + submit_ts[nsub - 1] - submit_ts[0]) + .count() / + (nsub - 1) + : 0; + + std::cout << std::fixed; + std::cout + << "\n================================================================\n"; + std::cout << " Streaming Benchmark: " << config.label << "\n"; + std::cout + << "================================================================\n"; + std::cout << " Submitted: " << nsub << "\n"; + std::cout << " Completed: " << ncomp << "\n"; + std::cout << std::setprecision(1); + std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; + std::cout << " Throughput: " << throughput << " req/s\n"; + std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate + << " us/req\n"; + std::cout << " Backpressure stalls:" << std::setw(8) + << final_stats.backpressure_stalls << "\n"; + std::cout + << " ---------------------------------------------------------------\n"; + std::cout << " Latency (us) [steady-state, " << latencies.size() + << " requests after " << warmup << " warmup]\n"; + if (!latencies.empty()) { + std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; + std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; + std::cout << " mean = " << std::setw(10) << mean << "\n"; + std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; + std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; + std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; + std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; + std::cout << " stddev = " << std::setw(10) << stddev << "\n"; + } + + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = 
(double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + std::cout + << " " + "---------------------------------------------------------------\n"; + std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; + std::cout << " PyMatching decode: " << std::setw(9) << avg_decode + << " us\n"; + std::cout << " Total worker: " << std::setw(9) << avg_worker + << " us\n"; + std::cout << " Worker overhead: " << std::setw(9) << avg_overhead + << " us\n"; + } + if (n_decoded > 0) { + double avg_in_nz = + (double)decoder_ctx.total_input_nonzero.load() / n_decoded; + double avg_out_nz = + (double)decoder_ctx.total_output_nonzero.load() / n_decoded; + double in_density = avg_in_nz / decoder_ctx.num_input_detectors; + double out_density = avg_out_nz / decoder_ctx.num_residual_detectors; + double reduction = (in_density > 0) ? (1.0 - out_density / in_density) : 0; + std::cout + << " " + "---------------------------------------------------------------\n"; + std::cout << " Syndrome density (" << n_decoded << " samples):\n"; + std::cout << " Input: " << std::fixed << std::setprecision(1) + << avg_in_nz << " / " << decoder_ctx.num_input_detectors + << " (" << std::setprecision(4) << in_density << ")\n"; + std::cout << " Output: " << std::fixed << std::setprecision(1) + << avg_out_nz << " / " << decoder_ctx.num_residual_detectors + << " (" << std::setprecision(4) << out_density << ")\n"; + std::cout << " Reduction: " << std::setprecision(1) + << (reduction * 100.0) << "%\n"; + } + + std::cout + << " ---------------------------------------------------------------\n"; + std::cout << " Host dispatcher processed " << final_stats.dispatched + << " packets.\n"; + std::cout + << "================================================================\n"; + + // --- Correctness verification (when using real data) --- + if (test_data.loaded()) { + int verified 
= 0, mismatches = 0, missing = 0; + int pred_only_mismatches = 0; + int64_t sum_total_corr = 0, sum_logical_pred = 0; + int nonzero_logical = 0, nonzero_pymatch = 0; + for (int i = 0; i < nsub; ++i) { + if (decode_corrections[i] < 0) { + missing++; + continue; + } + int32_t total_corr = decode_corrections[i]; + int32_t lpred = decode_logical_pred[i]; + int32_t pymatch_corr = total_corr - lpred; + int32_t pipeline_parity = total_corr % 2; + int32_t ground_truth = test_data.observable(i, 0); + + if (pipeline_parity != ground_truth) + mismatches++; + if ((lpred % 2) != ground_truth) + pred_only_mismatches++; + + sum_total_corr += total_corr; + sum_logical_pred += lpred; + if (lpred != 0) + nonzero_logical++; + if (pymatch_corr != 0) + nonzero_pymatch++; + verified++; + } + double ler = + (verified > 0) ? static_cast(mismatches) / verified : 0; + double pred_ler = + (verified > 0) ? static_cast(pred_only_mismatches) / verified + : 0; + std::cout << "\n[Correctness] Verified " << verified << "/" << nsub + << " requests (" << missing << " missing)\n"; + std::cout << "[Correctness] Pipeline (pred+pymatch) mismatches: " + << mismatches << " LER: " << std::setprecision(4) << ler + << "\n"; + std::cout << "[Correctness] Predecoder-only mismatches: " + << pred_only_mismatches + << " LER: " << std::setprecision(4) << pred_ler << "\n"; + std::cout << "[Correctness] Avg logical_pred: " << std::setprecision(3) + << (verified > 0 ? (double)sum_logical_pred / verified : 0) + << " nonzero: " << nonzero_logical << "/" << verified << "\n"; + std::cout << "[Correctness] Avg pymatch_corr: " << std::setprecision(3) + << (verified > 0 + ? 
(double)(sum_total_corr - sum_logical_pred) / verified + : 0) + << " nonzero: " << nonzero_pymatch << "/" << verified << "\n"; + std::cout << "[Correctness] Ground truth ones: "; + int gt_ones = 0; + int gt_count = static_cast( + std::min(nsub, static_cast(test_data.num_samples))); + for (int i = 0; i < gt_count; ++i) + if (test_data.observable(i, 0)) + gt_ones++; + std::cout << gt_ones << "/" << gt_count << "\n"; + } + + // --- Cleanup --- + std::cout << "[Teardown] Shutting down...\n"; + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + for (auto &s : predecoder_streams) { + cudaStreamSynchronize(s); + cudaStreamDestroy(s); + } + cudaFreeHost(h_mailbox_bank); + cudaStreamDestroy(capture_stream); + + std::cout << "Done.\n"; + return 0; +} diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 430e8fdd..c6eed7a0 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -127,11 +127,28 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) list(APPEND _cudaq_realtime_prefixes "${CUDAQ_INSTALL_PREFIX}") endif() + # Realtime API lives under install prefix (CUDAQ_REALTIME_ROOT = install directory). 
+ # Header layout: include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h find_path(CUDAQ_REALTIME_INCLUDE_DIR NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h - PATHS ${_cudaq_realtime_prefixes} - PATH_SUFFIXES include ../include + HINTS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + NO_DEFAULT_PATH ) + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + ) + endif() + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + HINTS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include ../include + ) + endif() find_library(CUDAQ_REALTIME_LIBRARY NAMES cudaq-realtime @@ -145,22 +162,67 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATH_SUFFIXES lib ) + find_library(CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY + NAMES cudaq-realtime-host-dispatch + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES lib + ) + + set(_have_realtime_for_tests FALSE) if(CUDAQ_REALTIME_INCLUDE_DIR AND CUDAQ_REALTIME_LIBRARY AND CUDAQ_REALTIME_DISPATCH_LIBRARY) + set(_have_realtime_for_tests TRUE) message(STATUS "Found cuda-quantum realtime headers at ${CUDAQ_REALTIME_INCLUDE_DIR}") message(STATUS "Found cuda-quantum realtime library at ${CUDAQ_REALTIME_LIBRARY}") message(STATUS "Found cuda-quantum realtime dispatch library at ${CUDAQ_REALTIME_DISPATCH_LIBRARY}") + endif() + if(_have_realtime_for_tests) add_executable(test_realtime_decoding ${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/test_realtime_decoding.cu ${CMAKE_CURRENT_SOURCE_DIR}/realtime/mock_decode_handler.cu ${CMAKE_CURRENT_SOURCE_DIR}/realtime/mock_decoder_launch_params.cpp ) + # TODO: Re-enable once libcudaq-realtime-host-dispatch.so RPATH is resolved + # add_executable(test_realtime_decoding + # ${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/test_realtime_decoding.cu + # ) + # 
set_target_properties(test_realtime_decoding PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON CUDA_STANDARD 17 ) + # + # target_include_directories(test_realtime_decoding PRIVATE + # ${CUDAToolkit_INCLUDE_DIRS} + # ${CMAKE_CURRENT_SOURCE_DIR}/../include + # ${CMAKE_SOURCE_DIR}/libs/core/include + # ${CUDAQ_REALTIME_INCLUDE_DIR} + # ) + # + # target_compile_definitions(test_realtime_decoding PRIVATE + # TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/data" + # ) + # + # target_link_libraries(test_realtime_decoding PRIVATE + # GTest::gtest_main + # CUDA::cudart + # cudaq-qec-realtime-cudevice + # ${CUDAQ_REALTIME_LIBRARY} + # ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + # ) + # + # get_filename_component(CUDAQ_REALTIME_LIB_DIR "${CUDAQ_REALTIME_LIBRARY}" DIRECTORY) + # set_target_properties(test_realtime_decoding PROPERTIES + # BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + # INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + # ) + # + # add_dependencies(CUDAQXQECUnitTests test_realtime_decoding) + # gtest_discover_tests(test_realtime_decoding + # TEST_PREFIX "test_realtime_decoding." 
+ # ) target_include_directories(test_realtime_decoding PRIVATE ${CUDAToolkit_INCLUDE_DIRS} @@ -184,19 +246,175 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) # Ensure runtime can locate libcudaq-realtime.so get_filename_component(CUDAQ_REALTIME_LIB_DIR "${CUDAQ_REALTIME_LIBRARY}" DIRECTORY) - set_target_properties(test_realtime_decoding PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + + # ---------------------------------------------------------------- + # Realtime pipeline unit tests (SKIP_TRT passthrough at runtime; + # still needs TRT headers+libs at compile/link time) + # ---------------------------------------------------------------- + find_path(TENSORRT_INCLUDE_DIR_FOR_PIPELINE NvInfer.h + PATHS + ${TENSORRT_ROOT}/include + /usr/include/x86_64-linux-gnu + /usr/include/aarch64-linux-gnu + /usr/local/cuda/include + /usr/local/tensorrt/include + /opt/tensorrt/include + NO_DEFAULT_PATH + ) + find_library(TENSORRT_LIBRARY_FOR_PIPELINE nvinfer + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib + ) + find_library(TENSORRT_ONNX_PARSER_FOR_PIPELINE nvonnxparser + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib ) - add_dependencies(CUDAQXQECUnitTests test_realtime_decoding) - gtest_discover_tests(test_realtime_decoding - TEST_PREFIX "test_realtime_decoding." 
+ if(TENSORRT_INCLUDE_DIR_FOR_PIPELINE AND TENSORRT_LIBRARY_FOR_PIPELINE AND TENSORRT_ONNX_PARSER_FOR_PIPELINE) + get_filename_component(_cuda_bin_pipe "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root_pipe "${_cuda_bin_pipe}" DIRECTORY) + set(_cuda_cccl_include_pipe "${_cuda_root_pipe}/include/cccl") + + add_executable(test_realtime_pipeline + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu + ${CMAKE_CURRENT_SOURCE_DIR}/test_realtime_pipeline.cu + ) + + set_target_properties(test_realtime_pipeline PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_STANDARD 17 + LINKER_LANGUAGE CUDA + ) + + target_include_directories(test_realtime_pipeline PRIVATE + ${_cuda_cccl_include_pipe} + ${CUDAToolkit_INCLUDE_DIRS} + ${TENSORRT_INCLUDE_DIR_FOR_PIPELINE} + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_SOURCE_DIR}/libs/core/include + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + + target_link_libraries(test_realtime_pipeline PRIVATE + GTest::gtest_main + CUDA::cudart + ${TENSORRT_LIBRARY_FOR_PIPELINE} + ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ${CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY} + cudaq-realtime-pipeline + ) + set_target_properties(test_realtime_pipeline PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) + + if(ENABLE_NVTX) + target_compile_definitions(test_realtime_pipeline PRIVATE ENABLE_NVTX) + endif() + + add_dependencies(CUDAQXQECUnitTests test_realtime_pipeline) + gtest_discover_tests(test_realtime_pipeline + TEST_PREFIX "test_realtime_pipeline." + ) + else() + message(WARNING "TensorRT not found. 
Skipping test_realtime_pipeline (needs NvInfer.h + TRT libs for compile/link).") + endif() + + # Hybrid AI predecoder + PyMatching pipeline test + # Requires TensorRT + ONNX parser for building engines from ONNX models + find_path(TENSORRT_INCLUDE_DIR NvInfer.h + PATHS + ${TENSORRT_ROOT}/include + /usr/include/x86_64-linux-gnu + /usr/include/aarch64-linux-gnu + /usr/local/cuda/include + /usr/local/tensorrt/include + /opt/tensorrt/include + NO_DEFAULT_PATH ) + find_library(TENSORRT_LIBRARY nvinfer + PATHS + ${TENSORRT_ROOT}/lib + /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu + /usr/local/cuda/lib64 + /usr/local/tensorrt/lib + /opt/tensorrt/lib + ) + find_library(TENSORRT_ONNX_PARSER_LIBRARY nvonnxparser + PATHS + ${TENSORRT_ROOT}/lib + /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu + /usr/local/cuda/lib64 + /usr/local/tensorrt/lib + /opt/tensorrt/lib + ) + + if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY AND TENSORRT_ONNX_PARSER_LIBRARY) + add_executable(test_realtime_predecoder_w_pymatching + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu + ) + + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_STANDARD 17 + LINKER_LANGUAGE CUDA + ) + + target_compile_definitions(test_realtime_predecoder_w_pymatching PRIVATE + ONNX_MODEL_DIR="${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime" + ) + + # libcu++ (cuda/std/atomic) lives in CUDA toolkit under cccl/ + get_filename_component(_cuda_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root "${_cuda_bin}" DIRECTORY) + set(_cuda_cccl_include "${_cuda_root}/include/cccl") + + target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE + ${_cuda_cccl_include} + ${CUDAToolkit_INCLUDE_DIRS} + ${TENSORRT_INCLUDE_DIR} + 
${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_SOURCE_DIR}/libs/core/include + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + + target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE + CUDA::cudart + ${TENSORRT_LIBRARY} + ${TENSORRT_ONNX_PARSER_LIBRARY} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ${CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY} + cudaq-realtime-pipeline + cudaq-qec + cudaq::cudaq + ) + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CUDAQ_REALTIME_LIB_DIR}" + INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CUDAQ_REALTIME_LIB_DIR}" + ) + + if(ENABLE_NVTX) + target_compile_definitions(test_realtime_predecoder_w_pymatching PRIVATE ENABLE_NVTX) + message(STATUS "NVTX profiling enabled for test_realtime_predecoder_w_pymatching") + endif() + + add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) + else() + message(WARNING "TensorRT or ONNX parser not found. Skipping test_realtime_predecoder_w_pymatching.") + endif() + else() message(WARNING "cuda-quantum realtime dependency not found. " - "Set CUDAQ_REALTIME_ROOT or CUDAQ_INSTALL_PREFIX to enable " - "test_realtime_decoding.") + "Set CUDAQ_REALTIME_ROOT to enable " + "test_realtime_pipeline and test_realtime_predecoder_w_pymatching.") endif() endif() diff --git a/libs/qec/unittests/test_realtime_pipeline.cu b/libs/qec/unittests/test_realtime_pipeline.cu new file mode 100644 index 00000000..d4a106ba --- /dev/null +++ b/libs/qec/unittests/test_realtime_pipeline.cu @@ -0,0 +1,782 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. 
+ ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ + } while (0) + +namespace { + +using namespace cudaq::qec; +namespace rt = cudaq::realtime; + +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + +static constexpr size_t kSkipTrtFloats = 1600; +static constexpr size_t kSkipTrtBytes = kSkipTrtFloats * sizeof(float); +static constexpr size_t kSlotSize = 8192; +static constexpr size_t kNumSlots = 8; +static constexpr uint32_t kTestFunctionId = rt::fnv1a_hash("test_predecoder"); + +// ============================================================================ +// Pre-launch DMA callback (mirrors production code) +// ============================================================================ + +struct PreLaunchCopyCtx { + void *d_trt_input; + size_t input_size; + void **h_ring_ptrs; +}; + +static void pre_launch_input_copy(void *user_data, void *slot_dev, + cudaStream_t stream) { + auto *ctx = static_cast(user_data); + ctx->h_ring_ptrs[0] = slot_dev; + cudaMemcpyAsync(ctx->d_trt_input, + static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, + ctx->input_size, cudaMemcpyDeviceToDevice, stream); +} + +// ============================================================================ +// Ring buffer helpers (mapped pinned memory) +// ============================================================================ + +static bool 
allocate_mapped_buffer(size_t size, uint8_t **host_out, + uint8_t **dev_out) { + void *h = nullptr; + if (cudaHostAlloc(&h, size, cudaHostAllocMapped) != cudaSuccess) + return false; + void *d = nullptr; + if (cudaHostGetDevicePointer(&d, h, 0) != cudaSuccess) { + cudaFreeHost(h); + return false; + } + std::memset(h, 0, size); + *host_out = static_cast(h); + *dev_out = static_cast(d); + return true; +} + +static void free_mapped_buffer(uint8_t *host_ptr) { + if (host_ptr) + cudaFreeHost(host_ptr); +} + +// ============================================================================ +// Write an RPC request (RPCHeader + payload) into a mapped buffer slot +// ============================================================================ + +static void write_rpc_slot(uint8_t *slot_host, uint32_t function_id, + const void *payload, size_t payload_len) { + rt::RPCHeader hdr{}; + hdr.magic = rt::RPC_MAGIC_REQUEST; + hdr.function_id = function_id; + hdr.arg_len = static_cast(payload_len); + std::memcpy(slot_host, &hdr, sizeof(hdr)); + if (payload && payload_len > 0) + std::memcpy(slot_host + sizeof(hdr), payload, payload_len); +} + +// ============================================================================ +// Test Fixture +// ============================================================================ + +class RealtimePipelineTest : public ::testing::Test { +protected: + void SetUp() override { + setenv("SKIP_TRT", "1", 1); + + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * sizeof(uint64_t), + &rx_flags_host_, &rx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * sizeof(uint64_t), + &tx_flags_host_, &tx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * kSlotSize, &rx_data_host_, + &rx_data_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * kSlotSize, &tx_data_host_, + &tx_data_dev_)); + + CUDA_CHECK(cudaHostAlloc(&mailbox_bank_host_, kMaxWorkers * sizeof(void *), + cudaHostAllocMapped)); + std::memset(mailbox_bank_host_, 0, kMaxWorkers 
* sizeof(void *)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&mailbox_bank_dev_), mailbox_bank_host_, 0)); + + CUDA_CHECK(cudaStreamCreate(&stream_)); + } + + void TearDown() override { + if (stream_) + cudaStreamDestroy(stream_); + if (mailbox_bank_host_) + cudaFreeHost(mailbox_bank_host_); + free_mapped_buffer(rx_flags_host_); + free_mapped_buffer(tx_flags_host_); + free_mapped_buffer(rx_data_host_); + free_mapped_buffer(tx_data_host_); + unsetenv("SKIP_TRT"); + } + + std::unique_ptr create_predecoder(int mailbox_idx) { + auto pd = std::make_unique( + "dummy.onnx", + reinterpret_cast(mailbox_bank_dev_ + mailbox_idx), 1); + pd->capture_graph(stream_, false); + EXPECT_EQ(cudaStreamSynchronize(stream_), cudaSuccess); + return pd; + } + + void submit_rpc_to_slot(size_t slot, uint32_t function_id, + const void *payload, size_t payload_len) { + uint8_t *slot_host = rx_data_host_ + slot * kSlotSize; + write_rpc_slot(slot_host, function_id, payload, payload_len); + auto *flags = reinterpret_cast(rx_flags_host_); + flags[slot].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); + } + + bool wait_ready_flag(AIPreDecoderService *pd, int timeout_ms = 2000) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + auto *flags = pd->get_host_ready_flags(); + int val = flags[0].load(cuda::std::memory_order_acquire); + if (val >= 1) + return true; + usleep(100); + } + return false; + } + + static constexpr size_t kMaxWorkers = 8; + + uint8_t *rx_flags_host_ = nullptr; + uint8_t *rx_flags_dev_ = nullptr; + uint8_t *tx_flags_host_ = nullptr; + uint8_t *tx_flags_dev_ = nullptr; + uint8_t *rx_data_host_ = nullptr; + uint8_t *rx_data_dev_ = nullptr; + uint8_t *tx_data_host_ = nullptr; + uint8_t *tx_data_dev_ = nullptr; + void **mailbox_bank_host_ = nullptr; + void **mailbox_bank_dev_ = nullptr; + cudaStream_t stream_ = nullptr; +}; + +// 
============================================================================ +// AIDecoderService Unit Tests (SKIP_TRT) +// ============================================================================ + +TEST_F(RealtimePipelineTest, SkipTrtSizes) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_input_size(), kSkipTrtBytes); + EXPECT_EQ(svc.get_output_size(), kSkipTrtBytes); +} + +TEST_F(RealtimePipelineTest, SkipTrtBuffersAllocated) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_NE(svc.get_trt_input_ptr(), nullptr); +} + +TEST_F(RealtimePipelineTest, SkipTrtGraphExecNull_BeforeCapture) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_executable_graph(), nullptr); +} + +// ============================================================================ +// AIPreDecoderService Unit Tests (SKIP_TRT) +// ============================================================================ + +TEST_F(RealtimePipelineTest, PreDecoderConstruction) { + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_host_ready_flags(), nullptr); + EXPECT_NE(pd->get_host_ring_ptrs(), nullptr); + EXPECT_EQ(pd->get_queue_depth(), 1); + EXPECT_EQ(pd->get_input_size(), kSkipTrtBytes); + EXPECT_EQ(pd->get_output_size(), kSkipTrtBytes); +} + +TEST_F(RealtimePipelineTest, PreDecoderGraphCaptured) { + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_executable_graph(), nullptr); +} + +TEST_F(RealtimePipelineTest, PollReturnsFalseWhenIdle) { + auto pd = create_predecoder(0); + PreDecoderJob job{}; + EXPECT_FALSE(pd->poll_next_job(job)); +} + +TEST_F(RealtimePipelineTest, PollAndRelease) { + auto pd = create_predecoder(0); + + auto *flags = pd->get_host_ready_flags(); + flags[0].store(1, cuda::std::memory_order_release); + + PreDecoderJob job{}; + EXPECT_TRUE(pd->poll_next_job(job)); + EXPECT_EQ(job.slot_idx, 0); + EXPECT_NE(job.inference_data, nullptr); + + int val = flags[0].load(cuda::std::memory_order_acquire); + 
EXPECT_EQ(val, 2); + + pd->release_job(0); + val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(val, 0); +} + +TEST_F(RealtimePipelineTest, GraphLaunchableFromHost) { + auto pd = create_predecoder(0); + cudaGraphExec_t exec = pd->get_executable_graph(); + ASSERT_NE(exec, nullptr); + + CUDA_CHECK(cudaGraphLaunch(exec, stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); +} + +// ============================================================================ +// Correctness Tests (Identity Passthrough) +// +// Data flow: payload -> (pre_launch DMA to d_trt_input_) -> +// passthrough_copy_kernel (identity) -> d_trt_output_ -> +// cudaMemcpyAsync -> d_outputs_ (mapped pinned) -> +// poll_next_job() -> inference_data +// ============================================================================ + +class CorrectnessTest : public RealtimePipelineTest { +protected: + void run_passthrough(AIPreDecoderService *pd, int mailbox_idx, + const float *payload, size_t num_floats, float *output) { + size_t payload_bytes = num_floats * sizeof(float); + ASSERT_LE(payload_bytes, kSkipTrtBytes); + + uint8_t *slot_host = rx_data_host_; + write_rpc_slot(slot_host, kTestFunctionId, payload, payload_bytes); + + ptrdiff_t offset = slot_host - rx_data_host_; + void *slot_dev = static_cast(rx_data_dev_ + offset); + + PreLaunchCopyCtx ctx; + ctx.d_trt_input = pd->get_trt_input_ptr(); + ctx.input_size = pd->get_input_size(); + ctx.h_ring_ptrs = pd->get_host_ring_ptrs(); + + pre_launch_input_copy(&ctx, slot_dev, stream_); + CUDA_CHECK(cudaGraphLaunch(pd->get_executable_graph(), stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); + + ASSERT_TRUE(wait_ready_flag(pd)); + + PreDecoderJob job{}; + ASSERT_TRUE(pd->poll_next_job(job)); + std::memcpy(output, job.inference_data, payload_bytes); + pd->release_job(0); + } +}; + +TEST_F(CorrectnessTest, IdentityPassthrough_Zeros) { + auto pd = create_predecoder(0); + float input[kSkipTrtFloats] = {}; + float output[kSkipTrtFloats]; + 
std::memset(output, 0xFF, sizeof(output)); + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Zero payload should pass through unchanged"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_KnownPattern) { + auto pd = create_predecoder(0); + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = static_cast(i + 1); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Known pattern {1..16} should pass through unchanged"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_RandomData) { + auto pd = create_predecoder(0); + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1e6f, 1e6f); + + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Random payload should pass through bitwise-identical"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_MaxValues) { + auto pd = create_predecoder(0); + std::vector input(kSkipTrtFloats); + const float extremes[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, + INFINITY, -INFINITY, NAN, 0.0f, + -0.0f, 1.0f, -1.0f, 1e-38f, + 1e38f, 3.14159265f, 2.71828183f, 0.5f}; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = extremes[i % (sizeof(extremes) / sizeof(extremes[0]))]; + std::vector output(kSkipTrtFloats, 0.0f); + + run_passthrough(pd.get(), 0, input.data(), kSkipTrtFloats, output.data()); + EXPECT_EQ(std::memcmp(input.data(), output.data(), kSkipTrtBytes), 0) + << "Extreme float values should pass through bitwise-identical"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_MultipleRequests) { + auto pd = create_predecoder(0); + constexpr int kNumRequests = 5000; + std::mt19937 rng(123); + 
std::uniform_real_distribution dist(-1e6f, 1e6f); + int failures = 0; + + for (int r = 0; r < kNumRequests; ++r) { + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + if (std::memcmp(input, output, kSkipTrtBytes) != 0) { + failures++; + if (failures <= 5) + ADD_FAILURE() << "Request " << r << ": output does not match input"; + } + } + EXPECT_EQ(failures, 0) << failures << " of " << kNumRequests + << " requests had mismatched output"; +} + +// ============================================================================ +// Host Dispatcher Unit Tests +// ============================================================================ + +class HostDispatcherTest : public RealtimePipelineTest { +protected: + void SetUp() override { + RealtimePipelineTest::SetUp(); + idle_mask_ = new atomic_uint64_sys(0); + live_dispatched_ = new atomic_uint64_sys(0); + inflight_slot_tags_ = new int[kMaxWorkers](); + shutdown_flag_ = new atomic_int_sys(0); + stats_counter_ = 0; + function_table_ = new cudaq_function_entry_t[kMaxWorkers]; + std::memset(function_table_, 0, + kMaxWorkers * sizeof(cudaq_function_entry_t)); + } + + void TearDown() override { + if (!loop_stopped_) { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + } + for (auto &s : worker_streams_) { + if (s) + cudaStreamDestroy(s); + } + delete idle_mask_; + delete live_dispatched_; + delete[] inflight_slot_tags_; + delete shutdown_flag_; + delete[] function_table_; + RealtimePipelineTest::TearDown(); + } + + void add_worker(uint32_t function_id, cudaGraphExec_t exec, + PreLaunchCopyCtx *plc = nullptr) { + cudaStream_t s = nullptr; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + worker_streams_.push_back(s); + + cudaq_host_dispatch_worker_t w{}; + w.graph_exec = exec; + w.stream = s; + 
w.function_id = function_id; + w.pre_launch_fn = plc ? pre_launch_input_copy : nullptr; + w.pre_launch_data = plc; + workers_.push_back(w); + + size_t idx = ft_count_; + function_table_[idx].handler.graph_exec = exec; + function_table_[idx].function_id = function_id; + function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + ft_count_++; + } + + void start_loop() { + idle_mask_->store((1ULL << workers_.size()) - 1, + cuda::std::memory_order_release); + + std::memset(&config_, 0, sizeof(config_)); + config_.rx_flags = rx_flags_host_; + config_.tx_flags = tx_flags_host_; + config_.rx_data_host = rx_data_host_; + config_.rx_data_dev = rx_data_dev_; + config_.tx_data_host = tx_data_host_; + config_.tx_data_dev = tx_data_dev_; + config_.tx_stride_sz = kSlotSize; + config_.h_mailbox_bank = mailbox_bank_host_; + config_.num_slots = kNumSlots; + config_.slot_size = kSlotSize; + config_.workers = workers_.data(); + config_.num_workers = workers_.size(); + config_.function_table = function_table_; + config_.function_table_count = ft_count_; + config_.shutdown_flag = shutdown_flag_; + config_.stats_counter = &stats_counter_; + config_.live_dispatched = live_dispatched_; + config_.idle_mask = idle_mask_; + config_.inflight_slot_tags = inflight_slot_tags_; + + loop_thread_ = std::thread([this]() { + cudaq_host_dispatcher_loop(&config_); + }); + } + + void stop_loop() { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + } + + void restore_worker(int id) { + idle_mask_->fetch_or(1ULL << id, cuda::std::memory_order_release); + } + + bool poll_tx_flag(size_t slot, int timeout_ms = 2000) { + auto *flags = reinterpret_cast(tx_flags_host_); + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + uint64_t val = flags[slot].load(cuda::std::memory_order_acquire); + if 
(val != 0) + return true; + usleep(100); + } + return false; + } + + void clear_tx_flag(size_t slot) { + auto *flags = reinterpret_cast(tx_flags_host_); + flags[slot].store(0, cuda::std::memory_order_release); + } + + atomic_uint64_sys *idle_mask_ = nullptr; + atomic_uint64_sys *live_dispatched_ = nullptr; + int *inflight_slot_tags_ = nullptr; + atomic_int_sys *shutdown_flag_ = nullptr; + uint64_t stats_counter_ = 0; + bool loop_stopped_ = false; + + cudaq_function_entry_t *function_table_ = nullptr; + size_t ft_count_ = 0; + std::vector workers_; + std::vector worker_streams_; + cudaq_host_dispatcher_config_t config_; + std::thread loop_thread_; +}; + +TEST_F(HostDispatcherTest, ShutdownImmediate) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + + shutdown_flag_->store(1, cuda::std::memory_order_release); + start_loop(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + + EXPECT_EQ(stats_counter_, 0u); +} + +TEST_F(HostDispatcherTest, ShutdownClean) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); + usleep(10000); + stop_loop(); + EXPECT_EQ(stats_counter_, 0u); +} + +TEST_F(HostDispatcherTest, StatsCounter) { + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kN = 5; + for (int i = 0; i < kN; ++i) { + size_t slot = static_cast(i % kNumSlots); + if (i > 0) + clear_tx_flag((i - 1) % kNumSlots); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob 
job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } + + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kN)); +} + +TEST_F(HostDispatcherTest, InvalidMagicDropped) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); + + uint8_t *slot_host = rx_data_host_; + rt::RPCHeader bad_hdr; + bad_hdr.magic = 0xDEADBEEF; + bad_hdr.function_id = kTestFunctionId; + bad_hdr.arg_len = 4; + std::memcpy(slot_host, &bad_hdr, sizeof(bad_hdr)); + + auto *flags = reinterpret_cast(rx_flags_host_); + flags[0].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); + + usleep(50000); + + uint64_t rx_val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(rx_val, 0u) << "Invalid magic should be consumed (rx_flag cleared)"; + + stop_loop(); + EXPECT_EQ(stats_counter_, 0u) + << "Invalid magic should not count as dispatched"; +} + +TEST_F(HostDispatcherTest, SlotWraparound) { + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kTotal = static_cast(kNumSlots) + 2; + for (int i = 0; i < kTotal; ++i) { + size_t slot = static_cast(i % kNumSlots); + + auto *rx = reinterpret_cast(rx_flags_host_); + while (rx[slot].load(cuda::std::memory_order_acquire) != 0) + usleep(100); + clear_tx_flag(slot); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) + << "Timeout on request " << i << " (slot " << slot << ")"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } + + stop_loop(); + 
EXPECT_EQ(stats_counter_, static_cast<uint64_t>(kTotal));
}

// ============================================================================
// Integration Tests
// ============================================================================

TEST_F(HostDispatcherTest, SingleRequestRoundTrip) {
  auto pd = create_predecoder(0);
  PreLaunchCopyCtx plc;
  plc.d_trt_input = pd->get_trt_input_ptr();
  plc.input_size = pd->get_input_size();
  plc.h_ring_ptrs = pd->get_host_ring_ptrs();
  add_worker(kTestFunctionId, pd->get_executable_graph(), &plc);
  start_loop();

  float input[kSkipTrtFloats];
  for (size_t i = 0; i < kSkipTrtFloats; ++i)
    input[i] = static_cast<float>(i + 1);
  submit_rpc_to_slot(0, kTestFunctionId, input, kSkipTrtBytes);

  ASSERT_TRUE(poll_tx_flag(0)) << "Timeout waiting for dispatcher to process";
  CUDA_CHECK(cudaDeviceSynchronize());

  ASSERT_TRUE(wait_ready_flag(pd.get())) << "Predecoder ready flag not set";

  PreDecoderJob job{};
  ASSERT_TRUE(pd->poll_next_job(job));
  float output[kSkipTrtFloats];
  std::memcpy(output, job.inference_data, kSkipTrtBytes);
  pd->release_job(0);

  EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0)
      << "Round-trip data should match (identity passthrough)";

  stop_loop();
  EXPECT_EQ(stats_counter_, 1u);
}

TEST_F(HostDispatcherTest, MultiPredecoderConcurrency) {
  constexpr int kNPd = 4;
  std::vector<std::unique_ptr<AIPreDecoderService>> pds;
  std::vector<PreLaunchCopyCtx> plcs(kNPd);
  // NOTE(review): element type lost in transit; uint32_t matches the
  // function_id parameter of add_worker — confirm fnv1a_hash's return type.
  std::vector<uint32_t> fids;

  // One predecoder + worker per mailbox slot, each with a distinct hashed
  // function id so requests route to the right worker.
  for (int i = 0; i < kNPd; ++i) {
    pds.push_back(create_predecoder(i));
    std::string name = "predecoder_" + std::to_string(i);
    fids.push_back(rt::fnv1a_hash(name.c_str()));
    plcs[i].d_trt_input = pds[i]->get_trt_input_ptr();
    plcs[i].input_size = pds[i]->get_input_size();
    plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs();
    add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]);
  }
  start_loop();

  float inputs[kNPd][kSkipTrtFloats];
  for (int i = 0; i < kNPd; ++i)
    for (size_t j = 0; j < kSkipTrtFloats; ++j)
      inputs[i][j] = static_cast<float>(i * 100 + j);

  // Submit all requests before collecting any result.
  for (int i = 0; i < kNPd; ++i)
    submit_rpc_to_slot(static_cast<size_t>(i), fids[i], inputs[i],
                       kSkipTrtBytes);

  for (int i = 0; i < kNPd; ++i)
    ASSERT_TRUE(poll_tx_flag(static_cast<size_t>(i)))
        << "Timeout on predecoder " << i;
  CUDA_CHECK(cudaDeviceSynchronize());

  for (int i = 0; i < kNPd; ++i) {
    ASSERT_TRUE(wait_ready_flag(pds[i].get()))
        << "Ready flag not set for predecoder " << i;
    PreDecoderJob job{};
    ASSERT_TRUE(pds[i]->poll_next_job(job));
    float output[kSkipTrtFloats];
    std::memcpy(output, job.inference_data, kSkipTrtBytes);
    pds[i]->release_job(0);

    EXPECT_EQ(std::memcmp(inputs[i], output, kSkipTrtBytes), 0)
        << "Predecoder " << i << ": output should match input";
  }

  stop_loop();
  EXPECT_EQ(stats_counter_, static_cast<uint64_t>(kNPd));
}

TEST_F(HostDispatcherTest, SustainedThroughput_200Requests) {
  constexpr int kNPd = 2;
  constexpr int kTotalRequests = 200;

  std::vector<std::unique_ptr<AIPreDecoderService>> pds;
  std::vector<PreLaunchCopyCtx> plcs(kNPd);
  std::vector<uint32_t> fids;

  for (int i = 0; i < kNPd; ++i) {
    pds.push_back(create_predecoder(i));
    std::string name = "sustained_pd_" + std::to_string(i);
    fids.push_back(rt::fnv1a_hash(name.c_str()));
    plcs[i].d_trt_input = pds[i]->get_trt_input_ptr();
    plcs[i].input_size = pds[i]->get_input_size();
    plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs();
    add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]);
  }
  start_loop();

  std::mt19937 rng(999);
  std::uniform_real_distribution<float> dist(-10.0f, 10.0f);
  int completed = 0;

  for (int r = 0; r < kTotalRequests; ++r) {
    int pd_idx = r % kNPd;        // alternate between the two predecoders
    size_t slot = static_cast<size_t>(r % kNumSlots);

    // Wait for the dispatcher to drain the slot's previous request.
    auto *rx = reinterpret_cast<atomic_uint64_sys *>(rx_flags_host_);
    auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5);
    while (rx[slot].load(cuda::std::memory_order_acquire) != 0) {
      if (std::chrono::steady_clock::now() > deadline)
        FAIL() << "Timeout waiting for slot " << slot << " to clear at request "
               << r;
      usleep(100);
    }
    clear_tx_flag(slot);

    float payload[kSkipTrtFloats];
    for (size_t i = 0; i < kSkipTrtFloats; ++i)
      payload[i] = dist(rng);

    submit_rpc_to_slot(slot, fids[pd_idx], payload, kSkipTrtBytes);

    ASSERT_TRUE(poll_tx_flag(slot))
        << "Timeout on request " << r << " (slot " << slot << ")";
    CUDA_CHECK(cudaDeviceSynchronize());

    ASSERT_TRUE(wait_ready_flag(pds[pd_idx].get()))
        << "Ready flag not set for request " << r;
    PreDecoderJob job{};
    if (pds[pd_idx]->poll_next_job(job))
      pds[pd_idx]->release_job(0);

    restore_worker(pd_idx);
    completed++;
  }

  stop_loop();
  EXPECT_EQ(completed, kTotalRequests);
  EXPECT_EQ(stats_counter_, static_cast<uint64_t>(kTotalRequests));
}

} // namespace