From b3f36f40a45cb99b1287fbd4ca3f9801f1d91c6c Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 18 Feb 2026 20:51:20 +0000 Subject: [PATCH 01/40] Add realtime GPU dispatch kernel library with RPC-based function dispatch Introduce the cudaq-realtime library under realtime/, providing infrastructure for low-latency GPU-accelerated realtime coprocessing between FPGA/CPU and GPU systems in the NVQLink architecture. Key components: - C-compatible host API (cudaq_realtime.h) with dispatch manager/dispatcher lifecycle management (create, configure ring buffers, start/stop) - Persistent GPU dispatch kernel that polls a ring buffer for incoming RPC requests and dispatches to registered handlers via function table lookup using FNV-1a hashed function IDs - Two dispatch modes: DeviceCallMode (direct __device__ function calls) and GraphLaunchMode (device-side cudaGraphLaunch with backpressure and single-launch guards, requires sm_80+) - Two kernel synchronization strategies: RegularKernel (__syncthreads) and CooperativeKernel (grid-wide cooperative_groups sync) - Schema-driven type system for RPC argument/result descriptors - Shared library (libcudaq-realtime.so) for the host API and static library (libcudaq-realtime-dispatch.a) for GPU kernel device code - GTest-based unit tests covering device-call dispatch, host API integration, and device-side graph launch Signed-off-by: Scott Thornton --- realtime/.clang-format | 12 + realtime/.gitignore | 99 +++ realtime/CMakeLists.txt | 130 ++++ realtime/README.md | 41 ++ .../daemon/dispatcher/cudaq_realtime.h | 219 ++++++ .../daemon/dispatcher/dispatch_kernel.cuh | 70 ++ .../dispatcher/dispatch_kernel_launch.h | 105 +++ .../daemon/dispatcher/dispatch_modes.h | 64 ++ .../nvqlink/daemon/dispatcher/kernel_types.h | 35 + realtime/lib/CMakeLists.txt | 17 + realtime/lib/daemon/CMakeLists.txt | 76 ++ .../daemon/dispatcher/cudaq_realtime_api.cpp | 202 +++++ .../lib/daemon/dispatcher/dispatch_kernel.cu | 454 ++++++++++++ realtime/unittests/CMakeLists.txt | 78 ++ realtime/unittests/test_dispatch_kernel.cu | 693 ++++++++++++++++++ 15 files changed, 2295 insertions(+) create mode 100644 realtime/.clang-format create mode 100644 realtime/.gitignore create mode 100644 realtime/CMakeLists.txt create mode 100644 realtime/README.md create mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h create mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh create mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h create mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h create mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h create mode 100644 realtime/lib/CMakeLists.txt create mode 100644 realtime/lib/daemon/CMakeLists.txt create mode 100644 realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp create mode 100644 realtime/lib/daemon/dispatcher/dispatch_kernel.cu create mode 100644 realtime/unittests/CMakeLists.txt create mode 100644 realtime/unittests/test_dispatch_kernel.cu diff --git a/realtime/.clang-format b/realtime/.clang-format new file mode 100644 index 00000000..4b5d84be --- /dev/null +++ b/realtime/.clang-format @@ -0,0 +1,12 @@ +BasedOnStyle: LLVM +AlwaysBreakTemplateDeclarations: Yes +IncludeCategories: + - Regex: '^<' + Priority: 4 + - Regex: '^"cudaq/' + Priority: 3 + - Regex: '^"(nvqlink|\.\.)/' + Priority: 2 + - Regex: '.*' + Priority: 1 +InsertNewlineAtEOF: Yes diff --git a/realtime/.gitignore b/realtime/.gitignore new file mode 100644 index 00000000..ccec909e --- /dev/null +++ b/realtime/.gitignore @@ -0,0 +1,99 @@ +# Editor backup files +*~ + +# Patch files +*.orig +*.rej + +# Compiled Object files +*.slo +*.lo +*.o +*.obj +*.x +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +**/Output/ +**/.lit*.txt + +# Executables +*.exe +*.out +*.app +**/out/ +/*build*/ +/*Build/ +/plugins/ +/other_library_builds/ +/.cproject +/.project +/.settings/ +**/*.jar +**/.ptp* +*.ab +/dist/ +/*egg*/ +/python/*egg* +/*tmp*/ +/wheelhouse/ +**/.ipynb_checkpoints +compile_commands.json +**/*.dat +**/.antlr +__pycache__/ + +# IDE files +.vscode/* +.theia/* + +# Container files +**/.docker/* + +# LSP files +.cache/* + +# LLVM/MLIR files +*.ll +*.bc + +# Build results +[Bb]in/ +[Oo]bj/ +*.bson +*.csv +*.bin +docs/sphinx/_doxygen +docs/sphinx/_mdgen +**/_build/* +**/_skbuild/* +_version.py + +# third party integrations +simulators/ +apps/ + +# macOS +.DS_Store + +# JetBrains IDE files +.idea + +# vim files +*.tmp diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt new file mode 100644 index 00000000..53db32b2 --- /dev/null +++ b/realtime/CMakeLists.txt @@ -0,0 +1,130 @@ +# ============================================================================ # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Requiring the same version as the others. +cmake_minimum_required(VERSION 3.28 FATAL_ERROR) + +include(FetchContent) + +# Set a default build type if none was specified. Must set this before +# project(). +set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") + +# Set a default install prefix if none was specified. +set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.nvqlink" CACHE STRING + "Install path prefix, prepended onto install directories") + +# Project setup +# ============================================================================== + +# Check if core is built as a standalone project. +project(cudaq-nvqlink) +set(CUDAQ_NVQLINK_STANDALONE_BUILD TRUE) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# The following must go after `project(...)` +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + +set(CUDAQ_NVQLINK_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(CUDAQ_NVQLINK_INCLUDE_DIR ${CUDAQ_NVQLINK_SOURCE_DIR}/include) + +# Add cmake directory to module path for custom Find modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +# Options +# ============================================================================== + +option(NVQLINK_BUILD_TESTS + "Generate build targets for the NVQLINK unit tests" ON) +option(NVQLINK_BUILD_EXAMPLES + "Generate build targets for the NVQLINK example programs" ON) +option(NVQLINK_ENABLE_ROCE + "Enable RoCE backend using libibverbs" OFF) +option(NVQLINK_ENABLE_DOCA + "Enable DOCA GPUNetIO backend for GPU-controlled RDMA" OFF) + +# Profiler backend selection +set(NVQLINK_PROFILER_BACKEND "NONE" CACHE STRING "Profiler backend (NONE, NVTX, TRACY)") +set_property(CACHE NVQLINK_PROFILER_BACKEND PROPERTY STRINGS NONE NVTX TRACY) + +# Logging backend selection +set(NVQLINK_LOGGING_BACKEND "NONE" CACHE STRING "Logging backend (NONE, QUILL)") +set_property(CACHE NVQLINK_LOGGING_BACKEND PROPERTY STRINGS NONE QUILL) + +# Compile-time log level filtering (lower levels become no-ops) +set(NVQLINK_LOGGING_LEVEL "INFO" CACHE STRING "Minimum log level (TRACE, DEBUG, INFO, WARNING, ERROR)") +set_property(CACHE NVQLINK_LOGGING_LEVEL PROPERTY STRINGS TRACE DEBUG INFO WARNING ERROR) + +# Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt) +# ============================================================================== +include(CheckLanguage) +check_language(CUDA) +set(CUDA_FOUND FALSE) +# Generate -gencode arch=compute_XX,code=sm_XX for list of supported +# arch values. +# List should be sorted in increasing order. +function(CUDA_get_gencode_args out_args_string arch_values) + # allow the user to pass the list like a normal variable + set(arch_list ${arch_values} ${ARGN}) + set(out "") + foreach(arch IN LISTS arch_list) + set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}") + endforeach(arch) + + # Repeat the last one as to ensure the generation of PTX for most + # recent virtual architecture for forward compatibility + list(GET arch_list -1 last_arch) + set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}") + set(${out_args_string} ${out} PARENT_SCOPE) +endfunction() + +if(CMAKE_CUDA_COMPILER) + if (NOT CUDA_TARGET_ARCHS) + # Ampere, Ada Lovelace, Hopper + set(CUDA_TARGET_ARCHS "80;89;90") + endif() + CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options -fPIC") + + enable_language(CUDA) + set(CUDA_FOUND TRUE) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + find_package(CUDAToolkit REQUIRED) + message(STATUS "Cuda language found.") +endif() + +# External Dependencies +# ============================================================================== + +find_package(Threads REQUIRED) + +add_subdirectory(lib) + +if (NVQLINK_BUILD_EXAMPLES) + message(STATUS "RoCE/DOCA examples removed for RPC dispatch workflow.") +endif() + +if (NVQLINK_BUILD_TESTS) + add_custom_target(NVQLINKUnitTests) + include(CTest) + + add_custom_target(run_tests + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH="${CUDAQ_INSTALL_DIR}:${CMAKE_BINARY_DIR}/python" + ${CMAKE_CTEST_COMMAND} --output-on-failure + DEPENDS NVQLINKUnitTests + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + add_subdirectory(unittests) +endif() + diff --git a/realtime/README.md b/realtime/README.md new file mode 100644 index 00000000..5fec3286 --- /dev/null +++ b/realtime/README.md @@ -0,0 +1,41 @@ +# CUDA-Q Realtime Library + +CUDA-Q Realtime is a library for tightly coupling GPU accelerated compute to the control system of a quantum processor. +It fulfills two primary responsibilities: +1. It provides the low-level basis of realtime coprocessing between FPGA and CPU-GPU systems. +1. It provides the low latency networking stack of the NVQLink architecture, enabling system integrators to achieve few-microsecond data round trips between FPGA and GPU. + +> [!WARNING] +> This library is currently in early access / alpha stage and will continue to rapidly evolve as we build interactively with collaborators. + +> [!NOTE] +> While the library is in early access, instructions to reproduce the FPGA-GPU latency round trip on third party systems can be found at [docs/nvqlink_latency_demo.md](docs/nvqlink_latency_demo.md). + +## Getting Started + +```bash +# Configure, need cmake 3.28+ +cmake -G Ninja .. -DNVQLINK_BUILD_TESTS=ON +# Build +ninja +# Test +ctest +``` + +## Extending the library + +Check out the tests in the `unittests` folder as well as the example codes in `examples`. + +3rd parties can extend this library with new `device` types. The goal is to define +a subclass of `device_mixin` that allows you specify device traits that your `device` exposes. +There are a number of traits available, and they are specified in the `device.h` file. There are +example devices in the `devices/` folder there too. + +3rd parties can also provide custom compiler implementations. Compilers take generic +code strings and return a `compiled_kernel`. There is one compiler implemented as of +today, and it is the CUDA-Q compiler. For simplicity, this compiler simply delegates to +the command line CUDA-Q toolchain. Subclasses should be able to override the `cudaq-opt` +pass flags. This would allow one to handle CUDA-Q IR operations in a target specific manner +(e.g., custom lowering of the device_call op). + + diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h new file mode 100644 index 00000000..98459c98 --- /dev/null +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h @@ -0,0 +1,219 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handles +typedef struct cudaq_dispatch_manager_t cudaq_dispatch_manager_t; +typedef struct cudaq_dispatcher_t cudaq_dispatcher_t; + +// Error codes +typedef enum { + CUDAQ_OK = 0, + CUDAQ_ERR_INVALID_ARG = 1, + CUDAQ_ERR_INTERNAL = 2, + CUDAQ_ERR_CUDA = 3 +} cudaq_status_t; + +// Kernel synchronization type +typedef enum { + CUDAQ_KERNEL_REGULAR = 0, + CUDAQ_KERNEL_COOPERATIVE = 1 +} cudaq_kernel_type_t; + +// Dispatch invocation mode +typedef enum { + CUDAQ_DISPATCH_DEVICE_CALL = 0, + CUDAQ_DISPATCH_GRAPH_LAUNCH = 1 +} cudaq_dispatch_mode_t; + +// Payload type identifiers (matching PayloadTypeID in dispatch_kernel_launch.h) +typedef enum { + CUDAQ_TYPE_UINT8 = 0x10, + CUDAQ_TYPE_INT32 = 0x11, + CUDAQ_TYPE_INT64 = 0x12, + CUDAQ_TYPE_FLOAT32 = 0x13, + CUDAQ_TYPE_FLOAT64 = 0x14, + CUDAQ_TYPE_ARRAY_UINT8 = 0x20, + CUDAQ_TYPE_ARRAY_INT32 = 0x21, + CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, + CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, + CUDAQ_TYPE_BIT_PACKED = 0x30 +} cudaq_payload_type_t; + +// Type descriptor for arguments/results +typedef struct { + uint8_t type_id; // cudaq_payload_type_t value + uint8_t reserved[3]; // padding + uint32_t size_bytes; // total size in bytes + uint32_t num_elements; // number of elements (for arrays) +} cudaq_type_desc_t; + +// Handler schema describing function signature +typedef struct { + uint8_t num_args; // number of arguments + uint8_t num_results; // number of results + uint16_t reserved; // padding + cudaq_type_desc_t args[8]; // argument descriptors (max 8) + cudaq_type_desc_t results[4]; // result descriptors (max 4) +} cudaq_handler_schema_t; + +// Dispatcher configuration +typedef struct { + int device_id; // GPU device ID (>=0) + uint32_t num_blocks; // grid size + uint32_t threads_per_block; // block size + uint32_t num_slots; // ring buffer slots + uint32_t slot_size; // bytes per slot + uint32_t vp_id; // virtual port ID + cudaq_kernel_type_t kernel_type; // regular/cooperative kernel + cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch +} cudaq_dispatcher_config_t; + +// GPU ring buffer pointers (device-visible mapped pointers) +typedef struct { + volatile uint64_t *rx_flags; // device pointer + volatile uint64_t *tx_flags; // device pointer +} cudaq_ringbuffer_t; + +// Unified function table entry with schema +typedef struct { + union { + void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL + cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH + } handler; + uint32_t function_id; // hash of function name (FNV-1a) + uint8_t dispatch_mode; // cudaq_dispatch_mode_t value + uint8_t reserved[3]; // padding + cudaq_handler_schema_t schema; // function signature schema + + // Graph-launch backpressure metadata + // Only meaningful when dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH. + // Set to 0/NULL for DEVICE_CALL entries or when backpressure is not needed. + uint32_t mailbox_idx; // index into global_mailbox_bank + uint32_t _pad0; // alignment padding + int *d_queue_idx; // device pointer to queue tail tracker + volatile int *d_ready_flags; // device-mapped pointer to ready flags + volatile int *d_inflight_flag; // 0 = idle, 1 = graph in flight (single-launch guard) +} cudaq_function_entry_t; + +// Function table for device-side dispatch +typedef struct { + cudaq_function_entry_t *entries; // device pointer to array of entries + uint32_t count; // number of entries +} cudaq_function_table_t; + +// Host launch function pointer type +typedef void (*cudaq_dispatch_launch_fn_t)( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a) +void cudaq_launch_dispatch_kernel_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +void cudaq_launch_dispatch_kernel_cooperative( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Graph-enabled dispatch kernels (requires compute capability 8.0+, sm_80+) +// Device-side cudaGraphLaunch is available on sm_80 and higher (CUDA 13+) +#if defined(__CUDACC__) || defined(CUDA_VERSION) + +//============================================================================== +// Graph-Based Dispatch API (Proper Device-Side Graph Launch Support) +//============================================================================== +// +// These functions properly support device-side cudaGraphLaunch() by wrapping +// the dispatch kernel in a graph that is instantiated with +// cudaGraphInstantiateFlagDeviceLaunch. +// +// Usage: +// 1. Call cudaq_create_dispatch_graph_regular() to create the graph context +// 2. Call cudaq_launch_dispatch_graph() to launch the dispatch kernel +// 3. When done, call cudaq_destroy_dispatch_graph() to cleanup +// +// The dispatch kernel running inside this graph CAN call cudaGraphLaunch() +// to launch child graphs using cudaStreamGraphFireAndForget or other modes. + +// Opaque handle for graph-based dispatch context +typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; + +// Create a graph-based dispatch context for the regular kernel type. +// This creates a graph containing the dispatch kernel, instantiates it with +// cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. +// Returns cudaSuccess on success, or an error code on failure. +cudaError_t cudaq_create_dispatch_graph_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, + cudaq_function_entry_t *function_table, size_t func_count, + void **global_mailbox_bank, + volatile int *shutdown_flag, uint64_t *stats, + size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, + cudaStream_t stream, cudaq_dispatch_graph_context **out_context); + +// Launch the dispatch graph. The dispatch kernel inside this graph can call +// cudaGraphLaunch() to launch child graphs from device code. +cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, + cudaStream_t stream); + +// Destroy the dispatch graph context and release all resources. +cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); + +#endif + +// Manager lifecycle +cudaq_status_t +cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr); +cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr); + +// Dispatcher lifecycle +cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *mgr, + const cudaq_dispatcher_config_t *config, + cudaq_dispatcher_t **out_dispatcher); +cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher); + +// Wiring inputs +cudaq_status_t +cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, + const cudaq_ringbuffer_t *ringbuffer); +cudaq_status_t +cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, + const cudaq_function_table_t *table); +cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, + volatile int *shutdown_flag, + uint64_t *stats); +cudaq_status_t +cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, + cudaq_dispatch_launch_fn_t launch_fn); + +// Start/stop +cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher); +cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher); + +// Stats +cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, + uint64_t *out_packets); + +#ifdef __cplusplus +} +#endif diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh new file mode 100644 index 00000000..0e3a028d --- /dev/null +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh @@ -0,0 +1,70 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +/// @file dispatch_kernel.cuh +/// @brief Dispatch kernel declarations for external projects. +/// +/// The dispatch kernel implementation now lives in a separate CUDA TU +/// (dispatch_kernel.cu) and is linked into libcudaq-realtime.so. This header +/// provides declarations and inline wrappers for the launch functions. + +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" + +#include +#include + +namespace cudaq::nvqlink { + +//============================================================================== +// Kernel Launch Function Declarations (with schema-driven function table) +//============================================================================== +// These declarations match the extern "C" functions defined in dispatch_kernel.cu +// and cudaq_realtime.h + +/// @brief Inline wrapper for regular kernel (schema-aware). +inline void launch_dispatch_kernel_regular_inline( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, function_table, func_count, + shutdown_flag, stats, num_slots, + num_blocks, threads_per_block, stream); +} + +/// @brief Inline wrapper for cooperative kernel (schema-aware). +inline void launch_dispatch_kernel_cooperative_inline( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_cooperative( + rx_flags, tx_flags, function_table, func_count, + shutdown_flag, stats, num_slots, + num_blocks, threads_per_block, stream); +} + +} // namespace cudaq::nvqlink diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h new file mode 100644 index 00000000..18288fbf --- /dev/null +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h @@ -0,0 +1,105 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace cudaq::nvqlink { + +//============================================================================== +// RPC Protocol Structures (Wire Format) +//============================================================================== + +/// @brief RPC request header - wire format for function dispatch. +/// Must be wire-compatible with cuda-quantum RPC protocol. +struct __attribute__((packed)) RPCHeader { + std::uint32_t magic; ///< Magic value to validate message framing + std::uint32_t function_id; ///< Hash of function name (FNV-1a) + std::uint32_t arg_len; ///< Length of argument data in bytes +}; + +/// @brief RPC response header - returned to caller. +struct __attribute__((packed)) RPCResponse { + std::uint32_t magic; ///< Magic value to validate message framing + std::int32_t status; ///< Return status (0 = success) + std::uint32_t result_len; ///< Length of result data in bytes +}; + +//============================================================================== +// Device Function Type +//============================================================================== + +/// @brief Device RPC function signature. +/// @param buffer Pointer to argument/result buffer +/// @param arg_len Length of argument data +/// @param max_result_len Maximum result buffer size +/// @param result_len Output: actual result length +/// @return Status code (0 = success) +using DeviceRPCFunction = int (*)(void *buffer, std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len); + +//============================================================================== +// Function ID Hashing +//============================================================================== + +/// @brief Compute FNV-1a hash of a string (for function_id). +/// @param str Null-terminated string to hash +/// @return 32-bit hash value +constexpr std::uint32_t fnv1a_hash(const char *str) { + std::uint32_t hash = 2166136261u; + while (*str) { + hash ^= static_cast(*str++); + hash *= 16777619u; + } + return hash; +} + +// RPC framing magic values (ASCII: CUQ?). +constexpr std::uint32_t RPC_MAGIC_REQUEST = 0x43555152; // 'CUQR' +constexpr std::uint32_t RPC_MAGIC_RESPONSE = 0x43555153; // 'CUQS' + +//============================================================================== +// Schema-Driven Type System +//============================================================================== + +/// @brief Standardized payload type identifiers for RPC arguments/results. +enum PayloadTypeID : std::uint8_t { + TYPE_UINT8 = 0x10, + TYPE_INT32 = 0x11, + TYPE_INT64 = 0x12, + TYPE_FLOAT32 = 0x13, + TYPE_FLOAT64 = 0x14, + TYPE_ARRAY_UINT8 = 0x20, + TYPE_ARRAY_INT32 = 0x21, + TYPE_ARRAY_FLOAT32 = 0x22, + TYPE_ARRAY_FLOAT64 = 0x23, + TYPE_BIT_PACKED = 0x30 +}; + +/// @brief Type descriptor for a single argument or result. +struct __attribute__((packed)) cudaq_type_desc_t { + std::uint8_t type_id; ///< PayloadTypeID value + std::uint8_t reserved[3]; ///< Padding for alignment + std::uint32_t size_bytes; ///< Total size in bytes + std::uint32_t num_elements; ///< Number of elements (for arrays) +}; + +/// @brief Handler schema describing argument and result types. +struct __attribute__((packed)) cudaq_handler_schema_t { + std::uint8_t num_args; ///< Number of arguments + std::uint8_t num_results; ///< Number of results + std::uint16_t reserved; ///< Padding for alignment + cudaq_type_desc_t args[8]; ///< Argument type descriptors (max 8) + cudaq_type_desc_t results[4]; ///< Result type descriptors (max 4) +}; + +} // namespace cudaq::nvqlink diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h new file mode 100644 index 00000000..83e0c843 --- /dev/null +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h @@ -0,0 +1,64 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include + +namespace cudaq::realtime { + +/// @brief Device call dispatch mode - direct __device__ function call. +/// +/// The handler function is called directly from within the dispatch kernel. +/// This is the simplest and lowest-latency dispatch mode, suitable for +/// lightweight handlers like simple decoders or data transformations. +struct DeviceCallMode { + /// @brief Dispatch to handler via direct device function call. + /// + /// @tparam HandlerFunc Function pointer type + /// @tparam ContextType Context structure type + /// @tparam Args Additional argument types + /// @param handler The __device__ function to call + /// @param ctx Handler context (matrices, dimensions, etc.) + /// @param args Additional arguments + template + __device__ static void dispatch(HandlerFunc handler, ContextType &ctx, + Args... args) { + handler(ctx, args...); + } +}; + +/// @brief Graph launch dispatch mode - launches a CUDA graph from device. +/// +/// The handler is a pre-captured CUDA graph that gets launched from the +/// persistent kernel. This is suitable for complex multi-kernel workflows +/// that benefit from graph optimization. +/// +/// NOTE: Requires the graph to be captured and stored in the context at +/// initialization time. The context must contain graph_exec handle. +struct GraphLaunchMode { + /// @brief Dispatch via CUDA graph launch from device. + /// + /// @tparam ContextType Context structure type (must have graph_exec member) + /// @param ctx Handler context containing the graph executable + template + __device__ static void dispatch(ContextType &ctx) { +// Device graph launch requires CUDA 13+ and compute capability 8.0+ +// The graph_exec must be a cudaGraphExec_t captured at initialization +#if __CUDA_ARCH__ >= 800 + // cudaGraphLaunch is available from device code on sm_80+ + // Note: This is a placeholder - actual implementation requires + // the graph_exec to be properly set up in the context + if (ctx.graph_exec != nullptr) { + cudaGraphLaunch(ctx.graph_exec, ctx.stream); + } +#endif + } +}; + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h new file mode 100644 index 00000000..e78ae558 --- /dev/null +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h @@ -0,0 +1,35 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace cudaq::realtime { + +/// @brief Regular kernel synchronization using __syncthreads(). +/// +/// Use this for single-block kernels or when only block-level synchronization +/// is needed. Suitable for simple decode handlers that don't require +/// grid-wide coordination. +struct RegularKernel { + /// @brief Synchronize threads within a block. + __device__ static void sync() { __syncthreads(); } +}; + +/// @brief Cooperative kernel synchronization using grid.sync(). +/// +/// Use this for multi-block kernels that need grid-wide synchronization, +/// such as complex decoders with data dependencies across blocks. +/// Requires kernel to be launched with cudaLaunchCooperativeKernel. +struct CooperativeKernel { + __device__ static void sync() { cooperative_groups::this_grid().sync(); } +}; + +} // namespace cudaq::realtime diff --git a/realtime/lib/CMakeLists.txt b/realtime/lib/CMakeLists.txt new file mode 100644 index 00000000..9193b29c --- /dev/null +++ b/realtime/lib/CMakeLists.txt @@ -0,0 +1,17 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +include(GNUInstallDirs) + +install(DIRECTORY ${CUDAQ_NVQLINK_INCLUDE_DIR}/cudaq + COMPONENT nvqlink-headers + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN "*.h" +) + +add_subdirectory(daemon) diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt new file mode 100644 index 00000000..5bd0e3f2 --- /dev/null +++ b/realtime/lib/daemon/CMakeLists.txt @@ -0,0 +1,76 @@ +# ============================================================================ # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# ============================================================================== +# Shared library for external consumers (libcudaq-realtime.so) +# ============================================================================== +# This shared library exports a C-compatible host API for wiring dispatchers +# and includes the GPU dispatch kernel device code. + +if(CUDA_FOUND) + set(CUDAQ_REALTIME_SOURCES + dispatcher/cudaq_realtime_api.cpp + ) + + add_library(cudaq-realtime SHARED ${CUDAQ_REALTIME_SOURCES}) + + target_include_directories(cudaq-realtime + PUBLIC + $ + $ + ) + + target_link_libraries(cudaq-realtime + PUBLIC + CUDA::cudart_static + ) + + target_compile_definitions(cudaq-realtime PUBLIC NVQLINK_HAVE_CUDA) + + set_target_properties(cudaq-realtime PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime + COMPONENT realtime-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + add_library(cudaq-realtime-dispatch STATIC dispatcher/dispatch_kernel.cu) + + target_include_directories(cudaq-realtime-dispatch + PUBLIC + $ + $ + ) + + # Link CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) + find_library(CUDADEVRT_LIBRARY cudadevrt + HINTS ${CUDAToolkit_LIBRARY_DIR} + REQUIRED + ) + + target_link_libraries(cudaq-realtime-dispatch + PUBLIC + CUDA::cudart_static + ${CUDADEVRT_LIBRARY} + ) + + set_target_properties(cudaq-realtime-dispatch PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime-dispatch + COMPONENT realtime-lib + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp new file mode 100644 index 00000000..28216781 --- /dev/null +++ b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp @@ -0,0 +1,202 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" + +#include +#include + +struct cudaq_dispatch_manager_t { + int reserved = 0; +}; + +struct cudaq_dispatcher_t { + cudaq_dispatcher_config_t config{}; + cudaq_ringbuffer_t ringbuffer{}; + cudaq_function_table_t table{}; + cudaq_dispatch_launch_fn_t launch_fn = nullptr; + volatile int *shutdown_flag = nullptr; + uint64_t *stats = nullptr; + cudaStream_t stream = nullptr; + bool running = false; +}; + +static bool is_valid_kernel_type(cudaq_kernel_type_t kernel_type) { + switch (kernel_type) { + case CUDAQ_KERNEL_REGULAR: + case CUDAQ_KERNEL_COOPERATIVE: + return true; + default: + return false; + } +} + +static bool is_valid_dispatch_mode(cudaq_dispatch_mode_t dispatch_mode) { + switch (dispatch_mode) { + case CUDAQ_DISPATCH_DEVICE_CALL: + case CUDAQ_DISPATCH_GRAPH_LAUNCH: + return true; + default: + return false; + } +} + +static cudaq_status_t validate_dispatcher(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->launch_fn || !dispatcher->shutdown_flag || + !dispatcher->stats) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->ringbuffer.rx_flags || !dispatcher->ringbuffer.tx_flags) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->table.entries || dispatcher->table.count == 0) + return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.num_blocks == 0 || + dispatcher->config.threads_per_block == 0 || + dispatcher->config.num_slots == 0 || dispatcher->config.slot_size == 0) + return CUDAQ_ERR_INVALID_ARG; + if (!is_valid_kernel_type(dispatcher->config.kernel_type) || + !is_valid_dispatch_mode(dispatcher->config.dispatch_mode)) + return CUDAQ_ERR_INVALID_ARG; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr) { + if (!out_mgr) + return CUDAQ_ERR_INVALID_ARG; + auto *mgr = new (std::nothrow) cudaq_dispatch_manager_t(); + if (!mgr) + return CUDAQ_ERR_INTERNAL; + *out_mgr = mgr; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr) { + delete mgr; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *, + const cudaq_dispatcher_config_t *config, + cudaq_dispatcher_t **out_dispatcher) { + if (!config || !out_dispatcher) + return CUDAQ_ERR_INVALID_ARG; + auto *dispatcher = new (std::nothrow) cudaq_dispatcher_t(); + if (!dispatcher) + return CUDAQ_ERR_INTERNAL; + dispatcher->config = *config; + *out_dispatcher = dispatcher; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + delete dispatcher; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, + const cudaq_ringbuffer_t *ringbuffer) { + if (!dispatcher || !ringbuffer) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->ringbuffer = *ringbuffer; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, + const cudaq_function_table_t *table) { + if (!dispatcher || !table) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->table = *table; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, + volatile int *shutdown_flag, + uint64_t *stats) { + if (!dispatcher || !shutdown_flag || !stats) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->shutdown_flag = shutdown_flag; + dispatcher->stats = stats; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, + cudaq_dispatch_launch_fn_t launch_fn) { + if (!dispatcher || !launch_fn) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->launch_fn = launch_fn; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { + auto status = validate_dispatcher(dispatcher); + if (status != CUDAQ_OK) + return status; + if (dispatcher->running) + return CUDAQ_OK; + + int device_id = dispatcher->config.device_id; + if (device_id < 0) + device_id = 0; + if (cudaSetDevice(device_id) != cudaSuccess) + return CUDAQ_ERR_CUDA; + if (cudaStreamCreate(&dispatcher->stream) != cudaSuccess) + return CUDAQ_ERR_CUDA; + + dispatcher->launch_fn( + dispatcher->ringbuffer.rx_flags, dispatcher->ringbuffer.tx_flags, + dispatcher->table.entries, dispatcher->table.count, + dispatcher->shutdown_flag, dispatcher->stats, + dispatcher->config.num_slots, dispatcher->config.num_blocks, + dispatcher->config.threads_per_block, dispatcher->stream); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error in dispatcher launch: %s (%d)\n", + cudaGetErrorString(err), err); + return CUDAQ_ERR_CUDA; + } + + dispatcher->running = true; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->running) + return CUDAQ_OK; + + int shutdown = 1; + if (cudaMemcpy(const_cast(dispatcher->shutdown_flag), &shutdown, + sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) + return CUDAQ_ERR_CUDA; + cudaStreamSynchronize(dispatcher->stream); + cudaStreamDestroy(dispatcher->stream); + dispatcher->stream = nullptr; + dispatcher->running = false; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, + uint64_t *out_packets) { + if (!dispatcher || !out_packets || !dispatcher->stats) + return CUDAQ_ERR_INVALID_ARG; + + if (cudaMemcpy(out_packets, dispatcher->stats, sizeof(uint64_t), + cudaMemcpyDeviceToHost) != cudaSuccess) + return CUDAQ_ERR_CUDA; + + return CUDAQ_OK; +} diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu new file mode 100644 index 00000000..1495902d --- /dev/null +++ b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu @@ -0,0 +1,454 @@ +// Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. +// All rights reserved. +// +// This source code and the accompanying materials are made available under +// the terms of the Apache License 2.0 which accompanies this distribution. + +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" + +#include +#include +#include + +namespace cudaq::nvqlink { + +//============================================================================== +// Dispatch Kernel Implementation (compiled into libcudaq-realtime.so) +//============================================================================== + +/// @brief Lookup function entry in table by function_id. +__device__ inline const cudaq_function_entry_t* dispatch_lookup_entry( + std::uint32_t function_id, + cudaq_function_entry_t* entries, + std::size_t entry_count) { + for (std::size_t i = 0; i < entry_count; ++i) { + if (entries[i].function_id == function_id) { + return &entries[i]; + } + } + return nullptr; +} + +/// @brief Dispatch kernel for DEVICE_CALL mode only (no graph launch support). +/// This kernel does not contain any device-side graph launch code, avoiding +/// compatibility issues on systems where cudaGraphLaunch is not supported. +template +__global__ void dispatch_kernel_device_call_only( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + std::uint64_t local_packet_count = 0; + std::size_t current_slot = 0; + + while (!(*shutdown_flag)) { + if (tid == 0) { + std::uint64_t rx_value = rx_flags[current_slot]; + if (rx_value != 0) { + + bool packet_consumed = false; + + void* data_buffer = reinterpret_cast(rx_value); + RPCHeader* header = static_cast(data_buffer); + + if (header->magic != RPC_MAGIC_REQUEST) { + packet_consumed = true; // Garbage data, consume it to clear it + } else { + const cudaq_function_entry_t* entry = dispatch_lookup_entry( + header->function_id, function_table, func_count); + + if (entry != nullptr && entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { + DeviceRPCFunction func = + reinterpret_cast(entry->handler.device_fn_ptr); + std::uint32_t result_len = 0; + std::uint32_t max_result_len = 1024; + void* arg_buffer = static_cast(header + 1); + int status = func(arg_buffer, header->arg_len, max_result_len, &result_len); + + RPCResponse* response = static_cast(data_buffer); + response->magic = RPC_MAGIC_RESPONSE; + response->status = status; + response->result_len = result_len; + + __threadfence_system(); + tx_flags[current_slot] = rx_value; + } + // Whether the entry was found or not, consume the packet + packet_consumed = true; + } + + if (packet_consumed) { + __threadfence_system(); + rx_flags[current_slot] = 0; + local_packet_count++; + } + current_slot = (current_slot + 1) % num_slots; + } + } + + KernelType::sync(); + + if ((local_packet_count & 0xFF) == 0) { + __threadfence_system(); + } + } + + if (tid == 0) { + atomicAdd(reinterpret_cast(stats), local_packet_count); + } +} + +/// @brief Dispatch kernel supporting both DEVICE_CALL and GRAPH_LAUNCH modes. +/// This kernel includes device-side graph launch code for sm_80+ (compute capability >= 8.0). +/// NOTE: Graph launch code is conditionally compiled based on __CUDA_ARCH__. +template +__global__ void dispatch_kernel_with_graph( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + void** global_mailbox_bank, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + std::uint64_t local_packet_count = 0; + std::size_t current_slot = 0; + + while (!(*shutdown_flag)) { + if (tid == 0) { + std::uint64_t rx_value = rx_flags[current_slot]; + if (rx_value != 0) { + + bool packet_consumed = false; + + void* data_buffer = reinterpret_cast(rx_value); + RPCHeader* header = static_cast(data_buffer); + + if (header->magic != RPC_MAGIC_REQUEST) { + packet_consumed = true; // Garbage data, consume it to clear it + } else { + const cudaq_function_entry_t* entry = dispatch_lookup_entry( + header->function_id, function_table, func_count); + + if (entry != nullptr) { + if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { + DeviceRPCFunction func = + reinterpret_cast(entry->handler.device_fn_ptr); + std::uint32_t result_len = 0; + std::uint32_t max_result_len = 1024; + void* arg_buffer = static_cast(header + 1); + int status = func(arg_buffer, header->arg_len, max_result_len, &result_len); + + RPCResponse* response = static_cast(data_buffer); + response->magic = RPC_MAGIC_RESPONSE; + response->status = status; + response->result_len = result_len; + + __threadfence_system(); + tx_flags[current_slot] = rx_value; + packet_consumed = true; + } +#if __CUDA_ARCH__ >= 800 + else if (entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) { + + int mailbox_idx = static_cast(entry->mailbox_idx); + + // --- SINGLE-LAUNCH GUARD (fixes review issue #1) --- + // Check d_inflight_flag first: if a previous graph execution + // is still in flight for this predecoder, skip it. The output + // kernel clears this flag when it finishes. + volatile int* d_inflight = entry->d_inflight_flag; + bool already_in_flight = (d_inflight != nullptr && *d_inflight == 1); + + // --- BACKPRESSURE CHECK --- + // Even if not in-flight, the CPU queue may be full. + bool queue_full = false; + if (!already_in_flight) { + int* d_queue_idx = entry->d_queue_idx; + volatile int* d_ready_flags = entry->d_ready_flags; + if (d_queue_idx != nullptr && d_ready_flags != nullptr) { + int current_tail = *d_queue_idx; + if (d_ready_flags[current_tail] == 1) { + queue_full = true; + } + } + } + // ------------------------------- + + if (already_in_flight || queue_full) { + // Do NOT launch. Packet stays in ring buffer for retry. + packet_consumed = false; + } else { + // CLEAR TO LAUNCH: set inflight flag, write mailbox, launch graph. + if (d_inflight != nullptr) { + *d_inflight = 1; + __threadfence_system(); // Ensure flag is visible before graph reads it + } + + if (global_mailbox_bank != nullptr) { + global_mailbox_bank[mailbox_idx] = data_buffer; + __threadfence_system(); + } + + cudaError_t launch_err = cudaGraphLaunch(entry->handler.graph_exec, cudaStreamGraphFireAndForget); + if (launch_err != cudaSuccess) { + // Launch failed: write error code to tx_flags for host diagnostics + // Error codes are small integers, distinguishable from valid pointers + tx_flags[current_slot] = 0xDEAD000000000000ULL | (uint64_t)launch_err; + __threadfence_system(); + // Roll back inflight flag since graph never ran + if (d_inflight != nullptr) { + *d_inflight = 0; + __threadfence_system(); + } + } + packet_consumed = true; + } + } +#endif // __CUDA_ARCH__ >= 800 + } else { + packet_consumed = true; // Unknown function, drop it + } + } + + // --- ADVANCE LOGIC --- + if (packet_consumed) { + __threadfence_system(); + rx_flags[current_slot] = 0; // Clear the slot ONLY if we launched it + local_packet_count++; + } + + // ALWAYS advance the slot pointer to keep checking other arrivals + // If we skipped a packet due to backpressure, we will loop back to it eventually. + current_slot = (current_slot + 1) % num_slots; + } + } + + KernelType::sync(); + + if ((local_packet_count & 0xFF) == 0) { + __threadfence_system(); + } + } + + if (tid == 0) { + atomicAdd(reinterpret_cast(stats), local_packet_count); + } +} + +} // namespace cudaq::nvqlink + +//============================================================================== +// Host Launch Functions +//============================================================================== + +extern "C" void cudaq_launch_dispatch_kernel_regular( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + // Use device-call-only kernel (no graph launch support) + cudaq::nvqlink::dispatch_kernel_device_call_only + <<>>( + rx_flags, tx_flags, function_table, func_count, + shutdown_flag, stats, num_slots); +} + +extern "C" void cudaq_launch_dispatch_kernel_cooperative( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + void* kernel_args[] = { + const_cast(&rx_flags), + const_cast(&tx_flags), + &function_table, + &func_count, + const_cast(&shutdown_flag), + &stats, + &num_slots + }; + + cudaLaunchCooperativeKernel( + reinterpret_cast( + cudaq::nvqlink::dispatch_kernel_device_call_only), + dim3(num_blocks), dim3(threads_per_block), kernel_args, 0, stream); +} + +//============================================================================== +// Graph-Based Dispatch (Proper Device-Side Graph Launch Support) +//============================================================================== +// +// To use device-side cudaGraphLaunch(), the dispatch kernel itself must be +// running inside a graph execution context. These functions create a graph +// containing the dispatch kernel, instantiate it with cudaGraphInstantiateFlagDeviceLaunch, +// and provide proper launch/cleanup functions. + +// Internal storage for graph-based dispatch context +// Parameters must be stored persistently since the graph may execute after +// the create function returns. +struct cudaq_dispatch_graph_context { + cudaGraph_t graph; + cudaGraphExec_t graph_exec; + cudaGraphNode_t kernel_node; + bool is_valid; + + // Persistent storage for kernel parameters (must outlive graph execution) + volatile std::uint64_t* rx_flags; + volatile std::uint64_t* tx_flags; + cudaq_function_entry_t* function_table; + std::size_t func_count; + void** global_mailbox_bank; + volatile int* shutdown_flag; + std::uint64_t* stats; + std::size_t num_slots; +}; + +extern "C" cudaError_t cudaq_create_dispatch_graph_regular( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + void** global_mailbox_bank, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream, + cudaq_dispatch_graph_context** out_context) { + + cudaError_t err; + + // Allocate context with persistent parameter storage + cudaq_dispatch_graph_context* ctx = new cudaq_dispatch_graph_context(); + ctx->is_valid = false; + + // Store parameters persistently in the context + ctx->rx_flags = rx_flags; + ctx->tx_flags = tx_flags; + ctx->function_table = function_table; + ctx->func_count = func_count; + ctx->global_mailbox_bank = global_mailbox_bank; + ctx->shutdown_flag = shutdown_flag; + ctx->stats = stats; + ctx->num_slots = num_slots; + + // Create graph + err = cudaGraphCreate(&ctx->graph, 0); + if (err != cudaSuccess) { + delete ctx; + return err; + } + + // Set up kernel parameters - point to persistent storage in context + cudaKernelNodeParams kernel_params = {}; + void* kernel_args[] = { + &ctx->rx_flags, + &ctx->tx_flags, + &ctx->function_table, + &ctx->func_count, + &ctx->global_mailbox_bank, + &ctx->shutdown_flag, + &ctx->stats, + &ctx->num_slots + }; + + kernel_params.func = reinterpret_cast( + cudaq::nvqlink::dispatch_kernel_with_graph); + kernel_params.gridDim = dim3(num_blocks, 1, 1); + kernel_params.blockDim = dim3(threads_per_block, 1, 1); + kernel_params.sharedMemBytes = 0; + kernel_params.kernelParams = kernel_args; + kernel_params.extra = nullptr; + + // Add kernel node to graph + err = cudaGraphAddKernelNode(&ctx->kernel_node, ctx->graph, nullptr, 0, &kernel_params); + if (err != cudaSuccess) { + cudaGraphDestroy(ctx->graph); + delete ctx; + return err; + } + + // Instantiate with device launch flag - THIS IS THE KEY! + err = cudaGraphInstantiate(&ctx->graph_exec, ctx->graph, + cudaGraphInstantiateFlagDeviceLaunch); + if (err != cudaSuccess) { + cudaGraphDestroy(ctx->graph); + delete ctx; + return err; + } + + // Upload graph to device (required before device-side launch) + err = cudaGraphUpload(ctx->graph_exec, stream); + if (err != cudaSuccess) { + cudaGraphExecDestroy(ctx->graph_exec); + cudaGraphDestroy(ctx->graph); + delete ctx; + return err; + } + + // Synchronize to ensure upload completes + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + cudaGraphExecDestroy(ctx->graph_exec); + cudaGraphDestroy(ctx->graph); + delete ctx; + return err; + } + + ctx->is_valid = true; + *out_context = ctx; + return cudaSuccess; +} + +extern "C" cudaError_t cudaq_launch_dispatch_graph( + cudaq_dispatch_graph_context* context, + cudaStream_t stream) { + if (context == nullptr || !context->is_valid) { + return cudaErrorInvalidValue; + } + + // Launch the graph - now device-side cudaGraphLaunch will work! + return cudaGraphLaunch(context->graph_exec, stream); +} + +extern "C" cudaError_t cudaq_destroy_dispatch_graph( + cudaq_dispatch_graph_context* context) { + if (context == nullptr) { + return cudaErrorInvalidValue; + } + + cudaError_t err = cudaSuccess; + + if (context->is_valid) { + cudaError_t err1 = cudaGraphExecDestroy(context->graph_exec); + cudaError_t err2 = cudaGraphDestroy(context->graph); + if (err1 != cudaSuccess) err = err1; + else if (err2 != cudaSuccess) err = err2; + } + + delete context; + return err; +} diff --git a/realtime/unittests/CMakeLists.txt b/realtime/unittests/CMakeLists.txt new file mode 100644 index 00000000..ee5e41bd --- /dev/null +++ b/realtime/unittests/CMakeLists.txt @@ -0,0 +1,78 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# External Dependencies +# ============================================================================== + +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.17.0 + EXCLUDE_FROM_ALL +) +FetchContent_MakeAvailable(googletest) + +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +# Bug in GCC 12 leads to spurious warnings (-Wrestrict) +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105329 +if (CMAKE_COMPILER_IS_GNUCXX + AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0.0 + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0.0) + target_compile_options(gtest PUBLIC --param=evrp-mode=legacy) +endif() +include(GoogleTest) + + +add_compile_options(-Wno-attributes) + +# ============================================================================== +# GPU Dispatch Kernel Tests +# ============================================================================== + +find_package(CUDAToolkit) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + + add_executable(test_dispatch_kernel test_dispatch_kernel.cu) + + set_target_properties(test_dispatch_kernel PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + ) + + target_include_directories(test_dispatch_kernel PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAQ_NVQLINK_INCLUDE_DIR} + ) + + # Find CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) + find_library(CUDADEVRT_LIBRARY cudadevrt + HINTS ${CUDAToolkit_LIBRARY_DIR} + REQUIRED + ) + + target_link_libraries(test_dispatch_kernel PRIVATE + GTest::gtest_main + CUDA::cudart + cudaq-realtime + cudaq-realtime-dispatch + ${CUDADEVRT_LIBRARY} + ) + + add_dependencies(NVQLINKUnitTests test_dispatch_kernel) + gtest_discover_tests(test_dispatch_kernel + TEST_PREFIX "test_dispatch_kernel." + ) + + message(STATUS " - test_dispatch_kernel (GPU dispatch infrastructure)") +endif() + +# ============================================================================== + + diff --git a/realtime/unittests/test_dispatch_kernel.cu b/realtime/unittests/test_dispatch_kernel.cu new file mode 100644 index 00000000..eae65dcc --- /dev/null +++ b/realtime/unittests/test_dispatch_kernel.cu @@ -0,0 +1,693 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh" + +// Helper macro for CUDA error checking +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ + } while (0) + +namespace { + +//============================================================================== +// Test Handler: Simple noop that copies input to output +//============================================================================== + +/// @brief Test handler that adds 1 to each byte. +__device__ int increment_handler(void* buffer, std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len) { + std::uint8_t* data = static_cast(buffer); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + data[i] = data[i] + 1; + } + *result_len = arg_len; + return 0; +} + +//============================================================================== +// Host API Dispatch Kernel Test Helpers +//============================================================================== + +constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = + cudaq::nvqlink::fnv1a_hash("rpc_increment"); + +__device__ int rpc_increment_handler(void* buffer, std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len) { + std::uint8_t* data = static_cast(buffer); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + data[i] = static_cast(data[i] + 1); + } + *result_len = arg_len; + return 0; +} + +__global__ void init_rpc_function_table(cudaq_function_entry_t* entries) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.device_fn_ptr = reinterpret_cast(&rpc_increment_handler); + entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + + // Schema: 1 array argument (uint8), 1 array result (uint8) + entries[0].schema.num_args = 1; + entries[0].schema.num_results = 1; + entries[0].schema.reserved = 0; + entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.args[0].reserved[0] = 0; + entries[0].schema.args[0].reserved[1] = 0; + entries[0].schema.args[0].reserved[2] = 0; + entries[0].schema.args[0].size_bytes = 0; // Variable size + entries[0].schema.args[0].num_elements = 0; // Variable size + entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.results[0].reserved[0] = 0; + entries[0].schema.results[0].reserved[1] = 0; + entries[0].schema.results[0].reserved[2] = 0; + entries[0].schema.results[0].size_bytes = 0; // Variable size + entries[0].schema.results[0].num_elements = 0; // Variable size + } +} + +bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, + volatile uint64_t** host_flags_out, + volatile uint64_t** device_flags_out, + std::uint8_t** host_data_out, + std::uint8_t** device_data_out) { + void* host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, + num_slots * sizeof(uint64_t), + cudaHostAllocMapped); + if (err != cudaSuccess) + return false; + + void* device_flags_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* host_data_ptr = nullptr; + err = cudaHostAlloc(&host_data_ptr, + num_slots * slot_size, + cudaHostAllocMapped); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* device_data_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + cudaFreeHost(host_data_ptr); + return false; + } + + memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); + + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); + return true; +} + +void free_ring_buffer(volatile uint64_t* host_flags, + std::uint8_t* host_data) { + if (host_flags) + cudaFreeHost(const_cast(host_flags)); + if (host_data) + cudaFreeHost(host_data); +} + +extern "C" void launch_dispatch_kernel_wrapper( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, function_table, func_count, + shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); +} + +//============================================================================== +// Test Kernel for DeviceCallMode +//============================================================================== + +using HandlerFunc = int (*)(void*, std::uint32_t, std::uint32_t, std::uint32_t*); + +__device__ HandlerFunc d_increment_handler = increment_handler; + +/// @brief Test kernel that dispatches to a handler using DeviceCallMode. +template +__global__ void test_dispatch_kernel( + HandlerFunc handler, + void* buffer, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len, + int* status) { + + if (threadIdx.x == 0 && blockIdx.x == 0) { + *status = handler(buffer, arg_len, max_result_len, result_len); + } + + KernelType::sync(); +} + +//============================================================================== +// Test Fixture +//============================================================================== + +class DispatchKernelTest : public ::testing::Test { +protected: + void SetUp() override { + CUDA_CHECK(cudaMalloc(&d_buffer_, 1024)); + CUDA_CHECK(cudaMalloc(&d_result_len_, sizeof(std::uint32_t))); + CUDA_CHECK(cudaMalloc(&d_status_, sizeof(int))); + } + + void TearDown() override { + if (d_buffer_) cudaFree(d_buffer_); + if (d_result_len_) cudaFree(d_result_len_); + if (d_status_) cudaFree(d_status_); + } + + void* d_buffer_ = nullptr; + std::uint32_t* d_result_len_ = nullptr; + int* d_status_ = nullptr; +}; + +//============================================================================== +// Tests +//============================================================================== + +TEST_F(DispatchKernelTest, IncrementHandlerBasic) { + // Prepare test data + std::vector input = {0, 1, 2, 3, 4}; + std::vector expected = {1, 2, 3, 4, 5}; + CUDA_CHECK(cudaMemcpy(d_buffer_, input.data(), input.size(), + cudaMemcpyHostToDevice)); + + // Get device function pointer + HandlerFunc h_handler; + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + + // Launch kernel + test_dispatch_kernel<<<1, 32>>>( + h_handler, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + // Check results + int status; + std::uint32_t result_len; + CUDA_CHECK(cudaMemcpy(&status, d_status_, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + cudaMemcpyDeviceToHost)); + + EXPECT_EQ(status, 0) << "Handler should return success"; + EXPECT_EQ(result_len, input.size()) << "Result length should match input"; + + // Verify data incremented + std::vector output(input.size()); + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected, output) << "Increment handler should add 1 to each byte"; +} + +TEST_F(DispatchKernelTest, LargeBuffer) { + // Test with larger data + const std::size_t size = 512; + std::vector input(size); + for (std::size_t i = 0; i < size; ++i) { + input[i] = static_cast(i & 0xFF); + } + + CUDA_CHECK(cudaMemcpy(d_buffer_, input.data(), input.size(), + cudaMemcpyHostToDevice)); + + HandlerFunc h_handler; + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + + test_dispatch_kernel<<<1, 256>>>( + h_handler, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + std::uint32_t result_len; + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(result_len, size) << "Should process all bytes"; + + // Verify all bytes incremented + std::vector output(size); + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + cudaMemcpyDeviceToHost)); + + for (std::size_t i = 0; i < size; ++i) { + uint8_t expected = static_cast((i + 1) & 0xFF); + EXPECT_EQ(output[i], expected) << "Mismatch at index " << i; + } +} + +class HostApiDispatchTest : public ::testing::Test { +protected: + void SetUp() override { + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, + &rx_flags_, &rx_data_host_, &rx_data_)); + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, + &tx_flags_, &tx_data_host_, &tx_data_)); + + void* tmp_shutdown = nullptr; + CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + shutdown_flag_ = static_cast(tmp_shutdown); + void* tmp_d_shutdown = nullptr; + CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown_flag_ = static_cast(tmp_d_shutdown); + *shutdown_flag_ = 0; + int zero = 0; + CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, + sizeof(int), cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMalloc(&d_stats_, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats_, 0, sizeof(uint64_t))); + + CUDA_CHECK(cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); + init_rpc_function_table<<<1, 1>>>(d_function_entries_); + CUDA_CHECK(cudaDeviceSynchronize()); + func_count_ = 1; + + ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); + cudaq_dispatcher_config_t config{}; + config.device_id = 0; + config.num_blocks = 1; + config.threads_per_block = 64; + config.num_slots = static_cast(num_slots_); + config.slot_size = static_cast(slot_size_); + config.vp_id = 0; + config.kernel_type = CUDAQ_KERNEL_REGULAR; + config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK); + + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = rx_flags_; + ringbuffer.tx_flags = tx_flags_; + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK); + + cudaq_function_table_t table{}; + table.entries = d_function_entries_; + table.count = func_count_; + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK); + + ASSERT_EQ( + cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_), + CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_launch_fn(dispatcher_, + &launch_dispatch_kernel_wrapper), + CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); + } + + void TearDown() override { + if (shutdown_flag_) { + *shutdown_flag_ = 1; + __sync_synchronize(); + } + if (dispatcher_) { + cudaq_dispatcher_stop(dispatcher_); + cudaq_dispatcher_destroy(dispatcher_); + dispatcher_ = nullptr; + } + if (manager_) { + cudaq_dispatch_manager_destroy(manager_); + manager_ = nullptr; + } + free_ring_buffer(rx_flags_host_, rx_data_host_); + free_ring_buffer(tx_flags_host_, tx_data_host_); + + if (shutdown_flag_) + cudaFreeHost(const_cast(shutdown_flag_)); + if (d_stats_) + cudaFree(d_stats_); + if (d_function_entries_) + cudaFree(d_function_entries_); + } + + void write_rpc_request(std::size_t slot, + const std::vector& payload) { + std::uint8_t* slot_data = + const_cast(rx_data_host_) + slot * slot_size_; + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + header->function_id = RPC_INCREMENT_FUNCTION_ID; + header->arg_len = static_cast(payload.size()); + memcpy(slot_data + sizeof(cudaq::nvqlink::RPCHeader), payload.data(), + payload.size()); + } + + bool read_rpc_response(std::size_t slot, + std::vector& payload, + std::int32_t* status_out = nullptr, + std::uint32_t* result_len_out = nullptr) { + __sync_synchronize(); + const std::uint8_t* slot_data = + const_cast(rx_data_host_) + slot * slot_size_; + auto* response = + reinterpret_cast(slot_data); + + if (response->magic != cudaq::nvqlink::RPC_MAGIC_RESPONSE) + return false; + if (status_out) + *status_out = response->status; + if (result_len_out) + *result_len_out = response->result_len; + if (response->status != 0) + return false; + + payload.resize(response->result_len); + memcpy(payload.data(), + slot_data + sizeof(cudaq::nvqlink::RPCResponse), + response->result_len); + return true; + } + + static constexpr std::size_t num_slots_ = 2; + std::size_t slot_size_ = 256; + volatile uint64_t* rx_flags_host_ = nullptr; + volatile uint64_t* tx_flags_host_ = nullptr; + volatile uint64_t* rx_flags_ = nullptr; + volatile uint64_t* tx_flags_ = nullptr; + std::uint8_t* rx_data_host_ = nullptr; + std::uint8_t* tx_data_host_ = nullptr; + std::uint8_t* rx_data_ = nullptr; + std::uint8_t* tx_data_ = nullptr; + + volatile int* shutdown_flag_ = nullptr; + volatile int* d_shutdown_flag_ = nullptr; + uint64_t* d_stats_ = nullptr; + + cudaq_function_entry_t* d_function_entries_ = nullptr; + std::size_t func_count_ = 0; + + cudaq_dispatch_manager_t* manager_ = nullptr; + cudaq_dispatcher_t* dispatcher_ = nullptr; +}; + +TEST_F(HostApiDispatchTest, RpcIncrementHandler) { + std::vector payload = {0, 1, 2, 3}; + write_rpc_request(0, payload); + + __sync_synchronize(); + const_cast(rx_flags_host_)[0] = + reinterpret_cast(rx_data_); + + int timeout = 50; + while (tx_flags_host_[0] == 0 && timeout-- > 0) { + usleep(1000); + } + ASSERT_GT(timeout, 0) << "Timeout waiting for dispatch kernel response"; + + std::vector response; + std::int32_t status = -1; + std::uint32_t result_len = 0; + ASSERT_TRUE(read_rpc_response(0, response, &status, &result_len)); + EXPECT_EQ(status, 0); + ASSERT_EQ(result_len, payload.size()); + + std::vector expected = {1, 2, 3, 4}; + EXPECT_EQ(response, expected); +} + +//============================================================================== +// Graph Launch Test +//============================================================================== + +// Graph kernel that processes RPC buffer via pointer indirection +__global__ void graph_increment_kernel(void** buffer_ptr) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + void* buffer = *buffer_ptr; + cudaq::nvqlink::RPCHeader* header = static_cast(buffer); + + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); + std::uint8_t* data = static_cast(arg_buffer); + + // Increment each byte + for (std::uint32_t i = 0; i < arg_len; ++i) { + data[i] = data[i] + 1; + } + + // Write response + cudaq::nvqlink::RPCResponse* response = static_cast(buffer); + response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = arg_len; + } +} + +constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = + cudaq::nvqlink::fnv1a_hash("rpc_graph_increment"); + +__global__ void init_graph_function_table(cudaq_function_entry_t* entries, + cudaGraphExec_t graph_exec) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.graph_exec = graph_exec; + entries[0].function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + } +} + +TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { + // Check compute capability + int device; + CUDA_CHECK(cudaGetDevice(&device)); + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + + if (prop.major < 8) { + GTEST_SKIP() << "Graph device launch requires compute capability 8.0+, found " + << prop.major << "." << prop.minor; + } + + // Allocate graph buffer pointer (for pointer indirection pattern) + void** d_graph_buffer_ptr; + CUDA_CHECK(cudaMalloc(&d_graph_buffer_ptr, sizeof(void*))); + CUDA_CHECK(cudaMemset(d_graph_buffer_ptr, 0, sizeof(void*))); + + // Allocate test buffer + constexpr size_t buffer_size = 1024; + void* d_buffer; + CUDA_CHECK(cudaMalloc(&d_buffer, buffer_size)); + + // Create the child graph (the one that will be launched from device) + cudaGraph_t child_graph; + cudaGraphExec_t child_graph_exec; + + CUDA_CHECK(cudaGraphCreate(&child_graph, 0)); + + // Add kernel node to child graph + cudaKernelNodeParams kernel_params = {}; + void* kernel_args[] = {&d_graph_buffer_ptr}; + kernel_params.func = reinterpret_cast(&graph_increment_kernel); + kernel_params.gridDim = dim3(1, 1, 1); + kernel_params.blockDim = dim3(32, 1, 1); + kernel_params.sharedMemBytes = 0; + kernel_params.kernelParams = kernel_args; + kernel_params.extra = nullptr; + + cudaGraphNode_t kernel_node; + CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, child_graph, nullptr, 0, &kernel_params)); + + // Instantiate CHILD graph with DEVICE LAUNCH FLAG + CUDA_CHECK(cudaGraphInstantiate(&child_graph_exec, child_graph, + cudaGraphInstantiateFlagDeviceLaunch)); + + // Create stream for operations + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Upload the child graph to device + CUDA_CHECK(cudaGraphUpload(child_graph_exec, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Set up function table with graph launch entry + cudaq_function_entry_t* d_function_entries; + CUDA_CHECK(cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t))); + init_graph_function_table<<<1, 1>>>(d_function_entries, child_graph_exec); + CUDA_CHECK(cudaDeviceSynchronize()); + + // Set up RPC buffer on host + std::uint8_t* h_buffer = new std::uint8_t[buffer_size]; + cudaq::nvqlink::RPCHeader* h_header = reinterpret_cast(h_buffer); + h_header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + h_header->function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; + h_header->arg_len = 4; + + std::uint8_t* h_data = h_buffer + sizeof(cudaq::nvqlink::RPCHeader); + h_data[0] = 0; + h_data[1] = 1; + h_data[2] = 2; + h_data[3] = 3; + + // Copy to device + CUDA_CHECK(cudaMemcpy(d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice)); + + // Set up fake RX/TX flags for single-shot test + volatile uint64_t* d_rx_flags; + volatile uint64_t* d_tx_flags; + CUDA_CHECK(cudaMalloc(&d_rx_flags, sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc(&d_tx_flags, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset((void*)d_rx_flags, 0, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset((void*)d_tx_flags, 0, sizeof(uint64_t))); + + // Set RX flag to point to our buffer (simulating incoming RPC) + uint64_t buffer_addr = reinterpret_cast(d_buffer); + CUDA_CHECK(cudaMemcpy((void*)d_rx_flags, &buffer_addr, sizeof(uint64_t), cudaMemcpyHostToDevice)); + + // Set up shutdown flag using pinned mapped memory so the dispatch kernel + // can see host updates immediately + volatile int* h_shutdown; + volatile int* d_shutdown; + { + void* tmp_shutdown; + CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + h_shutdown = static_cast(tmp_shutdown); + *h_shutdown = 0; + + void* tmp_d_shutdown; + CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown = static_cast(tmp_d_shutdown); + } + int shutdown_val = 0; // Local variable for tracking + + // Set up stats + uint64_t* d_stats; + CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + + // Create dispatch graph context - THIS WRAPS THE DISPATCH KERNEL IN A GRAPH + // so that device-side cudaGraphLaunch() can work! + cudaq_dispatch_graph_context* dispatch_ctx = nullptr; + cudaError_t err = cudaq_create_dispatch_graph_regular( + d_rx_flags, d_tx_flags, d_function_entries, 1, + nullptr, d_shutdown, d_stats, 1, + 1, 32, stream, &dispatch_ctx); + + if (err != cudaSuccess) { + GTEST_SKIP() << "Device-side graph launch not supported: " + << cudaGetErrorString(err) << " (" << err << ")"; + } + + // Launch dispatch graph - now device-side cudaGraphLaunch will work! + CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, stream)); + + // Poll for the response using pinned memory and async operations + // The child graph runs asynchronously (fire-and-forget) so we need to poll + std::uint8_t* h_poll_buffer; + CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::nvqlink::RPCResponse), cudaHostAllocDefault)); + memset(h_poll_buffer, 0, sizeof(cudaq::nvqlink::RPCResponse)); + + cudaStream_t poll_stream; + CUDA_CHECK(cudaStreamCreate(&poll_stream)); + + int timeout_ms = 5000; + int poll_interval_ms = 100; + bool got_response = false; + + for (int elapsed = 0; elapsed < timeout_ms; elapsed += poll_interval_ms) { + CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, sizeof(cudaq::nvqlink::RPCResponse), + cudaMemcpyDeviceToHost, poll_stream)); + CUDA_CHECK(cudaStreamSynchronize(poll_stream)); + + cudaq::nvqlink::RPCResponse* peek = reinterpret_cast(h_poll_buffer); + if (peek->magic == cudaq::nvqlink::RPC_MAGIC_RESPONSE) { + got_response = true; + break; + } + + usleep(poll_interval_ms * 1000); + } + + // Signal shutdown to allow kernel to exit + *h_shutdown = 1; + __sync_synchronize(); + usleep(100000); // Give kernel time to see shutdown flag + + // Copy final results + CUDA_CHECK(cudaMemcpyAsync(h_buffer, d_buffer, buffer_size, cudaMemcpyDeviceToHost, poll_stream)); + CUDA_CHECK(cudaStreamSynchronize(poll_stream)); + + // Clean up poll resources + CUDA_CHECK(cudaStreamDestroy(poll_stream)); + cudaFreeHost(h_poll_buffer); + + // Sync main stream (dispatch kernel should have exited) + CUDA_CHECK(cudaStreamSynchronize(stream)); + + ASSERT_TRUE(got_response) << "Timeout waiting for device-side graph launch response"; + + // Verify response + cudaq::nvqlink::RPCResponse* h_response = reinterpret_cast(h_buffer); + EXPECT_EQ(h_response->magic, cudaq::nvqlink::RPC_MAGIC_RESPONSE) + << "Expected RPC_MAGIC_RESPONSE, got 0x" << std::hex << h_response->magic; + EXPECT_EQ(h_response->status, 0) << "Handler returned error status"; + EXPECT_EQ(h_response->result_len, 4u) << "Unexpected result length"; + + // Verify data was incremented by graph kernel launched from dispatch kernel + std::uint8_t* h_result = h_buffer + sizeof(cudaq::nvqlink::RPCResponse); + EXPECT_EQ(h_result[0], 1) << "Expected h_result[0]=1"; + EXPECT_EQ(h_result[1], 2) << "Expected h_result[1]=2"; + EXPECT_EQ(h_result[2], 3) << "Expected h_result[2]=3"; + EXPECT_EQ(h_result[3], 4) << "Expected h_result[3]=4"; + + // Cleanup + delete[] h_buffer; + CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(d_stats)); + CUDA_CHECK(cudaFreeHost(const_cast(h_shutdown))); // Free mapped memory + CUDA_CHECK(cudaFree((void*)d_tx_flags)); + CUDA_CHECK(cudaFree((void*)d_rx_flags)); + CUDA_CHECK(cudaFree(d_function_entries)); + CUDA_CHECK(cudaGraphExecDestroy(child_graph_exec)); + CUDA_CHECK(cudaGraphDestroy(child_graph)); + CUDA_CHECK(cudaFree(d_graph_buffer_ptr)); + CUDA_CHECK(cudaFree(d_buffer)); +} + +} // namespace From 85b38abf6412fcb72a73b45c6af2c2ad6cd68ad8 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 18 Feb 2026 20:54:38 +0000 Subject: [PATCH 02/40] Add AI predecoder service with hybrid GPU-CPU decoding pipeline Introduce AIDecoderService and AIPreDecoderService in the QEC library, enabling a hybrid realtime pipeline where GPU-side TensorRT inference (predecoding) hands off results to CPU-side classical decoders like PyMatching. Key components: - AIDecoderService: wraps TensorRT inference in a CUDA graph using a gateway kernel pattern (mailbox pointer indirection) to bridge the dispatch kernel's dynamic ring buffer addresses to TRT's fixed I/O buffers. Supports SKIP_TRT env var for testing without TensorRT. - AIPreDecoderService: extends AIDecoderService with an N-deep pinned memory circular queue for GPU-to-CPU handoff, slot claim/release protocol (d_claimed_slot, d_inflight_flag), backpressure signaling via d_ready_flags/d_queue_idx, and poll_next_job/release_job API with proper acquire/release memory ordering - ThreadPool utility with optional Linux CPU core pinning for low-latency PyMatching worker threads - End-to-end integration test demonstrating the full hybrid pipeline: dispatcher -> 4x AIPreDecoderService GPU inference -> polling thread -> 4-worker PyMatching thread pool -> TX flag acknowledgment - CMake integration to find TensorRT and build the test with CUDA separable compilation Signed-off-by: Scott Thornton --- .../cudaq/qec/realtime/ai_decoder_service.h | 70 ++++ .../qec/realtime/ai_predecoder_service.h | 79 ++++ .../qec/include/cudaq/qec/utils/thread_pool.h | 147 +++++++ libs/qec/lib/realtime/ai_decoder_service.cu | 184 +++++++++ .../qec/lib/realtime/ai_predecoder_service.cu | 218 ++++++++++ .../test_realtime_predecoder_w_pymatching.cpp | 373 ++++++++++++++++++ libs/qec/unittests/CMakeLists.txt | 59 +++ 7 files changed, 1130 insertions(+) create mode 100644 libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h create mode 100644 libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h create mode 100644 libs/qec/include/cudaq/qec/utils/thread_pool.h create mode 100644 libs/qec/lib/realtime/ai_decoder_service.cu create mode 100644 libs/qec/lib/realtime/ai_predecoder_service.cu create mode 100644 libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h new file mode 100644 index 00000000..c5bcc92b --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -0,0 +1,70 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cudaq::qec { + +class AIDecoderService { +public: + // Logger interface for NvInfer + class Logger : public nvinfer1::ILogger { + void log(Severity severity, const char* msg) noexcept override; + } static gLogger; + + /// @brief Constructor + /// @param engine_path Path to the serialized TensorRT engine file + /// @param device_mailbox_slot Pointer to the specific slot in the global mailbox bank + /// that this decoder will listen to. + AIDecoderService(const std::string& engine_path, void** device_mailbox_slot); + + virtual ~AIDecoderService(); + + /// @brief Captures the CUDA Graph (Gateway In -> TRT -> Gateway Out) + /// @param stream The stream to use for capture + virtual void capture_graph(cudaStream_t stream); + + /// @brief Returns the executable graph for the Dispatcher table + cudaGraphExec_t get_executable_graph() const { return graph_exec_; } + + /// @brief Returns the required input/output sizes for verification + size_t get_input_size() const { return input_size_; } + size_t get_output_size() const { return output_size_; } + +protected: + void load_engine(const std::string& path); + void allocate_resources(); + + // NvInfer resources + std::unique_ptr runtime_; + std::unique_ptr engine_; + std::unique_ptr context_; + + // Graph resources + cudaGraphExec_t graph_exec_ = nullptr; + + // Memory resources (Resident on Device) + void** device_mailbox_slot_; // Address where Dispatcher writes the data pointer + float* d_trt_input_ = nullptr; + float* d_trt_output_ = nullptr; + + // Metadata + size_t input_size_ = 0; + size_t output_size_ = 0; + int input_idx_ = -1; + int output_idx_ = -1; +}; + +} // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h new file mode 100644 index 00000000..69b2e3cf --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -0,0 +1,79 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include + +// Portable CPU Yield Macro for busy-polling (Fix #5) +#if defined(__x86_64__) + #include + #define QEC_CPU_RELAX() _mm_pause() +#elif defined(__aarch64__) + #define QEC_CPU_RELAX() asm volatile("yield" ::: "memory") +#else + #define QEC_CPU_RELAX() std::atomic_thread_fence(std::memory_order_seq_cst) +#endif + +namespace cudaq::qec { + +// Represents a single job handed off from GPU to CPU +struct PreDecoderJob { + int slot_idx; // The queue index (needed for release) + void* ring_buffer_ptr; // The FPGA mapped memory address + float* inference_data; // Pointer to the TensorRT output +}; + +class AIPreDecoderService : public AIDecoderService { +public: + AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, int queue_depth = 16); + virtual ~AIPreDecoderService(); + + // Overrides the standard graph with the CPU-Handoff graph + void capture_graph(cudaStream_t stream) override; + + // --- CPU Thread Interfaces --- + + /// @brief Polls the circular buffer for a new job. Non-blocking. + bool poll_next_job(PreDecoderJob& out_job); + + /// @brief Releases the slot back to the GPU once the Outgoing Thread finishes. + void release_job(int slot_idx); + + /// @brief Returns the device pointer to the queue tail index (for dispatcher backpressure). + int* get_device_queue_idx() const { return d_queue_idx_; } + + /// @brief Returns the device-mapped pointer to the ready flags (for dispatcher backpressure). + volatile int* get_device_ready_flags() const { return d_ready_flags_; } + + /// @brief Returns the device pointer to the in-flight flag (for single-launch guarantee). + /// Dispatcher sets to 1 before launching; output kernel clears to 0 when done. + int* get_device_inflight_flag() const { return d_inflight_flag_; } + +private: + int queue_depth_; + int cpu_poll_idx_ = 0; + + // --- Pinned Host Memory (The Queue) --- + volatile int* h_ready_flags_ = nullptr; + void** h_ring_ptrs_ = nullptr; + float* h_outputs_ = nullptr; + + // --- Device Mapped Pointers (For the Graph to write to) --- + volatile int* d_ready_flags_ = nullptr; + void** d_ring_ptrs_ = nullptr; + float* d_outputs_ = nullptr; + + // --- Device State --- + int* d_queue_idx_ = nullptr; // Tracks the current slot tail on the GPU + int* d_claimed_slot_ = nullptr; // Passes claimed slot from input to output kernel + int* d_inflight_flag_ = nullptr; // 0 = idle, 1 = graph in flight (set by dispatcher, cleared by output kernel) +}; + +} // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/utils/thread_pool.h b/libs/qec/include/cudaq/qec/utils/thread_pool.h new file mode 100644 index 00000000..237c2b32 --- /dev/null +++ b/libs/qec/include/cudaq/qec/utils/thread_pool.h @@ -0,0 +1,147 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#include +#endif + +namespace cudaq::qec::utils { + +class ThreadPool { +public: + // Option 1: Standard unpinned thread pool + explicit ThreadPool(size_t threads); + + // Option 2: Pinned thread pool (1 thread per specified core ID) + explicit ThreadPool(const std::vector& core_ids); + + ~ThreadPool(); + + // Enqueue a job into the pool. + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + +private: + void worker_loop(); + + std::vector workers; + std::queue> tasks; + + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// --- Implementation --- + +inline void ThreadPool::worker_loop() { + while(true) { + std::function task; + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, [this] { + return this->stop || !this->tasks.empty(); + }); + + if(this->stop && this->tasks.empty()) { + return; + } + + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + task(); + } +} + +// Constructor 1: Unpinned +inline ThreadPool::ThreadPool(size_t threads) : stop(false) { + for(size_t i = 0; i < threads; ++i) { + workers.emplace_back([this] { this->worker_loop(); }); + } +} + +// Constructor 2: Pinned to specific cores +inline ThreadPool::ThreadPool(const std::vector& core_ids) : stop(false) { + for(size_t i = 0; i < core_ids.size(); ++i) { + int core_id = core_ids[i]; + + workers.emplace_back([this, core_id] { + // Apply Thread Affinity (Linux Only) +#if defined(__linux__) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cerr << "[ThreadPool] Warning: Failed to pin thread to core " + << core_id << " (Error " << rc << ")\n"; + } +#else + // Silent fallback for non-Linux platforms + (void)core_id; +#endif + + // Enter the standard execution loop + this->worker_loop(); + }); + } +} + +template +auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> +{ + using return_type = typename std::invoke_result::type; + + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + if(stop) { + throw std::runtime_error("enqueue on stopped ThreadPool"); + } + tasks.emplace([task](){ (*task)(); }); + } + condition.notify_one(); + return res; +} + +inline ThreadPool::~ThreadPool() { + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker : workers) { + if (worker.joinable()) { + worker.join(); + } + } +} + +} // namespace cudaq::qec::utils diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu new file mode 100644 index 00000000..d86c88d5 --- /dev/null +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -0,0 +1,184 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" // For RPCHeader, RPCResponse +#include +#include +#include + +namespace cudaq::qec { + +// ============================================================================= +// Gateway Kernels (The Bridge) +// ============================================================================= + +/// @brief Reads the dynamic buffer address from the mailbox and copies to fixed buffer +__global__ void gateway_input_kernel( + void** mailbox_slot_ptr, // The specific slot in the Global Bank + float* trt_fixed_input, // The persistent TRT input buffer + size_t copy_size_bytes) +{ + // 1. Read the pointer provided by the Dispatcher + void* ring_buffer_data = *mailbox_slot_ptr; + + if (ring_buffer_data == nullptr) return; + + // 2. Skip RPC Header to find payload + const char* src = (const char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + char* dst = (char*)trt_fixed_input; + + // 3. Grid-Stride Copy + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } +} + +/// @brief Copies result back to Ring Buffer and writes RPC Response +__global__ void gateway_output_kernel( + void** mailbox_slot_ptr, + const float* trt_fixed_output, + size_t result_size_bytes) +{ + void* ring_buffer_data = *mailbox_slot_ptr; + if (ring_buffer_data == nullptr) return; + + // 1. Write Result Payload (Overwriting input args in this design, or append after) + // Assuming Input/Output fit in the same slot allocation. + char* dst = (char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + const char* src = (const char*)trt_fixed_output; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < result_size_bytes; i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } + + // 2. Write RPC Response Header (Thread 0 only) + if (threadIdx.x == 0 && blockIdx.x == 0) { + auto* response = (cudaq::nvqlink::RPCResponse*)ring_buffer_data; + response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + response->status = 0; // Success + response->result_len = static_cast(result_size_bytes); + + // Ensure memory visibility + __threadfence_system(); + } +} + +// ============================================================================= +// Class Implementation +// ============================================================================= + +AIDecoderService::Logger AIDecoderService::gLogger; + +void AIDecoderService::Logger::log(Severity severity, const char* msg) noexcept { + if (severity <= Severity::kWARNING) { + std::printf("[TensorRT] %s\n", msg); + } +} + +AIDecoderService::AIDecoderService(const std::string& engine_path, void** device_mailbox_slot) + : device_mailbox_slot_(device_mailbox_slot) { + + if (std::getenv("SKIP_TRT")) { + // Skip TRT entirely; use fixed sizes for testing + input_size_ = 16 * sizeof(float); + output_size_ = 16 * sizeof(float); + input_idx_ = 0; + output_idx_ = 1; + allocate_resources(); + } else { + load_engine(engine_path); + allocate_resources(); + } +} + +AIDecoderService::~AIDecoderService() { + if (graph_exec_) cudaGraphExecDestroy(graph_exec_); + if (d_trt_input_) cudaFree(d_trt_input_); + if (d_trt_output_) cudaFree(d_trt_output_); + // Note: We do not free device_mailbox_slot_ as it is a view into the global bank +} + +void AIDecoderService::load_engine(const std::string& path) { + std::ifstream file(path, std::ios::binary); + if (!file.good()) throw std::runtime_error("Error opening engine file: " + path); + + file.seekg(0, file.end); + size_t size = file.tellg(); + file.seekg(0, file.beg); + + std::vector engine_data(size); + file.read(engine_data.data(), size); + + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + engine_.reset(runtime_->deserializeCudaEngine(engine_data.data(), size)); + context_.reset(engine_->createExecutionContext()); + + // Auto-detect bindings + input_idx_ = 0; // Simplified assumption, use engine_->getBindingName() in prod + output_idx_ = 1; + + // Inspect shapes (assuming static shapes for realtime) + auto input_dims = engine_->getTensorShape(engine_->getIOTensorName(input_idx_)); + auto output_dims = engine_->getTensorShape(engine_->getIOTensorName(output_idx_)); + + // Calculate sizes (Assuming float) + auto volume = [](const nvinfer1::Dims& d) { + size_t v = 1; + for (int i = 0; i < d.nbDims; ++i) v *= d.d[i]; + return v; + }; + + input_size_ = volume(input_dims) * sizeof(float); + output_size_ = volume(output_dims) * sizeof(float); +} + +void AIDecoderService::allocate_resources() { + if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Input"); + if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Output"); +} + +void AIDecoderService::capture_graph(cudaStream_t stream) { + // 1. Bind TensorRT to our fixed buffers + context_->setTensorAddress(engine_->getIOTensorName(input_idx_), d_trt_input_); + context_->setTensorAddress(engine_->getIOTensorName(output_idx_), d_trt_output_); + + // 2. Warmup + context_->enqueueV3(stream); + cudaStreamSynchronize(stream); + + // 3. Capture + cudaGraph_t graph; + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + + // --- Node A: Gateway Input --- + // Reads from *device_mailbox_slot_ -> Writes to d_trt_input_ + gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_input_, input_size_); + + // --- Node B: TensorRT --- + context_->enqueueV3(stream); + + // --- Node C: Gateway Output --- + // Reads from d_trt_output_ -> Writes back to *device_mailbox_slot_ + gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_output_, output_size_); + + cudaStreamEndCapture(stream, &graph); + + // 4. Instantiate for Device Launch + cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + + // 5. Upload & Cleanup + cudaGraphUpload(graph_exec_, stream); + cudaGraphDestroy(graph); + + cudaStreamSynchronize(stream); +} + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu new file mode 100644 index 00000000..7c83bfd1 --- /dev/null +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -0,0 +1,218 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" // RPCHeader for device code +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" // cudaq_function_entry_t for debug check +#include +#include +#include + +// Internal Macro to catch silent memory allocation failures (Fix #2) +#define SERVICE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA Error in AIPreDecoderService: ") + cudaGetErrorString(err)); \ + } \ + } while(0) + +namespace cudaq::qec { + +// ============================================================================= +// Kernels specific to the PreDecoder +// ============================================================================= + +__global__ void predecoder_input_kernel( + void** mailbox_slot_ptr, int* d_queue_idx, volatile int* d_ready_flags, + void** d_ring_ptrs, float* trt_input, size_t input_size_bytes, + int* d_claimed_slot) +{ + __shared__ int slot_idx; + __shared__ void* ring_ptr; + + if (threadIdx.x == 0 && blockIdx.x == 0) { + ring_ptr = *mailbox_slot_ptr; + // Safe to read non-atomically: dispatcher guarantees at most one + // graph instance in flight per predecoder via d_inflight_flag. + slot_idx = *d_queue_idx; + + // Publish the claimed slot so the output kernel can read it. + // This survives across graph nodes (device global memory). + *d_claimed_slot = slot_idx; + + // Defense-in-depth: if the slot is still owned by the CPU, bail out. + // Under normal operation this should never fire because the dispatcher + // already checked d_ready_flags before launching. + if (d_ready_flags[slot_idx] == 1) { + ring_ptr = nullptr; + } else { + d_ring_ptrs[slot_idx] = ring_ptr; + } + } + __syncthreads(); + + if (!ring_ptr) return; + + // Copy Data from Ring Buffer to TRT + const char* src = (const char*)ring_ptr + sizeof(cudaq::nvqlink::RPCHeader); + char* dst = (char*)trt_input; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < input_size_bytes; i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } +} + +__global__ void predecoder_output_kernel( + int* d_claimed_slot, int* d_queue_idx, int queue_depth, + volatile int* d_ready_flags, float* d_outputs, const float* trt_output, + size_t output_size_bytes, volatile int* d_inflight_flag) +{ + // Read the slot that the input kernel claimed (fixes review issue #2: + // no stale re-read of d_queue_idx which could race under concurrent launches). + int slot_idx = *d_claimed_slot; + + // Direct D2H Copy (Writing to mapped pinned memory) + char* dst = (char*)d_outputs + (slot_idx * output_size_bytes); + const char* src = (const char*)trt_output; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < output_size_bytes; i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } + + __syncthreads(); // Ensure all threads finished copying (review issue #5) + __threadfence_system(); // Make D2H writes visible to Host over PCIe + + // Signal CPU, advance queue index, and release the inflight lock + if (threadIdx.x == 0 && blockIdx.x == 0) { + d_ready_flags[slot_idx] = 1; + *d_queue_idx = (slot_idx + 1) % queue_depth; + + __threadfence_system(); // Ensure queue advance is visible before clearing flag + *d_inflight_flag = 0; // Release: dispatcher may now launch this graph again + } +} + +// Simple passthrough kernel: copies input buffer to output buffer (replaces TRT for testing) +__global__ void passthrough_copy_kernel(float* dst, const float* src, size_t num_bytes) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_bytes; i += blockDim.x * gridDim.x) { + ((char*)dst)[i] = ((const char*)src)[i]; + } +} + +// ============================================================================= +// Class Implementation +// ============================================================================= + +AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, int queue_depth) + : AIDecoderService(path, mailbox), queue_depth_(queue_depth) +{ + // Fix #2: Wrapped all allocations in SERVICE_CUDA_CHECK + // 1. Allocate Pinned Host Memory Queue + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ready_flags_, queue_depth_ * sizeof(int), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, queue_depth_ * sizeof(void*), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_outputs_, queue_depth_ * get_output_size(), cudaHostAllocMapped)); + + memset((void*)h_ready_flags_, 0, queue_depth_ * sizeof(int)); + + // 2. Map Device Pointers + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ready_flags_, (void*)h_ready_flags_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_outputs_, (void*)h_outputs_, 0)); + + // 3. Allocate GPU State Trackers + SERVICE_CUDA_CHECK(cudaMalloc(&d_queue_idx_, sizeof(int))); + SERVICE_CUDA_CHECK(cudaMemset(d_queue_idx_, 0, sizeof(int))); + + // 4. Slot handoff buffer (input kernel writes, output kernel reads) + SERVICE_CUDA_CHECK(cudaMalloc(&d_claimed_slot_, sizeof(int))); + SERVICE_CUDA_CHECK(cudaMemset(d_claimed_slot_, 0, sizeof(int))); + + // 5. In-flight flag (dispatcher sets 1 before launch, output kernel clears 0) + SERVICE_CUDA_CHECK(cudaMalloc(&d_inflight_flag_, sizeof(int))); + SERVICE_CUDA_CHECK(cudaMemset(d_inflight_flag_, 0, sizeof(int))); +} + +AIPreDecoderService::~AIPreDecoderService() { + if (h_ready_flags_) cudaFreeHost((void*)h_ready_flags_); + if (h_ring_ptrs_) cudaFreeHost(h_ring_ptrs_); + if (h_outputs_) cudaFreeHost(h_outputs_); + if (d_queue_idx_) cudaFree(d_queue_idx_); + if (d_claimed_slot_) cudaFree(d_claimed_slot_); + if (d_inflight_flag_) cudaFree(d_inflight_flag_); +} + +void AIPreDecoderService::capture_graph(cudaStream_t stream) { + bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); + + if (!skip_trt) { + context_->setTensorAddress(engine_->getIOTensorName(input_idx_), d_trt_input_); + context_->setTensorAddress(engine_->getIOTensorName(output_idx_), d_trt_output_); + context_->enqueueV3(stream); // Warmup + } + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + cudaGraph_t graph; + SERVICE_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + + predecoder_input_kernel<<<1, 128, 0, stream>>>( + device_mailbox_slot_, d_queue_idx_, d_ready_flags_, + d_ring_ptrs_, d_trt_input_, get_input_size(), + d_claimed_slot_); + + if (skip_trt) { + // Replace TRT with a simple passthrough copy + passthrough_copy_kernel<<<1, 128, 0, stream>>>( + d_trt_output_, d_trt_input_, get_input_size()); + } else { + context_->enqueueV3(stream); + } + + predecoder_output_kernel<<<1, 128, 0, stream>>>( + d_claimed_slot_, d_queue_idx_, queue_depth_, d_ready_flags_, + d_outputs_, d_trt_output_, get_output_size(), + d_inflight_flag_); + + SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + // Instantiate for device-side launch + cudaError_t inst_err = cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiateWithFlags FAILED: ") + cudaGetErrorString(inst_err)); + } + + SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + cudaGraphDestroy(graph); + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { + if (h_ready_flags_[cpu_poll_idx_] == 1) { + + // Fix #3: ARM Portability - Memory Acquire Fence + // Ensures that the reads to h_ring_ptrs_ and h_outputs_ are not + // speculatively executed before the h_ready_flags_ check clears. + std::atomic_thread_fence(std::memory_order_acquire); + + out_job.slot_idx = cpu_poll_idx_; + out_job.ring_buffer_ptr = h_ring_ptrs_[cpu_poll_idx_]; + out_job.inference_data = h_outputs_ + (cpu_poll_idx_ * (get_output_size() / sizeof(float))); + + cpu_poll_idx_ = (cpu_poll_idx_ + 1) % queue_depth_; + return true; + } + return false; +} + +void AIPreDecoderService::release_job(int slot_idx) { + // Memory Order Release guarantees that PyMatching results written + // to other buffers are strictly visible before we flag the slot as free. + __atomic_store_n(&h_ready_flags_[slot_idx], 0, __ATOMIC_RELEASE); +} + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp new file mode 100644 index 00000000..0af289b4 --- /dev/null +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -0,0 +1,373 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/******************************************************************************* + * Standalone Hybrid Realtime Pipeline Test + * Demonstrates: + * 1. Ring Buffer setup + * 2. Dispatcher Kernel -> 4x AIPreDecoderService instances (GPU) + * 3. GPU -> CPU N-Deep Pinned Memory Queue handoff + * 4. Dedicated Polling Thread -> 4-Worker PyMatching Thread Pool + * 5. CPU Workers closing the transaction (Setting TX flags) + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Ensure graph-based dispatch API is visible (guarded by CUDA_VERSION in cudaq_realtime.h) +#ifndef CUDA_VERSION +#define CUDA_VERSION 13000 +#endif +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/utils/thread_pool.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ + exit(1); \ + } \ + } while(0) + +using namespace cudaq::qec; + +// ============================================================================= +// Configuration & Globals +// ============================================================================= +constexpr size_t NUM_SLOTS = 64; +constexpr size_t SLOT_SIZE = 256; +constexpr int NUM_PREDECODERS = 4; +constexpr int QUEUE_DEPTH = 16; +constexpr int SYNDROME_FLOATS = 16; // 64 bytes + +// Helper to generate Function IDs +constexpr std::uint32_t fnv1a_hash(std::string_view str) { + std::uint32_t hash = 0x811c9dc5; + for (char c : str) { hash ^= static_cast(c); hash *= 0x01000193; } + return hash; +} + +// Global context to pass to workers without massive argument lists +struct SystemContext { + volatile uint64_t* tx_flags_host = nullptr; + uint8_t* rx_data_host = nullptr; + size_t slot_size = SLOT_SIZE; +}; +SystemContext g_sys_ctx; + +// ============================================================================= +// 1. Thread Pool Worker (PyMatching Simulation) +// ============================================================================= +void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder) { + // A. "PyMatching" CPU Algorithm + // Convert 16 floats (logits) back to 16 bits + size_t num_elements = predecoder->get_output_size() / sizeof(float); + std::vector final_corrections(num_elements); + + // Simulation placeholder: in production this would run the PyMatching decoder. + for (size_t i = 0; i < num_elements; ++i) { + final_corrections[i] = (job.inference_data[i] > 0.5f) ? 1 : 0; + } + + // B. Write RPC Response + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); + std::memcpy(response_payload, final_corrections.data(), final_corrections.size()); + + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = static_cast(final_corrections.size()); + + std::atomic_thread_fence(std::memory_order_release); + + // C. Calculate the original Ring Buffer Slot Index + size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; + + // D. Release GPU Queue Slot + predecoder->release_job(job.slot_idx); + + // E. Acknowledge to FPGA + // Reconstruct the original rx_value (which is just the pointer cast to uint64_t) + uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); + g_sys_ctx.tx_flags_host[slot_idx] = rx_value; +} + +// ============================================================================= +// 2. Incoming Polling Thread +// ============================================================================= +void incoming_polling_loop( + std::vector>& predecoders, + cudaq::qec::utils::ThreadPool& thread_pool, + std::atomic& stop_signal) +{ + PreDecoderJob job; + while (!stop_signal.load(std::memory_order_relaxed)) { + bool found_work = false; + + // Round-robin poll across all 4 PreDecoder instances + for (auto& predecoder : predecoders) { + if (predecoder->poll_next_job(job)) { + // Enqueue the job. Capture raw pointer to specific predecoder instance. + AIPreDecoderService* pd_ptr = predecoder.get(); + thread_pool.enqueue([job, pd_ptr]() { + pymatching_worker_task(job, pd_ptr); + }); + found_work = true; + } + } + + // If all 4 queues were empty, yield the pipeline + if (!found_work) { + QEC_CPU_RELAX(); + } + } +} + +// ============================================================================= +// 3. Helper: Dummy TRT Engine Generator +// ============================================================================= +void create_dummy_engine(const std::string& filepath) { + class Logger : public nvinfer1::ILogger { + void log(Severity severity, const char* msg) noexcept override {} + } logger; + + auto builder = std::unique_ptr(nvinfer1::createInferBuilder(logger)); + uint32_t flag = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + auto network = std::unique_ptr(builder->createNetworkV2(flag)); + auto config = std::unique_ptr(builder->createBuilderConfig()); + + // Identity network: 16 floats in, 16 floats out + auto input = network->addInput("input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims{1, {SYNDROME_FLOATS}}); + auto identity = network->addIdentity(*input); + identity->getOutput(0)->setName("output"); + network->markOutput(*identity->getOutput(0)); + + auto plan = std::unique_ptr(builder->buildSerializedNetwork(*network, *config)); + + std::ofstream file(filepath, std::ios::binary); + file.write(static_cast(plan->data()), plan->size()); +} + +// ============================================================================= +// 4. Main Application +// ============================================================================= +int main() { + std::cout << "--- Initializing Hybrid AI Realtime Pipeline ---\n"; + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + // A. Generate Dummy Model + std::string engine_path = "predecoder_dummy.engine"; + create_dummy_engine(engine_path); + + // B. Allocate Ring Buffers + void* tmp = nullptr; + + volatile uint64_t *rx_flags_host, *tx_flags_host; + volatile uint64_t *rx_flags_dev, *tx_flags_dev; + uint8_t *rx_data_host; + uint8_t *rx_data_dev; + + CUDA_CHECK(cudaHostAlloc(&tmp, NUM_SLOTS * sizeof(uint64_t), cudaHostAllocMapped)); + rx_flags_host = static_cast(tmp); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_flags_dev, tmp, 0)); + + CUDA_CHECK(cudaHostAlloc(&tmp, NUM_SLOTS * sizeof(uint64_t), cudaHostAllocMapped)); + tx_flags_host = static_cast(tmp); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&tx_flags_dev, tmp, 0)); + + CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * SLOT_SIZE, cudaHostAllocMapped)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_data_dev, rx_data_host, 0)); + + std::memset((void*)rx_flags_host, 0, NUM_SLOTS * sizeof(uint64_t)); + std::memset((void*)tx_flags_host, 0, NUM_SLOTS * sizeof(uint64_t)); + + g_sys_ctx.tx_flags_host = tx_flags_host; + g_sys_ctx.rx_data_host = rx_data_host; + + // C. Allocate Global Mailbox Bank & Control signals + void** d_global_mailbox_bank; + CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, NUM_PREDECODERS * sizeof(void*))); + CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, NUM_PREDECODERS * sizeof(void*))); + + int* shutdown_flag_host; + CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); + *shutdown_flag_host = 0; + int* d_shutdown_flag; + CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); + + uint64_t* d_stats; + CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + + // D. Initialize the 4 AIPreDecoder Instances + std::cout << "[Setup] Capturing 4x AIPreDecoder Graphs...\n"; + cudaStream_t capture_stream; + CUDA_CHECK(cudaStreamCreate(&capture_stream)); + + std::vector> predecoders; + std::vector function_entries(NUM_PREDECODERS); + + for (int i = 0; i < NUM_PREDECODERS; ++i) { + void** my_mailbox = d_global_mailbox_bank + i; + auto pd = std::make_unique(engine_path, my_mailbox, QUEUE_DEPTH); + pd->capture_graph(capture_stream); + + cudaGraphExec_t gexec = pd->get_executable_graph(); + std::cout << "[Setup] Decoder " << i << ": graph_exec=" << gexec << "\n"; + + std::string func_name = "predecode_target_" + std::to_string(i); + function_entries[i].function_id = fnv1a_hash(func_name); + function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_entries[i].handler.graph_exec = gexec; + function_entries[i].mailbox_idx = i; + function_entries[i].d_queue_idx = pd->get_device_queue_idx(); + function_entries[i].d_ready_flags = pd->get_device_ready_flags(); + function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); + + predecoders.push_back(std::move(pd)); + } + int actual_func_count = NUM_PREDECODERS; + + // Print struct layout for host/device verification + std::cout << "[Debug] sizeof(cudaq_function_entry_t) = " << sizeof(cudaq_function_entry_t) << "\n"; + std::cout << "[Debug] offsetof handler = " << offsetof(cudaq_function_entry_t, handler) << "\n"; + std::cout << "[Debug] offsetof function_id = " << offsetof(cudaq_function_entry_t, function_id) << "\n"; + std::cout << "[Debug] offsetof dispatch_mode = " << offsetof(cudaq_function_entry_t, dispatch_mode) << "\n"; + std::cout << "[Debug] offsetof schema = " << offsetof(cudaq_function_entry_t, schema) << "\n"; + std::cout << "[Debug] offsetof mailbox_idx = " << offsetof(cudaq_function_entry_t, mailbox_idx) << "\n"; + std::cout << "[Debug] offsetof d_queue_idx = " << offsetof(cudaq_function_entry_t, d_queue_idx) << "\n"; + std::cout << "[Debug] offsetof d_ready_flags = " << offsetof(cudaq_function_entry_t, d_ready_flags) << "\n"; + std::cout << "[Debug] offsetof d_inflight_flag= " << offsetof(cudaq_function_entry_t, d_inflight_flag) << "\n"; + std::cout << "[Debug] sizeof(cudaq_handler_schema_t) = " << sizeof(cudaq_handler_schema_t) << "\n"; + + cudaq_function_entry_t* d_function_entries; + CUDA_CHECK(cudaMalloc(&d_function_entries, actual_func_count * sizeof(cudaq_function_entry_t))); + CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), + actual_func_count * sizeof(cudaq_function_entry_t), cudaMemcpyHostToDevice)); + + // E. Start GPU Dispatcher + std::cout << "[Setup] Launching Dispatcher Kernel...\n"; + cudaq_dispatch_graph_context* dispatch_ctx = nullptr; + CUDA_CHECK(cudaq_create_dispatch_graph_regular( + rx_flags_dev, tx_flags_dev, d_function_entries, actual_func_count, + d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, capture_stream, &dispatch_ctx + )); + CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); + + // F. Start CPU Infrastructure + std::cout << "[Setup] Booting Thread Pool & Polling Loop...\n"; + cudaq::qec::utils::ThreadPool pymatching_pool(4); + std::atomic system_stop{false}; + + std::thread incoming_thread([&]() { + incoming_polling_loop(predecoders, pymatching_pool, system_stop); + }); + + // ========================================================================= + // 5. The Test Stimulus (Acting as the FPGA) + // + // Original pattern: fire 8 requests (2 per decoder) all at once, + // then wait for all responses. + // ========================================================================= + std::cout << "\n[Test] Firing Syndromes...\n"; + + int requests_sent = 0; + for (int i = 0; i < 8; ++i) { + int target_decoder = i % NUM_PREDECODERS; + std::string target_func = "predecode_target_" + std::to_string(target_decoder); + + int slot = i % NUM_SLOTS; + while (rx_flags_host[slot] != 0) usleep(10); + + uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + header->function_id = fnv1a_hash(target_func); + header->arg_len = SYNDROME_FLOATS * sizeof(float); + + float* payload = reinterpret_cast(slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + for (int j = 0; j < SYNDROME_FLOATS; ++j) payload[j] = 1.0f; + + __sync_synchronize(); + rx_flags_host[slot] = reinterpret_cast(slot_data); + requests_sent++; + } + + // Wait for all 8 responses + int responses_received = 0; + for (int i = 0; i < requests_sent; ++i) { + int slot = i % NUM_SLOTS; + + int timeout = 3000; + while (tx_flags_host[slot] == 0 && timeout-- > 0) usleep(1000); + + uint64_t tv = tx_flags_host[slot]; + if (tv != 0 && (tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot << " cudaGraphLaunch error " + << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; + } else if (tv != 0) { + responses_received++; + std::cout << " -> Success: Slot " << slot << " completed the full trip!\n"; + } else { + std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + } + + tx_flags_host[slot] = 0; + } + + std::cout << "\n[Result] Processed " << responses_received << "/" << requests_sent + << " requests successfully.\n"; + + // ========================================================================= + // 6. Teardown + // ========================================================================= + std::cout << "[Teardown] Shutting down...\n"; + *shutdown_flag_host = 1; + __sync_synchronize(); + system_stop = true; + + incoming_thread.join(); + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + + // Read back dispatcher stats for sanity check + uint64_t dispatched_packets = 0; + CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; + + CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + + // Cleanup memory + cudaFreeHost((void*)rx_flags_host); + cudaFreeHost((void*)tx_flags_host); + cudaFreeHost(rx_data_host); + cudaFreeHost(shutdown_flag_host); + cudaFree(d_global_mailbox_bank); + cudaFree(d_stats); + cudaFree(d_function_entries); + cudaStreamDestroy(capture_stream); + + remove(engine_path.c_str()); + + std::cout << "Done.\n"; + return 0; +} diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 90ae5882..e91833ec 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -185,6 +185,65 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) gtest_discover_tests(test_realtime_decoding TEST_PREFIX "test_realtime_decoding." ) + # Hybrid AI predecoder + PyMatching pipeline test + # Requires TensorRT for the AI inference engine + find_path(TENSORRT_INCLUDE_DIR NvInfer.h + PATHS + ${TENSORRT_ROOT}/include + /usr/include/x86_64-linux-gnu + /usr/local/cuda/include + /usr/local/tensorrt/include + /opt/tensorrt/include + NO_DEFAULT_PATH + ) + find_library(TENSORRT_LIBRARY nvinfer + PATHS + ${TENSORRT_ROOT}/lib + /usr/lib/x86_64-linux-gnu + /usr/local/cuda/lib64 + /usr/local/tensorrt/lib + /opt/tensorrt/lib + ) + + if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) + add_executable(test_realtime_predecoder_w_pymatching + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu + ) + + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_STANDARD 17 + LINKER_LANGUAGE CUDA + ) + + target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${TENSORRT_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_SOURCE_DIR}/libs/core/include + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + + target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE + CUDA::cudart + ${TENSORRT_LIBRARY} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ) + + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + ) + + add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) + else() + message(WARNING "TensorRT not found. Skipping test_realtime_predecoder_w_pymatching.") + endif() + else() message(WARNING "cuda-quantum realtime dependency not found. " "Set CUDAQ_REALTIME_ROOT or CUDAQ_INSTALL_PREFIX to enable " From 14962810297605736ebb12ae0d855e5f861d0860 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 04:46:40 +0000 Subject: [PATCH 03/40] Enable real ONNX model inference in AI predecoder pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrade the AI predecoder test from a dummy identity TRT engine to a real d=7 r=7 surface code Z-type ONNX model. The service classes now support ONNX→TRT engine compilation, multi-output tensor bindings, and type-agnostic (INT32) I/O. The test fires 8 realistic syndrome payloads through 4 GPU pre-decoders and verifies end-to-end residual detector output handed off to simulated PyMatching workers. Signed-off-by: Scott Thornton --- .../cudaq/qec/realtime/ai_decoder_service.h | 41 +-- .../qec/realtime/ai_predecoder_service.h | 37 +-- libs/qec/lib/realtime/ai_decoder_service.cu | 199 +++++++++----- .../qec/lib/realtime/ai_predecoder_service.cu | 59 ++--- .../test_realtime_predecoder_w_pymatching.cpp | 248 ++++++++---------- libs/qec/unittests/CMakeLists.txt | 19 +- 6 files changed, 315 insertions(+), 288 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h index c5bcc92b..60c1ebc4 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -19,52 +19,55 @@ namespace cudaq::qec { class AIDecoderService { public: - // Logger interface for NvInfer class Logger : public nvinfer1::ILogger { void log(Severity severity, const char* msg) noexcept override; } static gLogger; - /// @brief Constructor - /// @param engine_path Path to the serialized TensorRT engine file + /// @brief Constructor. Accepts a serialized TRT engine (.engine/.plan) or + /// an ONNX model (.onnx) which will be compiled to a TRT engine. + /// @param model_path Path to the model file /// @param device_mailbox_slot Pointer to the specific slot in the global mailbox bank - /// that this decoder will listen to. - AIDecoderService(const std::string& engine_path, void** device_mailbox_slot); + AIDecoderService(const std::string& model_path, void** device_mailbox_slot); virtual ~AIDecoderService(); - /// @brief Captures the CUDA Graph (Gateway In -> TRT -> Gateway Out) - /// @param stream The stream to use for capture virtual void capture_graph(cudaStream_t stream); - /// @brief Returns the executable graph for the Dispatcher table cudaGraphExec_t get_executable_graph() const { return graph_exec_; } - /// @brief Returns the required input/output sizes for verification + /// @brief Size of the primary input tensor in bytes (payload from RPC) size_t get_input_size() const { return input_size_; } + + /// @brief Size of the primary output tensor in bytes (forwarded to CPU) size_t get_output_size() const { return output_size_; } protected: void load_engine(const std::string& path); + void build_engine_from_onnx(const std::string& onnx_path); + void setup_bindings(); void allocate_resources(); - // NvInfer resources std::unique_ptr runtime_; std::unique_ptr engine_; std::unique_ptr context_; - // Graph resources cudaGraphExec_t graph_exec_ = nullptr; - - // Memory resources (Resident on Device) - void** device_mailbox_slot_; // Address where Dispatcher writes the data pointer - float* d_trt_input_ = nullptr; - float* d_trt_output_ = nullptr; - // Metadata + void** device_mailbox_slot_; + void* d_trt_input_ = nullptr; // Primary input buffer + void* d_trt_output_ = nullptr; // Primary output buffer (residual_detectors) + std::vector d_aux_buffers_; // Additional I/O buffers TRT needs + + struct TensorBinding { + std::string name; + void* d_buffer = nullptr; + size_t size_bytes = 0; + bool is_input = false; + }; + std::vector all_bindings_; + size_t input_size_ = 0; size_t output_size_ = 0; - int input_idx_ = -1; - int output_idx_ = -1; }; } // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index 69b2e3cf..e4634bd9 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -11,7 +11,7 @@ #include "cudaq/qec/realtime/ai_decoder_service.h" #include -// Portable CPU Yield Macro for busy-polling (Fix #5) +// Portable CPU Yield Macro for busy-polling #if defined(__x86_64__) #include #define QEC_CPU_RELAX() _mm_pause() @@ -23,11 +23,10 @@ namespace cudaq::qec { -// Represents a single job handed off from GPU to CPU struct PreDecoderJob { - int slot_idx; // The queue index (needed for release) - void* ring_buffer_ptr; // The FPGA mapped memory address - float* inference_data; // Pointer to the TensorRT output + int slot_idx; + void* ring_buffer_ptr; + void* inference_data; // Points into the pinned output queue (type-agnostic) }; class AIPreDecoderService : public AIDecoderService { @@ -35,45 +34,33 @@ class AIPreDecoderService : public AIDecoderService { AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, int queue_depth = 16); virtual ~AIPreDecoderService(); - // Overrides the standard graph with the CPU-Handoff graph void capture_graph(cudaStream_t stream) override; - // --- CPU Thread Interfaces --- - - /// @brief Polls the circular buffer for a new job. Non-blocking. bool poll_next_job(PreDecoderJob& out_job); - - /// @brief Releases the slot back to the GPU once the Outgoing Thread finishes. void release_job(int slot_idx); - /// @brief Returns the device pointer to the queue tail index (for dispatcher backpressure). int* get_device_queue_idx() const { return d_queue_idx_; } - - /// @brief Returns the device-mapped pointer to the ready flags (for dispatcher backpressure). volatile int* get_device_ready_flags() const { return d_ready_flags_; } - - /// @brief Returns the device pointer to the in-flight flag (for single-launch guarantee). - /// Dispatcher sets to 1 before launching; output kernel clears to 0 when done. int* get_device_inflight_flag() const { return d_inflight_flag_; } private: int queue_depth_; int cpu_poll_idx_ = 0; - // --- Pinned Host Memory (The Queue) --- + // Pinned Host Memory (The Queue) volatile int* h_ready_flags_ = nullptr; void** h_ring_ptrs_ = nullptr; - float* h_outputs_ = nullptr; + void* h_outputs_ = nullptr; // Type-agnostic pinned output queue - // --- Device Mapped Pointers (For the Graph to write to) --- + // Device Mapped Pointers (For the Graph to write to) volatile int* d_ready_flags_ = nullptr; void** d_ring_ptrs_ = nullptr; - float* d_outputs_ = nullptr; + void* d_outputs_ = nullptr; - // --- Device State --- - int* d_queue_idx_ = nullptr; // Tracks the current slot tail on the GPU - int* d_claimed_slot_ = nullptr; // Passes claimed slot from input to output kernel - int* d_inflight_flag_ = nullptr; // 0 = idle, 1 = graph in flight (set by dispatcher, cleared by output kernel) + // Device State + int* d_queue_idx_ = nullptr; + int* d_claimed_slot_ = nullptr; + int* d_inflight_flag_ = nullptr; }; } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index d86c88d5..30531335 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -7,49 +7,43 @@ ******************************************************************************/ #include "cudaq/qec/realtime/ai_decoder_service.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" // For RPCHeader, RPCResponse +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include #include #include #include +#include namespace cudaq::qec { // ============================================================================= -// Gateway Kernels (The Bridge) +// Gateway Kernels // ============================================================================= -/// @brief Reads the dynamic buffer address from the mailbox and copies to fixed buffer __global__ void gateway_input_kernel( - void** mailbox_slot_ptr, // The specific slot in the Global Bank - float* trt_fixed_input, // The persistent TRT input buffer - size_t copy_size_bytes) + void** mailbox_slot_ptr, + void* trt_fixed_input, + size_t copy_size_bytes) { - // 1. Read the pointer provided by the Dispatcher void* ring_buffer_data = *mailbox_slot_ptr; - if (ring_buffer_data == nullptr) return; - // 2. Skip RPC Header to find payload const char* src = (const char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); char* dst = (char*)trt_fixed_input; - // 3. Grid-Stride Copy for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; i += blockDim.x * gridDim.x) { dst[i] = src[i]; } } -/// @brief Copies result back to Ring Buffer and writes RPC Response __global__ void gateway_output_kernel( void** mailbox_slot_ptr, - const float* trt_fixed_output, + const void* trt_fixed_output, size_t result_size_bytes) { void* ring_buffer_data = *mailbox_slot_ptr; if (ring_buffer_data == nullptr) return; - // 1. Write Result Payload (Overwriting input args in this design, or append after) - // Assuming Input/Output fit in the same slot allocation. char* dst = (char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); const char* src = (const char*)trt_fixed_output; @@ -57,18 +51,37 @@ __global__ void gateway_output_kernel( dst[i] = src[i]; } - // 2. Write RPC Response Header (Thread 0 only) if (threadIdx.x == 0 && blockIdx.x == 0) { auto* response = (cudaq::nvqlink::RPCResponse*)ring_buffer_data; response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; - response->status = 0; // Success + response->status = 0; response->result_len = static_cast(result_size_bytes); - - // Ensure memory visibility __threadfence_system(); } } +// ============================================================================= +// Helpers +// ============================================================================= + +static size_t trt_dtype_size(nvinfer1::DataType dtype) { + switch (dtype) { + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kINT64: return 8; + case nvinfer1::DataType::kBOOL: return 1; + default: return 4; + } +} + +static size_t tensor_volume(const nvinfer1::Dims& d) { + size_t v = 1; + for (int i = 0; i < d.nbDims; ++i) v *= d.d[i]; + return v; +} + // ============================================================================= // Class Implementation // ============================================================================= @@ -81,18 +94,21 @@ void AIDecoderService::Logger::log(Severity severity, const char* msg) noexcept } } -AIDecoderService::AIDecoderService(const std::string& engine_path, void** device_mailbox_slot) +AIDecoderService::AIDecoderService(const std::string& model_path, void** device_mailbox_slot) : device_mailbox_slot_(device_mailbox_slot) { - + if (std::getenv("SKIP_TRT")) { - // Skip TRT entirely; use fixed sizes for testing input_size_ = 16 * sizeof(float); output_size_ = 16 * sizeof(float); - input_idx_ = 0; - output_idx_ = 1; allocate_resources(); } else { - load_engine(engine_path); + std::string ext = model_path.substr(model_path.find_last_of('.')); + if (ext == ".onnx") { + build_engine_from_onnx(model_path); + } else { + load_engine(model_path); + } + setup_bindings(); allocate_resources(); } } @@ -101,83 +117,136 @@ AIDecoderService::~AIDecoderService() { if (graph_exec_) cudaGraphExecDestroy(graph_exec_); if (d_trt_input_) cudaFree(d_trt_input_); if (d_trt_output_) cudaFree(d_trt_output_); - // Note: We do not free device_mailbox_slot_ as it is a view into the global bank + for (auto* buf : d_aux_buffers_) cudaFree(buf); } void AIDecoderService::load_engine(const std::string& path) { std::ifstream file(path, std::ios::binary); if (!file.good()) throw std::runtime_error("Error opening engine file: " + path); - + file.seekg(0, file.end); size_t size = file.tellg(); file.seekg(0, file.beg); - + std::vector engine_data(size); file.read(engine_data.data(), size); - + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); engine_.reset(runtime_->deserializeCudaEngine(engine_data.data(), size)); context_.reset(engine_->createExecutionContext()); +} + +void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path) { + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + + auto builder = std::unique_ptr(nvinfer1::createInferBuilder(gLogger)); + auto network = std::unique_ptr(builder->createNetworkV2(0)); + auto config = std::unique_ptr(builder->createBuilderConfig()); + auto parser = std::unique_ptr( + nvonnxparser::createParser(*network, gLogger)); + + if (!parser->parseFromFile(onnx_path.c_str(), + static_cast(nvinfer1::ILogger::Severity::kWARNING))) { + throw std::runtime_error("Failed to parse ONNX file: " + onnx_path); + } + + auto plan = std::unique_ptr( + builder->buildSerializedNetwork(*network, *config)); + if (!plan) throw std::runtime_error("Failed to build TRT engine from ONNX"); + + engine_.reset(runtime_->deserializeCudaEngine(plan->data(), plan->size())); + if (!engine_) throw std::runtime_error("Failed to deserialize built engine"); + + context_.reset(engine_->createExecutionContext()); + + std::printf("[TensorRT] Built engine from ONNX: %s\n", onnx_path.c_str()); +} + +void AIDecoderService::setup_bindings() { + int num_io = engine_->getNbIOTensors(); + bool found_input = false; + bool found_output = false; + + for (int i = 0; i < num_io; ++i) { + const char* name = engine_->getIOTensorName(i); + auto mode = engine_->getTensorIOMode(name); + auto dims = engine_->getTensorShape(name); + auto dtype = engine_->getTensorDataType(name); + size_t size_bytes = tensor_volume(dims) * trt_dtype_size(dtype); + + bool is_input = (mode == nvinfer1::TensorIOMode::kINPUT); + + std::printf("[TensorRT] Binding %d: \"%s\" %s, %zu bytes\n", + i, name, is_input ? "INPUT" : "OUTPUT", size_bytes); - // Auto-detect bindings - input_idx_ = 0; // Simplified assumption, use engine_->getBindingName() in prod - output_idx_ = 1; - - // Inspect shapes (assuming static shapes for realtime) - auto input_dims = engine_->getTensorShape(engine_->getIOTensorName(input_idx_)); - auto output_dims = engine_->getTensorShape(engine_->getIOTensorName(output_idx_)); - - // Calculate sizes (Assuming float) - auto volume = [](const nvinfer1::Dims& d) { - size_t v = 1; - for (int i = 0; i < d.nbDims; ++i) v *= d.d[i]; - return v; - }; - - input_size_ = volume(input_dims) * sizeof(float); - output_size_ = volume(output_dims) * sizeof(float); + TensorBinding binding{name, nullptr, size_bytes, is_input}; + + if (is_input && !found_input) { + input_size_ = size_bytes; + found_input = true; + } else if (!is_input && !found_output) { + output_size_ = size_bytes; + found_output = true; + } + + all_bindings_.push_back(std::move(binding)); + } } void AIDecoderService::allocate_resources() { - if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) - throw std::runtime_error("Failed to allocate TRT Input"); - if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) - throw std::runtime_error("Failed to allocate TRT Output"); + if (all_bindings_.empty()) { + // SKIP_TRT fallback path + if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Input"); + if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Output"); + return; + } + + bool assigned_input = false; + bool assigned_output = false; + + for (auto& b : all_bindings_) { + void* buf = nullptr; + if (cudaMalloc(&buf, b.size_bytes) != cudaSuccess) + throw std::runtime_error("Failed to allocate buffer for " + b.name); + cudaMemset(buf, 0, b.size_bytes); + b.d_buffer = buf; + + if (b.is_input && !assigned_input) { + d_trt_input_ = buf; + assigned_input = true; + } else if (!b.is_input && !assigned_output) { + d_trt_output_ = buf; + assigned_output = true; + } else { + d_aux_buffers_.push_back(buf); + } + } } void AIDecoderService::capture_graph(cudaStream_t stream) { - // 1. Bind TensorRT to our fixed buffers - context_->setTensorAddress(engine_->getIOTensorName(input_idx_), d_trt_input_); - context_->setTensorAddress(engine_->getIOTensorName(output_idx_), d_trt_output_); + // Bind all tensors to TRT context + for (auto& b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); + } - // 2. Warmup context_->enqueueV3(stream); cudaStreamSynchronize(stream); - // 3. Capture cudaGraph_t graph; cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); - // --- Node A: Gateway Input --- - // Reads from *device_mailbox_slot_ -> Writes to d_trt_input_ gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_input_, input_size_); - - // --- Node B: TensorRT --- context_->enqueueV3(stream); - - // --- Node C: Gateway Output --- - // Reads from d_trt_output_ -> Writes back to *device_mailbox_slot_ gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_output_, output_size_); cudaStreamEndCapture(stream, &graph); - // 4. Instantiate for Device Launch cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); - - // 5. Upload & Cleanup + cudaGraphUpload(graph_exec_, stream); cudaGraphDestroy(graph); - cudaStreamSynchronize(stream); } diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index 7c83bfd1..aafa40e5 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -7,13 +7,12 @@ ******************************************************************************/ #include "cudaq/qec/realtime/ai_predecoder_service.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" // RPCHeader for device code -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" // cudaq_function_entry_t for debug check +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" #include #include #include -// Internal Macro to catch silent memory allocation failures (Fix #2) #define SERVICE_CUDA_CHECK(call) \ do { \ cudaError_t err = call; \ @@ -25,12 +24,12 @@ namespace cudaq::qec { // ============================================================================= -// Kernels specific to the PreDecoder +// Kernels // ============================================================================= __global__ void predecoder_input_kernel( void** mailbox_slot_ptr, int* d_queue_idx, volatile int* d_ready_flags, - void** d_ring_ptrs, float* trt_input, size_t input_size_bytes, + void** d_ring_ptrs, void* trt_input, size_t input_size_bytes, int* d_claimed_slot) { __shared__ int slot_idx; @@ -38,17 +37,9 @@ __global__ void predecoder_input_kernel( if (threadIdx.x == 0 && blockIdx.x == 0) { ring_ptr = *mailbox_slot_ptr; - // Safe to read non-atomically: dispatcher guarantees at most one - // graph instance in flight per predecoder via d_inflight_flag. slot_idx = *d_queue_idx; - - // Publish the claimed slot so the output kernel can read it. - // This survives across graph nodes (device global memory). *d_claimed_slot = slot_idx; - // Defense-in-depth: if the slot is still owned by the CPU, bail out. - // Under normal operation this should never fire because the dispatcher - // already checked d_ready_flags before launching. if (d_ready_flags[slot_idx] == 1) { ring_ptr = nullptr; } else { @@ -59,7 +50,6 @@ __global__ void predecoder_input_kernel( if (!ring_ptr) return; - // Copy Data from Ring Buffer to TRT const char* src = (const char*)ring_ptr + sizeof(cudaq::nvqlink::RPCHeader); char* dst = (char*)trt_input; for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < input_size_bytes; i += blockDim.x * gridDim.x) { @@ -69,35 +59,29 @@ __global__ void predecoder_input_kernel( __global__ void predecoder_output_kernel( int* d_claimed_slot, int* d_queue_idx, int queue_depth, - volatile int* d_ready_flags, float* d_outputs, const float* trt_output, + volatile int* d_ready_flags, void* d_outputs, const void* trt_output, size_t output_size_bytes, volatile int* d_inflight_flag) { - // Read the slot that the input kernel claimed (fixes review issue #2: - // no stale re-read of d_queue_idx which could race under concurrent launches). int slot_idx = *d_claimed_slot; - // Direct D2H Copy (Writing to mapped pinned memory) char* dst = (char*)d_outputs + (slot_idx * output_size_bytes); const char* src = (const char*)trt_output; for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < output_size_bytes; i += blockDim.x * gridDim.x) { dst[i] = src[i]; } - __syncthreads(); // Ensure all threads finished copying (review issue #5) - __threadfence_system(); // Make D2H writes visible to Host over PCIe + __syncthreads(); + __threadfence_system(); - // Signal CPU, advance queue index, and release the inflight lock if (threadIdx.x == 0 && blockIdx.x == 0) { d_ready_flags[slot_idx] = 1; *d_queue_idx = (slot_idx + 1) % queue_depth; - - __threadfence_system(); // Ensure queue advance is visible before clearing flag - *d_inflight_flag = 0; // Release: dispatcher may now launch this graph again + __threadfence_system(); + *d_inflight_flag = 0; } } -// Simple passthrough kernel: copies input buffer to output buffer (replaces TRT for testing) -__global__ void passthrough_copy_kernel(float* dst, const float* src, size_t num_bytes) { +__global__ void passthrough_copy_kernel(void* dst, const void* src, size_t num_bytes) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_bytes; i += blockDim.x * gridDim.x) { ((char*)dst)[i] = ((const char*)src)[i]; } @@ -110,28 +94,22 @@ __global__ void passthrough_copy_kernel(float* dst, const float* src, size_t num AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, int queue_depth) : AIDecoderService(path, mailbox), queue_depth_(queue_depth) { - // Fix #2: Wrapped all allocations in SERVICE_CUDA_CHECK - // 1. Allocate Pinned Host Memory Queue SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ready_flags_, queue_depth_ * sizeof(int), cudaHostAllocMapped)); SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, queue_depth_ * sizeof(void*), cudaHostAllocMapped)); SERVICE_CUDA_CHECK(cudaHostAlloc(&h_outputs_, queue_depth_ * get_output_size(), cudaHostAllocMapped)); memset((void*)h_ready_flags_, 0, queue_depth_ * sizeof(int)); - // 2. Map Device Pointers SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ready_flags_, (void*)h_ready_flags_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_outputs_, (void*)h_outputs_, 0)); - // 3. Allocate GPU State Trackers SERVICE_CUDA_CHECK(cudaMalloc(&d_queue_idx_, sizeof(int))); SERVICE_CUDA_CHECK(cudaMemset(d_queue_idx_, 0, sizeof(int))); - // 4. Slot handoff buffer (input kernel writes, output kernel reads) SERVICE_CUDA_CHECK(cudaMalloc(&d_claimed_slot_, sizeof(int))); SERVICE_CUDA_CHECK(cudaMemset(d_claimed_slot_, 0, sizeof(int))); - // 5. In-flight flag (dispatcher sets 1 before launch, output kernel clears 0) SERVICE_CUDA_CHECK(cudaMalloc(&d_inflight_flag_, sizeof(int))); SERVICE_CUDA_CHECK(cudaMemset(d_inflight_flag_, 0, sizeof(int))); } @@ -149,9 +127,10 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream) { bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); if (!skip_trt) { - context_->setTensorAddress(engine_->getIOTensorName(input_idx_), d_trt_input_); - context_->setTensorAddress(engine_->getIOTensorName(output_idx_), d_trt_output_); - context_->enqueueV3(stream); // Warmup + for (auto& b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); + } + context_->enqueueV3(stream); } SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -164,7 +143,6 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream) { d_claimed_slot_); if (skip_trt) { - // Replace TRT with a simple passthrough copy passthrough_copy_kernel<<<1, 128, 0, stream>>>( d_trt_output_, d_trt_input_, get_input_size()); } else { @@ -178,7 +156,6 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream) { SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); - // Instantiate for device-side launch cudaError_t inst_err = cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); if (inst_err != cudaSuccess) { cudaGraphDestroy(graph); @@ -193,15 +170,11 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream) { bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { if (h_ready_flags_[cpu_poll_idx_] == 1) { - - // Fix #3: ARM Portability - Memory Acquire Fence - // Ensures that the reads to h_ring_ptrs_ and h_outputs_ are not - // speculatively executed before the h_ready_flags_ check clears. std::atomic_thread_fence(std::memory_order_acquire); out_job.slot_idx = cpu_poll_idx_; out_job.ring_buffer_ptr = h_ring_ptrs_[cpu_poll_idx_]; - out_job.inference_data = h_outputs_ + (cpu_poll_idx_ * (get_output_size() / sizeof(float))); + out_job.inference_data = static_cast(h_outputs_) + (cpu_poll_idx_ * get_output_size()); cpu_poll_idx_ = (cpu_poll_idx_ + 1) % queue_depth_; return true; @@ -210,8 +183,6 @@ bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { } void AIPreDecoderService::release_job(int slot_idx) { - // Memory Order Release guarantees that PyMatching results written - // to other buffers are strictly visible before we flag the slot as free. __atomic_store_n(&h_ready_flags_[slot_idx], 0, __ATOMIC_RELEASE); } diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 0af289b4..f3e02d86 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -7,10 +7,16 @@ ******************************************************************************/ /******************************************************************************* - * Standalone Hybrid Realtime Pipeline Test - * Demonstrates: + * Hybrid Realtime Pipeline Test with Real ONNX Pre-Decoder + * + * Uses model1_d7_r7_unified_Z_batch1.onnx: + * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) + * Output: residual_detectors [1, 336] INT32 (1344 bytes) + * Output: logical_frame [1] INT32 (4 bytes) + * + * Pipeline: * 1. Ring Buffer setup - * 2. Dispatcher Kernel -> 4x AIPreDecoderService instances (GPU) + * 2. Dispatcher Kernel -> 4x AIPreDecoderService instances (GPU, TRT from ONNX) * 3. GPU -> CPU N-Deep Pinned Memory Queue handoff * 4. Dedicated Polling Thread -> 4-Worker PyMatching Thread Pool * 5. CPU Workers closing the transaction (Setting TX flags) @@ -23,12 +29,10 @@ #include #include #include -#include +#include #include -#include -// Ensure graph-based dispatch API is visible (guarded by CUDA_VERSION in cudaq_realtime.h) #ifndef CUDA_VERSION #define CUDA_VERSION 13000 #endif @@ -51,22 +55,25 @@ using namespace cudaq::qec; // ============================================================================= -// Configuration & Globals +// Configuration // ============================================================================= constexpr size_t NUM_SLOTS = 64; -constexpr size_t SLOT_SIZE = 256; +constexpr size_t SLOT_SIZE = 4096; // Enough for RPC header + 2016-byte payload + response constexpr int NUM_PREDECODERS = 4; constexpr int QUEUE_DEPTH = 16; -constexpr int SYNDROME_FLOATS = 16; // 64 bytes -// Helper to generate Function IDs +// d=7, r=7 surface code Z-type model dimensions +constexpr int MEAS_QUBITS = 72; +constexpr int NUM_ROUNDS = 7; +constexpr int INPUT_ELEMENTS = MEAS_QUBITS * NUM_ROUNDS; // 504 int32s = 2016 bytes +constexpr int RESIDUAL_DETECTORS = 336; // 336 int32s = 1344 bytes + constexpr std::uint32_t fnv1a_hash(std::string_view str) { std::uint32_t hash = 0x811c9dc5; for (char c : str) { hash ^= static_cast(c); hash *= 0x01000193; } return hash; } -// Global context to pass to workers without massive argument lists struct SystemContext { volatile uint64_t* tx_flags_host = nullptr; uint8_t* rx_data_host = nullptr; @@ -75,44 +82,39 @@ struct SystemContext { SystemContext g_sys_ctx; // ============================================================================= -// 1. Thread Pool Worker (PyMatching Simulation) +// Thread Pool Worker (PyMatching Simulation) // ============================================================================= void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder) { - // A. "PyMatching" CPU Algorithm - // Convert 16 floats (logits) back to 16 bits - size_t num_elements = predecoder->get_output_size() / sizeof(float); - std::vector final_corrections(num_elements); - - // Simulation placeholder: in production this would run the PyMatching decoder. - for (size_t i = 0; i < num_elements; ++i) { - final_corrections[i] = (job.inference_data[i] > 0.5f) ? 1 : 0; + size_t num_detectors = predecoder->get_output_size() / sizeof(int32_t); + const int32_t* residual = static_cast(job.inference_data); + + // Simulate PyMatching: count non-zero detectors and produce corrections + int nonzero = 0; + for (size_t i = 0; i < num_detectors; ++i) { + if (residual[i] != 0) nonzero++; } - // B. Write RPC Response + // Write RPC Response with a simple summary (correction count) char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); - std::memcpy(response_payload, final_corrections.data(), final_corrections.size()); + int32_t correction_count = nonzero; + std::memcpy(response_payload, &correction_count, sizeof(int32_t)); auto* header = static_cast(job.ring_buffer_ptr); header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; header->status = 0; - header->result_len = static_cast(final_corrections.size()); + header->result_len = sizeof(int32_t); std::atomic_thread_fence(std::memory_order_release); - // C. Calculate the original Ring Buffer Slot Index size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; - - // D. Release GPU Queue Slot predecoder->release_job(job.slot_idx); - // E. Acknowledge to FPGA - // Reconstruct the original rx_value (which is just the pointer cast to uint64_t) uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); g_sys_ctx.tx_flags_host[slot_idx] = rx_value; } // ============================================================================= -// 2. Incoming Polling Thread +// Incoming Polling Thread // ============================================================================= void incoming_polling_loop( std::vector>& predecoders, @@ -122,11 +124,8 @@ void incoming_polling_loop( PreDecoderJob job; while (!stop_signal.load(std::memory_order_relaxed)) { bool found_work = false; - - // Round-robin poll across all 4 PreDecoder instances for (auto& predecoder : predecoders) { if (predecoder->poll_next_job(job)) { - // Enqueue the job. Capture raw pointer to specific predecoder instance. AIPreDecoderService* pd_ptr = predecoder.get(); thread_pool.enqueue([job, pd_ptr]() { pymatching_worker_task(job, pd_ptr); @@ -134,8 +133,6 @@ void incoming_polling_loop( found_work = true; } } - - // If all 4 queues were empty, yield the pipeline if (!found_work) { QEC_CPU_RELAX(); } @@ -143,48 +140,31 @@ void incoming_polling_loop( } // ============================================================================= -// 3. Helper: Dummy TRT Engine Generator +// Generate Realistic Syndrome Data // ============================================================================= -void create_dummy_engine(const std::string& filepath) { - class Logger : public nvinfer1::ILogger { - void log(Severity severity, const char* msg) noexcept override {} - } logger; - - auto builder = std::unique_ptr(nvinfer1::createInferBuilder(logger)); - uint32_t flag = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = std::unique_ptr(builder->createNetworkV2(flag)); - auto config = std::unique_ptr(builder->createBuilderConfig()); - - // Identity network: 16 floats in, 16 floats out - auto input = network->addInput("input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims{1, {SYNDROME_FLOATS}}); - auto identity = network->addIdentity(*input); - identity->getOutput(0)->setName("output"); - network->markOutput(*identity->getOutput(0)); - - auto plan = std::unique_ptr(builder->buildSerializedNetwork(*network, *config)); - - std::ofstream file(filepath, std::ios::binary); - file.write(static_cast(plan->data()), plan->size()); +void fill_measurement_payload(int32_t* payload, std::mt19937& rng, + double error_rate = 0.01) { + std::bernoulli_distribution err_dist(error_rate); + for (int i = 0; i < INPUT_ELEMENTS; ++i) { + payload[i] = err_dist(rng) ? 1 : 0; + } } // ============================================================================= -// 4. Main Application +// Main // ============================================================================= int main() { - std::cout << "--- Initializing Hybrid AI Realtime Pipeline ---\n"; + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (d=7 r=7 Z) ---\n"; CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - // A. Generate Dummy Model - std::string engine_path = "predecoder_dummy.engine"; - create_dummy_engine(engine_path); + std::string onnx_path = ONNX_MODEL_PATH; + std::cout << "[Setup] Building TRT engines from: " << onnx_path << "\n"; - // B. Allocate Ring Buffers + // Allocate Ring Buffers void* tmp = nullptr; - volatile uint64_t *rx_flags_host, *tx_flags_host; volatile uint64_t *rx_flags_dev, *tx_flags_dev; - uint8_t *rx_data_host; - uint8_t *rx_data_dev; + uint8_t *rx_data_host, *rx_data_dev; CUDA_CHECK(cudaHostAlloc(&tmp, NUM_SLOTS * sizeof(uint64_t), cudaHostAllocMapped)); rx_flags_host = static_cast(tmp); @@ -203,7 +183,7 @@ int main() { g_sys_ctx.tx_flags_host = tx_flags_host; g_sys_ctx.rx_data_host = rx_data_host; - // C. Allocate Global Mailbox Bank & Control signals + // Allocate Global Mailbox Bank & Control signals void** d_global_mailbox_bank; CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, NUM_PREDECODERS * sizeof(void*))); CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, NUM_PREDECODERS * sizeof(void*))); @@ -218,8 +198,8 @@ int main() { CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - // D. Initialize the 4 AIPreDecoder Instances - std::cout << "[Setup] Capturing 4x AIPreDecoder Graphs...\n"; + // Initialize 4 AIPreDecoder Instances from ONNX + std::cout << "[Setup] Capturing 4x AIPreDecoder Graphs (ONNX -> TRT)...\n"; cudaStream_t capture_stream; CUDA_CHECK(cudaStreamCreate(&capture_stream)); @@ -228,12 +208,15 @@ int main() { for (int i = 0; i < NUM_PREDECODERS; ++i) { void** my_mailbox = d_global_mailbox_bank + i; - auto pd = std::make_unique(engine_path, my_mailbox, QUEUE_DEPTH); + auto pd = std::make_unique(onnx_path, my_mailbox, QUEUE_DEPTH); + + std::cout << "[Setup] Decoder " << i + << ": input_size=" << pd->get_input_size() + << " output_size=" << pd->get_output_size() << "\n"; + pd->capture_graph(capture_stream); cudaGraphExec_t gexec = pd->get_executable_graph(); - std::cout << "[Setup] Decoder " << i << ": graph_exec=" << gexec << "\n"; - std::string func_name = "predecode_target_" + std::to_string(i); function_entries[i].function_id = fnv1a_hash(func_name); function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; @@ -247,24 +230,12 @@ int main() { } int actual_func_count = NUM_PREDECODERS; - // Print struct layout for host/device verification - std::cout << "[Debug] sizeof(cudaq_function_entry_t) = " << sizeof(cudaq_function_entry_t) << "\n"; - std::cout << "[Debug] offsetof handler = " << offsetof(cudaq_function_entry_t, handler) << "\n"; - std::cout << "[Debug] offsetof function_id = " << offsetof(cudaq_function_entry_t, function_id) << "\n"; - std::cout << "[Debug] offsetof dispatch_mode = " << offsetof(cudaq_function_entry_t, dispatch_mode) << "\n"; - std::cout << "[Debug] offsetof schema = " << offsetof(cudaq_function_entry_t, schema) << "\n"; - std::cout << "[Debug] offsetof mailbox_idx = " << offsetof(cudaq_function_entry_t, mailbox_idx) << "\n"; - std::cout << "[Debug] offsetof d_queue_idx = " << offsetof(cudaq_function_entry_t, d_queue_idx) << "\n"; - std::cout << "[Debug] offsetof d_ready_flags = " << offsetof(cudaq_function_entry_t, d_ready_flags) << "\n"; - std::cout << "[Debug] offsetof d_inflight_flag= " << offsetof(cudaq_function_entry_t, d_inflight_flag) << "\n"; - std::cout << "[Debug] sizeof(cudaq_handler_schema_t) = " << sizeof(cudaq_handler_schema_t) << "\n"; - cudaq_function_entry_t* d_function_entries; CUDA_CHECK(cudaMalloc(&d_function_entries, actual_func_count * sizeof(cudaq_function_entry_t))); CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), actual_func_count * sizeof(cudaq_function_entry_t), cudaMemcpyHostToDevice)); - // E. Start GPU Dispatcher + // Start GPU Dispatcher std::cout << "[Setup] Launching Dispatcher Kernel...\n"; cudaq_dispatch_graph_context* dispatch_ctx = nullptr; CUDA_CHECK(cudaq_create_dispatch_graph_regular( @@ -273,7 +244,7 @@ int main() { )); CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); - // F. Start CPU Infrastructure + // Start CPU Infrastructure std::cout << "[Setup] Booting Thread Pool & Polling Loop...\n"; cudaq::qec::utils::ThreadPool pymatching_pool(4); std::atomic system_stop{false}; @@ -283,64 +254,81 @@ int main() { }); // ========================================================================= - // 5. The Test Stimulus (Acting as the FPGA) - // - // Original pattern: fire 8 requests (2 per decoder) all at once, - // then wait for all responses. + // Test Stimulus: Fire requests in batches of NUM_PREDECODERS. + // The dispatcher advances its slot pointer linearly and only retries + // while rx_value != 0, so we must wait for each batch to complete + // before firing the next to avoid stranding un-dispatched slots. // ========================================================================= - std::cout << "\n[Test] Firing Syndromes...\n"; - + constexpr int TOTAL_REQUESTS = 8; + constexpr int BATCH_SIZE = NUM_PREDECODERS; + std::cout << "\n[Test] Firing " << TOTAL_REQUESTS + << " syndromes in batches of " << BATCH_SIZE + << " (d=7, r=7, error_rate=0.01)...\n"; + + std::mt19937 rng(42); + const size_t payload_bytes = INPUT_ELEMENTS * sizeof(int32_t); int requests_sent = 0; - for (int i = 0; i < 8; ++i) { - int target_decoder = i % NUM_PREDECODERS; - std::string target_func = "predecode_target_" + std::to_string(target_decoder); + int responses_received = 0; - int slot = i % NUM_SLOTS; - while (rx_flags_host[slot] != 0) usleep(10); + for (int batch_start = 0; batch_start < TOTAL_REQUESTS; batch_start += BATCH_SIZE) { + int batch_end = std::min(batch_start + BATCH_SIZE, TOTAL_REQUESTS); + int batch_count = batch_end - batch_start; - uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); - auto* header = reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - header->function_id = fnv1a_hash(target_func); - header->arg_len = SYNDROME_FLOATS * sizeof(float); + // Fire one batch + for (int i = batch_start; i < batch_end; ++i) { + int target_decoder = i % NUM_PREDECODERS; + std::string target_func = "predecode_target_" + std::to_string(target_decoder); - float* payload = reinterpret_cast(slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - for (int j = 0; j < SYNDROME_FLOATS; ++j) payload[j] = 1.0f; + int slot = i % NUM_SLOTS; + while (rx_flags_host[slot] != 0) usleep(10); - __sync_synchronize(); - rx_flags_host[slot] = reinterpret_cast(slot_data); - requests_sent++; - } + uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + header->function_id = fnv1a_hash(target_func); + header->arg_len = static_cast(payload_bytes); - // Wait for all 8 responses - int responses_received = 0; - for (int i = 0; i < requests_sent; ++i) { - int slot = i % NUM_SLOTS; - - int timeout = 3000; - while (tx_flags_host[slot] == 0 && timeout-- > 0) usleep(1000); - - uint64_t tv = tx_flags_host[slot]; - if (tv != 0 && (tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot << " cudaGraphLaunch error " - << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; - } else if (tv != 0) { - responses_received++; - std::cout << " -> Success: Slot " << slot << " completed the full trip!\n"; - } else { - std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + int32_t* payload = reinterpret_cast(slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, rng, 0.01); + + __sync_synchronize(); + rx_flags_host[slot] = reinterpret_cast(slot_data); + requests_sent++; } - tx_flags_host[slot] = 0; + // Wait for this batch to complete + for (int i = batch_start; i < batch_end; ++i) { + int slot = i % NUM_SLOTS; + + int timeout = 10000; + while (tx_flags_host[slot] == 0 && timeout-- > 0) usleep(1000); + + uint64_t tv = tx_flags_host[slot]; + if (tv != 0 && (tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot << " cudaGraphLaunch error " + << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; + } else if (tv != 0) { + responses_received++; + uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); + int32_t correction_count = 0; + std::memcpy(&correction_count, + slot_data + sizeof(cudaq::nvqlink::RPCResponse), + sizeof(int32_t)); + std::cout << " -> Slot " << slot << ": OK, residual non-zero detectors = " + << correction_count << "\n"; + } else { + std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + } + + tx_flags_host[slot] = 0; + } } std::cout << "\n[Result] Processed " << responses_received << "/" << requests_sent << " requests successfully.\n"; - // ========================================================================= - // 6. Teardown - // ========================================================================= + // Teardown std::cout << "[Teardown] Shutting down...\n"; *shutdown_flag_host = 1; __sync_synchronize(); @@ -349,14 +337,12 @@ int main() { incoming_thread.join(); CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - // Read back dispatcher stats for sanity check uint64_t dispatched_packets = 0; CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); - // Cleanup memory cudaFreeHost((void*)rx_flags_host); cudaFreeHost((void*)tx_flags_host); cudaFreeHost(rx_data_host); @@ -366,8 +352,6 @@ int main() { cudaFree(d_function_entries); cudaStreamDestroy(capture_stream); - remove(engine_path.c_str()); - std::cout << "Done.\n"; return 0; } diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index e91833ec..28aa1dce 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -186,7 +186,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) TEST_PREFIX "test_realtime_decoding." ) # Hybrid AI predecoder + PyMatching pipeline test - # Requires TensorRT for the AI inference engine + # Requires TensorRT + ONNX parser for building engines from ONNX models find_path(TENSORRT_INCLUDE_DIR NvInfer.h PATHS ${TENSORRT_ROOT}/include @@ -204,8 +204,16 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) /usr/local/tensorrt/lib /opt/tensorrt/lib ) + find_library(TENSORRT_ONNX_PARSER_LIBRARY nvonnxparser + PATHS + ${TENSORRT_ROOT}/lib + /usr/lib/x86_64-linux-gnu + /usr/local/cuda/lib64 + /usr/local/tensorrt/lib + /opt/tensorrt/lib + ) - if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) + if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY AND TENSORRT_ONNX_PARSER_LIBRARY) add_executable(test_realtime_predecoder_w_pymatching ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu @@ -219,6 +227,10 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) LINKER_LANGUAGE CUDA ) + target_compile_definitions(test_realtime_predecoder_w_pymatching PRIVATE + ONNX_MODEL_PATH="${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/model1_d7_r7_unified_Z_batch1.onnx" + ) + target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE ${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} @@ -230,6 +242,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE CUDA::cudart ${TENSORRT_LIBRARY} + ${TENSORRT_ONNX_PARSER_LIBRARY} ${CUDAQ_REALTIME_LIBRARY} ${CUDAQ_REALTIME_DISPATCH_LIBRARY} ) @@ -241,7 +254,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) else() - message(WARNING "TensorRT not found. Skipping test_realtime_predecoder_w_pymatching.") + message(WARNING "TensorRT or ONNX parser not found. Skipping test_realtime_predecoder_w_pymatching.") endif() else() From ffaab3dada097f60bbba42647562249f1e51ded7 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 05:04:20 +0000 Subject: [PATCH 04/40] Integrate real PyMatching MWPM decoder into AI predecoder pipeline Replace the simulated PyMatching worker with a real MWPM decoder using the d=7 surface code's static Z parity check matrix via the cudaq-qec decoder plugin system. The 336 residual detectors from the AI predecoder are sliced into 14 spatial rounds of 24 Z-stabilizer syndromes and decoded independently. A mutex protects the decoder for thread safety across the 4-worker thread pool. Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 84 +++++++++++++++---- libs/qec/unittests/CMakeLists.txt | 10 ++- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index f3e02d86..e8ce1678 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -42,6 +43,8 @@ #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" #include "cudaq/qec/utils/thread_pool.h" +#include "cudaq/qec/code.h" +#include "cudaq/qec/decoder.h" #define CUDA_CHECK(call) \ do { \ @@ -82,27 +85,54 @@ struct SystemContext { SystemContext g_sys_ctx; // ============================================================================= -// Thread Pool Worker (PyMatching Simulation) +// Thread Pool Worker (Real PyMatching MWPM Decoder) // ============================================================================= -void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder) { + +// d=7 surface code: 24 Z stabilizers per spatial slice +constexpr int Z_STABILIZERS = 24; +constexpr int NUM_SPATIAL_SLICES = RESIDUAL_DETECTORS / Z_STABILIZERS; // 336/24 = 14 + +void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, + cudaq::qec::decoder* pm_decoder, std::mutex* decode_mtx) { size_t num_detectors = predecoder->get_output_size() / sizeof(int32_t); const int32_t* residual = static_cast(job.inference_data); - // Simulate PyMatching: count non-zero detectors and produce corrections - int nonzero = 0; - for (size_t i = 0; i < num_detectors; ++i) { - if (residual[i] != 0) nonzero++; + // Decode each spatial slice of Z-stabilizer detectors independently + // using code-capacity PyMatching (H_z is [24 x 49]) + int total_corrections = 0; + bool all_converged = true; + + for (int s = 0; s < NUM_SPATIAL_SLICES; ++s) { + const int32_t* slice = residual + s * Z_STABILIZERS; + std::vector syndrome(Z_STABILIZERS); + for (int i = 0; i < Z_STABILIZERS; ++i) + syndrome[i] = static_cast(slice[i]); + + cudaq::qec::decoder_result result; + { + std::lock_guard lock(*decode_mtx); + result = pm_decoder->decode(syndrome); + } + + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5f) total_corrections++; } - // Write RPC Response with a simple summary (correction count) + // Write RPC Response + struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; + }; + DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); - int32_t correction_count = nonzero; - std::memcpy(response_payload, &correction_count, sizeof(int32_t)); + std::memcpy(response_payload, &resp_data, sizeof(resp_data)); auto* header = static_cast(job.ring_buffer_ptr); header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; header->status = 0; - header->result_len = sizeof(int32_t); + header->result_len = sizeof(resp_data); std::atomic_thread_fence(std::memory_order_release); @@ -119,6 +149,8 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder) void incoming_polling_loop( std::vector>& predecoders, cudaq::qec::utils::ThreadPool& thread_pool, + cudaq::qec::decoder* pm_decoder, + std::mutex& decode_mtx, std::atomic& stop_signal) { PreDecoderJob job; @@ -127,8 +159,8 @@ void incoming_polling_loop( for (auto& predecoder : predecoders) { if (predecoder->poll_next_job(job)) { AIPreDecoderService* pd_ptr = predecoder.get(); - thread_pool.enqueue([job, pd_ptr]() { - pymatching_worker_task(job, pd_ptr); + thread_pool.enqueue([job, pd_ptr, pm_decoder, &decode_mtx]() { + pymatching_worker_task(job, pd_ptr, pm_decoder, &decode_mtx); }); found_work = true; } @@ -160,6 +192,18 @@ int main() { std::string onnx_path = ONNX_MODEL_PATH; std::cout << "[Setup] Building TRT engines from: " << onnx_path << "\n"; + // Create PyMatching decoder from d=7 surface code Z parity check matrix + std::cout << "[Setup] Creating PyMatching decoder (d=7 surface code, Z stabilizers)...\n"; + auto surface_code = cudaq::qec::get_code("surface_code", {{"distance", 7}}); + auto H_z = surface_code->get_parity_z(); + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " << H_z.shape()[1] << "]\n"; + + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + auto pm_decoder = cudaq::qec::decoder::get("pymatching", H_z, pm_params); + std::mutex decode_mtx; + std::cout << "[Setup] PyMatching decoder ready.\n"; + // Allocate Ring Buffers void* tmp = nullptr; volatile uint64_t *rx_flags_host, *tx_flags_host; @@ -249,8 +293,9 @@ int main() { cudaq::qec::utils::ThreadPool pymatching_pool(4); std::atomic system_stop{false}; + cudaq::qec::decoder* pm_raw = pm_decoder.get(); std::thread incoming_thread([&]() { - incoming_polling_loop(predecoders, pymatching_pool, system_stop); + incoming_polling_loop(predecoders, pymatching_pool, pm_raw, decode_mtx, system_stop); }); // ========================================================================= @@ -259,7 +304,7 @@ int main() { // while rx_value != 0, so we must wait for each batch to complete // before firing the next to avoid stranding un-dispatched slots. // ========================================================================= - constexpr int TOTAL_REQUESTS = 8; + constexpr int TOTAL_REQUESTS = 20; constexpr int BATCH_SIZE = NUM_PREDECODERS; std::cout << "\n[Test] Firing " << TOTAL_REQUESTS << " syndromes in batches of " << BATCH_SIZE @@ -311,12 +356,15 @@ int main() { } else if (tv != 0) { responses_received++; uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); - int32_t correction_count = 0; - std::memcpy(&correction_count, + int32_t corrections = 0, converged = 0; + std::memcpy(&corrections, slot_data + sizeof(cudaq::nvqlink::RPCResponse), sizeof(int32_t)); - std::cout << " -> Slot " << slot << ": OK, residual non-zero detectors = " - << correction_count << "\n"; + std::memcpy(&converged, + slot_data + sizeof(cudaq::nvqlink::RPCResponse) + sizeof(int32_t), + sizeof(int32_t)); + std::cout << " -> Slot " << slot << ": OK, corrections=" << corrections + << " converged=" << (converged ? "yes" : "no") << "\n"; } else { std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; } diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 28aa1dce..5c40b3db 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -245,11 +245,17 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ${TENSORRT_ONNX_PARSER_LIBRARY} ${CUDAQ_REALTIME_LIBRARY} ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + cudaq-qec + cudaq::cudaq + ) + + target_link_directories(test_realtime_predecoder_w_pymatching PRIVATE + ${CMAKE_BINARY_DIR}/lib ) set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" ) add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) From 35792ec3c056029c5e12723b06fbc9a71bcd9049 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 18:00:11 +0000 Subject: [PATCH 05/40] Refactor predecoder test into multi-distance PipelineConfig architecture Extract hard-coded d=7 parameters into a PipelineConfig struct with static factory methods for d=7, d=13, d=21, and d=31 surface codes. Runtime config selection via command-line argument (d7|d13|d21|d31) preserves existing d=7 functionality while enabling larger-distance experiments. ONNX_MODEL_PATH replaced with ONNX_MODEL_DIR to support per-config model filenames. Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 314 +++++++++++++----- libs/qec/unittests/CMakeLists.txt | 2 +- 2 files changed, 229 insertions(+), 87 deletions(-) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index e8ce1678..c6a453d7 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -7,19 +7,38 @@ ******************************************************************************/ /******************************************************************************* - * Hybrid Realtime Pipeline Test with Real ONNX Pre-Decoder + * Hybrid Realtime Pipeline Test with Real ONNX Pre-Decoder + PyMatching * - * Uses model1_d7_r7_unified_Z_batch1.onnx: - * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) - * Output: residual_detectors [1, 336] INT32 (1344 bytes) - * Output: logical_frame [1] INT32 (4 bytes) + * Supports multiple surface code configurations: + * + * d=7 r=7 (model1_d7_r7_unified_Z_batch1.onnx) + * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) + * Output: residual_detectors [1, 336] INT32 (1344 bytes) + * Output: logical_frame [1] INT32 (4 bytes) + * + * d=13 r=13 (model1_d13_r13_unified_Z_batch1.onnx) + * Input: all_measurements [1, 252, 13] INT32 (13104 bytes) + * Output: residual_detectors [1, 2184] INT32 (8736 bytes) + * Output: logical_frame [1] INT32 (4 bytes) + * + * d=21 r=21 (model1_d21_r21_unified_Z_batch1.onnx) + * Input: all_measurements [1, 660, 21] INT32 (55440 bytes) + * Output: residual_detectors [1, 9240] INT32 (36960 bytes) + * Output: logical_frame [1] INT32 (4 bytes) + * + * d=31 r=31 (model1_d31_r31_unified_Z_batch1.onnx) + * Input: all_measurements [1, 1440, 31] INT32 (178560 bytes) + * Output: residual_detectors [1, 29760] INT32 (119040 bytes) + * Output: logical_frame [1] INT32 (4 bytes) * * Pipeline: * 1. Ring Buffer setup - * 2. Dispatcher Kernel -> 4x AIPreDecoderService instances (GPU, TRT from ONNX) + * 2. Dispatcher Kernel -> Nx AIPreDecoderService instances (GPU, TRT from ONNX) * 3. GPU -> CPU N-Deep Pinned Memory Queue handoff - * 4. Dedicated Polling Thread -> 4-Worker PyMatching Thread Pool + * 4. Dedicated Polling Thread -> Worker PyMatching Thread Pool * 5. CPU Workers closing the transaction (Setting TX flags) + * + * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] ******************************************************************************/ #include @@ -31,6 +50,7 @@ #include #include #include +#include #include @@ -58,18 +78,103 @@ using namespace cudaq::qec; // ============================================================================= -// Configuration +// Pipeline Configuration // ============================================================================= + constexpr size_t NUM_SLOTS = 64; -constexpr size_t SLOT_SIZE = 4096; // Enough for RPC header + 2016-byte payload + response -constexpr int NUM_PREDECODERS = 4; -constexpr int QUEUE_DEPTH = 16; -// d=7, r=7 surface code Z-type model dimensions -constexpr int MEAS_QUBITS = 72; -constexpr int NUM_ROUNDS = 7; -constexpr int INPUT_ELEMENTS = MEAS_QUBITS * NUM_ROUNDS; // 504 int32s = 2016 bytes -constexpr int RESIDUAL_DETECTORS = 336; // 336 int32s = 1344 bytes +struct PipelineConfig { + std::string label; + int distance; + int num_rounds; + int meas_qubits; // ONNX input shape[1] + int residual_detectors; // ONNX output dim + std::string onnx_filename; + size_t slot_size; // must fit RPCHeader + input payload + int total_requests; + int num_predecoders; + int queue_depth; + int num_workers; + + int input_elements() const { return meas_qubits * num_rounds; } + size_t input_bytes() const { return input_elements() * sizeof(int32_t); } + + std::string onnx_path() const { + return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; + } + + static PipelineConfig d7_r7() { + return { + "d7_r7_Z", + /*distance=*/7, + /*num_rounds=*/7, + /*meas_qubits=*/72, + /*residual_detectors=*/336, + "model1_d7_r7_unified_Z_batch1.onnx", + /*slot_size=*/4096, + /*total_requests=*/20, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d13_r13() { + return { + "d13_r13_Z", + /*distance=*/13, + /*num_rounds=*/13, + /*meas_qubits=*/252, + /*residual_detectors=*/2184, + "model1_d13_r13_unified_Z_batch1.onnx", + /*slot_size=*/16384, + /*total_requests=*/20, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d21_r21() { + return { + "d21_r21_Z", + /*distance=*/21, + /*num_rounds=*/21, + /*meas_qubits=*/660, + /*residual_detectors=*/9240, + "model1_d21_r21_unified_X_batch1.onnx", + /*slot_size=*/65536, + /*total_requests=*/20, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d31_r31() { + return { + "d31_r31_Z", + /*distance=*/31, + /*num_rounds=*/31, + /*meas_qubits=*/1440, + /*residual_detectors=*/29760, + "model1_d31_r31_unified_Z_batch1.onnx", + /*slot_size=*/262144, + /*total_requests=*/20, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } +}; + +// Runtime decoder state populated during setup +struct DecoderContext { + std::unique_ptr pm_decoder; + std::mutex decode_mtx; + int z_stabilizers = 0; + int spatial_slices = 0; +}; constexpr std::uint32_t fnv1a_hash(std::string_view str) { std::uint32_t hash = 0x811c9dc5; @@ -80,7 +185,7 @@ constexpr std::uint32_t fnv1a_hash(std::string_view str) { struct SystemContext { volatile uint64_t* tx_flags_host = nullptr; uint8_t* rx_data_host = nullptr; - size_t slot_size = SLOT_SIZE; + size_t slot_size = 0; }; SystemContext g_sys_ctx; @@ -88,42 +193,35 @@ SystemContext g_sys_ctx; // Thread Pool Worker (Real PyMatching MWPM Decoder) // ============================================================================= -// d=7 surface code: 24 Z stabilizers per spatial slice -constexpr int Z_STABILIZERS = 24; -constexpr int NUM_SPATIAL_SLICES = RESIDUAL_DETECTORS / Z_STABILIZERS; // 336/24 = 14 +struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; +}; void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, - cudaq::qec::decoder* pm_decoder, std::mutex* decode_mtx) { - size_t num_detectors = predecoder->get_output_size() / sizeof(int32_t); + DecoderContext* ctx) { const int32_t* residual = static_cast(job.inference_data); - // Decode each spatial slice of Z-stabilizer detectors independently - // using code-capacity PyMatching (H_z is [24 x 49]) int total_corrections = 0; bool all_converged = true; - for (int s = 0; s < NUM_SPATIAL_SLICES; ++s) { - const int32_t* slice = residual + s * Z_STABILIZERS; - std::vector syndrome(Z_STABILIZERS); - for (int i = 0; i < Z_STABILIZERS; ++i) + for (int s = 0; s < ctx->spatial_slices; ++s) { + const int32_t* slice = residual + s * ctx->z_stabilizers; + std::vector syndrome(ctx->z_stabilizers); + for (int i = 0; i < ctx->z_stabilizers; ++i) syndrome[i] = static_cast(slice[i]); cudaq::qec::decoder_result result; { - std::lock_guard lock(*decode_mtx); - result = pm_decoder->decode(syndrome); + std::lock_guard lock(ctx->decode_mtx); + result = ctx->pm_decoder->decode(syndrome); } all_converged &= result.converged; for (auto v : result.result) - if (v > 0.5f) total_corrections++; + if (v > 0.5) total_corrections++; } - // Write RPC Response - struct __attribute__((packed)) DecodeResponse { - int32_t total_corrections; - int32_t converged; - }; DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); @@ -149,8 +247,7 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, void incoming_polling_loop( std::vector>& predecoders, cudaq::qec::utils::ThreadPool& thread_pool, - cudaq::qec::decoder* pm_decoder, - std::mutex& decode_mtx, + DecoderContext* ctx, std::atomic& stop_signal) { PreDecoderJob job; @@ -159,8 +256,8 @@ void incoming_polling_loop( for (auto& predecoder : predecoders) { if (predecoder->poll_next_job(job)) { AIPreDecoderService* pd_ptr = predecoder.get(); - thread_pool.enqueue([job, pd_ptr, pm_decoder, &decode_mtx]() { - pymatching_worker_task(job, pd_ptr, pm_decoder, &decode_mtx); + thread_pool.enqueue([job, pd_ptr, ctx]() { + pymatching_worker_task(job, pd_ptr, ctx); }); found_work = true; } @@ -174,10 +271,10 @@ void incoming_polling_loop( // ============================================================================= // Generate Realistic Syndrome Data // ============================================================================= -void fill_measurement_payload(int32_t* payload, std::mt19937& rng, - double error_rate = 0.01) { +void fill_measurement_payload(int32_t* payload, int input_elements, + std::mt19937& rng, double error_rate = 0.01) { std::bernoulli_distribution err_dist(error_rate); - for (int i = 0; i < INPUT_ELEMENTS; ++i) { + for (int i = 0; i < input_elements; ++i) { payload[i] = err_dist(rng) ? 1 : 0; } } @@ -185,23 +282,62 @@ void fill_measurement_payload(int32_t* payload, std::mt19937& rng, // ============================================================================= // Main // ============================================================================= -int main() { - std::cout << "--- Initializing Hybrid AI Realtime Pipeline (d=7 r=7 Z) ---\n"; +int main(int argc, char* argv[]) { + // Select configuration + std::string config_name = "d7"; + if (argc > 1) + config_name = argv[1]; + + PipelineConfig config; + if (config_name == "d7") { + config = PipelineConfig::d7_r7(); + } else if (config_name == "d13") { + config = PipelineConfig::d13_r13(); + } else if (config_name == "d21") { + config = PipelineConfig::d21_r21(); + } else if (config_name == "d31") { + config = PipelineConfig::d31_r31(); + } else { + std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n"; + return 1; + } + + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" + << config.label << ") ---\n"; + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " meas_qubits=" << config.meas_qubits + << " residual_detectors=" << config.residual_detectors + << " input_bytes=" << config.input_bytes() + << " slot_size=" << config.slot_size << "\n"; + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - std::string onnx_path = ONNX_MODEL_PATH; + std::string onnx_path = config.onnx_path(); std::cout << "[Setup] Building TRT engines from: " << onnx_path << "\n"; - // Create PyMatching decoder from d=7 surface code Z parity check matrix - std::cout << "[Setup] Creating PyMatching decoder (d=7 surface code, Z stabilizers)...\n"; - auto surface_code = cudaq::qec::get_code("surface_code", {{"distance", 7}}); + // Create PyMatching decoder from surface code Z parity check matrix + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = cudaq::qec::get_code("surface_code", + {{"distance", config.distance}}); auto H_z = surface_code->get_parity_z(); - std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " << H_z.shape()[1] << "]\n"; + + DecoderContext decoder_ctx; + decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); + decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "]" + << " z_stabilizers=" << decoder_ctx.z_stabilizers + << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; cudaqx::heterogeneous_map pm_params; pm_params.insert("merge_strategy", std::string("smallest_weight")); - auto pm_decoder = cudaq::qec::decoder::get("pymatching", H_z, pm_params); - std::mutex decode_mtx; + decoder_ctx.pm_decoder = cudaq::qec::decoder::get("pymatching", H_z, pm_params); std::cout << "[Setup] PyMatching decoder ready.\n"; // Allocate Ring Buffers @@ -218,7 +354,7 @@ int main() { tx_flags_host = static_cast(tmp); CUDA_CHECK(cudaHostGetDevicePointer((void**)&tx_flags_dev, tmp, 0)); - CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * SLOT_SIZE, cudaHostAllocMapped)); + CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * config.slot_size, cudaHostAllocMapped)); CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_data_dev, rx_data_host, 0)); std::memset((void*)rx_flags_host, 0, NUM_SLOTS * sizeof(uint64_t)); @@ -226,11 +362,12 @@ int main() { g_sys_ctx.tx_flags_host = tx_flags_host; g_sys_ctx.rx_data_host = rx_data_host; + g_sys_ctx.slot_size = config.slot_size; // Allocate Global Mailbox Bank & Control signals void** d_global_mailbox_bank; - CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, NUM_PREDECODERS * sizeof(void*))); - CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, NUM_PREDECODERS * sizeof(void*))); + CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); + CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); int* shutdown_flag_host; CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); @@ -242,17 +379,19 @@ int main() { CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - // Initialize 4 AIPreDecoder Instances from ONNX - std::cout << "[Setup] Capturing 4x AIPreDecoder Graphs (ONNX -> TRT)...\n"; + // Initialize AIPreDecoder Instances from ONNX + std::cout << "[Setup] Capturing " << config.num_predecoders + << "x AIPreDecoder Graphs (ONNX -> TRT)...\n"; cudaStream_t capture_stream; CUDA_CHECK(cudaStreamCreate(&capture_stream)); std::vector> predecoders; - std::vector function_entries(NUM_PREDECODERS); + std::vector function_entries(config.num_predecoders); - for (int i = 0; i < NUM_PREDECODERS; ++i) { + for (int i = 0; i < config.num_predecoders; ++i) { void** my_mailbox = d_global_mailbox_bank + i; - auto pd = std::make_unique(onnx_path, my_mailbox, QUEUE_DEPTH); + auto pd = std::make_unique(onnx_path, my_mailbox, + config.queue_depth); std::cout << "[Setup] Decoder " << i << ": input_size=" << pd->get_input_size() @@ -272,69 +411,72 @@ int main() { predecoders.push_back(std::move(pd)); } - int actual_func_count = NUM_PREDECODERS; cudaq_function_entry_t* d_function_entries; - CUDA_CHECK(cudaMalloc(&d_function_entries, actual_func_count * sizeof(cudaq_function_entry_t))); + CUDA_CHECK(cudaMalloc(&d_function_entries, + config.num_predecoders * sizeof(cudaq_function_entry_t))); CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), - actual_func_count * sizeof(cudaq_function_entry_t), cudaMemcpyHostToDevice)); + config.num_predecoders * sizeof(cudaq_function_entry_t), + cudaMemcpyHostToDevice)); // Start GPU Dispatcher std::cout << "[Setup] Launching Dispatcher Kernel...\n"; cudaq_dispatch_graph_context* dispatch_ctx = nullptr; CUDA_CHECK(cudaq_create_dispatch_graph_regular( - rx_flags_dev, tx_flags_dev, d_function_entries, actual_func_count, - d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, capture_stream, &dispatch_ctx + rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, + d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, + capture_stream, &dispatch_ctx )); CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); // Start CPU Infrastructure - std::cout << "[Setup] Booting Thread Pool & Polling Loop...\n"; - cudaq::qec::utils::ThreadPool pymatching_pool(4); + std::cout << "[Setup] Booting Thread Pool (" << config.num_workers + << " workers) & Polling Loop...\n"; + cudaq::qec::utils::ThreadPool pymatching_pool(config.num_workers); std::atomic system_stop{false}; - cudaq::qec::decoder* pm_raw = pm_decoder.get(); std::thread incoming_thread([&]() { - incoming_polling_loop(predecoders, pymatching_pool, pm_raw, decode_mtx, system_stop); + incoming_polling_loop(predecoders, pymatching_pool, &decoder_ctx, + system_stop); }); // ========================================================================= - // Test Stimulus: Fire requests in batches of NUM_PREDECODERS. + // Test Stimulus: Fire requests in batches of num_predecoders. // The dispatcher advances its slot pointer linearly and only retries // while rx_value != 0, so we must wait for each batch to complete // before firing the next to avoid stranding un-dispatched slots. // ========================================================================= - constexpr int TOTAL_REQUESTS = 20; - constexpr int BATCH_SIZE = NUM_PREDECODERS; - std::cout << "\n[Test] Firing " << TOTAL_REQUESTS - << " syndromes in batches of " << BATCH_SIZE - << " (d=7, r=7, error_rate=0.01)...\n"; + const int batch_size = config.num_predecoders; + std::cout << "\n[Test] Firing " << config.total_requests + << " syndromes in batches of " << batch_size + << " (" << config.label << ", error_rate=0.01)...\n"; std::mt19937 rng(42); - const size_t payload_bytes = INPUT_ELEMENTS * sizeof(int32_t); + const size_t payload_bytes = config.input_bytes(); int requests_sent = 0; int responses_received = 0; - for (int batch_start = 0; batch_start < TOTAL_REQUESTS; batch_start += BATCH_SIZE) { - int batch_end = std::min(batch_start + BATCH_SIZE, TOTAL_REQUESTS); - int batch_count = batch_end - batch_start; + for (int batch_start = 0; batch_start < config.total_requests; + batch_start += batch_size) { + int batch_end = std::min(batch_start + batch_size, config.total_requests); // Fire one batch for (int i = batch_start; i < batch_end; ++i) { - int target_decoder = i % NUM_PREDECODERS; + int target_decoder = i % config.num_predecoders; std::string target_func = "predecode_target_" + std::to_string(target_decoder); - int slot = i % NUM_SLOTS; + int slot = i % (int)NUM_SLOTS; while (rx_flags_host[slot] != 0) usleep(10); - uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); auto* header = reinterpret_cast(slot_data); header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; header->function_id = fnv1a_hash(target_func); header->arg_len = static_cast(payload_bytes); - int32_t* payload = reinterpret_cast(slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - fill_measurement_payload(payload, rng, 0.01); + int32_t* payload = reinterpret_cast( + slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, config.input_elements(), rng, 0.01); __sync_synchronize(); rx_flags_host[slot] = reinterpret_cast(slot_data); @@ -343,7 +485,7 @@ int main() { // Wait for this batch to complete for (int i = batch_start; i < batch_end; ++i) { - int slot = i % NUM_SLOTS; + int slot = i % (int)NUM_SLOTS; int timeout = 10000; while (tx_flags_host[slot] == 0 && timeout-- > 0) usleep(1000); @@ -355,7 +497,7 @@ int main() { << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; } else if (tv != 0) { responses_received++; - uint8_t* slot_data = rx_data_host + (slot * SLOT_SIZE); + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); int32_t corrections = 0, converged = 0; std::memcpy(&corrections, slot_data + sizeof(cudaq::nvqlink::RPCResponse), diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 5c40b3db..5196e253 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -228,7 +228,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ) target_compile_definitions(test_realtime_predecoder_w_pymatching PRIVATE - ONNX_MODEL_PATH="${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/model1_d7_r7_unified_Z_batch1.onnx" + ONNX_MODEL_DIR="${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime" ) target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE From f7b4c6ec11e2fbce2f5e4467dce6a2585da7bc19 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 18:10:26 +0000 Subject: [PATCH 06/40] Add PipelineBenchmark utility for realtime decoding latency measurement Introduce a reusable header-only latency and throughput tracker for realtime decoding pipelines. Provides per-request submit/complete timestamping, percentile statistics (p50/p90/p95/p99), and a formatted report including wall time, throughput, and per-request breakdown. Signed-off-by: Scott Thornton --- .../cudaq/qec/utils/pipeline_benchmarks.h | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h diff --git a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h new file mode 100644 index 00000000..2a812e9e --- /dev/null +++ b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h @@ -0,0 +1,180 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudaq::qec::utils { + +/// Reusable latency / throughput tracker for realtime decoding pipelines. +/// +/// Usage: +/// PipelineBenchmark bench("my test", num_requests); +/// bench.start(); +/// for (int i = 0; i < n; ++i) { +/// bench.mark_submit(i); +/// // ... submit request ... +/// // ... wait for response ... +/// bench.mark_complete(i); +/// } +/// bench.stop(); +/// bench.report(); +/// +class PipelineBenchmark { +public: + using clock = std::chrono::high_resolution_clock; + using time_point = clock::time_point; + using duration_us = std::chrono::duration; + + explicit PipelineBenchmark(const std::string &label = "Pipeline", + size_t expected_requests = 0) + : label_(label) { + if (expected_requests > 0) { + submit_times_.resize(expected_requests); + complete_times_.resize(expected_requests); + } + } + + void start() { run_start_ = clock::now(); } + void stop() { run_end_ = clock::now(); } + + void mark_submit(int request_id) { + ensure_capacity(request_id); + submit_times_[request_id] = clock::now(); + } + + void mark_complete(int request_id) { + ensure_capacity(request_id); + complete_times_[request_id] = clock::now(); + } + + struct Stats { + size_t count = 0; + double min_us = 0, max_us = 0, mean_us = 0; + double p50_us = 0, p90_us = 0, p95_us = 0, p99_us = 0; + double stddev_us = 0; + double total_wall_us = 0; + double throughput_rps = 0; + }; + + /// Return per-request latencies in microseconds. + std::vector latencies_us() const { + size_t n = std::min(submit_times_.size(), complete_times_.size()); + std::vector lats; + lats.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto dt = std::chrono::duration_cast( + complete_times_[i] - submit_times_[i]); + lats.push_back(dt.count()); + } + return lats; + } + + Stats compute_stats() const { + auto lats = latencies_us(); + Stats s; + s.count = lats.size(); + if (s.count == 0) + return s; + + std::sort(lats.begin(), lats.end()); + + s.min_us = lats.front(); + s.max_us = lats.back(); + s.mean_us = + std::accumulate(lats.begin(), lats.end(), 0.0) / s.count; + s.p50_us = percentile(lats, 50.0); + s.p90_us = percentile(lats, 90.0); + s.p95_us = percentile(lats, 95.0); + s.p99_us = percentile(lats, 99.0); + + double sum_sq = 0; + for (auto v : lats) + sum_sq += (v - s.mean_us) * (v - s.mean_us); + s.stddev_us = std::sqrt(sum_sq / s.count); + + auto wall = + std::chrono::duration_cast(run_end_ - run_start_); + s.total_wall_us = wall.count(); + s.throughput_rps = + (s.total_wall_us > 0) ? (s.count * 1e6 / s.total_wall_us) : 0; + + return s; + } + + void report(std::ostream &os = std::cout) const { + auto s = compute_stats(); + auto lats = latencies_us(); + + os << "\n"; + os << "================================================================\n"; + os << " Benchmark: " << label_ << "\n"; + os << "================================================================\n"; + os << std::fixed; + os << " Requests: " << s.count << "\n"; + os << std::setprecision(1); + os << " Wall time: " << s.total_wall_us / 1000.0 << " ms\n"; + os << " Throughput: " << s.throughput_rps << " req/s\n"; + os << " ---------------------------------------------------------------\n"; + os << " Latency (us)\n"; + os << std::setprecision(1); + os << " min = " << std::setw(10) << s.min_us << "\n"; + os << " p50 = " << std::setw(10) << s.p50_us << "\n"; + os << " mean = " << std::setw(10) << s.mean_us << "\n"; + os << " p90 = " << std::setw(10) << s.p90_us << "\n"; + os << " p95 = " << std::setw(10) << s.p95_us << "\n"; + os << " p99 = " << std::setw(10) << s.p99_us << "\n"; + os << " max = " << std::setw(10) << s.max_us << "\n"; + os << " stddev = " << std::setw(10) << s.stddev_us << "\n"; + os << " ---------------------------------------------------------------\n"; + + // Per-request breakdown (compact, one line per request) + if (!lats.empty()) { + os << " Per-request latencies (us):\n"; + for (size_t i = 0; i < lats.size(); ++i) { + os << " [" << std::setw(4) << i << "] " + << std::setprecision(1) << std::setw(10) << lats[i] + << "\n"; + } + } + os << "================================================================\n"; + } + +private: + std::string label_; + time_point run_start_{}, run_end_{}; + std::vector submit_times_; + std::vector complete_times_; + + void ensure_capacity(int id) { + size_t needed = static_cast(id) + 1; + if (submit_times_.size() < needed) + submit_times_.resize(needed); + if (complete_times_.size() < needed) + complete_times_.resize(needed); + } + + static double percentile(const std::vector &sorted, double p) { + if (sorted.empty()) + return 0; + double idx = (p / 100.0) * (sorted.size() - 1); + size_t lo = static_cast(idx); + size_t hi = std::min(lo + 1, sorted.size() - 1); + double frac = idx - lo; + return sorted[lo] * (1.0 - frac) + sorted[hi] * frac; + } +}; + +} // namespace cudaq::qec::utils From 70d3eacaa79bba42314d982502780435326dca0d Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 18:29:41 +0000 Subject: [PATCH 07/40] Integrate PipelineBenchmark into predecoder test and track incomplete requests Enhance PipelineBenchmark to distinguish submitted vs completed requests, report timeouts, and cap per-request output to 50 entries. Integrate it into the predecoder pipeline test with per-request submit/complete markers and spin-wait polling for accurate latency measurement. Increase default total_requests from 20 to 100 across all distance configs. Signed-off-by: Scott Thornton --- .../cudaq/qec/utils/pipeline_benchmarks.h | 71 ++++++++++++++----- .../test_realtime_predecoder_w_pymatching.cpp | 29 ++++++-- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h index 2a812e9e..4ade0c6b 100644 --- a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h +++ b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h @@ -27,7 +27,7 @@ namespace cudaq::qec::utils { /// bench.mark_submit(i); /// // ... submit request ... /// // ... wait for response ... -/// bench.mark_complete(i); +/// bench.mark_complete(i); // only if successful /// } /// bench.stop(); /// bench.report(); @@ -40,10 +40,11 @@ class PipelineBenchmark { explicit PipelineBenchmark(const std::string &label = "Pipeline", size_t expected_requests = 0) - : label_(label) { + : label_(label), total_submitted_(0) { if (expected_requests > 0) { submit_times_.resize(expected_requests); complete_times_.resize(expected_requests); + completed_.resize(expected_requests, false); } } @@ -53,15 +54,18 @@ class PipelineBenchmark { void mark_submit(int request_id) { ensure_capacity(request_id); submit_times_[request_id] = clock::now(); + total_submitted_++; } void mark_complete(int request_id) { ensure_capacity(request_id); complete_times_[request_id] = clock::now(); + completed_[request_id] = true; } struct Stats { - size_t count = 0; + size_t submitted = 0; + size_t completed = 0; double min_us = 0, max_us = 0, mean_us = 0; double p50_us = 0, p90_us = 0, p95_us = 0, p99_us = 0; double stddev_us = 0; @@ -69,12 +73,15 @@ class PipelineBenchmark { double throughput_rps = 0; }; - /// Return per-request latencies in microseconds. + /// Return per-request latencies in microseconds (completed requests only). std::vector latencies_us() const { - size_t n = std::min(submit_times_.size(), complete_times_.size()); + size_t n = std::min({submit_times_.size(), complete_times_.size(), + completed_.size()}); std::vector lats; lats.reserve(n); for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; auto dt = std::chrono::duration_cast( complete_times_[i] - submit_times_[i]); lats.push_back(dt.count()); @@ -82,11 +89,27 @@ class PipelineBenchmark { return lats; } + /// Return per-request latency or -1.0 for incomplete (preserves indices). + std::vector all_latencies_us() const { + size_t n = std::min({submit_times_.size(), complete_times_.size(), + completed_.size()}); + std::vector lats(n, -1.0); + for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; + auto dt = std::chrono::duration_cast( + complete_times_[i] - submit_times_[i]); + lats[i] = dt.count(); + } + return lats; + } + Stats compute_stats() const { auto lats = latencies_us(); Stats s; - s.count = lats.size(); - if (s.count == 0) + s.submitted = total_submitted_; + s.completed = lats.size(); + if (s.completed == 0) return s; std::sort(lats.begin(), lats.end()); @@ -94,7 +117,7 @@ class PipelineBenchmark { s.min_us = lats.front(); s.max_us = lats.back(); s.mean_us = - std::accumulate(lats.begin(), lats.end(), 0.0) / s.count; + std::accumulate(lats.begin(), lats.end(), 0.0) / s.completed; s.p50_us = percentile(lats, 50.0); s.p90_us = percentile(lats, 90.0); s.p95_us = percentile(lats, 95.0); @@ -103,32 +126,35 @@ class PipelineBenchmark { double sum_sq = 0; for (auto v : lats) sum_sq += (v - s.mean_us) * (v - s.mean_us); - s.stddev_us = std::sqrt(sum_sq / s.count); + s.stddev_us = std::sqrt(sum_sq / s.completed); auto wall = std::chrono::duration_cast(run_end_ - run_start_); s.total_wall_us = wall.count(); s.throughput_rps = - (s.total_wall_us > 0) ? (s.count * 1e6 / s.total_wall_us) : 0; + (s.total_wall_us > 0) ? (s.completed * 1e6 / s.total_wall_us) : 0; return s; } void report(std::ostream &os = std::cout) const { auto s = compute_stats(); - auto lats = latencies_us(); + auto all = all_latencies_us(); os << "\n"; os << "================================================================\n"; os << " Benchmark: " << label_ << "\n"; os << "================================================================\n"; os << std::fixed; - os << " Requests: " << s.count << "\n"; + os << " Submitted: " << s.submitted << "\n"; + os << " Completed: " << s.completed << "\n"; + if (s.submitted > s.completed) + os << " Timed out: " << (s.submitted - s.completed) << "\n"; os << std::setprecision(1); os << " Wall time: " << s.total_wall_us / 1000.0 << " ms\n"; os << " Throughput: " << s.throughput_rps << " req/s\n"; os << " ---------------------------------------------------------------\n"; - os << " Latency (us)\n"; + os << " Latency (us) [completed requests only]\n"; os << std::setprecision(1); os << " min = " << std::setw(10) << s.min_us << "\n"; os << " p50 = " << std::setw(10) << s.p50_us << "\n"; @@ -140,13 +166,16 @@ class PipelineBenchmark { os << " stddev = " << std::setw(10) << s.stddev_us << "\n"; os << " ---------------------------------------------------------------\n"; - // Per-request breakdown (compact, one line per request) - if (!lats.empty()) { + // Per-request breakdown: only show for small runs (<=50 requests) + if (!all.empty() && all.size() <= 50) { os << " Per-request latencies (us):\n"; - for (size_t i = 0; i < lats.size(); ++i) { - os << " [" << std::setw(4) << i << "] " - << std::setprecision(1) << std::setw(10) << lats[i] - << "\n"; + for (size_t i = 0; i < all.size(); ++i) { + os << " [" << std::setw(4) << i << "] "; + if (all[i] < 0) + os << " TIMEOUT\n"; + else + os << std::setprecision(1) << std::setw(10) << all[i] + << "\n"; } } os << "================================================================\n"; @@ -154,9 +183,11 @@ class PipelineBenchmark { private: std::string label_; + size_t total_submitted_; time_point run_start_{}, run_end_{}; std::vector submit_times_; std::vector complete_times_; + std::vector completed_; void ensure_capacity(int id) { size_t needed = static_cast(id) + 1; @@ -164,6 +195,8 @@ class PipelineBenchmark { submit_times_.resize(needed); if (complete_times_.size() < needed) complete_times_.resize(needed); + if (completed_.size() < needed) + completed_.resize(needed, false); } static double percentile(const std::vector &sorted, double p) { diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index c6a453d7..57b61213 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -63,6 +63,7 @@ #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" #include "cudaq/qec/utils/thread_pool.h" +#include "cudaq/qec/utils/pipeline_benchmarks.h" #include "cudaq/qec/code.h" #include "cudaq/qec/decoder.h" @@ -112,7 +113,7 @@ struct PipelineConfig { /*residual_detectors=*/336, "model1_d7_r7_unified_Z_batch1.onnx", /*slot_size=*/4096, - /*total_requests=*/20, + /*total_requests=*/100, /*num_predecoders=*/4, /*queue_depth=*/16, /*num_workers=*/4 @@ -128,7 +129,7 @@ struct PipelineConfig { /*residual_detectors=*/2184, "model1_d13_r13_unified_Z_batch1.onnx", /*slot_size=*/16384, - /*total_requests=*/20, + /*total_requests=*/100, /*num_predecoders=*/4, /*queue_depth=*/16, /*num_workers=*/4 @@ -144,7 +145,7 @@ struct PipelineConfig { /*residual_detectors=*/9240, "model1_d21_r21_unified_X_batch1.onnx", /*slot_size=*/65536, - /*total_requests=*/20, + /*total_requests=*/100, /*num_predecoders=*/4, /*queue_depth=*/16, /*num_workers=*/4 @@ -160,7 +161,7 @@ struct PipelineConfig { /*residual_detectors=*/29760, "model1_d31_r31_unified_Z_batch1.onnx", /*slot_size=*/262144, - /*total_requests=*/20, + /*total_requests=*/100, /*num_predecoders=*/4, /*queue_depth=*/16, /*num_workers=*/4 @@ -451,11 +452,16 @@ int main(int argc, char* argv[]) { << " syndromes in batches of " << batch_size << " (" << config.label << ", error_rate=0.01)...\n"; + cudaq::qec::utils::PipelineBenchmark bench(config.label, + config.total_requests); + std::mt19937 rng(42); const size_t payload_bytes = config.input_bytes(); int requests_sent = 0; int responses_received = 0; + bench.start(); + for (int batch_start = 0; batch_start < config.total_requests; batch_start += batch_size) { int batch_end = std::min(batch_start + batch_size, config.total_requests); @@ -479,16 +485,20 @@ int main(int argc, char* argv[]) { fill_measurement_payload(payload, config.input_elements(), rng, 0.01); __sync_synchronize(); + bench.mark_submit(i); rx_flags_host[slot] = reinterpret_cast(slot_data); requests_sent++; } - // Wait for this batch to complete + // Wait for this batch to complete (spin-wait for accurate latency) for (int i = batch_start; i < batch_end; ++i) { int slot = i % (int)NUM_SLOTS; - int timeout = 10000; - while (tx_flags_host[slot] == 0 && timeout-- > 0) usleep(1000); + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (tx_flags_host[slot] == 0) { + if (std::chrono::steady_clock::now() > deadline) break; + QEC_CPU_RELAX(); + } uint64_t tv = tx_flags_host[slot]; if (tv != 0 && (tv >> 48) == 0xDEAD) { @@ -496,6 +506,7 @@ int main(int argc, char* argv[]) { std::cerr << " [FAIL] Slot " << slot << " cudaGraphLaunch error " << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; } else if (tv != 0) { + bench.mark_complete(i); responses_received++; uint8_t* slot_data = rx_data_host + (slot * config.slot_size); int32_t corrections = 0, converged = 0; @@ -515,9 +526,13 @@ int main(int argc, char* argv[]) { } } + bench.stop(); + std::cout << "\n[Result] Processed " << responses_received << "/" << requests_sent << " requests successfully.\n"; + bench.report(); + // Teardown std::cout << "[Teardown] Shutting down...\n"; *shutdown_flag_host = 1; From 4de331e8a00d9210aa4f8e76028afa3c800e4b21 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 18:35:12 +0000 Subject: [PATCH 08/40] Add per-worker timing breakdown to predecoder pipeline test Instrument the PyMatching worker with high-resolution timestamps to measure decode time vs worker overhead. Report a breakdown showing PyMatching decode, worker overhead, and GPU+dispatch+poll latency as percentages of the total end-to-end pipeline, plus per-round latency. Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 57b61213..00b69a10 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -175,6 +176,11 @@ struct DecoderContext { std::mutex decode_mtx; int z_stabilizers = 0; int spatial_slices = 0; + + // Per-worker timing accumulators (protected by decode_mtx) + std::atomic total_decode_us{0}; + std::atomic total_worker_us{0}; + std::atomic decode_count{0}; }; constexpr std::uint32_t fnv1a_hash(std::string_view str) { @@ -201,11 +207,15 @@ struct __attribute__((packed)) DecodeResponse { void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, DecoderContext* ctx) { + using hrclock = std::chrono::high_resolution_clock; + auto worker_start = hrclock::now(); + const int32_t* residual = static_cast(job.inference_data); int total_corrections = 0; bool all_converged = true; + auto decode_start = hrclock::now(); for (int s = 0; s < ctx->spatial_slices; ++s) { const int32_t* slice = residual + s * ctx->z_stabilizers; std::vector syndrome(ctx->z_stabilizers); @@ -222,6 +232,7 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, for (auto v : result.result) if (v > 0.5) total_corrections++; } + auto decode_end = hrclock::now(); DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; @@ -235,6 +246,15 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, std::atomic_thread_fence(std::memory_order_release); + auto worker_end = hrclock::now(); + auto decode_us = std::chrono::duration_cast( + decode_end - decode_start).count(); + auto worker_us = std::chrono::duration_cast( + worker_end - worker_start).count(); + ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); + ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); + ctx->decode_count.fetch_add(1, std::memory_order_relaxed); + size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; predecoder->release_job(job.slot_idx); @@ -533,6 +553,28 @@ int main(int argc, char* argv[]) { bench.report(); + // Worker timing breakdown + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + auto stats = bench.compute_stats(); + double avg_pipeline_overhead = stats.mean_us - avg_worker; + + std::cout << std::fixed << std::setprecision(1); + std::cout << "\n Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; + std::cout << " PyMatching decode: " << std::setw(8) << avg_decode + << " us (" << std::setw(4) << (100.0 * avg_decode / stats.mean_us) << "%)\n"; + std::cout << " Worker overhead: " << std::setw(8) << avg_overhead + << " us (" << std::setw(4) << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; + std::cout << " GPU+dispatch+poll: " << std::setw(8) << avg_pipeline_overhead + << " us (" << std::setw(4) << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; + std::cout << " Total end-to-end: " << std::setw(8) << stats.mean_us << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(8) << (stats.mean_us / config.num_rounds) << " us/round\n"; + } + // Teardown std::cout << "[Teardown] Shutting down...\n"; *shutdown_flag_host = 1; From 44c04c434e79e1313ef17544f18221d90f512c6f Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 19:27:42 +0000 Subject: [PATCH 09/40] Cache TRT engines to disk and use per-worker decoder pool Add engine caching: prefer a pre-built .engine file when available, otherwise build from ONNX and save the engine for subsequent runs. Replace the single mutex-protected PyMatching decoder with a pool of per-worker decoder instances using thread-local index assignment, eliminating lock contention in the decode path. Signed-off-by: Scott Thornton --- .../cudaq/qec/realtime/ai_decoder_service.h | 8 ++- .../qec/realtime/ai_predecoder_service.h | 3 +- libs/qec/lib/realtime/ai_decoder_service.cu | 19 +++++- .../qec/lib/realtime/ai_predecoder_service.cu | 5 +- .../test_realtime_predecoder_w_pymatching.cpp | 61 ++++++++++++++----- 5 files changed, 73 insertions(+), 23 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h index 60c1ebc4..0c9aa709 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -27,7 +27,10 @@ class AIDecoderService { /// an ONNX model (.onnx) which will be compiled to a TRT engine. /// @param model_path Path to the model file /// @param device_mailbox_slot Pointer to the specific slot in the global mailbox bank - AIDecoderService(const std::string& model_path, void** device_mailbox_slot); + /// @param engine_save_path If non-empty and model_path is .onnx, save the + /// built engine to this path for fast reloading on subsequent runs + AIDecoderService(const std::string& model_path, void** device_mailbox_slot, + const std::string& engine_save_path = ""); virtual ~AIDecoderService(); @@ -43,7 +46,8 @@ class AIDecoderService { protected: void load_engine(const std::string& path); - void build_engine_from_onnx(const std::string& onnx_path); + void build_engine_from_onnx(const std::string& onnx_path, + const std::string& engine_save_path = ""); void setup_bindings(); void allocate_resources(); diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index e4634bd9..dd2dec99 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -31,7 +31,8 @@ struct PreDecoderJob { class AIPreDecoderService : public AIDecoderService { public: - AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, int queue_depth = 16); + AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, + int queue_depth = 16, const std::string& engine_save_path = ""); virtual ~AIPreDecoderService(); void capture_graph(cudaStream_t stream) override; diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index 30531335..f581b5b4 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -94,7 +94,8 @@ void AIDecoderService::Logger::log(Severity severity, const char* msg) noexcept } } -AIDecoderService::AIDecoderService(const std::string& model_path, void** device_mailbox_slot) +AIDecoderService::AIDecoderService(const std::string& model_path, void** device_mailbox_slot, + const std::string& engine_save_path) : device_mailbox_slot_(device_mailbox_slot) { if (std::getenv("SKIP_TRT")) { @@ -104,7 +105,7 @@ AIDecoderService::AIDecoderService(const std::string& model_path, void** device_ } else { std::string ext = model_path.substr(model_path.find_last_of('.')); if (ext == ".onnx") { - build_engine_from_onnx(model_path); + build_engine_from_onnx(model_path, engine_save_path); } else { load_engine(model_path); } @@ -136,7 +137,8 @@ void AIDecoderService::load_engine(const std::string& path) { context_.reset(engine_->createExecutionContext()); } -void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path) { +void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path, + const std::string& engine_save_path) { runtime_.reset(nvinfer1::createInferRuntime(gLogger)); auto builder = std::unique_ptr(nvinfer1::createInferBuilder(gLogger)); @@ -154,6 +156,17 @@ void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path) { builder->buildSerializedNetwork(*network, *config)); if (!plan) throw std::runtime_error("Failed to build TRT engine from ONNX"); + if (!engine_save_path.empty()) { + std::ofstream out(engine_save_path, std::ios::binary); + if (out.good()) { + out.write(static_cast(plan->data()), plan->size()); + std::printf("[TensorRT] Saved engine to: %s\n", engine_save_path.c_str()); + } else { + std::fprintf(stderr, "[TensorRT] Warning: could not save engine to %s\n", + engine_save_path.c_str()); + } + } + engine_.reset(runtime_->deserializeCudaEngine(plan->data(), plan->size())); if (!engine_) throw std::runtime_error("Failed to deserialize built engine"); diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index aafa40e5..de91afb7 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -91,8 +91,9 @@ __global__ void passthrough_copy_kernel(void* dst, const void* src, size_t num_b // Class Implementation // ============================================================================= -AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, int queue_depth) - : AIDecoderService(path, mailbox), queue_depth_(queue_depth) +AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, + int queue_depth, const std::string& engine_save_path) + : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(queue_depth) { SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ready_flags_, queue_depth_ * sizeof(int), cudaHostAllocMapped)); SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, queue_depth_ * sizeof(void*), cudaHostAllocMapped)); diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 00b69a10..028e80da 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include @@ -105,6 +106,14 @@ struct PipelineConfig { return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; } + std::string engine_path() const { + std::string name = onnx_filename; + auto dot = name.rfind('.'); + if (dot != std::string::npos) + name = name.substr(0, dot); + return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; + } + static PipelineConfig d7_r7() { return { "d7_r7_Z", @@ -172,12 +181,17 @@ struct PipelineConfig { // Runtime decoder state populated during setup struct DecoderContext { - std::unique_ptr pm_decoder; - std::mutex decode_mtx; + std::vector> decoders; + std::atomic next_decoder_idx{0}; int z_stabilizers = 0; int spatial_slices = 0; - // Per-worker timing accumulators (protected by decode_mtx) + cudaq::qec::decoder* acquire_decoder() { + thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); + return decoders[my_idx % decoders.size()].get(); + } + + // Per-worker timing accumulators (lock-free) std::atomic total_decode_us{0}; std::atomic total_worker_us{0}; std::atomic decode_count{0}; @@ -211,6 +225,7 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, auto worker_start = hrclock::now(); const int32_t* residual = static_cast(job.inference_data); + auto* my_decoder = ctx->acquire_decoder(); int total_corrections = 0; bool all_converged = true; @@ -222,11 +237,7 @@ void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, for (int i = 0; i < ctx->z_stabilizers; ++i) syndrome[i] = static_cast(slice[i]); - cudaq::qec::decoder_result result; - { - std::lock_guard lock(ctx->decode_mtx); - result = ctx->pm_decoder->decode(syndrome); - } + auto result = my_decoder->decode(syndrome); all_converged &= result.converged; for (auto v : result.result) @@ -338,8 +349,21 @@ int main(int argc, char* argv[]) { CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - std::string onnx_path = config.onnx_path(); - std::cout << "[Setup] Building TRT engines from: " << onnx_path << "\n"; + std::string engine_file = config.engine_path(); + std::string onnx_file = config.onnx_path(); + std::string model_path; + + // Prefer cached .engine file; fall back to ONNX build + save + std::ifstream engine_probe(engine_file, std::ios::binary); + if (engine_probe.good()) { + engine_probe.close(); + model_path = engine_file; + std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; + } else { + model_path = onnx_file; + std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; + std::cout << "[Setup] Engine will be cached to: " << engine_file << "\n"; + } // Create PyMatching decoder from surface code Z parity check matrix std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance @@ -358,8 +382,12 @@ int main(int argc, char* argv[]) { cudaqx::heterogeneous_map pm_params; pm_params.insert("merge_strategy", std::string("smallest_weight")); - decoder_ctx.pm_decoder = cudaq::qec::decoder::get("pymatching", H_z, pm_params); - std::cout << "[Setup] PyMatching decoder ready.\n"; + std::cout << "[Setup] Pre-allocating " << config.num_workers + << " PyMatching decoders (one per worker)...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + std::cout << "[Setup] PyMatching decoder pool ready.\n"; // Allocate Ring Buffers void* tmp = nullptr; @@ -402,17 +430,20 @@ int main(int argc, char* argv[]) { // Initialize AIPreDecoder Instances from ONNX std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs (ONNX -> TRT)...\n"; + << "x AIPreDecoder Graphs...\n"; cudaStream_t capture_stream; CUDA_CHECK(cudaStreamCreate(&capture_stream)); std::vector> predecoders; std::vector function_entries(config.num_predecoders); + bool need_save = (model_path == onnx_file); for (int i = 0; i < config.num_predecoders; ++i) { void** my_mailbox = d_global_mailbox_bank + i; - auto pd = std::make_unique(onnx_path, my_mailbox, - config.queue_depth); + std::string save_path = (need_save && i == 0) ? engine_file : ""; + auto pd = std::make_unique(model_path, my_mailbox, + config.queue_depth, + save_path); std::cout << "[Setup] Decoder " << i << ": input_size=" << pd->get_input_size() From 6a2010fd0c7b84c2a0e7f9f614cc7af5dab625a9 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 19 Feb 2026 23:29:30 +0000 Subject: [PATCH 10/40] Add streaming test mode with continuous syndrome arrival simulation Introduce a streaming test mode alongside the existing batch mode, activated via CLI (`stream [rate_us] [duration_s]`). The streaming mode uses dedicated producer/consumer threads to simulate continuous FPGA syndrome arrival with configurable inter-arrival rate, in-flight throttling (capped to num_predecoders), backpressure tracking, and warmup period exclusion from latency stats. Reports steady-state throughput, percentile latencies, and per-round timing breakdown. Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 555 ++++++++++++++---- 1 file changed, 444 insertions(+), 111 deletions(-) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 028e80da..d8b570f9 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -38,7 +38,7 @@ * 4. Dedicated Polling Thread -> Worker PyMatching Thread Pool * 5. CPU Workers closing the transaction (Setting TX flags) * - * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] + * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [stream [rate_us] [duration_s]] ******************************************************************************/ #include @@ -311,15 +311,325 @@ void fill_measurement_payload(int32_t* payload, int input_elements, } } +// ============================================================================= +// Streaming Test Mode (simulates FPGA continuous syndrome arrival) +// ============================================================================= + +struct StreamingConfig { + int rate_us = 0; // inter-arrival time in us (0 = open-loop) + int duration_s = 5; // how long to run + int warmup_count = 20; // discard first N from latency stats +}; + +void run_streaming_test( + const PipelineConfig& config, + const StreamingConfig& scfg, + volatile uint64_t* rx_flags_host, + volatile uint64_t* tx_flags_host, + uint8_t* rx_data_host, + DecoderContext& decoder_ctx, + std::vector>& predecoders, + cudaq::qec::utils::ThreadPool& pymatching_pool, + std::atomic& system_stop) +{ + using hrclock = std::chrono::high_resolution_clock; + + const int max_requests = 500000; + const size_t payload_bytes = config.input_bytes(); + + std::vector submit_ts(max_requests); + std::vector complete_ts(max_requests); + std::vector completed(max_requests, false); + + // slot -> request_id mapping so consumer can correlate completions + std::vector slot_request(NUM_SLOTS, -1); + + std::atomic total_submitted{0}; + std::atomic total_completed{0}; + std::atomic in_flight{0}; + std::atomic backpressure_stalls{0}; + std::atomic producer_done{false}; + + // Cap in-flight to num_predecoders. The dispatcher scans slots + // sequentially and only advances on non-empty slots. With the inflight + // flag limiting one graph launch per predecoder, only num_predecoders + // slots can be consumed per scan. Any excess slots get backpressured, + // then the dispatcher parks on an empty slot and never revisits them. + const int max_in_flight = config.num_predecoders; + + auto run_deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(scfg.duration_s); + + std::string rate_label = (scfg.rate_us > 0) + ? std::to_string(scfg.rate_us) + " us" + : "open-loop"; + + std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" + << " Rate: " << rate_label << "\n" + << " Duration: " << scfg.duration_s << " s\n" + << " Warmup: " << scfg.warmup_count << " requests\n" + << " Max flight: " << max_in_flight << "\n" + << " Max reqs: " << max_requests << "\n\n"; + + // --- Producer thread (simulates FPGA) --- + std::thread producer([&]() { + std::mt19937 rng(42); + int next_slot = 0; + int req_id = 0; + + while (std::chrono::steady_clock::now() < run_deadline + && req_id < max_requests) { + + // Throttle: don't exceed max_in_flight to prevent ring buffer flooding + while (in_flight.load(std::memory_order_acquire) >= max_in_flight) { + QEC_CPU_RELAX(); + if (std::chrono::steady_clock::now() >= run_deadline) return; + } + + int slot = next_slot % (int)NUM_SLOTS; + + // Wait for slot to be fully free (dispatcher consumed + response harvested) + while (rx_flags_host[slot] != 0 || tx_flags_host[slot] != 0) { + backpressure_stalls.fetch_add(1, std::memory_order_relaxed); + QEC_CPU_RELAX(); + if (std::chrono::steady_clock::now() >= run_deadline) return; + } + + int target = req_id % config.num_predecoders; + std::string func = "predecode_target_" + std::to_string(target); + + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); + auto* hdr = reinterpret_cast(slot_data); + hdr->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + hdr->function_id = fnv1a_hash(func); + hdr->arg_len = static_cast(payload_bytes); + + int32_t* payload = reinterpret_cast( + slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, config.input_elements(), rng, 0.01); + + slot_request[slot] = req_id; + + __sync_synchronize(); + submit_ts[req_id] = hrclock::now(); + rx_flags_host[slot] = reinterpret_cast(slot_data); + in_flight.fetch_add(1, std::memory_order_release); + total_submitted.fetch_add(1, std::memory_order_release); + + next_slot++; + req_id++; + + // Rate limiting (busy-wait for precision) + if (scfg.rate_us > 0) { + auto target_time = submit_ts[req_id - 1] + + std::chrono::microseconds(scfg.rate_us); + while (hrclock::now() < target_time) + QEC_CPU_RELAX(); + } + } + + producer_done.store(true, std::memory_order_release); + }); + + // --- Consumer thread (harvests completions sequentially) --- + std::thread consumer([&]() { + int next_harvest = 0; + + while (true) { + bool pdone = producer_done.load(std::memory_order_acquire); + int nsub = total_submitted.load(std::memory_order_acquire); + int ncomp = total_completed.load(std::memory_order_relaxed); + + if (pdone && ncomp >= nsub) + break; + + // Nothing to harvest yet + if (next_harvest >= nsub) { + QEC_CPU_RELAX(); + continue; + } + + int slot = next_harvest % (int)NUM_SLOTS; + uint64_t tv = tx_flags_host[slot]; + + if (tv != 0) { + int rid = slot_request[slot]; + if (rid >= 0 && (tv >> 48) != 0xDEAD) { + complete_ts[rid] = hrclock::now(); + completed[rid] = true; + total_completed.fetch_add(1, std::memory_order_relaxed); + } else if ((tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot + << " cudaGraphLaunch error " << cuda_err + << " (" << cudaGetErrorString((cudaError_t)cuda_err) + << ")\n"; + total_completed.fetch_add(1, std::memory_order_relaxed); + } + + tx_flags_host[slot] = 0; + slot_request[slot] = -1; + in_flight.fetch_sub(1, std::memory_order_release); + next_harvest++; + } else { + QEC_CPU_RELAX(); + } + } + }); + + producer.join(); + + // Grace period for in-flight requests + auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (total_completed.load() < total_submitted.load() + && std::chrono::steady_clock::now() < grace_deadline) { + usleep(1000); + } + + consumer.join(); + + // ===== Report ===== + auto run_end = std::chrono::steady_clock::now(); + int nsub = total_submitted.load(); + int ncomp = total_completed.load(); + + // Build PipelineBenchmark from timestamps (skip warmup) + int warmup = std::min(scfg.warmup_count, nsub); + int bench_count = nsub - warmup; + + cudaq::qec::utils::PipelineBenchmark bench( + config.label + " (stream)", bench_count); + bench.start(); + + for (int i = warmup; i < nsub; ++i) { + int bench_id = i - warmup; + bench.mark_submit(bench_id); + // Override the internal submit timestamp with the real one + } + + // We can't override PipelineBenchmark's internal timestamps, so compute + // stats manually for the steady-state window. + std::vector latencies; + latencies.reserve(bench_count); + for (int i = warmup; i < nsub; ++i) { + if (!completed[i]) continue; + auto dt = std::chrono::duration_cast>( + complete_ts[i] - submit_ts[i]); + latencies.push_back(dt.count()); + } + + bench.stop(); + + std::sort(latencies.begin(), latencies.end()); + + auto pct = [&](double p) -> double { + if (latencies.empty()) return 0; + double idx = (p / 100.0) * (latencies.size() - 1); + size_t lo = (size_t)idx; + size_t hi = std::min(lo + 1, latencies.size() - 1); + double frac = idx - lo; + return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; + }; + + double mean = 0; + for (auto v : latencies) mean += v; + mean = latencies.empty() ? 0 : mean / latencies.size(); + + double stddev = 0; + for (auto v : latencies) stddev += (v - mean) * (v - mean); + stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); + + auto wall_us = std::chrono::duration_cast>( + run_end - (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); + double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; + + double actual_rate = (nsub > 1) + ? std::chrono::duration_cast>( + submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) + : 0; + + std::cout << std::fixed; + std::cout << "\n================================================================\n"; + std::cout << " Streaming Benchmark: " << config.label << "\n"; + std::cout << "================================================================\n"; + std::cout << " Submitted: " << nsub << "\n"; + std::cout << " Completed: " << ncomp << "\n"; + if (nsub > ncomp) + std::cout << " Dropped/timeout: " << (nsub - ncomp) << "\n"; + std::cout << std::setprecision(1); + std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; + std::cout << " Throughput: " << throughput << " req/s\n"; + std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; + std::cout << " Backpressure stalls:" << std::setw(8) + << backpressure_stalls.load() << "\n"; + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Latency (us) [steady-state, " << latencies.size() + << " requests after " << warmup << " warmup]\n"; + std::cout << std::setprecision(1); + if (!latencies.empty()) { + std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; + std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; + std::cout << " mean = " << std::setw(10) << mean << "\n"; + std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; + std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; + std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; + std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; + std::cout << " stddev = " << std::setw(10) << stddev << "\n"; + } + std::cout << " ---------------------------------------------------------------\n"; + + // Worker timing breakdown + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + double avg_pipeline = mean - avg_worker; + + std::cout << std::setprecision(1); + std::cout << " Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; + std::cout << " PyMatching decode:" << std::setw(10) << avg_decode + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_decode / mean : 0) + << "%)\n"; + std::cout << " Worker overhead: " << std::setw(10) << avg_overhead + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_overhead / mean : 0) + << "%)\n"; + std::cout << " GPU+dispatch+poll:" << std::setw(10) << avg_pipeline + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_pipeline / mean : 0) + << "%)\n"; + std::cout << " Total end-to-end: " << std::setw(10) << mean << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(10) << (mean / config.num_rounds) << " us/round\n"; + } + std::cout << "================================================================\n"; +} + // ============================================================================= // Main // ============================================================================= int main(int argc, char* argv[]) { - // Select configuration + // Parse arguments: [stream [rate_us] [duration_s]] std::string config_name = "d7"; + bool streaming_mode = false; + StreamingConfig stream_cfg; + if (argc > 1) config_name = argv[1]; + int stream_positional = 0; // tracks positional args after "stream" + for (int a = 2; a < argc; ++a) { + std::string arg = argv[a]; + if (arg == "stream") { + streaming_mode = true; + } else if (streaming_mode && stream_positional == 0 && std::isdigit(arg[0])) { + stream_cfg.rate_us = std::stoi(arg); + stream_positional++; + } else if (streaming_mode && stream_positional == 1 && std::isdigit(arg[0])) { + stream_cfg.duration_s = std::stoi(arg); + stream_positional++; + } + } + PipelineConfig config; if (config_name == "d7") { config = PipelineConfig::d7_r7(); @@ -330,11 +640,21 @@ int main(int argc, char* argv[]) { } else if (config_name == "d31") { config = PipelineConfig::d31_r31(); } else { - std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31]\n" - << " d7 - distance 7, 7 rounds (default)\n" - << " d13 - distance 13, 13 rounds\n" - << " d21 - distance 21, 21 rounds\n" - << " d31 - distance 31, 31 rounds\n"; + std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [stream [rate_us] [duration_s]]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n" + << "\n" + << " stream - continuous FPGA-like submission (default: batch mode)\n" + << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" + << " duration_s - test duration in seconds (default: 5)\n" + << "\n" + << "Examples:\n" + << " " << argv[0] << " d13 # batch mode\n" + << " " << argv[0] << " d13 stream # streaming, open-loop\n" + << " " << argv[0] << " d13 stream 50 # streaming, 50 us between requests\n" + << " " << argv[0] << " d13 stream 50 10 # streaming, 50 us rate, 10s duration\n"; return 1; } @@ -493,117 +813,130 @@ int main(int argc, char* argv[]) { }); // ========================================================================= - // Test Stimulus: Fire requests in batches of num_predecoders. - // The dispatcher advances its slot pointer linearly and only retries - // while rx_value != 0, so we must wait for each batch to complete - // before firing the next to avoid stranding un-dispatched slots. + // Test Stimulus // ========================================================================= - const int batch_size = config.num_predecoders; - std::cout << "\n[Test] Firing " << config.total_requests - << " syndromes in batches of " << batch_size - << " (" << config.label << ", error_rate=0.01)...\n"; - - cudaq::qec::utils::PipelineBenchmark bench(config.label, - config.total_requests); - - std::mt19937 rng(42); - const size_t payload_bytes = config.input_bytes(); - int requests_sent = 0; - int responses_received = 0; - - bench.start(); - - for (int batch_start = 0; batch_start < config.total_requests; - batch_start += batch_size) { - int batch_end = std::min(batch_start + batch_size, config.total_requests); - - // Fire one batch - for (int i = batch_start; i < batch_end; ++i) { - int target_decoder = i % config.num_predecoders; - std::string target_func = "predecode_target_" + std::to_string(target_decoder); - - int slot = i % (int)NUM_SLOTS; - while (rx_flags_host[slot] != 0) usleep(10); - - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - auto* header = reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - header->function_id = fnv1a_hash(target_func); - header->arg_len = static_cast(payload_bytes); - - int32_t* payload = reinterpret_cast( - slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - - __sync_synchronize(); - bench.mark_submit(i); - rx_flags_host[slot] = reinterpret_cast(slot_data); - requests_sent++; - } - - // Wait for this batch to complete (spin-wait for accurate latency) - for (int i = batch_start; i < batch_end; ++i) { - int slot = i % (int)NUM_SLOTS; - - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); - while (tx_flags_host[slot] == 0) { - if (std::chrono::steady_clock::now() > deadline) break; - QEC_CPU_RELAX(); - } + if (streaming_mode) { + run_streaming_test(config, stream_cfg, rx_flags_host, tx_flags_host, + rx_data_host, decoder_ctx, predecoders, + pymatching_pool, system_stop); + } else { + // Batch mode: fire requests in batches of num_predecoders, wait for + // each batch to complete before firing the next. + const int batch_size = config.num_predecoders; + std::cout << "\n[Batch] Firing " << config.total_requests + << " syndromes in batches of " << batch_size + << " (" << config.label << ", error_rate=0.01)...\n"; + + cudaq::qec::utils::PipelineBenchmark bench(config.label, + config.total_requests); + std::mt19937 rng(42); + const size_t payload_bytes = config.input_bytes(); + int requests_sent = 0; + int responses_received = 0; + + bench.start(); + + for (int batch_start = 0; batch_start < config.total_requests; + batch_start += batch_size) { + int batch_end = std::min(batch_start + batch_size, config.total_requests); + + for (int i = batch_start; i < batch_end; ++i) { + int target_decoder = i % config.num_predecoders; + std::string target_func = "predecode_target_" + + std::to_string(target_decoder); + + int slot = i % (int)NUM_SLOTS; + while (rx_flags_host[slot] != 0) usleep(10); - uint64_t tv = tx_flags_host[slot]; - if (tv != 0 && (tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot << " cudaGraphLaunch error " - << cuda_err << " (" << cudaGetErrorString((cudaError_t)cuda_err) << ")\n"; - } else if (tv != 0) { - bench.mark_complete(i); - responses_received++; uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - int32_t corrections = 0, converged = 0; - std::memcpy(&corrections, - slot_data + sizeof(cudaq::nvqlink::RPCResponse), - sizeof(int32_t)); - std::memcpy(&converged, - slot_data + sizeof(cudaq::nvqlink::RPCResponse) + sizeof(int32_t), - sizeof(int32_t)); - std::cout << " -> Slot " << slot << ": OK, corrections=" << corrections - << " converged=" << (converged ? "yes" : "no") << "\n"; - } else { - std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + header->function_id = fnv1a_hash(target_func); + header->arg_len = static_cast(payload_bytes); + + int32_t* payload = reinterpret_cast( + slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, config.input_elements(), rng, 0.01); + + __sync_synchronize(); + bench.mark_submit(i); + rx_flags_host[slot] = reinterpret_cast(slot_data); + requests_sent++; } - tx_flags_host[slot] = 0; + for (int i = batch_start; i < batch_end; ++i) { + int slot = i % (int)NUM_SLOTS; + + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(10); + while (tx_flags_host[slot] == 0) { + if (std::chrono::steady_clock::now() > deadline) break; + QEC_CPU_RELAX(); + } + + uint64_t tv = tx_flags_host[slot]; + if (tv != 0 && (tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot + << " cudaGraphLaunch error " << cuda_err + << " (" << cudaGetErrorString((cudaError_t)cuda_err) + << ")\n"; + } else if (tv != 0) { + bench.mark_complete(i); + responses_received++; + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); + int32_t corrections = 0, converged = 0; + std::memcpy(&corrections, + slot_data + sizeof(cudaq::nvqlink::RPCResponse), + sizeof(int32_t)); + std::memcpy(&converged, + slot_data + sizeof(cudaq::nvqlink::RPCResponse) + + sizeof(int32_t), + sizeof(int32_t)); + std::cout << " -> Slot " << slot + << ": OK, corrections=" << corrections + << " converged=" << (converged ? "yes" : "no") << "\n"; + } else { + std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + } + + tx_flags_host[slot] = 0; + } } - } - - bench.stop(); - - std::cout << "\n[Result] Processed " << responses_received << "/" << requests_sent - << " requests successfully.\n"; - - bench.report(); - // Worker timing breakdown - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - auto stats = bench.compute_stats(); - double avg_pipeline_overhead = stats.mean_us - avg_worker; - - std::cout << std::fixed << std::setprecision(1); - std::cout << "\n Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; - std::cout << " PyMatching decode: " << std::setw(8) << avg_decode - << " us (" << std::setw(4) << (100.0 * avg_decode / stats.mean_us) << "%)\n"; - std::cout << " Worker overhead: " << std::setw(8) << avg_overhead - << " us (" << std::setw(4) << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; - std::cout << " GPU+dispatch+poll: " << std::setw(8) << avg_pipeline_overhead - << " us (" << std::setw(4) << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(8) << stats.mean_us << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(8) << (stats.mean_us / config.num_rounds) << " us/round\n"; + bench.stop(); + + std::cout << "\n[Result] Processed " << responses_received << "/" + << requests_sent << " requests successfully.\n"; + + bench.report(); + + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + auto stats = bench.compute_stats(); + double avg_pipeline_overhead = stats.mean_us - avg_worker; + + std::cout << std::fixed << std::setprecision(1); + std::cout << "\n Worker Timing Breakdown (avg over " + << n_decoded << " requests):\n"; + std::cout << " PyMatching decode: " << std::setw(8) << avg_decode + << " us (" << std::setw(4) + << (100.0 * avg_decode / stats.mean_us) << "%)\n"; + std::cout << " Worker overhead: " << std::setw(8) << avg_overhead + << " us (" << std::setw(4) + << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; + std::cout << " GPU+dispatch+poll: " << std::setw(8) + << avg_pipeline_overhead << " us (" << std::setw(4) + << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; + std::cout << " Total end-to-end: " << std::setw(8) + << stats.mean_us << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(8) << (stats.mean_us / config.num_rounds) + << " us/round\n"; + } } // Teardown From a36a2c3979cbfd92c4279d921684d3226ecd6fe0 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 20 Feb 2026 00:11:50 +0000 Subject: [PATCH 11/40] Added design document Signed-off-by: Scott Thornton --- docs/hybrid_ai_predecoder_pipeline.md | 802 ++++++++++++++++++++++++++ 1 file changed, 802 insertions(+) create mode 100644 docs/hybrid_ai_predecoder_pipeline.md diff --git a/docs/hybrid_ai_predecoder_pipeline.md b/docs/hybrid_ai_predecoder_pipeline.md new file mode 100644 index 00000000..20a4013e --- /dev/null +++ b/docs/hybrid_ai_predecoder_pipeline.md @@ -0,0 +1,802 @@ +# Hybrid AI Predecoder + PyMatching Global Decoder Pipeline + +## Design Document + +**Component**: `cudaq-qec` Realtime Decoding Subsystem +**Status**: Implementation Complete (Test-Validated) +**Last Updated**: 2026-02-19 + +--- + +## Table of Contents + +1. [Overview](#1-overview) +2. [Problem Statement](#2-problem-statement) +3. [Architecture](#3-architecture) +4. [Component Deep-Dive](#4-component-deep-dive) + - 4.1 [Ring Buffer & RPC Protocol](#41-ring-buffer--rpc-protocol) + - 4.2 [GPU Persistent Dispatcher Kernel](#42-gpu-persistent-dispatcher-kernel) + - 4.3 [AIDecoderService (Base Class)](#43-aidecoderservice-base-class) + - 4.4 [AIPreDecoderService (Predecoder + CPU Handoff)](#44-aipredeccoderservice-predecoder--cpu-handoff) + - 4.5 [CPU Worker Threads & PyMatching Decoder Pool](#45-cpu-worker-threads--pymatching-decoder-pool) +5. [Data Flow](#5-data-flow) +6. [Memory Architecture](#6-memory-architecture) +7. [Backpressure Protocol](#7-backpressure-protocol) +8. [Memory Ordering & Synchronization](#8-memory-ordering--synchronization) +9. [CUDA Graph Hierarchy](#9-cuda-graph-hierarchy) +10. [Pipeline Configurations](#10-pipeline-configurations) +11. [File Inventory](#11-file-inventory) +12. [Configuration Parameters](#12-configuration-parameters) +13. [Performance Benchmarking](#13-performance-benchmarking) +14. [Portability](#14-portability) +15. [Limitations & Future Work](#15-limitations--future-work) + +--- + +## 1. Overview + +This system implements a **realtime hybrid GPU/CPU pipeline** for quantum error correction (QEC) decoding on the surface code. The pipeline splits the decoding workload into two stages: + +| Stage | Location | Algorithm | Data Type | +|-------|----------|-----------|-----------| +| **Predecoding** | GPU | Neural network (TensorRT, from ONNX) | INT32 | +| **Global Decoding** | CPU | PyMatching (MWPM) | float64 | + +A **persistent GPU kernel** (the Dispatcher) monitors a shared ring buffer for incoming syndrome data. When data arrives, the Dispatcher launches a CUDA Graph containing a TensorRT inference pass. The neural network accepts raw measurements as INT32 tensors and produces residual detectors and a logical frame. The residual detectors are handed off to the CPU via mapped pinned memory, where a thread pool runs PyMatching MWPM decoding. Results are written back to the ring buffer and acknowledged. + +The system supports multiple surface code distances via a configurable `PipelineConfig` struct: d=7, d=13, d=21, and d=31. ONNX models are compiled to TensorRT engines on first use and cached to disk as `.engine` files for fast reloading on subsequent runs. + +--- + +## 2. Problem Statement + +Surface code QEC requires decoding syndrome measurements within the coherence time of the quantum system (typically ~1 microsecond for superconducting qubits). A pure CPU decoder cannot meet this budget at scale. A pure GPU decoder lacks the flexibility to run algorithms like Minimum Weight Perfect Matching (MWPM) efficiently on GPU. + +The hybrid approach exploits the strengths of each: + +- **GPU**: Massively parallel neural network inference provides fast soft-decision outputs (residual detectors) that reduce the problem complexity for the global decoder. +- **CPU**: PyMatching solves the residual MWPM problem on the simplified output from the predecoder. + +The critical constraint is **zero-copy, zero-allocation** on the hot path. Every buffer is pre-allocated, every kernel is pre-captured into a CUDA Graph, and every transfer uses mapped pinned memory. + +--- + +## 3. Architecture + +### System Diagram + +``` + FPGA / Quantum Control (or Test Harness) + │ + │ syndrome data (INT32 measurements) + ▼ + ┌─────────────────────────────────────────────────────┐ + │ Ring Buffer (Mapped Pinned Memory) │ + │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ + │ │Slot 0│ │Slot 1│ │Slot 2│ ... │Slot63│ │ + │ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ │ + │ │ │ │ │ │ + │ rx_flags[0] rx_flags[1] ... rx_flags[63] │ + └─────┼────────┼────────┼───────────────┼────────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + ┌─────────────────────────────────────────────────────┐ + │ GPU Persistent Dispatcher Kernel │ + │ │ + │ Polls rx_flags[] ──► Looks up function_id │ + │ ──► Checks backpressure ──► Launches CUDA Graph │ + └──────────┬──────────┬──────────┬──────────┬─────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ + ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ PreDecoder 0 │ │PreDec. 1 │ │PreDec. 2 │ │PreDec. 3 │ + │ (CUDA Graph) │ │(CUDAGraph│ │(CUDAGraph│ │(CUDAGraph│ + │ │ │ │ │ │ │ │ + │ Input Kern │ │ │ │ │ │ │ + │ ──► TRT ──► │ │ ... │ │ ... │ │ ... │ + │ Output Kern │ │ │ │ │ │ │ + └──────┬───────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ │ + │ (mapped pinned memory: ready_flags, outputs) + ▼ ▼ ▼ ▼ + ┌─────────────────────────────────────────────────────┐ + │ Polling Thread (incoming_polling_loop) │ + │ Round-robins all predecoders, dispatches to pool │ + └──────────┬──────────────────────────────────────────┘ + │ + ▼ + ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ Worker 0 │ │ Worker 1 │ │ Worker 2 │ │ Worker 3 │ + │ (thread pool)│ │(thd pool)│ │(thd pool)│ │(thd pool)│ + │ │ │ │ │ │ │ │ + │ PyMatching 0 │ │PyMatch 1 │ │PyMatch 2 │ │PyMatch 3 │ + │ (own decoder)│ │(own dec) │ │(own dec) │ │(own dec) │ + │ Write RPC │ │Write RPC │ │Write RPC │ │Write RPC │ + │ Set tx_flag │ │Set tx_flg│ │Set tx_flg│ │Set tx_flg│ + └──────┬───────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ │ + └──────────────┼────────────┼────────────┘ + ▼ + tx_flags[slot] ──► FPGA +``` + +### Key Design Decisions + +1. **CUDA Graphs everywhere** -- Both the dispatcher kernel and every predecoder instance are captured as CUDA Graphs. The dispatcher graph is instantiated with `cudaGraphInstantiateFlagDeviceLaunch`, enabling it to launch child predecoder graphs from device code via `cudaGraphLaunch(..., cudaStreamGraphFireAndForget)`. + +2. **Mapped pinned memory for all CPU-GPU communication** -- `cudaHostAllocMapped` provides a single address space visible to both CPU and GPU without explicit copies. GPU writes are made visible via `__threadfence_system()`; CPU reads are ordered via `std::atomic_thread_fence(std::memory_order_acquire)`. + +3. **N-deep circular queue between GPU and CPU** -- Rather than a single handoff slot, each predecoder maintains a circular buffer of depth N (default 16), allowing the GPU to pipeline multiple inferences before the CPU consumes them. + +4. **Dispatcher-level backpressure** -- The dispatcher checks a predecoder's queue state *before* launching its graph. If the queue is full, the packet stays in the ring buffer and the dispatcher moves on to service other slots. + +5. **ONNX model support with engine caching** -- The `AIDecoderService` accepts either a pre-built `.engine` file or an `.onnx` model. When given an ONNX file, it builds a TensorRT engine at runtime and optionally saves it to disk via the `engine_save_path` parameter. On subsequent runs, the cached `.engine` file is loaded directly, skipping the expensive autotuner phase (startup drops from ~15s to ~4s). + +6. **Per-worker PyMatching decoder pool** -- Each thread pool worker gets its own pre-allocated PyMatching decoder instance via `thread_local` assignment. This eliminates mutex contention on the decode path (previous single-decoder + mutex design was ~2.4x slower). + +7. **Type-agnostic I/O buffers** -- All TRT I/O buffers use `void*` rather than `float*`, supporting INT32 models natively without type casting on the GPU. + +--- + +## 4. Component Deep-Dive + +### 4.1 Ring Buffer & RPC Protocol + +**Files**: `dispatch_kernel_launch.h` (protocol), test harness (allocation) + +The ring buffer is the communication channel between the FPGA (or test harness) and the GPU. It consists of: + +| Buffer | Type | Size | Purpose | +|--------|------|------|---------| +| `rx_flags[N]` | `volatile uint64_t*` | N slots | Non-zero = data ready; value is pointer to slot data | +| `tx_flags[N]` | `volatile uint64_t*` | N slots | Non-zero = response ready; acknowledges to FPGA | +| `rx_data` | `uint8_t*` | N x SLOT_SIZE | Slot payload area | + +Each slot carries an **RPC message** in a packed wire format: + +``` +Request: [RPCHeader: magic(4) | function_id(4) | arg_len(4)] [payload: arg_len bytes] +Response: [RPCResponse: magic(4) | status(4) | result_len(4)] [payload: result_len bytes] +``` + +The `function_id` is an FNV-1a hash of the target function name, enabling the dispatcher to route requests to different predecoder instances. + +The response payload for the PyMatching pipeline is a packed `DecodeResponse`: + +```c +struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; +}; +``` + +### 4.2 GPU Persistent Dispatcher Kernel + +**File**: `realtime/lib/daemon/dispatcher/dispatch_kernel.cu` + +The dispatcher is a **persistent kernel** -- it runs for the lifetime of the system, spinning on the ring buffer. Two variants exist: + +| Variant | Function | Graph Launch | Use Case | +|---------|----------|-------------|----------| +| `dispatch_kernel_device_call_only` | Direct device function calls | No | Legacy / simple RPC | +| `dispatch_kernel_with_graph` | Device function calls + CUDA Graph launch | Yes (sm_80+) | AI predecoder pipeline | + +#### Dispatch Loop (Graph Variant) + +``` +while (!shutdown): + rx_value = rx_flags[current_slot] + if rx_value != 0: + header = parse_rpc_header(rx_value) + + if header.magic is invalid: + consume and clear slot ← garbage data + + else: + entry = lookup(header.function_id) + + if entry is DEVICE_CALL: + call device function inline + write RPC response + set tx_flags + consume slot + + elif entry is GRAPH_LAUNCH: + if backpressure_check(entry): + skip (do NOT consume) ← retry later + else: + write mailbox + cudaGraphLaunch(fire-and-forget) + consume slot + (tx_flags set later by CPU) + + else: + consume slot ← unknown function + + advance current_slot ← always advance + KernelType::sync() +``` + +The `packet_consumed` flag controls whether `rx_flags[slot]` is cleared. For backpressured graph launches, the slot is left intact so the dispatcher retries on the next pass. The slot pointer **always** advances to avoid head-of-line blocking. + +**Note on slot scanning**: The dispatcher only advances `current_slot` when a non-empty slot is found. When a slot is empty, it spins on that same slot. This means having many empty slots (e.g., 64 slots with only 4 in use) does not cause scanning overhead, but the dispatcher does park on a slot waiting for it to be filled. + +#### Function Table Entry + +Each registered function is described by a `cudaq_function_entry_t`: + +```c +typedef struct { + union { + void *device_fn_ptr; // DEVICE_CALL handler + cudaGraphExec_t graph_exec; // GRAPH_LAUNCH handler + } handler; + uint32_t function_id; // FNV-1a hash + uint8_t dispatch_mode; // DEVICE_CALL or GRAPH_LAUNCH + uint8_t reserved[3]; + cudaq_handler_schema_t schema; // argument/result type descriptors + + // Graph-launch backpressure metadata: + uint32_t mailbox_idx; // index into global_mailbox_bank + int *d_queue_idx; // → predecoder's queue tail + volatile int *d_ready_flags; // → predecoder's ready flags + int *d_inflight_flag; // → predecoder's inflight flag +} cudaq_function_entry_t; +``` + +#### Graph-Based Dispatch Context + +The dispatcher kernel itself runs inside a CUDA Graph (`cudaq_dispatch_graph_context`), instantiated with `cudaGraphInstantiateFlagDeviceLaunch`. This is **required** for the kernel to call `cudaGraphLaunch()` from device code. The lifecycle is: + +``` +cudaq_create_dispatch_graph_regular() + → cudaGraphCreate + → cudaGraphAddKernelNode (dispatch_kernel_with_graph) + → cudaGraphInstantiate (with DeviceLaunch flag) + → cudaGraphUpload + → cudaStreamSynchronize + +cudaq_launch_dispatch_graph() + → cudaGraphLaunch (from host) + +cudaq_destroy_dispatch_graph() + → cudaGraphExecDestroy + cudaGraphDestroy +``` + +### 4.3 AIDecoderService (Base Class) + +**Files**: `ai_decoder_service.h`, `ai_decoder_service.cu` + +The base class manages the TensorRT lifecycle and provides a default "autonomous" CUDA Graph that reads from a mailbox, runs inference, and writes results back to the ring buffer -- all on the GPU. + +#### Constructor + +```cpp +AIDecoderService(const std::string& model_path, void** device_mailbox_slot, + const std::string& engine_save_path = ""); +``` + +The constructor accepts either a `.engine` file (fast deserialization) or an `.onnx` file (builds TRT engine via autotuner). When `engine_save_path` is non-empty and the model is ONNX, the built engine is serialized to disk for caching. + +#### Responsibilities + +- **Engine loading**: Deserializes a TensorRT `.engine` file or builds from `.onnx` via `NvOnnxParser`. +- **Engine caching**: Saves built engines to disk via `engine_save_path` for fast reload. +- **Dynamic tensor binding**: Enumerates all I/O tensors from the engine, storing metadata in `TensorBinding` structs. Supports models with multiple outputs (e.g., `residual_detectors` + `logical_frame`). +- **Buffer allocation**: Allocates persistent device buffers sized to the engine's static tensor shapes. Uses `void*` for type-agnostic I/O (INT32, FP32, etc.). +- **Graph capture**: The default `capture_graph()` creates a 3-node graph: + +``` +gateway_input_kernel ──► TRT enqueueV3 ──► gateway_output_kernel +``` + +#### Dynamic Tensor Binding + +```cpp +struct TensorBinding { + std::string name; + void* d_buffer = nullptr; + size_t size_bytes = 0; + bool is_input = false; +}; +std::vector all_bindings_; +``` + +During `setup_bindings()`, all I/O tensors are enumerated from the engine. The first input becomes `d_trt_input_`, the first output becomes `d_trt_output_` (the primary output forwarded to the CPU), and any additional outputs are allocated as auxiliary buffers in `d_aux_buffers_`. + +### 4.4 AIPreDecoderService (Predecoder + CPU Handoff) + +**Files**: `ai_predecoder_service.h`, `ai_predecoder_service.cu` + +This derived class replaces the base class's autonomous graph with one that hands inference results off to the CPU for further processing by PyMatching. + +#### Constructor + +```cpp +AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, + int queue_depth = 16, const std::string& engine_save_path = ""); +``` + +#### CUDA Graph Structure + +``` +predecoder_input_kernel ──► TRT enqueueV3 ──► predecoder_output_kernel +``` + +**`predecoder_input_kernel`**: +1. Reads the current queue tail index (`d_queue_idx`). +2. Performs a defense-in-depth bounded spin on `d_ready_flags[slot]` (primary backpressure is at the dispatcher level). +3. If the slot is free: saves the FPGA ring buffer pointer to `d_ring_ptrs[slot]` and copies syndrome data to the TRT input buffer. +4. If the spin times out: sets `ring_ptr = nullptr`, causing all threads to abort safely without corrupting the queue. + +**`predecoder_output_kernel`**: +1. Copies TRT output to `d_outputs[slot]` (mapped pinned memory, directly readable by CPU). Output data is `void*` (typically INT32 residual detectors). +2. Issues `__threadfence_system()` to ensure writes are visible over PCIe. +3. Sets `d_ready_flags[slot] = 1` (signals the CPU). +4. Advances `d_queue_idx` circularly. + +#### N-Deep Circular Queue + +Each `AIPreDecoderService` instance owns a private circular queue: + +``` + GPU writes → ← CPU reads + ┌───┬───┬───┬───┬───┬───┬───┬───┐ + │ 0 │ 1 │ 2 │ 3 │ 4 │...│14 │15 │ ready_flags[16] + └───┴───┴───┴───┴───┴───┴───┴───┘ + ▲ ▲ + │ │ + d_queue_idx cpu_poll_idx_ + (GPU tail) (CPU head) +``` + +| Buffer | Host Pointer | Device Pointer | Purpose | +|--------|-------------|---------------|---------| +| `h_ready_flags_` | CPU reads | `d_ready_flags_` GPU writes | 1 = job ready, 0 = slot free | +| `h_ring_ptrs_` | CPU reads | `d_ring_ptrs_` GPU writes | Original FPGA buffer address per job | +| `h_outputs_` | CPU reads | `d_outputs_` GPU writes | TRT inference output (`void*`, typically INT32) | + +All three buffers are allocated with `cudaHostAllocMapped` and mapped to device pointers via `cudaHostGetDevicePointer`. The GPU writes through the device pointers; the CPU reads through the host pointers. No explicit `cudaMemcpy` is ever issued on the hot path. + +#### CPU Interface + +```cpp +bool poll_next_job(PreDecoderJob& out_job); +void release_job(int slot_idx); +``` + +`poll_next_job` checks `h_ready_flags_[cpu_poll_idx_]`. If set, it issues an acquire fence (for ARM portability), populates the `PreDecoderJob` struct with the slot index, ring buffer pointer, and a pointer into the inference output buffer, then advances the poll index. + +`release_job` uses `__atomic_store_n(..., __ATOMIC_RELEASE)` to clear the flag, ensuring that all prior CPU writes (RPC response data) are visible before the GPU is allowed to reuse the slot. + +### 4.5 CPU Worker Threads & PyMatching Decoder Pool + +**File**: `test_realtime_predecoder_w_pymatching.cpp` + +The CPU-side processing uses a **polling thread + thread pool** architecture: + +1. **Polling thread** (`incoming_polling_loop`): A single dedicated thread round-robins all predecoder instances, calling `poll_next_job()` on each. When a job is found, it is dispatched to the thread pool. +2. **Thread pool** (`cudaq::qec::utils::ThreadPool`): A pool of `num_workers` threads (default 4) that execute `pymatching_worker_task` jobs concurrently. + +#### PyMatching Decoder Pool + +Each worker thread gets its own pre-allocated PyMatching decoder via `thread_local` assignment: + +```cpp +struct DecoderContext { + std::vector> decoders; + std::atomic next_decoder_idx{0}; + int z_stabilizers = 0; + int spatial_slices = 0; + + cudaq::qec::decoder* acquire_decoder() { + thread_local int my_idx = next_decoder_idx.fetch_add(1); + return decoders[my_idx % decoders.size()].get(); + } +}; +``` + +Decoders are constructed at startup from the surface code's Z parity check matrix (`H_z`) using the `cudaq-qec` plugin system: + +```cpp +auto surface_code = cudaq::qec::get_code("surface_code", {{"distance", d}}); +auto H_z = surface_code->get_parity_z(); +for (int i = 0; i < num_workers; ++i) + decoders.push_back(cudaq::qec::decoder::get("pymatching", H_z, pm_params)); +``` + +The `merge_strategy` parameter is set to `"smallest_weight"` to handle parallel edges in the surface code's PCM. + +#### Worker Function (`pymatching_worker_task`) + +Each worker invocation: + +1. **Acquires a decoder** from the pool via `ctx->acquire_decoder()` (lock-free, `thread_local`). +2. **Slices residual detectors** into `spatial_slices` groups of `z_stabilizers` each. For d=13, this is 26 slices of 84 stabilizers. +3. **Runs PyMatching** on each slice: converts INT32 residual detectors to `std::vector`, calls `decoder->decode(syndrome)`. +4. **Accumulates corrections** and convergence status across all slices. +5. **Writes RPC Response**: Formats `DecodeResponse{total_corrections, converged}` into the original ring buffer slot. +6. **Releases GPU Queue Slot**: Calls `predecoder->release_job(slot_idx)`. +7. **Acknowledges to FPGA**: Sets `tx_flags[slot]` to signal completion. + +#### Timing Instrumentation + +The worker function includes `std::chrono::high_resolution_clock` instrumentation that accumulates PyMatching decode time and total worker time into atomic counters on the `DecoderContext`. These are reported at the end of the run as a latency breakdown. + +--- + +## 5. Data Flow + +The following traces a single syndrome packet through the entire pipeline: + +``` +Step Location Action +──── ──────── ────────────────────────────────────────────────── + 1. Host/FPGA Writes RPCHeader + INT32 measurements into rx_data[slot] + 2. Host/FPGA Sets rx_flags[slot] = pointer_to_slot_data + ── memory fence ── + 3. GPU Dispatcher reads rx_flags[slot] (non-zero → work available) + 4. GPU Parses RPCHeader, extracts function_id + 5. GPU Looks up function_id in function_table → finds entry + 6. GPU Backpressure check: reads entry->d_queue_idx, entry->d_ready_flags + If queue full: skip, leave rx_flags intact, advance slot pointer + 7. GPU Writes data_buffer pointer to global_mailbox_bank[mailbox_idx] + ── __threadfence_system() ── + 8. GPU cudaGraphLaunch(entry->graph_exec, cudaStreamGraphFireAndForget) + 9. GPU Clears rx_flags[slot] = 0, increments packet count + ── slot consumed ── + + ── Inside Predecoder CUDA Graph ── +10. GPU predecoder_input_kernel: + Reads *mailbox → ring_buffer_ptr + Reads d_queue_idx → slot_idx + Copies INT32 measurements from ring buffer → d_trt_input_ +11. GPU TRT enqueueV3: Neural network inference (INT32 → INT32) +12. GPU predecoder_output_kernel: + Copies d_trt_output_ → h_outputs_[slot_idx] (mapped pinned) + Saves ring_buffer_ptr → h_ring_ptrs_[slot_idx] + ── __threadfence_system() ── + Sets h_ready_flags_[slot_idx] = 1 + Advances d_queue_idx + ── Graph complete ── + +13. CPU Polling thread: h_ready_flags_[poll_idx] == 1 → job found + Dispatches job to thread pool +14. CPU Worker thread: acquires per-thread PyMatching decoder + ── std::atomic_thread_fence(acquire) ── +15. CPU Worker reads h_ring_ptrs_[poll_idx], h_outputs_[poll_idx] +16. CPU Worker slices residual_detectors into spatial_slices + For each slice: PyMatching MWPM decode → corrections +17. CPU Worker writes RPCResponse + DecodeResponse into ring_buffer_ptr + ── std::atomic_thread_fence(release) ── +18. CPU Worker: release_job(slot_idx) → clears h_ready_flags_[slot_idx] + ── __atomic_store_n(..., RELEASE) ── +19. CPU Worker: Sets tx_flags[ring_slot] = rx_value + ── FPGA/Host sees response ── +``` + +--- + +## 6. Memory Architecture + +### Allocation Map + +``` +┌─────────────────────────────────────────────────────────────┐ +│ PINNED MAPPED MEMORY │ +│ (cudaHostAllocMapped + cudaHostGetDevicePointer)│ +│ │ +│ Ring Buffer: │ +│ rx_flags[64] ← Host writes, GPU reads/clears │ +│ tx_flags[64] ← CPU writes, Host reads │ +│ rx_data[64 x SLOT_SIZE] ← Host writes, GPU reads, │ +│ CPU reads/writes │ +│ │ +│ Per-PreDecoder (x4): │ +│ h_ready_flags_[16] ← GPU writes 1, CPU reads, CPU clears│ +│ h_ring_ptrs_[16] ← GPU writes, CPU reads │ +│ h_outputs_[16xN] ← GPU writes (void*), CPU reads │ +│ │ +│ Control: │ +│ shutdown_flag ← CPU writes, GPU reads │ +└─────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────┐ +│ DEVICE MEMORY │ +│ │ +│ d_global_mailbox_bank[4] ← Dispatcher writes, Graph reads │ +│ d_function_entries[4] ← Host copies at init, GPU reads │ +│ d_stats ← GPU increments, Host reads │ +│ │ +│ Per-PreDecoder (x4): │ +│ d_trt_input_ (void*) ← Input kernel writes, TRT reads │ +│ d_trt_output_ (void*) ← TRT writes, Output kernel reads│ +│ d_aux_buffers_ (void*) ← Additional TRT I/O (e.g. │ +│ logical_frame) │ +│ d_queue_idx_ ← GPU reads/writes (queue tail) │ +│ d_inflight_flag_ ← Dispatcher checks backpressure │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Why Mapped Pinned Memory? + +Traditional `cudaMemcpyAsync` requires the GPU to issue a DMA transfer on a stream, which introduces stream synchronization overhead. Mapped pinned memory (`cudaHostAllocMapped`) gives the GPU a device-accessible pointer to host memory. GPU writes travel over PCIe and become visible to the CPU (on x86, immediately; on ARM, after appropriate fencing). This eliminates all explicit copy calls from the hot path. + +--- + +## 7. Backpressure Protocol + +Backpressure prevents the GPU from overwhelming the CPU when PyMatching workers fall behind. It operates at **two levels**: + +### Level 1: Dispatcher (Primary) + +Before launching a predecoder graph, the dispatcher reads the predecoder's queue state directly from the function table entry: + +```c +int* d_queue_idx = entry->d_queue_idx; +volatile int* d_ready_flags = entry->d_ready_flags; + +int current_tail = *d_queue_idx; +if (d_ready_flags[current_tail] == 1) { + // Queue full: skip this packet, do NOT clear rx_flags + packet_consumed = false; +} +``` + +If the queue is full, the packet stays in the ring buffer. The dispatcher advances to the next slot, so **other decoders are not blocked** (no head-of-line blocking). On the next pass through the ring buffer, the dispatcher will retry the skipped slot. + +### Level 2: Predecoder Input Kernel (Defense-in-Depth) + +If the dispatcher's backpressure check is bypassed (e.g., backpressure pointers not wired up, or a race condition), the predecoder input kernel has a **bounded spin** as a safety net: + +```c +int timeout_counter = 0; +while (d_ready_flags[slot_idx] == 1 && timeout_counter < 1000000) { + timeout_counter++; +} + +if (d_ready_flags[slot_idx] == 1) { + ring_ptr = nullptr; // Abort safely, don't corrupt the slot +} +``` + +On timeout, the kernel nullifies `ring_ptr`, which causes all threads to return without writing any data. This prevents silent corruption but means the syndrome is effectively dropped. In a correctly configured system, this path should never be reached. + +--- + +## 8. Memory Ordering & Synchronization + +The pipeline involves three independent agents (FPGA/Host, GPU, CPU) communicating through shared memory. Correctness depends on careful ordering: + +### GPU → CPU (Predecoder Output → Poll) + +| Agent | Operation | Ordering Primitive | +|-------|-----------|-------------------| +| GPU | Write `h_outputs_[slot]` and `h_ring_ptrs_[slot]` | (normal device writes to mapped memory) | +| GPU | `__threadfence_system()` | Ensures all prior writes are visible over PCIe | +| GPU | Write `h_ready_flags_[slot] = 1` | (the "publish" signal) | +| CPU | Read `h_ready_flags_[slot] == 1` | (volatile read) | +| CPU | `std::atomic_thread_fence(acquire)` | Prevents CPU from speculatively reading data before the flag | +| CPU | Read `h_outputs_[slot]`, `h_ring_ptrs_[slot]` | (safe: ordered after acquire) | + +On x86, the acquire fence is technically a no-op (loads are not reordered with loads), but it is necessary for correctness on ARM (e.g., Grace Hopper). + +### CPU → GPU (Job Release → Queue Reuse) + +| Agent | Operation | Ordering Primitive | +|-------|-----------|-------------------| +| CPU | Write RPC response to ring buffer | (normal stores) | +| CPU | `__atomic_store_n(&h_ready_flags_[slot], 0, __ATOMIC_RELEASE)` | Ensures response writes are visible before flag is cleared | +| GPU | Read `d_ready_flags[slot] == 0` | (volatile read from mapped memory) | +| GPU | Overwrites `d_ring_ptrs[slot]`, `d_outputs[slot]` | (safe: flag was 0) | + +### Host → GPU (Ring Buffer Signaling) + +| Agent | Operation | Ordering Primitive | +|-------|-----------|-------------------| +| Host/Test | Write RPC header + payload to `rx_data[slot]` | (normal stores) | +| Host/Test | `__sync_synchronize()` / memory barrier | Full fence before flag write | +| Host/Test | Write `rx_flags[slot] = pointer` | (the "publish" signal) | +| GPU | Read `rx_flags[slot] != 0` | (volatile read from mapped memory) | + +--- + +## 9. CUDA Graph Hierarchy + +The system uses a **two-level graph hierarchy**: + +``` +Level 0: Dispatcher Graph (cudaq_dispatch_graph_context) + │ + │ Instantiated with cudaGraphInstantiateFlagDeviceLaunch + │ Contains: dispatch_kernel_with_graph (persistent kernel node) + │ + │ Device-side cudaGraphLaunch() ──► + │ + ├──► Level 1: PreDecoder Graph [0] + │ predecoder_input_kernel → TRT enqueueV3 → predecoder_output_kernel + │ + ├──► Level 1: PreDecoder Graph [1] + │ ... + ├──► Level 1: PreDecoder Graph [2] + │ ... + └──► Level 1: PreDecoder Graph [3] + ... +``` + +**Level 0** must be instantiated with `cudaGraphInstantiateFlagDeviceLaunch` so that the persistent kernel running inside it can call `cudaGraphLaunch()` on **Level 1** graphs. Level 1 graphs are also instantiated with this flag and uploaded to the device. The launch mode is `cudaStreamGraphFireAndForget`, meaning the predecoder graph executes asynchronously without blocking the dispatcher. + +**Requirement**: Compute capability >= sm_80 (Ampere and later). Device-side graph launch is gated by `#if __CUDA_ARCH__ >= 800`. + +**Limitation**: `cudaStreamGraphFireAndForget` has a CUDA runtime limit on concurrent pending child graph launches (~128). The test limits `total_requests` to 100 to stay under this ceiling. + +--- + +## 10. Pipeline Configurations + +The test supports multiple surface code distances via the `PipelineConfig` struct: + +| Config | Distance | Rounds | ONNX Model | Input Shape | Input Bytes | Residual Detectors | Z Stabilizers | Spatial Slices | Slot Size | +|--------|----------|--------|------------|-------------|-------------|-------------------|---------------|---------------|-----------| +| `d7` | 7 | 7 | `model1_d7_r7_unified_Z_batch1.onnx` | [1, 72, 7] | 2,016 | 336 | 24 | 14 | 4,096 | +| `d13` | 13 | 13 | `model1_d13_r13_unified_Z_batch1.onnx` | [1, 252, 13] | 13,104 | 2,184 | 84 | 26 | 16,384 | +| `d21` | 21 | 21 | `model1_d21_r21_unified_X_batch1.onnx` | [1, 660, 21] | 55,440 | 9,240 | 220 | 42 | 65,536 | +| `d31` | 31 | 31 | `model1_d31_r31_unified_Z_batch1.onnx` | [1, 1440, 31] | 178,560 | 29,760 | 480 | 62 | 262,144 | + +All models use **INT32** tensors for both input (measurements) and output (residual detectors, logical frame). + +The number of **spatial slices** is `residual_detectors / z_stabilizers`. PyMatching is called once per slice, with each slice containing one group of Z-stabilizer detector values. + +Usage: + +```bash +./test_realtime_predecoder_w_pymatching d7 # default +./test_realtime_predecoder_w_pymatching d13 +./test_realtime_predecoder_w_pymatching d21 +./test_realtime_predecoder_w_pymatching d31 +``` + +### Engine Caching + +On first run with a given configuration, the ONNX model is compiled to a TensorRT engine and saved alongside the ONNX file (e.g., `model1_d13_r13_unified_Z_batch1.engine`). Subsequent runs detect the cached engine and skip the build phase. + +--- + +## 11. File Inventory + +| File | Layer | Purpose | +|------|-------|---------| +| `realtime/include/.../cudaq_realtime.h` | API | C API header: structs, enums, function declarations | +| `realtime/include/.../dispatch_kernel_launch.h` | API | RPC protocol structs (RPCHeader, RPCResponse), FNV-1a hash | +| `realtime/lib/.../dispatch_kernel.cu` | Runtime | Persistent dispatcher kernels + graph-based dispatch context | +| `libs/qec/include/.../ai_decoder_service.h` | QEC | Base class header: TRT lifecycle, dynamic tensor bindings, engine caching | +| `libs/qec/lib/.../ai_decoder_service.cu` | QEC | Base class impl: ONNX build, engine save/load, gateway kernels, graph capture | +| `libs/qec/include/.../ai_predecoder_service.h` | QEC | Derived class header: CPU handoff queue, `QEC_CPU_RELAX` macro | +| `libs/qec/lib/.../ai_predecoder_service.cu` | QEC | Derived class impl: predecoder kernels, circular queue, poll/release | +| `libs/qec/include/.../utils/thread_pool.h` | Util | Thread pool with optional core pinning | +| `libs/qec/include/.../utils/pipeline_benchmarks.h` | Util | Reusable latency/throughput benchmarking utility | +| `libs/qec/lib/.../test_realtime_predecoder_w_pymatching.cpp` | Test | End-to-end integration test with real ONNX + PyMatching | + +--- + +## 12. Configuration Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `NUM_SLOTS` | 64 | Ring buffer slot count (Host ↔ GPU) | +| `slot_size` | Per-config (4096 - 262144) | Max payload per slot (RPCHeader + measurements + result) | +| `num_predecoders` | 4 | Parallel predecoder instances (TRT engines) | +| `queue_depth` | 16 | N-deep circular queue per predecoder | +| `num_workers` | 4 | Thread pool size (each gets its own PyMatching decoder) | +| `total_requests` | 100 | Requests per test run (limited by CUDA graph launch ceiling) | +| Dispatcher grid | 1 block, 32 threads | Persistent kernel configuration | +| Predecoder grid | 1 block, 128 threads | Per-graph kernel configuration | +| Spin timeout | 1,000,000 iterations | Defense-in-depth backpressure in input kernel | + +### Capacity Analysis + +- **Total GPU→CPU queue capacity**: 4 predecoders x 16 depth = 64 slots +- **Ring buffer capacity**: 64 slots +- These are balanced: worst case, all 64 ring buffer requests could be in-flight across the predecoder queues simultaneously. +- If requests are unevenly distributed (e.g., 32 to one predecoder), that predecoder's queue fills at depth 16, and the dispatcher applies backpressure for the remaining 16. +- **Batched submission**: The test fires requests in batches of `num_predecoders` (4), waiting for each batch to complete before submitting the next. This avoids overwhelming the dispatcher and stays within CUDA graph launch limits. + +--- + +## 13. Performance Benchmarking + +### PipelineBenchmark Utility + +The `PipelineBenchmark` class (`libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h`) provides reusable latency and throughput measurement for any pipeline test: + +```cpp +cudaq::qec::utils::PipelineBenchmark bench("d13_r13_Z", total_requests); +bench.start(); +// ... submit requests, mark_submit(i), mark_complete(i) ... +bench.stop(); +bench.report(); +``` + +It tracks per-request submit and complete timestamps, computes statistics only on completed requests, and reports: + +- Min, max, mean, p50, p90, p95, p99 latencies (microseconds) +- Standard deviation +- Total wall time and throughput (req/s) +- Submitted / completed / timed-out counts + +### Worker Timing Breakdown + +The test also reports an average breakdown of where time is spent: + +``` + Worker Timing Breakdown (avg over 100 requests): + PyMatching decode: 164.3 us (23.6%) + Worker overhead: 0.4 us ( 0.1%) + GPU+dispatch+poll: 530.1 us (76.3%) + Total end-to-end: 694.8 us + Per-round (/13): 53.4 us/round +``` + +### Measured Performance (representative, system-dependent) + +| Config | p50 Latency | Mean Latency | Throughput | PyMatching % | Per-round | +|--------|-------------|-------------|------------|-------------|-----------| +| d=7 | 262 us | 284 us | 10,803 req/s | 12.8% | 40.6 us | +| d=13 | 658 us | 678 us | 3,467 req/s | 23.0% | 52.1 us | + +### Profiling with Nsight Systems + +```bash +nsys profile --trace=cuda,nvtx,osrt --cuda-graph-trace=node \ + -o d13_profile ./test_realtime_predecoder_w_pymatching d13 +nsys stats d13_profile.nsys-rep +``` + +Key findings from profiling: +- GPU TRT inference is ~9 us/request (very fast) +- The dominant latency is in the dispatcher's slot-scanning loop and CPU polling gap +- PyMatching decode accounts for 13-23% of end-to-end latency depending on distance +- The `--cuda-graph-trace=node` flag is critical for seeing individual kernels inside CUDA graphs + +--- + +## 14. Portability + +### Architecture Support + +| Feature | x86_64 | aarch64 (Grace Hopper) | +|---------|--------|----------------------| +| `QEC_CPU_RELAX()` | `_mm_pause()` | `asm volatile("yield")` | +| Acquire fence in `poll_next_job` | No-op (TSO) | Required (`std::atomic_thread_fence`) | +| Release store in `release_job` | `__atomic_store_n` | `__atomic_store_n` | +| `volatile` for mapped memory | Sufficient | Requires fences (provided) | + +The `QEC_CPU_RELAX()` macro is defined in `ai_predecoder_service.h` and should be used by all polling code instead of platform-specific intrinsics. + +### CUDA Compute Capability + +| Feature | Minimum | +|---------|---------| +| Device-side `cudaGraphLaunch` | sm_80 (Ampere) | +| `__threadfence_system()` | sm_20+ | +| Mapped pinned memory | All CUDA devices | + +--- + +## 15. Limitations & Future Work + +1. **Linear function table lookup**: `dispatch_lookup_entry` performs a linear scan of the function table. With 4 entries this is negligible, but for larger tables a hash map or sorted binary search would be appropriate. + +2. **No queue drain on shutdown**: Setting `system_stop = true` causes the worker threads to exit immediately. Jobs that the GPU has completed but the CPU hasn't polled are silently dropped. Production code should drain all queues before stopping. + +3. **Dropped syndromes on timeout**: If the defense-in-depth spin timeout fires in `predecoder_input_kernel`, the syndrome is silently dropped. A production system should increment an error counter or signal the host. + +4. **Static TRT shapes only**: The current implementation assumes static input/output tensor shapes. Dynamic shapes would require per-invocation shape metadata in the RPC payload and runtime TRT profile switching. + +5. **Batched submission**: The test fires requests in batches of `num_predecoders` and waits for completion before the next batch. This serializes batches and underutilizes the pipeline. A pipelined submission strategy (overlapping batch N+1 submission with batch N completion) would improve throughput. + +6. **Single polling thread**: The `incoming_polling_loop` is a single thread that round-robins all predecoders. At higher predecoder counts, this could become a bottleneck. A per-predecoder polling thread or lock-free MPSC queue could help. + +7. **CUDA graph launch ceiling**: `cudaStreamGraphFireAndForget` has a runtime limit of ~128 concurrent pending child graph launches. The test limits `total_requests` to 100 to stay under this. Production systems with sustained high throughput may need to throttle submissions or use a different dispatch strategy. + +8. **Dispatcher scanning latency**: The persistent dispatcher kernel parks on the current slot and spins until it is populated. With batched submission, there is a round-trip delay between batch completion and next-batch submission that dominates the end-to-end latency (~550 us of the ~700 us total for d=13). From 5ddd4d3ff4312769f337af632ab271b0c08c6c08 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 20 Feb 2026 01:29:47 +0000 Subject: [PATCH 12/40] Add host-side spin-polling dispatcher to replace device-side persistent kernel The CUDA device runtime has a hardcoded 128 fire-and-forget graph launch slot limit that is never reclaimed while a persistent parent kernel runs, making the device-side dispatcher unsuitable for sustained operation. This adds a host-side CPU dispatcher thread that polls rx_flags and calls cudaGraphLaunch from host code on per-predecoder CUDA streams, bypassing the device runtime limit entirely. Streaming mode uses the host dispatcher; batch mode retains the device-side dispatcher for backward compatibility. Key changes: - New host_dispatcher.h/.cpp with host_dispatcher_loop() - AIPreDecoderService::capture_graph() gains device_launch flag for conditional cudaGraphInstantiateFlagDeviceLaunch vs standard instantiation - d_queue_idx_ changed from cudaMalloc to cudaHostAllocMapped so the host dispatcher can read backpressure state without cudaMemcpy - Mailbox bank changed to mapped pinned memory for zero-copy host writes - Streaming test uses host dispatcher with per-predecoder streams Verified: d7 streaming 16,824 requests (219 us mean, 31 us/round), d13 streaming 6,227 requests (455 us mean, 35 us/round), zero errors. Signed-off-by: Scott Thornton --- .../qec/realtime/ai_predecoder_service.h | 16 +- .../cudaq/qec/realtime/host_dispatcher.h | 44 ++++ .../qec/lib/realtime/ai_predecoder_service.cu | 37 ++- libs/qec/lib/realtime/host_dispatcher.cpp | 99 ++++++++ .../test_realtime_predecoder_w_pymatching.cpp | 218 ++++++++++++------ libs/qec/unittests/CMakeLists.txt | 1 + 6 files changed, 326 insertions(+), 89 deletions(-) create mode 100644 libs/qec/include/cudaq/qec/realtime/host_dispatcher.h create mode 100644 libs/qec/lib/realtime/host_dispatcher.cpp diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index dd2dec99..ba4ee551 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -35,7 +35,11 @@ class AIPreDecoderService : public AIDecoderService { int queue_depth = 16, const std::string& engine_save_path = ""); virtual ~AIPreDecoderService(); - void capture_graph(cudaStream_t stream) override; + /// @param device_launch If true, instantiate graph with DeviceLaunch flag + /// (for device-side dispatcher). If false, use standard instantiation + /// (for host-side dispatcher). + void capture_graph(cudaStream_t stream, bool device_launch); + void capture_graph(cudaStream_t stream) override { capture_graph(stream, true); } bool poll_next_job(PreDecoderJob& out_job); void release_job(int slot_idx); @@ -44,6 +48,11 @@ class AIPreDecoderService : public AIDecoderService { volatile int* get_device_ready_flags() const { return d_ready_flags_; } int* get_device_inflight_flag() const { return d_inflight_flag_; } + // Host-side accessors (for host dispatcher backpressure checks) + volatile int* get_host_ready_flags() const { return h_ready_flags_; } + volatile int* get_host_queue_idx() const { return h_queue_idx_; } + int get_queue_depth() const { return queue_depth_; } + private: int queue_depth_; int cpu_poll_idx_ = 0; @@ -58,8 +67,9 @@ class AIPreDecoderService : public AIDecoderService { void** d_ring_ptrs_ = nullptr; void* d_outputs_ = nullptr; - // Device State - int* d_queue_idx_ = nullptr; + // Queue index: mapped pinned so both GPU and host can access + volatile int* h_queue_idx_ = nullptr; // Host pointer + int* d_queue_idx_ = nullptr; // Device pointer (same physical memory) int* d_claimed_slot_ = nullptr; int* d_inflight_flag_ = nullptr; }; diff --git a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h new file mode 100644 index 00000000..9032c5b5 --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#pragma once + +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include +#include +#include +#include + +namespace cudaq::qec { + +struct HostDispatchEntry { + uint32_t function_id; + cudaGraphExec_t graph_exec; + int mailbox_idx; + AIPreDecoderService* predecoder; + cudaStream_t stream; +}; + +struct HostDispatcherConfig { + volatile uint64_t* rx_flags_host; + volatile uint64_t* tx_flags_host; + uint8_t* rx_data_host; + uint8_t* rx_data_dev; + void** h_mailbox_bank; + size_t num_slots; + size_t slot_size; + std::vector entries; + volatile int* shutdown_flag; + uint64_t* stats_counter; +}; + +/// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag +/// becomes non-zero. Call from a dedicated thread. +void host_dispatcher_loop(const HostDispatcherConfig& config); + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index de91afb7..f36333f5 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -105,8 +105,9 @@ AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_outputs_, (void*)h_outputs_, 0)); - SERVICE_CUDA_CHECK(cudaMalloc(&d_queue_idx_, sizeof(int))); - SERVICE_CUDA_CHECK(cudaMemset(d_queue_idx_, 0, sizeof(int))); + SERVICE_CUDA_CHECK(cudaHostAlloc((void**)&h_queue_idx_, sizeof(int), cudaHostAllocMapped)); + *const_cast(const_cast(h_queue_idx_)) = 0; + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_queue_idx_, (void*)h_queue_idx_, 0)); SERVICE_CUDA_CHECK(cudaMalloc(&d_claimed_slot_, sizeof(int))); SERVICE_CUDA_CHECK(cudaMemset(d_claimed_slot_, 0, sizeof(int))); @@ -119,12 +120,12 @@ AIPreDecoderService::~AIPreDecoderService() { if (h_ready_flags_) cudaFreeHost((void*)h_ready_flags_); if (h_ring_ptrs_) cudaFreeHost(h_ring_ptrs_); if (h_outputs_) cudaFreeHost(h_outputs_); - if (d_queue_idx_) cudaFree(d_queue_idx_); + if (h_queue_idx_) cudaFreeHost((void*)h_queue_idx_); if (d_claimed_slot_) cudaFree(d_claimed_slot_); if (d_inflight_flag_) cudaFree(d_inflight_flag_); } -void AIPreDecoderService::capture_graph(cudaStream_t stream) { +void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) { bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); if (!skip_trt) { @@ -156,15 +157,27 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream) { d_inflight_flag_); SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); - - cudaError_t inst_err = cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); - if (inst_err != cudaSuccess) { - cudaGraphDestroy(graph); - throw std::runtime_error( - std::string("cudaGraphInstantiateWithFlags FAILED: ") + cudaGetErrorString(inst_err)); + + if (device_launch) { + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiateWithFlags (DeviceLaunch) FAILED: ") + + cudaGetErrorString(inst_err)); + } + SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + } else { + cudaError_t inst_err = cudaGraphInstantiate(&graph_exec_, graph, 0); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiate FAILED: ") + + cudaGetErrorString(inst_err)); + } } - - SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + cudaGraphDestroy(graph); SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/libs/qec/lib/realtime/host_dispatcher.cpp b/libs/qec/lib/realtime/host_dispatcher.cpp new file mode 100644 index 00000000..c35e2366 --- /dev/null +++ b/libs/qec/lib/realtime/host_dispatcher.cpp @@ -0,0 +1,99 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include "cudaq/qec/realtime/host_dispatcher.h" +#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" + +#include +#include + +namespace cudaq::qec { + +void host_dispatcher_loop(const HostDispatcherConfig& config) { + size_t current_slot = 0; + const size_t num_slots = config.num_slots; + const int num_entries = static_cast(config.entries.size()); + uint64_t packets_dispatched = 0; + + while (!*config.shutdown_flag) { + uint64_t rx_value = config.rx_flags_host[current_slot]; + + if (rx_value == 0) { + QEC_CPU_RELAX(); + continue; + } + + std::atomic_thread_fence(std::memory_order_acquire); + + auto* data_host = reinterpret_cast(rx_value); + auto* header = static_cast(data_host); + + if (header->magic != cudaq::nvqlink::RPC_MAGIC_REQUEST) { + config.rx_flags_host[current_slot] = 0; + current_slot = (current_slot + 1) % num_slots; + continue; + } + + int entry_idx = -1; + for (int i = 0; i < num_entries; ++i) { + if (config.entries[i].function_id == header->function_id) { + entry_idx = i; + break; + } + } + + if (entry_idx < 0) { + config.rx_flags_host[current_slot] = 0; + current_slot = (current_slot + 1) % num_slots; + continue; + } + + const auto& entry = config.entries[entry_idx]; + + // Backpressure: check if the predecoder stream is idle + bool stream_busy = (cudaStreamQuery(entry.stream) != cudaSuccess); + if (stream_busy) { + current_slot = (current_slot + 1) % num_slots; + continue; + } + + // Backpressure: check if the predecoder queue is full + volatile int* h_ready = entry.predecoder->get_host_ready_flags(); + volatile int* h_qidx = entry.predecoder->get_host_queue_idx(); + if (h_ready[*h_qidx] == 1) { + current_slot = (current_slot + 1) % num_slots; + continue; + } + + // Translate host pointer to device pointer for the mailbox + ptrdiff_t offset = (uint8_t*)data_host - config.rx_data_host; + void* data_dev = (void*)(config.rx_data_dev + offset); + config.h_mailbox_bank[entry.mailbox_idx] = data_dev; + + __sync_synchronize(); + + cudaError_t err = cudaGraphLaunch(entry.graph_exec, entry.stream); + if (err != cudaSuccess) { + // Signal error via tx_flags (same protocol as device dispatcher) + size_t slot_idx = ((uint8_t*)data_host - config.rx_data_host) / config.slot_size; + uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; + config.tx_flags_host[slot_idx] = error_val; + } + + config.rx_flags_host[current_slot] = 0; + packets_dispatched++; + current_slot = (current_slot + 1) % num_slots; + } + + // Write stats + if (config.stats_counter) { + *config.stats_counter = packets_dispatched; + } +} + +} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index d8b570f9..2d617b15 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -64,6 +64,7 @@ #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/realtime/host_dispatcher.h" #include "cudaq/qec/utils/thread_pool.h" #include "cudaq/qec/utils/pipeline_benchmarks.h" #include "cudaq/qec/code.h" @@ -327,10 +328,13 @@ void run_streaming_test( volatile uint64_t* rx_flags_host, volatile uint64_t* tx_flags_host, uint8_t* rx_data_host, + uint8_t* rx_data_dev, DecoderContext& decoder_ctx, std::vector>& predecoders, cudaq::qec::utils::ThreadPool& pymatching_pool, - std::atomic& system_stop) + std::atomic& system_stop, + void** h_mailbox_bank, + std::vector& predecoder_streams) { using hrclock = std::chrono::high_resolution_clock; @@ -341,21 +345,42 @@ void run_streaming_test( std::vector complete_ts(max_requests); std::vector completed(max_requests, false); - // slot -> request_id mapping so consumer can correlate completions std::vector slot_request(NUM_SLOTS, -1); std::atomic total_submitted{0}; std::atomic total_completed{0}; - std::atomic in_flight{0}; std::atomic backpressure_stalls{0}; std::atomic producer_done{false}; - // Cap in-flight to num_predecoders. The dispatcher scans slots - // sequentially and only advances on non-empty slots. With the inflight - // flag limiting one graph launch per predecoder, only num_predecoders - // slots can be consumed per scan. Any excess slots get backpressured, - // then the dispatcher parks on an empty slot and never revisits them. - const int max_in_flight = config.num_predecoders; + // Set up host dispatcher + volatile int dispatcher_shutdown = 0; + uint64_t dispatcher_stats = 0; + + HostDispatcherConfig disp_cfg; + disp_cfg.rx_flags_host = rx_flags_host; + disp_cfg.tx_flags_host = tx_flags_host; + disp_cfg.rx_data_host = rx_data_host; + disp_cfg.rx_data_dev = rx_data_dev; + disp_cfg.h_mailbox_bank = h_mailbox_bank; + disp_cfg.num_slots = NUM_SLOTS; + disp_cfg.slot_size = config.slot_size; + disp_cfg.shutdown_flag = &dispatcher_shutdown; + disp_cfg.stats_counter = &dispatcher_stats; + + for (int i = 0; i < config.num_predecoders; ++i) { + std::string func_name = "predecode_target_" + std::to_string(i); + HostDispatchEntry entry; + entry.function_id = fnv1a_hash(func_name); + entry.graph_exec = predecoders[i]->get_executable_graph(); + entry.mailbox_idx = i; + entry.predecoder = predecoders[i].get(); + entry.stream = predecoder_streams[i]; + disp_cfg.entries.push_back(entry); + } + + std::thread dispatcher_thread([&disp_cfg]() { + host_dispatcher_loop(disp_cfg); + }); auto run_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(scfg.duration_s); @@ -364,11 +389,12 @@ void run_streaming_test( ? std::to_string(scfg.rate_us) + " us" : "open-loop"; - std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" + std::cout << "\n[Stream] Starting streaming test (" << config.label + << ", HOST dispatcher)\n" << " Rate: " << rate_label << "\n" << " Duration: " << scfg.duration_s << " s\n" << " Warmup: " << scfg.warmup_count << " requests\n" - << " Max flight: " << max_in_flight << "\n" + << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" << " Max reqs: " << max_requests << "\n\n"; // --- Producer thread (simulates FPGA) --- @@ -380,12 +406,6 @@ void run_streaming_test( while (std::chrono::steady_clock::now() < run_deadline && req_id < max_requests) { - // Throttle: don't exceed max_in_flight to prevent ring buffer flooding - while (in_flight.load(std::memory_order_acquire) >= max_in_flight) { - QEC_CPU_RELAX(); - if (std::chrono::steady_clock::now() >= run_deadline) return; - } - int slot = next_slot % (int)NUM_SLOTS; // Wait for slot to be fully free (dispatcher consumed + response harvested) @@ -413,13 +433,11 @@ void run_streaming_test( __sync_synchronize(); submit_ts[req_id] = hrclock::now(); rx_flags_host[slot] = reinterpret_cast(slot_data); - in_flight.fetch_add(1, std::memory_order_release); total_submitted.fetch_add(1, std::memory_order_release); next_slot++; req_id++; - // Rate limiting (busy-wait for precision) if (scfg.rate_us > 0) { auto target_time = submit_ts[req_id - 1] + std::chrono::microseconds(scfg.rate_us); @@ -443,7 +461,6 @@ void run_streaming_test( if (pdone && ncomp >= nsub) break; - // Nothing to harvest yet if (next_harvest >= nsub) { QEC_CPU_RELAX(); continue; @@ -469,7 +486,6 @@ void run_streaming_test( tx_flags_host[slot] = 0; slot_request[slot] = -1; - in_flight.fetch_sub(1, std::memory_order_release); next_harvest++; } else { QEC_CPU_RELAX(); @@ -486,6 +502,11 @@ void run_streaming_test( usleep(1000); } + // Shut down the host dispatcher thread + dispatcher_shutdown = 1; + __sync_synchronize(); + dispatcher_thread.join(); + consumer.join(); // ===== Report ===== @@ -601,6 +622,8 @@ void run_streaming_test( std::cout << " Per-round (/" << config.num_rounds << "): " << std::setw(10) << (mean / config.num_rounds) << " us/round\n"; } + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; std::cout << "================================================================\n"; } @@ -733,24 +756,54 @@ int main(int argc, char* argv[]) { g_sys_ctx.rx_data_host = rx_data_host; g_sys_ctx.slot_size = config.slot_size; - // Allocate Global Mailbox Bank & Control signals - void** d_global_mailbox_bank; - CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); - CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); + // ========================================================================= + // Mailbox & Dispatcher Setup (mode-dependent) + // ========================================================================= + + // Mapped pinned mailbox (used by both modes -- host writes, GPU reads) + void** h_mailbox_bank = nullptr; + void** d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); + + // Device memory mailbox (for device-side dispatcher backward compat) + void** d_global_mailbox_bank = nullptr; + + int* shutdown_flag_host = nullptr; + int* d_shutdown_flag = nullptr; + uint64_t* d_stats = nullptr; + cudaq_function_entry_t* d_function_entries = nullptr; + cudaq_dispatch_graph_context* dispatch_ctx = nullptr; - int* shutdown_flag_host; - CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); - *shutdown_flag_host = 0; - int* d_shutdown_flag; - CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); + // Per-predecoder streams (for host dispatcher) + std::vector predecoder_streams; - uint64_t* d_stats; - CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + const bool use_host_dispatcher = streaming_mode; + bool device_launch = !use_host_dispatcher; + + if (!use_host_dispatcher) { + CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); + CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); + + CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); + *shutdown_flag_host = 0; + CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); + + CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + } else { + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); + } + } // Initialize AIPreDecoder Instances from ONNX std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs...\n"; + << "x AIPreDecoder Graphs (" + << (device_launch ? "device-launch" : "host-launch") << ")...\n"; cudaStream_t capture_stream; CUDA_CHECK(cudaStreamCreate(&capture_stream)); @@ -759,7 +812,9 @@ int main(int argc, char* argv[]) { bool need_save = (model_path == onnx_file); for (int i = 0; i < config.num_predecoders; ++i) { - void** my_mailbox = d_global_mailbox_bank + i; + void** my_mailbox = use_host_dispatcher + ? (d_mailbox_bank + i) + : (d_global_mailbox_bank + i); std::string save_path = (need_save && i == 0) ? engine_file : ""; auto pd = std::make_unique(model_path, my_mailbox, config.queue_depth, @@ -769,37 +824,40 @@ int main(int argc, char* argv[]) { << ": input_size=" << pd->get_input_size() << " output_size=" << pd->get_output_size() << "\n"; - pd->capture_graph(capture_stream); - - cudaGraphExec_t gexec = pd->get_executable_graph(); - std::string func_name = "predecode_target_" + std::to_string(i); - function_entries[i].function_id = fnv1a_hash(func_name); - function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_entries[i].handler.graph_exec = gexec; - function_entries[i].mailbox_idx = i; - function_entries[i].d_queue_idx = pd->get_device_queue_idx(); - function_entries[i].d_ready_flags = pd->get_device_ready_flags(); - function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); + pd->capture_graph(capture_stream, device_launch); + + if (!use_host_dispatcher) { + cudaGraphExec_t gexec = pd->get_executable_graph(); + std::string func_name = "predecode_target_" + std::to_string(i); + function_entries[i].function_id = fnv1a_hash(func_name); + function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_entries[i].handler.graph_exec = gexec; + function_entries[i].mailbox_idx = i; + function_entries[i].d_queue_idx = pd->get_device_queue_idx(); + function_entries[i].d_ready_flags = pd->get_device_ready_flags(); + function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); + } predecoders.push_back(std::move(pd)); } - cudaq_function_entry_t* d_function_entries; - CUDA_CHECK(cudaMalloc(&d_function_entries, - config.num_predecoders * sizeof(cudaq_function_entry_t))); - CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), - config.num_predecoders * sizeof(cudaq_function_entry_t), - cudaMemcpyHostToDevice)); - - // Start GPU Dispatcher - std::cout << "[Setup] Launching Dispatcher Kernel...\n"; - cudaq_dispatch_graph_context* dispatch_ctx = nullptr; - CUDA_CHECK(cudaq_create_dispatch_graph_regular( - rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, - d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, - capture_stream, &dispatch_ctx - )); - CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); + if (!use_host_dispatcher) { + CUDA_CHECK(cudaMalloc(&d_function_entries, + config.num_predecoders * sizeof(cudaq_function_entry_t))); + CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), + config.num_predecoders * sizeof(cudaq_function_entry_t), + cudaMemcpyHostToDevice)); + + std::cout << "[Setup] Launching GPU Dispatcher Kernel...\n"; + CUDA_CHECK(cudaq_create_dispatch_graph_regular( + rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, + d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, + capture_stream, &dispatch_ctx + )); + CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); + } else { + std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; + } // Start CPU Infrastructure std::cout << "[Setup] Booting Thread Pool (" << config.num_workers @@ -817,8 +875,9 @@ int main(int argc, char* argv[]) { // ========================================================================= if (streaming_mode) { run_streaming_test(config, stream_cfg, rx_flags_host, tx_flags_host, - rx_data_host, decoder_ctx, predecoders, - pymatching_pool, system_stop); + rx_data_host, rx_data_dev, decoder_ctx, predecoders, + pymatching_pool, system_stop, + h_mailbox_bank, predecoder_streams); } else { // Batch mode: fire requests in batches of num_predecoders, wait for // each batch to complete before firing the next. @@ -941,26 +1000,37 @@ int main(int argc, char* argv[]) { // Teardown std::cout << "[Teardown] Shutting down...\n"; - *shutdown_flag_host = 1; - __sync_synchronize(); system_stop = true; + if (!use_host_dispatcher) { + *shutdown_flag_host = 1; + __sync_synchronize(); + } + incoming_thread.join(); CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - uint64_t dispatched_packets = 0; - CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; + if (!use_host_dispatcher) { + uint64_t dispatched_packets = 0; + CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; + CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + } - CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + // Synchronize predecoder streams before cleanup + for (auto& s : predecoder_streams) { + cudaStreamSynchronize(s); + cudaStreamDestroy(s); + } cudaFreeHost((void*)rx_flags_host); cudaFreeHost((void*)tx_flags_host); cudaFreeHost(rx_data_host); - cudaFreeHost(shutdown_flag_host); - cudaFree(d_global_mailbox_bank); - cudaFree(d_stats); - cudaFree(d_function_entries); + cudaFreeHost(h_mailbox_bank); + if (shutdown_flag_host) cudaFreeHost(shutdown_flag_host); + if (d_global_mailbox_bank) cudaFree(d_global_mailbox_bank); + if (d_stats) cudaFree(d_stats); + if (d_function_entries) cudaFree(d_function_entries); cudaStreamDestroy(capture_stream); std::cout << "Done.\n"; diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 5196e253..255c3522 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -218,6 +218,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/host_dispatcher.cpp ) set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES From 779cdcb065690c124009830f3a71253d5d85f378 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Sat, 21 Feb 2026 23:14:33 +0000 Subject: [PATCH 13/40] realtime: host-side dynamic worker pool dispatcher and predecoder refactor - Add host dispatcher with dynamic worker pool (idle_mask, inflight_slot_tags) to avoid head-of-line blocking; use libcu++ system-scope atomics for rx/tx/ready flags and mapped pinned memory. - Extend AIPreDecoderService and PreDecoderJob with origin_slot for out-of-order completion; default queue_depth 1 for host dispatch. - Add design doc (host_side_dispatcher_design_gemini.md) with spin-polling dispatcher and worker pseudocode/constraints. - Refactor test_realtime_predecoder_w_pymatching for dynamic pool and update CMakeLists; adjust nvqlink daemon and dispatch_kernel for host-side dispatch. Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 195 ++ .../qec/realtime/ai_predecoder_service.h | 44 +- .../cudaq/qec/realtime/host_dispatcher.h | 36 +- .../qec/lib/realtime/ai_predecoder_service.cu | 122 +- libs/qec/lib/realtime/host_dispatcher.cpp | 98 +- .../test_realtime_predecoder_w_pymatching.cpp | 2106 +++++++++-------- libs/qec/unittests/CMakeLists.txt | 6 + .../daemon/dispatcher/cudaq_realtime.h | 2 +- .../lib/daemon/dispatcher/dispatch_kernel.cu | 5 +- 9 files changed, 1462 insertions(+), 1152 deletions(-) create mode 100644 docs/host_side_dispatcher_design_gemini.md diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md new file mode 100644 index 00000000..30093118 --- /dev/null +++ b/docs/host_side_dispatcher_design_gemini.md @@ -0,0 +1,195 @@ +# Host-Side Spin-Polling Dispatcher with Dynamic Worker Pool + +## Design Specification + +**Component**: `cudaq-qec` Realtime Decoding Subsystem +**Status**: Approved for Implementation +**Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher +**Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) +**Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` +**Last Updated**: 2026-02-20 + +--- + +## 1. System Context & Motivation + +### 1.1 The Pipeline +The system performs real-time quantum error correction (QEC). An FPGA streams syndrome measurements into a host-device shared ring buffer continuously (~1 µs cadence). +1. **Predecoding (GPU)**: TensorRT neural network inference (~9 µs). +2. **Global Decoding (CPU)**: PyMatching (MWPM) (~40-300 µs, highly variable). + +### 1.2 The Problem +The legacy architecture used a persistent GPU kernel to launch child CUDA graphs using `cudaStreamGraphFireAndForget`. This hit a hardcoded CUDA runtime limit of 128 cumulative launches, causing fatal crashes. A naive host-side port mapping FPGA slots 1:1 to GPU streams caused **Head-of-Line (HOL) blocking**: a single slow PyMatching decode would stall the sequential dispatcher, backing up the ring buffer and violating strict quantum coherence latency budgets. + +### 1.3 The Solution +This document defines a **Host-Side Dispatcher with a Dynamic Worker Pool**. +* The dispatcher runs on a dedicated CPU core. +* Predecoder streams and CPU workers act as an interchangeable pool. +* Inflight jobs are tagged with their origin slot, allowing out-of-order execution and completion. +* Synchronization relies exclusively on Grace Blackwell's NVLink-C2C hardware using libcu++ system-scope atomics. + +--- + +## 2. Core Architecture: Dynamic Worker Pool + +Instead of mapping predecoder streams statically to incoming data, the host dispatcher maintains a bitmask of available workers (`idle_mask`). + +1. **Allocate**: When `rx_flags[slot]` indicates new data, the dispatcher finds the first available worker stream using a hardware bit-scan (`__builtin_ffsll`). +2. **Tag**: The dispatcher records the original `slot` in a tracking array (`inflight_slot_tags[worker_id]`) so the response can be routed correctly. +3. **Dispatch**: The dispatcher launches the CUDA graph on the assigned worker's stream and clears its availability bit. +4. **Free**: When the CPU PyMatching worker finishes the job and writes the response to `tx_flags[origin_slot]`, it restores the worker's availability bit in the `idle_mask`. + +--- + +## 3. Memory & Synchronization Model + +**CRITICAL DIRECTIVE**: The ARM Neoverse architecture (Grace) is **weakly ordered**. Code generated from this document MUST NOT use `volatile`, `__threadfence_system()`, or `std::atomic_thread_fence`. + +All shared state must use **libcu++ system-scope atomics** allocated in mapped pinned memory (`cudaHostAllocMapped`). + +### 3.1 Shared State Variables + +| Variable | Type | Memory Location | Purpose | +| :--- | :--- | :--- | :--- | +| `rx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | FPGA writes data ptr; CPU polls (Acquire). | +| `tx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | CPU writes response; FPGA polls (Release). | +| `ready_flags[NUM_WORKERS]` | `atomic` | Mapped Pinned | GPU signals TRT done; CPU polls (Release/Acquire). | +| `idle_mask` | `atomic` | Host CPU Mem | Bitmask of free workers. 1 = free, 0 = busy. | +| `inflight_slot_tags[NUM_WORKERS]`| `int` (Plain array) | Host CPU Mem | Maps `worker_id` -> original FPGA `slot`. | +| `mailbox_bank[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Dispatcher writes device ptr for GPU input kernel. | + +--- + +## 4. Host Dispatcher Thread (Producer) + +The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. + +### 4.1 Dispatcher Logic (Pseudocode) +```cpp +#include + +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + +void host_dispatcher_loop(DispatcherContext& ctx) { + size_t current_slot = 0; + + while (ctx.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { + // 1. Poll incoming ring buffer + uint64_t rx_value = ctx.rx_flags[current_slot].load(cuda::std::memory_order_acquire); + + if (rx_value != 0) { + // 2. Wait for an available worker in the pool (Spin if all busy) + uint64_t mask = ctx.idle_mask->load(cuda::std::memory_order_acquire); + if (mask == 0) { + QEC_CPU_RELAX(); + continue; // Do NOT advance slot. Wait for worker. + } + + // 3. Allocate worker + int worker_id = __builtin_ffsll(mask) - 1; + + // Mark worker as busy (atomic fetch_and with inverted bit) + ctx.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); + + // 4. Tag the payload with its origin slot for out-of-order return + ctx.inflight_slot_tags[worker_id] = current_slot; + + // 5. Translate Host Ptr to Device Ptr for the GPU Mailbox + void* data_host = reinterpret_cast(rx_value); + ptrdiff_t offset = (uint8_t*)data_host - ctx.rx_data_host; + void* data_dev = (void*)(ctx.rx_data_dev + offset); + + ctx.h_mailbox_bank[worker_id] = data_dev; + __sync_synchronize(); // Full barrier to ensure mailbox write is visible + + // 6. Launch graph on the assigned worker's stream + cudaGraphLaunch(ctx.workers[worker_id].graph_exec, ctx.workers[worker_id].stream); + + // 7. Consume slot and advance + ctx.rx_flags[current_slot].store(0, cuda::std::memory_order_release); + current_slot = (current_slot + 1) % ctx.num_slots; + + } else { + QEC_CPU_RELAX(); // No data, spin on current slot + } + } + // Cleanup: Synchronize all streams before exit to prevent illegal memory access + for(auto& w : ctx.workers) cudaStreamSynchronize(w.stream); +} +``` + +--- + +## 5. GPU Kernel Modifications + +The predecoder GPU kernels require minimal changes, as the dynamic pooling complexity is handled entirely by the host. + +1. **Input Kernel**: Reads `*mailbox_slot_ptr` (mapped pinned) to get the device pointer to the ring buffer data. It copies this to `d_trt_input`. +2. **Output Kernel**: Copies `d_trt_output` to `h_outputs[worker_id]` (mapped pinned). +3. **Completion Signal**: The output kernel signals the CPU polling thread by setting the ready flag: + ```cpp + // Device code + d_ready_flags[worker_id].store(1, cuda::std::memory_order_release); + ``` + +*(Note: `cudaGraphInstantiateFlagDeviceLaunch` MUST be removed from graph capture. Use `cudaGraphInstantiate(&graph_exec, graph, 0)`).* + +--- + +## 6. Worker Subsystem (Consumer) + +A separate CPU polling thread scans the `ready_flags` array. When a GPU graph finishes, the job is handed to a CPU thread pool for PyMatching decoding. + +### 6.1 Worker Logic (Pseudocode) +```cpp +void pymatching_worker_task(WorkerContext& ctx, int worker_id) { + // 1. Read GPU outputs from mapped pinned memory + // ... run PyMatching MWPM ... + + // 2. Lookup origin slot for out-of-order routing + int origin_slot = ctx.inflight_slot_tags[worker_id]; + + // 3. Write response back to the EXACT slot the FPGA expects + uint64_t response_val = format_response(...); + ctx.tx_flags[origin_slot].store(response_val, cuda::std::memory_order_release); + + // 4. Acknowledge GPU read completion + ctx.ready_flags[worker_id].store(0, cuda::std::memory_order_release); + + // 5. FREE THE WORKER: Return this worker back to the dispatcher pool + ctx.idle_mask->fetch_or((1ULL << worker_id), cuda::std::memory_order_release); +} +``` + +--- + +## 7. Step-by-Step Data Flow Trace + +1. **FPGA** writes INT32 measurements into `rx_data[5]`. +2. **FPGA** sets `rx_flags[5] = host_ptr`. +3. **Host Dispatcher** reads `rx_flags[5]`, sees data. +4. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. +5. **Host Dispatcher** marks bit 2 busy in `idle_mask`. +6. **Host Dispatcher** saves `inflight_slot_tags[2] = 5`. +7. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. +8. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. +9. **Host Dispatcher** clears `rx_flags[5] = 0` and advances to `current_slot = 6`. +10. **GPU** executes graph on stream 2. Finishes and sets `ready_flags[2] = 1`. +11. **CPU Poller** sees `ready_flags[2] == 1`, triggers PyMatching on CPU. +12. **CPU Worker** finishes PyMatching. +13. **CPU Worker** looks up `origin_slot = inflight_slot_tags[2]` (which is 5). +14. **CPU Worker** writes response to `tx_flags[5]`. +15. **CPU Worker** restores bit 2 in `idle_mask`, freeing `worker_id = 2` for the dispatcher. + +--- + +## 8. LLM Implementation Directives (Constraints Checklist) + +When generating code from this specification, the LLM **MUST** strictly adhere to the following constraints: + +- [ ] **NO CUDA STREAM QUERYING**: Do not use `cudaStreamQuery()` for backpressure or completion checking. It incurs severe driver latency. Rely strictly on `idle_mask` and `ready_flags`. +- [ ] **NO WEAK ORDERING BUGS**: Do not use `volatile`. Do not use `__threadfence_system()`. You must use `cuda::std::atomic` for all cross-device synchronization. +- [ ] **NO HEAD OF LINE BLOCKING**: The host dispatcher MUST NOT statically map slots to predecoders. It must dynamically allocate via `idle_mask`. +- [ ] **NO DATA LOSS**: If `idle_mask == 0` (all workers busy), the dispatcher MUST spin on the current slot (`QEC_CPU_RELAX()`). It MUST NOT advance `current_slot` until a worker is allocated and the graph is launched. +- [ ] **NO RACE CONDITIONS ON TAGS**: `inflight_slot_tags` does not need to be atomic because index `[worker_id]` is exclusively owned by the active flow once the dispatcher clears the bit in `idle_mask`, until the worker thread restores the bit. diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index ba4ee551..e2b5be46 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -8,7 +8,8 @@ #pragma once -#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include #include // Portable CPU Yield Macro for busy-polling @@ -24,54 +25,43 @@ namespace cudaq::qec { struct PreDecoderJob { - int slot_idx; + int slot_idx; ///< Worker/slot index (for release_job; always 0) + int origin_slot; ///< FPGA ring slot for tx_flags routing (dynamic pool) void* ring_buffer_ptr; - void* inference_data; // Points into the pinned output queue (type-agnostic) + void* inference_data; ///< Points into the pinned output (single slot) }; class AIPreDecoderService : public AIDecoderService { public: AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, - int queue_depth = 16, const std::string& engine_save_path = ""); + int queue_depth = 1, const std::string& engine_save_path = ""); virtual ~AIPreDecoderService(); - /// @param device_launch If true, instantiate graph with DeviceLaunch flag - /// (for device-side dispatcher). If false, use standard instantiation - /// (for host-side dispatcher). void capture_graph(cudaStream_t stream, bool device_launch); void capture_graph(cudaStream_t stream) override { capture_graph(stream, true); } bool poll_next_job(PreDecoderJob& out_job); void release_job(int slot_idx); - int* get_device_queue_idx() const { return d_queue_idx_; } - volatile int* get_device_ready_flags() const { return d_ready_flags_; } - int* get_device_inflight_flag() const { return d_inflight_flag_; } + /// Stub for device-dispatcher batch path (returns nullptr; streaming uses host dispatcher) + int* get_device_queue_idx() const { return nullptr; } + cuda::atomic* get_device_ready_flags() const { return d_ready_flags_; } + int* get_device_inflight_flag() const { return nullptr; } - // Host-side accessors (for host dispatcher backpressure checks) - volatile int* get_host_ready_flags() const { return h_ready_flags_; } - volatile int* get_host_queue_idx() const { return h_queue_idx_; } + cuda::atomic* get_host_ready_flags() const { return h_ready_flags_; } + volatile int* get_host_queue_idx() const { return nullptr; } int get_queue_depth() const { return queue_depth_; } private: - int queue_depth_; - int cpu_poll_idx_ = 0; + int queue_depth_; // Always 1 - // Pinned Host Memory (The Queue) - volatile int* h_ready_flags_ = nullptr; - void** h_ring_ptrs_ = nullptr; - void* h_outputs_ = nullptr; // Type-agnostic pinned output queue + cuda::atomic* h_ready_flags_ = nullptr; + void** h_ring_ptrs_ = nullptr; + void* h_outputs_ = nullptr; - // Device Mapped Pointers (For the Graph to write to) - volatile int* d_ready_flags_ = nullptr; + cuda::atomic* d_ready_flags_ = nullptr; void** d_ring_ptrs_ = nullptr; void* d_outputs_ = nullptr; - - // Queue index: mapped pinned so both GPU and host can access - volatile int* h_queue_idx_ = nullptr; // Host pointer - int* d_queue_idx_ = nullptr; // Device pointer (same physical memory) - int* d_claimed_slot_ = nullptr; - int* d_inflight_flag_ = nullptr; }; } // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h index 9032c5b5..5eaf049e 100644 --- a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h +++ b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h @@ -8,37 +8,55 @@ #pragma once -#include "cudaq/qec/realtime/ai_predecoder_service.h" #include +#include #include #include #include +#ifndef QEC_CPU_RELAX +#if defined(__x86_64__) +#include +#define QEC_CPU_RELAX() _mm_pause() +#elif defined(__aarch64__) +#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") +#else +#define QEC_CPU_RELAX() do { } while (0) +#endif +#endif + namespace cudaq::qec { -struct HostDispatchEntry { - uint32_t function_id; +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + +struct HostDispatchWorker { cudaGraphExec_t graph_exec; - int mailbox_idx; - AIPreDecoderService* predecoder; cudaStream_t stream; }; struct HostDispatcherConfig { - volatile uint64_t* rx_flags_host; - volatile uint64_t* tx_flags_host; + atomic_uint64_sys* rx_flags; + atomic_uint64_sys* tx_flags; uint8_t* rx_data_host; uint8_t* rx_data_dev; void** h_mailbox_bank; size_t num_slots; size_t slot_size; - std::vector entries; - volatile int* shutdown_flag; + std::vector workers; + atomic_int_sys* shutdown_flag; uint64_t* stats_counter; + /// Optional: atomic counter incremented on each dispatch (for progress diagnostics). + atomic_uint64_sys* live_dispatched = nullptr; + + /// Dynamic worker pool (design: Host-Side Spin-Polling Dispatcher) + atomic_uint64_sys* idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id + int* inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags routing }; /// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag /// becomes non-zero. Call from a dedicated thread. +/// Uses dynamic worker pool: allocates via idle_mask, tags with inflight_slot_tags. void host_dispatcher_loop(const HostDispatcherConfig& config); } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index f36333f5..c29599d9 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -9,6 +9,7 @@ #include "cudaq/qec/realtime/ai_predecoder_service.h" #include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include #include #include #include @@ -23,28 +24,25 @@ namespace cudaq::qec { +// System scope for NVLink/PCIe visibility to host (design: no __threadfence_system) +using atomic_int_sys = cuda::atomic; + // ============================================================================= -// Kernels +// Kernels (single slot 0 only; queue removed for host-side dynamic pool) // ============================================================================= __global__ void predecoder_input_kernel( - void** mailbox_slot_ptr, int* d_queue_idx, volatile int* d_ready_flags, - void** d_ring_ptrs, void* trt_input, size_t input_size_bytes, - int* d_claimed_slot) + void** mailbox_slot_ptr, + atomic_int_sys* d_ready_flags, + void** d_ring_ptrs, + void* trt_input, + size_t input_size_bytes) { - __shared__ int slot_idx; __shared__ void* ring_ptr; if (threadIdx.x == 0 && blockIdx.x == 0) { ring_ptr = *mailbox_slot_ptr; - slot_idx = *d_queue_idx; - *d_claimed_slot = slot_idx; - - if (d_ready_flags[slot_idx] == 1) { - ring_ptr = nullptr; - } else { - d_ring_ptrs[slot_idx] = ring_ptr; - } + d_ring_ptrs[0] = ring_ptr; } __syncthreads(); @@ -58,26 +56,22 @@ __global__ void predecoder_input_kernel( } __global__ void predecoder_output_kernel( - int* d_claimed_slot, int* d_queue_idx, int queue_depth, - volatile int* d_ready_flags, void* d_outputs, const void* trt_output, - size_t output_size_bytes, volatile int* d_inflight_flag) + atomic_int_sys* d_ready_flags, + void* d_outputs, + const void* trt_output, + size_t output_size_bytes) { - int slot_idx = *d_claimed_slot; - - char* dst = (char*)d_outputs + (slot_idx * output_size_bytes); + char* dst = (char*)d_outputs; const char* src = (const char*)trt_output; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < output_size_bytes; i += blockDim.x * gridDim.x) { dst[i] = src[i]; } __syncthreads(); - __threadfence_system(); if (threadIdx.x == 0 && blockIdx.x == 0) { - d_ready_flags[slot_idx] = 1; - *d_queue_idx = (slot_idx + 1) % queue_depth; - __threadfence_system(); - *d_inflight_flag = 0; + d_ready_flags[0].store(1, cuda::std::memory_order_release); } } @@ -92,37 +86,39 @@ __global__ void passthrough_copy_kernel(void* dst, const void* src, size_t num_b // ============================================================================= AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, - int queue_depth, const std::string& engine_save_path) - : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(queue_depth) + int /* queue_depth (ignored; always 1) */, + const std::string& engine_save_path) + : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(1) { - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ready_flags_, queue_depth_ * sizeof(int), cudaHostAllocMapped)); - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, queue_depth_ * sizeof(void*), cudaHostAllocMapped)); - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_outputs_, queue_depth_ * get_output_size(), cudaHostAllocMapped)); + void* buf = nullptr; + + SERVICE_CUDA_CHECK(cudaHostAlloc(&buf, sizeof(atomic_int_sys), cudaHostAllocMapped)); + h_ready_flags_ = static_cast(buf); + new (h_ready_flags_) atomic_int_sys(0); - memset((void*)h_ready_flags_, 0, queue_depth_ * sizeof(int)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, sizeof(void*), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_outputs_, get_output_size(), cudaHostAllocMapped)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ready_flags_, (void*)h_ready_flags_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_outputs_, (void*)h_outputs_, 0)); - - SERVICE_CUDA_CHECK(cudaHostAlloc((void**)&h_queue_idx_, sizeof(int), cudaHostAllocMapped)); - *const_cast(const_cast(h_queue_idx_)) = 0; - SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_queue_idx_, (void*)h_queue_idx_, 0)); - - SERVICE_CUDA_CHECK(cudaMalloc(&d_claimed_slot_, sizeof(int))); - SERVICE_CUDA_CHECK(cudaMemset(d_claimed_slot_, 0, sizeof(int))); - - SERVICE_CUDA_CHECK(cudaMalloc(&d_inflight_flag_, sizeof(int))); - SERVICE_CUDA_CHECK(cudaMemset(d_inflight_flag_, 0, sizeof(int))); } AIPreDecoderService::~AIPreDecoderService() { - if (h_ready_flags_) cudaFreeHost((void*)h_ready_flags_); - if (h_ring_ptrs_) cudaFreeHost(h_ring_ptrs_); - if (h_outputs_) cudaFreeHost(h_outputs_); - if (h_queue_idx_) cudaFreeHost((void*)h_queue_idx_); - if (d_claimed_slot_) cudaFree(d_claimed_slot_); - if (d_inflight_flag_) cudaFree(d_inflight_flag_); + if (h_ready_flags_) { + h_ready_flags_[0].~atomic_int_sys(); + cudaFreeHost((void*)h_ready_flags_); + h_ready_flags_ = nullptr; + d_ready_flags_ = nullptr; + } + if (h_ring_ptrs_) { + cudaFreeHost(h_ring_ptrs_); + h_ring_ptrs_ = nullptr; + } + if (h_outputs_) { + cudaFreeHost(h_outputs_); + h_outputs_ = nullptr; + } } void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) { @@ -140,9 +136,9 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) SERVICE_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); predecoder_input_kernel<<<1, 128, 0, stream>>>( - device_mailbox_slot_, d_queue_idx_, d_ready_flags_, - d_ring_ptrs_, d_trt_input_, get_input_size(), - d_claimed_slot_); + device_mailbox_slot_, + static_cast(d_ready_flags_), + d_ring_ptrs_, d_trt_input_, get_input_size()); if (skip_trt) { passthrough_copy_kernel<<<1, 128, 0, stream>>>( @@ -152,9 +148,8 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) } predecoder_output_kernel<<<1, 128, 0, stream>>>( - d_claimed_slot_, d_queue_idx_, queue_depth_, d_ready_flags_, - d_outputs_, d_trt_output_, get_output_size(), - d_inflight_flag_); + static_cast(d_ready_flags_), + d_outputs_, d_trt_output_, get_output_size()); SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); @@ -183,21 +178,24 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) } bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { - if (h_ready_flags_[cpu_poll_idx_] == 1) { - std::atomic_thread_fence(std::memory_order_acquire); - - out_job.slot_idx = cpu_poll_idx_; - out_job.ring_buffer_ptr = h_ring_ptrs_[cpu_poll_idx_]; - out_job.inference_data = static_cast(h_outputs_) + (cpu_poll_idx_ * get_output_size()); - - cpu_poll_idx_ = (cpu_poll_idx_ + 1) % queue_depth_; + auto* sys_flags = static_cast(h_ready_flags_); + int expected = 1; + // Atomically claim: 1 (Ready) -> 2 (Processing) so we enqueue the job exactly once. + // Use relaxed on failure so spinning doesn't add barriers that delay seeing GPU's store(1). + if (sys_flags[0].compare_exchange_strong(expected, 2, + cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed)) { + out_job.slot_idx = 0; + out_job.ring_buffer_ptr = h_ring_ptrs_[0]; + out_job.inference_data = h_outputs_; return true; } return false; } -void AIPreDecoderService::release_job(int slot_idx) { - __atomic_store_n(&h_ready_flags_[slot_idx], 0, __ATOMIC_RELEASE); +void AIPreDecoderService::release_job(int /* slot_idx */) { + auto* sys_flags = static_cast(h_ready_flags_); + // PyMatching done: 2 (Processing) -> 0 (Idle) + sys_flags[0].store(0, cuda::std::memory_order_release); } } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/host_dispatcher.cpp b/libs/qec/lib/realtime/host_dispatcher.cpp index c35e2366..12c5c4eb 100644 --- a/libs/qec/lib/realtime/host_dispatcher.cpp +++ b/libs/qec/lib/realtime/host_dispatcher.cpp @@ -7,9 +7,7 @@ ******************************************************************************/ #include "cudaq/qec/realtime/host_dispatcher.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" -#include #include namespace cudaq::qec { @@ -17,80 +15,56 @@ namespace cudaq::qec { void host_dispatcher_loop(const HostDispatcherConfig& config) { size_t current_slot = 0; const size_t num_slots = config.num_slots; - const int num_entries = static_cast(config.entries.size()); + const int num_workers = static_cast(config.workers.size()); uint64_t packets_dispatched = 0; - while (!*config.shutdown_flag) { - uint64_t rx_value = config.rx_flags_host[current_slot]; + while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { + uint64_t rx_value = config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); - if (rx_value == 0) { - QEC_CPU_RELAX(); - continue; - } - - std::atomic_thread_fence(std::memory_order_acquire); - - auto* data_host = reinterpret_cast(rx_value); - auto* header = static_cast(data_host); - - if (header->magic != cudaq::nvqlink::RPC_MAGIC_REQUEST) { - config.rx_flags_host[current_slot] = 0; - current_slot = (current_slot + 1) % num_slots; - continue; - } - - int entry_idx = -1; - for (int i = 0; i < num_entries; ++i) { - if (config.entries[i].function_id == header->function_id) { - entry_idx = i; - break; + if (rx_value != 0) { + uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); + if (mask == 0) { + QEC_CPU_RELAX(); + continue; } - } - - if (entry_idx < 0) { - config.rx_flags_host[current_slot] = 0; - current_slot = (current_slot + 1) % num_slots; - continue; - } - const auto& entry = config.entries[entry_idx]; + int worker_id = __builtin_ffsll(static_cast(mask)) - 1; + config.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); - // Backpressure: check if the predecoder stream is idle - bool stream_busy = (cudaStreamQuery(entry.stream) != cudaSuccess); - if (stream_busy) { - current_slot = (current_slot + 1) % num_slots; - continue; - } + config.inflight_slot_tags[worker_id] = static_cast(current_slot); - // Backpressure: check if the predecoder queue is full - volatile int* h_ready = entry.predecoder->get_host_ready_flags(); - volatile int* h_qidx = entry.predecoder->get_host_queue_idx(); - if (h_ready[*h_qidx] == 1) { - current_slot = (current_slot + 1) % num_slots; - continue; - } + void* data_host = reinterpret_cast(rx_value); + ptrdiff_t offset = static_cast(data_host) - config.rx_data_host; + void* data_dev = static_cast(config.rx_data_dev + offset); - // Translate host pointer to device pointer for the mailbox - ptrdiff_t offset = (uint8_t*)data_host - config.rx_data_host; - void* data_dev = (void*)(config.rx_data_dev + offset); - config.h_mailbox_bank[entry.mailbox_idx] = data_dev; + config.h_mailbox_bank[worker_id] = data_dev; + __sync_synchronize(); - __sync_synchronize(); + cudaError_t err = cudaGraphLaunch(config.workers[worker_id].graph_exec, + config.workers[worker_id].stream); + if (err != cudaSuccess) { + uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; + config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); + config.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } else { + // Mark slot IN_FLIGHT so producer doesn't overwrite while GPU/workers use it + config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); + } - cudaError_t err = cudaGraphLaunch(entry.graph_exec, entry.stream); - if (err != cudaSuccess) { - // Signal error via tx_flags (same protocol as device dispatcher) - size_t slot_idx = ((uint8_t*)data_host - config.rx_data_host) / config.slot_size; - uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; - config.tx_flags_host[slot_idx] = error_val; + config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); + packets_dispatched++; + if (config.live_dispatched) + config.live_dispatched->fetch_add(1, cuda::std::memory_order_relaxed); + current_slot = (current_slot + 1) % num_slots; + } else { + QEC_CPU_RELAX(); } + } - config.rx_flags_host[current_slot] = 0; - packets_dispatched++; - current_slot = (current_slot + 1) % num_slots; + for (const auto& w : config.workers) { + cudaStreamSynchronize(w.stream); } - // Write stats if (config.stats_counter) { *config.stats_counter = packets_dispatched; } diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 2d617b15..485a65a2 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -1,7 +1,7 @@ /****************************************************************-*- C++ -*-**** * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * - * * + * * * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ @@ -11,25 +11,25 @@ * * Supports multiple surface code configurations: * - * d=7 r=7 (model1_d7_r7_unified_Z_batch1.onnx) - * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) - * Output: residual_detectors [1, 336] INT32 (1344 bytes) - * Output: logical_frame [1] INT32 (4 bytes) + * d=7 r=7 (model1_d7_r7_unified_Z_batch1.onnx) + * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) + * Output: residual_detectors [1, 336] INT32 (1344 bytes) + * Output: logical_frame [1] INT32 (4 bytes) * - * d=13 r=13 (model1_d13_r13_unified_Z_batch1.onnx) - * Input: all_measurements [1, 252, 13] INT32 (13104 bytes) - * Output: residual_detectors [1, 2184] INT32 (8736 bytes) - * Output: logical_frame [1] INT32 (4 bytes) + * d=13 r=13 (model1_d13_r13_unified_Z_batch1.onnx) + * Input: all_measurements [1, 252, 13] INT32 (13104 bytes) + * Output: residual_detectors [1, 2184] INT32 (8736 bytes) + * Output: logical_frame [1] INT32 (4 bytes) * - * d=21 r=21 (model1_d21_r21_unified_Z_batch1.onnx) - * Input: all_measurements [1, 660, 21] INT32 (55440 bytes) - * Output: residual_detectors [1, 9240] INT32 (36960 bytes) - * Output: logical_frame [1] INT32 (4 bytes) + * d=21 r=21 (model1_d21_r21_unified_Z_batch1.onnx) + * Input: all_measurements [1, 660, 21] INT32 (55440 bytes) + * Output: residual_detectors [1, 9240] INT32 (36960 bytes) + * Output: logical_frame [1] INT32 (4 bytes) * - * d=31 r=31 (model1_d31_r31_unified_Z_batch1.onnx) - * Input: all_measurements [1, 1440, 31] INT32 (178560 bytes) - * Output: residual_detectors [1, 29760] INT32 (119040 bytes) - * Output: logical_frame [1] INT32 (4 bytes) + * d=31 r=31 (model1_d31_r31_unified_Z_batch1.onnx) + * Input: all_measurements [1, 1440, 31] INT32 (178560 bytes) + * Output: residual_detectors [1, 29760] INT32 (119040 bytes) + * Output: logical_frame [1] INT32 (4 bytes) * * Pipeline: * 1. Ring Buffer setup @@ -41,998 +41,1126 @@ * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [stream [rate_us] [duration_s]] ******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifndef CUDA_VERSION -#define CUDA_VERSION 13000 -#endif -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" - -#include "cudaq/qec/realtime/ai_decoder_service.h" -#include "cudaq/qec/realtime/ai_predecoder_service.h" -#include "cudaq/qec/realtime/host_dispatcher.h" -#include "cudaq/qec/utils/thread_pool.h" -#include "cudaq/qec/utils/pipeline_benchmarks.h" -#include "cudaq/qec/code.h" -#include "cudaq/qec/decoder.h" - -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ - exit(1); \ - } \ - } while(0) - -using namespace cudaq::qec; - -// ============================================================================= -// Pipeline Configuration -// ============================================================================= - -constexpr size_t NUM_SLOTS = 64; - -struct PipelineConfig { - std::string label; - int distance; - int num_rounds; - int meas_qubits; // ONNX input shape[1] - int residual_detectors; // ONNX output dim - std::string onnx_filename; - size_t slot_size; // must fit RPCHeader + input payload - int total_requests; - int num_predecoders; - int queue_depth; - int num_workers; - - int input_elements() const { return meas_qubits * num_rounds; } - size_t input_bytes() const { return input_elements() * sizeof(int32_t); } - - std::string onnx_path() const { - return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; - } - - std::string engine_path() const { - std::string name = onnx_filename; - auto dot = name.rfind('.'); - if (dot != std::string::npos) - name = name.substr(0, dot); - return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; - } - - static PipelineConfig d7_r7() { - return { - "d7_r7_Z", - /*distance=*/7, - /*num_rounds=*/7, - /*meas_qubits=*/72, - /*residual_detectors=*/336, - "model1_d7_r7_unified_Z_batch1.onnx", - /*slot_size=*/4096, - /*total_requests=*/100, - /*num_predecoders=*/4, - /*queue_depth=*/16, - /*num_workers=*/4 - }; - } - - static PipelineConfig d13_r13() { - return { - "d13_r13_Z", - /*distance=*/13, - /*num_rounds=*/13, - /*meas_qubits=*/252, - /*residual_detectors=*/2184, - "model1_d13_r13_unified_Z_batch1.onnx", - /*slot_size=*/16384, - /*total_requests=*/100, - /*num_predecoders=*/4, - /*queue_depth=*/16, - /*num_workers=*/4 - }; - } - - static PipelineConfig d21_r21() { - return { - "d21_r21_Z", - /*distance=*/21, - /*num_rounds=*/21, - /*meas_qubits=*/660, - /*residual_detectors=*/9240, - "model1_d21_r21_unified_X_batch1.onnx", - /*slot_size=*/65536, - /*total_requests=*/100, - /*num_predecoders=*/4, - /*queue_depth=*/16, - /*num_workers=*/4 - }; - } - - static PipelineConfig d31_r31() { - return { - "d31_r31_Z", - /*distance=*/31, - /*num_rounds=*/31, - /*meas_qubits=*/1440, - /*residual_detectors=*/29760, - "model1_d31_r31_unified_Z_batch1.onnx", - /*slot_size=*/262144, - /*total_requests=*/100, - /*num_predecoders=*/4, - /*queue_depth=*/16, - /*num_workers=*/4 - }; - } -}; - -// Runtime decoder state populated during setup -struct DecoderContext { - std::vector> decoders; - std::atomic next_decoder_idx{0}; - int z_stabilizers = 0; - int spatial_slices = 0; - - cudaq::qec::decoder* acquire_decoder() { - thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); - return decoders[my_idx % decoders.size()].get(); - } - - // Per-worker timing accumulators (lock-free) - std::atomic total_decode_us{0}; - std::atomic total_worker_us{0}; - std::atomic decode_count{0}; -}; - -constexpr std::uint32_t fnv1a_hash(std::string_view str) { - std::uint32_t hash = 0x811c9dc5; - for (char c : str) { hash ^= static_cast(c); hash *= 0x01000193; } - return hash; -} - -struct SystemContext { - volatile uint64_t* tx_flags_host = nullptr; - uint8_t* rx_data_host = nullptr; - size_t slot_size = 0; -}; -SystemContext g_sys_ctx; - -// ============================================================================= -// Thread Pool Worker (Real PyMatching MWPM Decoder) -// ============================================================================= - -struct __attribute__((packed)) DecodeResponse { - int32_t total_corrections; - int32_t converged; -}; - -void pymatching_worker_task(PreDecoderJob job, AIPreDecoderService* predecoder, - DecoderContext* ctx) { - using hrclock = std::chrono::high_resolution_clock; - auto worker_start = hrclock::now(); - - const int32_t* residual = static_cast(job.inference_data); - auto* my_decoder = ctx->acquire_decoder(); - - int total_corrections = 0; - bool all_converged = true; - - auto decode_start = hrclock::now(); - for (int s = 0; s < ctx->spatial_slices; ++s) { - const int32_t* slice = residual + s * ctx->z_stabilizers; - std::vector syndrome(ctx->z_stabilizers); - for (int i = 0; i < ctx->z_stabilizers; ++i) - syndrome[i] = static_cast(slice[i]); - - auto result = my_decoder->decode(syndrome); - - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) total_corrections++; - } - auto decode_end = hrclock::now(); - - DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; - - char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); - std::memcpy(response_payload, &resp_data, sizeof(resp_data)); - - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp_data); - - std::atomic_thread_fence(std::memory_order_release); - - auto worker_end = hrclock::now(); - auto decode_us = std::chrono::duration_cast( - decode_end - decode_start).count(); - auto worker_us = std::chrono::duration_cast( - worker_end - worker_start).count(); - ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); - ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); - ctx->decode_count.fetch_add(1, std::memory_order_relaxed); - - size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; - predecoder->release_job(job.slot_idx); - - uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); - g_sys_ctx.tx_flags_host[slot_idx] = rx_value; -} - -// ============================================================================= -// Incoming Polling Thread -// ============================================================================= -void incoming_polling_loop( - std::vector>& predecoders, - cudaq::qec::utils::ThreadPool& thread_pool, - DecoderContext* ctx, - std::atomic& stop_signal) -{ - PreDecoderJob job; - while (!stop_signal.load(std::memory_order_relaxed)) { - bool found_work = false; - for (auto& predecoder : predecoders) { - if (predecoder->poll_next_job(job)) { - AIPreDecoderService* pd_ptr = predecoder.get(); - thread_pool.enqueue([job, pd_ptr, ctx]() { - pymatching_worker_task(job, pd_ptr, ctx); - }); - found_work = true; - } - } - if (!found_work) { - QEC_CPU_RELAX(); - } - } -} - -// ============================================================================= -// Generate Realistic Syndrome Data -// ============================================================================= -void fill_measurement_payload(int32_t* payload, int input_elements, - std::mt19937& rng, double error_rate = 0.01) { - std::bernoulli_distribution err_dist(error_rate); - for (int i = 0; i < input_elements; ++i) { - payload[i] = err_dist(rng) ? 1 : 0; - } -} - -// ============================================================================= -// Streaming Test Mode (simulates FPGA continuous syndrome arrival) -// ============================================================================= - -struct StreamingConfig { - int rate_us = 0; // inter-arrival time in us (0 = open-loop) - int duration_s = 5; // how long to run - int warmup_count = 20; // discard first N from latency stats -}; - -void run_streaming_test( - const PipelineConfig& config, - const StreamingConfig& scfg, - volatile uint64_t* rx_flags_host, - volatile uint64_t* tx_flags_host, - uint8_t* rx_data_host, - uint8_t* rx_data_dev, - DecoderContext& decoder_ctx, - std::vector>& predecoders, - cudaq::qec::utils::ThreadPool& pymatching_pool, - std::atomic& system_stop, - void** h_mailbox_bank, - std::vector& predecoder_streams) -{ - using hrclock = std::chrono::high_resolution_clock; - - const int max_requests = 500000; - const size_t payload_bytes = config.input_bytes(); - - std::vector submit_ts(max_requests); - std::vector complete_ts(max_requests); - std::vector completed(max_requests, false); - - std::vector slot_request(NUM_SLOTS, -1); - - std::atomic total_submitted{0}; - std::atomic total_completed{0}; - std::atomic backpressure_stalls{0}; - std::atomic producer_done{false}; - - // Set up host dispatcher - volatile int dispatcher_shutdown = 0; - uint64_t dispatcher_stats = 0; - - HostDispatcherConfig disp_cfg; - disp_cfg.rx_flags_host = rx_flags_host; - disp_cfg.tx_flags_host = tx_flags_host; - disp_cfg.rx_data_host = rx_data_host; - disp_cfg.rx_data_dev = rx_data_dev; - disp_cfg.h_mailbox_bank = h_mailbox_bank; - disp_cfg.num_slots = NUM_SLOTS; - disp_cfg.slot_size = config.slot_size; - disp_cfg.shutdown_flag = &dispatcher_shutdown; - disp_cfg.stats_counter = &dispatcher_stats; - - for (int i = 0; i < config.num_predecoders; ++i) { - std::string func_name = "predecode_target_" + std::to_string(i); - HostDispatchEntry entry; - entry.function_id = fnv1a_hash(func_name); - entry.graph_exec = predecoders[i]->get_executable_graph(); - entry.mailbox_idx = i; - entry.predecoder = predecoders[i].get(); - entry.stream = predecoder_streams[i]; - disp_cfg.entries.push_back(entry); - } - - std::thread dispatcher_thread([&disp_cfg]() { - host_dispatcher_loop(disp_cfg); - }); - - auto run_deadline = std::chrono::steady_clock::now() - + std::chrono::seconds(scfg.duration_s); - - std::string rate_label = (scfg.rate_us > 0) - ? std::to_string(scfg.rate_us) + " us" - : "open-loop"; - + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + + #ifndef CUDA_VERSION + #define CUDA_VERSION 13000 + #endif + #include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" + #include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" + + #include "cudaq/qec/realtime/ai_decoder_service.h" + #include "cudaq/qec/realtime/ai_predecoder_service.h" + #include "cudaq/qec/realtime/host_dispatcher.h" + #include "cudaq/qec/utils/thread_pool.h" + #include + #include "cudaq/qec/utils/pipeline_benchmarks.h" + #include "cudaq/qec/code.h" + #include "cudaq/qec/decoder.h" + + #define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ + exit(1); \ + } \ + } while(0) + + using namespace cudaq::qec; + + // ============================================================================= + // Pipeline Configuration + // ============================================================================= + + constexpr size_t NUM_SLOTS = 64; + + struct PipelineConfig { + std::string label; + int distance; + int num_rounds; + int meas_qubits; // ONNX input shape[1] + int residual_detectors; // ONNX output dim + std::string onnx_filename; + size_t slot_size; // must fit RPCHeader + input payload + int total_requests; + int num_predecoders; + int queue_depth; + int num_workers; + + int input_elements() const { return meas_qubits * num_rounds; } + size_t input_bytes() const { return input_elements() * sizeof(int32_t); } + + std::string onnx_path() const { + return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; + } + + std::string engine_path() const { + std::string name = onnx_filename; + auto dot = name.rfind('.'); + if (dot != std::string::npos) + name = name.substr(0, dot); + return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; + } + + static PipelineConfig d7_r7() { + return { + "d7_r7_Z", + /*distance=*/7, + /*num_rounds=*/7, + /*meas_qubits=*/72, + /*residual_detectors=*/336, + "model1_d7_r7_unified_Z_batch1.onnx", + /*slot_size=*/4096, + /*total_requests=*/100, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d13_r13() { + return { + "d13_r13_Z", + /*distance=*/13, + /*num_rounds=*/13, + /*meas_qubits=*/252, + /*residual_detectors=*/2184, + "model1_d13_r13_unified_Z_batch1.onnx", + /*slot_size=*/16384, + /*total_requests=*/100, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d21_r21() { + return { + "d21_r21_Z", + /*distance=*/21, + /*num_rounds=*/21, + /*meas_qubits=*/660, + /*residual_detectors=*/9240, + "model1_d21_r21_unified_X_batch1.onnx", + /*slot_size=*/65536, + /*total_requests=*/100, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + + static PipelineConfig d31_r31() { + return { + "d31_r31_Z", + /*distance=*/31, + /*num_rounds=*/31, + /*meas_qubits=*/1440, + /*residual_detectors=*/29760, + "model1_d31_r31_unified_Z_batch1.onnx", + /*slot_size=*/262144, + /*total_requests=*/100, + /*num_predecoders=*/4, + /*queue_depth=*/16, + /*num_workers=*/4 + }; + } + }; + + // Runtime decoder state populated during setup + struct DecoderContext { + std::vector> decoders; + std::atomic next_decoder_idx{0}; + int z_stabilizers = 0; + int spatial_slices = 0; + + cudaq::qec::decoder* acquire_decoder() { + thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); + return decoders[my_idx % decoders.size()].get(); + } + + // Per-worker timing accumulators (lock-free) + std::atomic total_decode_us{0}; + std::atomic total_worker_us{0}; + std::atomic decode_count{0}; + }; + + constexpr std::uint32_t fnv1a_hash(std::string_view str) { + std::uint32_t hash = 0x811c9dc5; + for (char c : str) { hash ^= static_cast(c); hash *= 0x01000193; } + return hash; + } + + struct SystemContext { + cudaq::qec::atomic_uint64_sys* tx_flags_host = nullptr; + uint8_t* rx_data_host = nullptr; + size_t slot_size = 0; + }; + SystemContext g_sys_ctx; + + /// Context for dynamic worker pool: worker task writes tx_flags[origin_slot] and frees idle_mask. + struct WorkerPoolContext { + cudaq::qec::atomic_uint64_sys* tx_flags = nullptr; + cudaq::qec::atomic_uint64_sys* idle_mask = nullptr; + int* inflight_slot_tags = nullptr; + }; + + // ============================================================================= + // Thread Pool Worker (Real PyMatching MWPM Decoder) + // ============================================================================= + + struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; + }; + + void pymatching_worker_task(PreDecoderJob job, int worker_id, + AIPreDecoderService* predecoder, + DecoderContext* ctx, + WorkerPoolContext* pool_ctx) { + using hrclock = std::chrono::high_resolution_clock; + auto worker_start = hrclock::now(); + + const int32_t* residual = static_cast(job.inference_data); + auto* my_decoder = ctx->acquire_decoder(); + + int total_corrections = 0; + bool all_converged = true; + + auto decode_start = hrclock::now(); + for (int s = 0; s < ctx->spatial_slices; ++s) { + const int32_t* slice = residual + s * ctx->z_stabilizers; + std::vector syndrome(ctx->z_stabilizers); + for (int i = 0; i < ctx->z_stabilizers; ++i) + syndrome[i] = static_cast(slice[i]); + + auto result = my_decoder->decode(syndrome); + + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) total_corrections++; + } + auto decode_end = hrclock::now(); + + DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; + + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); + std::memcpy(response_payload, &resp_data, sizeof(resp_data)); + + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp_data); + + uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); + int origin_slot = job.origin_slot; + + if (pool_ctx && pool_ctx->tx_flags) { + pool_ctx->tx_flags[origin_slot].store(rx_value, cuda::std::memory_order_release); + } else { + size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; + g_sys_ctx.tx_flags_host[slot_idx].store(rx_value, cuda::std::memory_order_release); + } + + predecoder->release_job(job.slot_idx); + + if (pool_ctx && pool_ctx->idle_mask) { + pool_ctx->idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } + + auto worker_end = hrclock::now(); + auto decode_us = std::chrono::duration_cast( + decode_end - decode_start).count(); + auto worker_us = std::chrono::duration_cast( + worker_end - worker_start).count(); + ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); + ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); + ctx->decode_count.fetch_add(1, std::memory_order_relaxed); + } + + // ============================================================================= + // Incoming Polling Thread + // ============================================================================= + void incoming_polling_loop( + std::vector>& predecoders, + cudaq::qec::utils::ThreadPool& thread_pool, + DecoderContext* ctx, + std::atomic& stop_signal, + WorkerPoolContext* pool_ctx = nullptr, + std::atomic* total_claimed = nullptr) + { + PreDecoderJob job; + int num_workers = static_cast(predecoders.size()); + while (!stop_signal.load(std::memory_order_relaxed)) { + bool found_work = false; + for (int i = 0; i < num_workers; ++i) { + if (predecoders[i]->poll_next_job(job)) { + if (pool_ctx && pool_ctx->inflight_slot_tags) { + job.origin_slot = pool_ctx->inflight_slot_tags[i]; + } else { + job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); + } + if (total_claimed) total_claimed->fetch_add(1, std::memory_order_relaxed); + AIPreDecoderService* pd_ptr = predecoders[i].get(); + int worker_id = i; + WorkerPoolContext* pctx = pool_ctx; + thread_pool.enqueue([job, worker_id, pd_ptr, ctx, pctx]() { + pymatching_worker_task(job, worker_id, pd_ptr, ctx, pctx); + }); + found_work = true; + } + } + if (!found_work) { + QEC_CPU_RELAX(); + } + } + } + + // ============================================================================= + // Generate Realistic Syndrome Data + // ============================================================================= + void fill_measurement_payload(int32_t* payload, int input_elements, + std::mt19937& rng, double error_rate = 0.01) { + std::bernoulli_distribution err_dist(error_rate); + for (int i = 0; i < input_elements; ++i) { + payload[i] = err_dist(rng) ? 1 : 0; + } + } + + // ============================================================================= + // Streaming Test Mode (simulates FPGA continuous syndrome arrival) + // ============================================================================= + + struct StreamingConfig { + int rate_us = 0; // inter-arrival time in us (0 = open-loop) + int duration_s = 5; // how long to run + int warmup_count = 20; // discard first N from latency stats + }; + + void run_streaming_test( + const PipelineConfig& config, + const StreamingConfig& scfg, + uint8_t* rx_data_host, + uint8_t* rx_data_dev, + cudaq::qec::atomic_uint64_sys* rx_flags, + cudaq::qec::atomic_uint64_sys* tx_flags, + DecoderContext& decoder_ctx, + std::vector>& predecoders, + cudaq::qec::utils::ThreadPool& pymatching_pool, + std::atomic& system_stop, + void** h_mailbox_bank, + std::vector& predecoder_streams, + WorkerPoolContext* pool_ctx, + std::atomic* total_claimed = nullptr) + { + using hrclock = std::chrono::high_resolution_clock; + using atomic_uint64_sys = cudaq::qec::atomic_uint64_sys; + using atomic_int_sys = cudaq::qec::atomic_int_sys; + + const int num_workers = config.num_predecoders; + const int max_requests = 500000; + const size_t payload_bytes = config.input_bytes(); + + std::vector submit_ts(max_requests); + std::vector complete_ts(max_requests); + std::vector completed(max_requests, false); + + std::vector slot_request(NUM_SLOTS, -1); + + std::atomic total_submitted{0}; + std::atomic total_completed{0}; + std::atomic backpressure_stalls{0}; + std::atomic producer_done{false}; + std::atomic consumer_stop{false}; + + atomic_int_sys shutdown_flag(0); + uint64_t dispatcher_stats = 0; + atomic_uint64_sys live_dispatched(0); + + HostDispatcherConfig disp_cfg; + disp_cfg.rx_flags = rx_flags; + disp_cfg.tx_flags = tx_flags; + disp_cfg.rx_data_host = rx_data_host; + disp_cfg.rx_data_dev = rx_data_dev; + disp_cfg.h_mailbox_bank = h_mailbox_bank; + disp_cfg.num_slots = NUM_SLOTS; + disp_cfg.slot_size = config.slot_size; + disp_cfg.shutdown_flag = &shutdown_flag; + disp_cfg.stats_counter = &dispatcher_stats; + disp_cfg.live_dispatched = &live_dispatched; + disp_cfg.idle_mask = pool_ctx->idle_mask; + disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; + disp_cfg.workers.resize(num_workers); + for (int i = 0; i < num_workers; ++i) { + disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); + disp_cfg.workers[i].stream = predecoder_streams[i]; + } + + std::thread dispatcher_thread([&disp_cfg]() { + host_dispatcher_loop(disp_cfg); + }); + + auto run_deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(scfg.duration_s); + + std::string rate_label = (scfg.rate_us > 0) + ? std::to_string(scfg.rate_us) + " us" + : "open-loop"; + std::cout << "\n[Stream] Starting streaming test (" << config.label << ", HOST dispatcher)\n" << " Rate: " << rate_label << "\n" << " Duration: " << scfg.duration_s << " s\n" << " Warmup: " << scfg.warmup_count << " requests\n" << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" - << " Max reqs: " << max_requests << "\n\n"; - - // --- Producer thread (simulates FPGA) --- - std::thread producer([&]() { - std::mt19937 rng(42); - int next_slot = 0; - int req_id = 0; - - while (std::chrono::steady_clock::now() < run_deadline - && req_id < max_requests) { - - int slot = next_slot % (int)NUM_SLOTS; - - // Wait for slot to be fully free (dispatcher consumed + response harvested) - while (rx_flags_host[slot] != 0 || tx_flags_host[slot] != 0) { - backpressure_stalls.fetch_add(1, std::memory_order_relaxed); - QEC_CPU_RELAX(); - if (std::chrono::steady_clock::now() >= run_deadline) return; + << " Max reqs: " << max_requests << "\n\n" + << std::flush; + + // Progress reporter (debug only; set to true to print submitted/completed every second) + constexpr bool kEnableProgressReporter = false; + std::atomic progress_done{false}; + std::thread progress_reporter; + if (kEnableProgressReporter) { + progress_reporter = std::thread([&]() { + while (true) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + if (progress_done.load(std::memory_order_acquire)) break; + bool pdone = producer_done.load(std::memory_order_acquire); + int nsub = total_submitted.load(std::memory_order_acquire); + int ncomp = total_completed.load(std::memory_order_acquire); + uint64_t disp = live_dispatched.load(cuda::std::memory_order_relaxed); + uint64_t claimed = total_claimed ? total_claimed->load(std::memory_order_relaxed) : 0; + uint64_t mask = pool_ctx->idle_mask ? pool_ctx->idle_mask->load(cuda::std::memory_order_relaxed) : 0; + std::cout << " [progress] submitted=" << nsub << " completed=" << ncomp + << " dispatched=" << disp << " claimed=" << claimed + << " idle_mask=0x" << std::hex << mask << std::dec << std::endl; + if (pdone && ncomp >= nsub) break; } + }); + } - int target = req_id % config.num_predecoders; - std::string func = "predecode_target_" + std::to_string(target); - - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - auto* hdr = reinterpret_cast(slot_data); - hdr->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - hdr->function_id = fnv1a_hash(func); - hdr->arg_len = static_cast(payload_bytes); - - int32_t* payload = reinterpret_cast( - slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - - slot_request[slot] = req_id; - - __sync_synchronize(); - submit_ts[req_id] = hrclock::now(); - rx_flags_host[slot] = reinterpret_cast(slot_data); - total_submitted.fetch_add(1, std::memory_order_release); - - next_slot++; - req_id++; - - if (scfg.rate_us > 0) { - auto target_time = submit_ts[req_id - 1] - + std::chrono::microseconds(scfg.rate_us); - while (hrclock::now() < target_time) - QEC_CPU_RELAX(); - } - } - - producer_done.store(true, std::memory_order_release); - }); - - // --- Consumer thread (harvests completions sequentially) --- - std::thread consumer([&]() { - int next_harvest = 0; - - while (true) { - bool pdone = producer_done.load(std::memory_order_acquire); - int nsub = total_submitted.load(std::memory_order_acquire); - int ncomp = total_completed.load(std::memory_order_relaxed); - - if (pdone && ncomp >= nsub) - break; - - if (next_harvest >= nsub) { - QEC_CPU_RELAX(); - continue; - } + // --- Producer thread (simulates FPGA) --- + std::thread producer([&]() { + std::mt19937 rng(42); + int next_slot = 0; + int req_id = 0; + + while (std::chrono::steady_clock::now() < run_deadline + && req_id < max_requests) { + + int slot = next_slot % (int)NUM_SLOTS; + // Wait for both flags to be completely clear (0). Dispatcher marks in-flight + // with tx_flags=0xEEEE... so we don't overwrite while GPU/workers are using the slot. + while (rx_flags[slot].load(cuda::std::memory_order_acquire) != 0 + || tx_flags[slot].load(cuda::std::memory_order_acquire) != 0) { + backpressure_stalls.fetch_add(1, std::memory_order_relaxed); + QEC_CPU_RELAX(); + if (std::chrono::steady_clock::now() >= run_deadline) return; + } + + int target = req_id % config.num_predecoders; + std::string func = "predecode_target_" + std::to_string(target); + + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); + auto* hdr = reinterpret_cast(slot_data); + hdr->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + hdr->function_id = fnv1a_hash(func); + hdr->arg_len = static_cast(payload_bytes); + + int32_t* payload = reinterpret_cast( + slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, config.input_elements(), rng, 0.01); + + slot_request[slot] = req_id; + + submit_ts[req_id] = hrclock::now(); + rx_flags[slot].store(reinterpret_cast(slot_data), cuda::std::memory_order_release); + total_submitted.fetch_add(1, std::memory_order_release); + + next_slot++; + req_id++; + + if (scfg.rate_us > 0) { + auto target_time = submit_ts[req_id - 1] + + std::chrono::microseconds(scfg.rate_us); + while (hrclock::now() < target_time) + QEC_CPU_RELAX(); + } + } + + producer_done.store(true, std::memory_order_seq_cst); + }); + + // --- Consumer thread (harvests completions sequentially) --- + std::thread consumer([&]() { + int next_harvest = 0; + + while (true) { + if (consumer_stop.load(std::memory_order_acquire)) + break; + bool pdone = producer_done.load(std::memory_order_acquire); + int nsub = total_submitted.load(std::memory_order_acquire); + int ncomp = total_completed.load(std::memory_order_relaxed); + + if (pdone && ncomp >= nsub) + break; + + if (next_harvest >= nsub) { + QEC_CPU_RELAX(); + continue; + } + int slot = next_harvest % (int)NUM_SLOTS; - uint64_t tv = tx_flags_host[slot]; - - if (tv != 0) { - int rid = slot_request[slot]; - if (rid >= 0 && (tv >> 48) != 0xDEAD) { - complete_ts[rid] = hrclock::now(); - completed[rid] = true; - total_completed.fetch_add(1, std::memory_order_relaxed); - } else if ((tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot - << " cudaGraphLaunch error " << cuda_err - << " (" << cudaGetErrorString((cudaError_t)cuda_err) - << ")\n"; - total_completed.fetch_add(1, std::memory_order_relaxed); + uint64_t tv = tx_flags[slot].load(cuda::std::memory_order_acquire); + + // Ignore IN_FLIGHT tag (dispatcher marks slot busy until worker writes response) + if (tv != 0 && tv != 0xEEEEEEEEEEEEEEEEULL) { + int rid = slot_request[slot]; + if (rid >= 0 && (tv >> 48) != 0xDEAD) { + complete_ts[rid] = hrclock::now(); + completed[rid] = true; + total_completed.fetch_add(1, std::memory_order_relaxed); + } else if ((tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot + << " cudaGraphLaunch error " << cuda_err + << " (" << cudaGetErrorString((cudaError_t)cuda_err) + << ")\n"; + total_completed.fetch_add(1, std::memory_order_relaxed); + } + + tx_flags[slot].store(0, cuda::std::memory_order_release); + slot_request[slot] = -1; + next_harvest++; + } else { + QEC_CPU_RELAX(); + } + } + }); + + // --- DIAGNOSTIC WATCHDOG THREAD (debug only; set true to diagnose stalls) --- + constexpr bool kEnableWatchdog = false; + std::thread watchdog; + if (kEnableWatchdog) { + watchdog = std::thread([&]() { + while (!producer_done.load(std::memory_order_seq_cst)) { + std::this_thread::sleep_for(std::chrono::seconds(2)); + if (producer_done.load(std::memory_order_seq_cst)) break; + + int nsub = total_submitted.load(std::memory_order_acquire); + int ncomp = total_completed.load(std::memory_order_relaxed); + + // Only print if the pipeline seems stalled (no progress in 2 seconds) + static int last_comp = -1; + if (ncomp == last_comp && nsub > ncomp) { + std::cout << "\n[WATCHDOG] PIPELINE STALL DETECTED!\n"; + std::cout << " Submitted: " << nsub << " | Completed: " << ncomp << "\n"; + + uint64_t mask = pool_ctx->idle_mask ? pool_ctx->idle_mask->load(cuda::std::memory_order_acquire) : 0; + std::cout << " Idle Mask: 0x" << std::hex << mask << std::dec << " (0 means all workers busy)\n"; + + std::cout << " Predecoder Ready Flags (GPU -> CPU):\n"; + for (int i = 0; i < config.num_predecoders; ++i) { + auto* sys_flags = predecoders[i]->get_host_ready_flags(); + int ready = sys_flags ? sys_flags[0].load(cuda::std::memory_order_acquire) : -1; + std::cout << " Worker " << i << ": " << ready << " (0=Idle, 1=GPU Done, 2=CPU Working)\n"; + } + + std::cout << " Ring Buffer (Window around stall):\n"; + int start_slot = std::max(0, (ncomp % (int)NUM_SLOTS) - 2); + int end_slot = std::min((int)NUM_SLOTS, start_slot + 8); + for (int i = start_slot; i < end_slot; ++i) { + uint64_t rx = rx_flags[i].load(cuda::std::memory_order_acquire); + uint64_t tx = tx_flags[i].load(cuda::std::memory_order_acquire); + std::cout << " Slot " << i << " | RX: " << (rx ? "HAS_DATA" : "0") + << " | TX: "; + if (tx == 0) std::cout << "0\n"; + else if (tx == 0xEEEEEEEEEEEEEEEEULL) std::cout << "IN_FLIGHT (0xEEEE...)\n"; + else if ((tx >> 48) == 0xDEAD) std::cout << "ERROR (0xDEAD...)\n"; + else std::cout << "RESPONSE_READY\n"; + } + std::cout << "--------------------------------------------------\n"; } - - tx_flags_host[slot] = 0; - slot_request[slot] = -1; - next_harvest++; - } else { - QEC_CPU_RELAX(); + last_comp = ncomp; } - } - }); - - producer.join(); - - // Grace period for in-flight requests - auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); - while (total_completed.load() < total_submitted.load() - && std::chrono::steady_clock::now() < grace_deadline) { - usleep(1000); - } - - // Shut down the host dispatcher thread - dispatcher_shutdown = 1; - __sync_synchronize(); - dispatcher_thread.join(); - - consumer.join(); - - // ===== Report ===== - auto run_end = std::chrono::steady_clock::now(); - int nsub = total_submitted.load(); - int ncomp = total_completed.load(); - - // Build PipelineBenchmark from timestamps (skip warmup) - int warmup = std::min(scfg.warmup_count, nsub); - int bench_count = nsub - warmup; - - cudaq::qec::utils::PipelineBenchmark bench( - config.label + " (stream)", bench_count); - bench.start(); - - for (int i = warmup; i < nsub; ++i) { - int bench_id = i - warmup; - bench.mark_submit(bench_id); - // Override the internal submit timestamp with the real one - } - - // We can't override PipelineBenchmark's internal timestamps, so compute - // stats manually for the steady-state window. - std::vector latencies; - latencies.reserve(bench_count); - for (int i = warmup; i < nsub; ++i) { - if (!completed[i]) continue; - auto dt = std::chrono::duration_cast>( - complete_ts[i] - submit_ts[i]); - latencies.push_back(dt.count()); - } - - bench.stop(); - - std::sort(latencies.begin(), latencies.end()); - - auto pct = [&](double p) -> double { - if (latencies.empty()) return 0; - double idx = (p / 100.0) * (latencies.size() - 1); - size_t lo = (size_t)idx; - size_t hi = std::min(lo + 1, latencies.size() - 1); - double frac = idx - lo; - return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; - }; - - double mean = 0; - for (auto v : latencies) mean += v; - mean = latencies.empty() ? 0 : mean / latencies.size(); - - double stddev = 0; - for (auto v : latencies) stddev += (v - mean) * (v - mean); - stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); - - auto wall_us = std::chrono::duration_cast>( - run_end - (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); - double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; - - double actual_rate = (nsub > 1) - ? std::chrono::duration_cast>( - submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) - : 0; - - std::cout << std::fixed; - std::cout << "\n================================================================\n"; - std::cout << " Streaming Benchmark: " << config.label << "\n"; - std::cout << "================================================================\n"; - std::cout << " Submitted: " << nsub << "\n"; - std::cout << " Completed: " << ncomp << "\n"; - if (nsub > ncomp) - std::cout << " Dropped/timeout: " << (nsub - ncomp) << "\n"; - std::cout << std::setprecision(1); - std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; - std::cout << " Throughput: " << throughput << " req/s\n"; - std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; - std::cout << " Backpressure stalls:" << std::setw(8) - << backpressure_stalls.load() << "\n"; - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Latency (us) [steady-state, " << latencies.size() - << " requests after " << warmup << " warmup]\n"; - std::cout << std::setprecision(1); - if (!latencies.empty()) { - std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; - std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; - std::cout << " mean = " << std::setw(10) << mean << "\n"; - std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; - std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; - std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; - std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; - std::cout << " stddev = " << std::setw(10) << stddev << "\n"; - } - std::cout << " ---------------------------------------------------------------\n"; - - // Worker timing breakdown - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - double avg_pipeline = mean - avg_worker; - - std::cout << std::setprecision(1); - std::cout << " Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; - std::cout << " PyMatching decode:" << std::setw(10) << avg_decode - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_decode / mean : 0) - << "%)\n"; - std::cout << " Worker overhead: " << std::setw(10) << avg_overhead - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_overhead / mean : 0) - << "%)\n"; - std::cout << " GPU+dispatch+poll:" << std::setw(10) << avg_pipeline - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_pipeline / mean : 0) - << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(10) << mean << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(10) << (mean / config.num_rounds) << " us/round\n"; - } - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; - std::cout << "================================================================\n"; -} - -// ============================================================================= -// Main -// ============================================================================= -int main(int argc, char* argv[]) { - // Parse arguments: [stream [rate_us] [duration_s]] - std::string config_name = "d7"; - bool streaming_mode = false; - StreamingConfig stream_cfg; - - if (argc > 1) - config_name = argv[1]; - - int stream_positional = 0; // tracks positional args after "stream" - for (int a = 2; a < argc; ++a) { - std::string arg = argv[a]; - if (arg == "stream") { - streaming_mode = true; - } else if (streaming_mode && stream_positional == 0 && std::isdigit(arg[0])) { - stream_cfg.rate_us = std::stoi(arg); - stream_positional++; - } else if (streaming_mode && stream_positional == 1 && std::isdigit(arg[0])) { - stream_cfg.duration_s = std::stoi(arg); - stream_positional++; - } - } - - PipelineConfig config; - if (config_name == "d7") { - config = PipelineConfig::d7_r7(); - } else if (config_name == "d13") { - config = PipelineConfig::d13_r13(); - } else if (config_name == "d21") { - config = PipelineConfig::d21_r21(); - } else if (config_name == "d31") { - config = PipelineConfig::d31_r31(); - } else { - std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [stream [rate_us] [duration_s]]\n" - << " d7 - distance 7, 7 rounds (default)\n" - << " d13 - distance 13, 13 rounds\n" - << " d21 - distance 21, 21 rounds\n" - << " d31 - distance 31, 31 rounds\n" - << "\n" - << " stream - continuous FPGA-like submission (default: batch mode)\n" - << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" - << " duration_s - test duration in seconds (default: 5)\n" - << "\n" - << "Examples:\n" - << " " << argv[0] << " d13 # batch mode\n" - << " " << argv[0] << " d13 stream # streaming, open-loop\n" - << " " << argv[0] << " d13 stream 50 # streaming, 50 us between requests\n" - << " " << argv[0] << " d13 stream 50 10 # streaming, 50 us rate, 10s duration\n"; - return 1; - } - - std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" - << config.label << ") ---\n"; - std::cout << "[Config] distance=" << config.distance - << " rounds=" << config.num_rounds - << " meas_qubits=" << config.meas_qubits - << " residual_detectors=" << config.residual_detectors - << " input_bytes=" << config.input_bytes() - << " slot_size=" << config.slot_size << "\n"; - - CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - - std::string engine_file = config.engine_path(); - std::string onnx_file = config.onnx_path(); - std::string model_path; - - // Prefer cached .engine file; fall back to ONNX build + save - std::ifstream engine_probe(engine_file, std::ios::binary); - if (engine_probe.good()) { - engine_probe.close(); - model_path = engine_file; - std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; - } else { - model_path = onnx_file; - std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; - std::cout << "[Setup] Engine will be cached to: " << engine_file << "\n"; - } - - // Create PyMatching decoder from surface code Z parity check matrix - std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance - << " surface code, Z stabilizers)...\n"; - auto surface_code = cudaq::qec::get_code("surface_code", - {{"distance", config.distance}}); - auto H_z = surface_code->get_parity_z(); - - DecoderContext decoder_ctx; - decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); - decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; - std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " - << H_z.shape()[1] << "]" - << " z_stabilizers=" << decoder_ctx.z_stabilizers - << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; - - cudaqx::heterogeneous_map pm_params; - pm_params.insert("merge_strategy", std::string("smallest_weight")); - std::cout << "[Setup] Pre-allocating " << config.num_workers - << " PyMatching decoders (one per worker)...\n"; - for (int i = 0; i < config.num_workers; ++i) - decoder_ctx.decoders.push_back( - cudaq::qec::decoder::get("pymatching", H_z, pm_params)); - std::cout << "[Setup] PyMatching decoder pool ready.\n"; - - // Allocate Ring Buffers - void* tmp = nullptr; - volatile uint64_t *rx_flags_host, *tx_flags_host; - volatile uint64_t *rx_flags_dev, *tx_flags_dev; - uint8_t *rx_data_host, *rx_data_dev; - - CUDA_CHECK(cudaHostAlloc(&tmp, NUM_SLOTS * sizeof(uint64_t), cudaHostAllocMapped)); - rx_flags_host = static_cast(tmp); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_flags_dev, tmp, 0)); - - CUDA_CHECK(cudaHostAlloc(&tmp, NUM_SLOTS * sizeof(uint64_t), cudaHostAllocMapped)); - tx_flags_host = static_cast(tmp); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&tx_flags_dev, tmp, 0)); - - CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * config.slot_size, cudaHostAllocMapped)); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_data_dev, rx_data_host, 0)); - - std::memset((void*)rx_flags_host, 0, NUM_SLOTS * sizeof(uint64_t)); - std::memset((void*)tx_flags_host, 0, NUM_SLOTS * sizeof(uint64_t)); - - g_sys_ctx.tx_flags_host = tx_flags_host; - g_sys_ctx.rx_data_host = rx_data_host; - g_sys_ctx.slot_size = config.slot_size; - - // ========================================================================= - // Mailbox & Dispatcher Setup (mode-dependent) - // ========================================================================= - - // Mapped pinned mailbox (used by both modes -- host writes, GPU reads) - void** h_mailbox_bank = nullptr; - void** d_mailbox_bank = nullptr; - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); - - // Device memory mailbox (for device-side dispatcher backward compat) - void** d_global_mailbox_bank = nullptr; - - int* shutdown_flag_host = nullptr; - int* d_shutdown_flag = nullptr; - uint64_t* d_stats = nullptr; - cudaq_function_entry_t* d_function_entries = nullptr; - cudaq_dispatch_graph_context* dispatch_ctx = nullptr; - - // Per-predecoder streams (for host dispatcher) - std::vector predecoder_streams; - - const bool use_host_dispatcher = streaming_mode; - bool device_launch = !use_host_dispatcher; - - if (!use_host_dispatcher) { - CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); - CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); - - CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); - *shutdown_flag_host = 0; - CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); - - CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - } else { - for (int i = 0; i < config.num_predecoders; ++i) { - cudaStream_t s; - CUDA_CHECK(cudaStreamCreate(&s)); - predecoder_streams.push_back(s); - } + }); } - // Initialize AIPreDecoder Instances from ONNX - std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs (" - << (device_launch ? "device-launch" : "host-launch") << ")...\n"; - cudaStream_t capture_stream; - CUDA_CHECK(cudaStreamCreate(&capture_stream)); - - std::vector> predecoders; - std::vector function_entries(config.num_predecoders); - + std::cout << " [shutdown] joining producer...\n" << std::flush; + producer.join(); + if (kEnableWatchdog) { + std::cout << " [shutdown] joining watchdog...\n" << std::flush; + watchdog.join(); + } + + // Grace period for in-flight requests + auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (total_completed.load() < total_submitted.load() + && std::chrono::steady_clock::now() < grace_deadline) { + usleep(1000); + } + consumer_stop.store(true, std::memory_order_release); + + shutdown_flag.store(1, cuda::std::memory_order_release); + std::cout << " [shutdown] joining dispatcher...\n" << std::flush; + dispatcher_thread.join(); + std::cout << " [shutdown] joining consumer...\n" << std::flush; + consumer.join(); + + if (kEnableProgressReporter) { + progress_done.store(true, std::memory_order_release); + progress_reporter.join(); + } + + // ===== Report ===== + auto run_end = std::chrono::steady_clock::now(); + int nsub = total_submitted.load(); + int ncomp = total_completed.load(); + if (ncomp < nsub) + std::cerr << " [WARN] " << (nsub - ncomp) << " in-flight requests did not complete before grace period.\n"; + + // Build PipelineBenchmark from timestamps (skip warmup) + int warmup = std::min(scfg.warmup_count, nsub); + int bench_count = nsub - warmup; + + cudaq::qec::utils::PipelineBenchmark bench( + config.label + " (stream)", bench_count); + bench.start(); + + for (int i = warmup; i < nsub; ++i) { + int bench_id = i - warmup; + bench.mark_submit(bench_id); + } + + std::vector latencies; + latencies.reserve(bench_count); + for (int i = warmup; i < nsub; ++i) { + if (!completed[i]) continue; + auto dt = std::chrono::duration_cast>( + complete_ts[i] - submit_ts[i]); + latencies.push_back(dt.count()); + } + + bench.stop(); + + std::sort(latencies.begin(), latencies.end()); + + auto pct = [&](double p) -> double { + if (latencies.empty()) return 0; + double idx = (p / 100.0) * (latencies.size() - 1); + size_t lo = (size_t)idx; + size_t hi = std::min(lo + 1, latencies.size() - 1); + double frac = idx - lo; + return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; + }; + + double mean = 0; + for (auto v : latencies) mean += v; + mean = latencies.empty() ? 0 : mean / latencies.size(); + + double stddev = 0; + for (auto v : latencies) stddev += (v - mean) * (v - mean); + stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); + + auto wall_us = std::chrono::duration_cast>( + run_end - (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); + double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; + + double actual_rate = (nsub > 1) + ? std::chrono::duration_cast>( + submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) + : 0; + + std::cout << std::fixed; + std::cout << "\n================================================================\n"; + std::cout << " Streaming Benchmark: " << config.label << "\n"; + std::cout << "================================================================\n"; + std::cout << " Submitted: " << nsub << "\n"; + std::cout << " Completed: " << ncomp << "\n"; + if (nsub > ncomp) + std::cout << " Dropped/timeout: " << (nsub - ncomp) << "\n"; + std::cout << std::setprecision(1); + std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; + std::cout << " Throughput: " << throughput << " req/s\n"; + std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; + std::cout << " Backpressure stalls:" << std::setw(8) + << backpressure_stalls.load() << "\n"; + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Latency (us) [steady-state, " << latencies.size() + << " requests after " << warmup << " warmup]\n"; + std::cout << std::setprecision(1); + if (!latencies.empty()) { + std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; + std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; + std::cout << " mean = " << std::setw(10) << mean << "\n"; + std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; + std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; + std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; + std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; + std::cout << " stddev = " << std::setw(10) << stddev << "\n"; + } + std::cout << " ---------------------------------------------------------------\n"; + + // Worker timing breakdown + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + double avg_pipeline = mean - avg_worker; + + std::cout << std::setprecision(1); + std::cout << " Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; + std::cout << " PyMatching decode:" << std::setw(10) << avg_decode + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_decode / mean : 0) + << "%)\n"; + std::cout << " Worker overhead: " << std::setw(10) << avg_overhead + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_overhead / mean : 0) + << "%)\n"; + std::cout << " GPU+dispatch+poll:" << std::setw(10) << avg_pipeline + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_pipeline / mean : 0) + << "%)\n"; + std::cout << " Total end-to-end: " << std::setw(10) << mean << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(10) << (mean / config.num_rounds) << " us/round\n"; + } + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; + std::cout << "================================================================\n"; + } + + // ============================================================================= + // Main + // ============================================================================= + int main(int argc, char* argv[]) { + // Parse arguments: [stream [rate_us] [duration_s]] + std::string config_name = "d7"; + bool streaming_mode = false; + StreamingConfig stream_cfg; + + if (argc > 1) + config_name = argv[1]; + + int stream_positional = 0; // tracks positional args after "stream" + for (int a = 2; a < argc; ++a) { + std::string arg = argv[a]; + if (arg == "stream") { + streaming_mode = true; + } else if (streaming_mode && stream_positional == 0 && std::isdigit(arg[0])) { + stream_cfg.rate_us = std::stoi(arg); + stream_positional++; + } else if (streaming_mode && stream_positional == 1 && std::isdigit(arg[0])) { + stream_cfg.duration_s = std::stoi(arg); + stream_positional++; + } + } + + PipelineConfig config; + if (config_name == "d7") { + config = PipelineConfig::d7_r7(); + } else if (config_name == "d13") { + config = PipelineConfig::d13_r13(); + } else if (config_name == "d21") { + config = PipelineConfig::d21_r21(); + } else if (config_name == "d31") { + config = PipelineConfig::d31_r31(); + } else { + std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [stream [rate_us] [duration_s]]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n" + << "\n" + << " stream - continuous FPGA-like submission (default: batch mode)\n" + << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" + << " duration_s - test duration in seconds (default: 5)\n" + << "\n" + << "Examples:\n" + << " " << argv[0] << " d13 # batch mode\n" + << " " << argv[0] << " d13 stream # streaming, open-loop\n" + << " " << argv[0] << " d13 stream 50 # streaming, 50 us between requests\n" + << " " << argv[0] << " d13 stream 50 10 # streaming, 50 us rate, 10s duration\n"; + return 1; + } + + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" + << config.label << ") ---\n"; + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " meas_qubits=" << config.meas_qubits + << " residual_detectors=" << config.residual_detectors + << " input_bytes=" << config.input_bytes() + << " slot_size=" << config.slot_size << "\n"; + + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + std::string engine_file = config.engine_path(); + std::string onnx_file = config.onnx_path(); + std::string model_path; + + std::ifstream engine_probe(engine_file, std::ios::binary); + if (engine_probe.good()) { + engine_probe.close(); + model_path = engine_file; + std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; + } else { + model_path = onnx_file; + std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; + std::cout << "[Setup] Engine will be cached to: " << engine_file << "\n"; + } + + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = cudaq::qec::get_code("surface_code", + {{"distance", config.distance}}); + auto H_z = surface_code->get_parity_z(); + + DecoderContext decoder_ctx; + decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); + decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "]" + << " z_stabilizers=" << decoder_ctx.z_stabilizers + << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; + + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + std::cout << "[Setup] Pre-allocating " << config.num_workers + << " PyMatching decoders (one per worker)...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + std::cout << "[Setup] PyMatching decoder pool ready.\n"; + + // ========================================================================= + // System-Scope Atomics & Ring Buffer Allocation (Replaces volatile setup) + // ========================================================================= + using atomic_uint64_sys = cudaq::qec::atomic_uint64_sys; + using atomic_int_sys = cudaq::qec::atomic_int_sys; + + void* buf_rx = nullptr; + CUDA_CHECK(cudaHostAlloc(&buf_rx, NUM_SLOTS * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + atomic_uint64_sys* rx_flags_host = static_cast(buf_rx); + for (size_t i = 0; i < NUM_SLOTS; ++i) new (rx_flags_host + i) atomic_uint64_sys(0); + + void* buf_tx = nullptr; + CUDA_CHECK(cudaHostAlloc(&buf_tx, NUM_SLOTS * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + atomic_uint64_sys* tx_flags_host = static_cast(buf_tx); + for (size_t i = 0; i < NUM_SLOTS; ++i) new (tx_flags_host + i) atomic_uint64_sys(0); + + uint64_t* rx_flags_dev = nullptr; + uint64_t* tx_flags_dev = nullptr; + CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_flags_dev, buf_rx, 0)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&tx_flags_dev, buf_tx, 0)); + + uint8_t *rx_data_host, *rx_data_dev; + CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * config.slot_size, cudaHostAllocMapped)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_data_dev, rx_data_host, 0)); + + g_sys_ctx.tx_flags_host = tx_flags_host; + g_sys_ctx.rx_data_host = rx_data_host; + g_sys_ctx.slot_size = config.slot_size; + + // Define the dynamic pool variables HERE so they live until the program exits + atomic_uint64_sys idle_mask((1ULL << config.num_predecoders) - 1); + std::vector inflight_slot_tags(config.num_predecoders, 0); + + WorkerPoolContext pool_ctx; + pool_ctx.tx_flags = tx_flags_host; + pool_ctx.idle_mask = &idle_mask; + pool_ctx.inflight_slot_tags = inflight_slot_tags.data(); + + // ========================================================================= + // Mailbox & Dispatcher Setup (mode-dependent) + // ========================================================================= + + void** h_mailbox_bank = nullptr; + void** d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); + CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); + + void** d_global_mailbox_bank = nullptr; + + int* shutdown_flag_host = nullptr; + int* d_shutdown_flag = nullptr; + uint64_t* d_stats = nullptr; + cudaq_function_entry_t* d_function_entries = nullptr; + cudaq_dispatch_graph_context* dispatch_ctx = nullptr; + + std::vector predecoder_streams; + + const bool use_host_dispatcher = streaming_mode; + bool device_launch = !use_host_dispatcher; + + if (!use_host_dispatcher) { + CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); + CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); + + CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); + *shutdown_flag_host = 0; + CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); + + CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + } else { + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); + } + } + + std::cout << "[Setup] Capturing " << config.num_predecoders + << "x AIPreDecoder Graphs (" + << (device_launch ? "device-launch" : "host-launch") << ")...\n"; + cudaStream_t capture_stream; + CUDA_CHECK(cudaStreamCreate(&capture_stream)); + + std::vector> predecoders; + std::vector function_entries(config.num_predecoders); + bool need_save = (model_path == onnx_file); + int predecoder_queue_depth = use_host_dispatcher ? 1 : config.queue_depth; for (int i = 0; i < config.num_predecoders; ++i) { void** my_mailbox = use_host_dispatcher ? (d_mailbox_bank + i) : (d_global_mailbox_bank + i); std::string save_path = (need_save && i == 0) ? engine_file : ""; auto pd = std::make_unique(model_path, my_mailbox, - config.queue_depth, + predecoder_queue_depth, save_path); - - std::cout << "[Setup] Decoder " << i - << ": input_size=" << pd->get_input_size() - << " output_size=" << pd->get_output_size() << "\n"; - - pd->capture_graph(capture_stream, device_launch); - - if (!use_host_dispatcher) { - cudaGraphExec_t gexec = pd->get_executable_graph(); - std::string func_name = "predecode_target_" + std::to_string(i); - function_entries[i].function_id = fnv1a_hash(func_name); - function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_entries[i].handler.graph_exec = gexec; - function_entries[i].mailbox_idx = i; - function_entries[i].d_queue_idx = pd->get_device_queue_idx(); - function_entries[i].d_ready_flags = pd->get_device_ready_flags(); - function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); - } - - predecoders.push_back(std::move(pd)); - } - - if (!use_host_dispatcher) { - CUDA_CHECK(cudaMalloc(&d_function_entries, - config.num_predecoders * sizeof(cudaq_function_entry_t))); - CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), - config.num_predecoders * sizeof(cudaq_function_entry_t), - cudaMemcpyHostToDevice)); - - std::cout << "[Setup] Launching GPU Dispatcher Kernel...\n"; - CUDA_CHECK(cudaq_create_dispatch_graph_regular( - rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, - d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, - capture_stream, &dispatch_ctx - )); - CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); - } else { - std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; - } - - // Start CPU Infrastructure + + std::cout << "[Setup] Decoder " << i + << ": input_size=" << pd->get_input_size() + << " output_size=" << pd->get_output_size() << "\n"; + + pd->capture_graph(capture_stream, device_launch); + + if (!use_host_dispatcher) { + cudaGraphExec_t gexec = pd->get_executable_graph(); + std::string func_name = "predecode_target_" + std::to_string(i); + function_entries[i].function_id = fnv1a_hash(func_name); + function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_entries[i].handler.graph_exec = gexec; + function_entries[i].mailbox_idx = i; + function_entries[i].d_queue_idx = pd->get_device_queue_idx(); + function_entries[i].d_ready_flags = reinterpret_cast(pd->get_device_ready_flags()); + function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); + } + + predecoders.push_back(std::move(pd)); + } + + if (!use_host_dispatcher) { + CUDA_CHECK(cudaMalloc(&d_function_entries, + config.num_predecoders * sizeof(cudaq_function_entry_t))); + CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), + config.num_predecoders * sizeof(cudaq_function_entry_t), + cudaMemcpyHostToDevice)); + + std::cout << "[Setup] Launching GPU Dispatcher Kernel...\n"; + CUDA_CHECK(cudaq_create_dispatch_graph_regular( + rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, + d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, + capture_stream, &dispatch_ctx + )); + CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); + } else { + std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; + } + std::cout << "[Setup] Booting Thread Pool (" << config.num_workers - << " workers) & Polling Loop...\n"; - cudaq::qec::utils::ThreadPool pymatching_pool(config.num_workers); - std::atomic system_stop{false}; - - std::thread incoming_thread([&]() { - incoming_polling_loop(predecoders, pymatching_pool, &decoder_ctx, - system_stop); - }); - - // ========================================================================= - // Test Stimulus - // ========================================================================= - if (streaming_mode) { - run_streaming_test(config, stream_cfg, rx_flags_host, tx_flags_host, - rx_data_host, rx_data_dev, decoder_ctx, predecoders, - pymatching_pool, system_stop, - h_mailbox_bank, predecoder_streams); - } else { - // Batch mode: fire requests in batches of num_predecoders, wait for - // each batch to complete before firing the next. - const int batch_size = config.num_predecoders; - std::cout << "\n[Batch] Firing " << config.total_requests - << " syndromes in batches of " << batch_size - << " (" << config.label << ", error_rate=0.01)...\n"; - - cudaq::qec::utils::PipelineBenchmark bench(config.label, - config.total_requests); - std::mt19937 rng(42); - const size_t payload_bytes = config.input_bytes(); - int requests_sent = 0; - int responses_received = 0; - - bench.start(); - - for (int batch_start = 0; batch_start < config.total_requests; - batch_start += batch_size) { - int batch_end = std::min(batch_start + batch_size, config.total_requests); - - for (int i = batch_start; i < batch_end; ++i) { - int target_decoder = i % config.num_predecoders; - std::string target_func = "predecode_target_" - + std::to_string(target_decoder); - - int slot = i % (int)NUM_SLOTS; - while (rx_flags_host[slot] != 0) usleep(10); - - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - auto* header = reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - header->function_id = fnv1a_hash(target_func); - header->arg_len = static_cast(payload_bytes); - - int32_t* payload = reinterpret_cast( - slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - - __sync_synchronize(); - bench.mark_submit(i); - rx_flags_host[slot] = reinterpret_cast(slot_data); - requests_sent++; - } - - for (int i = batch_start; i < batch_end; ++i) { - int slot = i % (int)NUM_SLOTS; - - auto deadline = std::chrono::steady_clock::now() - + std::chrono::seconds(10); - while (tx_flags_host[slot] == 0) { - if (std::chrono::steady_clock::now() > deadline) break; - QEC_CPU_RELAX(); - } - - uint64_t tv = tx_flags_host[slot]; - if (tv != 0 && (tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot - << " cudaGraphLaunch error " << cuda_err - << " (" << cudaGetErrorString((cudaError_t)cuda_err) - << ")\n"; - } else if (tv != 0) { - bench.mark_complete(i); - responses_received++; - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - int32_t corrections = 0, converged = 0; - std::memcpy(&corrections, - slot_data + sizeof(cudaq::nvqlink::RPCResponse), - sizeof(int32_t)); - std::memcpy(&converged, - slot_data + sizeof(cudaq::nvqlink::RPCResponse) - + sizeof(int32_t), - sizeof(int32_t)); - std::cout << " -> Slot " << slot - << ": OK, corrections=" << corrections - << " converged=" << (converged ? "yes" : "no") << "\n"; - } else { - std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; - } - - tx_flags_host[slot] = 0; - } - } - - bench.stop(); - - std::cout << "\n[Result] Processed " << responses_received << "/" - << requests_sent << " requests successfully.\n"; - - bench.report(); - - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - auto stats = bench.compute_stats(); - double avg_pipeline_overhead = stats.mean_us - avg_worker; - - std::cout << std::fixed << std::setprecision(1); - std::cout << "\n Worker Timing Breakdown (avg over " - << n_decoded << " requests):\n"; - std::cout << " PyMatching decode: " << std::setw(8) << avg_decode - << " us (" << std::setw(4) - << (100.0 * avg_decode / stats.mean_us) << "%)\n"; - std::cout << " Worker overhead: " << std::setw(8) << avg_overhead - << " us (" << std::setw(4) - << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; - std::cout << " GPU+dispatch+poll: " << std::setw(8) - << avg_pipeline_overhead << " us (" << std::setw(4) - << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(8) - << stats.mean_us << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(8) << (stats.mean_us / config.num_rounds) - << " us/round\n"; - } - } - - // Teardown - std::cout << "[Teardown] Shutting down...\n"; - system_stop = true; - - if (!use_host_dispatcher) { - *shutdown_flag_host = 1; - __sync_synchronize(); - } - - incoming_thread.join(); - CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - - if (!use_host_dispatcher) { - uint64_t dispatched_packets = 0; - CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; - CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); - } - - // Synchronize predecoder streams before cleanup - for (auto& s : predecoder_streams) { - cudaStreamSynchronize(s); - cudaStreamDestroy(s); - } - - cudaFreeHost((void*)rx_flags_host); - cudaFreeHost((void*)tx_flags_host); - cudaFreeHost(rx_data_host); - cudaFreeHost(h_mailbox_bank); - if (shutdown_flag_host) cudaFreeHost(shutdown_flag_host); - if (d_global_mailbox_bank) cudaFree(d_global_mailbox_bank); - if (d_stats) cudaFree(d_stats); - if (d_function_entries) cudaFree(d_function_entries); - cudaStreamDestroy(capture_stream); - - std::cout << "Done.\n"; - return 0; -} + << " workers) & Polling Loop...\n"; + cudaq::qec::utils::ThreadPool pymatching_pool(config.num_workers); + std::atomic system_stop{false}; + std::atomic total_claimed{0}; + + std::thread incoming_thread([&]() { + incoming_polling_loop(predecoders, pymatching_pool, &decoder_ctx, + system_stop, &pool_ctx, &total_claimed); + }); + + // ========================================================================= + // Test Stimulus + // ========================================================================= + if (streaming_mode) { + run_streaming_test(config, stream_cfg, + rx_data_host, rx_data_dev, rx_flags_host, tx_flags_host, + decoder_ctx, predecoders, pymatching_pool, system_stop, + h_mailbox_bank, predecoder_streams, &pool_ctx, &total_claimed); + } else { + const int batch_size = config.num_predecoders; + std::cout << "\n[Batch] Firing " << config.total_requests + << " syndromes in batches of " << batch_size + << " (" << config.label << ", error_rate=0.01)...\n"; + + cudaq::qec::utils::PipelineBenchmark bench(config.label, + config.total_requests); + std::mt19937 rng(42); + const size_t payload_bytes = config.input_bytes(); + int requests_sent = 0; + int responses_received = 0; + + bench.start(); + + for (int batch_start = 0; batch_start < config.total_requests; + batch_start += batch_size) { + int batch_end = std::min(batch_start + batch_size, config.total_requests); + + for (int i = batch_start; i < batch_end; ++i) { + int target_decoder = i % config.num_predecoders; + std::string target_func = "predecode_target_" + + std::to_string(target_decoder); + + int slot = i % (int)NUM_SLOTS; + while (rx_flags_host[slot].load(cuda::std::memory_order_acquire) != 0) usleep(10); + + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + header->function_id = fnv1a_hash(target_func); + header->arg_len = static_cast(payload_bytes); + + int32_t* payload = reinterpret_cast( + slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + fill_measurement_payload(payload, config.input_elements(), rng, 0.01); + + bench.mark_submit(i); + rx_flags_host[slot].store(reinterpret_cast(slot_data), cuda::std::memory_order_release); + requests_sent++; + } + + for (int i = batch_start; i < batch_end; ++i) { + int slot = i % (int)NUM_SLOTS; + + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(10); + uint64_t tv = 0; + while ((tv = tx_flags_host[slot].load(cuda::std::memory_order_acquire)) == 0) { + if (std::chrono::steady_clock::now() > deadline) break; + QEC_CPU_RELAX(); + } + + if (tv != 0 && (tv >> 48) == 0xDEAD) { + int cuda_err = (int)(tv & 0xFFFF); + std::cerr << " [FAIL] Slot " << slot + << " cudaGraphLaunch error " << cuda_err + << " (" << cudaGetErrorString((cudaError_t)cuda_err) + << ")\n"; + } else if (tv != 0) { + bench.mark_complete(i); + responses_received++; + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); + int32_t corrections = 0, converged = 0; + std::memcpy(&corrections, + slot_data + sizeof(cudaq::nvqlink::RPCResponse), + sizeof(int32_t)); + std::memcpy(&converged, + slot_data + sizeof(cudaq::nvqlink::RPCResponse) + + sizeof(int32_t), + sizeof(int32_t)); + std::cout << " -> Slot " << slot + << ": OK, corrections=" << corrections + << " converged=" << (converged ? "yes" : "no") << "\n"; + } else { + std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; + } + + tx_flags_host[slot].store(0, cuda::std::memory_order_release); + } + } + + bench.stop(); + + std::cout << "\n[Result] Processed " << responses_received << "/" + << requests_sent << " requests successfully.\n"; + + bench.report(); + + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + auto stats = bench.compute_stats(); + double avg_pipeline_overhead = stats.mean_us - avg_worker; + + std::cout << std::fixed << std::setprecision(1); + std::cout << "\n Worker Timing Breakdown (avg over " + << n_decoded << " requests):\n"; + std::cout << " PyMatching decode: " << std::setw(8) << avg_decode + << " us (" << std::setw(4) + << (100.0 * avg_decode / stats.mean_us) << "%)\n"; + std::cout << " Worker overhead: " << std::setw(8) << avg_overhead + << " us (" << std::setw(4) + << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; + std::cout << " GPU+dispatch+poll: " << std::setw(8) + << avg_pipeline_overhead << " us (" << std::setw(4) + << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; + std::cout << " Total end-to-end: " << std::setw(8) + << stats.mean_us << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(8) << (stats.mean_us / config.num_rounds) + << " us/round\n"; + } + } + + // Teardown + std::cout << "[Teardown] Shutting down...\n"; + system_stop = true; + + if (!use_host_dispatcher) { + *shutdown_flag_host = 1; + __sync_synchronize(); + } + + incoming_thread.join(); + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + + if (!use_host_dispatcher) { + uint64_t dispatched_packets = 0; + CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; + CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + } + + for (auto& s : predecoder_streams) { + cudaStreamSynchronize(s); + cudaStreamDestroy(s); + } + + // Explicitly call destructors for libcu++ atomics before freeing memory + for (size_t i = 0; i < NUM_SLOTS; ++i) { + rx_flags_host[i].~atomic_uint64_sys(); + tx_flags_host[i].~atomic_uint64_sys(); + } + + cudaFreeHost(buf_rx); + cudaFreeHost(buf_tx); + cudaFreeHost(rx_data_host); + cudaFreeHost(h_mailbox_bank); + if (shutdown_flag_host) cudaFreeHost(shutdown_flag_host); + if (d_global_mailbox_bank) cudaFree(d_global_mailbox_bank); + if (d_stats) cudaFree(d_stats); + if (d_function_entries) cudaFree(d_function_entries); + cudaStreamDestroy(capture_stream); + + std::cout << "Done.\n"; + return 0; + } \ No newline at end of file diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 255c3522..7c1a8215 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -232,7 +232,13 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ONNX_MODEL_DIR="${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime" ) + # libcu++ (cuda/std/atomic) lives in CUDA toolkit under cccl/ + get_filename_component(_cuda_bin "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root "${_cuda_bin}" DIRECTORY) + set(_cuda_cccl_include "${_cuda_root}/include/cccl") + target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE + ${_cuda_cccl_include} ${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../include diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h index 98459c98..792893eb 100644 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h +++ b/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h @@ -106,7 +106,7 @@ typedef struct { uint32_t mailbox_idx; // index into global_mailbox_bank uint32_t _pad0; // alignment padding int *d_queue_idx; // device pointer to queue tail tracker - volatile int *d_ready_flags; // device-mapped pointer to ready flags + void *d_ready_flags; // device-mapped: cuda::std::atomic* volatile int *d_inflight_flag; // 0 = idle, 1 = graph in flight (single-launch guard) } cudaq_function_entry_t; diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu index 1495902d..fcfa7f9a 100644 --- a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu +++ b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu @@ -10,6 +10,7 @@ #include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" #include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" +#include #include #include #include @@ -174,10 +175,10 @@ __global__ void dispatch_kernel_with_graph( bool queue_full = false; if (!already_in_flight) { int* d_queue_idx = entry->d_queue_idx; - volatile int* d_ready_flags = entry->d_ready_flags; + auto* d_ready_flags = static_cast*>(entry->d_ready_flags); if (d_queue_idx != nullptr && d_ready_flags != nullptr) { int current_tail = *d_queue_idx; - if (d_ready_flags[current_tail] == 1) { + if (d_ready_flags[current_tail].load(cuda::std::memory_order_acquire) == 1) { queue_full = true; } } From 10dfcfb9675c0e3ced58b54015a834d393d618e8 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Sun, 22 Feb 2026 00:18:27 +0000 Subject: [PATCH 14/40] Updated the design document to reflect code changes. Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 74 ++++++++++++++++++---- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md index 30093118..b97fd74c 100644 --- a/docs/host_side_dispatcher_design_gemini.md +++ b/docs/host_side_dispatcher_design_gemini.md @@ -7,7 +7,7 @@ **Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher **Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) **Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` -**Last Updated**: 2026-02-20 +**Last Updated**: 2026-02-21 --- @@ -104,7 +104,15 @@ void host_dispatcher_loop(DispatcherContext& ctx) { __sync_synchronize(); // Full barrier to ensure mailbox write is visible // 6. Launch graph on the assigned worker's stream - cudaGraphLaunch(ctx.workers[worker_id].graph_exec, ctx.workers[worker_id].stream); + cudaError_t err = cudaGraphLaunch(ctx.workers[worker_id].graph_exec, ctx.workers[worker_id].stream); + if (err != cudaSuccess) { + uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; + ctx.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); + ctx.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } else { + // 6b. Mark slot IN_FLIGHT so producer does not reuse it while GPU/workers use it + ctx.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); + } // 7. Consume slot and advance ctx.rx_flags[current_slot].store(0, cuda::std::memory_order_release); @@ -139,9 +147,25 @@ The predecoder GPU kernels require minimal changes, as the dynamic pooling compl ## 6. Worker Subsystem (Consumer) -A separate CPU polling thread scans the `ready_flags` array. When a GPU graph finishes, the job is handed to a CPU thread pool for PyMatching decoding. +A separate CPU polling thread scans the `ready_flags` array. When a GPU graph finishes, the job is handed to a CPU thread pool for PyMatching decoding. + +### 6.1 Ready-Flag State Machine (Atomic Claiming) + +With a single slot per predecoder (queue depth 1), the poller must **claim** each completion exactly once. If the poller only checks `ready_flags[i]==1` and enqueues without claiming, it will enqueue the same job repeatedly until the PyMatching worker calls `release_job`, flooding the thread pool and stalling the pipeline. + +**States** (per-worker ready flag): + +| Value | State | Meaning | +| :--- | :--- | :--- | +| 0 | Idle | Waiting for GPU, or worker has called `release_job`. | +| 1 | Ready | GPU finished; output kernel stored 1. | +| 2 | Processing | CPU poller claimed the job; PyMatching is running. | -### 6.1 Worker Logic (Pseudocode) +**Poller**: Use `compare_exchange_strong(expected=1, desired=2, memory_order_acquire, memory_order_relaxed)`. Only the thread that wins the CAS enqueues the job. Use **relaxed on failure** so spin-polling does not add barriers that delay seeing the GPU's store(1). + +**Worker**: When PyMatching finishes, call `release_job(slot_idx)` which does `ready_flags[0].store(0, release)` so the slot is Idle for the next launch. + +### 6.2 Worker Logic (Pseudocode) ```cpp void pymatching_worker_task(WorkerContext& ctx, int worker_id) { // 1. Read GPU outputs from mapped pinned memory @@ -154,8 +178,8 @@ void pymatching_worker_task(WorkerContext& ctx, int worker_id) { uint64_t response_val = format_response(...); ctx.tx_flags[origin_slot].store(response_val, cuda::std::memory_order_release); - // 4. Acknowledge GPU read completion - ctx.ready_flags[worker_id].store(0, cuda::std::memory_order_release); + // 4. Acknowledge GPU read completion (Idle for next launch) + ctx.ready_flags[worker_id].store(0, cuda::std::memory_order_release); // 2 -> 0 // 5. FREE THE WORKER: Return this worker back to the dispatcher pool ctx.idle_mask->fetch_or((1ULL << worker_id), cuda::std::memory_order_release); @@ -174,22 +198,48 @@ void pymatching_worker_task(WorkerContext& ctx, int worker_id) { 6. **Host Dispatcher** saves `inflight_slot_tags[2] = 5`. 7. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. 8. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. -9. **Host Dispatcher** clears `rx_flags[5] = 0` and advances to `current_slot = 6`. +9. **Host Dispatcher** sets `tx_flags[5] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[5] = 0` and advances to `current_slot = 6`. 10. **GPU** executes graph on stream 2. Finishes and sets `ready_flags[2] = 1`. -11. **CPU Poller** sees `ready_flags[2] == 1`, triggers PyMatching on CPU. +11. **CPU Poller** CAS(1, 2) on `ready_flags[2]`, wins, enqueues job once; PyMatching runs on CPU. 12. **CPU Worker** finishes PyMatching. 13. **CPU Worker** looks up `origin_slot = inflight_slot_tags[2]` (which is 5). -14. **CPU Worker** writes response to `tx_flags[5]`. -15. **CPU Worker** restores bit 2 in `idle_mask`, freeing `worker_id = 2` for the dispatcher. +14. **CPU Worker** writes response to `tx_flags[5]` (overwrites 0xEEEE), then `release_job`, then restores bit 2 in `idle_mask`. +15. **Consumer** (harvest thread) sees `tx_flags[5] != 0` and `!= 0xEEEE`, harvests, then clears `tx_flags[5] = 0`. Producer may now reuse slot 5. + +--- + +## 8. Ring Buffer and IN_FLIGHT Sentinel + +Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot]` immediately after launch. Without a hold, the **producer** (FPGA sim or test) would see `rx_flags[slot]==0` and `tx_flags[slot]==0` (response not written yet) and reuse the slot, overwriting data while the GPU is still reading. + +**Fix: IN_FLIGHT tag** + +1. **Dispatcher**: On successful launch, write `tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release)` **before** clearing `rx_flags[current_slot]`. On launch failure, write the 0xDEAD|err value and restore the worker bit; do not write 0xEEEE. +2. **Producer**: Reuse a slot only when **both** `rx_flags[slot]==0` **and** `tx_flags[slot]==0`. Thus the producer blocks until the consumer has harvested (tx cleared). +3. **Consumer**: When harvesting, treat only real responses: `tx_flags[slot] != 0` **and** `tx_flags[slot] != 0xEEEEEEEEEEEEEEEEULL`. Ignore 0xEEEE (in-flight). On harvest, clear `tx_flags[slot] = 0`. + +**Slot lifecycle**: Idle (rx=0, tx=0) → Written (rx=ptr, tx=0) → In-flight (rx=0, tx=0xEEEE) → Completed (rx=0, tx=response) → Consumer harvests, tx=0 → Idle. + +--- + +## 9. Shutdown and Grace Period + +- **Grace period**: After the producer thread exits, the main thread may wait up to a bounded time (e.g. 10 s) for `total_completed >= total_submitted`. +- **Consumer exit**: The consumer thread normally exits when `producer_done && total_completed >= total_submitted`. To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. +- **Diagnostic threads**: A progress reporter (submitted/completed every second) and a watchdog (stall detection every 2 s) are **optional** and should be **disabled by default** (e.g. `kEnableProgressReporter = false`, `kEnableWatchdog = false`). Enable them only when debugging stalls; otherwise they can block shutdown (e.g. watchdog not seeing `producer_done`). --- -## 8. LLM Implementation Directives (Constraints Checklist) +## 10. LLM Implementation Directives (Constraints Checklist) When generating code from this specification, the LLM **MUST** strictly adhere to the following constraints: - [ ] **NO CUDA STREAM QUERYING**: Do not use `cudaStreamQuery()` for backpressure or completion checking. It incurs severe driver latency. Rely strictly on `idle_mask` and `ready_flags`. -- [ ] **NO WEAK ORDERING BUGS**: Do not use `volatile`. Do not use `__threadfence_system()`. You must use `cuda::std::atomic` for all cross-device synchronization. +- [ ] **NO WEAK ORDERING BUGS**: Do not use `volatile`. Do not use `__threadfence_system()`. You must use `cuda::std::atomic` (or `` with `thread_scope_system`) for all cross-device synchronization. - [ ] **NO HEAD OF LINE BLOCKING**: The host dispatcher MUST NOT statically map slots to predecoders. It must dynamically allocate via `idle_mask`. - [ ] **NO DATA LOSS**: If `idle_mask == 0` (all workers busy), the dispatcher MUST spin on the current slot (`QEC_CPU_RELAX()`). It MUST NOT advance `current_slot` until a worker is allocated and the graph is launched. - [ ] **NO RACE CONDITIONS ON TAGS**: `inflight_slot_tags` does not need to be atomic because index `[worker_id]` is exclusively owned by the active flow once the dispatcher clears the bit in `idle_mask`, until the worker thread restores the bit. +- [ ] **READY FLAG CLAIMING**: The CPU poller MUST claim each completion exactly once using compare_exchange_strong(1, 2) on the ready flag; use relaxed memory order on CAS failure. The worker MUST clear the flag (store 0) in `release_job`. +- [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). +- [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. +- [ ] **DIAGNOSTIC THREADS**: Progress reporter and watchdog threads MUST be optional and disabled by default so they do not block normal shutdown. From df47e950116543c9ae6e068faed45035e51e7248 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Sun, 22 Feb 2026 02:44:37 +0000 Subject: [PATCH 15/40] perf: optimize predecoder realtime pipeline latency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fundamentally redesigns the host-side execution model to achieve microsecond-level latency, shifting from a general-purpose thread pool to a strict, pinned, and lock-free architecture. Key architectural changes in `test_realtime_predecoder_w_pymatching.cpp`: 1. Dedicated Polling Threads (Removed Thread Pool) - Replaced `cudaq::qec::utils::ThreadPool` and the single `incoming_thread` with a vector of dedicated `std::thread` worker loops. - Eliminates queueing latency, mutex locking, and context switching overhead. Each worker thread now spins continuously checking for its own GPU completions. 2. Strict CPU Thread Pinning - Introduced `pin_thread_to_core` and `pin_current_thread_to_core` using the Linux `pthread_setaffinity_np` API. - Pinned the Dispatcher (Core 2), Producer (Core 3), Consumer (Core 4), and all Worker threads (Cores 10+) to ensure they never migrate, keeping their CPU caches perfectly warm. 3. High-Resolution Sub-Component Timing - Added tracking arrays (`dispatch_ts`, `poll_ts`, `debug_dispatch_ts_arr`) piped through `WorkerPoolContext` and `PreDecoderJob`. - Updated end-of-run reporting to calculate differences between timestamps, proving that Host Dispatch overhead is negligible (~1-3µs) and the bottleneck is the GPU inference itself. 4. PyMatching Data Conversion Optimization - Inside `pymatching_worker_task`, replaced the conversion of `int32_t` syndrome data into a `std::vector`. - Now populates a pre-allocated `cudaqx::tensor` to avoid slow double-precision conversions inside the latency-critical worker loop. 5. NVTX Profiling Markers - Included `` and wrapped key blocks in `nvtxRangePushA` and `nvtxRangePop`. - Enables generation of `nsys` profiles to visually align CPU thread activity with GPU TensorRT execution. Other changes: - Enable TensorRT FP16 builder flag (`kFP16`) in `ai_decoder_service.cu` for supported platforms to accelerate GPU inference. Signed-off-by: Scott Thornton --- .../qec/realtime/ai_predecoder_service.h | 5 + .../cudaq/qec/realtime/host_dispatcher.h | 3 + libs/qec/lib/realtime/ai_decoder_service.cu | 9 + libs/qec/lib/realtime/host_dispatcher.cpp | 20 + .../test_realtime_predecoder_w_pymatching.cpp | 389 +++++++++++------- 5 files changed, 286 insertions(+), 140 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index e2b5be46..69f07e21 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -29,6 +29,11 @@ struct PreDecoderJob { int origin_slot; ///< FPGA ring slot for tx_flags routing (dynamic pool) void* ring_buffer_ptr; void* inference_data; ///< Points into the pinned output (single slot) + + // Performance Tracking + uint64_t submit_ts_ns; + uint64_t dispatch_ts_ns; + uint64_t poll_ts_ns; }; class AIPreDecoderService : public AIDecoderService { diff --git a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h index 5eaf049e..82412b75 100644 --- a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h +++ b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h @@ -52,6 +52,9 @@ struct HostDispatcherConfig { /// Dynamic worker pool (design: Host-Side Spin-Polling Dispatcher) atomic_uint64_sys* idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id int* inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags routing + + // Optional arrays for timestamping + uint64_t* debug_dispatch_ts = nullptr; }; /// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index f581b5b4..78f14850 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -144,6 +144,15 @@ void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path, auto builder = std::unique_ptr(nvinfer1::createInferBuilder(gLogger)); auto network = std::unique_ptr(builder->createNetworkV2(0)); auto config = std::unique_ptr(builder->createBuilderConfig()); + + // Enable FP16 optimization for Grace Blackwell / Hopper + if (builder->platformHasFastFp16()) { + config->setFlag(nvinfer1::BuilderFlag::kFP16); + std::printf("[TensorRT] FP16 precision enabled.\n"); + } else { + std::printf("[TensorRT] Warning: Platform does not support fast FP16. Using FP32.\n"); + } + auto parser = std::unique_ptr( nvonnxparser::createParser(*network, gLogger)); diff --git a/libs/qec/lib/realtime/host_dispatcher.cpp b/libs/qec/lib/realtime/host_dispatcher.cpp index 12c5c4eb..65fb72a6 100644 --- a/libs/qec/lib/realtime/host_dispatcher.cpp +++ b/libs/qec/lib/realtime/host_dispatcher.cpp @@ -9,6 +9,7 @@ #include "cudaq/qec/realtime/host_dispatcher.h" #include +#include namespace cudaq::qec { @@ -18,13 +19,20 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { const int num_workers = static_cast(config.workers.size()); uint64_t packets_dispatched = 0; + nvtxRangePushA("Dispatcher Loop"); + while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { uint64_t rx_value = config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); if (rx_value != 0) { + nvtxRangePushA("Process Slot"); + uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); if (mask == 0) { + nvtxRangePushA("Wait Worker"); QEC_CPU_RELAX(); + nvtxRangePop(); // Wait Worker + nvtxRangePop(); // Process Slot continue; } @@ -40,8 +48,16 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { config.h_mailbox_bank[worker_id] = data_dev; __sync_synchronize(); + if (config.debug_dispatch_ts) { + config.debug_dispatch_ts[current_slot] = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + } + + nvtxRangePushA("Launch Graph"); cudaError_t err = cudaGraphLaunch(config.workers[worker_id].graph_exec, config.workers[worker_id].stream); + nvtxRangePop(); // Launch Graph + if (err != cudaSuccess) { uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); @@ -56,10 +72,14 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { if (config.live_dispatched) config.live_dispatched->fetch_add(1, cuda::std::memory_order_relaxed); current_slot = (current_slot + 1) % num_slots; + + nvtxRangePop(); // Process Slot } else { QEC_CPU_RELAX(); } } + + nvtxRangePop(); // Dispatcher Loop for (const auto& w : config.workers) { cudaStreamSynchronize(w.stream); diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 485a65a2..7f8e858f 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -53,8 +53,11 @@ #include #include #include - - #include + #include +#include +#include + +#include #ifndef CUDA_VERSION #define CUDA_VERSION 13000 @@ -71,16 +74,37 @@ #include "cudaq/qec/code.h" #include "cudaq/qec/decoder.h" - #define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ - exit(1); \ - } \ - } while(0) - - using namespace cudaq::qec; +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ + exit(1); \ + } \ + } while(0) + +// Pin a thread to a specific CPU core (Cores 2-5 = spinning infra, 10+ = workers; 0-1 = OS). +static void pin_thread_to_core(std::thread& t, int core_id) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cerr << "Warning: Failed to pin thread to core " << core_id << " (Error: " << rc << ")\n"; + } +} + +static void pin_current_thread_to_core(int core_id) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cerr << "Warning: Failed to pin current thread to core " << core_id << " (Error: " << rc << ")\n"; + } +} + +using namespace cudaq::qec; // ============================================================================= // Pipeline Configuration @@ -126,9 +150,9 @@ "model1_d7_r7_unified_Z_batch1.onnx", /*slot_size=*/4096, /*total_requests=*/100, - /*num_predecoders=*/4, + /*num_predecoders=*/64, /*queue_depth=*/16, - /*num_workers=*/4 + /*num_workers=*/64 }; } @@ -142,9 +166,9 @@ "model1_d13_r13_unified_Z_batch1.onnx", /*slot_size=*/16384, /*total_requests=*/100, - /*num_predecoders=*/4, + /*num_predecoders=*/64, /*queue_depth=*/16, - /*num_workers=*/4 + /*num_workers=*/64 }; } @@ -158,9 +182,9 @@ "model1_d21_r21_unified_X_batch1.onnx", /*slot_size=*/65536, /*total_requests=*/100, - /*num_predecoders=*/4, + /*num_predecoders=*/64, /*queue_depth=*/16, - /*num_workers=*/4 + /*num_workers=*/64 }; } @@ -174,9 +198,9 @@ "model1_d31_r31_unified_Z_batch1.onnx", /*slot_size=*/262144, /*total_requests=*/100, - /*num_predecoders=*/4, + /*num_predecoders=*/64, /*queue_depth=*/16, - /*num_workers=*/4 + /*num_workers=*/64 }; } }; @@ -217,6 +241,7 @@ cudaq::qec::atomic_uint64_sys* tx_flags = nullptr; cudaq::qec::atomic_uint64_sys* idle_mask = nullptr; int* inflight_slot_tags = nullptr; + uint64_t* debug_poll_ts = nullptr; }; // ============================================================================= @@ -231,66 +256,79 @@ void pymatching_worker_task(PreDecoderJob job, int worker_id, AIPreDecoderService* predecoder, DecoderContext* ctx, - WorkerPoolContext* pool_ctx) { - using hrclock = std::chrono::high_resolution_clock; - auto worker_start = hrclock::now(); - - const int32_t* residual = static_cast(job.inference_data); - auto* my_decoder = ctx->acquire_decoder(); - - int total_corrections = 0; - bool all_converged = true; - - auto decode_start = hrclock::now(); - for (int s = 0; s < ctx->spatial_slices; ++s) { - const int32_t* slice = residual + s * ctx->z_stabilizers; - std::vector syndrome(ctx->z_stabilizers); - for (int i = 0; i < ctx->z_stabilizers; ++i) - syndrome[i] = static_cast(slice[i]); - - auto result = my_decoder->decode(syndrome); - - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) total_corrections++; - } - auto decode_end = hrclock::now(); - - DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; - - char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); - std::memcpy(response_payload, &resp_data, sizeof(resp_data)); - - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp_data); - - uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); - int origin_slot = job.origin_slot; - - if (pool_ctx && pool_ctx->tx_flags) { - pool_ctx->tx_flags[origin_slot].store(rx_value, cuda::std::memory_order_release); - } else { - size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; - g_sys_ctx.tx_flags_host[slot_idx].store(rx_value, cuda::std::memory_order_release); - } - - predecoder->release_job(job.slot_idx); - - if (pool_ctx && pool_ctx->idle_mask) { - pool_ctx->idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); - } - - auto worker_end = hrclock::now(); - auto decode_us = std::chrono::duration_cast( - decode_end - decode_start).count(); - auto worker_us = std::chrono::duration_cast( - worker_end - worker_start).count(); - ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); - ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); - ctx->decode_count.fetch_add(1, std::memory_order_relaxed); - } + WorkerPoolContext* pool_ctx) { + nvtxRangePushA("Worker Task"); + using hrclock = std::chrono::high_resolution_clock; + auto worker_start = hrclock::now(); + + if (pool_ctx && pool_ctx->debug_poll_ts) { + pool_ctx->debug_poll_ts[job.origin_slot] = std::chrono::duration_cast( + worker_start.time_since_epoch()).count(); + } + + const int32_t* residual = static_cast(job.inference_data); + auto* my_decoder = ctx->acquire_decoder(); + + int total_corrections = 0; + bool all_converged = true; + + auto decode_start = hrclock::now(); + nvtxRangePushA("PyMatching Decode"); + + cudaqx::tensor syndrome_tensor({(size_t)ctx->z_stabilizers}); + uint8_t* syn_data = syndrome_tensor.data(); + + for (int s = 0; s < ctx->spatial_slices; ++s) { + const int32_t* slice = residual + s * ctx->z_stabilizers; + for (int i = 0; i < ctx->z_stabilizers; ++i) { + syn_data[i] = static_cast(slice[i]); + } + + auto result = my_decoder->decode(syndrome_tensor); + + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) total_corrections++; + } + nvtxRangePop(); // PyMatching Decode + auto decode_end = hrclock::now(); + + DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; + + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); + std::memcpy(response_payload, &resp_data, sizeof(resp_data)); + + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp_data); + + uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); + int origin_slot = job.origin_slot; + + if (pool_ctx && pool_ctx->tx_flags) { + pool_ctx->tx_flags[origin_slot].store(rx_value, cuda::std::memory_order_release); + } else { + size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; + g_sys_ctx.tx_flags_host[slot_idx].store(rx_value, cuda::std::memory_order_release); + } + + predecoder->release_job(job.slot_idx); + + if (pool_ctx && pool_ctx->idle_mask) { + pool_ctx->idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } + + auto worker_end = hrclock::now(); + auto decode_us = std::chrono::duration_cast( + decode_end - decode_start).count(); + auto worker_us = std::chrono::duration_cast( + worker_end - worker_start).count(); + ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); + ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); + ctx->decode_count.fetch_add(1, std::memory_order_relaxed); + nvtxRangePop(); // Worker Task +} // ============================================================================= // Incoming Polling Thread @@ -301,34 +339,38 @@ DecoderContext* ctx, std::atomic& stop_signal, WorkerPoolContext* pool_ctx = nullptr, - std::atomic* total_claimed = nullptr) - { - PreDecoderJob job; - int num_workers = static_cast(predecoders.size()); - while (!stop_signal.load(std::memory_order_relaxed)) { - bool found_work = false; - for (int i = 0; i < num_workers; ++i) { - if (predecoders[i]->poll_next_job(job)) { - if (pool_ctx && pool_ctx->inflight_slot_tags) { - job.origin_slot = pool_ctx->inflight_slot_tags[i]; - } else { - job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); - } - if (total_claimed) total_claimed->fetch_add(1, std::memory_order_relaxed); - AIPreDecoderService* pd_ptr = predecoders[i].get(); - int worker_id = i; - WorkerPoolContext* pctx = pool_ctx; - thread_pool.enqueue([job, worker_id, pd_ptr, ctx, pctx]() { - pymatching_worker_task(job, worker_id, pd_ptr, ctx, pctx); - }); - found_work = true; - } - } - if (!found_work) { - QEC_CPU_RELAX(); - } - } - } + std::atomic* total_claimed = nullptr) +{ + nvtxRangePushA("Polling Loop"); + PreDecoderJob job; + int num_workers = static_cast(predecoders.size()); + while (!stop_signal.load(std::memory_order_relaxed)) { + bool found_work = false; + for (int i = 0; i < num_workers; ++i) { + if (predecoders[i]->poll_next_job(job)) { + nvtxRangePushA("Dispatch Job"); + if (pool_ctx && pool_ctx->inflight_slot_tags) { + job.origin_slot = pool_ctx->inflight_slot_tags[i]; + } else { + job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); + } + if (total_claimed) total_claimed->fetch_add(1, std::memory_order_relaxed); + AIPreDecoderService* pd_ptr = predecoders[i].get(); + int worker_id = i; + WorkerPoolContext* pctx = pool_ctx; + thread_pool.enqueue([job, worker_id, pd_ptr, ctx, pctx]() { + pymatching_worker_task(job, worker_id, pd_ptr, ctx, pctx); + }); + found_work = true; + nvtxRangePop(); // Dispatch Job + } + } + if (!found_work) { + QEC_CPU_RELAX(); + } + } + nvtxRangePop(); // Polling Loop +} // ============================================================================= // Generate Realistic Syndrome Data @@ -360,7 +402,6 @@ cudaq::qec::atomic_uint64_sys* tx_flags, DecoderContext& decoder_ctx, std::vector>& predecoders, - cudaq::qec::utils::ThreadPool& pymatching_pool, std::atomic& system_stop, void** h_mailbox_bank, std::vector& predecoder_streams, @@ -378,8 +419,11 @@ std::vector submit_ts(max_requests); std::vector complete_ts(max_requests); std::vector completed(max_requests, false); + std::vector dispatch_ts(max_requests, 0); + std::vector poll_ts(max_requests, 0); std::vector slot_request(NUM_SLOTS, -1); + std::vector debug_dispatch_ts_arr(NUM_SLOTS, 0); std::atomic total_submitted{0}; std::atomic total_completed{0}; @@ -404,16 +448,18 @@ disp_cfg.live_dispatched = &live_dispatched; disp_cfg.idle_mask = pool_ctx->idle_mask; disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; + disp_cfg.debug_dispatch_ts = debug_dispatch_ts_arr.data(); disp_cfg.workers.resize(num_workers); for (int i = 0; i < num_workers; ++i) { disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); disp_cfg.workers[i].stream = predecoder_streams[i]; } - std::thread dispatcher_thread([&disp_cfg]() { - host_dispatcher_loop(disp_cfg); - }); - + std::thread dispatcher_thread([&disp_cfg]() { + host_dispatcher_loop(disp_cfg); + }); + pin_thread_to_core(dispatcher_thread, 2); + auto run_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(scfg.duration_s); @@ -505,7 +551,8 @@ producer_done.store(true, std::memory_order_seq_cst); }); - + pin_thread_to_core(producer, 3); + // --- Consumer thread (harvests completions sequentially) --- std::thread consumer([&]() { int next_harvest = 0; @@ -533,6 +580,8 @@ int rid = slot_request[slot]; if (rid >= 0 && (tv >> 48) != 0xDEAD) { complete_ts[rid] = hrclock::now(); + dispatch_ts[rid] = debug_dispatch_ts_arr[slot]; + poll_ts[rid] = pool_ctx->debug_poll_ts[slot]; completed[rid] = true; total_completed.fetch_add(1, std::memory_order_relaxed); } else if ((tv >> 48) == 0xDEAD) { @@ -552,6 +601,7 @@ } } }); + pin_thread_to_core(consumer, 4); // --- DIAGNOSTIC WATCHDOG THREAD (debug only; set true to diagnose stalls) --- constexpr bool kEnableWatchdog = false; @@ -722,22 +772,47 @@ double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; double avg_overhead = avg_worker - avg_decode; + + double sum_dispatch_latency = 0; + double sum_gpu_execution = 0; + int count_valid_ts = 0; + for (int i = warmup; i < nsub; ++i) { + if (completed[i] && dispatch_ts[i] > 0) { + uint64_t submit_ns = std::chrono::duration_cast(submit_ts[i].time_since_epoch()).count(); + if (dispatch_ts[i] > submit_ns && poll_ts[i] > dispatch_ts[i]) { + sum_dispatch_latency += (dispatch_ts[i] - submit_ns) / 1000.0; + sum_gpu_execution += (poll_ts[i] - dispatch_ts[i]) / 1000.0; + count_valid_ts++; + } else if (i == warmup) { + std::cout << "Debug [warmup]: submit=" << submit_ns << " dispatch=" << dispatch_ts[i] << " poll=" << poll_ts[i] << "\n"; + } + } + } + double avg_dispatch_latency = count_valid_ts > 0 ? (sum_dispatch_latency / count_valid_ts) : 0; + double avg_gpu_execution = count_valid_ts > 0 ? (sum_gpu_execution / count_valid_ts) : 0; + double avg_pipeline = mean - avg_worker; std::cout << std::setprecision(1); std::cout << " Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; - std::cout << " PyMatching decode:" << std::setw(10) << avg_decode + std::cout << " Host Dispatch overhead:" << std::setw(9) << avg_dispatch_latency + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_dispatch_latency / mean : 0) + << "%)\n"; + std::cout << " GPU TRT Inference: " << std::setw(9) << avg_gpu_execution + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_gpu_execution / mean : 0) + << "%)\n"; + std::cout << " PyMatching decode: " << std::setw(9) << avg_decode << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_decode / mean : 0) << "%)\n"; - std::cout << " Worker overhead: " << std::setw(10) << avg_overhead + std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_overhead / mean : 0) << "%)\n"; - std::cout << " GPU+dispatch+poll:" << std::setw(10) << avg_pipeline - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_pipeline / mean : 0) + std::cout << " Other/Misc Wait: " << std::setw(9) << (avg_pipeline - avg_dispatch_latency - avg_gpu_execution) + << " us (" << std::setw(4) << (mean > 0 ? 100.0 * (avg_pipeline - avg_dispatch_latency - avg_gpu_execution) / mean : 0) << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(10) << mean << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(10) << (mean / config.num_rounds) << " us/round\n"; + std::cout << " Total end-to-end: " << std::setw(9) << mean << " us\n"; + std::cout << " Per-round (/" << config.num_rounds << "): " + << std::setw(9) << (mean / config.num_rounds) << " us/round\n"; } std::cout << " ---------------------------------------------------------------\n"; std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; @@ -877,13 +952,19 @@ g_sys_ctx.slot_size = config.slot_size; // Define the dynamic pool variables HERE so they live until the program exits - atomic_uint64_sys idle_mask((1ULL << config.num_predecoders) - 1); + // Avoid 1ULL<<64 (UB); for 64 workers use all-ones mask. + uint64_t initial_idle = (config.num_predecoders >= 64) + ? ~0ULL + : ((1ULL << config.num_predecoders) - 1); + atomic_uint64_sys idle_mask(initial_idle); std::vector inflight_slot_tags(config.num_predecoders, 0); - + std::vector debug_poll_ts_arr(NUM_SLOTS, 0); + WorkerPoolContext pool_ctx; pool_ctx.tx_flags = tx_flags_host; pool_ctx.idle_mask = &idle_mask; pool_ctx.inflight_slot_tags = inflight_slot_tags.data(); + pool_ctx.debug_poll_ts = debug_poll_ts_arr.data(); // ========================================================================= // Mailbox & Dispatcher Setup (mode-dependent) @@ -985,16 +1066,42 @@ std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; } - std::cout << "[Setup] Booting Thread Pool (" << config.num_workers - << " workers) & Polling Loop...\n"; - cudaq::qec::utils::ThreadPool pymatching_pool(config.num_workers); - std::atomic system_stop{false}; - std::atomic total_claimed{0}; + std::atomic system_stop{false}; + std::atomic total_claimed{0}; - std::thread incoming_thread([&]() { - incoming_polling_loop(predecoders, pymatching_pool, &decoder_ctx, - system_stop, &pool_ctx, &total_claimed); - }); + std::cout << "[Setup] Booting " << config.num_workers << " Dedicated Polling/Worker Threads...\n"; + std::vector worker_threads; + for (int i = 0; i < config.num_workers; ++i) { + worker_threads.emplace_back([i, &predecoders, &decoder_ctx, &system_stop, &pool_ctx, &total_claimed]() { + int target_core = 10 + i; + pin_current_thread_to_core(target_core); + + AIPreDecoderService* pd_ptr = predecoders[i].get(); + + nvtxRangePushA("Worker Loop"); + PreDecoderJob job; + while (!system_stop.load(std::memory_order_relaxed)) { + // Wait for GPU to set ready flag to 1 + if (pd_ptr->poll_next_job(job)) { + nvtxRangePushA("Process Job"); + + total_claimed.fetch_add(1, std::memory_order_relaxed); + + if (pool_ctx.inflight_slot_tags) { + job.origin_slot = pool_ctx.inflight_slot_tags[i]; + } else { + job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); + } + + pymatching_worker_task(job, i, pd_ptr, &decoder_ctx, &pool_ctx); + nvtxRangePop(); // Process Job + } else { + QEC_CPU_RELAX(); + } + } + nvtxRangePop(); // Worker Loop + }); + } // ========================================================================= // Test Stimulus @@ -1002,7 +1109,7 @@ if (streaming_mode) { run_streaming_test(config, stream_cfg, rx_data_host, rx_data_dev, rx_flags_host, tx_flags_host, - decoder_ctx, predecoders, pymatching_pool, system_stop, + decoder_ctx, predecoders, system_stop, h_mailbox_bank, predecoder_streams, &pool_ctx, &total_claimed); } else { const int batch_size = config.num_predecoders; @@ -1125,13 +1232,15 @@ std::cout << "[Teardown] Shutting down...\n"; system_stop = true; - if (!use_host_dispatcher) { - *shutdown_flag_host = 1; - __sync_synchronize(); - } - - incoming_thread.join(); - CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + if (!use_host_dispatcher) { + *shutdown_flag_host = 1; + __sync_synchronize(); + } + + for (auto& t : worker_threads) { + if (t.joinable()) t.join(); + } + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); if (!use_host_dispatcher) { uint64_t dispatched_packets = 0; From a04ef38f554d8af0049188d62dbf8d15e6633910 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 26 Feb 2026 17:01:50 +0000 Subject: [PATCH 16/40] Copied the updated realtime code (dispatchers and all) to the realtime directory. Refactored the pymatching demo code to use the updated functions in realtime. Signed-off-by: Scott Thornton --- CMakeLists.txt | 7 + .../test_realtime_predecoder_w_pymatching.cpp | 624 +--- libs/qec/unittests/CMakeLists.txt | 93 +- realtime/.clang-format | 2 +- realtime/CMakeLists.txt | 53 +- realtime/README.md | 41 +- realtime/docs/cudaq_realtime_host_api.html | 2945 +++++++++++++++++ .../docs/cudaq_realtime_message_protocol.html | 2513 ++++++++++++++ realtime/docs/nvqlink_latency_demo.md | 232 ++ .../daemon/dispatcher/cudaq_realtime.h | 219 -- .../daemon/dispatcher/cudaq_realtime.h | 345 ++ .../daemon/dispatcher/dispatch_kernel.cuh | 30 +- .../dispatcher/dispatch_kernel_launch.h | 55 +- .../daemon/dispatcher/dispatch_modes.h | 6 +- .../daemon/dispatcher/host_dispatcher.h | 71 + .../daemon/dispatcher/kernel_types.h | 4 + .../cudaq/realtime/hololink_bridge_common.h | 502 +++ realtime/lib/CMakeLists.txt | 4 +- realtime/lib/daemon/CMakeLists.txt | 40 +- .../daemon/dispatcher/cudaq_realtime_api.cpp | 145 +- .../lib/daemon/dispatcher/dispatch_kernel.cu | 469 ++- .../lib/daemon/dispatcher/host_dispatcher.cu | 178 + .../daemon/dispatcher/host_dispatcher_capi.cu | 157 + realtime/scripts/install_dev_prerequisites.sh | 53 + realtime/unittests/CMakeLists.txt | 32 +- realtime/unittests/test_dispatch_kernel.cu | 136 +- realtime/unittests/test_host_dispatcher.cu | 1015 ++++++ realtime/unittests/utils/CMakeLists.txt | 264 ++ realtime/unittests/utils/hololink_bridge.cpp | 124 + .../utils/hololink_fpga_emulator.cpp | 1210 +++++++ .../utils/hololink_fpga_playback.cpp | 534 +++ realtime/unittests/utils/hololink_test.sh | 408 +++ realtime/unittests/utils/hololink_wrapper.cpp | 216 ++ realtime/unittests/utils/hololink_wrapper.h | 142 + .../init_rpc_increment_function_table.cu | 92 + 35 files changed, 11972 insertions(+), 989 deletions(-) create mode 100644 realtime/docs/cudaq_realtime_host_api.html create mode 100644 realtime/docs/cudaq_realtime_message_protocol.html create mode 100644 realtime/docs/nvqlink_latency_demo.md delete mode 100644 realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h create mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h rename realtime/include/cudaq/{nvqlink => realtime}/daemon/dispatcher/dispatch_kernel.cuh (74%) rename realtime/include/cudaq/{nvqlink => realtime}/daemon/dispatcher/dispatch_kernel_launch.h (61%) rename realtime/include/cudaq/{nvqlink => realtime}/daemon/dispatcher/dispatch_modes.h (94%) create mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h rename realtime/include/cudaq/{nvqlink => realtime}/daemon/dispatcher/kernel_types.h (85%) create mode 100644 realtime/include/cudaq/realtime/hololink_bridge_common.h create mode 100644 realtime/lib/daemon/dispatcher/host_dispatcher.cu create mode 100644 realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu create mode 100755 realtime/scripts/install_dev_prerequisites.sh create mode 100644 realtime/unittests/test_host_dispatcher.cu create mode 100644 realtime/unittests/utils/CMakeLists.txt create mode 100644 realtime/unittests/utils/hololink_bridge.cpp create mode 100644 realtime/unittests/utils/hololink_fpga_emulator.cpp create mode 100644 realtime/unittests/utils/hololink_fpga_playback.cpp create mode 100755 realtime/unittests/utils/hololink_test.sh create mode 100644 realtime/unittests/utils/hololink_wrapper.cpp create mode 100644 realtime/unittests/utils/hololink_wrapper.h create mode 100644 realtime/unittests/utils/init_rpc_increment_function_table.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 020b8c4b..4fbc9e4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,6 +286,13 @@ if (CUDAQX_INCLUDE_DOCS) add_subdirectory(docs) endif() +# In-tree realtime (optional): provides cudaq-realtime and host-dispatcher for QEC tests +if(EXISTS "${CMAKE_SOURCE_DIR}/realtime/CMakeLists.txt" AND CMAKE_CUDA_COMPILER) + set(CUDAQ_REALTIME_STANDALONE_BUILD FALSE) + add_subdirectory(realtime) + set(CUDAQX_BUILD_REALTIME_IN_TREE TRUE) +endif() + foreach(lib ${CUDAQX_ENABLE_LIBS}) add_subdirectory(libs/${lib}) endforeach() diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 7f8e858f..7ae57299 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -38,7 +38,7 @@ * 4. Dedicated Polling Thread -> Worker PyMatching Thread Pool * 5. CPU Workers closing the transaction (Setting TX flags) * - * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [stream [rate_us] [duration_s]] + * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [rate_us] [duration_s] ******************************************************************************/ #include @@ -57,18 +57,17 @@ #include #include -#include + #include #ifndef CUDA_VERSION #define CUDA_VERSION 13000 #endif - #include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" - #include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" + #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" + #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" - #include "cudaq/qec/realtime/host_dispatcher.h" - #include "cudaq/qec/utils/thread_pool.h" #include #include "cudaq/qec/utils/pipeline_benchmarks.h" #include "cudaq/qec/code.h" @@ -105,6 +104,7 @@ static void pin_current_thread_to_core(int core_id) { } using namespace cudaq::qec; +namespace realtime_ns = cudaq::realtime; // ============================================================================= // Pipeline Configuration @@ -119,8 +119,7 @@ using namespace cudaq::qec; int meas_qubits; // ONNX input shape[1] int residual_detectors; // ONNX output dim std::string onnx_filename; - size_t slot_size; // must fit RPCHeader + input payload - int total_requests; + size_t slot_size; // must fit RPC header (CUDAQ_RPC_HEADER_SIZE) + input payload int num_predecoders; int queue_depth; int num_workers; @@ -149,13 +148,12 @@ using namespace cudaq::qec; /*residual_detectors=*/336, "model1_d7_r7_unified_Z_batch1.onnx", /*slot_size=*/4096, - /*total_requests=*/100, - /*num_predecoders=*/64, +/*num_predecoders=*/8, /*queue_depth=*/16, - /*num_workers=*/64 + /*num_workers=*/8 }; } - + static PipelineConfig d13_r13() { return { "d13_r13_Z", @@ -165,13 +163,12 @@ using namespace cudaq::qec; /*residual_detectors=*/2184, "model1_d13_r13_unified_Z_batch1.onnx", /*slot_size=*/16384, - /*total_requests=*/100, - /*num_predecoders=*/64, +/*num_predecoders=*/8, /*queue_depth=*/16, - /*num_workers=*/64 + /*num_workers=*/8 }; } - + static PipelineConfig d21_r21() { return { "d21_r21_Z", @@ -181,13 +178,12 @@ using namespace cudaq::qec; /*residual_detectors=*/9240, "model1_d21_r21_unified_X_batch1.onnx", /*slot_size=*/65536, - /*total_requests=*/100, - /*num_predecoders=*/64, +/*num_predecoders=*/8, /*queue_depth=*/16, - /*num_workers=*/64 + /*num_workers=*/8 }; } - + static PipelineConfig d31_r31() { return { "d31_r31_Z", @@ -197,10 +193,9 @@ using namespace cudaq::qec; /*residual_detectors=*/29760, "model1_d31_r31_unified_Z_batch1.onnx", /*slot_size=*/262144, - /*total_requests=*/100, - /*num_predecoders=*/64, + /*num_predecoders=*/8, /*queue_depth=*/16, - /*num_workers=*/64 + /*num_workers=*/8 }; } }; @@ -223,23 +218,17 @@ using namespace cudaq::qec; std::atomic decode_count{0}; }; - constexpr std::uint32_t fnv1a_hash(std::string_view str) { - std::uint32_t hash = 0x811c9dc5; - for (char c : str) { hash ^= static_cast(c); hash *= 0x01000193; } - return hash; - } - struct SystemContext { - cudaq::qec::atomic_uint64_sys* tx_flags_host = nullptr; + realtime_ns::atomic_uint64_sys* tx_flags_host = nullptr; uint8_t* rx_data_host = nullptr; size_t slot_size = 0; }; SystemContext g_sys_ctx; - + /// Context for dynamic worker pool: worker task writes tx_flags[origin_slot] and frees idle_mask. struct WorkerPoolContext { - cudaq::qec::atomic_uint64_sys* tx_flags = nullptr; - cudaq::qec::atomic_uint64_sys* idle_mask = nullptr; + realtime_ns::atomic_uint64_sys* tx_flags = nullptr; + realtime_ns::atomic_uint64_sys* idle_mask = nullptr; int* inflight_slot_tags = nullptr; uint64_t* debug_poll_ts = nullptr; }; @@ -295,11 +284,11 @@ using namespace cudaq::qec; DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; - char* response_payload = (char*)job.ring_buffer_ptr + sizeof(cudaq::nvqlink::RPCResponse); + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); std::memcpy(response_payload, &resp_data, sizeof(resp_data)); - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = realtime_ns::RPC_MAGIC_RESPONSE; header->status = 0; header->result_len = sizeof(resp_data); @@ -330,48 +319,6 @@ using namespace cudaq::qec; nvtxRangePop(); // Worker Task } - // ============================================================================= - // Incoming Polling Thread - // ============================================================================= - void incoming_polling_loop( - std::vector>& predecoders, - cudaq::qec::utils::ThreadPool& thread_pool, - DecoderContext* ctx, - std::atomic& stop_signal, - WorkerPoolContext* pool_ctx = nullptr, - std::atomic* total_claimed = nullptr) -{ - nvtxRangePushA("Polling Loop"); - PreDecoderJob job; - int num_workers = static_cast(predecoders.size()); - while (!stop_signal.load(std::memory_order_relaxed)) { - bool found_work = false; - for (int i = 0; i < num_workers; ++i) { - if (predecoders[i]->poll_next_job(job)) { - nvtxRangePushA("Dispatch Job"); - if (pool_ctx && pool_ctx->inflight_slot_tags) { - job.origin_slot = pool_ctx->inflight_slot_tags[i]; - } else { - job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); - } - if (total_claimed) total_claimed->fetch_add(1, std::memory_order_relaxed); - AIPreDecoderService* pd_ptr = predecoders[i].get(); - int worker_id = i; - WorkerPoolContext* pctx = pool_ctx; - thread_pool.enqueue([job, worker_id, pd_ptr, ctx, pctx]() { - pymatching_worker_task(job, worker_id, pd_ptr, ctx, pctx); - }); - found_work = true; - nvtxRangePop(); // Dispatch Job - } - } - if (!found_work) { - QEC_CPU_RELAX(); - } - } - nvtxRangePop(); // Polling Loop -} - // ============================================================================= // Generate Realistic Syndrome Data // ============================================================================= @@ -398,8 +345,8 @@ using namespace cudaq::qec; const StreamingConfig& scfg, uint8_t* rx_data_host, uint8_t* rx_data_dev, - cudaq::qec::atomic_uint64_sys* rx_flags, - cudaq::qec::atomic_uint64_sys* tx_flags, + realtime_ns::atomic_uint64_sys* rx_flags, + realtime_ns::atomic_uint64_sys* tx_flags, DecoderContext& decoder_ctx, std::vector>& predecoders, std::atomic& system_stop, @@ -409,8 +356,8 @@ using namespace cudaq::qec; std::atomic* total_claimed = nullptr) { using hrclock = std::chrono::high_resolution_clock; - using atomic_uint64_sys = cudaq::qec::atomic_uint64_sys; - using atomic_int_sys = cudaq::qec::atomic_int_sys; + using atomic_uint64_sys = realtime_ns::atomic_uint64_sys; + using atomic_int_sys = realtime_ns::atomic_int_sys; const int num_workers = config.num_predecoders; const int max_requests = 500000; @@ -431,42 +378,70 @@ using namespace cudaq::qec; std::atomic producer_done{false}; std::atomic consumer_stop{false}; - atomic_int_sys shutdown_flag(0); - uint64_t dispatcher_stats = 0; - atomic_uint64_sys live_dispatched(0); - - HostDispatcherConfig disp_cfg; - disp_cfg.rx_flags = rx_flags; - disp_cfg.tx_flags = tx_flags; - disp_cfg.rx_data_host = rx_data_host; - disp_cfg.rx_data_dev = rx_data_dev; - disp_cfg.h_mailbox_bank = h_mailbox_bank; - disp_cfg.num_slots = NUM_SLOTS; - disp_cfg.slot_size = config.slot_size; - disp_cfg.shutdown_flag = &shutdown_flag; - disp_cfg.stats_counter = &dispatcher_stats; - disp_cfg.live_dispatched = &live_dispatched; - disp_cfg.idle_mask = pool_ctx->idle_mask; - disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; - disp_cfg.debug_dispatch_ts = debug_dispatch_ts_arr.data(); - disp_cfg.workers.resize(num_workers); - for (int i = 0; i < num_workers; ++i) { - disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); - disp_cfg.workers[i].stream = predecoder_streams[i]; - } - + atomic_int_sys shutdown_flag(0); + uint64_t dispatcher_stats = 0; + atomic_uint64_sys live_dispatched(0); + + // Build function table for realtime host dispatcher (lookup by function_id). + std::vector function_table(num_workers); + for (int i = 0; i < num_workers; ++i) { + std::string func_name = "predecode_target_" + std::to_string(i); + function_table[i].function_id = realtime_ns::fnv1a_hash(func_name.c_str()); + function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_table[i].handler.graph_exec = predecoders[i]->get_executable_graph(); + std::memset(&function_table[i].schema, 0, sizeof(function_table[i].schema)); + } + + realtime_ns::HostDispatcherConfig disp_cfg; + disp_cfg.rx_flags = rx_flags; + disp_cfg.tx_flags = tx_flags; + disp_cfg.rx_data_host = rx_data_host; + disp_cfg.rx_data_dev = rx_data_dev; + disp_cfg.tx_data_host = rx_data_host; + disp_cfg.tx_data_dev = rx_data_dev; + disp_cfg.tx_stride_sz = config.slot_size; + disp_cfg.h_mailbox_bank = h_mailbox_bank; + disp_cfg.num_slots = NUM_SLOTS; + disp_cfg.slot_size = config.slot_size; + disp_cfg.function_table = function_table.data(); + disp_cfg.function_table_count = num_workers; + disp_cfg.shutdown_flag = &shutdown_flag; + disp_cfg.stats_counter = &dispatcher_stats; + disp_cfg.live_dispatched = &live_dispatched; + disp_cfg.idle_mask = pool_ctx->idle_mask; + disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; + disp_cfg.workers.resize(num_workers); + for (int i = 0; i < num_workers; ++i) { + disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); + disp_cfg.workers[i].stream = predecoder_streams[i]; + disp_cfg.workers[i].function_id = function_table[i].function_id; + } + std::thread dispatcher_thread([&disp_cfg]() { - host_dispatcher_loop(disp_cfg); + realtime_ns::host_dispatcher_loop(disp_cfg); }); pin_thread_to_core(dispatcher_thread, 2); + // Ring buffer view for producer/consumer helpers (realtime C API). + cudaq_ringbuffer_t rb{}; + rb.rx_flags = reinterpret_cast(rx_flags); + rb.tx_flags = reinterpret_cast(tx_flags); + rb.rx_data = rx_data_dev; + rb.tx_data = rx_data_dev; + rb.rx_stride_sz = config.slot_size; + rb.tx_stride_sz = config.slot_size; + rb.rx_flags_host = reinterpret_cast(rx_flags); + rb.tx_flags_host = reinterpret_cast(tx_flags); + rb.rx_data_host = rx_data_host; + rb.tx_data_host = rx_data_host; + auto run_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(scfg.duration_s); - + std::string rate_label = (scfg.rate_us > 0) ? std::to_string(scfg.rate_us) + " us" : "open-loop"; - + std::cout << "\n[Stream] Starting streaming test (" << config.label << ", HOST dispatcher)\n" << " Rate: " << rate_label << "\n" @@ -504,43 +479,38 @@ using namespace cudaq::qec; std::mt19937 rng(42); int next_slot = 0; int req_id = 0; - + while (std::chrono::steady_clock::now() < run_deadline && req_id < max_requests) { - + int slot = next_slot % (int)NUM_SLOTS; - // Wait for both flags to be completely clear (0). Dispatcher marks in-flight - // with tx_flags=0xEEEE... so we don't overwrite while GPU/workers are using the slot. - while (rx_flags[slot].load(cuda::std::memory_order_acquire) != 0 - || tx_flags[slot].load(cuda::std::memory_order_acquire) != 0) { + while (!cudaq_host_ringbuffer_slot_available(&rb, static_cast(slot))) { backpressure_stalls.fetch_add(1, std::memory_order_relaxed); QEC_CPU_RELAX(); if (std::chrono::steady_clock::now() >= run_deadline) return; } - + int target = req_id % config.num_predecoders; std::string func = "predecode_target_" + std::to_string(target); - + uint32_t function_id = realtime_ns::fnv1a_hash(func.c_str()); + uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - auto* hdr = reinterpret_cast(slot_data); - hdr->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - hdr->function_id = fnv1a_hash(func); - hdr->arg_len = static_cast(payload_bytes); - int32_t* payload = reinterpret_cast( - slot_data + sizeof(cudaq::nvqlink::RPCHeader)); + slot_data + CUDAQ_RPC_HEADER_SIZE); fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - + + cudaq_host_ringbuffer_write_rpc_request(&rb, static_cast(slot), + function_id, payload, static_cast(payload_bytes)); + slot_request[slot] = req_id; - submit_ts[req_id] = hrclock::now(); - rx_flags[slot].store(reinterpret_cast(slot_data), cuda::std::memory_order_release); + cudaq_host_ringbuffer_signal_slot(&rb, static_cast(slot)); total_submitted.fetch_add(1, std::memory_order_release); - + next_slot++; req_id++; - + if (scfg.rate_us > 0) { auto target_time = submit_ts[req_id - 1] + std::chrono::microseconds(scfg.rate_us); @@ -548,7 +518,7 @@ using namespace cudaq::qec; QEC_CPU_RELAX(); } } - + producer_done.store(true, std::memory_order_seq_cst); }); pin_thread_to_core(producer, 3); @@ -566,34 +536,36 @@ using namespace cudaq::qec; if (pdone && ncomp >= nsub) break; - + if (next_harvest >= nsub) { QEC_CPU_RELAX(); continue; } - + int slot = next_harvest % (int)NUM_SLOTS; - uint64_t tv = tx_flags[slot].load(cuda::std::memory_order_acquire); + int cuda_error = 0; + cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag( + &rb, static_cast(slot), &cuda_error); - // Ignore IN_FLIGHT tag (dispatcher marks slot busy until worker writes response) - if (tv != 0 && tv != 0xEEEEEEEEEEEEEEEEULL) { + if (status == CUDAQ_TX_READY) { int rid = slot_request[slot]; - if (rid >= 0 && (tv >> 48) != 0xDEAD) { + if (rid >= 0) { complete_ts[rid] = hrclock::now(); - dispatch_ts[rid] = debug_dispatch_ts_arr[slot]; - poll_ts[rid] = pool_ctx->debug_poll_ts[slot]; + dispatch_ts[rid] = 0; + poll_ts[rid] = pool_ctx->debug_poll_ts ? pool_ctx->debug_poll_ts[slot] : 0; completed[rid] = true; total_completed.fetch_add(1, std::memory_order_relaxed); - } else if ((tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot - << " cudaGraphLaunch error " << cuda_err - << " (" << cudaGetErrorString((cudaError_t)cuda_err) - << ")\n"; - total_completed.fetch_add(1, std::memory_order_relaxed); } - - tx_flags[slot].store(0, cuda::std::memory_order_release); + cudaq_host_ringbuffer_clear_slot(&rb, static_cast(slot)); + slot_request[slot] = -1; + next_harvest++; + } else if (status == CUDAQ_TX_ERROR) { + std::cerr << " [FAIL] Slot " << slot + << " cudaGraphLaunch error " << cuda_error + << " (" << cudaGetErrorString(static_cast(cuda_error)) + << ")\n"; + total_completed.fetch_add(1, std::memory_order_relaxed); + cudaq_host_ringbuffer_clear_slot(&rb, static_cast(slot)); slot_request[slot] = -1; next_harvest++; } else { @@ -603,60 +575,8 @@ using namespace cudaq::qec; }); pin_thread_to_core(consumer, 4); - // --- DIAGNOSTIC WATCHDOG THREAD (debug only; set true to diagnose stalls) --- - constexpr bool kEnableWatchdog = false; - std::thread watchdog; - if (kEnableWatchdog) { - watchdog = std::thread([&]() { - while (!producer_done.load(std::memory_order_seq_cst)) { - std::this_thread::sleep_for(std::chrono::seconds(2)); - if (producer_done.load(std::memory_order_seq_cst)) break; - - int nsub = total_submitted.load(std::memory_order_acquire); - int ncomp = total_completed.load(std::memory_order_relaxed); - - // Only print if the pipeline seems stalled (no progress in 2 seconds) - static int last_comp = -1; - if (ncomp == last_comp && nsub > ncomp) { - std::cout << "\n[WATCHDOG] PIPELINE STALL DETECTED!\n"; - std::cout << " Submitted: " << nsub << " | Completed: " << ncomp << "\n"; - - uint64_t mask = pool_ctx->idle_mask ? pool_ctx->idle_mask->load(cuda::std::memory_order_acquire) : 0; - std::cout << " Idle Mask: 0x" << std::hex << mask << std::dec << " (0 means all workers busy)\n"; - - std::cout << " Predecoder Ready Flags (GPU -> CPU):\n"; - for (int i = 0; i < config.num_predecoders; ++i) { - auto* sys_flags = predecoders[i]->get_host_ready_flags(); - int ready = sys_flags ? sys_flags[0].load(cuda::std::memory_order_acquire) : -1; - std::cout << " Worker " << i << ": " << ready << " (0=Idle, 1=GPU Done, 2=CPU Working)\n"; - } - - std::cout << " Ring Buffer (Window around stall):\n"; - int start_slot = std::max(0, (ncomp % (int)NUM_SLOTS) - 2); - int end_slot = std::min((int)NUM_SLOTS, start_slot + 8); - for (int i = start_slot; i < end_slot; ++i) { - uint64_t rx = rx_flags[i].load(cuda::std::memory_order_acquire); - uint64_t tx = tx_flags[i].load(cuda::std::memory_order_acquire); - std::cout << " Slot " << i << " | RX: " << (rx ? "HAS_DATA" : "0") - << " | TX: "; - if (tx == 0) std::cout << "0\n"; - else if (tx == 0xEEEEEEEEEEEEEEEEULL) std::cout << "IN_FLIGHT (0xEEEE...)\n"; - else if ((tx >> 48) == 0xDEAD) std::cout << "ERROR (0xDEAD...)\n"; - else std::cout << "RESPONSE_READY\n"; - } - std::cout << "--------------------------------------------------\n"; - } - last_comp = ncomp; - } - }); - } - std::cout << " [shutdown] joining producer...\n" << std::flush; producer.join(); - if (kEnableWatchdog) { - std::cout << " [shutdown] joining watchdog...\n" << std::flush; - watchdog.join(); - } // Grace period for in-flight requests auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); @@ -823,28 +743,17 @@ using namespace cudaq::qec; // Main // ============================================================================= int main(int argc, char* argv[]) { - // Parse arguments: [stream [rate_us] [duration_s]] + // Parse arguments: [rate_us] [duration_s] std::string config_name = "d7"; - bool streaming_mode = false; StreamingConfig stream_cfg; - + if (argc > 1) config_name = argv[1]; - - int stream_positional = 0; // tracks positional args after "stream" - for (int a = 2; a < argc; ++a) { - std::string arg = argv[a]; - if (arg == "stream") { - streaming_mode = true; - } else if (streaming_mode && stream_positional == 0 && std::isdigit(arg[0])) { - stream_cfg.rate_us = std::stoi(arg); - stream_positional++; - } else if (streaming_mode && stream_positional == 1 && std::isdigit(arg[0])) { - stream_cfg.duration_s = std::stoi(arg); - stream_positional++; - } - } - + if (argc > 2 && std::isdigit(argv[2][0])) + stream_cfg.rate_us = std::stoi(argv[2]); + if (argc > 3 && std::isdigit(argv[3][0])) + stream_cfg.duration_s = std::stoi(argv[3]); + PipelineConfig config; if (config_name == "d7") { config = PipelineConfig::d7_r7(); @@ -855,21 +764,17 @@ using namespace cudaq::qec; } else if (config_name == "d31") { config = PipelineConfig::d31_r31(); } else { - std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [stream [rate_us] [duration_s]]\n" + std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [rate_us] [duration_s]\n" << " d7 - distance 7, 7 rounds (default)\n" << " d13 - distance 13, 13 rounds\n" << " d21 - distance 21, 21 rounds\n" << " d31 - distance 31, 31 rounds\n" - << "\n" - << " stream - continuous FPGA-like submission (default: batch mode)\n" - << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" + << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" << " duration_s - test duration in seconds (default: 5)\n" - << "\n" - << "Examples:\n" - << " " << argv[0] << " d13 # batch mode\n" - << " " << argv[0] << " d13 stream # streaming, open-loop\n" - << " " << argv[0] << " d13 stream 50 # streaming, 50 us between requests\n" - << " " << argv[0] << " d13 stream 50 10 # streaming, 50 us rate, 10s duration\n"; + << "\nExamples:\n" + << " " << argv[0] << " d13 # open-loop, 5s\n" + << " " << argv[0] << " d13 50 # 50 us between requests, 5s\n" + << " " << argv[0] << " d13 50 10 # 50 us rate, 10s duration\n"; return 1; } @@ -922,13 +827,13 @@ using namespace cudaq::qec; cudaq::qec::decoder::get("pymatching", H_z, pm_params)); std::cout << "[Setup] PyMatching decoder pool ready.\n"; - // ========================================================================= - // System-Scope Atomics & Ring Buffer Allocation (Replaces volatile setup) - // ========================================================================= - using atomic_uint64_sys = cudaq::qec::atomic_uint64_sys; - using atomic_int_sys = cudaq::qec::atomic_int_sys; - - void* buf_rx = nullptr; + // ========================================================================= + // System-Scope Atomics & Ring Buffer Allocation (Replaces volatile setup) + // ========================================================================= + using atomic_uint64_sys = realtime_ns::atomic_uint64_sys; + using atomic_int_sys = realtime_ns::atomic_int_sys; + + void* buf_rx = nullptr; CUDA_CHECK(cudaHostAlloc(&buf_rx, NUM_SLOTS * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); atomic_uint64_sys* rx_flags_host = static_cast(buf_rx); for (size_t i = 0; i < NUM_SLOTS; ++i) new (rx_flags_host + i) atomic_uint64_sys(0); @@ -975,96 +880,38 @@ using namespace cudaq::qec; CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); - - void** d_global_mailbox_bank = nullptr; - - int* shutdown_flag_host = nullptr; - int* d_shutdown_flag = nullptr; - uint64_t* d_stats = nullptr; - cudaq_function_entry_t* d_function_entries = nullptr; - cudaq_dispatch_graph_context* dispatch_ctx = nullptr; - + std::vector predecoder_streams; - - const bool use_host_dispatcher = streaming_mode; - bool device_launch = !use_host_dispatcher; - - if (!use_host_dispatcher) { - CUDA_CHECK(cudaMalloc(&d_global_mailbox_bank, config.num_predecoders * sizeof(void*))); - CUDA_CHECK(cudaMemset(d_global_mailbox_bank, 0, config.num_predecoders * sizeof(void*))); - - CUDA_CHECK(cudaHostAlloc(&shutdown_flag_host, sizeof(int), cudaHostAllocMapped)); - *shutdown_flag_host = 0; - CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_shutdown_flag, shutdown_flag_host, 0)); - - CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - } else { - for (int i = 0; i < config.num_predecoders; ++i) { - cudaStream_t s; - CUDA_CHECK(cudaStreamCreate(&s)); - predecoder_streams.push_back(s); - } + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); } - + std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs (" - << (device_launch ? "device-launch" : "host-launch") << ")...\n"; + << "x AIPreDecoder Graphs (host-launch)...\n"; cudaStream_t capture_stream; CUDA_CHECK(cudaStreamCreate(&capture_stream)); - + std::vector> predecoders; - std::vector function_entries(config.num_predecoders); - - bool need_save = (model_path == onnx_file); - int predecoder_queue_depth = use_host_dispatcher ? 1 : config.queue_depth; - for (int i = 0; i < config.num_predecoders; ++i) { - void** my_mailbox = use_host_dispatcher - ? (d_mailbox_bank + i) - : (d_global_mailbox_bank + i); - std::string save_path = (need_save && i == 0) ? engine_file : ""; - auto pd = std::make_unique(model_path, my_mailbox, + bool need_save = (model_path == onnx_file); + const int predecoder_queue_depth = 1; + for (int i = 0; i < config.num_predecoders; ++i) { + std::string save_path = (need_save && i == 0) ? engine_file : ""; + auto pd = std::make_unique(model_path, d_mailbox_bank + i, predecoder_queue_depth, save_path); - + std::cout << "[Setup] Decoder " << i << ": input_size=" << pd->get_input_size() << " output_size=" << pd->get_output_size() << "\n"; - - pd->capture_graph(capture_stream, device_launch); - - if (!use_host_dispatcher) { - cudaGraphExec_t gexec = pd->get_executable_graph(); - std::string func_name = "predecode_target_" + std::to_string(i); - function_entries[i].function_id = fnv1a_hash(func_name); - function_entries[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_entries[i].handler.graph_exec = gexec; - function_entries[i].mailbox_idx = i; - function_entries[i].d_queue_idx = pd->get_device_queue_idx(); - function_entries[i].d_ready_flags = reinterpret_cast(pd->get_device_ready_flags()); - function_entries[i].d_inflight_flag = pd->get_device_inflight_flag(); - } - + + pd->capture_graph(capture_stream, false /* host-launch */); + predecoders.push_back(std::move(pd)); } - - if (!use_host_dispatcher) { - CUDA_CHECK(cudaMalloc(&d_function_entries, - config.num_predecoders * sizeof(cudaq_function_entry_t))); - CUDA_CHECK(cudaMemcpy(d_function_entries, function_entries.data(), - config.num_predecoders * sizeof(cudaq_function_entry_t), - cudaMemcpyHostToDevice)); - - std::cout << "[Setup] Launching GPU Dispatcher Kernel...\n"; - CUDA_CHECK(cudaq_create_dispatch_graph_regular( - rx_flags_dev, tx_flags_dev, d_function_entries, config.num_predecoders, - d_global_mailbox_bank, d_shutdown_flag, d_stats, NUM_SLOTS, 1, 32, - capture_stream, &dispatch_ctx - )); - CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, capture_stream)); - } else { - std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; - } + + std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; std::atomic system_stop{false}; std::atomic total_claimed{0}; @@ -1104,170 +951,37 @@ using namespace cudaq::qec; } // ========================================================================= - // Test Stimulus + // Streaming test // ========================================================================= - if (streaming_mode) { - run_streaming_test(config, stream_cfg, - rx_data_host, rx_data_dev, rx_flags_host, tx_flags_host, - decoder_ctx, predecoders, system_stop, - h_mailbox_bank, predecoder_streams, &pool_ctx, &total_claimed); - } else { - const int batch_size = config.num_predecoders; - std::cout << "\n[Batch] Firing " << config.total_requests - << " syndromes in batches of " << batch_size - << " (" << config.label << ", error_rate=0.01)...\n"; - - cudaq::qec::utils::PipelineBenchmark bench(config.label, - config.total_requests); - std::mt19937 rng(42); - const size_t payload_bytes = config.input_bytes(); - int requests_sent = 0; - int responses_received = 0; - - bench.start(); - - for (int batch_start = 0; batch_start < config.total_requests; - batch_start += batch_size) { - int batch_end = std::min(batch_start + batch_size, config.total_requests); - - for (int i = batch_start; i < batch_end; ++i) { - int target_decoder = i % config.num_predecoders; - std::string target_func = "predecode_target_" - + std::to_string(target_decoder); - - int slot = i % (int)NUM_SLOTS; - while (rx_flags_host[slot].load(cuda::std::memory_order_acquire) != 0) usleep(10); - - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - auto* header = reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; - header->function_id = fnv1a_hash(target_func); - header->arg_len = static_cast(payload_bytes); - - int32_t* payload = reinterpret_cast( - slot_data + sizeof(cudaq::nvqlink::RPCHeader)); - fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - - bench.mark_submit(i); - rx_flags_host[slot].store(reinterpret_cast(slot_data), cuda::std::memory_order_release); - requests_sent++; - } - - for (int i = batch_start; i < batch_end; ++i) { - int slot = i % (int)NUM_SLOTS; - - auto deadline = std::chrono::steady_clock::now() - + std::chrono::seconds(10); - uint64_t tv = 0; - while ((tv = tx_flags_host[slot].load(cuda::std::memory_order_acquire)) == 0) { - if (std::chrono::steady_clock::now() > deadline) break; - QEC_CPU_RELAX(); - } - - if (tv != 0 && (tv >> 48) == 0xDEAD) { - int cuda_err = (int)(tv & 0xFFFF); - std::cerr << " [FAIL] Slot " << slot - << " cudaGraphLaunch error " << cuda_err - << " (" << cudaGetErrorString((cudaError_t)cuda_err) - << ")\n"; - } else if (tv != 0) { - bench.mark_complete(i); - responses_received++; - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - int32_t corrections = 0, converged = 0; - std::memcpy(&corrections, - slot_data + sizeof(cudaq::nvqlink::RPCResponse), - sizeof(int32_t)); - std::memcpy(&converged, - slot_data + sizeof(cudaq::nvqlink::RPCResponse) - + sizeof(int32_t), - sizeof(int32_t)); - std::cout << " -> Slot " << slot - << ": OK, corrections=" << corrections - << " converged=" << (converged ? "yes" : "no") << "\n"; - } else { - std::cerr << " [FAIL] Timeout waiting for slot " << slot << "\n"; - } - - tx_flags_host[slot].store(0, cuda::std::memory_order_release); - } - } - - bench.stop(); - - std::cout << "\n[Result] Processed " << responses_received << "/" - << requests_sent << " requests successfully.\n"; - - bench.report(); - - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - auto stats = bench.compute_stats(); - double avg_pipeline_overhead = stats.mean_us - avg_worker; - - std::cout << std::fixed << std::setprecision(1); - std::cout << "\n Worker Timing Breakdown (avg over " - << n_decoded << " requests):\n"; - std::cout << " PyMatching decode: " << std::setw(8) << avg_decode - << " us (" << std::setw(4) - << (100.0 * avg_decode / stats.mean_us) << "%)\n"; - std::cout << " Worker overhead: " << std::setw(8) << avg_overhead - << " us (" << std::setw(4) - << (100.0 * avg_overhead / stats.mean_us) << "%)\n"; - std::cout << " GPU+dispatch+poll: " << std::setw(8) - << avg_pipeline_overhead << " us (" << std::setw(4) - << (100.0 * avg_pipeline_overhead / stats.mean_us) << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(8) - << stats.mean_us << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(8) << (stats.mean_us / config.num_rounds) - << " us/round\n"; - } - } - + run_streaming_test(config, stream_cfg, + rx_data_host, rx_data_dev, rx_flags_host, tx_flags_host, + decoder_ctx, predecoders, system_stop, + h_mailbox_bank, predecoder_streams, &pool_ctx, &total_claimed); + // Teardown std::cout << "[Teardown] Shutting down...\n"; system_stop = true; - - if (!use_host_dispatcher) { - *shutdown_flag_host = 1; - __sync_synchronize(); - } - for (auto& t : worker_threads) { - if (t.joinable()) t.join(); - } - CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - - if (!use_host_dispatcher) { - uint64_t dispatched_packets = 0; - CUDA_CHECK(cudaMemcpy(&dispatched_packets, d_stats, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - std::cout << "[Stats] Dispatcher processed " << dispatched_packets << " packets.\n"; - CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + for (auto& t : worker_threads) { + if (t.joinable()) t.join(); } - + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + for (auto& s : predecoder_streams) { cudaStreamSynchronize(s); cudaStreamDestroy(s); } - + // Explicitly call destructors for libcu++ atomics before freeing memory for (size_t i = 0; i < NUM_SLOTS; ++i) { rx_flags_host[i].~atomic_uint64_sys(); tx_flags_host[i].~atomic_uint64_sys(); } - + cudaFreeHost(buf_rx); cudaFreeHost(buf_tx); cudaFreeHost(rx_data_host); cudaFreeHost(h_mailbox_bank); - if (shutdown_flag_host) cudaFreeHost(shutdown_flag_host); - if (d_global_mailbox_bank) cudaFree(d_global_mailbox_bank); - if (d_stats) cudaFree(d_stats); - if (d_function_entries) cudaFree(d_function_entries); cudaStreamDestroy(capture_stream); std::cout << "Done.\n"; diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 7c1a8215..e3c4c1bc 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -122,11 +122,20 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) list(APPEND _cudaq_realtime_prefixes "${CUDAQ_INSTALL_PREFIX}") endif() + # Realtime API lives under install prefix (CUDAQ_REALTIME_ROOT = install directory). + # Header layout: include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h find_path(CUDAQ_REALTIME_INCLUDE_DIR - NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h PATHS ${_cudaq_realtime_prefixes} - PATH_SUFFIXES include ../include + PATH_SUFFIXES include ) + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include ../include + ) + endif() find_library(CUDAQ_REALTIME_LIBRARY NAMES cudaq-realtime @@ -140,10 +149,25 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATH_SUFFIXES lib ) + # In-tree realtime (built from top-level add_subdirectory(realtime)) provides new API + set(_predecoder_use_in_tree_realtime FALSE) + if(TARGET cudaq-realtime) + set(_predecoder_use_in_tree_realtime TRUE) + message(STATUS "Using in-tree realtime (cudaq-realtime) for predecoder test") + endif() + + set(_have_realtime_for_tests FALSE) if(CUDAQ_REALTIME_INCLUDE_DIR AND CUDAQ_REALTIME_LIBRARY AND CUDAQ_REALTIME_DISPATCH_LIBRARY) + set(_have_realtime_for_tests TRUE) message(STATUS "Found cuda-quantum realtime headers at ${CUDAQ_REALTIME_INCLUDE_DIR}") message(STATUS "Found cuda-quantum realtime library at ${CUDAQ_REALTIME_LIBRARY}") message(STATUS "Found cuda-quantum realtime dispatch library at ${CUDAQ_REALTIME_DISPATCH_LIBRARY}") + endif() + if(TARGET cudaq-realtime) + set(_have_realtime_for_tests TRUE) + endif() + + if(_have_realtime_for_tests) add_executable(test_realtime_decoding ${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/test_realtime_decoding.cu @@ -218,7 +242,6 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu - ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/host_dispatcher.cpp ) set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES @@ -237,33 +260,57 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) get_filename_component(_cuda_root "${_cuda_bin}" DIRECTORY) set(_cuda_cccl_include "${_cuda_root}/include/cccl") + # Includes: in-tree realtime target brings include; else in-repo or install dir + set(_realtime_predecoder_includes "") + if(NOT _predecoder_use_in_tree_realtime) + set(_realtime_include "${CMAKE_SOURCE_DIR}/realtime/include") + if(EXISTS "${_realtime_include}/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h") + list(APPEND _realtime_predecoder_includes "${_realtime_include}") + endif() + endif() target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE ${_cuda_cccl_include} ${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../include ${CMAKE_SOURCE_DIR}/libs/core/include + ${_realtime_predecoder_includes} ${CUDAQ_REALTIME_INCLUDE_DIR} ) - target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE - CUDA::cudart - ${TENSORRT_LIBRARY} - ${TENSORRT_ONNX_PARSER_LIBRARY} - ${CUDAQ_REALTIME_LIBRARY} - ${CUDAQ_REALTIME_DISPATCH_LIBRARY} - cudaq-qec - cudaq::cudaq - ) - - target_link_directories(test_realtime_predecoder_w_pymatching PRIVATE - ${CMAKE_BINARY_DIR}/lib - ) - - set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - ) + if(_predecoder_use_in_tree_realtime) + target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE + CUDA::cudart + ${TENSORRT_LIBRARY} + ${TENSORRT_ONNX_PARSER_LIBRARY} + cudaq-realtime + cudaq-realtime-host-dispatch + cudaq-realtime-dispatch + cudaq-qec + cudaq::cudaq + ) + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" + INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" + ) + else() + target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE + CUDA::cudart + ${TENSORRT_LIBRARY} + ${TENSORRT_ONNX_PARSER_LIBRARY} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + cudaq-qec + cudaq::cudaq + ) + target_link_directories(test_realtime_predecoder_w_pymatching PRIVATE + ${CMAKE_BINARY_DIR}/lib + ) + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) + endif() add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) else() @@ -272,8 +319,8 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) else() message(WARNING "cuda-quantum realtime dependency not found. " - "Set CUDAQ_REALTIME_ROOT or CUDAQ_INSTALL_PREFIX to enable " - "test_realtime_decoding.") + "Set CUDAQ_REALTIME_ROOT or build with in-tree realtime to enable " + "test_realtime_decoding and test_realtime_predecoder_w_pymatching.") endif() endif() diff --git a/realtime/.clang-format b/realtime/.clang-format index 4b5d84be..4c6382a7 100644 --- a/realtime/.clang-format +++ b/realtime/.clang-format @@ -5,7 +5,7 @@ IncludeCategories: Priority: 4 - Regex: '^"cudaq/' Priority: 3 - - Regex: '^"(nvqlink|\.\.)/' + - Regex: '^"(realtime|\.\.)/' Priority: 2 - Regex: '.*' Priority: 1 diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt index 53db32b2..f5a78407 100644 --- a/realtime/CMakeLists.txt +++ b/realtime/CMakeLists.txt @@ -17,15 +17,17 @@ set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") # Set a default install prefix if none was specified. -set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.nvqlink" CACHE STRING +set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING "Install path prefix, prepended onto install directories") # Project setup # ============================================================================== -# Check if core is built as a standalone project. -project(cudaq-nvqlink) -set(CUDAQ_NVQLINK_STANDALONE_BUILD TRUE) +# Check if built as standalone (not as subdirectory of cudaqx). +project(cudaq-realtime) +if(NOT DEFINED CUDAQ_REALTIME_STANDALONE_BUILD) + set(CUDAQ_REALTIME_STANDALONE_BUILD TRUE) +endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -34,8 +36,8 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) -set(CUDAQ_NVQLINK_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -set(CUDAQ_NVQLINK_INCLUDE_DIR ${CUDAQ_NVQLINK_SOURCE_DIR}/include) +set(CUDAQ_REALTIME_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(CUDAQ_REALTIME_INCLUDE_DIR ${CUDAQ_REALTIME_SOURCE_DIR}/include) # Add cmake directory to module path for custom Find modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") @@ -43,26 +45,13 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # Options # ============================================================================== -option(NVQLINK_BUILD_TESTS - "Generate build targets for the NVQLINK unit tests" ON) -option(NVQLINK_BUILD_EXAMPLES - "Generate build targets for the NVQLINK example programs" ON) -option(NVQLINK_ENABLE_ROCE - "Enable RoCE backend using libibverbs" OFF) -option(NVQLINK_ENABLE_DOCA - "Enable DOCA GPUNetIO backend for GPU-controlled RDMA" OFF) - -# Profiler backend selection -set(NVQLINK_PROFILER_BACKEND "NONE" CACHE STRING "Profiler backend (NONE, NVTX, TRACY)") -set_property(CACHE NVQLINK_PROFILER_BACKEND PROPERTY STRINGS NONE NVTX TRACY) - -# Logging backend selection -set(NVQLINK_LOGGING_BACKEND "NONE" CACHE STRING "Logging backend (NONE, QUILL)") -set_property(CACHE NVQLINK_LOGGING_BACKEND PROPERTY STRINGS NONE QUILL) - -# Compile-time log level filtering (lower levels become no-ops) -set(NVQLINK_LOGGING_LEVEL "INFO" CACHE STRING "Minimum log level (TRACE, DEBUG, INFO, WARNING, ERROR)") -set_property(CACHE NVQLINK_LOGGING_LEVEL PROPERTY STRINGS TRACE DEBUG INFO WARNING ERROR) +option(CUDAQ_REALTIME_BUILD_TESTS + "Generate build targets for the CUDAQ real-time unit tests" ON) +option(CUDAQ_REALTIME_BUILD_EXAMPLES + "Generate build targets for the CUDAQ real-time example programs" ON) +option(CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS + "Build Hololink bridge/emulator/playback tools (requires hololink)." + OFF) # Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt) # ============================================================================== @@ -89,8 +78,8 @@ endfunction() if(CMAKE_CUDA_COMPILER) if (NOT CUDA_TARGET_ARCHS) - # Ampere, Ada Lovelace, Hopper - set(CUDA_TARGET_ARCHS "80;89;90") + # Ampere, Hopper + set(CUDA_TARGET_ARCHS "80;90") endif() CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options -fPIC") @@ -110,19 +99,19 @@ find_package(Threads REQUIRED) add_subdirectory(lib) -if (NVQLINK_BUILD_EXAMPLES) +if (CUDAQ_REALTIME_BUILD_EXAMPLES) message(STATUS "RoCE/DOCA examples removed for RPC dispatch workflow.") endif() -if (NVQLINK_BUILD_TESTS) - add_custom_target(NVQLINKUnitTests) +if (CUDAQ_REALTIME_BUILD_TESTS AND CUDAQ_REALTIME_STANDALONE_BUILD) + add_custom_target(CudaqRealtimeUnitTests) include(CTest) add_custom_target(run_tests COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH="${CUDAQ_INSTALL_DIR}:${CMAKE_BINARY_DIR}/python" ${CMAKE_CTEST_COMMAND} --output-on-failure - DEPENDS NVQLINKUnitTests + DEPENDS CudaqRealtimeUnitTests WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_subdirectory(unittests) diff --git a/realtime/README.md b/realtime/README.md index 5fec3286..5ebdd7db 100644 --- a/realtime/README.md +++ b/realtime/README.md @@ -1,41 +1,36 @@ # CUDA-Q Realtime Library -CUDA-Q Realtime is a library for tightly coupling GPU accelerated compute to the control system of a quantum processor. +CUDA-Q Realtime is a library for tightly coupling GPU accelerated compute +to the control system of a quantum processor. + It fulfills two primary responsibilities: -1. It provides the low-level basis of realtime coprocessing between FPGA and CPU-GPU systems. -1. It provides the low latency networking stack of the NVQLink architecture, enabling system integrators to achieve few-microsecond data round trips between FPGA and GPU. + +1. It provides the low-level basis of realtime coprocessing +between FPGA and CPU-GPU systems. + +2. It provides the low latency networking stack of the NVQLink architecture, +enabling system integrators to achieve few-microsecond +data round trips between FPGA and GPU. > [!WARNING] -> This library is currently in early access / alpha stage and will continue to rapidly evolve as we build interactively with collaborators. +> This library is currently in early access / alpha stage +> and will continue to rapidly evolve as we build interactively with collaborators. + + > [!NOTE] -> While the library is in early access, instructions to reproduce the FPGA-GPU latency round trip on third party systems can be found at [docs/nvqlink_latency_demo.md](docs/nvqlink_latency_demo.md). +> While the library is in early access, instructions to reproduce the FPGA-GPU latency +> round trip on third party systems can be found at [docs/nvqlink_latency_demo.md](docs/nvqlink_latency_demo.md). ## Getting Started ```bash # Configure, need cmake 3.28+ -cmake -G Ninja .. -DNVQLINK_BUILD_TESTS=ON +cmake -G Ninja .. -DCUDAQ_REALTIME_BUILD_TESTS=ON # Build ninja # Test ctest ``` -## Extending the library - -Check out the tests in the `unittests` folder as well as the example codes in `examples`. - -3rd parties can extend this library with new `device` types. The goal is to define -a subclass of `device_mixin` that allows you specify device traits that your `device` exposes. -There are a number of traits available, and they are specified in the `device.h` file. There are -example devices in the `devices/` folder there too. - -3rd parties can also provide custom compiler implementations. Compilers take generic -code strings and return a `compiled_kernel`. There is one compiler implemented as of -today, and it is the CUDA-Q compiler. For simplicity, this compiler simply delegates to -the command line CUDA-Q toolchain. Subclasses should be able to override the `cudaq-opt` -pass flags. This would allow one to handle CUDA-Q IR operations in a target specific manner -(e.g., custom lowering of the device_call op). - - +Check out the tests in the `unittests` folder for examples. diff --git a/realtime/docs/cudaq_realtime_host_api.html b/realtime/docs/cudaq_realtime_host_api.html new file mode 100644 index 00000000..0338ec07 --- /dev/null +++ b/realtime/docs/cudaq_realtime_host_api.html @@ -0,0 +1,2945 @@ + + + + + CUDA-Q Realtime Host API (Draft) + + + + + + + + + + + + + + + +
+

+

CUDA-Q Realtime Host API (Draft)

+

Published Proposal, +

+
+
+
Editor: +
(NVIDIA) +
Issue Tracking: +
GitHub +
+
+
+
+
+
+

Abstract

+

Host API, wiring, and usage for CUDA-Q realtime dispatch.

+
+
+ +
+

1. CUDA-Q Realtime Host API

+

This document explains the C host API for realtime dispatch, the RPC wire +protocol, and complete wiring examples. It is written for external partners +integrating CUDA-QX decoders with their own transport mechanisms. The API and +protocol are transport-agnostic and support multiple data transport options, +including NVIDIA Hololink (RDMA via ConnectX NICs), libibverbs, and proprietary +transport layers. Handlers can execute on GPU (via CUDA kernels) or CPU (via +host threads). Examples in this document use Hololink’s 3-kernel workflow (RX +kernel/dispatch/TX kernel) for illustration, but the same principles apply to +other transport mechanisms.

+ +

Hololink is NVIDIA’s low-latency sensor bridge framework that enables +direct GPU memory access from external devices (FPGAs, sensors) over Ethernet +using RDMA (Remote Direct Memory Access) via ConnectX NICs. In the context of +quantum error correction, Hololink is one example of a transport mechanism that +connects the quantum control system (typically an FPGA) to GPU-based decoders.

+

Repository: nvidia-holoscan/holoscan-sensor-bridge (nvqlink branch)

+

Hololink handles:

+
    +
  • +

    RX (Receive): RX kernel receives data from the FPGA directly into GPU memory via RDMA

    +
  • +

    TX (Transmit): TX kernel sends results back to the FPGA via RDMA

    +
  • +

    RDMA transport: Zero-copy data movement using ConnectX-7 NICs with GPUDirect support

    +
+

The CUDA-Q Realtime Host API provides the middle component (dispatch kernel or thread) that +sits between the transport’s RX and TX components, executing the actual decoder logic.

+

1.2. Transport Mechanisms # {#transport-mechanisms}

+

The realtime dispatch API is designed to work with multiple transport mechanisms +that move data between the quantum control system (FPGA) and the decoder. The +transport mechanism handles getting RPC messages into RX ring buffer slots and +sending responses from TX ring buffer slots back to the FPGA.

+

1.2.1. Supported Transport Options

+

Hololink (GPU-based with GPUDirect):

+
    +
  • +

    Uses ConnectX-7 NICs with RDMA for zero-copy data movement

    +
  • +

    RX and TX are persistent GPU kernels that directly access GPU memory

    +
  • +

    Requires GPUDirect support

    +
  • +

    Lowest latency option for GPU-based decoders

    +
+

libibverbs (CPU-based):

+
    +
  • +

    Standard InfiniBand Verbs API for RDMA on the CPU

    +
  • +

    RX and TX are host threads that poll CPU-accessible memory

    +
  • +

    Works with CPU-based dispatchers

    +
  • +

    Ring buffers reside in host memory (cudaHostAlloc or regular malloc)

    +
+

Proprietary Transport Mechanisms:

+
    +
  • +

    Custom implementations with or without GPUDirect support

    +
  • +

    May use different networking technologies or memory transfer methods

    +
  • +

    Must implement the ring buffer + flag protocol defined in this document

    +
  • +

    Can target either GPU (with suitable memory access) or CPU execution

    +
+

The key requirement is that the transport mechanism implements the ring buffer +slot + flag protocol: writing RPC messages to RX slots and setting rx_flags, +then reading TX slots after tx_flags are set.

+

1.3. The 3-Kernel Architecture (Hololink Example) # {#three-kernel-architecture}

+

The Hololink workflow separates concerns into three persistent GPU kernels that +communicate via shared ring buffers:

+

3-kernel architecture

+

1.3.1. Data Flow Summary # {#data-flow-summary}

+ + + + + + + + + + +
Step + Component + Action +
1-2 + FPGA → ConnectX + Detection event data sent over Ethernet, RDMA writes to GPU memory +
3 + RX Kernel + Frames detection events into RPC message, sets rx_flags[slot] (see Message completion note) +
4-5 + Dispatch Kernel + Polls for ready slots, looks up handler by function_id, executes decoder +
6 + Dispatch Kernel + Writes RPCResponse + correction, sets tx_flags[slot] +
7-8 + TX Kernel + Polls for responses, triggers RDMA send back to FPGA +
9 + ConnectX → FPGA + Correction delivered to quantum controller +
+

1.3.2. Why 3 Kernels? # {#why-3-kernels}

+
    +
  1. +

    Separation of concerns: Transport (RX/TX kernels) vs. compute (dispatch) are decoupled

    +
  2. +

    Reusability: Same dispatch kernel works with any decoder handler

    +
  3. +

    Testability: Dispatch kernel can be tested without Hololink hardware

    +
  4. +

    Flexibility: RX/TX kernels can be replaced with different transport mechanisms

    +
  5. +

    Transport independence: The protocol works with Hololink, libibverbs, or proprietary transports

    +
+

1.4. What This API Does (In One Paragraph) # {#what-this-does}

+

The host API wires a dispatcher (GPU kernel or CPU thread) to shared ring buffers. +The transport mechanism (e.g., Hololink RX/TX kernels, libibverbs threads, or +proprietary transport) places incoming RPC messages into RX slots and retrieves +responses from TX slots. +The dispatcher polls RX flags (see Message completion note), looks up a +handler by function_id, executes it on the GPU, and writes a response into the +same slot. Hololink’s RX/TX kernels handle device I/O; the dispatch kernel sits +in the middle and runs the decoder handler.

+

1.5. Scope # {#scope}

+
    +
  • +

    C host API in cudaq_realtime.h

    +
  • +

    RPC messaging protocol (header + payload + response)

    +
  • +

    End-to-end example using the mock decoder in cudaqx

    +
  • +

    NIC-free testing path

    +
+

1.6. Terms and Components # {#terms}

+
    +
  • +

    Ring buffer: Fixed-size slots holding RPC messages (see Message completion note). Each slot has an RX flag and a TX flag.

    +
  • +

    RX flag: Nonzero means a slot is ready to be processed.

    +
  • +

    TX flag: Nonzero means a response is ready to send.

    +
  • +

    Dispatcher: Component that processes RPC messages (GPU kernel or CPU thread).

    +
  • +

    Handler: Function registered in the function table that processes specific message types.

    +
  • +

    Function table: Array of handler function pointers + IDs + schemas.

    +
+

1.7. Schema Data Structures # {#schema-structures}

+

Each handler registered in the function table includes a schema that describes +its argument and result types.

+

1.7.1. Type Descriptors

+
// Standardized payload type identifiersenum PayloadTypeID : uint8_t {  TYPE_UINT8           = 0x10,  TYPE_INT32           = 0x11,  TYPE_INT64           = 0x12,  TYPE_FLOAT32         = 0x13,  TYPE_FLOAT64         = 0x14,  TYPE_ARRAY_UINT8     = 0x20,  TYPE_ARRAY_INT32     = 0x21,  TYPE_ARRAY_FLOAT32   = 0x22,  TYPE_ARRAY_FLOAT64   = 0x23,  TYPE_BIT_PACKED      = 0x30   // Bit-packed data (LSB-first)};struct cudaq_type_desc_t {  uint8_t  type_id;       // PayloadTypeID value  uint8_t  reserved[3];  uint32_t size_bytes;    // Total size in bytes  uint32_t num_elements;  // Interpretation depends on type_id};
+

The num_elements field interpretation:

+
    +
  • +

    Scalar types (TYPE_UINT8, TYPE_INT32, etc.): unused, set to 1

    +
  • +

    Array types (TYPE_ARRAY_*): number of array elements

    +
  • +

    TYPE_BIT_PACKED: number of bits (not bytes)

    +
+

1.7.2. Handler Schema

+
struct cudaq_handler_schema_t {  uint8_t  num_args;              // Number of input arguments  uint8_t  num_results;           // Number of return values  uint16_t reserved;  cudaq_type_desc_t args[8];      // Argument type descriptors  cudaq_type_desc_t results[4];   // Result type descriptors};
+

Limits:

+
    +
  • +

    Maximum 8 arguments per handler

    +
  • +

    Maximum 4 results per handler

    +
  • +

    Total payload size must fit in slot: slot_size - sizeof(RPCHeader)

    +
+

1.8. RPC Messaging Protocol # {#rpc-protocol}

+

Each RX ring buffer slot contains an RPC request. The dispatcher writes the +response to the corresponding TX ring buffer slot.

+
RX Slot: | RPCHeader | request payload bytes |TX Slot: | RPCResponse | response payload bytes |
+

Payload encoding details (type system, multi-argument encoding, bit-packing, +and QEC-specific examples) are defined in cudaq_realtime_message_protocol.bs.

+

Magic values (little-endian 32-bit):

+
    +
  • +

    RPC_MAGIC_REQUEST = 0x43555152 ('CUQR')

    +
  • +

    RPC_MAGIC_RESPONSE = 0x43555153 ('CUQS')

    +
+
// Wire format (byte layout must match dispatch_kernel.cuh)struct RPCHeader {  uint32_t magic;        // RPC_MAGIC_REQUEST  uint32_t function_id;  // fnv1a_hash("handler_name")  uint32_t arg_len;      // payload bytes following this header};struct RPCResponse {  uint32_t magic;        // RPC_MAGIC_RESPONSE  int32_t  status;       // 0 = success  uint32_t result_len;   // bytes of response payload};
+

Payload conventions:

+
    +
  • +

    Request payload: argument data as specified by handler schema.

    +
  • +

    Response payload: result data as specified by handler schema.

    +
  • +

    Size limit: payload must fit in one slot. max_payload_bytes = slot_size - sizeof(RPCHeader).

    +
  • +

    Multi-argument encoding: arguments concatenated in schema order (see message protocol doc).

    +
+

1.9. Host API Overview # {#api-overview}

+

Header: realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h

+

1.10. Manager and Dispatcher Topology # {#manager-dispatcher}

+

The manager is a lightweight owner for one or more dispatchers. Each dispatcher +is configured independently (e.g., vp_id, kernel_type, dispatch_mode) and +can target different workloads.

+

Manager and dispatcher topology

+

1.11. Host API Functions # {#api-functions}

+

Function usage:

+

cudaq_dispatch_manager_create creates the top-level manager that owns +dispatchers.

+

Parameters:

+
    +
  • +

    out_mgr: receives the created manager handle.

    +
+

Call this once near program startup and keep the manager alive for the +lifetime of the dispatch subsystem.

+

cudaq_dispatch_manager_destroy releases the manager and any internal +resources.

+

Parameters:

+
    +
  • +

    mgr: manager handle to destroy.

    +
+

Call this after all dispatchers have been destroyed and the program is +shutting down.

+

cudaq_dispatcher_create allocates a dispatcher instance and validates the +configuration.

+

Parameters:

+
    +
  • +

    mgr: owning manager.

    +
  • +

    config: filled cudaq_dispatcher_config_t with:

    +
  • +

    device_id (default 0): selects the CUDA device for the dispatcher

    +
  • +

    num_blocks (default 1)

    +
  • +

    threads_per_block (default 32)

    +
  • +

    num_slots (required)

    +
  • +

    slot_size (required)

    +
  • +

    vp_id (default 0): tags a dispatcher to a transport channel. Queue pair selection and NIC port/IP binding are configured in Hololink, not in this API.

    +
  • +

    kernel_type (default CUDAQ_KERNEL_REGULAR)

    +
      +
    • +

      CUDAQ_KERNEL_REGULAR: standard kernel launch

      +
    • +

      CUDAQ_KERNEL_COOPERATIVE: cooperative launch (grid.sync() capable)

      +
    +
  • +

    dispatch_mode (default CUDAQ_DISPATCH_DEVICE_CALL)

    +
      +
    • +

      CUDAQ_DISPATCH_DEVICE_CALL: direct __device__ handler call (lowest latency)

      +
    • +

      CUDAQ_DISPATCH_GRAPH_LAUNCH: CUDA graph launch from device code (requires sm_90+, Hopper or later GPUs)

      +
    +
  • +

    out_dispatcher: receives the created dispatcher handle.

    +
+

Call this before wiring ring buffers, function tables, or control state.

+

cudaq_dispatcher_destroy releases a dispatcher after it has been stopped.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle to destroy.

    +
+

Call this when the dispatcher is no longer needed.

+

cudaq_dispatcher_set_ringbuffer provides the RX/TX flag and data +pointers the dispatch kernel will poll and use for request/response slots.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
  • +

    ringbuffer: cudaq_ringbuffer_t with:

    +
  • +

    rx_flags: device-visible pointer to RX flags.

    +
  • +

    tx_flags: device-visible pointer to TX flags.

    +
  • +

    rx_data: device-visible pointer to RX slot data (request payloads).

    +
  • +

    tx_data: device-visible pointer to TX slot data (response payloads).

    +
  • +

    rx_stride_sz: size in bytes of each RX slot.

    +
  • +

    tx_stride_sz: size in bytes of each TX slot.

    +
+

Call this before cudaq_dispatcher_start, after allocating mapped host memory +or device memory for the ring buffers.

+

cudaq_dispatcher_set_function_table supplies the function table +containing handler pointers, IDs, and schemas.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
  • +

    table: cudaq_function_table_t with:

    +
  • +

    entries: device pointer to array of cudaq_function_entry_t.

    +
  • +

    count: number of entries in the table.

    +
+
// Unified function table entry with schemastruct cudaq_function_entry_t {  union {    void*           device_fn_ptr;   // for CUDAQ_DISPATCH_DEVICE_CALL    cudaGraphExec_t graph_exec;      // for CUDAQ_DISPATCH_GRAPH_LAUNCH  } handler;  uint32_t                function_id;  uint8_t                 dispatch_mode;   // Per-handler dispatch mode  uint8_t                 reserved[3];  cudaq_handler_schema_t  schema;          // Handler interface schema};struct cudaq_function_table_t {  cudaq_function_entry_t* entries;   // Device pointer to entry array  uint32_t                count;     // Number of entries};
+

Call this after initializing the device-side function table entries. +Each entry contains a handler pointer (or graph), function_id, dispatch mode, +and schema describing the handler’s interface.

+

Function ID semantics:

+
    +
  • +

    function_id is the 32-bit FNV-1a hash of the handler name string.

    +
  • +

    The handler name is the string you hash when populating entries; there is no separate runtime registration call.

    +
  • +

    If no entry matches, the dispatcher clears the slot without a response.

    +
  • +

    Suggested: use stable, human-readable handler names (e.g., "mock_decode").

    +
+

cudaq_dispatcher_set_control supplies the shutdown flag and stats buffer +the dispatch kernel uses for termination and bookkeeping.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
  • +

    shutdown_flag: device-visible flag used to signal shutdown.

    +
  • +

    stats: device-visible stats buffer.

    +
+

Call this before starting the dispatcher; both buffers must remain valid for +the dispatcher’s lifetime.

+

cudaq_dispatcher_set_launch_fn provides the host-side launch wrapper that +invokes the dispatch kernel with the correct grid/block dimensions.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
  • +

    launch_fn: host launch function pointer.

    +
+

Call this once during setup. Typically you pass one of the provided launch functions:

+
    +
  • +

    cudaq_launch_dispatch_kernel_regular - for CUDAQ_KERNEL_REGULAR mode

    +
  • +

    cudaq_launch_dispatch_kernel_cooperative - for CUDAQ_KERNEL_COOPERATIVE mode

    +
+

cudaq_dispatcher_start launches the persistent dispatch kernel and begins +processing slots.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
+

Call this only after ring buffers, function table, control buffers, and launch +function are set.

+

cudaq_dispatcher_stop signals the dispatch kernel to exit and waits for it +to shut down.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
+

Call this during teardown before destroying the dispatcher.

+

cudaq_dispatcher_get_processed reads the processed‑packet counter from the +stats buffer to support debugging or throughput tracking.

+

Parameters:

+
    +
  • +

    dispatcher: dispatcher handle.

    +
  • +

    out_packets: receives the processed packet count.

    +
+

1.11.1. Occupancy Query and Eager Module Loading # {#occupancy-query}

+

Before calling cudaq_dispatcher_start, call the appropriate occupancy query +to force eager loading of the dispatch kernel module. This avoids lazy-load +deadlocks when the dispatch kernel and transport kernels (e.g., Hololink RX/TX) +run as persistent kernels.

+

cudaq_dispatch_kernel_query_occupancy returns the +maximum number of active blocks per multiprocessor for the regular dispatch +kernel.

+

Parameters:

+
    +
  • +

    out_blocks: receives the max blocks per SM (or 0 on error).

    +
  • +

    threads_per_block: block size used for the occupancy calculation.

    +
+

Returns cudaSuccess on success. Call this when kernel_type is +CUDAQ_KERNEL_REGULAR.

+

cudaq_dispatch_kernel_cooperative_query_occupancy +returns the maximum number of active blocks per multiprocessor for the +cooperative dispatch kernel.

+

Parameters:

+
    +
  • +

    out_blocks: receives the max blocks per SM (or 0 on error).

    +
  • +

    threads_per_block: block size used for the occupancy calculation (e.g., 128 for cooperative decoders).

    +
+

Returns cudaSuccess on success. Call this when kernel_type is +CUDAQ_KERNEL_COOPERATIVE. Use the same threads_per_block value that will +be passed to the dispatcher config and launch function.

+

Call the occupancy function that matches the dispatcher’s kernel_type once +before cudaq_dispatcher_start; the result can be used to size the dispatch +grid (e.g., to reserve SMs for transport kernels).

+

Lifetime/ownership:

+
    +
  • +

    All resources are assumed to live for the program lifetime.

    +
  • +

    The API does not take ownership of host-allocated memory.

    +
+

Threading:

+
    +
  • +

    Single-threaded host usage; create/wire/start/stop from one thread.

    +
+

Error handling:

+
    +
  • +

    All calls return cudaq_status_t.

    +
  • +

    CUDAQ_ERR_INVALID_ARG for missing pointers or invalid config.

    +
  • +

    CUDAQ_ERR_CUDA for CUDA API failures during start/stop.

    +
+

1.11.2. Graph-Based Dispatch Functions

+

The following functions are only available when using CUDAQ_DISPATCH_GRAPH_LAUNCH mode with sm_90+ GPUs:

+

cudaq_create_dispatch_graph_regular creates a graph-based dispatch context that enables device-side graph launching.

+

Parameters:

+
    +
  • +

    rx_flags: device-visible pointer to RX ring buffer flags

    +
  • +

    tx_flags: device-visible pointer to TX ring buffer flags

    +
  • +

    function_table: device pointer to function table entries

    +
  • +

    func_count: number of function table entries

    +
  • +

    graph_buffer_ptr: device pointer for graph buffer communication

    +
  • +

    shutdown_flag: device-visible shutdown flag

    +
  • +

    stats: device-visible stats buffer

    +
  • +

    num_slots: number of ring buffer slots

    +
  • +

    num_blocks: grid size for dispatch kernel

    +
  • +

    threads_per_block: block size for dispatch kernel

    +
  • +

    stream: CUDA stream for graph operations

    +
  • +

    out_context: receives the created graph context handle

    +
+

Returns cudaSuccess on success, or CUDA error code on failure.

+

This function creates a graph containing the dispatch kernel, instantiates it with cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. The resulting graph context enables device-side cudaGraphLaunch() calls from within handlers.

+

cudaq_launch_dispatch_graph launches the dispatch graph to begin processing RPC messages.

+

Parameters:

+
    +
  • +

    context: graph context handle from cudaq_create_dispatch_graph_regular

    +
  • +

    stream: CUDA stream for graph launch

    +
+

Returns cudaSuccess on success, or CUDA error code on failure.

+

Call this to start the persistent dispatch kernel. The kernel will continue running until the shutdown flag is set.

+

cudaq_destroy_dispatch_graph destroys the graph context and releases all associated resources.

+

Parameters:

+
    +
  • +

    context: graph context handle to destroy

    +
+

Returns cudaSuccess on success, or CUDA error code on failure.

+

Call this after the dispatch kernel has exited (shutdown flag was set) to clean up graph resources.

+

1.11.3. Kernel Launch Helper Functions

+

The following helper functions are provided for use with cudaq_dispatcher_set_launch_fn():

+

cudaq_launch_dispatch_kernel_regular launches the dispatch kernel in regular (non-cooperative) mode.

+

Parameters:

+
    +
  • +

    rx_flags: device-visible pointer to RX ring buffer flags

    +
  • +

    tx_flags: device-visible pointer to TX ring buffer flags

    +
  • +

    function_table: device pointer to function table entries

    +
  • +

    func_count: number of function table entries

    +
  • +

    shutdown_flag: device-visible shutdown flag

    +
  • +

    stats: device-visible stats buffer

    +
  • +

    num_slots: number of ring buffer slots

    +
  • +

    num_blocks: grid size for dispatch kernel

    +
  • +

    threads_per_block: block size for dispatch kernel

    +
  • +

    stream: CUDA stream for kernel launch

    +
+

Use this when kernel_type is set to CUDAQ_KERNEL_REGULAR in the dispatcher configuration.

+

cudaq_launch_dispatch_kernel_cooperative launches the dispatch kernel in cooperative mode.

+

Parameters: Same as cudaq_launch_dispatch_kernel_regular.

+

Use this when kernel_type is set to CUDAQ_KERNEL_COOPERATIVE in the dispatcher configuration. This enables the dispatch kernel and handlers to use grid-wide synchronization via cooperative_groups::this_grid().sync().

+

1.12. Memory Layout and Ring Buffer Wiring # {#memory-layout}

+

Each slot is a fixed-size byte region:

+
| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) |
+

Unused padding is the remaining bytes in the fixed-size slot after the header +and payload.

+

Flags (both are uint64_t arrays of slot flags):

+
    +
  • +

    rx_flags[slot] is set by the producer to a non-zero value when a slot is ready.

    +
  • +

    tx_flags[slot] is set by the dispatch kernel to a non-zero value when the response is ready.

    +
+

Message completion note: +An RPC message may be delivered as multiple RDMA writes into a single slot. +Completion is signaled only after the final write (often an RDMA write with +immediate) sets rx_flags[slot] to a non-zero value. The dispatch kernel treats +the slot as complete only after the flag is set.

+

In the NIC-free path, flags and data are allocated with +cudaHostAllocMapped so the device and host see the same memory.

+

1.13. Step-by-Step: Wiring the Host API (Minimal) # {#wiring}

+

The snippet below is real code from +cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu:

+
// Host API wiringASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK);cudaq_dispatcher_config_t config{};config.device_id = 0;config.num_blocks = 1;config.threads_per_block = 32;config.num_slots = static_cast<uint32_t>(num_slots_);config.slot_size = static_cast<uint32_t>(slot_size_);config.vp_id = 0;config.kernel_type = CUDAQ_KERNEL_REGULAR;config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK);cudaq_ringbuffer_t ringbuffer{};ringbuffer.rx_flags = rx_flags_;ringbuffer.tx_flags = tx_flags_;ringbuffer.rx_data = rx_data_;ringbuffer.tx_data = tx_data_;ringbuffer.rx_stride_sz = slot_size_;ringbuffer.tx_stride_sz = slot_size_;ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK);// Allocate and initialize function table entriescudaq_function_entry_t* d_entries;cudaMalloc(&d_entries, func_count_ * sizeof(cudaq_function_entry_t));// Initialize entries on device (including schemas)init_function_table<<<1, 1>>>(d_entries);cudaDeviceSynchronize();cudaq_function_table_t table{};table.entries = d_entries;table.count = func_count_;ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_),          CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_set_launch_fn(dispatcher_, &launch_dispatch_kernel_wrapper),          CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK);
+

1.14. Device Handler and Function ID # {#device-handler}

+

Real code from test_realtime_decoding.cu:

+
// The dispatcher uses function_id to find the handlerconstexpr std::uint32_t MOCK_DECODE_FUNCTION_ID =    cudaq::realtime::fnv1a_hash("mock_decode");/// @brief Initialize the device function table with schema__global__ void init_function_table(cudaq_function_entry_t* entries) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    // Entry 0: Mock decoder    entries[0].handler.device_fn_ptr =         reinterpret_cast<void*>(&cudaq::qec::realtime::mock_decode_rpc);    entries[0].function_id = MOCK_DECODE_FUNCTION_ID;    entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;    // Schema: 1 arg (bit-packed detection events), 1 result (correction byte)    entries[0].schema.num_args = 1;    entries[0].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};  // 128 bits    entries[0].schema.num_results = 1;    entries[0].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
+

1.14.1. Multi-Argument Handler Example

+
constexpr std::uint32_t ADVANCED_DECODE_FUNCTION_ID =    cudaq::realtime::fnv1a_hash("advanced_decode");__global__ void init_advanced_handler(cudaq_function_entry_t* entries,                                        uint32_t index) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    entries[index].handler.device_fn_ptr =         reinterpret_cast<void*>(&advanced_decode_rpc);    entries[index].function_id = ADVANCED_DECODE_FUNCTION_ID;    entries[index].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;    // Schema: 2 args (detection events + calibration), 1 result    entries[index].schema.num_args = 2;    entries[index].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};    entries[index].schema.args[1] = {TYPE_ARRAY_FLOAT32, {0}, 64, 16};  // 16 floats    entries[index].schema.num_results = 1;    entries[index].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
+

1.15. CUDA Graph Dispatch Mode # {#graph-dispatch}

+

The CUDAQ_DISPATCH_GRAPH_LAUNCH mode enables handlers to be executed as pre-captured CUDA graphs launched from device code. This is useful for complex multi-kernel workflows that benefit from graph optimization and can reduce kernel launch overhead for sophisticated decoders.

+

1.15.1. Requirements

+
    +
  • +

    GPU Architecture: Compute capability 9.0 or higher (Hopper H100 or later)

    +
  • +

    CUDA Version: CUDA 12.0+ with device-side graph launch support

    +
  • +

    Graph Setup: Handler graphs must be captured and instantiated with cudaGraphInstantiateFlagDeviceLaunch

    +
+

1.15.2. Graph-Based Dispatch API

+

The API provides functions to properly wrap the dispatch kernel in a graph context that enables device-side cudaGraphLaunch():

+
// Opaque handle for graph-based dispatch contexttypedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context;// Create a graph-based dispatch contextcudaError_t cudaq_create_dispatch_graph_regular(    volatile uint64_t *rx_flags, volatile uint64_t *tx_flags,    cudaq_function_entry_t *function_table, size_t func_count,    void **graph_buffer_ptr, volatile int *shutdown_flag, uint64_t *stats,    size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block,    cudaStream_t stream, cudaq_dispatch_graph_context **out_context);// Launch the dispatch graphcudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context,                                        cudaStream_t stream);// Destroy the dispatch graph contextcudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context);
+

1.15.3. Graph Handler Setup Example

+
/// @brief Initialize function table with CUDA graph handler__global__ void init_function_table_graph(cudaq_function_entry_t* entries) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    entries[0].handler.graph_exec = /* pre-captured cudaGraphExec_t */;    entries[0].function_id = DECODE_FUNCTION_ID;    entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;    // Schema: same as device call mode    entries[0].schema.num_args = 1;    entries[0].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};    entries[0].schema.num_results = 1;    entries[0].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
+

1.15.4. Graph Capture and Instantiation

+

Handler graphs must be captured and instantiated with the device launch flag:

+
cudaStream_t capture_stream;cudaStreamCreate(&capture_stream);// Capture the decoder kernel(s) into a graphcudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeGlobal);decode_kernel<<<blocks, threads, 0, capture_stream>>>(args...);cudaStreamEndCapture(capture_stream, &graph);// Instantiate with device launch flag (required for device-side cudaGraphLaunch)cudaGraphExec_t graph_exec;cudaGraphInstantiateWithFlags(&graph_exec, graph,                               cudaGraphInstantiateFlagDeviceLaunch);// Upload graph to devicecudaGraphUpload(graph_exec, capture_stream);cudaStreamSynchronize(capture_stream);cudaStreamDestroy(capture_stream);
+

1.15.5. When to Use Graph Dispatch

+

Use CUDAQ_DISPATCH_GRAPH_LAUNCH mode with the graph-based dispatch API when handlers need to launch CUDA graphs from device code. The graph-based dispatch API (cudaq_create_dispatch_graph_regular() + cudaq_launch_dispatch_graph()) wraps the dispatch kernel in a graph execution context, enabling device-side cudaGraphLaunch() calls from within handlers.

+

1.15.6. Graph vs Device Call Dispatch

+

Device Call Mode (CUDAQ_DISPATCH_DEVICE_CALL):

+
    +
  • +

    Lowest latency for simple handlers

    +
  • +

    Direct __device__ function call from dispatcher

    +
  • +

    Suitable for lightweight decoders and data transformations

    +
  • +

    No special hardware requirements

    +
+

Graph Launch Mode (CUDAQ_DISPATCH_GRAPH_LAUNCH):

+
    +
  • +

    Enables complex multi-kernel workflows

    +
  • +

    Benefits from CUDA graph optimizations

    +
  • +

    Requires sm_90+ hardware (Hopper or later)

    +
  • +

    Higher setup overhead but can reduce per-invocation latency for complex pipelines

    +
+

1.16. Building and Sending an RPC Message # {#build-rpc}

+

Real code from test_realtime_decoding.cu:

+

Note: this host-side snippet emulates what the external device/FPGA would do +when populating RX slots in a Hololink deployment.

+
/// @brief Write detection events to RX buffer in RPC format.void write_rpc_request(std::size_t slot, const std::vector<uint8_t>& measurements) {  uint8_t* slot_data = const_cast<uint8_t*>(rx_data_host_) + slot * slot_size_;  // Write RPCHeader  cudaq::realtime::RPCHeader* header =      reinterpret_cast<cudaq::realtime::RPCHeader*>(slot_data);  header->magic = cudaq::realtime::RPC_MAGIC_REQUEST;  header->function_id = MOCK_DECODE_FUNCTION_ID;  header->arg_len = static_cast<std::uint32_t>(measurements.size());  // Write measurement data after header  memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader),         measurements.data(), measurements.size());}
+

1.17. Reading the Response # {#read-response}

+

Real code from test_realtime_decoding.cu:

+

Note: this host-side snippet emulates what the external device/FPGA would do +when consuming TX slots in a Hololink deployment.

+
/// @brief Read response from TX buffer./// Responses are written by the dispatch kernel to the TX ring buffer; read from tx_data, not rx_data.bool read_rpc_response(std::size_t slot, uint8_t& correction,                       std::int32_t* status_out = nullptr,                       std::uint32_t* result_len_out = nullptr) {  __sync_synchronize();  const uint8_t* slot_data = const_cast<uint8_t*>(tx_data_host_) + slot * slot_size_;  // Read RPCResponse  const cudaq::realtime::RPCResponse* response =      reinterpret_cast<const cudaq::realtime::RPCResponse*>(slot_data);  if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) {    return false;  }  if (status_out)    *status_out = response->status;  if (result_len_out)    *result_len_out = response->result_len;  if (response->status != 0) {    return false;  }  // Read correction data after response header  correction = *(slot_data + sizeof(cudaq::realtime::RPCResponse));  return true;}
+

1.18. Schema-Driven Argument Parsing # {#schema-parsing}

+

The dispatcher uses the handler schema to interpret the typeless payload bytes. +This example shows conceptual parsing logic:

+
__device__ void parse_args_from_payload(    const uint8_t* payload,    const cudaq_handler_schema_t& schema,    void** arg_ptrs) {  uint32_t offset = 0;  for (uint8_t i = 0; i < schema.num_args; i++) {    arg_ptrs[i] = const_cast<uint8_t*>(payload + offset);    offset += schema.args[i].size_bytes;  }}__device__ void dispatch_with_schema(    uint8_t* slot_data,    const cudaq_function_entry_t& entry) {  RPCHeader* hdr = reinterpret_cast<RPCHeader*>(slot_data);  uint8_t* payload = slot_data + sizeof(RPCHeader);  // Parse arguments using schema  void* arg_ptrs[8];  parse_args_from_payload(payload, entry.schema, arg_ptrs);  // Call handler with parsed arguments  if (entry.dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {    auto handler = reinterpret_cast<HandlerFn>(entry.handler.device_fn_ptr);    handler(arg_ptrs, entry.schema.num_args, /* result buffer */);  }  // ... graph launch path uses same parsed args}
+

For multi-argument payloads, arguments are concatenated in schema order:

+
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |             ^            ^            ^             offset=0     offset=16    offset=80
+

The schema specifies the size of each argument, allowing the dispatcher to +compute offsets.

+ +

See the 3-Kernel Architecture diagram above for +the complete data flow. The key integration points are:

+

Ring buffer handoff (RX → Dispatch):

+
// Hololink RX kernel sets this after writing detection event datarx_flags[slot] = device_ptr_to_slot_data;
+

Ring buffer handoff (Dispatch → TX):

+
// Dispatch kernel sets this after writing RPCResponsetx_flags[slot] = device_ptr_to_slot_data;
+

Latency path: The critical path is:

+
    +
  1. +

    RDMA write completes → RX kernel signals → Dispatch polls and processes → TX kernel polls and sends → RDMA read completes

    +
+

All three kernels are persistent (launched once, run indefinitely), so +there is no kernel launch overhead in the hot path.

+

1.20. NIC-Free Testing (No Hololink / No ConnectX-7) # {#nic-free}

+

Emulate RX/TX with mapped host memory:

+
    +
  • +

    cudaqx mock-decoder test:

    +
  • +

    libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    +
  • +

    cuda-quantum host API test:

    +
  • +

    realtime/unittests/test_dispatch_kernel.cu

    +
+

Detection event file convention used by the tests:

+
    +
  • +

    Each ROUND_START block represents one decoding round.

    +
  • +

    Only the numeric detection event values are encoded into the payload (do not send the ROUND_START tokens).

    +
+

Note: Existing test files may use SHOT_START for backwards compatibility; this should be interpreted as ROUND_START in the context of realtime decoding.

+

1.21. Mock Decoder Example (cudaqx) # {#mock-decoder}

+

The mock decoder is registered as an RPC handler and invoked by the dispatch +kernel. The tests show end-to-end wiring with detection events loaded from +the detection event file.

+

See:

+
    +
  • +

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    +
+

1.22. Troubleshooting # {#troubleshooting}

+
    +
  • +

    Timeout waiting for TX: ensure the RX flag points to device-mapped memory.

    +
  • +

    Invalid arg: check slot_size, num_slots, function table pointers.

    +
  • +

    CUDA errors: verify device_id, and that CUDA is initialized.

    +
+

1.23. References # {#references}

+
    +
  • +

    cuda-quantum/realtime/unittests/test_dispatch_kernel.cu

    +
  • +

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    +
+
+ \ No newline at end of file diff --git a/realtime/docs/cudaq_realtime_message_protocol.html b/realtime/docs/cudaq_realtime_message_protocol.html new file mode 100644 index 00000000..2e9e98df --- /dev/null +++ b/realtime/docs/cudaq_realtime_message_protocol.html @@ -0,0 +1,2513 @@ + + + + + CUDA-Q Realtime Messaging Protocol (Draft) + + + + + + + + + + + + + + + +
+

+

CUDA-Q Realtime Messaging Protocol (Draft)

+

Published Proposal, +

+
+
+
Editor: +
(NVIDIA) +
Issue Tracking: +
GitHub +
+
+
+
+
+
+

Abstract

+

RPC payload encoding and message conventions for realtime dispatch.

+
+
+ +
+

1. CUDA-Q Realtime Messaging Protocol

+

This document defines the RPC (Remote Procedure Call) payload encoding used by the realtime dispatch kernel for processing data and returning results. It complements +cudaq_realtime_host_api.bs, which focuses on wiring and API usage.

+

1.1. Scope # {#scope}

+
    +
  • +

    RPC header/response wire format

    +
  • +

    Payload encoding and type system

    +
  • +

    Schema contract and payload interpretation

    +
  • +

    Function dispatch semantics

    +
+

Note: This protocol is hardware-agnostic. While the companion document +cudaq_realtime_host_api.bs provides implementation details for both GPU and +CPU-based dispatchers, the wire format and encoding rules specified here apply +universally.

+

1.2. RPC Header / Response # {#rpc-header}

+

Each ring-buffer slot is interpreted as:

+
| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) |
+
struct RPCHeader {  uint32_t magic;        // RPC_MAGIC_REQUEST  uint32_t function_id;  // fnv1a_hash("handler_name")  uint32_t arg_len;      // payload bytes following this header};struct RPCResponse {  uint32_t magic;        // RPC_MAGIC_RESPONSE  int32_t  status;       // 0 = success  uint32_t result_len;   // bytes of response payload};
+

Magic values (little-endian 32-bit):

+
    +
  • +

    RPC_MAGIC_REQUEST = 0x43555152 ('CUQR')

    +
  • +

    RPC_MAGIC_RESPONSE = 0x43555153 ('CUQS')

    +
+

1.3. Function ID Semantics # {#function-id}

+

function_id selects which handler the dispatcher invokes for a given RPC +message. The dispatcher performs a lookup in the function table (array of +function pointers + IDs) and calls the matching entry.

+

See cudaq_realtime_host_api.bs for function ID hashing, handler naming, and function +table registration details.

+

1.4. Schema and Payload Interpretation # {#schema-interpretation}

+

The RPC payload is typeless on the wire. The bytes following RPCHeader +are an opaque blob from the protocol’s perspective.

+

Payload interpretation is defined by the handler schema, which is registered +in the dispatcher’s function table during setup (see cudaq_realtime_host_api.bs). +The schema specifies:

+
    +
  • +

    Number of arguments

    +
  • +

    Type and size of each argument

    +
  • +

    Number of return values

    +
  • +

    Type and size of each return value

    +
+

Out-of-band contract: The client (e.g., FPGA) firmware and dispatcher function +table must agree on the schema for each function_id. Schema mismatches are detected +during integration testing, not at runtime.

+

For handlers with multiple arguments, the payload is a concatenation of +argument data in schema order:

+
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
+

The dispatcher uses the schema to determine where each argument begins and ends within +the payload.

+

1.4.1. Type System # {#type-system}

+

Standardized payload type identifiers used in handler schemas:

+
enum PayloadTypeID : uint8_t {  TYPE_UINT8           = 0x10,  TYPE_INT32           = 0x11,  TYPE_INT64           = 0x12,  TYPE_FLOAT32         = 0x13,  TYPE_FLOAT64         = 0x14,  TYPE_ARRAY_UINT8     = 0x20,  TYPE_ARRAY_INT32     = 0x21,  TYPE_ARRAY_FLOAT32   = 0x22,  TYPE_ARRAY_FLOAT64   = 0x23,  TYPE_BIT_PACKED      = 0x30   // Bit-packed data (LSB-first)};
+

Schema type descriptor (see cudaq_realtime_host_api.bs for full definition):

+
struct cudaq_type_desc_t {  uint8_t  type_id;       // PayloadTypeID value  uint8_t  reserved[3];  uint32_t size_bytes;    // Total size in bytes  uint32_t num_elements;  // Interpretation depends on type_id};
+

The num_elements field interpretation:

+
    +
  • +

    Scalar types (TYPE_UINT8, TYPE_INT32, etc.): unused, set to 1

    +
  • +

    Array types (TYPE_ARRAY_*): number of array elements

    +
  • +

    TYPE_BIT_PACKED: number of bits (not bytes)

    +
+

Note: For arbitrary binary data or vendor-specific formats, use TYPE_ARRAY_UINT8.

+

Encoding rules:

+
    +
  • +

    All multi-byte integers: little-endian

    +
  • +

    Floating-point: IEEE 754 format

    +
  • +

    Arrays: tightly packed elements (no padding)

    +
  • +

    Bit-packed data: LSB-first within each byte, size_bytes = ceil(num_elements / 8)

    +
+

1.5. Payload Encoding # {#payload-encoding}

+

The payload contains the argument data for the handler function. The encoding +depends on the argument types specified in the handler schema.

+

1.5.1. Single-Argument Payloads

+

For handlers with one argument, the payload contains the argument data directly:

+
| RPCHeader | argument_bytes |
+

1.5.2. Multi-Argument Payloads

+

For handlers with multiple arguments, arguments are concatenated in schema order +with no padding or delimiters:

+
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
+

The schema specifies the size of each argument, allowing the dispatcher to compute offsets.

+

1.5.3. Size Constraints

+

The total payload must fit in a single ring-buffer slot:

+
total_size = sizeof(RPCHeader) + arg_lenslot_sizemax_payload_bytes = slot_size - sizeof(RPCHeader)
+

1.5.4. Encoding Examples

+

Example 1: Handler with signature void process(int32_t count, float threshold)

+

Schema:

+
    +
  • +

    arg0: TYPE_INT32, 4 bytes

    +
  • +

    arg1: TYPE_FLOAT32, 4 bytes

    +
+

Wire encoding:

+
Offset | Content-------|--------0-11   | RPCHeader { magic, function_id, arg_len=8 }12-15  | count (int32_t, little-endian)16-19  | threshold (float, IEEE 754)
+

Example 2: Handler with signature void decode(const uint8_t* bits, uint32_t num_bits)

+

Schema:

+
    +
  • +

    arg0: TYPE_BIT_PACKED, size_bytes=16, num_elements=128

    +
  • +

    arg1: TYPE_UINT32, size_bytes=4, num_elements=1

    +
+

Wire encoding:

+
Offset | Content-------|--------0-11   | RPCHeader { magic, function_id, arg_len=20 }12-27  | bits (bit-packed, LSB-first, 128 bits)28-31  | num_bits=128 (uint32_t, little-endian)
+

1.5.5. Bit-Packed Data Encoding

+

For TYPE_BIT_PACKED arguments:

+
    +
  • +

    Bits are packed LSB-first within each byte

    +
  • +

    Payload length: size_bytes = ceil(num_elements / 8) bytes

    +
  • +

    The schema specifies both size_bytes (storage) and num_elements (actual bit count)

    +
+

Example for 10 bits (size_bytes=2, num_elements=10):

+
bits:    b0 b1 b2 b3 b4 b5 b6 b7 b8 b9byte[0]: b0 b1 b2 b3 b4 b5 b6 b7   (LSB-first)byte[1]: b8 b9 0  0  0  0  0  0    (unused bits set to zero)
+

The handler can use num_elements from the schema to determine how many bits +are valid, avoiding the need to pass bit count as a separate argument (though +some handlers may still choose to do so for flexibility).

+

Use case: TYPE_BIT_PACKED is suitable for binary measurements where +each measurement result is 0 or 1 (1 bit per measurement).

+

1.5.6. Multi-Bit Measurement Encoding

+

For applications requiring richer measurement data (e.g., soft readout, leakage +detection), use array types instead of TYPE_BIT_PACKED:

+

4-bit soft readout (confidence values 0-15):

+

Use TYPE_ARRAY_UINT8 with custom packing (2 measurements per byte):

+
    +
  • +

    Schema: TYPE_ARRAY_UINT8, size_bytes = ceil(num_measurements / 2), num_elements = num_measurements

    +
  • +

    Encoding: Low nibble = measurement[0], high nibble = measurement[1], etc.

    +
+

8-bit soft readout (confidence values 0-255):

+

Use TYPE_ARRAY_UINT8 with one byte per measurement:

+
    +
  • +

    Schema: TYPE_ARRAY_UINT8, size_bytes = num_measurements, num_elements = num_measurements

    +
  • +

    Encoding: byte[i] = measurement[i]

    +
+

Floating-point confidence values:

+

Use TYPE_ARRAY_FLOAT32:

+
    +
  • +

    Schema: TYPE_ARRAY_FLOAT32, size_bytes = num_measurements × 4, num_elements = num_measurements

    +
  • +

    Encoding: IEEE 754 single-precision floats, tightly packed

    +
+

Leakage/erasure-resolving readout (values beyond binary):

+

Use TYPE_ARRAY_UINT8 or TYPE_ARRAY_INT32 depending on the range of measurement outcomes (e.g., 0=ground, 1=excited, 2=leakage state).

+

1.6. Response Encoding # {#response-encoding}

+

The response is written to the TX ring buffer slot (separate from the RX buffer +that contains the request):

+
| RPCResponse | result_bytes |
+

Like the request payload, the response payload encoding is defined by the +handler schema. The schema’s results[] array specifies the type and size +of each return value.

+

1.6.1. Single-Result Response

+

For handlers returning one value, the result is written directly after the +response header.

+

Example response for a handler returning a single uint8_t:

+

Schema:

+
    +
  • +

    result0: TYPE_UINT8, size_bytes=1, num_elements=1

    +
+

Wire encoding:

+
Offset | Content                                    | Value (hex)-------|--------------------------------------------|--------------0-3    | magic (RPC_MAGIC_RESPONSE)                 | 53 51 55 434-7    | status (0 = success)                       | 00 00 00 008-11   | result_len                                 | 01 00 00 0012     | result value (uint8_t)                     | 0313-... | unused padding                             | XX XX XX XX
+

1.6.2. Multi-Result Response

+

For handlers returning multiple values, results are concatenated in schema order +(same pattern as multi-argument requests):

+
| RPCResponse | result0_bytes | result1_bytes | ... |
+

Example: Handler returning correction (uint8_t) + confidence (float)

+

Schema:

+
    +
  • +

    result0: TYPE_UINT8, size_bytes=1, num_elements=1

    +
  • +

    result1: TYPE_FLOAT32, size_bytes=4, num_elements=1

    +
+

Wire encoding:

+
Offset | Content-------|--------0-11   | RPCResponse { magic, status=0, result_len=5 }12     | correction (uint8_t)13-16  | confidence (float32, IEEE 754)
+

1.6.3. Status Codes

+
    +
  • +

    status = 0: Success

    +
  • +

    status > 0: Handler-specific error

    +
  • +

    status < 0: Protocol-level error

    +
+

1.7. QEC-Specific Usage Example # {#qec-example}

+

This section shows how the realtime messaging protocol is used for quantum +error correction (QEC) decoding. This is one application of the protocol; +other use cases follow the same pattern.

+

1.7.1. QEC Terminology

+

In QEC applications, the following terminology applies:

+
    +
  • +

    Measurement result: Raw readout value from a QPU measurement (0 or 1 for binary readout)

    +
  • +

    Detection event: XOR’d measurement results as dictated by the parity check (stabilizer) matrix

    +
  • +

    Syndrome: The full history or set of detection events used by the decoder

    +
+

The decoder consumes detection events (often called "syndrome data" colloquially) +and produces corrections.

+

1.7.2. QEC Decoder Handler

+

Typical QEC decoder signature:

+
void qec_decode(const uint8_t* detection_events, uint32_t num_events,                 uint8_t* correction);
+

Schema:

+
    +
  • +

    arg0: TYPE_BIT_PACKED, variable size (detection events, 1 bit per event)

    +
  • +

    arg1: TYPE_UINT32, 4 bytes (number of detection events)

    +
  • +

    result0: TYPE_UINT8, 1 byte (correction bit-packed)

    +
+

1.7.3. Decoding Rounds

+

For QEC applications, one RPC message typically corresponds to one decoding round +(one invocation of the decoder with a set of detection events). The boundaries of +each decoding round are determined by the quantum control system (e.g., FPGA) when +building RPC messages.

+

Note: The term "shot" is often used in quantum computing to mean one full execution +of a quantum program (repeated num_shots times for statistics). In the context +of realtime decoding, we use "decoding round" to avoid confusion, as there may be +many RPC invocations during a single quantum program execution.

+

1.7.4. Testing with Detection Event Files

+

The mock-decoder tests in cudaqx use a text file format for testing:

+
NUM_DATA <N>NUM_LOGICAL <M>ROUND_START 0<detection event bits, one per line>ROUND_START 1<detection event bits, one per line>...CORRECTIONS_START<expected corrections, one per line>CORRECTIONS_END
+

Only the numeric detection event values are encoded into RPC payloads. The +ROUND_START markers and other metadata are not transmitted on the wire.

+

Note: Existing test files may use SHOT_START for backwards compatibility; this +should be interpreted as ROUND_START in the context of realtime decoding.

+

1.8. References # {#references}

+
    +
  • +

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    +
  • +

    cudaqx/libs/qec/unittests/decoders/realtime/data/syndromes_multi_err_lut.txt

    +
+
+ \ No newline at end of file diff --git a/realtime/docs/nvqlink_latency_demo.md b/realtime/docs/nvqlink_latency_demo.md new file mode 100644 index 00000000..c96f8a45 --- /dev/null +++ b/realtime/docs/nvqlink_latency_demo.md @@ -0,0 +1,232 @@ +# Steps to execute the NVQLink latency demo + +The source Verilog code can be found at: + + +More details about how the Holoscan Sensor Bridge (HSB) IP can be incorporated can be found at: + + +Furthermore, for this experiment, we need the Integrated Logic Analyzer (ILA) to keep the captured measurements. See the "Hololink IP: Connecting an APB ILA for Debug" section below. + +# Steps to do the experiment + +1. Load the bitfile into the FPGA. +2. Setup the host to run the experiment. Mainly the IP address of the NIC needs to be set to `192.168.0.101`. More details can be found at the *Data Channel Enumeration and IP Address Configuration* section of: + +3. Download the accompanying software from: + + + Then generate the docker: + ```sh + sudo sh ./docker/build.sh --dgpu + sudo sh ./docker/demo.sh + ``` + +To run the test, here is an example for 32B messages reported in the paper: +```sh +python3 ./examples/gpunetio_loopback.py --frame-size=32 --hololink=192.168.0.2 --rx-ibv-name=mlx5_0 --tx-ibv-name=mlx5_0 --mtu=256 +``` + +Then to capture the data from the experiment and run the latency calculation: +```sh +python3 ila.py +python3 latency_analysis.py +``` +(These two python scripts can be found next to the Verilog source code). + +# Hololink IP: Connecting an APB ILA for Debug + +This guide describes how to attach an Integrated Logic Analyzer (ILA) to one of the Hololink IP's APB register interfaces for real-time signal capture and debugging over Ethernet. + +## Overview + +The Hololink IP exposes multiple APB register interfaces via the `REG_INST` parameter (defined in `HOLOLINK_def.svh`). These interfaces can be used to connect custom user logic, including ILAs, for monitoring internal signals. + +In this example, we connect the `s_apb_ila` module to **APB[2]** and configure it to capture PTP timestamps, frame information, and other debug signals. + +## APB Interface Signals from Hololink + +The Hololink IP provides the following APB signals for user register interfaces: + +```systemverilog +// From HOLOLINK_top outputs +logic [`REG_INST-1:0] apb_psel; // Per-interface select +logic apb_penable; // Common enable +logic [31:0] apb_paddr; // Common address bus +logic [31:0] apb_pwdata; // Common write data +logic apb_pwrite; // Common write enable + +// To HOLOLINK_top inputs +logic [`REG_INST-1:0] apb_pready; // Per-interface ready +logic [31:0] apb_prdata [`REG_INST-1:0]; // Per-interface read data +logic [`REG_INST-1:0] apb_pserr; // Per-interface error +``` + +## Step 1: Tie Off Unused APB Interfaces + +For any APB interfaces not in use, tie off the signals appropriately: + +```systemverilog +// Tie off unused APB bus signals +assign apb_pserr[7:3] = '0; +assign apb_pserr[1:0] = '0; +assign apb_pready[7:3] = '1; +assign apb_pready[1:0] = '0; +``` + +> **Note:** APB[2] is left unassigned here since it will be connected to the ILA. + +--- + +## Step 2: Create APB Interface Structs for the ILA + +The `s_apb_ila` module uses the `apb_m2s` and `apb_s2m` struct types from `apb_pkg`. Declare the interface signals: + +```systemverilog +import apb_pkg::*; + +apb_m2s ila_apb_m2s; +apb_s2m ila_apb_s2m; +``` + +--- + +## Step 3: Instantiate the s_apb_ila Module + +The `s_apb_ila` module is part of the Hololink IP library (`lib_apb/s_apb_ila.sv`). + +```systemverilog +localparam ILA_DATA_WIDTH = 256; + +s_apb_ila #( + .DEPTH ( 65536 ), + .W_DATA ( ILA_DATA_WIDTH ) +) u_apb_ila ( + // APB Interface (slow clock domain) + .i_aclk ( apb_clk ), + .i_arst ( apb_rst ), + .i_apb_m2s ( ila_apb_m2s ), + .o_apb_s2m ( ila_apb_s2m ), + + // User Capture Interface (fast clock domain) + .i_pclk ( hif_clk ), + .i_prst ( hif_rst ), + .i_trigger ( '1 ), // Always triggered + .i_enable ( '1 ), // Always enabled + .i_wr_data ( ila_wr_data ), // Data to capture + .i_wr_en ( ptp_ts_en ), // Write enable + .o_ctrl_reg ( ) // Optional control output +); +``` + +--- + +## Step 4: Connect APB[2] to the ILA + +Map the Hololink APB signals to the ILA's struct interface: + +```systemverilog +// APB Master-to-Slave signals (from Hololink to ILA) +assign ila_apb_m2s.psel = apb_psel[2]; // Select APB interface 2 +assign ila_apb_m2s.penable = apb_penable; +assign ila_apb_m2s.paddr = apb_paddr; +assign ila_apb_m2s.pwdata = apb_pwdata; +assign ila_apb_m2s.pwrite = apb_pwrite; + +// APB Slave-to-Master signals (from ILA back to Hololink) +assign apb_pready[2] = ila_apb_s2m.pready; +assign apb_prdata[2] = ila_apb_s2m.prdata; +assign apb_pserr[2] = ila_apb_s2m.pserr; +``` + +--- + +## Step 5: Define the Write Data Vector + +Structure the `ila_wr_data` signal to capture the signals of interest. Here's the example configuration used: + +```systemverilog +localparam ILA_DATA_WIDTH = 256; +logic [ILA_DATA_WIDTH-1:0] ila_wr_data; + +// Bit assignments +assign ila_wr_data[63:0] = ptp_ts[63:0]; // PTP timestamp from sensor frame +assign ila_wr_data[127:64] = {ptp_sec_sync_usr[31:0], // Synchronized PTP seconds + ptp_nsec_sync_usr[31:0]}; // Synchronized PTP nanoseconds +assign ila_wr_data[139:128] = frame_cnt; // 12-bit frame counter +assign ila_wr_data[140] = sof; // Start of frame +assign ila_wr_data[141] = eof; // End of frame +assign ila_wr_data[255:142] = 'h123456789ABCDEF; // Debug pattern (filler) +``` + +### Write Data Bit Map Summary + +| Bits | Width | Signal | Description | +|------|-------|--------|-------------| +| [63:0] | 64 | `ptp_ts` | PTP timestamp extracted from sensor TX data | +| [127:64] | 64 | `{ptp_sec, ptp_nsec}` | Synchronized PTP time (seconds + nanoseconds) from Hololink | +| [139:128] | 12 | `frame_cnt` | Frame counter extracted from sensor TX data | +| [140] | 1 | `sof` | Start of frame indicator | +| [141] | 1 | `eof` | End of frame indicator | +| [255:142] | 114 | Debug pattern | Fixed pattern for debugging | + +> **Note:** `ptp_sec_sync_usr` and `ptp_nsec_sync_usr` are the PTP time outputs from Hololink (`o_ptp_sec`, `o_ptp_nanosec`) synchronized to the host interface clock domain. + +--- + +## Step 6: Supporting Logic + +### Frame Detection + +```systemverilog +logic sof, eof; +assign sof = sif_tx_axis_tvalid[0]; // SOF on first valid +assign eof = sif_tx_axis_tlast[0]; // EOF on last +``` + +### Timestamp Capture + +```systemverilog +logic [79:0] ptp_ts; +logic ptp_ts_en; +logic [11:0] frame_cnt; + +always_ff @(posedge hif_clk) begin + if (hif_rst) begin + ptp_ts <= '0; + ptp_ts_en <= '0; + frame_cnt <= '0; + end + else begin + ptp_ts <= (sof) ? sif_tx_axis_tdata[0][79:0] : ptp_ts; + frame_cnt <= (sof) ? sif_tx_axis_tdata[0][91:80] : frame_cnt; + ptp_ts_en <= sof; + end +end +``` + +--- + +## Sensor RX Interface Tie-Off + +In this configuration, only the **Sensor TX interface** is used (for receiving data from the host). The Sensor RX interface is not used and should be tied off as follows: + +```systemverilog +// Sensor Rx Streaming Interface - Tie off (not used) +.i_sif_axis_tvalid ( '0 ), +.i_sif_axis_tlast ( '0 ), +.i_sif_axis_tdata ( '{default:0} ), +.i_sif_axis_tkeep ( '{default:0} ), +.i_sif_axis_tuser ( '{default:0} ), +.o_sif_axis_tready ( ), // Leave unconnected +``` + +The Sensor TX interface (`o_sif_axis_*`) should have `i_sif_axis_tready` tied high to always accept data: + +```systemverilog +.i_sif_axis_tready ( '1 ), +``` + +--- + +Once integrated, the ILA data can be accessed via APB register reads from the host over Ethernet using the Hololink control plane. diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h deleted file mode 100644 index 792893eb..00000000 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h +++ /dev/null @@ -1,219 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Opaque handles -typedef struct cudaq_dispatch_manager_t cudaq_dispatch_manager_t; -typedef struct cudaq_dispatcher_t cudaq_dispatcher_t; - -// Error codes -typedef enum { - CUDAQ_OK = 0, - CUDAQ_ERR_INVALID_ARG = 1, - CUDAQ_ERR_INTERNAL = 2, - CUDAQ_ERR_CUDA = 3 -} cudaq_status_t; - -// Kernel synchronization type -typedef enum { - CUDAQ_KERNEL_REGULAR = 0, - CUDAQ_KERNEL_COOPERATIVE = 1 -} cudaq_kernel_type_t; - -// Dispatch invocation mode -typedef enum { - CUDAQ_DISPATCH_DEVICE_CALL = 0, - CUDAQ_DISPATCH_GRAPH_LAUNCH = 1 -} cudaq_dispatch_mode_t; - -// Payload type identifiers (matching PayloadTypeID in dispatch_kernel_launch.h) -typedef enum { - CUDAQ_TYPE_UINT8 = 0x10, - CUDAQ_TYPE_INT32 = 0x11, - CUDAQ_TYPE_INT64 = 0x12, - CUDAQ_TYPE_FLOAT32 = 0x13, - CUDAQ_TYPE_FLOAT64 = 0x14, - CUDAQ_TYPE_ARRAY_UINT8 = 0x20, - CUDAQ_TYPE_ARRAY_INT32 = 0x21, - CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, - CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, - CUDAQ_TYPE_BIT_PACKED = 0x30 -} cudaq_payload_type_t; - -// Type descriptor for arguments/results -typedef struct { - uint8_t type_id; // cudaq_payload_type_t value - uint8_t reserved[3]; // padding - uint32_t size_bytes; // total size in bytes - uint32_t num_elements; // number of elements (for arrays) -} cudaq_type_desc_t; - -// Handler schema describing function signature -typedef struct { - uint8_t num_args; // number of arguments - uint8_t num_results; // number of results - uint16_t reserved; // padding - cudaq_type_desc_t args[8]; // argument descriptors (max 8) - cudaq_type_desc_t results[4]; // result descriptors (max 4) -} cudaq_handler_schema_t; - -// Dispatcher configuration -typedef struct { - int device_id; // GPU device ID (>=0) - uint32_t num_blocks; // grid size - uint32_t threads_per_block; // block size - uint32_t num_slots; // ring buffer slots - uint32_t slot_size; // bytes per slot - uint32_t vp_id; // virtual port ID - cudaq_kernel_type_t kernel_type; // regular/cooperative kernel - cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch -} cudaq_dispatcher_config_t; - -// GPU ring buffer pointers (device-visible mapped pointers) -typedef struct { - volatile uint64_t *rx_flags; // device pointer - volatile uint64_t *tx_flags; // device pointer -} cudaq_ringbuffer_t; - -// Unified function table entry with schema -typedef struct { - union { - void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL - cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH - } handler; - uint32_t function_id; // hash of function name (FNV-1a) - uint8_t dispatch_mode; // cudaq_dispatch_mode_t value - uint8_t reserved[3]; // padding - cudaq_handler_schema_t schema; // function signature schema - - // Graph-launch backpressure metadata - // Only meaningful when dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH. - // Set to 0/NULL for DEVICE_CALL entries or when backpressure is not needed. - uint32_t mailbox_idx; // index into global_mailbox_bank - uint32_t _pad0; // alignment padding - int *d_queue_idx; // device pointer to queue tail tracker - void *d_ready_flags; // device-mapped: cuda::std::atomic* - volatile int *d_inflight_flag; // 0 = idle, 1 = graph in flight (single-launch guard) -} cudaq_function_entry_t; - -// Function table for device-side dispatch -typedef struct { - cudaq_function_entry_t *entries; // device pointer to array of entries - uint32_t count; // number of entries -} cudaq_function_table_t; - -// Host launch function pointer type -typedef void (*cudaq_dispatch_launch_fn_t)( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a) -void cudaq_launch_dispatch_kernel_regular( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -void cudaq_launch_dispatch_kernel_cooperative( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -// Graph-enabled dispatch kernels (requires compute capability 8.0+, sm_80+) -// Device-side cudaGraphLaunch is available on sm_80 and higher (CUDA 13+) -#if defined(__CUDACC__) || defined(CUDA_VERSION) - -//============================================================================== -// Graph-Based Dispatch API (Proper Device-Side Graph Launch Support) -//============================================================================== -// -// These functions properly support device-side cudaGraphLaunch() by wrapping -// the dispatch kernel in a graph that is instantiated with -// cudaGraphInstantiateFlagDeviceLaunch. -// -// Usage: -// 1. Call cudaq_create_dispatch_graph_regular() to create the graph context -// 2. Call cudaq_launch_dispatch_graph() to launch the dispatch kernel -// 3. When done, call cudaq_destroy_dispatch_graph() to cleanup -// -// The dispatch kernel running inside this graph CAN call cudaGraphLaunch() -// to launch child graphs using cudaStreamGraphFireAndForget or other modes. - -// Opaque handle for graph-based dispatch context -typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; - -// Create a graph-based dispatch context for the regular kernel type. -// This creates a graph containing the dispatch kernel, instantiates it with -// cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. -// Returns cudaSuccess on success, or an error code on failure. -cudaError_t cudaq_create_dispatch_graph_regular( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, - cudaq_function_entry_t *function_table, size_t func_count, - void **global_mailbox_bank, - volatile int *shutdown_flag, uint64_t *stats, - size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, - cudaStream_t stream, cudaq_dispatch_graph_context **out_context); - -// Launch the dispatch graph. The dispatch kernel inside this graph can call -// cudaGraphLaunch() to launch child graphs from device code. -cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, - cudaStream_t stream); - -// Destroy the dispatch graph context and release all resources. -cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); - -#endif - -// Manager lifecycle -cudaq_status_t -cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr); -cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr); - -// Dispatcher lifecycle -cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *mgr, - const cudaq_dispatcher_config_t *config, - cudaq_dispatcher_t **out_dispatcher); -cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher); - -// Wiring inputs -cudaq_status_t -cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, - const cudaq_ringbuffer_t *ringbuffer); -cudaq_status_t -cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, - const cudaq_function_table_t *table); -cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, - volatile int *shutdown_flag, - uint64_t *stats); -cudaq_status_t -cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, - cudaq_dispatch_launch_fn_t launch_fn); - -// Start/stop -cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher); -cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher); - -// Stats -cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, - uint64_t *out_packets); - -#ifdef __cplusplus -} -#endif diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h new file mode 100644 index 00000000..cf8eaacb --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h @@ -0,0 +1,345 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handles +typedef struct cudaq_dispatch_manager_t cudaq_dispatch_manager_t; +typedef struct cudaq_dispatcher_t cudaq_dispatcher_t; + +// Error codes +typedef enum { + CUDAQ_OK = 0, + CUDAQ_ERR_INVALID_ARG = 1, + CUDAQ_ERR_INTERNAL = 2, + CUDAQ_ERR_CUDA = 3 +} cudaq_status_t; + +// Dispatcher backend: device persistent kernel vs host-side loop +typedef enum { + CUDAQ_BACKEND_DEVICE_KERNEL = 0, + CUDAQ_BACKEND_HOST_LOOP = 1 +} cudaq_backend_t; + +// TX flag status returned by cudaq_host_ringbuffer_poll_tx_flag. +typedef enum { + CUDAQ_TX_EMPTY = 0, + CUDAQ_TX_IN_FLIGHT = 1, + CUDAQ_TX_ERROR = 2, + CUDAQ_TX_READY = 3 +} cudaq_tx_status_t; + +// RPC wire-format constants (must match dispatch_kernel_launch.h). +#define CUDAQ_RPC_MAGIC_REQUEST 0x43555152u /* 'CUQR' */ +#define CUDAQ_RPC_MAGIC_RESPONSE 0x43555153u /* 'CUQS' */ +#define CUDAQ_RPC_HEADER_SIZE 12u /* 3 x uint32_t */ + +// Kernel synchronization type +typedef enum { + CUDAQ_KERNEL_REGULAR = 0, + CUDAQ_KERNEL_COOPERATIVE = 1 +} cudaq_kernel_type_t; + +// Dispatch invocation mode. +// For CUDAQ_BACKEND_HOST_LOOP only GRAPH_LAUNCH is dispatched; DEVICE_CALL and +// HOST_CALL table entries are dropped (slot cleared and advanced). +typedef enum { + CUDAQ_DISPATCH_DEVICE_CALL = 0, + CUDAQ_DISPATCH_GRAPH_LAUNCH = 1, + CUDAQ_DISPATCH_HOST_CALL = 2 +} cudaq_dispatch_mode_t; + +// Payload type identifiers (matching PayloadTypeID in dispatch_kernel_launch.h) +typedef enum { + CUDAQ_TYPE_UINT8 = 0x10, + CUDAQ_TYPE_INT32 = 0x11, + CUDAQ_TYPE_INT64 = 0x12, + CUDAQ_TYPE_FLOAT32 = 0x13, + CUDAQ_TYPE_FLOAT64 = 0x14, + CUDAQ_TYPE_ARRAY_UINT8 = 0x20, + CUDAQ_TYPE_ARRAY_INT32 = 0x21, + CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, + CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, + CUDAQ_TYPE_BIT_PACKED = 0x30 +} cudaq_payload_type_t; + +// Type descriptor for arguments/results +typedef struct { + uint8_t type_id; // cudaq_payload_type_t value + uint8_t reserved[3]; // padding + uint32_t size_bytes; // total size in bytes + uint32_t num_elements; // number of elements (for arrays) +} cudaq_type_desc_t; + +// Handler schema describing function signature +typedef struct { + uint8_t num_args; // number of arguments + uint8_t num_results; // number of results + uint16_t reserved; // padding + cudaq_type_desc_t args[8]; // argument descriptors (max 8) + cudaq_type_desc_t results[4]; // result descriptors (max 4) +} cudaq_handler_schema_t; + +// Dispatcher configuration +typedef struct { + int device_id; // GPU device ID (>=0) + uint32_t num_blocks; // grid size + uint32_t threads_per_block; // block size + uint32_t num_slots; // ring buffer slots + uint32_t slot_size; // bytes per slot + uint32_t vp_id; // virtual port ID + cudaq_kernel_type_t kernel_type; // regular/cooperative kernel + cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch + cudaq_backend_t backend; // device kernel or host loop (default DEVICE_KERNEL) +} cudaq_dispatcher_config_t; + +// GPU ring buffer pointers. For device backend use device pointers only. +// For CUDAQ_BACKEND_HOST_LOOP, also set the _host pointers (same pinned +// mapped allocation); the host loop polls rx_flags_host and uses host data. +typedef struct { + volatile uint64_t *rx_flags; // device pointer + volatile uint64_t *tx_flags; // device pointer + uint8_t *rx_data; // device pointer to RX data buffer + uint8_t *tx_data; // device pointer to TX data buffer + size_t rx_stride_sz; // size of each RX slot in bytes + size_t tx_stride_sz; // size of each TX slot in bytes + // Host-side view (required when backend == CUDAQ_BACKEND_HOST_LOOP; NULL otherwise) + volatile uint64_t *rx_flags_host; + volatile uint64_t *tx_flags_host; + uint8_t *rx_data_host; + uint8_t *tx_data_host; +} cudaq_ringbuffer_t; + +// Host RPC callback: reads RPCHeader + args from slot, writes RPCResponse + result. +// slot_host is the host pointer to the slot (same layout as device slot). +typedef void (*cudaq_host_rpc_fn_t)(void *slot_host, size_t slot_size); + +// Unified function table entry with schema +typedef struct { + union { + void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL + cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH + cudaq_host_rpc_fn_t host_fn; // for CUDAQ_DISPATCH_HOST_CALL + } handler; + uint32_t function_id; // hash of function name (FNV-1a) + uint8_t dispatch_mode; // cudaq_dispatch_mode_t value + uint8_t reserved[3]; // padding + cudaq_handler_schema_t schema; // function signature schema +} cudaq_function_entry_t; + +// Function table for device-side dispatch +typedef struct { + cudaq_function_entry_t *entries; // device pointer to array of entries + uint32_t count; // number of entries +} cudaq_function_table_t; + +// Host launch function pointer type +typedef void (*cudaq_dispatch_launch_fn_t)( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a) +void cudaq_launch_dispatch_kernel_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +void cudaq_launch_dispatch_kernel_cooperative( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Graph-enabled dispatch kernels (requires compute capability 9.0+, sm_90+) +// These functions are only available when compiled for sm_90 or higher +#if defined(__CUDACC__) || defined(CUDA_VERSION) + +//============================================================================== +// Graph-Based Dispatch API (Proper Device-Side Graph Launch Support) +//============================================================================== +// +// These functions properly support device-side cudaGraphLaunch() by wrapping +// the dispatch kernel in a graph that is instantiated with +// cudaGraphInstantiateFlagDeviceLaunch. +// +// Usage: +// 1. Allocate a GraphIOContext on the device (cudaMalloc) +// 2. Call cudaq_create_dispatch_graph_regular() to create the graph context +// 3. Call cudaq_launch_dispatch_graph() to launch the dispatch kernel +// 4. When done, call cudaq_destroy_dispatch_graph() to cleanup +// +// The dispatch kernel fills the GraphIOContext before each fire-and-forget +// graph launch. The graph kernel reads input from io_ctx->rx_slot, writes +// the RPCResponse to io_ctx->tx_slot, and signals completion by writing +// io_ctx->tx_flag_value to *io_ctx->tx_flag after a __threadfence_system(). + +// Forward declaration for GraphIOContext (defined in dispatch_kernel_launch.h) +struct cudaq_graph_io_context; + +// Opaque handle for graph-based dispatch context +typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; + +// Create a graph-based dispatch context for the regular kernel type. +// This creates a graph containing the dispatch kernel, instantiates it with +// cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. +// +// graph_io_ctx: Device pointer to a GraphIOContext struct. The dispatch +// kernel fills this before each fire-and-forget child graph launch so +// the graph kernel knows where to read input and write output. +// +// Returns cudaSuccess on success, or an error code on failure. +cudaError_t cudaq_create_dispatch_graph_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + void *graph_io_ctx, volatile int *shutdown_flag, uint64_t *stats, + size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, + cudaStream_t stream, cudaq_dispatch_graph_context **out_context); + +// Launch the dispatch graph. The dispatch kernel inside this graph can call +// cudaGraphLaunch() to launch child graphs from device code. +cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, + cudaStream_t stream); + +// Destroy the dispatch graph context and release all resources. +cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); + +#endif + +// Manager lifecycle +cudaq_status_t +cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr); +cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr); + +// Dispatcher lifecycle +cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *mgr, + const cudaq_dispatcher_config_t *config, + cudaq_dispatcher_t **out_dispatcher); +cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher); + +// Wiring inputs +cudaq_status_t +cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, + const cudaq_ringbuffer_t *ringbuffer); +cudaq_status_t +cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, + const cudaq_function_table_t *table); +cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, + volatile int *shutdown_flag, + uint64_t *stats); +cudaq_status_t +cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, + cudaq_dispatch_launch_fn_t launch_fn); + +// Optional: provide a caller-managed pinned mailbox for GRAPH_LAUNCH workers. +// h_mailbox_bank must be allocated with cudaHostAlloc(..., cudaHostAllocMapped) +// and sized to at least (num_graph_launch_entries * sizeof(void*)). +// If set, the dispatcher uses this mailbox instead of allocating its own. +// The caller retains ownership and must free it after cudaq_dispatcher_destroy. +cudaq_status_t cudaq_dispatcher_set_mailbox(cudaq_dispatcher_t *dispatcher, + void **h_mailbox_bank); + +// Start/stop +cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher); +cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher); + +// Stats +cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, + uint64_t *out_packets); + +//============================================================================== +// Host dispatcher backend (CUDAQ_BACKEND_HOST_LOOP) +//============================================================================== +// When config.backend == CUDAQ_BACKEND_HOST_LOOP, start() uses these instead +// of launch_fn. The realtime lib calls them; implementation is in +// libcudaq-realtime-host-dispatch. + +typedef struct cudaq_host_dispatcher_handle cudaq_host_dispatcher_handle_t; + +// Start the host dispatcher loop in a new thread. Call from cudaq_dispatcher_start +// when backend is CUDAQ_BACKEND_HOST_LOOP. Returns a handle for stop, or NULL on error. +// If external_mailbox is non-NULL, uses it instead of allocating internally. +cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread( + const cudaq_ringbuffer_t *ringbuffer, + const cudaq_function_table_t *table, + const cudaq_dispatcher_config_t *config, + volatile int *shutdown_flag, + uint64_t *stats, + void **external_mailbox); + +// Stop the host dispatcher thread and free resources. +void cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t *handle); + +// Release a worker back to the idle pool (handle-level, called by API layer). +cudaq_status_t +cudaq_host_dispatcher_release_worker(cudaq_host_dispatcher_handle_t *handle, + int worker_id); + +//============================================================================== +// Ring buffer slot helpers (producer / consumer side) +//============================================================================== +// These encapsulate the RPC wire format and flag-signalling protocol so that +// producers and consumers don't need to know about magic constants, the +// "address-as-flag" convention, or the tx_flags state machine. + +// Write an RPC request (RPCHeader + payload) into slot `slot_idx`. +// payload_len must satisfy CUDAQ_RPC_HEADER_SIZE + payload_len <= rx_stride_sz. +cudaq_status_t cudaq_host_ringbuffer_write_rpc_request( + const cudaq_ringbuffer_t *rb, uint32_t slot_idx, uint32_t function_id, + const void *payload, uint32_t payload_len); + +// Signal that slot `slot_idx` has data ready for the dispatcher. +// Stores the host address of the slot into rx_flags_host[slot_idx]. +void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx); + +// Poll tx_flags_host[slot_idx] and classify the result. +// If status == CUDAQ_TX_ERROR and out_cuda_error is non-NULL, the CUDA error +// code is written there. +cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( + const cudaq_ringbuffer_t *rb, uint32_t slot_idx, int *out_cuda_error); + +// Check whether a slot is available for reuse (both rx and tx flags are 0). +int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx); + +// Clear tx_flags_host[slot_idx] after consuming the response. +void cudaq_host_ringbuffer_clear_slot(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx); + +// Release a worker back to the idle pool after the graph has completed. +// This is the consumer-side counterpart to the dispatcher's internal +// idle_mask acquisition — without this call the worker stays "busy" forever. +cudaq_status_t cudaq_host_release_worker(cudaq_dispatcher_t *dispatcher, + int worker_id); + +// Force eager CUDA module loading for dispatch kernels (occupancy query). +// Call before cudaq_dispatcher_start() to avoid lazy-loading deadlocks. +cudaError_t cudaq_dispatch_kernel_query_occupancy(int *out_blocks, + uint32_t threads_per_block); +cudaError_t +cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks, + uint32_t threads_per_block); + +#ifdef __cplusplus +} +#endif diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh similarity index 74% rename from realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh rename to realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh index 0e3a028d..3b3be6dc 100644 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. * + * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -15,15 +15,15 @@ /// (dispatch_kernel.cu) and is linked into libcudaq-realtime.so. This header /// provides declarations and inline wrappers for the launch functions. -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" #include #include -namespace cudaq::nvqlink { +namespace cudaq::realtime { //============================================================================== // Kernel Launch Function Declarations (with schema-driven function table) @@ -35,6 +35,10 @@ namespace cudaq::nvqlink { inline void launch_dispatch_kernel_regular_inline( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -44,7 +48,9 @@ inline void launch_dispatch_kernel_regular_inline( std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, function_table, func_count, + rx_flags, tx_flags, rx_data, tx_data, + rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); } @@ -53,6 +59,10 @@ inline void launch_dispatch_kernel_regular_inline( inline void launch_dispatch_kernel_cooperative_inline( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -62,9 +72,11 @@ inline void launch_dispatch_kernel_cooperative_inline( std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_cooperative( - rx_flags, tx_flags, function_table, func_count, + rx_flags, tx_flags, rx_data, tx_data, + rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); } -} // namespace cudaq::nvqlink +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h similarity index 61% rename from realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h rename to realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h index 18288fbf..d5eaf6bf 100644 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h @@ -1,5 +1,5 @@ /****************************************************************-*- C++ -*-**** - * Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. * + * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -10,9 +10,8 @@ #include #include -#include -namespace cudaq::nvqlink { +namespace cudaq::realtime { //============================================================================== // RPC Protocol Structures (Wire Format) @@ -38,12 +37,20 @@ struct __attribute__((packed)) RPCResponse { //============================================================================== /// @brief Device RPC function signature. -/// @param buffer Pointer to argument/result buffer -/// @param arg_len Length of argument data -/// @param max_result_len Maximum result buffer size -/// @param result_len Output: actual result length +/// +/// The handler reads arguments from the input buffer and writes results +/// directly to the output buffer. The two buffers never overlap, which +/// enables the dispatch kernel to point `output` straight into the TX +/// ring-buffer slot, eliminating a post-handler copy. +/// +/// @param input Pointer to argument data (RX buffer, read-only) +/// @param output Pointer to result buffer (TX buffer, write-only) +/// @param arg_len Length of argument data in bytes +/// @param max_result_len Maximum result buffer size in bytes +/// @param result_len Output: actual result length written /// @return Status code (0 = success) -using DeviceRPCFunction = int (*)(void *buffer, std::uint32_t arg_len, +using DeviceRPCFunction = int (*)(const void *input, void *output, + std::uint32_t arg_len, std::uint32_t max_result_len, std::uint32_t *result_len); @@ -67,6 +74,26 @@ constexpr std::uint32_t fnv1a_hash(const char *str) { constexpr std::uint32_t RPC_MAGIC_REQUEST = 0x43555152; // 'CUQR' constexpr std::uint32_t RPC_MAGIC_RESPONSE = 0x43555153; // 'CUQS' +//============================================================================== +// Graph IO Context (for CUDAQ_DISPATCH_GRAPH_LAUNCH) +//============================================================================== + +/// @brief IO context passed to graph-launched RPC handlers via pointer +/// indirection. +/// +/// The dispatch kernel fills this context before each fire-and-forget graph +/// launch so the graph kernel knows where to read input, where to write the +/// response, and how to signal completion. The graph kernel is responsible +/// for writing the RPCResponse header to `tx_slot` and then setting +/// `*tx_flag = tx_flag_value` after a `__threadfence_system()`. +struct GraphIOContext { + void *rx_slot; ///< Input: RX slot (RPCHeader + `args`) + std::uint8_t *tx_slot; ///< Output: TX slot for RPCResponse + volatile std::uint64_t *tx_flag; ///< Pointer to TX flag for this slot + std::uint64_t tx_flag_value; ///< Value to write to tx_flag when done + std::size_t tx_stride_sz; ///< TX slot size (for max_result_len) +}; + //============================================================================== // Schema-Driven Type System //============================================================================== @@ -95,11 +122,11 @@ struct __attribute__((packed)) cudaq_type_desc_t { /// @brief Handler schema describing argument and result types. struct __attribute__((packed)) cudaq_handler_schema_t { - std::uint8_t num_args; ///< Number of arguments - std::uint8_t num_results; ///< Number of results - std::uint16_t reserved; ///< Padding for alignment - cudaq_type_desc_t args[8]; ///< Argument type descriptors (max 8) - cudaq_type_desc_t results[4]; ///< Result type descriptors (max 4) + std::uint8_t num_args; ///< Number of arguments + std::uint8_t num_results; ///< Number of results + std::uint16_t reserved; ///< Padding for alignment + cudaq_type_desc_t args[8]; ///< Argument type descriptors (max 8) + cudaq_type_desc_t results[4]; ///< Result type descriptors (max 4) }; -} // namespace cudaq::nvqlink +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h similarity index 94% rename from realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h rename to realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h index 83e0c843..d34c0b83 100644 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h @@ -48,10 +48,10 @@ struct GraphLaunchMode { /// @param ctx Handler context containing the graph executable template __device__ static void dispatch(ContextType &ctx) { -// Device graph launch requires CUDA 13+ and compute capability 8.0+ +// Device graph launch requires CUDA 12.0+ and appropriate context setup // The graph_exec must be a cudaGraphExec_t captured at initialization -#if __CUDA_ARCH__ >= 800 - // cudaGraphLaunch is available from device code on sm_80+ +#if __CUDA_ARCH__ >= 900 + // cudaGraphLaunch is available from device code on Hopper+ // Note: This is a placeholder - actual implementation requires // the graph_exec to be properly set up in the context if (ctx.graph_exec != nullptr) { diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h new file mode 100644 index 00000000..43ff3821 --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#pragma once + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" + +#include +#include +#include +#include +#include + +#ifndef QEC_CPU_RELAX +#if defined(__x86_64__) +#include +#define QEC_CPU_RELAX() _mm_pause() +#elif defined(__aarch64__) +#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") +#else +#define QEC_CPU_RELAX() do { } while (0) +#endif +#endif + +namespace cudaq::realtime { + +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + +struct HostDispatchWorker { + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t function_id; // matches table entry; used to assign slot to this worker +}; + +struct HostDispatcherConfig { + atomic_uint64_sys* rx_flags; + atomic_uint64_sys* tx_flags; + uint8_t* rx_data_host; + uint8_t* rx_data_dev; + uint8_t* tx_data_host; + uint8_t* tx_data_dev; + size_t tx_stride_sz; + void** h_mailbox_bank; + size_t num_slots; + size_t slot_size; + std::vector workers; + /// Host-visible function table for lookup by function_id (GRAPH_LAUNCH only; others dropped). + cudaq_function_entry_t* function_table = nullptr; + size_t function_table_count = 0; + atomic_int_sys* shutdown_flag; + uint64_t* stats_counter; + /// Optional: atomic counter incremented on each dispatch (for progress diagnostics). + atomic_uint64_sys* live_dispatched = nullptr; + + /// Dynamic worker pool (graph workers only) + atomic_uint64_sys* idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id + int* inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags routing +}; + +/// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag +/// becomes non-zero. Call from a dedicated thread. +/// Uses dynamic worker pool: allocates via idle_mask, tags with inflight_slot_tags. +void host_dispatcher_loop(const HostDispatcherConfig& config); + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h b/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h similarity index 85% rename from realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h rename to realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h index e78ae558..b7efcac1 100644 --- a/realtime/include/cudaq/nvqlink/daemon/dispatcher/kernel_types.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h @@ -19,6 +19,8 @@ namespace cudaq::realtime { /// is needed. Suitable for simple decode handlers that don't require /// grid-wide coordination. struct RegularKernel { + /// @brief Not a cooperative kernel -- handler is called by thread 0 only. + static constexpr bool is_cooperative = false; /// @brief Synchronize threads within a block. __device__ static void sync() { __syncthreads(); } }; @@ -29,6 +31,8 @@ struct RegularKernel { /// such as complex decoders with data dependencies across blocks. /// Requires kernel to be launched with cudaLaunchCooperativeKernel. struct CooperativeKernel { + /// @brief Cooperative kernel -- handler is called by ALL threads. + static constexpr bool is_cooperative = true; __device__ static void sync() { cooperative_groups::this_grid().sync(); } }; diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h new file mode 100644 index 00000000..d5fb254a --- /dev/null +++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h @@ -0,0 +1,502 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +/// @file hololink_bridge_common.h +/// @brief Header-only bridge skeleton for Hololink-based RPC dispatch. +/// +/// Provides common infrastructure used by all Hololink bridge tools: +/// - Command-line argument parsing for IB device, peer IP, QP, etc. +/// - Hololink transceiver creation and QP connection +/// - Dispatch kernel wiring via the cudaq host API +/// - Main run loop with diagnostics +/// - Graceful shutdown +/// +/// Each concrete bridge tool (generic increment, mock decoder, real decoder) +/// implements a small main() that: +/// 1. Parses any tool-specific arguments +/// 2. Sets up its RPC function table on the GPU +/// 3. Calls bridge_run() with a BridgeConfig struct +/// +/// This header is compiled by a standard C++ compiler; all CUDA and Hololink +/// calls go through C interfaces (cudaq_realtime.h, hololink_wrapper.h). + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + +// Hololink C wrapper (link against hololink_wrapper_bridge static library) +#include "hololink_wrapper.h" + +namespace cudaq::realtime { + +//============================================================================== +// CUDA Error Checking +//============================================================================== + +#ifndef BRIDGE_CUDA_CHECK +#define BRIDGE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \ + << cudaGetErrorString(err) << std::endl; \ + return 1; \ + } \ + } while (0) +#endif + +//============================================================================== +// Global Signal Handler +//============================================================================== + +namespace detail { +inline std::atomic &bridge_shutdown_flag() { + static std::atomic flag{false}; + return flag; +} +inline void bridge_signal_handler(int) { bridge_shutdown_flag() = true; } +} // namespace detail + +//============================================================================== +// Bridge Configuration +//============================================================================== + +/// @brief Configuration for the bridge's Hololink and dispatch kernel setup. +struct BridgeConfig { + // IB / network + std::string device = "rocep1s0f0"; ///< IB device name + std::string peer_ip = "10.0.0.2"; ///< FPGA/emulator IP + uint32_t remote_qp = 0x2; ///< Remote QP number (FPGA default: 2) + int gpu_id = 0; ///< GPU device ID + int timeout_sec = 60; ///< Runtime timeout in seconds + + // Ring buffer sizing + size_t frame_size = 256; ///< Minimum frame size (RPCHeader + payload) + size_t page_size = + 384; ///< Ring buffer slot size (>= frame_size, 128-aligned) + unsigned num_pages = 64; ///< Number of ring buffer slots + + // QP exchange (emulator mode) + bool exchange_qp = false; ///< Use QP exchange protocol + int exchange_port = 12345; ///< TCP port for QP exchange + + // Dispatch kernel config + cudaq_function_entry_t *d_function_entries = nullptr; ///< GPU function table + size_t func_count = 0; ///< Number of entries + + /// @brief Dispatch kernel grid configuration. + /// Defaults match the regular (non-cooperative) kernel. + cudaq_kernel_type_t kernel_type = CUDAQ_KERNEL_REGULAR; + uint32_t num_blocks = 1; + uint32_t threads_per_block = 32; + + /// @brief Pointer to the dispatch kernel launch function. + /// Default: cudaq_launch_dispatch_kernel_regular + cudaq_dispatch_launch_fn_t launch_fn = nullptr; + + /// @brief Optional cleanup callback invoked during shutdown. + std::function cleanup_fn; +}; + +//============================================================================== +// Common Argument Parsing +//============================================================================== + +/// @brief Parse common bridge arguments from the command line. +/// +/// Recognized flags: `--device=`, `--peer-ip=`, `--remote-qp=`, `--gpu=`, +/// `--timeout=`, `--page-size=`, `--num-pages=`, `--exchange-qp`, +/// `--exchange-port=`. Unknown flags are silently ignored (so tool-specific +/// flags can co-exist). +/// +/// @param argc Argument count +/// @param argv Argument vector +/// @param [out] config Bridge configuration to populate +inline void parse_bridge_args(int argc, char *argv[], BridgeConfig &config) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.find("--device=") == 0) + config.device = arg.substr(9); + else if (arg.find("--peer-ip=") == 0) + config.peer_ip = arg.substr(10); + else if (arg.find("--remote-qp=") == 0) + config.remote_qp = std::stoul(arg.substr(12), nullptr, 0); + else if (arg.find("--gpu=") == 0) + config.gpu_id = std::stoi(arg.substr(6)); + else if (arg.find("--timeout=") == 0) + config.timeout_sec = std::stoi(arg.substr(10)); + else if (arg.find("--page-size=") == 0) + config.page_size = std::stoull(arg.substr(12)); + else if (arg.find("--num-pages=") == 0) + config.num_pages = std::stoul(arg.substr(12)); + else if (arg == "--exchange-qp") + config.exchange_qp = true; + else if (arg.find("--exchange-port=") == 0) + config.exchange_port = std::stoi(arg.substr(16)); + } +} + +//============================================================================== +// Bridge Run Function +//============================================================================== + +/// @brief Run the Hololink bridge with the given configuration. +/// +/// This function: +/// 1. Initialises CUDA on the configured GPU +/// 2. Creates the Hololink transceiver and connects the QP +/// 3. Forces eager CUDA module loading +/// 4. Wires the cudaq dispatch kernel to the Hololink ring buffers +/// 5. Launches Hololink RX+TX kernels +/// 6. Runs the main diagnostic loop until timeout or signal +/// 7. Performs orderly shutdown +/// +/// The caller must set config.d_function_entries and config.func_count +/// before calling this function. +/// +/// @param config Fully-populated bridge configuration +/// @return 0 on success, non-zero on error +inline int bridge_run(BridgeConfig &config) { + signal(SIGINT, detail::bridge_signal_handler); + signal(SIGTERM, detail::bridge_signal_handler); + + auto &g_shutdown = detail::bridge_shutdown_flag(); + + //============================================================================ + // [1] Initialize CUDA + //============================================================================ + std::cout << "\n[1/5] Initializing CUDA..." << std::endl; + BRIDGE_CUDA_CHECK(cudaSetDevice(config.gpu_id)); + + cudaDeviceProp prop; + BRIDGE_CUDA_CHECK(cudaGetDeviceProperties(&prop, config.gpu_id)); + std::cout << " GPU: " << prop.name << std::endl; + + //============================================================================ + // [2] Create Hololink transceiver + //============================================================================ + std::cout << "\n[2/5] Creating Hololink transceiver..." << std::endl; + + // Ensure page_size >= frame_size + if (config.page_size < config.frame_size) { + std::cout << " Adjusting page_size from " << config.page_size << " to " + << config.frame_size << " to fit frame" << std::endl; + config.page_size = config.frame_size; + } + + std::cout << " Frame size: " << config.frame_size << " bytes" << std::endl; + std::cout << " Page size: " << config.page_size << " bytes" << std::endl; + std::cout << " Num pages: " << config.num_pages << std::endl; + + hololink_transceiver_t transceiver = hololink_create_transceiver( + config.device.c_str(), 1, // ib_port + config.frame_size, config.page_size, config.num_pages, + "0.0.0.0", // deferred connection + 0, // forward = false + 1, // rx_only = true + 1 // tx_only = true + ); + + if (!transceiver) { + std::cerr << "ERROR: Failed to create Hololink transceiver" << std::endl; + return 1; + } + + if (!hololink_start(transceiver)) { + std::cerr << "ERROR: Failed to start Hololink transceiver" << std::endl; + hololink_destroy_transceiver(transceiver); + return 1; + } + + // Connect QP to remote peer + { + uint8_t remote_gid[16] = {}; + remote_gid[10] = 0xff; + remote_gid[11] = 0xff; + inet_pton(AF_INET, config.peer_ip.c_str(), &remote_gid[12]); + + std::cout << " Connecting QP to remote QP 0x" << std::hex + << config.remote_qp << std::dec << " at " << config.peer_ip + << "..." << std::endl; + + if (!hololink_reconnect_qp(transceiver, remote_gid, config.remote_qp)) { + std::cerr << "ERROR: Failed to connect QP to remote peer" << std::endl; + hololink_destroy_transceiver(transceiver); + return 1; + } + std::cout << " QP connected to remote peer" << std::endl; + } + + uint32_t our_qp = hololink_get_qp_number(transceiver); + uint32_t our_rkey = hololink_get_rkey(transceiver); + uint64_t our_buffer = hololink_get_buffer_addr(transceiver); + + std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; + std::cout << " RKey: " << our_rkey << std::endl; + std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec + << std::endl; + + // Ring buffer pointers + uint8_t *rx_ring_data = + reinterpret_cast(hololink_get_rx_ring_data_addr(transceiver)); + uint64_t *rx_ring_flag = hololink_get_rx_ring_flag_addr(transceiver); + uint8_t *tx_ring_data = + reinterpret_cast(hololink_get_tx_ring_data_addr(transceiver)); + uint64_t *tx_ring_flag = hololink_get_tx_ring_flag_addr(transceiver); + + if (!rx_ring_data || !rx_ring_flag || !tx_ring_data || !tx_ring_flag) { + std::cerr << "ERROR: Failed to get ring buffer pointers" << std::endl; + hololink_destroy_transceiver(transceiver); + return 1; + } + + //============================================================================ + // [3] Force eager CUDA module loading + //============================================================================ + std::cout << "\n[3/5] Forcing CUDA module loading..." << std::endl; + { + int dispatch_blocks = 0; + cudaError_t occ_err; + if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) { + occ_err = cudaq_dispatch_kernel_cooperative_query_occupancy( + &dispatch_blocks, config.threads_per_block); + } else { + occ_err = cudaq_dispatch_kernel_query_occupancy(&dispatch_blocks, 1); + } + if (occ_err != cudaSuccess) { + std::cerr << "ERROR: Dispatch kernel occupancy query failed: " + << cudaGetErrorString(occ_err) << std::endl; + return 1; + } + std::cout << " Dispatch kernel occupancy: " << dispatch_blocks + << " blocks/SM" << std::endl; + + if (!hololink_query_kernel_occupancy()) { + std::cerr << "ERROR: Hololink kernel occupancy query failed" << std::endl; + return 1; + } + } + + //============================================================================ + // [4] Wire dispatch kernel to Hololink ring buffers + //============================================================================ + std::cout << "\n[4/5] Wiring dispatch kernel..." << std::endl; + + // Allocate control variables + void *tmp_shutdown = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + volatile int *shutdown_flag = static_cast(tmp_shutdown); + void *tmp_d_shutdown = nullptr; + BRIDGE_CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + volatile int *d_shutdown_flag = static_cast(tmp_d_shutdown); + *shutdown_flag = 0; + int zero = 0; + BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag), &zero, + sizeof(int), cudaMemcpyHostToDevice)); + + uint64_t *d_stats = nullptr; + BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + + // Host API wiring + cudaq_dispatch_manager_t *manager = nullptr; + cudaq_dispatcher_t *dispatcher = nullptr; + + if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to create dispatch manager" << std::endl; + return 1; + } + + cudaq_dispatcher_config_t dconfig{}; + dconfig.device_id = config.gpu_id; + dconfig.num_blocks = config.num_blocks; + dconfig.threads_per_block = config.threads_per_block; + dconfig.num_slots = static_cast(config.num_pages); + dconfig.slot_size = static_cast(config.page_size); + dconfig.vp_id = 0; + dconfig.kernel_type = config.kernel_type; + dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to create dispatcher" << std::endl; + return 1; + } + + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = reinterpret_cast(rx_ring_flag); + ringbuffer.tx_flags = reinterpret_cast(tx_ring_flag); + ringbuffer.rx_data = rx_ring_data; + ringbuffer.tx_data = tx_ring_data; + ringbuffer.rx_stride_sz = config.page_size; + ringbuffer.tx_stride_sz = config.page_size; + + if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set ringbuffer" << std::endl; + return 1; + } + + cudaq_function_table_t table{}; + table.entries = config.d_function_entries; + table.count = config.func_count; + if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set function table" << std::endl; + return 1; + } + + if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set control" << std::endl; + return 1; + } + + // Use provided launch function, or default to regular dispatch + cudaq_dispatch_launch_fn_t launch_fn = config.launch_fn; + if (!launch_fn) { + launch_fn = &cudaq_launch_dispatch_kernel_regular; + } + if (cudaq_dispatcher_set_launch_fn(dispatcher, launch_fn) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set launch function" << std::endl; + return 1; + } + + if (cudaq_dispatcher_start(dispatcher) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to start dispatcher" << std::endl; + return 1; + } + std::cout << " Dispatch kernel launched" << std::endl; + + //============================================================================ + // [5] Launch Hololink kernels and run + //============================================================================ + std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl; + + std::thread hololink_thread( + [transceiver]() { hololink_blocking_monitor(transceiver); }); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + std::cout << " Hololink RX+TX kernels started" << std::endl; + + // Print QP info for FPGA stimulus tool + std::cout << "\n=== Bridge Ready ===" << std::endl; + std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; + std::cout << " RKey: " << our_rkey << std::endl; + std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec + << std::endl; + std::cout << "\nWaiting for data (Ctrl+C to stop, timeout=" + << config.timeout_sec << "s)..." << std::endl; + + //============================================================================ + // Main run loop + //============================================================================ + cudaStream_t diag_stream = nullptr; + BRIDGE_CUDA_CHECK( + cudaStreamCreateWithFlags(&diag_stream, cudaStreamNonBlocking)); + + auto start_time = std::chrono::steady_clock::now(); + uint64_t last_processed = 0; + + while (!g_shutdown) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); + if (elapsed > config.timeout_sec) { + std::cout << "\nTimeout reached (" << config.timeout_sec << "s)" + << std::endl; + break; + } + + // Progress report every 5 seconds + if (elapsed > 0 && elapsed % 5 == 0) { + uint64_t processed = 0; + cudaMemcpyAsync(&processed, d_stats, sizeof(uint64_t), + cudaMemcpyDeviceToHost, diag_stream); + cudaStreamSynchronize(diag_stream); + if (processed != last_processed) { + std::cout << " [" << elapsed << "s] Processed " << processed + << " packets" << std::endl; + last_processed = processed; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + //============================================================================ + // Shutdown + //============================================================================ + std::cout << "\n=== Shutting down ===" << std::endl; + + if (diag_stream) { + cudaStreamDestroy(diag_stream); + diag_stream = nullptr; + } + + *shutdown_flag = 1; + __sync_synchronize(); + cudaq_dispatcher_stop(dispatcher); + + uint64_t total_processed = 0; + cudaq_dispatcher_get_processed(dispatcher, &total_processed); + std::cout << " Total packets processed: " << total_processed << std::endl; + + hololink_close(transceiver); + if (hololink_thread.joinable()) + hololink_thread.join(); + + cudaq_dispatcher_destroy(dispatcher); + cudaq_dispatch_manager_destroy(manager); + hololink_destroy_transceiver(transceiver); + + if (shutdown_flag) + cudaFreeHost(const_cast(shutdown_flag)); + if (d_stats) + cudaFree(d_stats); + + // Call tool-specific cleanup + if (config.cleanup_fn) + config.cleanup_fn(); + + std::cout << "\n*** Bridge shutdown complete ***" << std::endl; + return 0; +} + +/// @brief Default dispatch kernel launch wrapper. +/// +/// Matches cudaq_dispatch_launch_fn_t signature; delegates to +/// cudaq_launch_dispatch_kernel_regular from libcudaq-realtime. +inline void bridge_launch_dispatch_kernel( + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); +} + +} // namespace cudaq::realtime diff --git a/realtime/lib/CMakeLists.txt b/realtime/lib/CMakeLists.txt index 9193b29c..916f5e39 100644 --- a/realtime/lib/CMakeLists.txt +++ b/realtime/lib/CMakeLists.txt @@ -8,8 +8,8 @@ include(GNUInstallDirs) -install(DIRECTORY ${CUDAQ_NVQLINK_INCLUDE_DIR}/cudaq - COMPONENT nvqlink-headers +install(DIRECTORY ${CUDAQ_REALTIME_INCLUDE_DIR}/cudaq + COMPONENT cudaq-realtime-headers DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h" ) diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt index 5bd0e3f2..95d67ddc 100644 --- a/realtime/lib/daemon/CMakeLists.txt +++ b/realtime/lib/daemon/CMakeLists.txt @@ -21,16 +21,18 @@ if(CUDA_FOUND) target_include_directories(cudaq-realtime PUBLIC - $ + $ $ ) target_link_libraries(cudaq-realtime PUBLIC CUDA::cudart_static + PRIVATE + cudaq-realtime-host-dispatch ) - target_compile_definitions(cudaq-realtime PUBLIC NVQLINK_HAVE_CUDA) + target_compile_definitions(cudaq-realtime PUBLIC CUDAQ_REALTIME_HAVE_CUDA) set_target_properties(cudaq-realtime PROPERTIES CUDA_SEPARABLE_COMPILATION ON @@ -47,7 +49,7 @@ if(CUDA_FOUND) target_include_directories(cudaq-realtime-dispatch PUBLIC - $ + $ $ ) @@ -73,4 +75,36 @@ if(CUDA_FOUND) COMPONENT realtime-lib ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + + # ============================================================================ + # Host-side graph dispatcher (optional, for Grace Hopper / Grace Blackwell etc.) + # ============================================================================ + # Compiled with nvcc so libcu++ () works without extra + # include paths. Host-only code; no device code in this TU. + add_library(cudaq-realtime-host-dispatch SHARED + dispatcher/host_dispatcher.cu + dispatcher/host_dispatcher_capi.cu + ) + + target_include_directories(cudaq-realtime-host-dispatch + PUBLIC + $ + $ + ) + + target_link_libraries(cudaq-realtime-host-dispatch + PUBLIC + CUDA::cudart_static + ) + + set_target_properties(cudaq-realtime-host-dispatch PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime-host-dispatch + COMPONENT realtime-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) endif() diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp index 28216781..323be95e 100644 --- a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp +++ b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp @@ -6,9 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include +#include #include struct cudaq_dispatch_manager_t { @@ -24,6 +25,8 @@ struct cudaq_dispatcher_t { uint64_t *stats = nullptr; cudaStream_t stream = nullptr; bool running = false; + cudaq_host_dispatcher_handle_t *host_handle = nullptr; + void **h_mailbox_bank = nullptr; }; static bool is_valid_kernel_type(cudaq_kernel_type_t kernel_type) { @@ -40,6 +43,7 @@ static bool is_valid_dispatch_mode(cudaq_dispatch_mode_t dispatch_mode) { switch (dispatch_mode) { case CUDAQ_DISPATCH_DEVICE_CALL: case CUDAQ_DISPATCH_GRAPH_LAUNCH: + case CUDAQ_DISPATCH_HOST_CALL: return true; default: return false; @@ -49,16 +53,26 @@ static bool is_valid_dispatch_mode(cudaq_dispatch_mode_t dispatch_mode) { static cudaq_status_t validate_dispatcher(cudaq_dispatcher_t *dispatcher) { if (!dispatcher) return CUDAQ_ERR_INVALID_ARG; - if (!dispatcher->launch_fn || !dispatcher->shutdown_flag || - !dispatcher->stats) + if (!dispatcher->shutdown_flag || !dispatcher->stats) return CUDAQ_ERR_INVALID_ARG; if (!dispatcher->ringbuffer.rx_flags || !dispatcher->ringbuffer.tx_flags) return CUDAQ_ERR_INVALID_ARG; if (!dispatcher->table.entries || dispatcher->table.count == 0) return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.num_slots == 0 || dispatcher->config.slot_size == 0) + return CUDAQ_ERR_INVALID_ARG; + + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { + if (!dispatcher->ringbuffer.rx_flags_host || !dispatcher->ringbuffer.tx_flags_host || + !dispatcher->ringbuffer.rx_data_host || !dispatcher->ringbuffer.tx_data_host) + return CUDAQ_ERR_INVALID_ARG; + return CUDAQ_OK; + } + + if (!dispatcher->launch_fn) + return CUDAQ_ERR_INVALID_ARG; if (dispatcher->config.num_blocks == 0 || - dispatcher->config.threads_per_block == 0 || - dispatcher->config.num_slots == 0 || dispatcher->config.slot_size == 0) + dispatcher->config.threads_per_block == 0) return CUDAQ_ERR_INVALID_ARG; if (!is_valid_kernel_type(dispatcher->config.kernel_type) || !is_valid_dispatch_mode(dispatcher->config.dispatch_mode)) @@ -78,7 +92,8 @@ cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr) { } cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr) { - delete mgr; + if (mgr) + delete mgr; return CUDAQ_OK; } @@ -98,6 +113,11 @@ cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *, cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher) { if (!dispatcher) return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->running && dispatcher->host_handle) { + *dispatcher->shutdown_flag = 1; + cudaq_host_dispatcher_stop(dispatcher->host_handle); + dispatcher->host_handle = nullptr; + } delete dispatcher; return CUDAQ_OK; } @@ -133,12 +153,24 @@ cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, cudaq_status_t cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, cudaq_dispatch_launch_fn_t launch_fn) { - if (!dispatcher || !launch_fn) + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && launch_fn != nullptr) + return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.backend != CUDAQ_BACKEND_HOST_LOOP && !launch_fn) return CUDAQ_ERR_INVALID_ARG; dispatcher->launch_fn = launch_fn; return CUDAQ_OK; } +cudaq_status_t cudaq_dispatcher_set_mailbox(cudaq_dispatcher_t *dispatcher, + void **h_mailbox_bank) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->h_mailbox_bank = h_mailbox_bank; + return CUDAQ_OK; +} + cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { auto status = validate_dispatcher(dispatcher); if (status != CUDAQ_OK) @@ -151,11 +183,25 @@ cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { device_id = 0; if (cudaSetDevice(device_id) != cudaSuccess) return CUDAQ_ERR_CUDA; + + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { + dispatcher->host_handle = cudaq_host_dispatcher_start_thread( + &dispatcher->ringbuffer, &dispatcher->table, &dispatcher->config, + dispatcher->shutdown_flag, dispatcher->stats, + dispatcher->h_mailbox_bank); + if (!dispatcher->host_handle) + return CUDAQ_ERR_INTERNAL; + dispatcher->running = true; + return CUDAQ_OK; + } + if (cudaStreamCreate(&dispatcher->stream) != cudaSuccess) return CUDAQ_ERR_CUDA; dispatcher->launch_fn( dispatcher->ringbuffer.rx_flags, dispatcher->ringbuffer.tx_flags, + dispatcher->ringbuffer.rx_data, dispatcher->ringbuffer.tx_data, + dispatcher->ringbuffer.rx_stride_sz, dispatcher->ringbuffer.tx_stride_sz, dispatcher->table.entries, dispatcher->table.count, dispatcher->shutdown_flag, dispatcher->stats, dispatcher->config.num_slots, dispatcher->config.num_blocks, @@ -165,6 +211,8 @@ cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { if (err != cudaSuccess) { fprintf(stderr, "CUDA error in dispatcher launch: %s (%d)\n", cudaGetErrorString(err), err); + cudaStreamDestroy(dispatcher->stream); + dispatcher->stream = nullptr; return CUDAQ_ERR_CUDA; } @@ -178,6 +226,15 @@ cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher) { if (!dispatcher->running) return CUDAQ_OK; + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && + dispatcher->host_handle) { + *dispatcher->shutdown_flag = 1; + cudaq_host_dispatcher_stop(dispatcher->host_handle); + dispatcher->host_handle = nullptr; + dispatcher->running = false; + return CUDAQ_OK; + } + int shutdown = 1; if (cudaMemcpy(const_cast(dispatcher->shutdown_flag), &shutdown, sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) @@ -194,9 +251,83 @@ cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, if (!dispatcher || !out_packets || !dispatcher->stats) return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { + *out_packets = *dispatcher->stats; + return CUDAQ_OK; + } + if (cudaMemcpy(out_packets, dispatcher->stats, sizeof(uint64_t), cudaMemcpyDeviceToHost) != cudaSuccess) return CUDAQ_ERR_CUDA; return CUDAQ_OK; } + +//============================================================================== +// Ring buffer slot helpers +//============================================================================== + +cudaq_status_t cudaq_host_ringbuffer_write_rpc_request( + const cudaq_ringbuffer_t *rb, uint32_t slot_idx, uint32_t function_id, + const void *payload, uint32_t payload_len) { + if (!rb || !rb->rx_data_host) + return CUDAQ_ERR_INVALID_ARG; + if (CUDAQ_RPC_HEADER_SIZE + payload_len > rb->rx_stride_sz) + return CUDAQ_ERR_INVALID_ARG; + + uint8_t *slot = rb->rx_data_host + slot_idx * rb->rx_stride_sz; + uint32_t *hdr = reinterpret_cast(slot); + hdr[0] = CUDAQ_RPC_MAGIC_REQUEST; + hdr[1] = function_id; + hdr[2] = payload_len; + + if (payload && payload_len > 0) + std::memcpy(slot + CUDAQ_RPC_HEADER_SIZE, payload, payload_len); + + return CUDAQ_OK; +} + +void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx) { + __sync_synchronize(); + const_cast( + rb->rx_flags_host)[slot_idx] = reinterpret_cast( + rb->rx_data_host + slot_idx * rb->rx_stride_sz); +} + +cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( + const cudaq_ringbuffer_t *rb, uint32_t slot_idx, int *out_cuda_error) { + uint64_t v = rb->tx_flags_host[slot_idx]; + if (v == 0) + return CUDAQ_TX_EMPTY; + if (v == 0xEEEEEEEEEEEEEEEEULL) + return CUDAQ_TX_IN_FLIGHT; + if ((v >> 48) == 0xDEAD) { + if (out_cuda_error) + *out_cuda_error = static_cast(v & 0xFFFF); + return CUDAQ_TX_ERROR; + } + return CUDAQ_TX_READY; +} + +int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx) { + return rb->rx_flags_host[slot_idx] == 0 && rb->tx_flags_host[slot_idx] == 0; +} + +void cudaq_host_ringbuffer_clear_slot(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx) { + const_cast(rb->tx_flags_host)[slot_idx] = 0; + __sync_synchronize(); +} + +cudaq_status_t cudaq_host_release_worker(cudaq_dispatcher_t *dispatcher, + int worker_id) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.backend != CUDAQ_BACKEND_HOST_LOOP || + !dispatcher->host_handle) + return CUDAQ_ERR_INVALID_ARG; + return cudaq_host_dispatcher_release_worker(dispatcher->host_handle, + worker_id); +} diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu index fcfa7f9a..dceac063 100644 --- a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu +++ b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu @@ -1,21 +1,22 @@ -// Copyright (c) 2025 - Present NVIDIA Corporation & Affiliates. -// All rights reserved. -// -// This source code and the accompanying materials are made available under -// the terms of the Apache License 2.0 which accompanies this distribution. - -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" -#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" +/******************************************************************************* + * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" -#include #include #include #include -namespace cudaq::nvqlink { +namespace cudaq::realtime { //============================================================================== // Dispatch Kernel Implementation (compiled into libcudaq-realtime.so) @@ -37,10 +38,23 @@ __device__ inline const cudaq_function_entry_t* dispatch_lookup_entry( /// @brief Dispatch kernel for DEVICE_CALL mode only (no graph launch support). /// This kernel does not contain any device-side graph launch code, avoiding /// compatibility issues on systems where cudaGraphLaunch is not supported. +/// +/// Supports symmetric RX/TX data buffers for Hololink compatibility: +/// - RX data address comes from rx_flags[slot] (set by Hololink RX kernel) +/// - TX response is written to tx_data + slot * tx_stride_sz +/// - tx_flags[slot] is set to the TX slot address +/// +/// When KernelType::is_cooperative is true, the kernel is launched via +/// cudaLaunchCooperativeKernel and ALL threads participate in calling the +/// RPC handler (needed for multi-block cooperative decode kernels like BP). +/// Thread 0 polls/parses the header, broadcasts work via shared memory, +/// then all threads call the handler after a grid.sync(). template __global__ void dispatch_kernel_device_call_only( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* tx_data, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -50,55 +64,197 @@ __global__ void dispatch_kernel_device_call_only( std::uint64_t local_packet_count = 0; std::size_t current_slot = 0; - while (!(*shutdown_flag)) { - if (tid == 0) { - std::uint64_t rx_value = rx_flags[current_slot]; - if (rx_value != 0) { + if constexpr (KernelType::is_cooperative) { + //========================================================================== + // Cooperative path: ALL threads call the handler. + // + // Work descriptor in shared memory (block 0 broadcasts via grid.sync). + // Only block 0 needs shared memory for the descriptor; other blocks + // read the device-memory copies after the grid barrier. + //========================================================================== + __shared__ DeviceRPCFunction s_func; + __shared__ void* s_arg_buffer; + __shared__ std::uint8_t* s_output_buffer; + __shared__ std::uint32_t s_arg_len; + __shared__ std::uint32_t s_max_result_len; + __shared__ bool s_have_work; + + // Device-memory work descriptor visible to all blocks after grid.sync. + // We use a single set since the cooperative kernel processes one RPC at + // a time (all threads participate, so no pipelining). + __device__ static DeviceRPCFunction d_func; + __device__ static void* d_arg_buffer; + __device__ static std::uint8_t* d_output_buffer; + __device__ static std::uint32_t d_arg_len; + __device__ static std::uint32_t d_max_result_len; + __device__ static bool d_have_work; + + while (!(*shutdown_flag)) { + // --- Phase 1: Thread 0 polls and parses --- + if (tid == 0) { + s_have_work = false; + std::uint64_t rx_value = rx_flags[current_slot]; + if (rx_value != 0) { + void* rx_slot = reinterpret_cast(rx_value); + RPCHeader* header = static_cast(rx_slot); + if (header->magic == RPC_MAGIC_REQUEST) { + const cudaq_function_entry_t* entry = dispatch_lookup_entry( + header->function_id, function_table, func_count); + if (entry != nullptr && + entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { + std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + + s_func = reinterpret_cast( + entry->handler.device_fn_ptr); + s_arg_buffer = static_cast(header + 1); + s_output_buffer = tx_slot + sizeof(RPCResponse); + s_arg_len = header->arg_len; + s_max_result_len = tx_stride_sz - sizeof(RPCResponse); + s_have_work = true; + + // Publish to device memory for other blocks + d_func = s_func; + d_arg_buffer = s_arg_buffer; + d_output_buffer = s_output_buffer; + d_arg_len = s_arg_len; + d_max_result_len = s_max_result_len; + d_have_work = true; + } + } + if (!s_have_work) { + // Bad magic or unsupported mode -- discard + __threadfence_system(); + rx_flags[current_slot] = 0; + } + } + } - bool packet_consumed = false; + // --- Phase 2: Broadcast to all threads --- + KernelType::sync(); + + // Non-block-0 threads read from device memory + bool have_work; + DeviceRPCFunction func; + void* arg_buffer; + std::uint8_t* output_buffer; + std::uint32_t arg_len; + std::uint32_t max_result_len; + if (blockIdx.x == 0) { + have_work = s_have_work; + func = s_func; + arg_buffer = s_arg_buffer; + output_buffer = s_output_buffer; + arg_len = s_arg_len; + max_result_len = s_max_result_len; + } else { + have_work = d_have_work; + func = d_func; + arg_buffer = d_arg_buffer; + output_buffer = d_output_buffer; + arg_len = d_arg_len; + max_result_len = d_max_result_len; + } - void* data_buffer = reinterpret_cast(rx_value); - RPCHeader* header = static_cast(data_buffer); + // --- Phase 3: ALL threads call the handler --- + std::uint32_t result_len = 0; + int status = 0; + if (have_work) { + status = func(arg_buffer, output_buffer, arg_len, + max_result_len, &result_len); + } + + // --- Phase 4: Sync, then thread 0 writes response --- + KernelType::sync(); + + if (tid == 0 && have_work) { + std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + RPCResponse* response = reinterpret_cast(tx_slot); + response->magic = RPC_MAGIC_RESPONSE; + response->status = status; + response->result_len = result_len; + + __threadfence_system(); + tx_flags[current_slot] = reinterpret_cast(tx_slot); + + __threadfence_system(); + rx_flags[current_slot] = 0; + local_packet_count++; + current_slot = (current_slot + 1) % num_slots; + } + + // Reset device-memory work flag for next iteration + if (tid == 0) { + d_have_work = false; + } + + KernelType::sync(); + + if ((local_packet_count & 0xFF) == 0) { + __threadfence_system(); + } + } + } else { + //========================================================================== + // Regular path: only thread 0 calls the handler (unchanged). + //========================================================================== + while (!(*shutdown_flag)) { + if (tid == 0) { + std::uint64_t rx_value = rx_flags[current_slot]; + if (rx_value != 0) { + // RX data address comes from rx_flags (set by Hololink RX kernel + // or host test harness to the address of the RX data slot) + void* rx_slot = reinterpret_cast(rx_value); + RPCHeader* header = static_cast(rx_slot); + if (header->magic != RPC_MAGIC_REQUEST) { + __threadfence_system(); + rx_flags[current_slot] = 0; + continue; + } + + std::uint32_t function_id = header->function_id; + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); - if (header->magic != RPC_MAGIC_REQUEST) { - packet_consumed = true; // Garbage data, consume it to clear it - } else { const cudaq_function_entry_t* entry = dispatch_lookup_entry( - header->function_id, function_table, func_count); + function_id, function_table, func_count); if (entry != nullptr && entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = + DeviceRPCFunction func = reinterpret_cast(entry->handler.device_fn_ptr); + + // Compute TX slot address from symmetric TX data buffer + std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + + // Handler writes results directly to TX slot (after response header) + std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse); std::uint32_t result_len = 0; - std::uint32_t max_result_len = 1024; - void* arg_buffer = static_cast(header + 1); - int status = func(arg_buffer, header->arg_len, max_result_len, &result_len); + std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); + int status = func(arg_buffer, output_buffer, arg_len, + max_result_len, &result_len); - RPCResponse* response = static_cast(data_buffer); + // Write RPC response header to TX slot + RPCResponse* response = reinterpret_cast(tx_slot); response->magic = RPC_MAGIC_RESPONSE; response->status = status; response->result_len = result_len; __threadfence_system(); - tx_flags[current_slot] = rx_value; + // Signal TX with the TX slot address (symmetric with Hololink TX kernel) + tx_flags[current_slot] = reinterpret_cast(tx_slot); } - // Whether the entry was found or not, consume the packet - packet_consumed = true; - } - if (packet_consumed) { __threadfence_system(); rx_flags[current_slot] = 0; local_packet_count++; + current_slot = (current_slot + 1) % num_slots; } - current_slot = (current_slot + 1) % num_slots; } - } - KernelType::sync(); + KernelType::sync(); - if ((local_packet_count & 0xFF) == 0) { - __threadfence_system(); + if ((local_packet_count & 0xFF) == 0) { + __threadfence_system(); + } } } @@ -108,15 +264,19 @@ __global__ void dispatch_kernel_device_call_only( } /// @brief Dispatch kernel supporting both DEVICE_CALL and GRAPH_LAUNCH modes. -/// This kernel includes device-side graph launch code for sm_80+ (compute capability >= 8.0). +/// This kernel includes device-side graph launch code and requires compute capability >= 9.0. /// NOTE: Graph launch code is conditionally compiled based on __CUDA_ARCH__. +/// +/// Supports symmetric RX/TX data buffers for Hololink compatibility. template __global__ void dispatch_kernel_with_graph( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* tx_data, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, - void** global_mailbox_bank, + GraphIOContext* graph_io_ctx, volatile int* shutdown_flag, std::uint64_t* stats, std::size_t num_slots) { @@ -128,108 +288,72 @@ __global__ void dispatch_kernel_with_graph( if (tid == 0) { std::uint64_t rx_value = rx_flags[current_slot]; if (rx_value != 0) { - - bool packet_consumed = false; + void* rx_slot = reinterpret_cast(rx_value); + RPCHeader* header = static_cast(rx_slot); + if (header->magic != RPC_MAGIC_REQUEST) { + __threadfence_system(); + rx_flags[current_slot] = 0; + continue; + } + + std::uint32_t function_id = header->function_id; + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); - void* data_buffer = reinterpret_cast(rx_value); - RPCHeader* header = static_cast(data_buffer); + const cudaq_function_entry_t* entry = dispatch_lookup_entry( + function_id, function_table, func_count); - if (header->magic != RPC_MAGIC_REQUEST) { - packet_consumed = true; // Garbage data, consume it to clear it - } else { - const cudaq_function_entry_t* entry = dispatch_lookup_entry( - header->function_id, function_table, func_count); - - if (entry != nullptr) { - if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = - reinterpret_cast(entry->handler.device_fn_ptr); - std::uint32_t result_len = 0; - std::uint32_t max_result_len = 1024; - void* arg_buffer = static_cast(header + 1); - int status = func(arg_buffer, header->arg_len, max_result_len, &result_len); - - RPCResponse* response = static_cast(data_buffer); - response->magic = RPC_MAGIC_RESPONSE; - response->status = status; - response->result_len = result_len; + // Compute TX slot address from symmetric TX data buffer + std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + + if (entry != nullptr) { + if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { + DeviceRPCFunction func = + reinterpret_cast(entry->handler.device_fn_ptr); + + // Handler writes results directly to TX slot (after response header) + std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse); + std::uint32_t result_len = 0; + std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); + int status = func(arg_buffer, output_buffer, arg_len, + max_result_len, &result_len); + // Write RPC response to TX slot + RPCResponse* response = reinterpret_cast(tx_slot); + response->magic = RPC_MAGIC_RESPONSE; + response->status = status; + response->result_len = result_len; + + __threadfence_system(); + tx_flags[current_slot] = reinterpret_cast(tx_slot); + } +#if __CUDA_ARCH__ >= 900 + else if (entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) { + // Fill IO context so the graph kernel can read input from + // rx_slot, write the RPCResponse to tx_slot, and signal + // completion by setting *tx_flag = tx_flag_value. + if (graph_io_ctx != nullptr) { + graph_io_ctx->rx_slot = rx_slot; + graph_io_ctx->tx_slot = tx_slot; + graph_io_ctx->tx_flag = &tx_flags[current_slot]; + graph_io_ctx->tx_flag_value = + reinterpret_cast(tx_slot); + graph_io_ctx->tx_stride_sz = tx_stride_sz; __threadfence_system(); - tx_flags[current_slot] = rx_value; - packet_consumed = true; - } -#if __CUDA_ARCH__ >= 800 - else if (entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) { - - int mailbox_idx = static_cast(entry->mailbox_idx); - - // --- SINGLE-LAUNCH GUARD (fixes review issue #1) --- - // Check d_inflight_flag first: if a previous graph execution - // is still in flight for this predecoder, skip it. The output - // kernel clears this flag when it finishes. - volatile int* d_inflight = entry->d_inflight_flag; - bool already_in_flight = (d_inflight != nullptr && *d_inflight == 1); - - // --- BACKPRESSURE CHECK --- - // Even if not in-flight, the CPU queue may be full. - bool queue_full = false; - if (!already_in_flight) { - int* d_queue_idx = entry->d_queue_idx; - auto* d_ready_flags = static_cast*>(entry->d_ready_flags); - if (d_queue_idx != nullptr && d_ready_flags != nullptr) { - int current_tail = *d_queue_idx; - if (d_ready_flags[current_tail].load(cuda::std::memory_order_acquire) == 1) { - queue_full = true; - } - } - } - // ------------------------------- - - if (already_in_flight || queue_full) { - // Do NOT launch. Packet stays in ring buffer for retry. - packet_consumed = false; - } else { - // CLEAR TO LAUNCH: set inflight flag, write mailbox, launch graph. - if (d_inflight != nullptr) { - *d_inflight = 1; - __threadfence_system(); // Ensure flag is visible before graph reads it - } - - if (global_mailbox_bank != nullptr) { - global_mailbox_bank[mailbox_idx] = data_buffer; - __threadfence_system(); - } - - cudaError_t launch_err = cudaGraphLaunch(entry->handler.graph_exec, cudaStreamGraphFireAndForget); - if (launch_err != cudaSuccess) { - // Launch failed: write error code to tx_flags for host diagnostics - // Error codes are small integers, distinguishable from valid pointers - tx_flags[current_slot] = 0xDEAD000000000000ULL | (uint64_t)launch_err; - __threadfence_system(); - // Roll back inflight flag since graph never ran - if (d_inflight != nullptr) { - *d_inflight = 0; - __threadfence_system(); - } - } - packet_consumed = true; - } } -#endif // __CUDA_ARCH__ >= 800 - } else { - packet_consumed = true; // Unknown function, drop it + + // Launch pre-created graph (fire-and-forget is async; the + // graph kernel is responsible for writing the response and + // signaling tx_flag when done). + cudaGraphLaunch(entry->handler.graph_exec, + cudaStreamGraphFireAndForget); } +#endif // __CUDA_ARCH__ >= 900 } - // --- ADVANCE LOGIC --- - if (packet_consumed) { - __threadfence_system(); - rx_flags[current_slot] = 0; // Clear the slot ONLY if we launched it - local_packet_count++; - } - - // ALWAYS advance the slot pointer to keep checking other arrivals - // If we skipped a packet due to backpressure, we will loop back to it eventually. + __threadfence_system(); + rx_flags[current_slot] = 0; + local_packet_count++; current_slot = (current_slot + 1) % num_slots; } } @@ -246,15 +370,46 @@ __global__ void dispatch_kernel_with_graph( } } -} // namespace cudaq::nvqlink +} // namespace cudaq::realtime //============================================================================== // Host Launch Functions //============================================================================== +// Force eager CUDA module loading for the dispatch kernel. +// Call before launching persistent kernels to avoid lazy-loading deadlocks. +extern "C" cudaError_t cudaq_dispatch_kernel_query_occupancy( + int* out_blocks, uint32_t threads_per_block) { + int num_blocks = 0; + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, + cudaq::realtime::dispatch_kernel_device_call_only, + threads_per_block, 0); + if (err != cudaSuccess) return err; + if (out_blocks) *out_blocks = num_blocks; + return cudaSuccess; +} + +extern "C" cudaError_t cudaq_dispatch_kernel_cooperative_query_occupancy( + int* out_blocks, uint32_t threads_per_block) { + int num_blocks = 0; + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, + cudaq::realtime::dispatch_kernel_device_call_only< + cudaq::realtime::CooperativeKernel>, + threads_per_block, 0); + if (err != cudaSuccess) return err; + if (out_blocks) *out_blocks = num_blocks; + return cudaSuccess; +} + extern "C" void cudaq_launch_dispatch_kernel_regular( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -264,15 +419,24 @@ extern "C" void cudaq_launch_dispatch_kernel_regular( std::uint32_t threads_per_block, cudaStream_t stream) { // Use device-call-only kernel (no graph launch support) - cudaq::nvqlink::dispatch_kernel_device_call_only + // Note: rx_data/rx_stride_sz are available in the ringbuffer struct but + // not passed to the kernel since it reads RX addresses from rx_flags. + (void)rx_data; + (void)rx_stride_sz; + cudaq::realtime::dispatch_kernel_device_call_only <<>>( - rx_flags, tx_flags, function_table, func_count, + rx_flags, tx_flags, tx_data, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots); } extern "C" void cudaq_launch_dispatch_kernel_cooperative( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -281,9 +445,13 @@ extern "C" void cudaq_launch_dispatch_kernel_cooperative( std::uint32_t num_blocks, std::uint32_t threads_per_block, cudaStream_t stream) { + (void)rx_data; + (void)rx_stride_sz; void* kernel_args[] = { const_cast(&rx_flags), const_cast(&tx_flags), + &tx_data, + &tx_stride_sz, &function_table, &func_count, const_cast(&shutdown_flag), @@ -293,7 +461,7 @@ extern "C" void cudaq_launch_dispatch_kernel_cooperative( cudaLaunchCooperativeKernel( reinterpret_cast( - cudaq::nvqlink::dispatch_kernel_device_call_only), + cudaq::realtime::dispatch_kernel_device_call_only), dim3(num_blocks), dim3(threads_per_block), kernel_args, 0, stream); } @@ -318,9 +486,11 @@ struct cudaq_dispatch_graph_context { // Persistent storage for kernel parameters (must outlive graph execution) volatile std::uint64_t* rx_flags; volatile std::uint64_t* tx_flags; + std::uint8_t* tx_data; + std::size_t tx_stride_sz; cudaq_function_entry_t* function_table; std::size_t func_count; - void** global_mailbox_bank; + cudaq::realtime::GraphIOContext* graph_io_ctx; volatile int* shutdown_flag; std::uint64_t* stats; std::size_t num_slots; @@ -329,9 +499,13 @@ struct cudaq_dispatch_graph_context { extern "C" cudaError_t cudaq_create_dispatch_graph_regular( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, - void** global_mailbox_bank, + void* graph_io_ctx_raw, volatile int* shutdown_flag, std::uint64_t* stats, std::size_t num_slots, @@ -340,6 +514,8 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( cudaStream_t stream, cudaq_dispatch_graph_context** out_context) { + (void)rx_data; + (void)rx_stride_sz; cudaError_t err; // Allocate context with persistent parameter storage @@ -349,9 +525,12 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( // Store parameters persistently in the context ctx->rx_flags = rx_flags; ctx->tx_flags = tx_flags; + ctx->tx_data = tx_data; + ctx->tx_stride_sz = tx_stride_sz; ctx->function_table = function_table; ctx->func_count = func_count; - ctx->global_mailbox_bank = global_mailbox_bank; + ctx->graph_io_ctx = + static_cast(graph_io_ctx_raw); ctx->shutdown_flag = shutdown_flag; ctx->stats = stats; ctx->num_slots = num_slots; @@ -368,16 +547,18 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( void* kernel_args[] = { &ctx->rx_flags, &ctx->tx_flags, + &ctx->tx_data, + &ctx->tx_stride_sz, &ctx->function_table, &ctx->func_count, - &ctx->global_mailbox_bank, + &ctx->graph_io_ctx, &ctx->shutdown_flag, &ctx->stats, &ctx->num_slots }; kernel_params.func = reinterpret_cast( - cudaq::nvqlink::dispatch_kernel_with_graph); + cudaq::realtime::dispatch_kernel_with_graph); kernel_params.gridDim = dim3(num_blocks, 1, 1); kernel_params.blockDim = dim3(threads_per_block, 1, 1); kernel_params.sharedMemBytes = 0; diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu new file mode 100644 index 00000000..abb52d87 --- /dev/null +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -0,0 +1,178 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + +namespace cudaq::realtime { + +//----------------------------------------------------------------------------- +// Helpers: function table lookup +//----------------------------------------------------------------------------- + +static const cudaq_function_entry_t* lookup_function(cudaq_function_entry_t* table, + size_t count, + uint32_t function_id) { + for (size_t i = 0; i < count; ++i) { + if (table[i].function_id == function_id) + return &table[i]; + } + return nullptr; +} + +static int find_idle_graph_worker_for_function(const HostDispatcherConfig& config, + uint32_t function_id) { + uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); + while (mask != 0) { + int worker_id = __builtin_ffsll(static_cast(mask)) - 1; + if (config.workers[static_cast(worker_id)].function_id == function_id) + return worker_id; + mask &= ~(1ULL << worker_id); + } + return -1; +} + +/// Result of parsing the slot when a function table is in use. +struct ParsedSlot { + uint32_t function_id = 0; + const cudaq_function_entry_t* entry = nullptr; + bool drop = false; // true => invalid magic or unknown function_id; clear slot and advance +}; + +static ParsedSlot parse_slot_with_function_table(void* slot_host, + const HostDispatcherConfig& config) { + ParsedSlot out; + const RPCHeader* header = static_cast(slot_host); + if (header->magic != RPC_MAGIC_REQUEST) { + out.drop = true; + return out; + } + out.function_id = header->function_id; + out.entry = lookup_function(config.function_table, config.function_table_count, + out.function_id); + if (!out.entry) + out.drop = true; + return out; +} + +/// Clear rx_flag for this slot, increment stats, advance slot index. +static void finish_slot_and_advance(const HostDispatcherConfig& config, + size_t& current_slot, + size_t num_slots, + uint64_t& packets_dispatched) { + config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); + packets_dispatched++; + if (config.live_dispatched) + config.live_dispatched->fetch_add(1, cuda::std::memory_order_relaxed); + current_slot = (current_slot + 1) % num_slots; +} + +/// Acquire a graph worker (by function_id if table in use, else any idle worker). +static int acquire_graph_worker(const HostDispatcherConfig& config, + bool use_function_table, + const cudaq_function_entry_t* entry, + uint32_t function_id) { + if (use_function_table && entry && entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) + return find_idle_graph_worker_for_function(config, function_id); + uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); + if (mask == 0) + return -1; + return __builtin_ffsll(static_cast(mask)) - 1; +} + +/// Launch the graph for the given worker; set tx_flags on success or error. +static void launch_graph_worker(const HostDispatcherConfig& config, + int worker_id, + void* slot_host, + size_t current_slot) { + config.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); + config.inflight_slot_tags[worker_id] = static_cast(current_slot); + + ptrdiff_t offset = static_cast(slot_host) - config.rx_data_host; + void* data_dev = static_cast(config.rx_data_dev + offset); + config.h_mailbox_bank[worker_id] = data_dev; + __sync_synchronize(); + + const size_t w = static_cast(worker_id); + cudaError_t err = cudaGraphLaunch(config.workers[w].graph_exec, config.workers[w].stream); + + if (err != cudaSuccess) { + uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; + config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); + config.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } else { + uint64_t tx_slot_addr = + (config.tx_data_host != nullptr && config.tx_data_dev != nullptr) + ? reinterpret_cast(config.tx_data_host + + current_slot * config.tx_stride_sz) + : 0xEEEEEEEEEEEEEEEEULL; + config.tx_flags[current_slot].store(tx_slot_addr, cuda::std::memory_order_release); + } +} + +//----------------------------------------------------------------------------- +// Main loop +//----------------------------------------------------------------------------- + +void host_dispatcher_loop(const HostDispatcherConfig& config) { + size_t current_slot = 0; + const size_t num_slots = config.num_slots; + uint64_t packets_dispatched = 0; + const bool use_function_table = + (config.function_table != nullptr && config.function_table_count > 0); + + while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { + uint64_t rx_value = config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); + + if (rx_value == 0) { + QEC_CPU_RELAX(); + continue; + } + + void* slot_host = reinterpret_cast(rx_value); + uint32_t function_id = 0; + const cudaq_function_entry_t* entry = nullptr; + + if (use_function_table) { + ParsedSlot parsed = parse_slot_with_function_table(slot_host, config); + if (parsed.drop) { + config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); + current_slot = (current_slot + 1) % num_slots; + continue; + } + function_id = parsed.function_id; + entry = parsed.entry; + } + + // Only GRAPH_LAUNCH is dispatched; HOST_CALL and DEVICE_CALL are dropped. + if (entry && entry->dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH) { + config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); + current_slot = (current_slot + 1) % num_slots; + continue; + } + + int worker_id = acquire_graph_worker(config, use_function_table, entry, function_id); + if (worker_id < 0) { + QEC_CPU_RELAX(); + continue; + } + + launch_graph_worker(config, worker_id, slot_host, current_slot); + finish_slot_and_advance(config, current_slot, num_slots, packets_dispatched); + } + + for (const auto& w : config.workers) { + cudaStreamSynchronize(w.stream); + } + + if (config.stats_counter) { + *config.stats_counter = packets_dispatched; + } +} + +} // namespace cudaq::realtime diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu new file mode 100644 index 00000000..e9c5be95 --- /dev/null +++ b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu @@ -0,0 +1,157 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#include +#include +#include +#include +#include + +struct cudaq_host_dispatcher_handle { + std::thread thread; + std::vector workers; + cudaq::realtime::atomic_uint64_sys* idle_mask = nullptr; + int* inflight_slot_tags = nullptr; + void** h_mailbox_bank = nullptr; + bool owns_mailbox = false; + size_t num_workers = 0; +}; + +static size_t count_graph_launch_workers(const cudaq_function_table_t* table) { + size_t n = 0; + for (uint32_t i = 0; i < table->count; ++i) { + if (table->entries[i].dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) + ++n; + } + return n; +} + +extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( + const cudaq_ringbuffer_t* ringbuffer, + const cudaq_function_table_t* table, + const cudaq_dispatcher_config_t* config, + volatile int* shutdown_flag, + uint64_t* stats, + void** external_mailbox) { + if (!ringbuffer || !table || !config || !shutdown_flag || !stats) + return nullptr; + if (!ringbuffer->rx_flags_host || !ringbuffer->tx_flags_host || + !ringbuffer->rx_data_host || !ringbuffer->tx_data_host) + return nullptr; + if (!table->entries || table->count == 0) + return nullptr; + if (config->num_slots == 0 || config->slot_size == 0) + return nullptr; + + const size_t num_workers = count_graph_launch_workers(table); + if (num_workers == 0) + return nullptr; + + auto* handle = new (std::nothrow) cudaq_host_dispatcher_handle(); + if (!handle) + return nullptr; + + handle->idle_mask = new (std::nothrow) cudaq::realtime::atomic_uint64_sys(0); + handle->inflight_slot_tags = new (std::nothrow) int[num_workers]; + if (external_mailbox) { + handle->h_mailbox_bank = external_mailbox; + handle->owns_mailbox = false; + } else { + handle->h_mailbox_bank = new (std::nothrow) void*[num_workers]; + handle->owns_mailbox = true; + } + if (!handle->idle_mask || !handle->inflight_slot_tags || !handle->h_mailbox_bank) { + delete handle->idle_mask; + delete[] handle->inflight_slot_tags; + if (handle->owns_mailbox) + delete[] handle->h_mailbox_bank; + delete handle; + return nullptr; + } + + std::memset(handle->inflight_slot_tags, 0, num_workers * sizeof(int)); + + handle->workers.reserve(num_workers); + for (uint32_t i = 0; i < table->count; ++i) { + if (table->entries[i].dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH) + continue; + cudaStream_t stream = nullptr; + if (cudaStreamCreate(&stream) != cudaSuccess) { + for (auto& w : handle->workers) + cudaStreamDestroy(w.stream); + delete handle->idle_mask; + delete[] handle->inflight_slot_tags; + delete[] handle->h_mailbox_bank; + delete handle; + return nullptr; + } + cudaq::realtime::HostDispatchWorker w; + w.graph_exec = table->entries[i].handler.graph_exec; + w.stream = stream; + w.function_id = table->entries[i].function_id; + handle->workers.push_back(w); + } + handle->num_workers = num_workers; + + handle->idle_mask->store((1ULL << num_workers) - 1, + cuda::std::memory_order_release); + + cudaq::realtime::HostDispatcherConfig host_config; + host_config.rx_flags = + (cudaq::realtime::atomic_uint64_sys*)(uintptr_t)ringbuffer->rx_flags_host; + host_config.tx_flags = + (cudaq::realtime::atomic_uint64_sys*)(uintptr_t)ringbuffer->tx_flags_host; + host_config.rx_data_host = ringbuffer->rx_data_host; + host_config.rx_data_dev = ringbuffer->rx_data; + host_config.tx_data_host = ringbuffer->tx_data_host; + host_config.tx_data_dev = ringbuffer->tx_data; + host_config.tx_stride_sz = ringbuffer->tx_stride_sz; + host_config.h_mailbox_bank = handle->h_mailbox_bank; + host_config.num_slots = config->num_slots; + host_config.slot_size = config->slot_size; + host_config.workers = handle->workers; + host_config.function_table = table->entries; + host_config.function_table_count = table->count; + host_config.shutdown_flag = + (cudaq::realtime::atomic_int_sys*)(uintptr_t)shutdown_flag; + host_config.stats_counter = stats; + host_config.live_dispatched = nullptr; + host_config.idle_mask = handle->idle_mask; + host_config.inflight_slot_tags = handle->inflight_slot_tags; + + handle->thread = std::thread(cudaq::realtime::host_dispatcher_loop, host_config); + return handle; +} + +extern "C" cudaq_status_t cudaq_host_dispatcher_release_worker( + cudaq_host_dispatcher_handle_t* handle, int worker_id) { + if (!handle || !handle->idle_mask) + return CUDAQ_ERR_INVALID_ARG; + if (worker_id < 0 || static_cast(worker_id) >= handle->num_workers) + return CUDAQ_ERR_INVALID_ARG; + handle->idle_mask->fetch_or(1ULL << worker_id, + cuda::std::memory_order_release); + return CUDAQ_OK; +} + +extern "C" void cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t* handle) { + if (!handle) + return; + if (handle->thread.joinable()) + handle->thread.join(); + for (auto& w : handle->workers) + cudaStreamDestroy(w.stream); + delete handle->idle_mask; + delete[] handle->inflight_slot_tags; + if (handle->owns_mailbox) + delete[] handle->h_mailbox_bank; + delete handle; +} diff --git a/realtime/scripts/install_dev_prerequisites.sh b/realtime/scripts/install_dev_prerequisites.sh new file mode 100755 index 00000000..bf8c57f4 --- /dev/null +++ b/realtime/scripts/install_dev_prerequisites.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Usage: +# This script builds and installs a minimal set of dependencies needed to build +# CUDA-Q realtime from source. +# +# Usage: +# bash install_dev_prerequisites.sh + + +if [ -x "$(command -v apt-get)" ]; then + # [libibverbs] + echo "Installing libibverbs..." + apt-get update && apt-get install -y --no-install-recommends libibverbs-dev + + # [DOCA Host] + + if [ ! -x "$(command -v curl)" ]; then + apt-get update && apt-get install -y --no-install-recommends curl + fi + + DOCA_VERSION=3.2.1 + echo "Installing DOCA version $DOCA_VERSION..." + arch=$(uname -m) + distro=$(. /etc/os-release && echo ${ID}${VERSION_ID}) # e.g., ubuntu24.04 + export DOCA_URL="https://linux.mellanox.com/public/repo/doca/$DOCA_VERSION/$distro/$arch/" + echo "Using DOCA_REPO_LINK=${DOCA_URL}" + curl https://linux.mellanox.com/public/repo/doca/GPG-KEY-Mellanox.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub + echo "deb [signed-by=/etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub] $DOCA_URL ./" > /etc/apt/sources.list.d/doca.list + apt-get update + DEBIAN_FRONTEND=noninteractive apt-get -y install doca-all + + # [Holoscan SDK] + CUDA_MAJOR_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p') + if [ -z "$CUDA_MAJOR_VERSION" ]; then + echo "Could not determine CUDA version from nvcc. Is the CUDA toolkit installed?" >&2 + exit 1 + fi + apt-get update && apt-get install -y --no-install-recommends holoscan-cuda-$CUDA_MAJOR_VERSION + +elif [ -x "$(command -v dnf)" ]; then + echo "TODO: Support RHEL." >&2 +else + echo "No supported package manager detected." >&2 +fi diff --git a/realtime/unittests/CMakeLists.txt b/realtime/unittests/CMakeLists.txt index ee5e41bd..048f8e88 100644 --- a/realtime/unittests/CMakeLists.txt +++ b/realtime/unittests/CMakeLists.txt @@ -48,7 +48,7 @@ if(CMAKE_CUDA_COMPILER) target_include_directories(test_dispatch_kernel PRIVATE ${CUDAToolkit_INCLUDE_DIRS} - ${CUDAQ_NVQLINK_INCLUDE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} ) # Find CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) @@ -65,14 +65,40 @@ if(CMAKE_CUDA_COMPILER) ${CUDADEVRT_LIBRARY} ) - add_dependencies(NVQLINKUnitTests test_dispatch_kernel) + add_dependencies(CudaqRealtimeUnitTests test_dispatch_kernel) gtest_discover_tests(test_dispatch_kernel TEST_PREFIX "test_dispatch_kernel." ) message(STATUS " - test_dispatch_kernel (GPU dispatch infrastructure)") + + # Host dispatcher tests (CUDAQ_BACKEND_HOST_LOOP) + add_executable(test_host_dispatcher test_host_dispatcher.cu) + set_target_properties(test_host_dispatcher PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + ) + target_include_directories(test_host_dispatcher PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + target_link_libraries(test_host_dispatcher PRIVATE + GTest::gtest_main + CUDA::cudart + cudaq-realtime + cudaq-realtime-host-dispatch + ) + add_dependencies(CudaqRealtimeUnitTests test_host_dispatcher) + gtest_discover_tests(test_host_dispatcher + TEST_PREFIX "test_host_dispatcher." + ) + message(STATUS " - test_host_dispatcher (host dispatcher loop)") endif() +# ============================================================================== +# Hololink bridge/emulator/playback tools (optional, not CI) # ============================================================================== - +if (CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS) + add_subdirectory(utils) +endif() diff --git a/realtime/unittests/test_dispatch_kernel.cu b/realtime/unittests/test_dispatch_kernel.cu index eae65dcc..bef7e049 100644 --- a/realtime/unittests/test_dispatch_kernel.cu +++ b/realtime/unittests/test_dispatch_kernel.cu @@ -14,10 +14,10 @@ #include #include -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_modes.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" // Helper macro for CUDA error checking #define CUDA_CHECK(call) \ @@ -33,12 +33,14 @@ namespace { //============================================================================== /// @brief Test handler that adds 1 to each byte. -__device__ int increment_handler(void* buffer, std::uint32_t arg_len, +__device__ int increment_handler(const void* input, void* output, + std::uint32_t arg_len, std::uint32_t max_result_len, std::uint32_t* result_len) { - std::uint8_t* data = static_cast(buffer); + const std::uint8_t* in_data = static_cast(input); + std::uint8_t* out_data = static_cast(output); for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { - data[i] = data[i] + 1; + out_data[i] = in_data[i] + 1; } *result_len = arg_len; return 0; @@ -49,14 +51,16 @@ __device__ int increment_handler(void* buffer, std::uint32_t arg_len, //============================================================================== constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = - cudaq::nvqlink::fnv1a_hash("rpc_increment"); + cudaq::realtime::fnv1a_hash("rpc_increment"); -__device__ int rpc_increment_handler(void* buffer, std::uint32_t arg_len, +__device__ int rpc_increment_handler(const void* input, void* output, + std::uint32_t arg_len, std::uint32_t max_result_len, std::uint32_t* result_len) { - std::uint8_t* data = static_cast(buffer); + const std::uint8_t* in_data = static_cast(input); + std::uint8_t* out_data = static_cast(output); for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { - data[i] = static_cast(data[i] + 1); + out_data[i] = static_cast(in_data[i] + 1); } *result_len = arg_len; return 0; @@ -146,6 +150,10 @@ void free_ring_buffer(volatile uint64_t* host_flags, extern "C" void launch_dispatch_kernel_wrapper( volatile std::uint64_t* rx_flags, volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t* function_table, std::size_t func_count, volatile int* shutdown_flag, @@ -155,7 +163,8 @@ extern "C" void launch_dispatch_kernel_wrapper( std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, function_table, func_count, + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); } @@ -163,7 +172,7 @@ extern "C" void launch_dispatch_kernel_wrapper( // Test Kernel for DeviceCallMode //============================================================================== -using HandlerFunc = int (*)(void*, std::uint32_t, std::uint32_t, std::uint32_t*); +using HandlerFunc = int (*)(const void*, void*, std::uint32_t, std::uint32_t, std::uint32_t*); __device__ HandlerFunc d_increment_handler = increment_handler; @@ -171,14 +180,15 @@ __device__ HandlerFunc d_increment_handler = increment_handler; template __global__ void test_dispatch_kernel( HandlerFunc handler, - void* buffer, + const void* input, + void* output, std::uint32_t arg_len, std::uint32_t max_result_len, std::uint32_t* result_len, int* status) { if (threadIdx.x == 0 && blockIdx.x == 0) { - *status = handler(buffer, arg_len, max_result_len, result_len); + *status = handler(input, output, arg_len, max_result_len, result_len); } KernelType::sync(); @@ -212,10 +222,13 @@ protected: //============================================================================== TEST_F(DispatchKernelTest, IncrementHandlerBasic) { - // Prepare test data + // Prepare test data - separate input and output buffers std::vector input = {0, 1, 2, 3, 4}; std::vector expected = {1, 2, 3, 4, 5}; - CUDA_CHECK(cudaMemcpy(d_buffer_, input.data(), input.size(), + + void* d_input = nullptr; + CUDA_CHECK(cudaMalloc(&d_input, 1024)); + CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); // Get device function pointer @@ -223,9 +236,9 @@ TEST_F(DispatchKernelTest, IncrementHandlerBasic) { CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, sizeof(HandlerFunc))); - // Launch kernel + // Launch kernel with separate input/output buffers test_dispatch_kernel<<<1, 32>>>( - h_handler, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); @@ -239,22 +252,32 @@ TEST_F(DispatchKernelTest, IncrementHandlerBasic) { EXPECT_EQ(status, 0) << "Handler should return success"; EXPECT_EQ(result_len, input.size()) << "Result length should match input"; - // Verify data incremented + // Verify output buffer has incremented data std::vector output(input.size()); CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), cudaMemcpyDeviceToHost)); EXPECT_EQ(expected, output) << "Increment handler should add 1 to each byte"; + + // Verify input buffer is unchanged + std::vector input_readback(input.size()); + CUDA_CHECK(cudaMemcpy(input_readback.data(), d_input, input.size(), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(input, input_readback) << "Input buffer should be unchanged"; + + cudaFree(d_input); } TEST_F(DispatchKernelTest, LargeBuffer) { - // Test with larger data + // Test with larger data - separate input/output buffers const std::size_t size = 512; std::vector input(size); for (std::size_t i = 0; i < size; ++i) { input[i] = static_cast(i & 0xFF); } - CUDA_CHECK(cudaMemcpy(d_buffer_, input.data(), input.size(), + void* d_input = nullptr; + CUDA_CHECK(cudaMalloc(&d_input, 1024)); + CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); HandlerFunc h_handler; @@ -262,7 +285,7 @@ TEST_F(DispatchKernelTest, LargeBuffer) { sizeof(HandlerFunc))); test_dispatch_kernel<<<1, 256>>>( - h_handler, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); @@ -271,7 +294,7 @@ TEST_F(DispatchKernelTest, LargeBuffer) { cudaMemcpyDeviceToHost)); EXPECT_EQ(result_len, size) << "Should process all bytes"; - // Verify all bytes incremented + // Verify all bytes incremented in output buffer std::vector output(size); CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), cudaMemcpyDeviceToHost)); @@ -280,6 +303,8 @@ TEST_F(DispatchKernelTest, LargeBuffer) { uint8_t expected = static_cast((i + 1) & 0xFF); EXPECT_EQ(output[i], expected) << "Mismatch at index " << i; } + + cudaFree(d_input); } class HostApiDispatchTest : public ::testing::Test { @@ -324,6 +349,10 @@ protected: cudaq_ringbuffer_t ringbuffer{}; ringbuffer.rx_flags = rx_flags_; ringbuffer.tx_flags = tx_flags_; + ringbuffer.rx_data = rx_data_; + ringbuffer.tx_data = tx_data_; + ringbuffer.rx_stride_sz = slot_size_; + ringbuffer.tx_stride_sz = slot_size_; ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK); cudaq_function_table_t table{}; @@ -369,11 +398,11 @@ protected: const std::vector& payload) { std::uint8_t* slot_data = const_cast(rx_data_host_) + slot * slot_size_; - auto* header = reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; header->function_id = RPC_INCREMENT_FUNCTION_ID; header->arg_len = static_cast(payload.size()); - memcpy(slot_data + sizeof(cudaq::nvqlink::RPCHeader), payload.data(), + memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader), payload.data(), payload.size()); } @@ -382,12 +411,13 @@ protected: std::int32_t* status_out = nullptr, std::uint32_t* result_len_out = nullptr) { __sync_synchronize(); + // Read from TX buffer (dispatch kernel writes response to symmetric TX) const std::uint8_t* slot_data = - const_cast(rx_data_host_) + slot * slot_size_; + const_cast(tx_data_host_) + slot * slot_size_; auto* response = - reinterpret_cast(slot_data); + reinterpret_cast(slot_data); - if (response->magic != cudaq::nvqlink::RPC_MAGIC_RESPONSE) + if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) return false; if (status_out) *status_out = response->status; @@ -398,7 +428,7 @@ protected: payload.resize(response->result_len); memcpy(payload.data(), - slot_data + sizeof(cudaq::nvqlink::RPCResponse), + slot_data + sizeof(cudaq::realtime::RPCResponse), response->result_len); return true; } @@ -458,7 +488,7 @@ TEST_F(HostApiDispatchTest, RpcIncrementHandler) { __global__ void graph_increment_kernel(void** buffer_ptr) { if (threadIdx.x == 0 && blockIdx.x == 0) { void* buffer = *buffer_ptr; - cudaq::nvqlink::RPCHeader* header = static_cast(buffer); + cudaq::realtime::RPCHeader* header = static_cast(buffer); std::uint32_t arg_len = header->arg_len; void* arg_buffer = static_cast(header + 1); @@ -470,15 +500,15 @@ __global__ void graph_increment_kernel(void** buffer_ptr) { } // Write response - cudaq::nvqlink::RPCResponse* response = static_cast(buffer); - response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + cudaq::realtime::RPCResponse* response = static_cast(buffer); + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = arg_len; } } constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = - cudaq::nvqlink::fnv1a_hash("rpc_graph_increment"); + cudaq::realtime::fnv1a_hash("rpc_graph_increment"); __global__ void init_graph_function_table(cudaq_function_entry_t* entries, cudaGraphExec_t graph_exec) { @@ -499,8 +529,8 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - if (prop.major < 8) { - GTEST_SKIP() << "Graph device launch requires compute capability 8.0+, found " + if (prop.major < 9) { + GTEST_SKIP() << "Graph device launch requires compute capability 9.0+, found " << prop.major << "." << prop.minor; } @@ -553,12 +583,12 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { // Set up RPC buffer on host std::uint8_t* h_buffer = new std::uint8_t[buffer_size]; - cudaq::nvqlink::RPCHeader* h_header = reinterpret_cast(h_buffer); - h_header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + cudaq::realtime::RPCHeader* h_header = reinterpret_cast(h_buffer); + h_header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; h_header->function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; h_header->arg_len = 4; - std::uint8_t* h_data = h_buffer + sizeof(cudaq::nvqlink::RPCHeader); + std::uint8_t* h_data = h_buffer + sizeof(cudaq::realtime::RPCHeader); h_data[0] = 0; h_data[1] = 1; h_data[2] = 2; @@ -593,7 +623,6 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); d_shutdown = static_cast(tmp_d_shutdown); } - int shutdown_val = 0; // Local variable for tracking // Set up stats uint64_t* d_stats; @@ -604,8 +633,13 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { // so that device-side cudaGraphLaunch() can work! cudaq_dispatch_graph_context* dispatch_ctx = nullptr; cudaError_t err = cudaq_create_dispatch_graph_regular( - d_rx_flags, d_tx_flags, d_function_entries, 1, - nullptr, d_shutdown, d_stats, 1, + d_rx_flags, d_tx_flags, + reinterpret_cast(d_buffer), // rx_data + reinterpret_cast(d_buffer), // tx_data (same buffer for single-slot test) + buffer_size, // rx_stride_sz + buffer_size, // tx_stride_sz + d_function_entries, 1, + d_graph_buffer_ptr, d_shutdown, d_stats, 1, 1, 32, stream, &dispatch_ctx); if (err != cudaSuccess) { @@ -619,8 +653,8 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { // Poll for the response using pinned memory and async operations // The child graph runs asynchronously (fire-and-forget) so we need to poll std::uint8_t* h_poll_buffer; - CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::nvqlink::RPCResponse), cudaHostAllocDefault)); - memset(h_poll_buffer, 0, sizeof(cudaq::nvqlink::RPCResponse)); + CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::realtime::RPCResponse), cudaHostAllocDefault)); + memset(h_poll_buffer, 0, sizeof(cudaq::realtime::RPCResponse)); cudaStream_t poll_stream; CUDA_CHECK(cudaStreamCreate(&poll_stream)); @@ -630,12 +664,12 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { bool got_response = false; for (int elapsed = 0; elapsed < timeout_ms; elapsed += poll_interval_ms) { - CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, sizeof(cudaq::nvqlink::RPCResponse), + CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, sizeof(cudaq::realtime::RPCResponse), cudaMemcpyDeviceToHost, poll_stream)); CUDA_CHECK(cudaStreamSynchronize(poll_stream)); - cudaq::nvqlink::RPCResponse* peek = reinterpret_cast(h_poll_buffer); - if (peek->magic == cudaq::nvqlink::RPC_MAGIC_RESPONSE) { + cudaq::realtime::RPCResponse* peek = reinterpret_cast(h_poll_buffer); + if (peek->magic == cudaq::realtime::RPC_MAGIC_RESPONSE) { got_response = true; break; } @@ -662,14 +696,14 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { ASSERT_TRUE(got_response) << "Timeout waiting for device-side graph launch response"; // Verify response - cudaq::nvqlink::RPCResponse* h_response = reinterpret_cast(h_buffer); - EXPECT_EQ(h_response->magic, cudaq::nvqlink::RPC_MAGIC_RESPONSE) + cudaq::realtime::RPCResponse* h_response = reinterpret_cast(h_buffer); + EXPECT_EQ(h_response->magic, cudaq::realtime::RPC_MAGIC_RESPONSE) << "Expected RPC_MAGIC_RESPONSE, got 0x" << std::hex << h_response->magic; EXPECT_EQ(h_response->status, 0) << "Handler returned error status"; EXPECT_EQ(h_response->result_len, 4u) << "Unexpected result length"; // Verify data was incremented by graph kernel launched from dispatch kernel - std::uint8_t* h_result = h_buffer + sizeof(cudaq::nvqlink::RPCResponse); + std::uint8_t* h_result = h_buffer + sizeof(cudaq::realtime::RPCResponse); EXPECT_EQ(h_result[0], 1) << "Expected h_result[0]=1"; EXPECT_EQ(h_result[1], 2) << "Expected h_result[1]=2"; EXPECT_EQ(h_result[2], 3) << "Expected h_result[2]=3"; diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu new file mode 100644 index 00000000..7d79c5b3 --- /dev/null +++ b/realtime/unittests/test_host_dispatcher.cu @@ -0,0 +1,1015 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ + } while (0) + +namespace { + +//============================================================================== +// Ring buffer helpers (same pattern as test_dispatch_kernel.cu) +//============================================================================== + +bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, + volatile uint64_t** host_flags_out, + volatile uint64_t** device_flags_out, + std::uint8_t** host_data_out, + std::uint8_t** device_data_out) { + void* host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, + num_slots * sizeof(uint64_t), + cudaHostAllocMapped); + if (err != cudaSuccess) + return false; + + void* device_flags_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* host_data_ptr = nullptr; + err = cudaHostAlloc(&host_data_ptr, num_slots * slot_size, + cudaHostAllocMapped); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* device_data_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + cudaFreeHost(host_data_ptr); + return false; + } + + std::memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); + + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); + return true; +} + +void free_ring_buffer(volatile uint64_t* host_flags, std::uint8_t* host_data) { + if (host_flags) + cudaFreeHost(const_cast(host_flags)); + if (host_data) + cudaFreeHost(host_data); +} + +//============================================================================== +// Minimal graph for dummy GRAPH_LAUNCH entry (so C API starts the host thread) +//============================================================================== + +__global__ void noop_kernel() {} + +// Creates a minimal executable graph and returns it. Caller must destroy with +// cudaGraphExecDestroy and cudaGraphDestroy. +bool create_dummy_graph(cudaGraph_t* graph_out, cudaGraphExec_t* exec_out) { + cudaGraph_t graph = nullptr; + if (cudaGraphCreate(&graph, 0) != cudaSuccess) + return false; + + cudaKernelNodeParams params = {}; + void* args[] = {}; + params.func = reinterpret_cast(noop_kernel); + params.gridDim = dim3(1, 1, 1); + params.blockDim = dim3(1, 1, 1); + params.sharedMemBytes = 0; + params.kernelParams = args; + params.extra = nullptr; + + cudaGraphNode_t node = nullptr; + if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != + cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + cudaGraphExec_t exec = nullptr; + if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + *graph_out = graph; + *exec_out = exec; + return true; +} + +//============================================================================== +// Graph launch test: kernel that reads slot from mailbox and writes response +// in-place (same buffer as request; use single ring buffer for rx/tx). +//============================================================================== + +__global__ void graph_increment_kernel(void** mailbox_slot_ptr) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + void* buffer = *mailbox_slot_ptr; + cudaq::realtime::RPCHeader* header = + static_cast(buffer); + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); + std::uint8_t* data = static_cast(arg_buffer); + for (std::uint32_t i = 0; i < arg_len; ++i) + data[i] = data[i] + 1; + cudaq::realtime::RPCResponse* response = + static_cast(buffer); + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = arg_len; + } +} + +constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_graph_increment"); + +/// Creates an executable graph that runs graph_increment_kernel with +/// kernel arg = d_mailbox_bank (device pointer to first mailbox slot). +/// Caller must cudaGraphExecDestroy / cudaGraphDestroy. +bool create_increment_graph(void** d_mailbox_bank, cudaGraph_t* graph_out, + cudaGraphExec_t* exec_out) { + cudaGraph_t graph = nullptr; + if (cudaGraphCreate(&graph, 0) != cudaSuccess) + return false; + + // kernelParams[i] must be a *pointer to* the i-th argument value. + // The kernel takes void** so we pass &d_mailbox_bank (a void***). + cudaKernelNodeParams params = {}; + void* kernel_args[] = {&d_mailbox_bank}; + params.func = reinterpret_cast(graph_increment_kernel); + params.gridDim = dim3(1, 1, 1); + params.blockDim = dim3(32, 1, 1); + params.sharedMemBytes = 0; + params.kernelParams = kernel_args; + params.extra = nullptr; + + cudaGraphNode_t node = nullptr; + if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != + cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + cudaGraphExec_t exec = nullptr; + if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + *graph_out = graph; + *exec_out = exec; + return true; +} + +//============================================================================== +// Graph launch test: kernel that reads slot from mailbox and doubles payload +// in-place (for function_id routing differentiation vs increment kernel). +//============================================================================== + +__global__ void graph_double_kernel(void** mailbox_slot_ptr) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + void* buffer = *mailbox_slot_ptr; + cudaq::realtime::RPCHeader* header = + static_cast(buffer); + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); + std::uint8_t* data = static_cast(arg_buffer); + for (std::uint32_t i = 0; i < arg_len; ++i) + data[i] = data[i] * 2; + cudaq::realtime::RPCResponse* response = + static_cast(buffer); + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = arg_len; + } +} + +constexpr std::uint32_t RPC_GRAPH_DOUBLE_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_graph_double"); + +bool create_double_graph(void** d_mailbox_slot, cudaGraph_t* graph_out, + cudaGraphExec_t* exec_out) { + cudaGraph_t graph = nullptr; + if (cudaGraphCreate(&graph, 0) != cudaSuccess) + return false; + + cudaKernelNodeParams params = {}; + void* kernel_args[] = {&d_mailbox_slot}; + params.func = reinterpret_cast(graph_double_kernel); + params.gridDim = dim3(1, 1, 1); + params.blockDim = dim3(32, 1, 1); + params.sharedMemBytes = 0; + params.kernelParams = kernel_args; + params.extra = nullptr; + + cudaGraphNode_t node = nullptr; + if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != + cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + cudaGraphExec_t exec = nullptr; + if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { + cudaGraphDestroy(graph); + return false; + } + + *graph_out = graph; + *exec_out = exec; + return true; +} + +//============================================================================== +// Test fixture: drives host_dispatcher_loop directly (not C API) for full +// control over idle_mask, enabling worker recycling and backpressure tests. +//============================================================================== + +static constexpr std::size_t kMaxWorkers = 8; + +class HostDispatcherLoopTest : public ::testing::Test { +protected: + void SetUp() override { + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, + &rx_flags_dev_, &rx_data_host_, + &rx_data_dev_)); + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, + &tx_flags_dev_, &tx_data_host_, + &tx_data_dev_)); + + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank_, + kMaxWorkers * sizeof(void*), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank_, 0, kMaxWorkers * sizeof(void*)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank_), h_mailbox_bank_, 0)); + + idle_mask_ = new cudaq::realtime::atomic_uint64_sys(0); + live_dispatched_ = new cudaq::realtime::atomic_uint64_sys(0); + inflight_slot_tags_ = new int[kMaxWorkers](); + shutdown_flag_ = new cudaq::realtime::atomic_int_sys(0); + stats_counter_ = 0; + + function_table_ = new cudaq_function_entry_t[kMaxWorkers]; + std::memset(function_table_, 0, kMaxWorkers * sizeof(cudaq_function_entry_t)); + + std::memset(&ringbuffer_, 0, sizeof(ringbuffer_)); + ringbuffer_.rx_flags = rx_flags_dev_; + ringbuffer_.tx_flags = tx_flags_dev_; + ringbuffer_.rx_data = rx_data_dev_; + ringbuffer_.tx_data = tx_data_dev_; + ringbuffer_.rx_stride_sz = slot_size_; + ringbuffer_.tx_stride_sz = slot_size_; + ringbuffer_.rx_flags_host = rx_flags_host_; + ringbuffer_.tx_flags_host = tx_flags_host_; + ringbuffer_.rx_data_host = rx_data_host_; + ringbuffer_.tx_data_host = tx_data_host_; + } + + void TearDown() override { + if (!loop_stopped_) { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + } + + for (auto& w : worker_info_) { + if (w.stream) + cudaStreamDestroy(w.stream); + if (w.graph_exec) + cudaGraphExecDestroy(w.graph_exec); + if (w.graph) + cudaGraphDestroy(w.graph); + } + + free_ring_buffer(rx_flags_host_, rx_data_host_); + free_ring_buffer(tx_flags_host_, tx_data_host_); + if (h_mailbox_bank_) + cudaFreeHost(h_mailbox_bank_); + delete idle_mask_; + delete live_dispatched_; + delete[] inflight_slot_tags_; + delete shutdown_flag_; + delete[] function_table_; + } + + struct WorkerInfo { + cudaGraphExec_t graph_exec = nullptr; + cudaGraph_t graph = nullptr; + cudaStream_t stream = nullptr; + }; + + void AddWorker(std::uint32_t function_id, cudaGraphExec_t exec, + cudaGraph_t graph) { + cudaStream_t stream = nullptr; + ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); + + cudaq::realtime::HostDispatchWorker w; + w.graph_exec = exec; + w.stream = stream; + w.function_id = function_id; + workers_.push_back(w); + worker_info_.push_back({exec, graph, stream}); + + std::size_t idx = function_table_count_; + function_table_[idx].handler.graph_exec = exec; + function_table_[idx].function_id = function_id; + function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_table_count_++; + } + + void StartLoop() { + idle_mask_->store((1ULL << workers_.size()) - 1, + cuda::std::memory_order_release); + + config_.rx_flags = + reinterpret_cast( + const_cast(rx_flags_host_)); + config_.tx_flags = + reinterpret_cast( + const_cast(tx_flags_host_)); + config_.rx_data_host = rx_data_host_; + config_.rx_data_dev = rx_data_dev_; + config_.tx_data_host = tx_data_host_; + config_.tx_data_dev = tx_data_dev_; + config_.tx_stride_sz = slot_size_; + config_.h_mailbox_bank = h_mailbox_bank_; + config_.num_slots = num_slots_; + config_.slot_size = slot_size_; + config_.workers = workers_; + config_.function_table = function_table_; + config_.function_table_count = function_table_count_; + config_.shutdown_flag = shutdown_flag_; + config_.stats_counter = &stats_counter_; + config_.live_dispatched = live_dispatched_; + config_.idle_mask = idle_mask_; + config_.inflight_slot_tags = inflight_slot_tags_; + + loop_thread_ = std::thread(cudaq::realtime::host_dispatcher_loop, config_); + } + + void WriteRpcRequest(std::size_t slot, std::uint32_t function_id, + const std::uint8_t* payload, std::size_t len) { + ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( + &ringbuffer_, static_cast(slot), function_id, + payload, static_cast(len)), + CUDAQ_OK); + } + + void SignalSlot(std::size_t slot) { + cudaq_host_ringbuffer_signal_slot(&ringbuffer_, static_cast(slot)); + } + + bool PollTxFlag(std::size_t slot, int timeout_ms = 2000) { + for (int waited = 0; waited < timeout_ms * 1000; waited += 200) { + cudaq_tx_status_t st = cudaq_host_ringbuffer_poll_tx_flag( + &ringbuffer_, static_cast(slot), nullptr); + if (st != CUDAQ_TX_EMPTY) + return true; + usleep(200); + } + return cudaq_host_ringbuffer_poll_tx_flag( + &ringbuffer_, static_cast(slot), nullptr) != + CUDAQ_TX_EMPTY; + } + + void StopLoop() { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + } + + void RestoreWorker(int worker_id) { + idle_mask_->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } + + void ClearSlot(std::size_t slot) { + cudaq_host_ringbuffer_clear_slot(&ringbuffer_, static_cast(slot)); + std::memset(rx_data_host_ + slot * slot_size_, 0, slot_size_); + } + + void VerifyResponse(std::size_t slot, const std::uint8_t* expected, + std::size_t len) { + int cuda_err = 0; + cudaq_tx_status_t st = cudaq_host_ringbuffer_poll_tx_flag( + &ringbuffer_, static_cast(slot), &cuda_err); + ASSERT_EQ(st, CUDAQ_TX_READY) << "slot " << slot + << ": tx_flag not READY (status=" << st << " cuda_err=" << cuda_err << ")"; + + std::uint8_t* slot_data = rx_data_host_ + slot * slot_size_; + auto* resp = + reinterpret_cast(slot_data); + ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) + << "slot " << slot << ": expected response magic"; + ASSERT_EQ(resp->status, 0) << "slot " << slot << ": non-zero status"; + ASSERT_EQ(resp->result_len, static_cast(len)) + << "slot " << slot << ": wrong result_len"; + std::uint8_t* result = slot_data + sizeof(cudaq::realtime::RPCResponse); + for (std::size_t i = 0; i < len; ++i) { + EXPECT_EQ(result[i], expected[i]) + << "slot " << slot << " byte " << i; + } + } + + std::size_t num_slots_ = 4; + std::size_t slot_size_ = 256; + + volatile uint64_t* rx_flags_host_ = nullptr; + volatile uint64_t* tx_flags_host_ = nullptr; + volatile uint64_t* rx_flags_dev_ = nullptr; + volatile uint64_t* tx_flags_dev_ = nullptr; + std::uint8_t* rx_data_host_ = nullptr; + std::uint8_t* tx_data_host_ = nullptr; + std::uint8_t* rx_data_dev_ = nullptr; + std::uint8_t* tx_data_dev_ = nullptr; + + void** h_mailbox_bank_ = nullptr; + void** d_mailbox_bank_ = nullptr; + + cudaq::realtime::atomic_uint64_sys* idle_mask_ = nullptr; + cudaq::realtime::atomic_uint64_sys* live_dispatched_ = nullptr; + int* inflight_slot_tags_ = nullptr; + cudaq::realtime::atomic_int_sys* shutdown_flag_ = nullptr; + uint64_t stats_counter_ = 0; + bool loop_stopped_ = false; + + cudaq_function_entry_t* function_table_ = nullptr; + std::size_t function_table_count_ = 0; + std::vector workers_; + std::vector worker_info_; + + cudaq_ringbuffer_t ringbuffer_{}; + cudaq::realtime::HostDispatcherConfig config_{}; + std::thread loop_thread_; +}; + +//============================================================================== +// Test 1: Smoke test — host loop starts and drops slot with unknown function_id +//============================================================================== + +constexpr std::uint32_t DUMMY_GRAPH_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("dummy_graph"); +// Use a different function_id in the slot so the host loop does not find it. +constexpr std::uint32_t UNKNOWN_FUNCTION_ID = 0xdeadbeefu; + +class HostDispatcherSmokeTest : public ::testing::Test { +protected: + void SetUp() override { + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, + &rx_flags_, &rx_data_host_, &rx_data_)); + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, + &tx_flags_, &tx_data_host_, &tx_data_)); + + shutdown_flag_ = new (std::nothrow) int(0); + stats_ = new (std::nothrow) uint64_t(0); + ASSERT_NE(shutdown_flag_, nullptr); + ASSERT_NE(stats_, nullptr); + + ASSERT_TRUE(create_dummy_graph(&dummy_graph_, &dummy_graph_exec_)); + + host_table_ = new (std::nothrow) cudaq_function_entry_t[1]; + ASSERT_NE(host_table_, nullptr); + std::memset(host_table_, 0, sizeof(cudaq_function_entry_t)); + host_table_[0].handler.graph_exec = dummy_graph_exec_; + host_table_[0].function_id = DUMMY_GRAPH_FUNCTION_ID; + host_table_[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + + ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); + cudaq_dispatcher_config_t config{}; + config.device_id = 0; + config.num_slots = static_cast(num_slots_); + config.slot_size = static_cast(slot_size_); + config.backend = CUDAQ_BACKEND_HOST_LOOP; + ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), + CUDAQ_OK); + + std::memset(&ringbuffer_, 0, sizeof(ringbuffer_)); + ringbuffer_.rx_flags = rx_flags_; + ringbuffer_.tx_flags = tx_flags_; + ringbuffer_.rx_data = rx_data_; + ringbuffer_.tx_data = tx_data_; + ringbuffer_.rx_stride_sz = slot_size_; + ringbuffer_.tx_stride_sz = slot_size_; + ringbuffer_.rx_flags_host = rx_flags_host_; + ringbuffer_.tx_flags_host = tx_flags_host_; + ringbuffer_.rx_data_host = rx_data_host_; + ringbuffer_.tx_data_host = tx_data_host_; + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer_), + CUDAQ_OK); + + cudaq_function_table_t table{}; + table.entries = host_table_; + table.count = 1; + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), + CUDAQ_OK); + + ASSERT_EQ( + cudaq_dispatcher_set_control(dispatcher_, shutdown_flag_, stats_), + CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); + } + + void TearDown() override { + if (shutdown_flag_) { + *shutdown_flag_ = 1; + __sync_synchronize(); + } + if (dispatcher_) { + cudaq_dispatcher_stop(dispatcher_); + cudaq_dispatcher_destroy(dispatcher_); + dispatcher_ = nullptr; + } + if (manager_) { + cudaq_dispatch_manager_destroy(manager_); + manager_ = nullptr; + } + free_ring_buffer(rx_flags_host_, rx_data_host_); + free_ring_buffer(tx_flags_host_, tx_data_host_); + if (shutdown_flag_) + delete shutdown_flag_; + if (stats_) + delete stats_; + if (host_table_) + delete[] host_table_; + if (dummy_graph_exec_) + cudaGraphExecDestroy(dummy_graph_exec_); + if (dummy_graph_) + cudaGraphDestroy(dummy_graph_); + } + + void write_rpc_request_unknown_function(std::size_t slot) { + const std::uint8_t payload[] = {0, 1, 2, 3}; + ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( + &ringbuffer_, static_cast(slot), + UNKNOWN_FUNCTION_ID, payload, 4), + CUDAQ_OK); + } + + static constexpr std::size_t num_slots_ = 2; + std::size_t slot_size_ = 256; + + volatile uint64_t* rx_flags_host_ = nullptr; + volatile uint64_t* tx_flags_host_ = nullptr; + volatile uint64_t* rx_flags_ = nullptr; + volatile uint64_t* tx_flags_ = nullptr; + std::uint8_t* rx_data_host_ = nullptr; + std::uint8_t* tx_data_host_ = nullptr; + std::uint8_t* rx_data_ = nullptr; + std::uint8_t* tx_data_ = nullptr; + + int* shutdown_flag_ = nullptr; + uint64_t* stats_ = nullptr; + cudaq_function_entry_t* host_table_ = nullptr; + cudaGraph_t dummy_graph_ = nullptr; + cudaGraphExec_t dummy_graph_exec_ = nullptr; + + cudaq_ringbuffer_t ringbuffer_{}; + cudaq_dispatch_manager_t* manager_ = nullptr; + cudaq_dispatcher_t* dispatcher_ = nullptr; +}; + +TEST_F(HostDispatcherSmokeTest, DropsSlotWithUnknownFunctionId) { + write_rpc_request_unknown_function(0); + cudaq_host_ringbuffer_signal_slot(&ringbuffer_, 0); + + for (int i = 0; i < 50; ++i) { + usleep(1000); + cudaq_tx_status_t st = + cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, 0, nullptr); + if (st != CUDAQ_TX_EMPTY) + break; + } + + cudaq_tx_status_t final_st = + cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, 0, nullptr); + EXPECT_EQ(final_st, CUDAQ_TX_EMPTY) + << "Host loop should drop slot with unknown function_id (no response)"; +} + +//============================================================================== +// Test 2: GRAPH_LAUNCH via host loop (full RPC round-trip) using the C API +// +// End-to-end test of: RPC in ring buffer → C API dispatcher → CUDA graph +// launch via pinned mailbox → in-place response. +// +// Flow: +// 1. Allocate pinned ring buffers and pinned mailbox (cudaHostAllocMapped). +// 2. Capture graph_increment_kernel with d_mailbox_bank baked in. +// 3. Build function table with one GRAPH_LAUNCH entry. +// 4. Wire the C API: manager → dispatcher → ringbuffer, function table, +// control, mailbox → start. +// 5. Write an RPC request {0,1,2,3} into slot 0 and signal rx_flags. +// 6. The dispatcher picks up the slot, matches function_id → GRAPH_LAUNCH, +// acquires the idle worker, writes the slot device pointer into the +// pinned mailbox, and launches the graph. +// 7. The graph reads the slot pointer from the mailbox, increments each +// payload byte, and writes an RPCResponse header in-place. +// 8. Test polls tx_flags, syncs device, then asserts the response is +// {1,2,3,4} with correct magic/status/result_len. +//============================================================================== + +TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { + constexpr std::size_t num_slots = 2; + constexpr std::size_t slot_size = 256; + + // --- Ring buffers --- + // Separate flag arrays for RX and TX: the dispatcher clears rx_flags[slot] + // right after setting tx_flags[slot], so sharing would clobber the signal. + // Data buffers are shared (graph writes response in-place to the RX slot). + volatile uint64_t* rx_flags_host = nullptr; + volatile uint64_t* rx_flags_dev = nullptr; + std::uint8_t* rx_data_host = nullptr; + std::uint8_t* rx_data_dev = nullptr; + volatile uint64_t* tx_flags_host = nullptr; + volatile uint64_t* tx_flags_dev = nullptr; + std::uint8_t* tx_data_host_unused = nullptr; + std::uint8_t* tx_data_dev_unused = nullptr; + + ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &rx_flags_host, + &rx_flags_dev, &rx_data_host, + &rx_data_dev)); + ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &tx_flags_host, + &tx_flags_dev, &tx_data_host_unused, + &tx_data_dev_unused)); + + // --- Pinned mailbox --- + // cudaHostAllocMapped gives us host + device views of the same memory. + // The host dispatcher writes the slot device pointer to h_mailbox_bank[0]; + // the graph reads it from d_mailbox_bank[0] (same physical location). + void** h_mailbox_bank = nullptr; + void** d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, sizeof(void*), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, sizeof(void*)); + CUDA_CHECK( + cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); + + // --- Graph --- + // Capture graph_increment_kernel with d_mailbox_bank baked in as the + // kernel arg. At runtime the kernel reads *d_mailbox_bank to find + // the slot, so different slots can be processed on each launch. + cudaGraph_t graph = nullptr; + cudaGraphExec_t graph_exec = nullptr; + ASSERT_TRUE( + create_increment_graph(d_mailbox_bank, &graph, &graph_exec)); + + // --- Function table (one GRAPH_LAUNCH entry) --- + cudaq_function_entry_t host_table[1]; + std::memset(host_table, 0, sizeof(host_table)); + host_table[0].function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; + host_table[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + host_table[0].handler.graph_exec = graph_exec; + + // --- C API: create manager + dispatcher --- + cudaq_dispatch_manager_t* manager = nullptr; + ASSERT_EQ(cudaq_dispatch_manager_create(&manager), CUDAQ_OK); + + cudaq_dispatcher_config_t disp_config{}; + disp_config.device_id = 0; + disp_config.num_slots = static_cast(num_slots); + disp_config.slot_size = static_cast(slot_size); + disp_config.backend = CUDAQ_BACKEND_HOST_LOOP; + + cudaq_dispatcher_t* dispatcher = nullptr; + ASSERT_EQ(cudaq_dispatcher_create(manager, &disp_config, &dispatcher), + CUDAQ_OK); + + // --- Wire ring buffer (rx/tx flags separate, data shared for in-place) --- + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = rx_flags_dev; + ringbuffer.tx_flags = tx_flags_dev; + ringbuffer.rx_data = rx_data_dev; + ringbuffer.tx_data = rx_data_dev; + ringbuffer.rx_stride_sz = slot_size; + ringbuffer.tx_stride_sz = slot_size; + ringbuffer.rx_flags_host = rx_flags_host; + ringbuffer.tx_flags_host = tx_flags_host; + ringbuffer.rx_data_host = rx_data_host; + ringbuffer.tx_data_host = rx_data_host; + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer), + CUDAQ_OK); + + cudaq_function_table_t table{}; + table.entries = host_table; + table.count = 1; + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher, &table), + CUDAQ_OK); + + int shutdown_flag = 0; + uint64_t stats_counter = 0; + ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher, &shutdown_flag, + &stats_counter), + CUDAQ_OK); + + // Provide the caller-allocated pinned mailbox so the dispatcher uses it + // instead of allocating plain host memory (which the graph can't read). + ASSERT_EQ(cudaq_dispatcher_set_mailbox(dispatcher, h_mailbox_bank), + CUDAQ_OK); + + // --- Start --- + ASSERT_EQ(cudaq_dispatcher_start(dispatcher), CUDAQ_OK); + + // --- Send RPC request (simulates FPGA / producer) --- + const std::uint8_t payload[] = {0, 1, 2, 3}; + ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( + &ringbuffer, 0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4), + CUDAQ_OK); + cudaq_host_ringbuffer_signal_slot(&ringbuffer, 0); + + // --- Verify: dispatcher picked up slot and launched graph --- + int cuda_err = 0; + cudaq_tx_status_t st = CUDAQ_TX_EMPTY; + for (int i = 0; i < 5000 && st == CUDAQ_TX_EMPTY; ++i) { + usleep(200); + st = cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer, 0, &cuda_err); + } + ASSERT_NE(st, CUDAQ_TX_EMPTY) << "Timeout waiting for tx flag"; + ASSERT_NE(st, CUDAQ_TX_ERROR) + << "Dispatcher reported graph launch error (cuda_err=" << cuda_err << ")"; + + // cudaGraphLaunch is async; sync device so the in-place response is visible + CUDA_CHECK(cudaDeviceSynchronize()); + + // --- Verify: graph wrote correct response in-place --- + std::uint8_t* slot_data = rx_data_host + 0 * slot_size; + auto* resp = reinterpret_cast(slot_data); + ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) + << "Expected response magic (graph in-place write)"; + ASSERT_EQ(resp->status, 0); + ASSERT_EQ(resp->result_len, 4u); + std::uint8_t* result = slot_data + sizeof(cudaq::realtime::RPCResponse); + EXPECT_EQ(result[0], 1); + EXPECT_EQ(result[1], 2); + EXPECT_EQ(result[2], 3); + EXPECT_EQ(result[3], 4); + + // --- Teardown (C API handles thread join) --- + shutdown_flag = 1; + __sync_synchronize(); + cudaq_dispatcher_stop(dispatcher); + cudaq_dispatcher_destroy(dispatcher); + cudaq_dispatch_manager_destroy(manager); + + cudaGraphExecDestroy(graph_exec); + cudaGraphDestroy(graph); + cudaFreeHost(h_mailbox_bank); + free_ring_buffer(rx_flags_host, rx_data_host); + free_ring_buffer(tx_flags_host, tx_data_host_unused); +} + +//============================================================================== +// Test 3: Multiple workers with function_id routing (internal API) +// +// Two workers: worker 0 runs graph_increment_kernel (func_id A), +// worker 1 runs graph_double_kernel (func_id B). Sends one RPC per worker +// and verifies each graph produced the expected output, confirming the +// dispatcher routed by function_id. +//============================================================================== + +TEST_F(HostDispatcherLoopTest, MultiWorkerFunctionIdRouting) { + cudaGraph_t inc_graph = nullptr; + cudaGraphExec_t inc_exec = nullptr; + ASSERT_TRUE(create_increment_graph(d_mailbox_bank_ + 0, &inc_graph, &inc_exec)); + AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, inc_exec, inc_graph); + + cudaGraph_t dbl_graph = nullptr; + cudaGraphExec_t dbl_exec = nullptr; + ASSERT_TRUE(create_double_graph(d_mailbox_bank_ + 1, &dbl_graph, &dbl_exec)); + AddWorker(RPC_GRAPH_DOUBLE_FUNCTION_ID, dbl_exec, dbl_graph); + + StartLoop(); + + const std::uint8_t payload[] = {1, 2, 3, 4}; + WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); + WriteRpcRequest(1, RPC_GRAPH_DOUBLE_FUNCTION_ID, payload, 4); + SignalSlot(0); + SignalSlot(1); + + ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0 (increment)"; + ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 (double)"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + const std::uint8_t expected_inc[] = {2, 3, 4, 5}; + const std::uint8_t expected_dbl[] = {2, 4, 6, 8}; + VerifyResponse(0, expected_inc, 4); + VerifyResponse(1, expected_dbl, 4); +} + +//============================================================================== +// Test 4: Worker recycling — idle_mask round-trip (internal API) +// +// One worker, two sequential RPCs to the same slot. The second dispatch +// can only proceed after the test restores idle_mask (simulating the +// external worker thread that returns the worker to the pool). +//============================================================================== + +TEST_F(HostDispatcherLoopTest, WorkerRecycling) { + cudaGraph_t graph = nullptr; + cudaGraphExec_t exec = nullptr; + ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); + AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); + + StartLoop(); + + // RPC 1 on slot 0 — after dispatch, current_slot advances to 1. + const std::uint8_t payload1[] = {0, 1, 2, 3}; + WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload1, 4); + SignalSlot(0); + ASSERT_TRUE(PollTxFlag(0)) << "Timeout on first RPC"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + const std::uint8_t expected1[] = {1, 2, 3, 4}; + VerifyResponse(0, expected1, 4); + + RestoreWorker(0); + + // RPC 2 on slot 1 — the dispatcher is now polling slot 1. + // This can only dispatch if idle_mask was properly restored above. + const std::uint8_t payload2[] = {10, 11, 12, 13}; + WriteRpcRequest(1, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload2, 4); + SignalSlot(1); + ASSERT_TRUE(PollTxFlag(1)) << "Timeout on second RPC (worker not recycled?)"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + const std::uint8_t expected2[] = {11, 12, 13, 14}; + VerifyResponse(1, expected2, 4); +} + +//============================================================================== +// Test 5: Backpressure — dispatcher stalls when all workers are busy +// +// One worker, two slots signalled simultaneously. Slot 0 dispatches +// immediately; slot 1 stalls until the test restores idle_mask. +//============================================================================== + +TEST_F(HostDispatcherLoopTest, BackpressureWhenAllBusy) { + cudaGraph_t graph = nullptr; + cudaGraphExec_t exec = nullptr; + ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); + AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); + + StartLoop(); + + const std::uint8_t payload0[] = {0, 1, 2, 3}; + const std::uint8_t payload1[] = {10, 11, 12, 13}; + WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload0, 4); + WriteRpcRequest(1, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload1, 4); + SignalSlot(0); + SignalSlot(1); + + ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + // Slot 1 should still be pending — worker is busy. + EXPECT_EQ(tx_flags_host_[1], 0u) + << "Slot 1 should stall while worker is busy"; + + RestoreWorker(0); + + ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 after restoring worker"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + const std::uint8_t expected0[] = {1, 2, 3, 4}; + const std::uint8_t expected1[] = {11, 12, 13, 14}; + VerifyResponse(0, expected0, 4); + VerifyResponse(1, expected1, 4); + + EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), 2u); + + StopLoop(); + EXPECT_EQ(stats_counter_, 2u); +} + +//============================================================================== +// Test 6: Stats counter accuracy (internal API) +// +// Sends 5 sequential RPCs through a single worker (recycling between each) +// and verifies stats_counter == 5 at the end. +//============================================================================== + +TEST_F(HostDispatcherLoopTest, StatsCounterAccuracy) { + cudaGraph_t graph = nullptr; + cudaGraphExec_t exec = nullptr; + ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); + AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); + + StartLoop(); + + // Sequential RPCs through slots 0,1,2,3,0 — the dispatcher advances + // current_slot after each dispatch, so each RPC must target the next slot. + // When wrapping back to slot 0 for the 5th RPC, clear its tx_flags first. + constexpr int kNumRpcs = 5; + for (int i = 0; i < kNumRpcs; ++i) { + std::size_t slot = static_cast(i % num_slots_); + if (i >= static_cast(num_slots_)) + ClearSlot(slot); + + std::uint8_t payload[] = { + static_cast(i * 10), + static_cast(i * 10 + 1), + static_cast(i * 10 + 2), + static_cast(i * 10 + 3)}; + WriteRpcRequest(slot, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); + SignalSlot(slot); + ASSERT_TRUE(PollTxFlag(slot)) << "Timeout on RPC " << i << " (slot " << slot << ")"; + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + std::uint8_t expected[] = { + static_cast(i * 10 + 1), + static_cast(i * 10 + 2), + static_cast(i * 10 + 3), + static_cast(i * 10 + 4)}; + VerifyResponse(slot, expected, 4); + + RestoreWorker(0); + } + + EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), + static_cast(kNumRpcs)); + + StopLoop(); + EXPECT_EQ(stats_counter_, static_cast(kNumRpcs)); +} + +//============================================================================== +// Test 7: Multi-slot round-robin dispatch (internal API) +// +// 4 slots, 4 workers (all same function_id). All slots signalled at once; +// the dispatcher processes them 0 → 1 → 2 → 3 using one worker each. +//============================================================================== + +TEST_F(HostDispatcherLoopTest, MultiSlotRoundRobin) { + constexpr int kNumSlots = 4; + cudaGraph_t graphs[kNumSlots]; + cudaGraphExec_t execs[kNumSlots]; + for (int i = 0; i < kNumSlots; ++i) { + ASSERT_TRUE(create_increment_graph(d_mailbox_bank_ + i, &graphs[i], + &execs[i])); + AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, execs[i], graphs[i]); + } + + StartLoop(); + + for (int i = 0; i < kNumSlots; ++i) { + std::uint8_t payload[] = { + static_cast(i * 4 + 1), + static_cast(i * 4 + 2), + static_cast(i * 4 + 3), + static_cast(i * 4 + 4)}; + WriteRpcRequest(static_cast(i), + RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); + } + + for (int i = 0; i < kNumSlots; ++i) + SignalSlot(static_cast(i)); + + for (int i = 0; i < kNumSlots; ++i) { + ASSERT_TRUE(PollTxFlag(static_cast(i))) + << "Timeout on slot " << i; + } + ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); + + for (int i = 0; i < kNumSlots; ++i) { + std::uint8_t expected[] = { + static_cast(i * 4 + 2), + static_cast(i * 4 + 3), + static_cast(i * 4 + 4), + static_cast(i * 4 + 5)}; + VerifyResponse(static_cast(i), expected, 4); + } + + EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), + static_cast(kNumSlots)); + + StopLoop(); + EXPECT_EQ(stats_counter_, static_cast(kNumSlots)); +} + +} // namespace diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt new file mode 100644 index 00000000..d6811a1f --- /dev/null +++ b/realtime/unittests/utils/CMakeLists.txt @@ -0,0 +1,264 @@ +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Hololink bridge and playback tools +# ============================================================================== +# These targets are gated by CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS and require +# a pre-built hololink (holoscan-sensor-bridge) with DOCA support. +# They are NOT CI tests -- they need FPGA hardware or an FPGA emulator. + +if (NOT HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR must be set when building hololink tools.") +endif() +if (NOT HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR must be set when building hololink tools.") +endif() + +find_package(Threads REQUIRED) +find_package(CUDAToolkit REQUIRED) + +# --------------------------------------------------------------------------- # +# Find Hololink core library +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_CORE_LIB + NAMES hololink_core + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/core" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT HOLOLINK_CORE_LIB) + message(FATAL_ERROR + "Could not find hololink_core library under ${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}.") +endif() + +# --------------------------------------------------------------------------- # +# Find GPU RoCE Transceiver library +# --------------------------------------------------------------------------- # + +find_library(GPU_ROCE_TRANSCEIVER_LIB + NAMES gpu_roce_transceiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/gpu_roce_transceiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING + "Could not find gpu_roce_transceiver library. " + "hololink_bridge will not be built.") +endif() + +# --------------------------------------------------------------------------- # +# Find transitive Hololink libraries +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_COMMON_LIB + NAMES hololink + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/common" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(ROCE_RECEIVER_LIB + NAMES roce_receiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(BASE_RECEIVER_OP_LIB + NAMES base_receiver_op + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(IBVERBS_LIB NAMES ibverbs) + +# --------------------------------------------------------------------------- # +# Find DOCA libraries +# --------------------------------------------------------------------------- # + +set(DOCA_PATH "/opt/mellanox/doca") + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/x86_64-linux-gnu") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/aarch64-linux-gnu") +else() + set(DOCA_LIB_DIR "${DOCA_PATH}/lib") +endif() + +find_path(DOCA_INCLUDE_DIR doca_verbs.h + PATHS ${DOCA_PATH}/include + NO_DEFAULT_PATH) + +find_library(DOCA_VERBS_LIB doca_verbs + PATHS ${DOCA_LIB_DIR} + NO_DEFAULT_PATH) + +find_library(DOCA_GPUNETIO_LIB doca_gpunetio + PATHS ${DOCA_LIB_DIR} + NO_DEFAULT_PATH) + +find_library(DOCA_COMMON_LIB doca_common + PATHS ${DOCA_LIB_DIR} + NO_DEFAULT_PATH) + +# --------------------------------------------------------------------------- # +# Find Holoscan (required by gpu_roce_transceiver -> holoscan::core) +# --------------------------------------------------------------------------- # + +find_package(holoscan QUIET) + +# --------------------------------------------------------------------------- # +# Find fmt (transitive dependency of hololink logging) +# --------------------------------------------------------------------------- # + +find_path(FMT_INCLUDE_DIR + NAMES fmt/format.h + PATHS /opt/nvidia/holoscan /usr/local/cudaq /usr /usr/local + PATH_SUFFIXES include + NO_DEFAULT_PATH) + +# =========================================================================== # +# hololink_fpga_playback (no GPU / DOCA dependency) +# =========================================================================== # + +add_executable(hololink_fpga_playback + hololink_fpga_playback.cpp) + +target_include_directories(hololink_fpga_playback + PRIVATE ${CUDAQ_REALTIME_INCLUDE_DIR}) + +target_link_libraries(hololink_fpga_playback + PRIVATE Threads::Threads) + +# =========================================================================== # +# hololink_bridge (generic increment bridge) +# =========================================================================== # + +if (GPU_ROCE_TRANSCEIVER_LIB AND + DOCA_INCLUDE_DIR AND DOCA_VERBS_LIB AND DOCA_COMMON_LIB AND + DOCA_GPUNETIO_LIB) + + message(STATUS "Building hololink_bridge (generic increment)") + message(STATUS " GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}") + + # Hololink wrapper static library (compiled by g++, isolates fmt) + add_library(hololink_wrapper_generic STATIC + hololink_wrapper.cpp) + + target_include_directories(hololink_wrapper_generic + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src" + ${DOCA_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS} + ${FMT_INCLUDE_DIR}) + + target_link_libraries(hololink_wrapper_generic + PRIVATE ${GPU_ROCE_TRANSCEIVER_LIB}) + + target_compile_options(hololink_wrapper_generic PRIVATE -Wno-deprecated-declarations) + + # Increment function table (compiled by nvcc) + add_library(rpc_increment_ft STATIC + init_rpc_increment_function_table.cu) + + set_target_properties(rpc_increment_ft PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17) + + target_include_directories(rpc_increment_ft PRIVATE + ${CUDAQ_REALTIME_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + + # Bridge executable (.cpp, linked with CUDA) + add_executable(hololink_bridge + hololink_bridge.cpp) + + set_target_properties(hololink_bridge PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + + target_include_directories(hololink_bridge + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + + # Link order: static archives first, then shared + target_link_libraries(hololink_bridge + PRIVATE + rpc_increment_ft + cudaq-realtime-dispatch + hololink_wrapper_generic + ${GPU_ROCE_TRANSCEIVER_LIB} + ${ROCE_RECEIVER_LIB} + ${BASE_RECEIVER_OP_LIB} + ${HOLOLINK_CORE_LIB} + ${HOLOLINK_COMMON_LIB} + cudaq-realtime + CUDA::cudart + CUDA::cuda_driver + ${DOCA_VERBS_LIB} + ${DOCA_GPUNETIO_LIB} + ${DOCA_COMMON_LIB} + ${IBVERBS_LIB} + Threads::Threads + ${CMAKE_DL_LIBS}) + + if (holoscan_FOUND) + target_link_libraries(hololink_bridge PRIVATE holoscan::core) + target_link_libraries(hololink_wrapper_generic PRIVATE holoscan::core) + endif() + + # Set RPATH for shared libraries + set_target_properties(hololink_bridge PROPERTIES + BUILD_RPATH "${DOCA_LIB_DIR}" + INSTALL_RPATH "${DOCA_LIB_DIR}") + +else() + if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING "gpu_roce_transceiver library not found. " + "hololink_bridge will not be built.") + endif() + if (NOT DOCA_INCLUDE_DIR OR NOT DOCA_VERBS_LIB) + message(WARNING "DOCA libraries not found. " + "hololink_bridge requires DOCA.") + endif() +endif() + +# =========================================================================== # +# hololink_fpga_emulator (software FPGA, libibverbs only) +# =========================================================================== # + +if (IBVERBS_LIB) + message(STATUS "Building hololink_fpga_emulator") + + add_executable(hololink_fpga_emulator + hololink_fpga_emulator.cpp) + + target_link_libraries(hololink_fpga_emulator + PRIVATE + ${IBVERBS_LIB} + Threads::Threads) +else() + message(WARNING "libibverbs not found. hololink_fpga_emulator will not be built.") +endif() diff --git a/realtime/unittests/utils/hololink_bridge.cpp b/realtime/unittests/utils/hololink_bridge.cpp new file mode 100644 index 00000000..0f10caa9 --- /dev/null +++ b/realtime/unittests/utils/hololink_bridge.cpp @@ -0,0 +1,124 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_bridge.cpp +/// @brief Generic Hololink bridge tool for testing libcudaq-realtime dispatch. +/// +/// Registers a simple increment RPC handler (adds 1 to each byte) and wires +/// it through the Hololink GPU-RoCE Transceiver. No QEC or decoder dependency. +/// +/// Usage: +/// ./hololink_bridge \ +/// --device=rocep1s0f0 \ +/// --peer-ip=10.0.0.2 \ +/// --remote-qp=0x2 \ +/// --gpu=0 \ +/// --timeout=60 + +#include +#include +#include +#include + +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/hololink_bridge_common.h" + +//============================================================================== +// Increment RPC Handler Function Table +//============================================================================== + +// The actual __device__ rpc_increment_handler lives in +// init_rpc_increment_function_table.cu (compiled by nvcc). We declare the +// host-callable setup function here so this .cpp can be compiled by g++. + +extern "C" void +setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries); + +//============================================================================== +// Main +//============================================================================== + +int main(int argc, char *argv[]) { + // Check for help + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "--help" || arg == "-h") { + std::cout + << "Usage: " << argv[0] << " [options]\n" + << "\n" + << "Generic Hololink bridge for testing libcudaq-realtime dispatch.\n" + << "Registers increment handler (adds 1 to each byte of the RPC " + "payload).\n" + << "\n" + << "Options:\n" + << " --device=NAME IB device (default: rocep1s0f0)\n" + << " --peer-ip=ADDR FPGA/emulator IP (default: 10.0.0.2)\n" + << " --remote-qp=N Remote QP number (default: 0x2)\n" + << " --gpu=N GPU device ID (default: 0)\n" + << " --timeout=N Timeout in seconds (default: 60)\n" + << " --page-size=N Ring buffer slot size (default: 384)\n" + << " --num-pages=N Number of ring buffer slots (default: " + "64)\n" + << " --exchange-qp Enable QP exchange protocol\n" + << " --exchange-port=N TCP port for QP exchange (default: " + "12345)\n"; + return 0; + } + } + + try { + std::cout << "=== Hololink Generic Bridge ===" << std::endl; + + // Parse common bridge args + cudaq::realtime::BridgeConfig config; + cudaq::realtime::parse_bridge_args(argc, argv, config); + + // Frame size: RPCHeader + 256 bytes payload + config.frame_size = sizeof(cudaq::realtime::RPCHeader) + 256; + + std::cout << "Device: " << config.device << std::endl; + std::cout << "Peer IP: " << config.peer_ip << std::endl; + std::cout << "Remote QP: 0x" << std::hex << config.remote_qp << std::dec + << std::endl; + std::cout << "GPU: " << config.gpu_id << std::endl; + + // Initialize CUDA early to allocate function table + cudaError_t err = cudaSetDevice(config.gpu_id); + if (err != cudaSuccess) { + std::cerr << "ERROR: cudaSetDevice failed: " << cudaGetErrorString(err) + << std::endl; + return 1; + } + + // Set up increment RPC function table on GPU + cudaq_function_entry_t *d_function_entries = nullptr; + err = cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t)); + if (err != cudaSuccess) { + std::cerr << "ERROR: cudaMalloc failed: " << cudaGetErrorString(err) + << std::endl; + return 1; + } + setup_rpc_increment_function_table(d_function_entries); + + config.d_function_entries = d_function_entries; + config.func_count = 1; + config.launch_fn = &cudaq::realtime::bridge_launch_dispatch_kernel; + config.cleanup_fn = [d_function_entries]() { + cudaFree(d_function_entries); + }; + + return cudaq::realtime::bridge_run(config); + + } catch (const std::exception &e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return 1; + } +} diff --git a/realtime/unittests/utils/hololink_fpga_emulator.cpp b/realtime/unittests/utils/hololink_fpga_emulator.cpp new file mode 100644 index 00000000..284fff87 --- /dev/null +++ b/realtime/unittests/utils/hololink_fpga_emulator.cpp @@ -0,0 +1,1210 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_fpga_emulator.cpp +/// @brief Software FPGA emulator for Hololink RPC testing. +/// +/// Emulates the FPGA's role in the RPC pipeline: +/// 1. Hololink UDP control plane server (register read/write) +/// 2. Playback BRAM (receives payloads from playback tool) +/// 3. RDMA transmit (sends RPC requests to bridge) +/// 4. RDMA receive (receives RPC responses from bridge) +/// 5. ILA capture RAM (stores responses for verification readback) +/// +/// Three-tool workflow: +/// 1. Start this emulator (prints QP number) +/// 2. Start hololink_mock_decoder_bridge with --remote-qp= +/// 3. Start hololink_fpga_syndrome_playback --control-port= +/// with bridge's QP/RKEY/buffer-addr +/// +/// The playback tool drives the emulator via UDP just as it would a real FPGA. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +//============================================================================== +// Global shutdown flag +//============================================================================== + +static std::atomic g_shutdown{false}; +static void signal_handler(int) { g_shutdown = true; } + +//============================================================================== +// Hololink Protocol Constants +//============================================================================== + +static constexpr uint8_t WR_DWORD = 0x04; +static constexpr uint8_t WR_BLOCK = 0x09; +static constexpr uint8_t RD_DWORD = 0x14; +static constexpr uint8_t RD_BLOCK = 0x19; + +static constexpr uint8_t REQUEST_FLAGS_ACK_REQUEST = 0x01; +static constexpr uint8_t RESPONSE_SUCCESS = 0x00; + +// VP register offsets (relative to vp_address) +static constexpr uint32_t DP_QP = 0x00; +static constexpr uint32_t DP_RKEY = 0x04; +static constexpr uint32_t DP_PAGE_LSB = 0x08; +static constexpr uint32_t DP_PAGE_MSB = 0x0C; +static constexpr uint32_t DP_PAGE_INC = 0x10; +static constexpr uint32_t DP_MAX_BUFF = 0x14; +static constexpr uint32_t DP_BUFFER_LENGTH = 0x18; + +// HIF register offsets (relative to hif_address) +static constexpr uint32_t DP_VP_MASK = 0x0C; + +// Player registers +static constexpr uint32_t PLAYER_BASE = 0x50000000; +static constexpr uint32_t PLAYER_ENABLE = PLAYER_BASE + 0x04; +static constexpr uint32_t PLAYER_TIMER = PLAYER_BASE + 0x08; +static constexpr uint32_t PLAYER_WIN_SIZE = PLAYER_BASE + 0x0C; +static constexpr uint32_t PLAYER_WIN_NUM = PLAYER_BASE + 0x10; + +// Playback BRAM +static constexpr uint32_t RAM_BASE = 0x50100000; +static constexpr int BRAM_NUM_BANKS = 16; +static constexpr int BRAM_W_SAMPLE_ADDR = 9; // log2(512 entries) +static constexpr int BRAM_BANK_STRIDE = 1 << (BRAM_W_SAMPLE_ADDR + 2); // 2048 + +// ILA capture +static constexpr uint32_t ILA_BASE = 0x40000000; +static constexpr uint32_t ILA_CTRL = ILA_BASE + 0x00; +static constexpr uint32_t ILA_STATUS = ILA_BASE + 0x80; +static constexpr uint32_t ILA_SAMPLE_ADDR = ILA_BASE + 0x84; +static constexpr uint32_t ILA_DATA_BASE = 0x40100000; +static constexpr int ILA_NUM_BANKS = 17; +static constexpr int ILA_W_ADDR = 13; // log2(8192 entries) +static constexpr int ILA_BANK_STRIDE = 1 << (ILA_W_ADDR + 2); // 32768 + +// Ring buffer +static constexpr int NUM_BUFFERS = 64; + +//============================================================================== +// RDMA Context (adapted from cuda-qx rdma_utils.hpp) +//============================================================================== + +class RdmaContext { +public: + ~RdmaContext() { cleanup(); } + + bool open(const std::string &device_name, int port = 1) { + int num_devices; + ibv_device **devices = ibv_get_device_list(&num_devices); + if (!devices || num_devices == 0) + return false; + + ibv_device *target = nullptr; + for (int i = 0; i < num_devices; i++) { + if (device_name == ibv_get_device_name(devices[i])) { + target = devices[i]; + break; + } + } + if (!target) { + ibv_free_device_list(devices); + return false; + } + + ctx_ = ibv_open_device(target); + ibv_free_device_list(devices); + if (!ctx_) + return false; + + port_ = port; + pd_ = ibv_alloc_pd(ctx_); + if (!pd_) { + cleanup(); + return false; + } + + if (ibv_query_port(ctx_, port_, &port_attr_) != 0) { + cleanup(); + return false; + } + + gid_index_ = find_roce_v2_gid_index(); + return true; + } + + ibv_cq *create_cq(int size) { + return ibv_create_cq(ctx_, size, nullptr, nullptr, 0); + } + + ibv_mr *register_memory(void *addr, size_t size, + int access = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE) { + return ibv_reg_mr(pd_, addr, size, access); + } + + ibv_qp *create_qp(ibv_cq *send_cq, ibv_cq *recv_cq, uint32_t max_send_wr = 64, + uint32_t max_recv_wr = 64) { + ibv_qp_init_attr init_attr{}; + init_attr.qp_type = IBV_QPT_UC; // Unreliable Connected - matches FPGA + init_attr.send_cq = send_cq; + init_attr.recv_cq = recv_cq; + init_attr.cap.max_send_wr = max_send_wr; + init_attr.cap.max_recv_wr = max_recv_wr; + init_attr.cap.max_send_sge = 1; + init_attr.cap.max_recv_sge = 1; + return ibv_create_qp(pd_, &init_attr); + } + + bool qp_to_init(ibv_qp *qp) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_INIT; + attr.port_num = port_; + attr.pkey_index = 0; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; + return ibv_modify_qp(qp, &attr, + IBV_QP_STATE | IBV_QP_PORT | IBV_QP_PKEY_INDEX | + IBV_QP_ACCESS_FLAGS) == 0; + } + + bool qp_to_rtr(ibv_qp *qp, const ibv_gid &remote_gid, uint32_t remote_qp_num, + uint32_t psn = 0) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = port_attr_.active_mtu; + attr.dest_qp_num = remote_qp_num; + attr.rq_psn = psn; + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.dgid = remote_gid; + attr.ah_attr.grh.sgid_index = gid_index_; + attr.ah_attr.grh.hop_limit = 64; + attr.ah_attr.grh.traffic_class = 0; + attr.ah_attr.dlid = 0; + attr.ah_attr.sl = 0; + attr.ah_attr.src_path_bits = 0; + attr.ah_attr.port_num = port_; + return ibv_modify_qp(qp, &attr, + IBV_QP_STATE | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_AV) == 0; + } + + bool qp_to_rts(ibv_qp *qp, uint32_t psn = 0) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = psn; + return ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN) == 0; + } + + bool post_recv(ibv_qp *qp, uint64_t wr_id, void *addr, uint32_t length, + uint32_t lkey) { + ibv_sge sge{}; + sge.addr = reinterpret_cast(addr); + sge.length = length; + sge.lkey = lkey; + + ibv_recv_wr wr{}; + wr.wr_id = wr_id; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.next = nullptr; + + ibv_recv_wr *bad_wr = nullptr; + return ibv_post_recv(qp, &wr, &bad_wr) == 0; + } + + bool post_rdma_write_imm(ibv_qp *qp, uint64_t wr_id, void *local_addr, + uint32_t length, uint32_t lkey, uint64_t remote_addr, + uint32_t rkey, uint32_t imm_data) { + ibv_sge sge{}; + sge.addr = reinterpret_cast(local_addr); + sge.length = length; + sge.lkey = lkey; + + ibv_send_wr wr{}; + wr.wr_id = wr_id; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = IBV_SEND_SIGNALED; + wr.imm_data = htonl(imm_data); + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + wr.next = nullptr; + + ibv_send_wr *bad_wr = nullptr; + return ibv_post_send(qp, &wr, &bad_wr) == 0; + } + + int poll_cq(ibv_cq *cq, ibv_wc *wc, int max_wc = 1) { + return ibv_poll_cq(cq, max_wc, wc); + } + + int get_gid_index() const { return gid_index_; } + +private: + void cleanup() { + if (pd_) { + ibv_dealloc_pd(pd_); + pd_ = nullptr; + } + if (ctx_) { + ibv_close_device(ctx_); + ctx_ = nullptr; + } + } + + int find_roce_v2_gid_index() { + int best_gid = -1; + for (int i = 0; i < port_attr_.gid_tbl_len; i++) { + ibv_gid gid; + if (ibv_query_gid(ctx_, port_, i, &gid) == 0) { + if (gid.raw[10] == 0xff && gid.raw[11] == 0xff) { + best_gid = i; // Last match = RoCE v2 + } + } + } + return (best_gid >= 0) ? best_gid : 0; + } + + ibv_context *ctx_ = nullptr; + ibv_pd *pd_ = nullptr; + ibv_port_attr port_attr_{}; + int port_ = 1; + int gid_index_ = 0; +}; + +//============================================================================== +// RDMA Buffer +//============================================================================== + +class RdmaBuffer { +public: + ~RdmaBuffer() { release(); } + + bool allocate(RdmaContext &ctx, size_t size) { + size_t page_size = 4096; + size_t aligned = ((size + page_size - 1) / page_size) * page_size; + data_ = aligned_alloc(page_size, aligned); + if (!data_) + return false; + size_ = size; + memset(data_, 0, aligned); + mr_ = ctx.register_memory(data_, aligned, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if (!mr_) { + ::free(data_); + data_ = nullptr; + return false; + } + return true; + } + + void release() { + if (mr_) { + ibv_dereg_mr(mr_); + mr_ = nullptr; + } + if (data_) { + ::free(data_); + data_ = nullptr; + } + } + + void *data() const { return data_; } + size_t size() const { return size_; } + uint32_t lkey() const { return mr_ ? mr_->lkey : 0; } + uint32_t rkey() const { return mr_ ? mr_->rkey : 0; } + +private: + void *data_ = nullptr; + size_t size_ = 0; + ibv_mr *mr_ = nullptr; +}; + +//============================================================================== +// Emulated Register File +//============================================================================== + +class RegisterFile { +public: + void write(uint32_t addr, uint32_t value) { + std::lock_guard lock(mu_); + regs_[addr] = value; + } + + uint32_t read(uint32_t addr) const { + std::lock_guard lock(mu_); + auto it = regs_.find(addr); + return (it != regs_.end()) ? it->second : 0; + } + + /// Batch write (for BRAM loading efficiency). + void write_batch(const std::vector> &writes) { + std::lock_guard lock(mu_); + for (auto &[addr, val] : writes) { + regs_[addr] = val; + } + } + + /// Read a range of contiguous 32-bit registers. + std::vector read_range(uint32_t base_addr, uint32_t count) const { + std::lock_guard lock(mu_); + std::vector result(count); + for (uint32_t i = 0; i < count; i++) { + auto it = regs_.find(base_addr + i * 4); + result[i] = (it != regs_.end()) ? it->second : 0; + } + return result; + } + +private: + mutable std::mutex mu_; + std::unordered_map regs_; +}; + +//============================================================================== +// RDMA Target Config (decoded from VP register writes) +//============================================================================== + +struct RdmaTargetConfig { + uint32_t qp_number = 0; + uint32_t rkey = 0; + uint64_t buffer_addr = 0; + uint32_t page_inc = 0; // bytes + uint32_t max_buff = 0; // max buffer index + uint32_t buffer_length = 0; + + // Temporary storage for two-part address + uint32_t page_lsb = 0; + uint32_t page_msb = 0; + + // Track whether key fields were explicitly set (buffer_addr=0 is valid + // when Hololink uses IOVA with dmabuf). + bool qp_set = false; + bool rkey_set = false; + + void update_addr() { + // Hololink encodes: PAGE_LSB = addr >> 7, PAGE_MSB = addr >> 32 + // Reconstruct: addr = (MSB << 32) | (LSB << 7) + buffer_addr = (static_cast(page_msb) << 32) | + (static_cast(page_lsb) << 7); + } + + bool is_complete() const { + // buffer_addr=0 is valid (Hololink IOVA/dmabuf), so we only check + // that QP and RKEY were explicitly set. + return qp_set && rkey_set; + } + + void print() const { + std::cout << " RDMA Target Config:" << std::endl; + std::cout << " QP: 0x" << std::hex << qp_number << std::dec << std::endl; + std::cout << " RKEY: 0x" << std::hex << rkey << std::dec << std::endl; + std::cout << " Buffer addr: 0x" << std::hex << buffer_addr << std::dec + << std::endl; + std::cout << " Page inc: " << page_inc << " bytes" << std::endl; + std::cout << " Max buff: " << max_buff << std::endl; + } +}; + +//============================================================================== +// UDP Control Plane Server +//============================================================================== + +class ControlPlaneServer { +public: + ControlPlaneServer(uint16_t port, uint32_t vp_address, uint32_t hif_address, + RegisterFile ®s) + : port_(port), vp_addr_(vp_address), hif_addr_(hif_address), regs_(regs) { + } + + ~ControlPlaneServer() { stop(); } + + void set_my_qp(uint32_t qp) { my_qp_ = qp; } + + bool start() { + fd_ = socket(AF_INET, SOCK_DGRAM, 0); + if (fd_ < 0) + return false; + + int opt = 1; + setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sockaddr_in addr{}; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons(port_); + if (bind(fd_, reinterpret_cast(&addr), sizeof(addr)) < 0) { + ::close(fd_); + fd_ = -1; + return false; + } + + running_ = true; + thread_ = std::thread(&ControlPlaneServer::run, this); + return true; + } + + void stop() { + running_ = false; + if (fd_ >= 0) { + shutdown(fd_, SHUT_RDWR); + ::close(fd_); + fd_ = -1; + } + if (thread_.joinable()) + thread_.join(); + } + + /// Block until RDMA config is complete or timeout. + bool wait_for_config(int timeout_ms = 60000) { + auto start = std::chrono::steady_clock::now(); + while (!target_.is_complete() && !g_shutdown) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); + if (elapsed >= timeout_ms) + return false; + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return target_.is_complete(); + } + + const RdmaTargetConfig &target() const { return target_; } + + /// Check if player_enable was set to 1. + bool playback_triggered() const { return playback_triggered_.load(); } + void clear_playback_trigger() { playback_triggered_ = false; } + + /// Get player config. + uint32_t window_size() const { return regs_.read(PLAYER_WIN_SIZE); } + uint32_t window_number() const { return regs_.read(PLAYER_WIN_NUM); } + uint32_t timer_spacing() const { return regs_.read(PLAYER_TIMER); } + +private: + void run() { + std::vector buf(4096); + while (running_ && !g_shutdown) { + fd_set fds; + FD_ZERO(&fds); + FD_SET(fd_, &fds); + timeval tv{0, 100000}; // 100ms + + int ready = select(fd_ + 1, &fds, nullptr, nullptr, &tv); + if (ready <= 0) + continue; + + sockaddr_in client{}; + socklen_t clen = sizeof(client); + ssize_t len = recvfrom(fd_, buf.data(), buf.size(), 0, + reinterpret_cast(&client), &clen); + if (len < 6) + continue; + + handle_packet(buf.data(), static_cast(len), client); + } + } + + // --- Packet helpers --- + + static uint32_t read_be32(const uint8_t *p) { + return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | + (uint32_t(p[2]) << 8) | p[3]; + } + + static uint16_t read_be16(const uint8_t *p) { + return (uint16_t(p[0]) << 8) | p[1]; + } + + static void write_be32(uint8_t *p, uint32_t v) { + p[0] = (v >> 24) & 0xFF; + p[1] = (v >> 16) & 0xFF; + p[2] = (v >> 8) & 0xFF; + p[3] = v & 0xFF; + } + + static void write_be16(uint8_t *p, uint16_t v) { + p[0] = (v >> 8) & 0xFF; + p[1] = v & 0xFF; + } + + // --- Handle incoming packet --- + + void handle_packet(const uint8_t *data, size_t len, + const sockaddr_in &client) { + uint8_t opcode = data[0]; + uint8_t flags = data[1]; + uint16_t seq = read_be16(data + 2); + + switch (opcode) { + case WR_DWORD: + if (len >= 14) + handle_wr_dword(data, flags, seq, client); + break; + case WR_BLOCK: + handle_wr_block(data, len, flags, seq, client); + break; + case RD_DWORD: + if (len >= 10) + handle_rd_dword(data, flags, seq, client); + break; + case RD_BLOCK: + handle_rd_block(data, len, flags, seq, client); + break; + default: + // Unknown opcode - send error ACK + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, opcode, flags, seq); + break; + } + } + + void handle_wr_dword(const uint8_t *data, uint8_t flags, uint16_t seq, + const sockaddr_in &client) { + uint32_t addr = read_be32(data + 6); + uint32_t val = read_be32(data + 10); + process_register_write(addr, val); + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, WR_DWORD, flags, seq); + } + + void handle_wr_block(const uint8_t *data, size_t len, uint8_t flags, + uint16_t seq, const sockaddr_in &client) { + // Pairs start at offset 6, each pair is 8 bytes + size_t offset = 6; + std::vector> batch; + while (offset + 8 <= len) { + uint32_t addr = read_be32(data + offset); + uint32_t val = read_be32(data + offset + 4); + batch.push_back({addr, val}); + offset += 8; + } + + // Batch write to register file + regs_.write_batch(batch); + + // Process VP register updates + for (auto &[addr, val] : batch) { + process_vp_update(addr, val); + check_player_enable(addr, val); + } + + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, WR_BLOCK, flags, seq); + } + + void handle_rd_dword(const uint8_t *data, uint8_t flags, uint16_t seq, + const sockaddr_in &client) { + uint32_t addr = read_be32(data + 6); + uint32_t val = regs_.read(addr); + + // Response: cmd(1) + flags(1) + seq(2) + response_code(1) + reserved(1) + + // addr(4) + value(4) + latched_seq(2) = 16 bytes + uint8_t resp[16]; + resp[0] = RD_DWORD; + resp[1] = flags; + write_be16(resp + 2, seq); + resp[4] = RESPONSE_SUCCESS; + resp[5] = 0; // reserved + write_be32(resp + 6, addr); + write_be32(resp + 10, val); + write_be16(resp + 14, seq); // latched sequence + + sendto(fd_, resp, sizeof(resp), 0, + reinterpret_cast(&client), sizeof(client)); + } + + void handle_rd_block(const uint8_t *data, size_t len, uint8_t flags, + uint16_t seq, const sockaddr_in &client) { + // Parse addresses from request + std::vector addrs; + size_t offset = 6; + while (offset + 8 <= len) { + addrs.push_back(read_be32(data + offset)); + offset += 8; + } + + // Build response: cmd(1) + flags(1) + seq(2) + rc(1) + reserved(1) + + // N*(addr(4)+value(4)) + latched_seq(2) + size_t resp_len = 6 + addrs.size() * 8 + 2; + std::vector resp(resp_len); + resp[0] = RD_BLOCK; + resp[1] = flags; + write_be16(resp.data() + 2, seq); + resp[4] = RESPONSE_SUCCESS; + resp[5] = 0; + + size_t roff = 6; + for (uint32_t a : addrs) { + uint32_t val = regs_.read(a); + write_be32(resp.data() + roff, a); + write_be32(resp.data() + roff + 4, val); + roff += 8; + } + write_be16(resp.data() + roff, seq); // latched sequence + + sendto(fd_, resp.data(), resp.size(), 0, + reinterpret_cast(&client), sizeof(client)); + } + + // --- Write ACK for WR_DWORD / WR_BLOCK --- + + void send_write_ack(const sockaddr_in &client, uint8_t cmd, uint8_t flags, + uint16_t seq) { + uint8_t resp[5]; + resp[0] = cmd; + resp[1] = flags; + write_be16(resp + 2, seq); + resp[4] = RESPONSE_SUCCESS; + sendto(fd_, resp, sizeof(resp), 0, + reinterpret_cast(&client), sizeof(client)); + } + + // --- Register write processing --- + + void process_register_write(uint32_t addr, uint32_t val) { + regs_.write(addr, val); + process_vp_update(addr, val); + check_player_enable(addr, val); + } + + void process_vp_update(uint32_t addr, uint32_t val) { + // Check if this is a VP register (relative to vp_addr_) + if (addr < vp_addr_ || addr >= vp_addr_ + 0x100) + return; + + uint32_t offset = addr - vp_addr_; + switch (offset) { + case DP_QP: + target_.qp_number = val; + target_.qp_set = true; + break; + case DP_RKEY: + target_.rkey = val; + target_.rkey_set = true; + break; + case DP_PAGE_LSB: + target_.page_lsb = val; + target_.update_addr(); + break; + case DP_PAGE_MSB: + target_.page_msb = val; + target_.update_addr(); + break; + case DP_PAGE_INC: + target_.page_inc = val << 7; // PAGES encoding: value * 128 + break; + case DP_MAX_BUFF: + target_.max_buff = val; + break; + case DP_BUFFER_LENGTH: + target_.buffer_length = val; + break; + } + } + + void check_player_enable(uint32_t addr, uint32_t val) { + if (addr == PLAYER_ENABLE && val == 1) { + playback_triggered_ = true; + } + } + + uint16_t port_; + uint32_t vp_addr_; + uint32_t hif_addr_; + RegisterFile ®s_; + int fd_ = -1; + std::atomic running_{false}; + std::thread thread_; + uint32_t my_qp_ = 0; + RdmaTargetConfig target_; + std::atomic playback_triggered_{false}; +}; + +//============================================================================== +// BRAM Reassembly +//============================================================================== + +/// Reassemble one window from the 16-bank BRAM layout. +/// Each 64-byte beat is spread across 16 banks (4 bytes each). +/// @param regs Register file to read from +/// @param window_index Window number +/// @param cycles_per_window Number of 64-byte beats per window +/// @return Reassembled window payload +static std::vector reassemble_window(const RegisterFile ®s, + uint32_t window_index, + uint32_t cycles_per_window) { + std::vector payload(cycles_per_window * 64, 0); + for (uint32_t cycle = 0; cycle < cycles_per_window; cycle++) { + uint32_t sample_index = window_index * cycles_per_window + cycle; + for (int bank = 0; bank < BRAM_NUM_BANKS; bank++) { + uint32_t addr = + RAM_BASE + (bank << (BRAM_W_SAMPLE_ADDR + 2)) + (sample_index * 4); + uint32_t val = regs.read(addr); + // Store as little-endian (matching FPGA BRAM word order) + size_t byte_offset = cycle * 64 + bank * 4; + memcpy(&payload[byte_offset], &val, 4); + } + } + return payload; +} + +//============================================================================== +// ILA Capture Storage +//============================================================================== + +/// Store a correction response into the ILA capture register file. +/// The ILA stores each sample across 17 banks of 32-bit words. +/// Banks 0-15 = 512-bit AXI data bus (raw correction bytes). +/// Bank 16 = control signals: +/// bit 0 = tvalid (bit 512 of the captured word) +/// bit 1 = tlast (bit 513) +/// bits [8:2] = wr_tcnt (bits 520:514, 7-bit write transaction count) +static void store_ila_sample(RegisterFile ®s, uint32_t sample_index, + const uint8_t *data, size_t data_len) { + // Spread the data across banks 0-15 (the 512-bit AXI data bus). + for (int bank = 0; bank < ILA_NUM_BANKS - 1; bank++) { + uint32_t addr = + ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (sample_index * 4); + uint32_t val = 0; + size_t byte_offset = bank * 4; + if (byte_offset < data_len) { + size_t copy_len = std::min(4, data_len - byte_offset); + memcpy(&val, data + byte_offset, copy_len); + } + regs.write(addr, val); + } + + // Bank 16: set control signals (tvalid=1, tlast=1, wr_tcnt=1) + { + uint32_t ctrl_addr = ILA_DATA_BASE + + ((ILA_NUM_BANKS - 1) << (ILA_W_ADDR + 2)) + + (sample_index * 4); + uint32_t ctrl_val = 0; + ctrl_val |= (1u << 0); // tvalid (bit 512) + ctrl_val |= (1u << 1); // tlast (bit 513) + ctrl_val |= (1u << 2); // wr_tcnt = 1 (bits 514+, value 1 in 7-bit field) + regs.write(ctrl_addr, ctrl_val); + } + + // Update sample count + regs.write(ILA_SAMPLE_ADDR, sample_index + 1); +} + +//============================================================================== +// Command-Line Arguments +//============================================================================== + +struct EmulatorArgs { + std::string device = "rocep1s0f0"; + int ib_port = 1; + uint16_t control_port = 8193; + std::string bridge_ip = ""; // Bridge IP (for GID, auto-detect if empty) + uint32_t vp_address = 0x1000; + uint32_t hif_address = 0x0800; + size_t page_size = 256; // Default slot size for responses RX +}; + +static void print_usage(const char *prog) { + std::cout + << "Usage: " << prog << " [options]\n" + << "\nFPGA emulator for QEC decode loop testing.\n" + << "\nOptions:\n" + << " --device=NAME IB device name (default: rocep1s0f0)\n" + << " --ib-port=N IB port number (default: 1)\n" + << " --port=N UDP control plane port (default: 8193)\n" + << " --bridge-ip=ADDR Bridge tool IP for GID (default: auto)\n" + << " --vp-address=ADDR VP register base (default: 0x1000)\n" + << " --hif-address=ADDR HIF register base (default: 0x0800)\n" + << " --page-size=N Slot size for correction RX (default: 256)\n" + << " --help Show this help\n"; +} + +static EmulatorArgs parse_args(int argc, char *argv[]) { + EmulatorArgs args; + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.find("--device=") == 0) + args.device = arg.substr(9); + else if (arg.find("--ib-port=") == 0) + args.ib_port = std::stoi(arg.substr(10)); + else if (arg.find("--port=") == 0) + args.control_port = std::stoi(arg.substr(7)); + else if (arg.find("--bridge-ip=") == 0) + args.bridge_ip = arg.substr(12); + else if (arg.find("--vp-address=") == 0) + args.vp_address = std::stoul(arg.substr(13), nullptr, 0); + else if (arg.find("--hif-address=") == 0) + args.hif_address = std::stoul(arg.substr(14), nullptr, 0); + else if (arg.find("--page-size=") == 0) + args.page_size = std::stoull(arg.substr(12)); + else if (arg == "--help" || arg == "-h") { + print_usage(argv[0]); + exit(0); + } + } + return args; +} + +//============================================================================== +// MAIN +//============================================================================== + +int main(int argc, char *argv[]) { + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + try { + auto args = parse_args(argc, argv); + + std::cout << "=== Hololink FPGA Emulator ===" << std::endl; + std::cout << "IB Device: " << args.device << std::endl; + std::cout << "Control port: " << args.control_port << std::endl; + std::cout << "VP address: 0x" << std::hex << args.vp_address << std::dec + << std::endl; + + //========================================================================== + // [1/4] Initialize RDMA + //========================================================================== + std::cout << "\n[1/4] Initializing RDMA..." << std::endl; + + RdmaContext rdma; + if (!rdma.open(args.device, args.ib_port)) { + std::cerr << "ERROR: Failed to open RDMA device: " << args.device + << std::endl; + return 1; + } + std::cout << " GID index: " << rdma.get_gid_index() << std::endl; + + // TX buffer for outgoing syndromes + RdmaBuffer tx_buffer; + if (!tx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { + std::cerr << "ERROR: Failed to allocate TX buffer" << std::endl; + return 1; + } + + // RX buffer for incoming responses (same page_size as bridge for + // symmetry) + RdmaBuffer rx_buffer; + if (!rx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { + std::cerr << "ERROR: Failed to allocate RX buffer" << std::endl; + return 1; + } + + // Create CQs and QP + ibv_cq *tx_cq = rdma.create_cq(NUM_BUFFERS * 2); + ibv_cq *rx_cq = rdma.create_cq(NUM_BUFFERS * 2); + if (!tx_cq || !rx_cq) { + std::cerr << "ERROR: Failed to create CQs" << std::endl; + return 1; + } + + ibv_qp *qp = rdma.create_qp(tx_cq, rx_cq, NUM_BUFFERS, NUM_BUFFERS); + if (!qp) { + std::cerr << "ERROR: Failed to create QP" << std::endl; + return 1; + } + if (!rdma.qp_to_init(qp)) { + std::cerr << "ERROR: Failed to set QP to INIT" << std::endl; + return 1; + } + + std::cout << " QP Number: 0x" << std::hex << qp->qp_num << std::dec + << std::endl; + std::cout << " TX buffer: " << tx_buffer.size() << " bytes" << std::endl; + std::cout << " RX buffer: " << rx_buffer.size() << " bytes" << std::endl; + + //========================================================================== + // [2/4] Start UDP control plane server + //========================================================================== + std::cout << "\n[2/4] Starting control plane server..." << std::endl; + + RegisterFile regs; + ControlPlaneServer server(args.control_port, args.vp_address, + args.hif_address, regs); + server.set_my_qp(qp->qp_num); + + if (!server.start()) { + std::cerr << "ERROR: Failed to start control plane server" << std::endl; + return 1; + } + std::cout << " Listening on UDP port " << args.control_port << std::endl; + std::cout << " Emulator QP: 0x" << std::hex << qp->qp_num << std::dec + << std::endl; + + //========================================================================== + // [3/4] Wait for RDMA config from playback tool + //========================================================================== + std::cout << "\n[3/4] Waiting for RDMA configuration..." << std::endl; + std::cout << " (Start bridge tool, then playback tool with " + "--control-port=" + << args.control_port << ")" << std::endl; + + if (!server.wait_for_config(300000)) { // 5 minute timeout + std::cerr << "ERROR: Timeout waiting for RDMA configuration" << std::endl; + return 1; + } + + auto &target = server.target(); + target.print(); + + // Connect QP to bridge + ibv_gid remote_gid{}; + if (!args.bridge_ip.empty()) { + // Use provided IP + remote_gid.raw[10] = 0xff; + remote_gid.raw[11] = 0xff; + inet_pton(AF_INET, args.bridge_ip.c_str(), &remote_gid.raw[12]); + } else { + // Derive from VP HOST_IP register if available + uint32_t host_ip = regs.read(args.vp_address + 0x28); // DP_HOST_IP + if (host_ip != 0) { + remote_gid.raw[10] = 0xff; + remote_gid.raw[11] = 0xff; + // DP_HOST_IP is in network byte order from inet_network() + memcpy(&remote_gid.raw[12], &host_ip, 4); + } else { + std::cerr << "ERROR: No bridge IP available. Use --bridge-ip or ensure " + "configure_roce sets HOST_IP." + << std::endl; + return 1; + } + } + + std::cout << " Connecting QP to bridge QP 0x" << std::hex + << target.qp_number << std::dec << "..." << std::endl; + + if (!rdma.qp_to_rtr(qp, remote_gid, target.qp_number, 0)) { + std::cerr << "ERROR: Failed QP -> RTR" << std::endl; + return 1; + } + if (!rdma.qp_to_rts(qp, 0)) { + std::cerr << "ERROR: Failed QP -> RTS" << std::endl; + return 1; + } + std::cout << " QP connected!" << std::endl; + + // Post receive WQEs for responses + for (size_t i = 0; i < NUM_BUFFERS; i++) { + void *addr = + static_cast(rx_buffer.data()) + (i * args.page_size); + if (!rdma.post_recv(qp, i, addr, args.page_size, rx_buffer.lkey())) { + std::cerr << "ERROR: Failed to post receive WQE " << i << std::endl; + return 1; + } + } + std::cout << " Posted " << NUM_BUFFERS << " receive WQEs" << std::endl; + + //========================================================================== + // [4/4] Wait for playback trigger, then run + //========================================================================== + std::cout << "\n[4/4] Waiting for playback trigger..." << std::endl; + + while (!server.playback_triggered() && !g_shutdown) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + if (g_shutdown) { + std::cout << "Shutdown requested" << std::endl; + return 0; + } + + std::cout << "\n=== Playback triggered ===" << std::endl; + + uint32_t win_size = server.window_size(); + uint32_t win_num = server.window_number(); + uint32_t timer = server.timer_spacing(); + uint32_t cycles_per_window = (win_size + 63) / 64; // 64 bytes per beat + + std::cout << " Window size: " << win_size << " bytes" << std::endl; + std::cout << " Window count: " << win_num << std::endl; + std::cout << " Timer spacing: " << timer << " (raw)" << std::endl; + std::cout << " Cycles per window: " << cycles_per_window << std::endl; + + // Compute pacing interval from timer register (timer = 322 * microseconds) + int pacing_us = (timer > 0) ? (timer / 322) : 10; + + // Check if ILA is armed + bool ila_armed = (regs.read(ILA_CTRL) & 0x01) != 0; + std::cout << " ILA capture: " << (ila_armed ? "armed" : "not armed") + << std::endl; + + // Determine page_size for RDMA addressing from target config + uint32_t rdma_page_size = + (target.page_inc > 0) ? target.page_inc : args.page_size; + uint32_t num_pages = target.max_buff + 1; + + std::cout << "\n=== Starting syndrome transmission ===" << std::endl; + + auto start_time = std::chrono::high_resolution_clock::now(); + uint32_t responses_received = 0; + uint32_t send_errors = 0; + uint32_t recv_timeouts = 0; + + for (uint32_t window = 0; window < win_num && !g_shutdown; window++) { + uint32_t slot = window % num_pages; + + // Reassemble syndrome payload from BRAM + auto payload = reassemble_window(regs, window, cycles_per_window); + + // Copy to RDMA TX buffer slot + uint8_t *tx_addr = + static_cast(tx_buffer.data()) + (slot * rdma_page_size); + size_t copy_len = std::min(payload.size(), rdma_page_size); + memcpy(tx_addr, payload.data(), copy_len); + + // RDMA WRITE to bridge's ring buffer + uint64_t remote_addr = target.buffer_addr + (slot * rdma_page_size); + if (!rdma.post_rdma_write_imm(qp, window, tx_addr, copy_len, + tx_buffer.lkey(), remote_addr, target.rkey, + slot)) { + std::cerr << "ERROR: RDMA WRITE failed for window " << window + << std::endl; + send_errors++; + continue; + } + + // Wait for send completion + bool send_ok = false; + auto t0 = std::chrono::steady_clock::now(); + while (!send_ok && !g_shutdown) { + ibv_wc wc; + int n = rdma.poll_cq(tx_cq, &wc, 1); + if (n > 0) { + send_ok = (wc.status == IBV_WC_SUCCESS); + if (!send_ok) { + std::cerr << "ERROR: Send CQE error: " + << ibv_wc_status_str(wc.status) << std::endl; + send_errors++; + } + break; + } + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - t0) + .count(); + if (elapsed > 5000) { + std::cerr << "ERROR: Send timeout for window " << window << std::endl; + recv_timeouts++; + break; + } + } + if (!send_ok) + continue; + + // Wait for correction response (natural pacing) + bool corr_ok = false; + t0 = std::chrono::steady_clock::now(); + while (!corr_ok && !g_shutdown) { + ibv_wc wc; + int n = rdma.poll_cq(rx_cq, &wc, 1); + if (n > 0) { + if (wc.status == IBV_WC_SUCCESS) { + corr_ok = true; + responses_received++; + + // Store in ILA capture if armed + if (ila_armed) { + uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; + uint8_t *resp_data = static_cast(rx_buffer.data()) + + (rx_slot * args.page_size); + store_ila_sample(regs, window, resp_data, wc.byte_len); + } + + // Re-post receive WQE + uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; + void *rx_addr = static_cast(rx_buffer.data()) + + (rx_slot * args.page_size); + rdma.post_recv(qp, rx_slot, rx_addr, args.page_size, + rx_buffer.lkey()); + } else { + std::cerr << "ERROR: Recv CQE error: " + << ibv_wc_status_str(wc.status) << std::endl; + } + break; + } + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - t0) + .count(); + if (elapsed > 10000) { + std::cerr << "ERROR: Correction timeout for window " << window + << std::endl; + recv_timeouts++; + break; + } + } + + // Progress + if ((window + 1) % 10 == 0 || window == win_num - 1) { + std::cout << " Window " << (window + 1) << "/" << win_num + << " (responses: " << responses_received + << ", errors: " << send_errors << ")" << std::endl; + } + + // Pacing delay + if (pacing_us > 0 && window + 1 < win_num) { + std::this_thread::sleep_for(std::chrono::microseconds(pacing_us)); + } + } + + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + + // Mark ILA as done + if (ila_armed) { + regs.write(ILA_STATUS, regs.read(ILA_STATUS) | 0x02); // done bit + } + + // Report results + std::cout << "\n=== Emulator Results ===" << std::endl; + std::cout << " Windows sent: " << win_num << std::endl; + std::cout << " Responses received: " << responses_received << std::endl; + std::cout << " Send errors: " << send_errors << std::endl; + std::cout << " Timeouts: " << recv_timeouts << std::endl; + std::cout << " Duration: " << duration.count() << " ms" << std::endl; + + // Keep running to allow playback tool to read ILA capture data + if (ila_armed) { + std::cout << "\nWaiting for ILA readback (Ctrl+C to stop)..." + << std::endl; + while (!g_shutdown) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + // Cleanup + server.stop(); + ibv_destroy_qp(qp); + ibv_destroy_cq(tx_cq); + ibv_destroy_cq(rx_cq); + + if (send_errors == 0 && recv_timeouts == 0 && + responses_received == win_num) { + std::cout << "\n*** EMULATOR: ALL WINDOWS PROCESSED ***" << std::endl; + return 0; + } else { + std::cout << "\n*** EMULATOR: ERRORS DETECTED ***" << std::endl; + return 1; + } + + } catch (const std::exception &e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return 1; + } +} diff --git a/realtime/unittests/utils/hololink_fpga_playback.cpp b/realtime/unittests/utils/hololink_fpga_playback.cpp new file mode 100644 index 00000000..c98d346f --- /dev/null +++ b/realtime/unittests/utils/hololink_fpga_playback.cpp @@ -0,0 +1,534 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_fpga_playback.cpp +/// @brief Generic RPC playback tool for Hololink FPGA / emulator testing. +/// +/// Sends RPC messages to the FPGA (or emulator) via the Hololink UDP control +/// plane, triggering RDMA transmission to the bridge. After playback, reads +/// back responses from the ILA capture RAM and verifies them. +/// +/// For the generic bridge, the payload is a sequence of ascending bytes and +/// the expected response is each byte incremented by 1. +/// +/// Usage: +/// ./hololink_fpga_playback \ +/// --control-ip=10.0.0.2 --control-port=8193 \ +/// --bridge-qp=0x5 --bridge-rkey=12345 --bridge-buffer=0x7f... \ +/// --page-size=384 --num-pages=64 --num-shots=100 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + +//============================================================================== +// Hololink Control Plane Protocol +//============================================================================== + +static constexpr uint8_t WR_DWORD = 0x04; +static constexpr uint8_t WR_BLOCK = 0x09; +static constexpr uint8_t RD_DWORD = 0x14; +static constexpr uint8_t RD_BLOCK = 0x19; +static constexpr uint8_t REQUEST_FLAGS_ACK_REQUEST = 0x01; +static constexpr uint8_t RESPONSE_SUCCESS = 0x00; + +// VP register offsets +static constexpr uint32_t DP_QP = 0x00; +static constexpr uint32_t DP_RKEY = 0x04; +static constexpr uint32_t DP_PAGE_LSB = 0x08; +static constexpr uint32_t DP_PAGE_MSB = 0x0C; +static constexpr uint32_t DP_PAGE_INC = 0x10; +static constexpr uint32_t DP_MAX_BUFF = 0x14; +static constexpr uint32_t DP_BUFFER_LENGTH = 0x18; +static constexpr uint32_t DP_HOST_IP = 0x28; + +// HIF register offsets +static constexpr uint32_t DP_VP_MASK = 0x0C; + +// Player registers +static constexpr uint32_t PLAYER_BASE = 0x50000000; +static constexpr uint32_t PLAYER_ENABLE = PLAYER_BASE + 0x04; +static constexpr uint32_t PLAYER_TIMER = PLAYER_BASE + 0x08; +static constexpr uint32_t PLAYER_WIN_SIZE = PLAYER_BASE + 0x0C; +static constexpr uint32_t PLAYER_WIN_NUM = PLAYER_BASE + 0x10; + +// Playback BRAM +static constexpr uint32_t RAM_BASE = 0x50100000; +static constexpr int BRAM_NUM_BANKS = 16; +static constexpr int BRAM_W_SAMPLE_ADDR = 9; +static constexpr int BRAM_BANK_STRIDE = 1 << (BRAM_W_SAMPLE_ADDR + 2); + +// ILA capture +static constexpr uint32_t ILA_BASE = 0x40000000; +static constexpr uint32_t ILA_CTRL = ILA_BASE + 0x00; +static constexpr uint32_t ILA_STATUS = ILA_BASE + 0x80; +static constexpr uint32_t ILA_SAMPLE_ADDR = ILA_BASE + 0x84; +static constexpr uint32_t ILA_DATA_BASE = 0x40100000; +static constexpr int ILA_NUM_BANKS = 17; +static constexpr int ILA_W_ADDR = 13; +static constexpr int ILA_BANK_STRIDE = 1 << (ILA_W_ADDR + 2); + +// Hololink page encoding +static constexpr int PAGE_SHIFT = 7; // 128-byte pages + +//============================================================================== +// UDP helpers +//============================================================================== + +static void write_be32(uint8_t *p, uint32_t v) { + p[0] = (v >> 24) & 0xFF; + p[1] = (v >> 16) & 0xFF; + p[2] = (v >> 8) & 0xFF; + p[3] = v & 0xFF; +} + +static void write_be16(uint8_t *p, uint16_t v) { + p[0] = (v >> 8) & 0xFF; + p[1] = v & 0xFF; +} + +static uint32_t read_be32(const uint8_t *p) { + return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | + (uint32_t(p[2]) << 8) | p[3]; +} + +//============================================================================== +// Control plane client +//============================================================================== + +class ControlPlaneClient { +public: + bool connect(const std::string &ip, uint16_t port) { + fd_ = socket(AF_INET, SOCK_DGRAM, 0); + if (fd_ < 0) + return false; + + addr_.sin_family = AF_INET; + addr_.sin_port = htons(port); + inet_pton(AF_INET, ip.c_str(), &addr_.sin_addr); + + // Set receive timeout + timeval tv{2, 0}; + setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + return true; + } + + ~ControlPlaneClient() { + if (fd_ >= 0) + ::close(fd_); + } + + bool write_dword(uint32_t addr, uint32_t value) { + uint8_t pkt[14]; + pkt[0] = WR_DWORD; + pkt[1] = REQUEST_FLAGS_ACK_REQUEST; + write_be16(pkt + 2, seq_++); + pkt[4] = 0; + pkt[5] = 0; + write_be32(pkt + 6, addr); + write_be32(pkt + 10, value); + + sendto(fd_, pkt, sizeof(pkt), 0, reinterpret_cast(&addr_), + sizeof(addr_)); + + // Wait for ACK + uint8_t resp[16]; + ssize_t n = recv(fd_, resp, sizeof(resp), 0); + return (n >= 5 && resp[4] == RESPONSE_SUCCESS); + } + + bool write_block(const std::vector> &pairs) { + std::vector pkt(6 + pairs.size() * 8); + pkt[0] = WR_BLOCK; + pkt[1] = REQUEST_FLAGS_ACK_REQUEST; + write_be16(pkt.data() + 2, seq_++); + pkt[4] = 0; + pkt[5] = 0; + + size_t off = 6; + for (auto &[addr, val] : pairs) { + write_be32(pkt.data() + off, addr); + write_be32(pkt.data() + off + 4, val); + off += 8; + } + + sendto(fd_, pkt.data(), pkt.size(), 0, reinterpret_cast(&addr_), + sizeof(addr_)); + + uint8_t resp[16]; + ssize_t n = recv(fd_, resp, sizeof(resp), 0); + return (n >= 5 && resp[4] == RESPONSE_SUCCESS); + } + + uint32_t read_dword(uint32_t addr) { + uint8_t pkt[10]; + pkt[0] = RD_DWORD; + pkt[1] = REQUEST_FLAGS_ACK_REQUEST; + write_be16(pkt + 2, seq_++); + pkt[4] = 0; + pkt[5] = 0; + write_be32(pkt + 6, addr); + + sendto(fd_, pkt, sizeof(pkt), 0, reinterpret_cast(&addr_), + sizeof(addr_)); + + uint8_t resp[32]; + ssize_t n = recv(fd_, resp, sizeof(resp), 0); + if (n >= 14) + return read_be32(resp + 10); + return 0; + } + +private: + int fd_ = -1; + sockaddr_in addr_{}; + uint16_t seq_ = 0; +}; + +//============================================================================== +// Arguments +//============================================================================== + +struct PlaybackArgs { + std::string control_ip = "10.0.0.2"; + uint16_t control_port = 8193; + uint32_t bridge_qp = 0; + uint32_t bridge_rkey = 0; + uint64_t bridge_buffer = 0; + size_t page_size = 384; + unsigned num_pages = 64; + uint32_t num_shots = 100; + uint32_t payload_size = 8; // bytes of RPC argument data + uint32_t vp_address = 0x1000; + uint32_t hif_address = 0x0800; + std::string bridge_ip = "10.0.0.1"; + bool verify = true; +}; + +static PlaybackArgs parse_args(int argc, char *argv[]) { + PlaybackArgs args; + for (int i = 1; i < argc; i++) { + std::string a = argv[i]; + if (a.find("--control-ip=") == 0) + args.control_ip = a.substr(13); + else if (a.find("--control-port=") == 0) + args.control_port = std::stoi(a.substr(15)); + else if (a.find("--bridge-qp=") == 0) + args.bridge_qp = std::stoul(a.substr(12), nullptr, 0); + else if (a.find("--bridge-rkey=") == 0) + args.bridge_rkey = std::stoul(a.substr(14), nullptr, 0); + else if (a.find("--bridge-buffer=") == 0) + args.bridge_buffer = std::stoull(a.substr(16), nullptr, 0); + else if (a.find("--page-size=") == 0) + args.page_size = std::stoull(a.substr(12)); + else if (a.find("--num-pages=") == 0) + args.num_pages = std::stoul(a.substr(12)); + else if (a.find("--num-shots=") == 0) + args.num_shots = std::stoul(a.substr(12)); + else if (a.find("--payload-size=") == 0) + args.payload_size = std::stoul(a.substr(15)); + else if (a.find("--vp-address=") == 0) + args.vp_address = std::stoul(a.substr(13), nullptr, 0); + else if (a.find("--hif-address=") == 0) + args.hif_address = std::stoul(a.substr(14), nullptr, 0); + else if (a.find("--bridge-ip=") == 0) + args.bridge_ip = a.substr(12); + else if (a == "--no-verify") + args.verify = false; + else if (a == "--help" || a == "-h") { + std::cout + << "Usage: hololink_fpga_playback [options]\n" + << "\nGeneric RPC playback tool for Hololink FPGA/emulator.\n" + << "\nOptions:\n" + << " --control-ip=ADDR Emulator/FPGA IP (default: 10.0.0.2)\n" + << " --control-port=N UDP control port (default: 8193)\n" + << " --bridge-qp=N Bridge QP number\n" + << " --bridge-rkey=N Bridge RKey\n" + << " --bridge-buffer=ADDR Bridge buffer address\n" + << " --page-size=N Ring buffer slot size (default: 384)\n" + << " --num-pages=N Number of ring buffer slots (default: " + "64)\n" + << " --num-shots=N Number of RPC messages (default: 100)\n" + << " --payload-size=N Bytes per RPC payload (default: 8)\n" + << " --vp-address=ADDR VP register base (default: 0x1000)\n" + << " --hif-address=ADDR HIF register base (default: 0x0800)\n" + << " --bridge-ip=ADDR Bridge IP for FPGA (default: 10.0.0.1)\n" + << " --no-verify Skip ILA correction verification\n"; + exit(0); + } + } + return args; +} + +//============================================================================== +// BRAM loading +//============================================================================== + +/// Build one RPC message for the increment handler. +/// Format: RPCHeader + ascending byte payload. +static std::vector build_rpc_message(uint32_t shot_index, + uint32_t payload_size) { + using cudaq::realtime::fnv1a_hash; + using cudaq::realtime::RPCHeader; + + constexpr uint32_t FUNC_ID = fnv1a_hash("rpc_increment"); + + std::vector msg(sizeof(RPCHeader) + payload_size, 0); + auto *hdr = reinterpret_cast(msg.data()); + hdr->magic = cudaq::realtime::RPC_MAGIC_REQUEST; + hdr->function_id = FUNC_ID; + hdr->arg_len = payload_size; + + uint8_t *payload = msg.data() + sizeof(RPCHeader); + for (uint32_t i = 0; i < payload_size; i++) { + payload[i] = static_cast((shot_index + i) & 0xFF); + } + return msg; +} + +/// Spread a message across 16 BRAM banks (64-byte beats). +static void load_message_to_bram(ControlPlaneClient &ctrl, + const std::vector &msg, + uint32_t window_index, + uint32_t cycles_per_window) { + std::vector> batch; + + for (uint32_t cycle = 0; cycle < cycles_per_window; cycle++) { + uint32_t sample = window_index * cycles_per_window + cycle; + for (int bank = 0; bank < BRAM_NUM_BANKS; bank++) { + uint32_t addr = + RAM_BASE + (bank << (BRAM_W_SAMPLE_ADDR + 2)) + (sample * 4); + uint32_t val = 0; + size_t byte_off = cycle * 64 + bank * 4; + if (byte_off < msg.size()) { + size_t copy_len = std::min(4, msg.size() - byte_off); + memcpy(&val, msg.data() + byte_off, copy_len); + } + batch.push_back({addr, val}); + } + + // Send in chunks to stay within UDP MTU + if (batch.size() >= 64) { + ctrl.write_block(batch); + batch.clear(); + } + } + + if (!batch.empty()) + ctrl.write_block(batch); +} + +//============================================================================== +// Main +//============================================================================== + +int main(int argc, char *argv[]) { + auto args = parse_args(argc, argv); + + std::cout << "=== Hololink Generic RPC Playback ===" << std::endl; + std::cout << "Control: " << args.control_ip << ":" << args.control_port + << std::endl; + std::cout << "Shots: " << args.num_shots << std::endl; + std::cout << "Payload size: " << args.payload_size << " bytes" << std::endl; + + ControlPlaneClient ctrl; + if (!ctrl.connect(args.control_ip, args.control_port)) { + std::cerr << "ERROR: Failed to connect to control plane" << std::endl; + return 1; + } + + //============================================================================ + // Configure RDMA target (bridge's QP/RKEY/buffer) + //============================================================================ + std::cout << "\n[1/4] Configuring RDMA target..." << std::endl; + + uint32_t vp = args.vp_address; + ctrl.write_dword(vp + DP_QP, args.bridge_qp); + ctrl.write_dword(vp + DP_RKEY, args.bridge_rkey); + ctrl.write_dword(vp + DP_PAGE_LSB, + static_cast(args.bridge_buffer >> PAGE_SHIFT)); + ctrl.write_dword(vp + DP_PAGE_MSB, + static_cast(args.bridge_buffer >> 32)); + ctrl.write_dword(vp + DP_PAGE_INC, + static_cast(args.page_size >> PAGE_SHIFT)); + ctrl.write_dword(vp + DP_MAX_BUFF, args.num_pages - 1); + + size_t frame_size = sizeof(cudaq::realtime::RPCHeader) + args.payload_size; + ctrl.write_dword(vp + DP_BUFFER_LENGTH, static_cast(frame_size)); + + // Set bridge IP for emulator GID derivation + { + in_addr a; + inet_pton(AF_INET, args.bridge_ip.c_str(), &a); + ctrl.write_dword(vp + DP_HOST_IP, a.s_addr); + } + + // Enable VP mask + ctrl.write_dword(args.hif_address + DP_VP_MASK, 0x01); + + std::cout << " Bridge QP: 0x" << std::hex << args.bridge_qp << std::dec + << std::endl; + std::cout << " Bridge RKey: " << args.bridge_rkey << std::endl; + std::cout << " Bridge Buffer: 0x" << std::hex << args.bridge_buffer + << std::dec << std::endl; + + //============================================================================ + // Load RPC messages into BRAM + //============================================================================ + std::cout << "\n[2/4] Loading RPC messages into BRAM..." << std::endl; + + uint32_t window_size = static_cast(frame_size); + uint32_t cycles_per_window = (window_size + 63) / 64; + + for (uint32_t shot = 0; shot < args.num_shots; shot++) { + auto msg = build_rpc_message(shot, args.payload_size); + load_message_to_bram(ctrl, msg, shot, cycles_per_window); + + if ((shot + 1) % 10 == 0) + std::cout << " Loaded " << (shot + 1) << "/" << args.num_shots + << std::endl; + } + + //============================================================================ + // Arm ILA and trigger playback + //============================================================================ + std::cout << "\n[3/4] Triggering playback..." << std::endl; + + // Arm ILA capture + if (args.verify) { + ctrl.write_dword(ILA_CTRL, 0x01); + } + + // Set player registers + ctrl.write_dword(PLAYER_WIN_SIZE, window_size); + ctrl.write_dword(PLAYER_WIN_NUM, args.num_shots); + ctrl.write_dword(PLAYER_TIMER, 322 * 100); // 100 us spacing + + // Trigger + ctrl.write_dword(PLAYER_ENABLE, 1); + std::cout << " Playback triggered for " << args.num_shots << " shots" + << std::endl; + + //============================================================================ + // Wait and verify ILA capture + //============================================================================ + if (args.verify) { + std::cout << "\n[4/4] Verifying responses..." << std::endl; + + // Wait for ILA to indicate done (bit 1 of ILA_STATUS) + int timeout = 120; // seconds + bool done = false; + for (int i = 0; i < timeout * 10 && !done; i++) { + uint32_t status = ctrl.read_dword(ILA_STATUS); + if (status & 0x02) + done = true; + else + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + if (!done) { + std::cerr << "ERROR: ILA capture timeout" << std::endl; + return 1; + } + + uint32_t sample_count = ctrl.read_dword(ILA_SAMPLE_ADDR); + std::cout << " ILA captured " << sample_count << " samples" << std::endl; + + // Read back and verify each response + uint32_t matched = 0; + uint32_t check_count = std::min(sample_count, args.num_shots); + + for (uint32_t i = 0; i < check_count; i++) { + // Read response from ILA banks (the first bytes are RPCResponse header) + std::vector response_bytes(64, 0); + for (int bank = 0; bank < std::min(ILA_NUM_BANKS - 1, 16); bank++) { + uint32_t addr = ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (i * 4); + uint32_t val = ctrl.read_dword(addr); + size_t byte_off = bank * 4; + if (byte_off + 4 <= response_bytes.size()) + memcpy(response_bytes.data() + byte_off, &val, 4); + } + + // Check control signals (bank 16): tvalid must be set + uint32_t ctrl_addr = + ILA_DATA_BASE + ((ILA_NUM_BANKS - 1) << (ILA_W_ADDR + 2)) + (i * 4); + uint32_t ctrl_val = ctrl.read_dword(ctrl_addr); + bool tvalid = (ctrl_val & 0x01) != 0; + + if (!tvalid) { + std::cerr << " Shot " << i << ": tvalid=0 (no response)" << std::endl; + continue; + } + + // Parse RPCResponse + auto *resp = reinterpret_cast( + response_bytes.data()); + + if (resp->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) { + std::cerr << " Shot " << i << ": bad magic 0x" << std::hex + << resp->magic << std::dec << std::endl; + continue; + } + + if (resp->status != 0) { + std::cerr << " Shot " << i << ": error status " << resp->status + << std::endl; + continue; + } + + // Verify increment: each byte should be (shot_index + byte_index + 1) + const uint8_t *result_data = + response_bytes.data() + sizeof(cudaq::realtime::RPCResponse); + bool ok = true; + uint32_t check_len = std::min(resp->result_len, args.payload_size); + for (uint32_t j = 0; j < check_len && ok; j++) { + uint8_t expected = static_cast(((i + j) & 0xFF) + 1); + if (result_data[j] != expected) { + std::cerr << " Shot " << i << " byte " << j << ": expected " + << (int)expected << " got " << (int)result_data[j] + << std::endl; + ok = false; + } + } + if (ok) + matched++; + } + + std::cout << "\n=== Verification Results ===" << std::endl; + std::cout << " RPC responses matched: " << matched << " / " << check_count + << std::endl; + + if (matched == check_count) { + std::cout << "\n*** ALL RESPONSES VERIFIED ***" << std::endl; + return 0; + } else { + std::cout << "\n*** VERIFICATION FAILED ***" << std::endl; + return 1; + } + } else { + std::cout << "\n[4/4] Verification skipped (--no-verify)" << std::endl; + // Wait a bit for playback to complete + std::this_thread::sleep_for(std::chrono::seconds(10)); + std::cout << "\n*** PLAYBACK COMPLETE ***" << std::endl; + return 0; + } +} diff --git a/realtime/unittests/utils/hololink_test.sh b/realtime/unittests/utils/hololink_test.sh new file mode 100755 index 00000000..bafdb29b --- /dev/null +++ b/realtime/unittests/utils/hololink_test.sh @@ -0,0 +1,408 @@ +#!/bin/bash +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +# +# hololink_test.sh +# +# Orchestration script for end-to-end Hololink RPC dispatch testing. +# Tests libcudaq-realtime dispatch kernel over Hololink RDMA with a +# simple increment RPC handler (no QEC or decoder dependency). +# +# Modes: +# Default (FPGA): bridge + playback (requires real FPGA) +# --emulate: emulator + bridge + playback (no FPGA needed) +# +# Actions (can be combined): +# --build Build all required tools +# --setup-network Configure ConnectX interfaces +# (run is implicit unless only --build / --setup-network are given) +# +# Examples: +# # Full emulated test: build, configure network, run +# ./hololink_test.sh --emulate --build --setup-network +# +# # Just run with real FPGA (tools already built, network already set up) +# ./hololink_test.sh --fpga-ip 192.168.0.2 +# +# # Build only +# ./hololink_test.sh --build --no-run +# +set -euo pipefail + +# ============================================================================ +# Defaults +# ============================================================================ + +EMULATE=false +DO_BUILD=false +DO_SETUP_NETWORK=false +DO_RUN=true +VERIFY=true + +# Directory defaults +HOLOLINK_DIR="/workspaces/cuda-qx/hololink" +CUDA_QUANTUM_DIR="/workspaces/cuda-quantum" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Network defaults +IB_DEVICE="" # auto-detect +BRIDGE_IP="10.0.0.1" +EMULATOR_IP="10.0.0.2" +FPGA_IP="192.168.0.2" +MTU=4096 + +# Run defaults +GPU_ID=0 +TIMEOUT=60 +NUM_SHOTS=100 +PAYLOAD_SIZE=8 +PAGE_SIZE=384 +NUM_PAGES=64 +CONTROL_PORT=8193 + +# Build parallelism +JOBS=$(nproc 2>/dev/null || echo 8) + +# ============================================================================ +# Argument Parsing +# ============================================================================ + +print_usage() { + cat <<'EOF' +Usage: hololink_test.sh [options] + +Modes: + --emulate Use FPGA emulator (3-tool mode, no FPGA needed) + Default: FPGA mode (2-tool, requires real FPGA) + +Actions: + --build Build all required tools before running + --setup-network Configure ConnectX network interfaces + --no-run Skip running the test (useful with --build) + +Build options: + --hololink-dir DIR Hololink source directory + (default: /workspaces/cuda-qx/hololink) + --cuda-quantum-dir DIR cuda-quantum source directory + (default: /workspaces/cuda-quantum) + --jobs N Parallel build jobs (default: nproc) + +Network options: + --device DEV ConnectX IB device name (default: auto-detect) + --bridge-ip ADDR Bridge tool IP (default: 10.0.0.1) + --emulator-ip ADDR Emulator IP (default: 10.0.0.2) + --fpga-ip ADDR FPGA IP for non-emulate mode (default: 192.168.0.2) + --mtu N MTU size (default: 4096) + +Run options: + --gpu N GPU device ID (default: 0) + --timeout N Timeout in seconds (default: 60) + --no-verify Skip ILA correction verification + --num-shots N Number of RPC messages (default: 100) + --payload-size N Bytes per RPC payload (default: 8) + --page-size N Ring buffer slot size in bytes (default: 384) + --num-pages N Number of ring buffer slots (default: 64) + --control-port N UDP control port for emulator (default: 8193) + + --help, -h Show this help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --emulate) EMULATE=true ;; + --build) DO_BUILD=true ;; + --setup-network) DO_SETUP_NETWORK=true ;; + --no-run) DO_RUN=false ;; + --no-verify) VERIFY=false ;; + --hololink-dir) HOLOLINK_DIR="$2"; shift ;; + --cuda-quantum-dir) CUDA_QUANTUM_DIR="$2"; shift ;; + --jobs) JOBS="$2"; shift ;; + --device) IB_DEVICE="$2"; shift ;; + --bridge-ip) BRIDGE_IP="$2"; shift ;; + --emulator-ip) EMULATOR_IP="$2"; shift ;; + --fpga-ip) FPGA_IP="$2"; shift ;; + --mtu) MTU="$2"; shift ;; + --gpu) GPU_ID="$2"; shift ;; + --timeout) TIMEOUT="$2"; shift ;; + --num-shots) NUM_SHOTS="$2"; shift ;; + --payload-size) PAYLOAD_SIZE="$2"; shift ;; + --page-size) PAGE_SIZE="$2"; shift ;; + --num-pages) NUM_PAGES="$2"; shift ;; + --control-port) CONTROL_PORT="$2"; shift ;; + --help|-h) print_usage; exit 0 ;; + *) + echo "ERROR: Unknown option: $1" >&2 + print_usage >&2 + exit 1 + ;; + esac + shift +done + +# ============================================================================ +# Auto-detect IB device +# ============================================================================ + +detect_ib_device() { + if [[ -n "$IB_DEVICE" ]]; then + echo "$IB_DEVICE" + return + fi + local dev + dev=$(ibstat -l 2>/dev/null | head -1 || true) + if [[ -z "$dev" ]]; then + dev=$(ls /sys/class/infiniband/ 2>/dev/null | head -1 || true) + fi + if [[ -z "$dev" ]]; then + echo "ERROR: Could not auto-detect IB device. Use --device." >&2 + exit 1 + fi + echo "$dev" +} + +# ============================================================================ +# Network interface name from IB device +# ============================================================================ + +get_netdev() { + local ib_dev=$1 + local netdev + netdev=$(ls "/sys/class/infiniband/$ib_dev/device/net/" 2>/dev/null | head -1 || true) + echo "$netdev" +} + +# ============================================================================ +# Build +# ============================================================================ + +do_build() { + echo "=== Building tools ===" + + local realtime_dir="$CUDA_QUANTUM_DIR/realtime" + local realtime_build="$realtime_dir/build" + local hololink_build="$HOLOLINK_DIR/build" + + # Detect target arch + local arch + arch=$(uname -m) + local target_arch="amd64" + if [[ "$arch" == "aarch64" ]]; then + target_arch="arm64" + fi + + # Build hololink (only the two libraries we need) + echo "--- Building hololink ($target_arch) ---" + cmake -G Ninja -S "$HOLOLINK_DIR" -B "$hololink_build" \ + -DCMAKE_BUILD_TYPE=Release \ + -DTARGETARCH="$target_arch" \ + -DHOLOLINK_BUILD_ONLY_NATIVE=OFF \ + -DHOLOLINK_BUILD_PYTHON=OFF \ + -DHOLOLINK_BUILD_TESTS=OFF \ + -DHOLOLINK_BUILD_TOOLS=OFF \ + -DHOLOLINK_BUILD_EXAMPLES=OFF \ + -DHOLOLINK_BUILD_EMULATOR=OFF + cmake --build "$hololink_build" -j"$JOBS" \ + --target gpu_roce_transceiver hololink_core + + # Build cuda-quantum/realtime with hololink tools enabled + echo "--- Building cuda-quantum/realtime ---" + cmake -G Ninja -S "$realtime_dir" -B "$realtime_build" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS=ON \ + -DHOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR="$HOLOLINK_DIR" \ + -DHOLOSCAN_SENSOR_BRIDGE_BUILD_DIR="$hololink_build" + cmake --build "$realtime_build" -j"$JOBS" \ + --target hololink_bridge hololink_fpga_emulator hololink_fpga_playback + + echo "=== Build complete ===" +} + +# ============================================================================ +# Network setup +# ============================================================================ + +do_setup_network() { + IB_DEVICE=$(detect_ib_device) + local netdev + netdev=$(get_netdev "$IB_DEVICE") + + echo "=== Setting up network ===" + echo " IB device: $IB_DEVICE" + echo " Net device: $netdev" + + if [[ -z "$netdev" ]]; then + echo "ERROR: No network device found for $IB_DEVICE" >&2 + exit 1 + fi + + sudo ip link set "$netdev" up mtu "$MTU" || true + sudo ip addr add "$BRIDGE_IP/24" dev "$netdev" 2>/dev/null || true + + if $EMULATE; then + sudo ip addr add "$EMULATOR_IP/24" dev "$netdev" 2>/dev/null || true + # Add static ARP entries + sudo ip neigh replace "$BRIDGE_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true + sudo ip neigh replace "$EMULATOR_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true + fi + + echo "=== Network setup complete ===" +} + +# ============================================================================ +# Run +# ============================================================================ + +cleanup_pids() { + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + fi + done +} + +do_run() { + IB_DEVICE=$(detect_ib_device) + local build_dir="$CUDA_QUANTUM_DIR/realtime/build" + local utils_dir="$build_dir/unittests/utils" + + local bridge_bin="$utils_dir/hololink_bridge" + local emulator_bin="$utils_dir/hololink_fpga_emulator" + local playback_bin="$utils_dir/hololink_fpga_playback" + + # Verify binaries exist + for bin in "$bridge_bin"; do + if [[ ! -x "$bin" ]]; then + echo "ERROR: $bin not found. Run with --build first." >&2 + exit 1 + fi + done + + PIDS=() + trap cleanup_pids EXIT + + local FPGA_QP + local FPGA_TARGET_IP + + if $EMULATE; then + echo "=== Emulated mode ===" + + # Start emulator + echo "--- Starting emulator ---" + "$emulator_bin" \ + --device="$IB_DEVICE" \ + --port="$CONTROL_PORT" \ + --bridge-ip="$BRIDGE_IP" \ + --page-size="$PAGE_SIZE" \ + 2>&1 | tee /tmp/emulator.log & + PIDS+=($!) + + # Wait for emulator to print QP number + sleep 2 + FPGA_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/emulator.log | head -1) + if [[ -z "$FPGA_QP" ]]; then + echo "ERROR: Could not parse emulator QP from log" >&2 + exit 1 + fi + FPGA_QP="0x$FPGA_QP" + FPGA_TARGET_IP="$EMULATOR_IP" + + echo " Emulator QP: $FPGA_QP" + else + echo "=== FPGA mode ===" + FPGA_QP="0x2" + FPGA_TARGET_IP="$FPGA_IP" + fi + + # Start bridge + echo "--- Starting bridge ---" + "$bridge_bin" \ + --device="$IB_DEVICE" \ + --peer-ip="$FPGA_TARGET_IP" \ + --remote-qp="$FPGA_QP" \ + --gpu="$GPU_ID" \ + --timeout="$TIMEOUT" \ + --page-size="$PAGE_SIZE" \ + --num-pages="$NUM_PAGES" \ + 2>&1 | tee /tmp/bridge.log & + PIDS+=($!) + + # Wait for bridge to print QP info + sleep 3 + local BRIDGE_QP BRIDGE_RKEY BRIDGE_BUFFER + BRIDGE_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1) + BRIDGE_RKEY=$(grep -oP 'RKey: \K[0-9]+' /tmp/bridge.log | tail -1) + BRIDGE_BUFFER=$(grep -oP 'Buffer Addr: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1) + + if [[ -z "$BRIDGE_QP" || -z "$BRIDGE_RKEY" || -z "$BRIDGE_BUFFER" ]]; then + echo "ERROR: Could not parse bridge QP info from log" >&2 + echo " QP=$BRIDGE_QP RKEY=$BRIDGE_RKEY BUFFER=$BRIDGE_BUFFER" >&2 + exit 1 + fi + + echo " Bridge QP: 0x$BRIDGE_QP" + echo " Bridge RKey: $BRIDGE_RKEY" + echo " Bridge Buffer: 0x$BRIDGE_BUFFER" + + # Start playback + echo "--- Starting playback ---" + local verify_flag="" + if ! $VERIFY; then + verify_flag="--no-verify" + fi + + "$playback_bin" \ + --control-ip="$FPGA_TARGET_IP" \ + --control-port="$CONTROL_PORT" \ + --bridge-qp="0x$BRIDGE_QP" \ + --bridge-rkey="$BRIDGE_RKEY" \ + --bridge-buffer="0x$BRIDGE_BUFFER" \ + --page-size="$PAGE_SIZE" \ + --num-pages="$NUM_PAGES" \ + --num-shots="$NUM_SHOTS" \ + --payload-size="$PAYLOAD_SIZE" \ + --bridge-ip="$BRIDGE_IP" \ + $verify_flag + PLAYBACK_EXIT=$? + + # Wait for bridge to finish + sleep 2 + + # Cleanup + cleanup_pids + + echo "" + if [[ $PLAYBACK_EXIT -eq 0 ]]; then + echo "*** TEST PASSED ***" + else + echo "*** TEST FAILED ***" + fi + exit $PLAYBACK_EXIT +} + +# ============================================================================ +# Main +# ============================================================================ + +echo "=== Hololink Generic RPC Test ===" +echo "Mode: $(if $EMULATE; then echo "emulated"; else echo "FPGA"; fi)" + +if $DO_BUILD; then + do_build +fi + +if $DO_SETUP_NETWORK; then + do_setup_network +fi + +if $DO_RUN; then + do_run +fi + +echo "Done." diff --git a/realtime/unittests/utils/hololink_wrapper.cpp b/realtime/unittests/utils/hololink_wrapper.cpp new file mode 100644 index 00000000..fb83aedb --- /dev/null +++ b/realtime/unittests/utils/hololink_wrapper.cpp @@ -0,0 +1,216 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_wrapper.cpp +/// @brief C wrapper implementation for Hololink GpuRoceTransceiver. +/// +/// This file is compiled by g++ (not nvcc) to isolate Hololink's fmt +/// dependency from CUDA translation units. + +#include "hololink_wrapper.h" + +// Include Hololink headers here (with Holoscan's fmt) +#include + +#include + +using namespace hololink::operators; + +//============================================================================== +// Internal implementation struct +//============================================================================== + +struct HololinkTransceiverImpl { + std::unique_ptr transceiver; + size_t page_size; + unsigned num_pages; +}; + +//============================================================================== +// Lifecycle +//============================================================================== + +hololink_transceiver_t +hololink_create_transceiver(const char *device_name, int ib_port, + size_t frame_size, size_t page_size, + unsigned num_pages, const char *peer_ip, + int forward, int rx_only, int tx_only) { + try { + auto *impl = new HololinkTransceiverImpl(); + impl->page_size = page_size; + impl->num_pages = num_pages; + impl->transceiver = std::make_unique( + device_name, static_cast(ib_port), frame_size, page_size, + num_pages, peer_ip, forward != 0, rx_only != 0, tx_only != 0); + return reinterpret_cast(impl); + } catch (const std::exception &e) { + std::cerr << "ERROR: Failed to create GpuRoceTransceiver: " << e.what() + << std::endl; + return nullptr; + } catch (...) { + std::cerr << "ERROR: Failed to create GpuRoceTransceiver: unknown exception" + << std::endl; + return nullptr; + } +} + +void hololink_destroy_transceiver(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + delete impl; + } +} + +int hololink_start(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->start() ? 1 : 0; + } + return 0; +} + +void hololink_close(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + impl->transceiver->close(); + } +} + +void hololink_blocking_monitor(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + impl->transceiver->blocking_monitor(); + } +} + +//============================================================================== +// QP information +//============================================================================== + +uint32_t hololink_get_qp_number(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_qp_number(); + } + return 0; +} + +uint32_t hololink_get_rkey(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rkey(); + } + return 0; +} + +uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->external_frame_memory(); + } + return 0; +} + +int hololink_get_gid(hololink_transceiver_t handle, uint8_t *gid_out) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_gid(gid_out); + } + return 0; +} + +//============================================================================== +// Deferred QP connection +//============================================================================== + +int hololink_reconnect_qp(hololink_transceiver_t handle, + const uint8_t *remote_gid, uint32_t remote_qpn) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->reconnect_qp(remote_gid, remote_qpn) ? 1 : 0; + } + return 0; +} + +//============================================================================== +// Ring buffer access +//============================================================================== + +void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_data_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_flag_addr(); + } + return nullptr; +} + +void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_tx_ring_data_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_tx_ring_flag_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_tx_ring_flag_host_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_tx_ring_flag_host_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_rx_ring_flag_host_addr(hololink_transceiver_t handle) { + // Note: GpuRoceTransceiver does not currently expose host RX flag addr. + (void)handle; + return nullptr; +} + +bool hololink_query_kernel_occupancy(void) { + int prep = 0, rx = 0, tx = 0; + cudaError_t err = GpuRoceTransceiverQueryOccupancy(&prep, &rx, &tx); + if (err != cudaSuccess) { + fprintf(stderr, "ERROR: Hololink kernel occupancy query failed: %s\n", + cudaGetErrorString(err)); + return false; + } + printf(" Hololink kernel occupancy: prepare=%d rx=%d tx=%d\n", prep, rx, tx); + return true; +} + +size_t hololink_get_page_size(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->page_size; + } + return 0; +} + +unsigned hololink_get_num_pages(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->num_pages; + } + return 0; +} diff --git a/realtime/unittests/utils/hololink_wrapper.h b/realtime/unittests/utils/hololink_wrapper.h new file mode 100644 index 00000000..ebc2ceef --- /dev/null +++ b/realtime/unittests/utils/hololink_wrapper.h @@ -0,0 +1,142 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_wrapper.h +/// @brief C interface to Hololink GpuRoceTransceiver. +/// +/// This wrapper avoids `fmt` library conflicts between Hololink (which uses +/// Holoscan's `fmt`) and CUDA files compiled by nvcc. + +#ifndef HOLOLINK_WRAPPER_H +#define HOLOLINK_WRAPPER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handle for GpuRoceTransceiver +typedef void *hololink_transceiver_t; + +//============================================================================== +// Transceiver lifecycle +//============================================================================== + +/** + * Create a new Hololink transceiver. + * + * @param device_name IB device name (e.g., "rocep1s0f0") + * @param ib_port IB port number + * @param frame_size Size of each frame (cu_frame_size) + * @param page_size Size of each page/slot (cu_page_size) + * @param num_pages Number of pages (ring buffer slots) + * @param peer_ip Peer IP address (use "0.0.0.0" for deferred connection) + * @param forward 1 to run forward (echo) kernel + * @param rx_only 1 to run RX-only kernel + * @param tx_only 1 to run TX-only kernel + * @return Handle to transceiver, or NULL on failure + */ +hololink_transceiver_t +hololink_create_transceiver(const char *device_name, int ib_port, + size_t frame_size, size_t page_size, + unsigned num_pages, const char *peer_ip, + int forward, int rx_only, int tx_only); + +/** + * Destroy a transceiver and free resources. + */ +void hololink_destroy_transceiver(hololink_transceiver_t handle); + +/** + * Start the transceiver (initializes DOCA resources, creates QP/CQ). + * @return 1 on success, 0 on failure + */ +int hololink_start(hololink_transceiver_t handle); + +/** + * Close the transceiver (signals shutdown). + */ +void hololink_close(hololink_transceiver_t handle); + +/** + * Run the blocking monitor (launches GPU kernels and waits). + * This function blocks until close() is called. + */ +void hololink_blocking_monitor(hololink_transceiver_t handle); + +//============================================================================== +// QP information (for RDMA setup) +//============================================================================== + +uint32_t hololink_get_qp_number(hololink_transceiver_t handle); +uint32_t hololink_get_rkey(hololink_transceiver_t handle); +uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle); + +/** + * Get the local GID for this transceiver. + * @param handle Transceiver handle + * @param gid_out Buffer to receive 16-byte GID + * @return 1 on success, 0 on failure + */ +int hololink_get_gid(hololink_transceiver_t handle, uint8_t *gid_out); + +//============================================================================== +// Deferred QP connection +//============================================================================== + +/** + * Connect the QP to a remote peer (for deferred connection mode). + * Call this after start() when peer_ip was "0.0.0.0". + * @param handle Transceiver handle + * @param remote_gid 16-byte remote GID + * @param remote_qpn Remote QP number + * @return 1 on success, 0 on failure + */ +int hololink_reconnect_qp(hololink_transceiver_t handle, + const uint8_t *remote_gid, uint32_t remote_qpn); + +//============================================================================== +// Ring buffer access +//============================================================================== + +/** Get device pointer to RX ring data buffer. */ +void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle); + +/** Get device pointer to RX ring flag array. */ +uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle); + +/** Get device pointer to TX ring data buffer. */ +void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle); + +/** Get device pointer to TX ring flag array. */ +uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle); + +/** Get host-accessible pointer to TX ring flag array. */ +uint64_t *hololink_get_tx_ring_flag_host_addr(hololink_transceiver_t handle); + +/** Get host-accessible pointer to RX ring flag array. */ +uint64_t *hololink_get_rx_ring_flag_host_addr(hololink_transceiver_t handle); + +/** Force eager CUDA module loading by querying kernel occupancy. + * Call before launching any persistent kernels. + * Returns true on success (all kernels valid). */ +bool hololink_query_kernel_occupancy(void); + +/** Get the page (slot) size configured for this transceiver. */ +size_t hololink_get_page_size(hololink_transceiver_t handle); + +/** Get the number of pages (slots) configured for this transceiver. */ +unsigned hololink_get_num_pages(hololink_transceiver_t handle); + +#ifdef __cplusplus +} +#endif + +#endif // HOLOLINK_WRAPPER_H diff --git a/realtime/unittests/utils/init_rpc_increment_function_table.cu b/realtime/unittests/utils/init_rpc_increment_function_table.cu new file mode 100644 index 00000000..5365bcb4 --- /dev/null +++ b/realtime/unittests/utils/init_rpc_increment_function_table.cu @@ -0,0 +1,92 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file init_rpc_increment_function_table.cu +/// @brief Device-side increment RPC handler and function table initialisation. +/// +/// This file is compiled by nvcc so that the __device__ function pointer +/// can be taken. The host-callable setup_rpc_increment_function_table() +/// wrapper is extern "C" so that the bridge .cpp (compiled by g++) can +/// call it without needing CUDA kernel launch syntax. + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" + +#include +#include + +namespace { + +//============================================================================== +// Increment RPC Handler +//============================================================================== + +/// @brief Simple RPC handler that increments each byte of the payload by 1. +/// +/// Matches the DeviceRPCFunction signature. Reads from input, writes to +/// output (no in-place overlap). +__device__ int rpc_increment_handler(const void *input, void *output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len) { + const std::uint8_t *in_data = static_cast(input); + std::uint8_t *out_data = static_cast(output); + std::uint32_t len = (arg_len < max_result_len) ? arg_len : max_result_len; + for (std::uint32_t i = 0; i < len; ++i) { + out_data[i] = static_cast(in_data[i] + 1); + } + *result_len = len; + return 0; +} + +constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_increment"); + +/// @brief Kernel to populate a cudaq_function_entry_t with the increment +/// handler. +__global__ void init_function_table_kernel(cudaq_function_entry_t *entries) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.device_fn_ptr = + reinterpret_cast(&rpc_increment_handler); + entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + + // Schema: 1 array argument (uint8), 1 array result (uint8) + entries[0].schema.num_args = 1; + entries[0].schema.num_results = 1; + entries[0].schema.reserved = 0; + entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.args[0].reserved[0] = 0; + entries[0].schema.args[0].reserved[1] = 0; + entries[0].schema.args[0].reserved[2] = 0; + entries[0].schema.args[0].size_bytes = 0; + entries[0].schema.args[0].num_elements = 0; + entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.results[0].reserved[0] = 0; + entries[0].schema.results[0].reserved[1] = 0; + entries[0].schema.results[0].reserved[2] = 0; + entries[0].schema.results[0].size_bytes = 0; + entries[0].schema.results[0].num_elements = 0; + } +} + +} // anonymous namespace + +//============================================================================== +// Host-Callable Wrapper +//============================================================================== + +extern "C" void +setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries) { + init_function_table_kernel<<<1, 1>>>(d_entries); + cudaDeviceSynchronize(); +} From 84bbda27b889470ef4770a20accde53a10e661dc Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 26 Feb 2026 18:13:20 +0000 Subject: [PATCH 17/40] Fix streaming pipeline: out-of-order consumer, race fix, and timing instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the duplicate QEC-local host_dispatcher.{h,cpp} in favor of the canonical realtime library versions, eliminating link ambiguity. Fix three correctness/performance bugs in the streaming pipeline: - Consumer was strict in-order, causing 327 µs head-of-line blocking when parallel workers complete out of order. Changed to scan all active slots and harvest whichever are ready. - Dispatcher set tx_flags=READY immediately on graph launch (when tx_data_host was non-null), causing phantom completions. Set tx_data_host/dev to nullptr so dispatcher uses IN_FLIGHT sentinel. - Race between consumer clearing tx_flags and resetting slot_request: producer could see slot available and write slot_request before the consumer's slot_request=-1, permanently orphaning the slot. Fixed by resetting slot_request before clearing tx_flags with a store fence (__sync_synchronize) for ARM memory ordering. Replace broken timing breakdown (dispatch_ts was always 0, making the entire report show "Other/Misc Wait") with a 3-stage per-request breakdown: [A] submit→worker poll, [B] worker task, [C] consumer poll lag, with p50/p99 percentiles. Also: reduce NUM_SLOTS 64→16 to cut queuing delay, remove unused queue_depth from PipelineConfig, add DISABLE_PYMATCHING conditional compilation, add stuck-request diagnostics, and remove batch mode / watchdog / dead code. Results (d7, 8 workers, open-loop): 62.5K req/s, 230 µs mean latency, 500K/500K completed, 0 drops. Signed-off-by: Scott Thornton --- .../cudaq/qec/realtime/host_dispatcher.h | 65 ----- libs/qec/lib/realtime/host_dispatcher.cpp | 93 ------- .../test_realtime_predecoder_w_pymatching.cpp | 226 +++++++++++------- 3 files changed, 138 insertions(+), 246 deletions(-) delete mode 100644 libs/qec/include/cudaq/qec/realtime/host_dispatcher.h delete mode 100644 libs/qec/lib/realtime/host_dispatcher.cpp diff --git a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h b/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h deleted file mode 100644 index 82412b75..00000000 --- a/libs/qec/include/cudaq/qec/realtime/host_dispatcher.h +++ /dev/null @@ -1,65 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#pragma once - -#include -#include -#include -#include -#include - -#ifndef QEC_CPU_RELAX -#if defined(__x86_64__) -#include -#define QEC_CPU_RELAX() _mm_pause() -#elif defined(__aarch64__) -#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") -#else -#define QEC_CPU_RELAX() do { } while (0) -#endif -#endif - -namespace cudaq::qec { - -using atomic_uint64_sys = cuda::std::atomic; -using atomic_int_sys = cuda::std::atomic; - -struct HostDispatchWorker { - cudaGraphExec_t graph_exec; - cudaStream_t stream; -}; - -struct HostDispatcherConfig { - atomic_uint64_sys* rx_flags; - atomic_uint64_sys* tx_flags; - uint8_t* rx_data_host; - uint8_t* rx_data_dev; - void** h_mailbox_bank; - size_t num_slots; - size_t slot_size; - std::vector workers; - atomic_int_sys* shutdown_flag; - uint64_t* stats_counter; - /// Optional: atomic counter incremented on each dispatch (for progress diagnostics). - atomic_uint64_sys* live_dispatched = nullptr; - - /// Dynamic worker pool (design: Host-Side Spin-Polling Dispatcher) - atomic_uint64_sys* idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id - int* inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags routing - - // Optional arrays for timestamping - uint64_t* debug_dispatch_ts = nullptr; -}; - -/// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag -/// becomes non-zero. Call from a dedicated thread. -/// Uses dynamic worker pool: allocates via idle_mask, tags with inflight_slot_tags. -void host_dispatcher_loop(const HostDispatcherConfig& config); - -} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/host_dispatcher.cpp b/libs/qec/lib/realtime/host_dispatcher.cpp deleted file mode 100644 index 65fb72a6..00000000 --- a/libs/qec/lib/realtime/host_dispatcher.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#include "cudaq/qec/realtime/host_dispatcher.h" - -#include -#include - -namespace cudaq::qec { - -void host_dispatcher_loop(const HostDispatcherConfig& config) { - size_t current_slot = 0; - const size_t num_slots = config.num_slots; - const int num_workers = static_cast(config.workers.size()); - uint64_t packets_dispatched = 0; - - nvtxRangePushA("Dispatcher Loop"); - - while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { - uint64_t rx_value = config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); - - if (rx_value != 0) { - nvtxRangePushA("Process Slot"); - - uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); - if (mask == 0) { - nvtxRangePushA("Wait Worker"); - QEC_CPU_RELAX(); - nvtxRangePop(); // Wait Worker - nvtxRangePop(); // Process Slot - continue; - } - - int worker_id = __builtin_ffsll(static_cast(mask)) - 1; - config.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); - - config.inflight_slot_tags[worker_id] = static_cast(current_slot); - - void* data_host = reinterpret_cast(rx_value); - ptrdiff_t offset = static_cast(data_host) - config.rx_data_host; - void* data_dev = static_cast(config.rx_data_dev + offset); - - config.h_mailbox_bank[worker_id] = data_dev; - __sync_synchronize(); - - if (config.debug_dispatch_ts) { - config.debug_dispatch_ts[current_slot] = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); - } - - nvtxRangePushA("Launch Graph"); - cudaError_t err = cudaGraphLaunch(config.workers[worker_id].graph_exec, - config.workers[worker_id].stream); - nvtxRangePop(); // Launch Graph - - if (err != cudaSuccess) { - uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; - config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); - config.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); - } else { - // Mark slot IN_FLIGHT so producer doesn't overwrite while GPU/workers use it - config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); - } - - config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); - packets_dispatched++; - if (config.live_dispatched) - config.live_dispatched->fetch_add(1, cuda::std::memory_order_relaxed); - current_slot = (current_slot + 1) % num_slots; - - nvtxRangePop(); // Process Slot - } else { - QEC_CPU_RELAX(); - } - } - - nvtxRangePop(); // Dispatcher Loop - - for (const auto& w : config.workers) { - cudaStreamSynchronize(w.stream); - } - - if (config.stats_counter) { - *config.stats_counter = packets_dispatched; - } -} - -} // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 7ae57299..e15a6f66 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -110,7 +110,7 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration // ============================================================================= - constexpr size_t NUM_SLOTS = 64; + constexpr size_t NUM_SLOTS = 16; struct PipelineConfig { std::string label; @@ -121,7 +121,6 @@ namespace realtime_ns = cudaq::realtime; std::string onnx_filename; size_t slot_size; // must fit RPC header (CUDAQ_RPC_HEADER_SIZE) + input payload int num_predecoders; - int queue_depth; int num_workers; int input_elements() const { return meas_qubits * num_rounds; } @@ -148,8 +147,7 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/336, "model1_d7_r7_unified_Z_batch1.onnx", /*slot_size=*/4096, -/*num_predecoders=*/8, - /*queue_depth=*/16, + /*num_predecoders=*/8, /*num_workers=*/8 }; } @@ -163,8 +161,7 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/2184, "model1_d13_r13_unified_Z_batch1.onnx", /*slot_size=*/16384, -/*num_predecoders=*/8, - /*queue_depth=*/16, + /*num_predecoders=*/8, /*num_workers=*/8 }; } @@ -178,8 +175,7 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/9240, "model1_d21_r21_unified_X_batch1.onnx", /*slot_size=*/65536, -/*num_predecoders=*/8, - /*queue_depth=*/16, + /*num_predecoders=*/8, /*num_workers=*/8 }; } @@ -194,7 +190,6 @@ namespace realtime_ns = cudaq::realtime; "model1_d31_r31_unified_Z_batch1.onnx", /*slot_size=*/262144, /*num_predecoders=*/8, - /*queue_depth=*/16, /*num_workers=*/8 }; } @@ -230,7 +225,8 @@ namespace realtime_ns = cudaq::realtime; realtime_ns::atomic_uint64_sys* tx_flags = nullptr; realtime_ns::atomic_uint64_sys* idle_mask = nullptr; int* inflight_slot_tags = nullptr; - uint64_t* debug_poll_ts = nullptr; + uint64_t* debug_poll_ts = nullptr; // when worker poll_next_job succeeded (ns epoch) + uint64_t* debug_worker_done_ts = nullptr; // when worker set tx_flags (ns epoch) }; // ============================================================================= @@ -255,13 +251,14 @@ namespace realtime_ns = cudaq::realtime; worker_start.time_since_epoch()).count(); } - const int32_t* residual = static_cast(job.inference_data); - auto* my_decoder = ctx->acquire_decoder(); - int total_corrections = 0; bool all_converged = true; auto decode_start = hrclock::now(); +#if !defined(DISABLE_PYMATCHING) + const int32_t* residual = static_cast(job.inference_data); + auto* my_decoder = ctx->acquire_decoder(); + nvtxRangePushA("PyMatching Decode"); cudaqx::tensor syndrome_tensor({(size_t)ctx->z_stabilizers}); @@ -280,6 +277,7 @@ namespace realtime_ns = cudaq::realtime; if (v > 0.5) total_corrections++; } nvtxRangePop(); // PyMatching Decode +#endif auto decode_end = hrclock::now(); DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; @@ -302,6 +300,11 @@ namespace realtime_ns = cudaq::realtime; g_sys_ctx.tx_flags_host[slot_idx].store(rx_value, cuda::std::memory_order_release); } + if (pool_ctx && pool_ctx->debug_worker_done_ts) { + pool_ctx->debug_worker_done_ts[origin_slot] = std::chrono::duration_cast( + hrclock::now().time_since_epoch()).count(); + } + predecoder->release_job(job.slot_idx); if (pool_ctx && pool_ctx->idle_mask) { @@ -368,6 +371,7 @@ namespace realtime_ns = cudaq::realtime; std::vector completed(max_requests, false); std::vector dispatch_ts(max_requests, 0); std::vector poll_ts(max_requests, 0); + std::vector worker_done_ts(max_requests, 0); std::vector slot_request(NUM_SLOTS, -1); std::vector debug_dispatch_ts_arr(NUM_SLOTS, 0); @@ -397,9 +401,9 @@ namespace realtime_ns = cudaq::realtime; disp_cfg.tx_flags = tx_flags; disp_cfg.rx_data_host = rx_data_host; disp_cfg.rx_data_dev = rx_data_dev; - disp_cfg.tx_data_host = rx_data_host; - disp_cfg.tx_data_dev = rx_data_dev; - disp_cfg.tx_stride_sz = config.slot_size; + disp_cfg.tx_data_host = nullptr; + disp_cfg.tx_data_dev = nullptr; + disp_cfg.tx_stride_sz = config.slot_size; disp_cfg.h_mailbox_bank = h_mailbox_bank; disp_cfg.num_slots = NUM_SLOTS; disp_cfg.slot_size = config.slot_size; @@ -452,7 +456,7 @@ namespace realtime_ns = cudaq::realtime; << std::flush; // Progress reporter (debug only; set to true to print submitted/completed every second) - constexpr bool kEnableProgressReporter = false; + constexpr bool kEnableProgressReporter = true; std::atomic progress_done{false}; std::thread progress_reporter; if (kEnableProgressReporter) { @@ -523,10 +527,8 @@ namespace realtime_ns = cudaq::realtime; }); pin_thread_to_core(producer, 3); - // --- Consumer thread (harvests completions sequentially) --- + // --- Consumer thread (harvests completions out-of-order) --- std::thread consumer([&]() { - int next_harvest = 0; - while (true) { if (consumer_stop.load(std::memory_order_acquire)) break; @@ -537,40 +539,40 @@ namespace realtime_ns = cudaq::realtime; if (pdone && ncomp >= nsub) break; - if (next_harvest >= nsub) { - QEC_CPU_RELAX(); - continue; - } - - int slot = next_harvest % (int)NUM_SLOTS; - int cuda_error = 0; - cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag( - &rb, static_cast(slot), &cuda_error); - - if (status == CUDAQ_TX_READY) { - int rid = slot_request[slot]; - if (rid >= 0) { - complete_ts[rid] = hrclock::now(); - dispatch_ts[rid] = 0; - poll_ts[rid] = pool_ctx->debug_poll_ts ? pool_ctx->debug_poll_ts[slot] : 0; - completed[rid] = true; + bool found_any = false; + for (uint32_t s = 0; s < NUM_SLOTS; ++s) { + if (slot_request[s] < 0) continue; + + int cuda_error = 0; + cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag( + &rb, s, &cuda_error); + + if (status == CUDAQ_TX_READY) { + int rid = slot_request[s]; + if (rid >= 0) { + complete_ts[rid] = hrclock::now(); + poll_ts[rid] = pool_ctx->debug_poll_ts ? pool_ctx->debug_poll_ts[s] : 0; + worker_done_ts[rid] = pool_ctx->debug_worker_done_ts ? pool_ctx->debug_worker_done_ts[s] : 0; + completed[rid] = true; + total_completed.fetch_add(1, std::memory_order_relaxed); + } + slot_request[s] = -1; + __sync_synchronize(); + cudaq_host_ringbuffer_clear_slot(&rb, s); + found_any = true; + } else if (status == CUDAQ_TX_ERROR) { + std::cerr << " [FAIL] Slot " << s + << " cudaGraphLaunch error " << cuda_error + << " (" << cudaGetErrorString(static_cast(cuda_error)) + << ")\n"; total_completed.fetch_add(1, std::memory_order_relaxed); + slot_request[s] = -1; + __sync_synchronize(); + cudaq_host_ringbuffer_clear_slot(&rb, s); + found_any = true; } - cudaq_host_ringbuffer_clear_slot(&rb, static_cast(slot)); - slot_request[slot] = -1; - next_harvest++; - } else if (status == CUDAQ_TX_ERROR) { - std::cerr << " [FAIL] Slot " << slot - << " cudaGraphLaunch error " << cuda_error - << " (" << cudaGetErrorString(static_cast(cuda_error)) - << ")\n"; - total_completed.fetch_add(1, std::memory_order_relaxed); - cudaq_host_ringbuffer_clear_slot(&rb, static_cast(slot)); - slot_request[slot] = -1; - next_harvest++; - } else { - QEC_CPU_RELAX(); } + if (!found_any) QEC_CPU_RELAX(); } }); pin_thread_to_core(consumer, 4); @@ -579,11 +581,38 @@ namespace realtime_ns = cudaq::realtime; producer.join(); // Grace period for in-flight requests - auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); while (total_completed.load() < total_submitted.load() && std::chrono::steady_clock::now() < grace_deadline) { usleep(1000); } + + if (total_completed.load() < total_submitted.load()) { + int nsub_dbg = total_submitted.load(); + int ncomp_dbg = total_completed.load(); + std::cerr << " [DEBUG] Stuck: submitted=" << nsub_dbg << " completed=" << ncomp_dbg + << " diff=" << (nsub_dbg - ncomp_dbg) << "\n"; + for (uint32_t s = 0; s < NUM_SLOTS; ++s) { + uint64_t rx_val = reinterpret_cast(rx_flags)[s]; + uint64_t tx_val = reinterpret_cast(tx_flags)[s]; + int rid = slot_request[s]; + if (rx_val != 0 || tx_val != 0 || rid >= 0) { + std::cerr << " slot[" << s << "] rx=0x" << std::hex << rx_val + << " tx=0x" << tx_val << std::dec + << " slot_request=" << rid + << " (completed=" << (rid >= 0 ? (completed[rid] ? "YES" : "NO") : "n/a") + << ")\n"; + } + } + for (int w = 0; w < config.num_predecoders; ++w) { + auto* pd = predecoders[w].get(); + std::cerr << " worker[" << w << "] inflight_slot_tag=" + << pool_ctx->inflight_slot_tags[w] + << " idle=" << ((pool_ctx->idle_mask->load(cuda::std::memory_order_relaxed) >> w) & 1) + << "\n"; + } + } + consumer_stop.store(true, std::memory_order_release); shutdown_flag.store(1, cuda::std::memory_order_release); @@ -692,47 +721,66 @@ namespace realtime_ns = cudaq::realtime; double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; double avg_overhead = avg_worker - avg_decode; - - double sum_dispatch_latency = 0; - double sum_gpu_execution = 0; - int count_valid_ts = 0; + + // Per-request breakdown using submit, poll (worker start), worker_done, complete timestamps. + // Stage A: submit → poll_ts = dispatch + graph launch + GPU execution + poll CAS + // Stage B: poll_ts → worker_done_ts = worker task (decode + response write + tx_flags set) + // Stage C: worker_done_ts → complete_ts = consumer polling delay + double sum_stage_a = 0, sum_stage_b = 0, sum_stage_c = 0; + int count_valid = 0; + std::vector stage_a_samples, stage_b_samples, stage_c_samples; for (int i = warmup; i < nsub; ++i) { - if (completed[i] && dispatch_ts[i] > 0) { - uint64_t submit_ns = std::chrono::duration_cast(submit_ts[i].time_since_epoch()).count(); - if (dispatch_ts[i] > submit_ns && poll_ts[i] > dispatch_ts[i]) { - sum_dispatch_latency += (dispatch_ts[i] - submit_ns) / 1000.0; - sum_gpu_execution += (poll_ts[i] - dispatch_ts[i]) / 1000.0; - count_valid_ts++; - } else if (i == warmup) { - std::cout << "Debug [warmup]: submit=" << submit_ns << " dispatch=" << dispatch_ts[i] << " poll=" << poll_ts[i] << "\n"; - } - } + if (!completed[i] || poll_ts[i] == 0 || worker_done_ts[i] == 0) continue; + uint64_t submit_ns = std::chrono::duration_cast( + submit_ts[i].time_since_epoch()).count(); + uint64_t complete_ns = std::chrono::duration_cast( + complete_ts[i].time_since_epoch()).count(); + if (poll_ts[i] <= submit_ns || worker_done_ts[i] < poll_ts[i] || complete_ns < worker_done_ts[i]) + continue; + double a = (poll_ts[i] - submit_ns) / 1000.0; + double b = (worker_done_ts[i] - poll_ts[i]) / 1000.0; + double c = (complete_ns - worker_done_ts[i]) / 1000.0; + sum_stage_a += a; sum_stage_b += b; sum_stage_c += c; + stage_a_samples.push_back(a); + stage_b_samples.push_back(b); + stage_c_samples.push_back(c); + count_valid++; } - double avg_dispatch_latency = count_valid_ts > 0 ? (sum_dispatch_latency / count_valid_ts) : 0; - double avg_gpu_execution = count_valid_ts > 0 ? (sum_gpu_execution / count_valid_ts) : 0; - - double avg_pipeline = mean - avg_worker; - + + auto percentile = [](std::vector& v, double pct) -> double { + if (v.empty()) return 0; + std::sort(v.begin(), v.end()); + size_t idx = std::min((size_t)(pct / 100.0 * v.size()), v.size() - 1); + return v[idx]; + }; + + double avg_a = count_valid > 0 ? sum_stage_a / count_valid : 0; + double avg_b = count_valid > 0 ? sum_stage_b / count_valid : 0; + double avg_c = count_valid > 0 ? sum_stage_c / count_valid : 0; + std::cout << std::setprecision(1); - std::cout << " Worker Timing Breakdown (avg over " << n_decoded << " requests):\n"; - std::cout << " Host Dispatch overhead:" << std::setw(9) << avg_dispatch_latency - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_dispatch_latency / mean : 0) - << "%)\n"; - std::cout << " GPU TRT Inference: " << std::setw(9) << avg_gpu_execution - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_gpu_execution / mean : 0) - << "%)\n"; - std::cout << " PyMatching decode: " << std::setw(9) << avg_decode - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_decode / mean : 0) - << "%)\n"; - std::cout << " Worker overhead: " << std::setw(9) << avg_overhead - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * avg_overhead / mean : 0) - << "%)\n"; - std::cout << " Other/Misc Wait: " << std::setw(9) << (avg_pipeline - avg_dispatch_latency - avg_gpu_execution) - << " us (" << std::setw(4) << (mean > 0 ? 100.0 * (avg_pipeline - avg_dispatch_latency - avg_gpu_execution) / mean : 0) - << "%)\n"; - std::cout << " Total end-to-end: " << std::setw(9) << mean << " us\n"; + std::cout << " Pipeline Timing Breakdown (" << count_valid << " valid samples):\n"; + std::cout << " [A] Submit→Worker poll:" << std::setw(9) << avg_a + << " us (p50=" << percentile(stage_a_samples, 50) + << " p99=" << percentile(stage_a_samples, 99) << ")\n"; + std::cout << " (dispatch + graph launch + GPU exec + CAS)\n"; + std::cout << " [B] Worker task: " << std::setw(9) << avg_b + << " us (p50=" << percentile(stage_b_samples, 50) + << " p99=" << percentile(stage_b_samples, 99) << ")\n"; + std::cout << " (decode + response write + tx_flags set)\n"; + std::cout << " [C] Consumer poll lag: " << std::setw(9) << avg_c + << " us (p50=" << percentile(stage_c_samples, 50) + << " p99=" << percentile(stage_c_samples, 99) << ")\n"; + std::cout << " (tx_flags set → consumer sees it)\n"; + std::cout << " [A+B+C] Sum: " << std::setw(9) << (avg_a + avg_b + avg_c) << " us\n"; + std::cout << " End-to-end mean: " << std::setw(9) << mean << " us\n"; std::cout << " Per-round (/" << config.num_rounds << "): " << std::setw(9) << (mean / config.num_rounds) << " us/round\n"; + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; + std::cout << " PyMatching decode: " << std::setw(9) << avg_decode << " us\n"; + std::cout << " Total worker: " << std::setw(9) << avg_worker << " us\n"; + std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us\n"; } std::cout << " ---------------------------------------------------------------\n"; std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; @@ -864,12 +912,14 @@ namespace realtime_ns = cudaq::realtime; atomic_uint64_sys idle_mask(initial_idle); std::vector inflight_slot_tags(config.num_predecoders, 0); std::vector debug_poll_ts_arr(NUM_SLOTS, 0); + std::vector debug_worker_done_ts_arr(NUM_SLOTS, 0); WorkerPoolContext pool_ctx; pool_ctx.tx_flags = tx_flags_host; pool_ctx.idle_mask = &idle_mask; pool_ctx.inflight_slot_tags = inflight_slot_tags.data(); pool_ctx.debug_poll_ts = debug_poll_ts_arr.data(); + pool_ctx.debug_worker_done_ts = debug_worker_done_ts_arr.data(); // ========================================================================= // Mailbox & Dispatcher Setup (mode-dependent) From d8bdbc117ce5273914be3fb37ac3e09002a83d3f Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 26 Feb 2026 19:44:17 +0000 Subject: [PATCH 18/40] =?UTF-8?q?Scale=20pipeline=20to=2016=20workers=20/?= =?UTF-8?q?=2032=20slots=20for=20sustained=2030=20=C2=B5s=20arrival=20rate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase num_predecoders and num_workers from 8 to 16 across all config presets, and set NUM_SLOTS to 32. With 8 workers the pipeline capacity (~24K req/s) was below the 33K req/s arrival rate at 30 µs spacing, causing unbounded queuing and p99 latency spikes to 4.9 ms. With 16 workers and 32 slots, d13 at 30 µs arrival sustains 25K req/s with 299 µs mean latency (23 µs/round), p99 = 334 µs, and near-zero backpressure (9K stalls vs 38M previously). Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index e15a6f66..502e9ea1 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -41,6 +41,10 @@ * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [rate_us] [duration_s] ******************************************************************************/ + // Run the test: + // ./build/unittests/test_realtime_predecoder_w_pymatching d13 30 10 + // distance 13, 30 us between requests, 10 seconds + #include #include #include @@ -110,7 +114,7 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration // ============================================================================= - constexpr size_t NUM_SLOTS = 16; + constexpr size_t NUM_SLOTS = 32; struct PipelineConfig { std::string label; @@ -147,8 +151,8 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/336, "model1_d7_r7_unified_Z_batch1.onnx", /*slot_size=*/4096, - /*num_predecoders=*/8, - /*num_workers=*/8 + /*num_predecoders=*/16, + /*num_workers=*/16 }; } @@ -161,8 +165,8 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/2184, "model1_d13_r13_unified_Z_batch1.onnx", /*slot_size=*/16384, - /*num_predecoders=*/8, - /*num_workers=*/8 + /*num_predecoders=*/16, + /*num_workers=*/16 }; } @@ -175,8 +179,8 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/9240, "model1_d21_r21_unified_X_batch1.onnx", /*slot_size=*/65536, - /*num_predecoders=*/8, - /*num_workers=*/8 + /*num_predecoders=*/16, + /*num_workers=*/16 }; } @@ -189,8 +193,8 @@ namespace realtime_ns = cudaq::realtime; /*residual_detectors=*/29760, "model1_d31_r31_unified_Z_batch1.onnx", /*slot_size=*/262144, - /*num_predecoders=*/8, - /*num_workers=*/8 + /*num_predecoders=*/16, + /*num_workers=*/16 }; } }; From 25e9b7f5eb7daaf1b0ef389327ba751d08d03d4e Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 26 Feb 2026 22:29:06 +0000 Subject: [PATCH 19/40] Handle dynamic batch dims in TRT engine build; swap d13 to memory model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add optimization profile in build_engine_from_onnx for ONNX models with dynamic dimensions (batch dim = 0). When detected, pin all dynamic dims to 1 via min/opt/max profile so TensorRT can build the engine. Previously these models failed with "Failed to build TRT engine from ONNX". Switch d13 config to predecoder_memory_d13_T13_X.onnx, which takes detectors as input rather than raw measurements. End-to-end latency drops from 299 µs to 226 µs, mainly from PyMatching (69 µs → 12 µs). Signed-off-by: Scott Thornton --- libs/qec/lib/realtime/ai_decoder_service.cu | 27 +++++++++++++++++++ .../test_realtime_predecoder_w_pymatching.cpp | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index 78f14850..10740236 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -161,6 +161,33 @@ void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path, throw std::runtime_error("Failed to parse ONNX file: " + onnx_path); } + bool has_dynamic = false; + for (int i = 0; i < network->getNbInputs(); ++i) { + auto* input = network->getInput(i); + auto dims = input->getDimensions(); + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] <= 0) { has_dynamic = true; break; } + } + if (has_dynamic) break; + } + + if (has_dynamic) { + auto* profile = builder->createOptimizationProfile(); + for (int i = 0; i < network->getNbInputs(); ++i) { + auto* input = network->getInput(i); + auto dims = input->getDimensions(); + nvinfer1::Dims fixed = dims; + for (int d = 0; d < fixed.nbDims; ++d) { + if (fixed.d[d] <= 0) fixed.d[d] = 1; + } + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, fixed); + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, fixed); + profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, fixed); + std::printf("[TensorRT] Set dynamic input \"%s\" to batch=1\n", input->getName()); + } + config->addOptimizationProfile(profile); + } + auto plan = std::unique_ptr( builder->buildSerializedNetwork(*network, *config)); if (!plan) throw std::runtime_error("Failed to build TRT engine from ONNX"); diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 502e9ea1..f25370e8 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -163,7 +163,7 @@ namespace realtime_ns = cudaq::realtime; /*num_rounds=*/13, /*meas_qubits=*/252, /*residual_detectors=*/2184, - "model1_d13_r13_unified_Z_batch1.onnx", + "predecoder_memory_d13_T13_X.onnx", /*slot_size=*/16384, /*num_predecoders=*/16, /*num_workers=*/16 From 099bca2c8822282cfc44fab81c09e6938acafc00 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 26 Feb 2026 22:47:40 +0000 Subject: [PATCH 20/40] Optimize GPU copy kernels: vectorize loads and use DMA for output copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace byte-by-byte memory copies with wider load/store operations to reduce memory transactions in the CUDA graph. The input kernel now uses uint32_t (4-byte) copies, the passthrough kernel uses uint4 (16-byte) copies, and the output kernel is replaced entirely with cudaMemcpyAsync (DMA copy engine) followed by a minimal 1-thread signal kernel. Thread counts bumped from 128 to 256. Reduces d13 mean end-to-end latency from 226 µs to 141 µs (~85 µs) and per-round latency from 17.4 µs to 10.8 µs. Signed-off-by: Scott Thornton --- .../qec/lib/realtime/ai_predecoder_service.cu | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index c29599d9..f8a47f9c 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -48,37 +48,36 @@ __global__ void predecoder_input_kernel( if (!ring_ptr) return; - const char* src = (const char*)ring_ptr + sizeof(cudaq::nvqlink::RPCHeader); - char* dst = (char*)trt_input; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < input_size_bytes; i += blockDim.x * gridDim.x) { - dst[i] = src[i]; - } + // RPCHeader is 12 bytes (3 x uint32_t), so src is 4-byte aligned. + const uint32_t* src4 = (const uint32_t*)((const char*)ring_ptr + sizeof(cudaq::nvqlink::RPCHeader)); + uint32_t* dst4 = (uint32_t*)trt_input; + size_t n4 = input_size_bytes / sizeof(uint32_t); + for (size_t i = threadIdx.x; i < n4; i += blockDim.x) + dst4[i] = src4[i]; + + size_t done = n4 * sizeof(uint32_t); + const char* src_tail = (const char*)src4 + done; + char* dst_tail = (char*)trt_input + done; + for (size_t i = done + threadIdx.x; i < input_size_bytes; i += blockDim.x) + dst_tail[i - done] = src_tail[i - done]; } -__global__ void predecoder_output_kernel( - atomic_int_sys* d_ready_flags, - void* d_outputs, - const void* trt_output, - size_t output_size_bytes) +__global__ void predecoder_signal_ready_kernel(atomic_int_sys* d_ready_flags) { - char* dst = (char*)d_outputs; - const char* src = (const char*)trt_output; - - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < output_size_bytes; i += blockDim.x * gridDim.x) { - dst[i] = src[i]; - } - - __syncthreads(); - - if (threadIdx.x == 0 && blockIdx.x == 0) { + if (threadIdx.x == 0) d_ready_flags[0].store(1, cuda::std::memory_order_release); - } } __global__ void passthrough_copy_kernel(void* dst, const void* src, size_t num_bytes) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_bytes; i += blockDim.x * gridDim.x) { + const uint4* src4 = (const uint4*)src; + uint4* dst4 = (uint4*)dst; + size_t n4 = num_bytes / sizeof(uint4); + for (size_t i = threadIdx.x; i < n4; i += blockDim.x) + dst4[i] = src4[i]; + + size_t done = n4 * sizeof(uint4); + for (size_t i = done + threadIdx.x; i < num_bytes; i += blockDim.x) ((char*)dst)[i] = ((const char*)src)[i]; - } } // ============================================================================= @@ -135,21 +134,24 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) cudaGraph_t graph; SERVICE_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - predecoder_input_kernel<<<1, 128, 0, stream>>>( + predecoder_input_kernel<<<1, 256, 0, stream>>>( device_mailbox_slot_, static_cast(d_ready_flags_), d_ring_ptrs_, d_trt_input_, get_input_size()); if (skip_trt) { - passthrough_copy_kernel<<<1, 128, 0, stream>>>( + passthrough_copy_kernel<<<1, 256, 0, stream>>>( d_trt_output_, d_trt_input_, get_input_size()); } else { context_->enqueueV3(stream); } - predecoder_output_kernel<<<1, 128, 0, stream>>>( - static_cast(d_ready_flags_), - d_outputs_, d_trt_output_, get_output_size()); + SERVICE_CUDA_CHECK(cudaMemcpyAsync( + d_outputs_, d_trt_output_, get_output_size(), + cudaMemcpyDeviceToDevice, stream)); + + predecoder_signal_ready_kernel<<<1, 1, 0, stream>>>( + static_cast(d_ready_flags_)); SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); From 3744c9ec577265fcb3d6c270ca4ecf21afef849a Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 27 Feb 2026 20:01:59 +0000 Subject: [PATCH 21/40] Add pre-launch DMA input copy callback and d13_r104 config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the input copy from an SM-based kernel inside the CUDA graph to a host-issued cudaMemcpyAsync via a new pre_launch_fn callback on HostDispatchWorker. This frees GPU compute units for TRT inference and reduces Stage A latency by ~19 µs. Add get_trt_input_ptr() and get_host_ring_ptrs() accessors to support the callback wiring. Separate the T104 ONNX model into its own d13_r104 pipeline config (104 rounds, 32K slots) and restore d13_r13 to use the T13 model. Update design document to reflect DMA data movement, pre-launch callback, out-of-order consumer, and ARM memory ordering constraints. Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 337 ++++++++++++------ .../cudaq/qec/realtime/ai_decoder_service.h | 2 + .../qec/realtime/ai_predecoder_service.h | 2 + .../qec/lib/realtime/ai_predecoder_service.cu | 5 - .../test_realtime_predecoder_w_pymatching.cpp | 82 +++-- .../daemon/dispatcher/host_dispatcher.h | 2 + .../lib/daemon/dispatcher/host_dispatcher.cu | 3 + 7 files changed, 292 insertions(+), 141 deletions(-) diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md index b97fd74c..0f309800 100644 --- a/docs/host_side_dispatcher_design_gemini.md +++ b/docs/host_side_dispatcher_design_gemini.md @@ -3,11 +3,11 @@ ## Design Specification **Component**: `cudaq-qec` Realtime Decoding Subsystem -**Status**: Approved for Implementation +**Status**: Implemented **Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher **Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) **Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` -**Last Updated**: 2026-02-21 +**Last Updated**: 2026-02-26 --- @@ -15,8 +15,8 @@ ### 1.1 The Pipeline The system performs real-time quantum error correction (QEC). An FPGA streams syndrome measurements into a host-device shared ring buffer continuously (~1 µs cadence). -1. **Predecoding (GPU)**: TensorRT neural network inference (~9 µs). -2. **Global Decoding (CPU)**: PyMatching (MWPM) (~40-300 µs, highly variable). +1. **Predecoding (GPU)**: TensorRT neural network inference (~70 µs for d=13 with FP16). +2. **Global Decoding (CPU)**: PyMatching (MWPM) (~11 µs for d=13 with `predecoder_memory` model, up to ~70 µs with denser residual models). ### 1.2 The Problem The legacy architecture used a persistent GPU kernel to launch child CUDA graphs using `cudaStreamGraphFireAndForget`. This hit a hardcoded CUDA runtime limit of 128 cumulative launches, causing fatal crashes. A naive host-side port mapping FPGA slots 1:1 to GPU streams caused **Head-of-Line (HOL) blocking**: a single slow PyMatching decode would stall the sequential dispatcher, backing up the ring buffer and violating strict quantum coherence latency budgets. @@ -36,8 +36,9 @@ Instead of mapping predecoder streams statically to incoming data, the host disp 1. **Allocate**: When `rx_flags[slot]` indicates new data, the dispatcher finds the first available worker stream using a hardware bit-scan (`__builtin_ffsll`). 2. **Tag**: The dispatcher records the original `slot` in a tracking array (`inflight_slot_tags[worker_id]`) so the response can be routed correctly. -3. **Dispatch**: The dispatcher launches the CUDA graph on the assigned worker's stream and clears its availability bit. -4. **Free**: When the CPU PyMatching worker finishes the job and writes the response to `tx_flags[origin_slot]`, it restores the worker's availability bit in the `idle_mask`. +3. **Pre-launch DMA**: If a `pre_launch_fn` callback is registered on the worker, the dispatcher calls it to issue a `cudaMemcpyAsync` (DMA engine copy) of the input payload from the ring buffer to the TRT input buffer before graph launch. +4. **Dispatch**: The dispatcher launches the CUDA graph on the assigned worker's stream and clears its availability bit. +5. **Free**: When the CPU PyMatching worker finishes the job and writes the response to `tx_flags[origin_slot]`, it restores the worker's availability bit in the `idle_mask`. --- @@ -56,190 +57,296 @@ All shared state must use **libcu++ system-scope atomics** allocated in mapped p | `ready_flags[NUM_WORKERS]` | `atomic` | Mapped Pinned | GPU signals TRT done; CPU polls (Release/Acquire). | | `idle_mask` | `atomic` | Host CPU Mem | Bitmask of free workers. 1 = free, 0 = busy. | | `inflight_slot_tags[NUM_WORKERS]`| `int` (Plain array) | Host CPU Mem | Maps `worker_id` -> original FPGA `slot`. | -| `mailbox_bank[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Dispatcher writes device ptr for GPU input kernel. | +| `mailbox_bank[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Dispatcher writes device ptr for pre-launch callback. | +| `h_ring_ptrs[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Pre-launch callback writes slot device ptr for CPU worker readback. | +| `h_outputs[NUM_WORKERS]` | `void*` (Mapped Pinned) | Mapped Pinned | GPU output copied here via DMA; CPU worker reads inference results. | --- ## 4. Host Dispatcher Thread (Producer) -The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. +The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. It is implemented in `realtime/lib/daemon/dispatcher/host_dispatcher.cu` as `host_dispatcher_loop()`. + +### 4.1 HostDispatchWorker Structure + +Each worker in the pool has the following fields: -### 4.1 Dispatcher Logic (Pseudocode) ```cpp -#include +struct HostDispatchWorker { + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t function_id; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; + void* pre_launch_data = nullptr; +}; +``` -using atomic_uint64_sys = cuda::std::atomic; -using atomic_int_sys = cuda::std::atomic; +The `pre_launch_fn` callback enables the dispatcher to issue a `cudaMemcpyAsync` (using the DMA copy engine) for the input payload before each graph launch, without baking application-specific logic into the generic dispatcher. -void host_dispatcher_loop(DispatcherContext& ctx) { +### 4.2 Dispatcher Logic (Pseudocode) +```cpp +void host_dispatcher_loop(const HostDispatcherConfig& config) { size_t current_slot = 0; - - while (ctx.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { - // 1. Poll incoming ring buffer - uint64_t rx_value = ctx.rx_flags[current_slot].load(cuda::std::memory_order_acquire); - - if (rx_value != 0) { - // 2. Wait for an available worker in the pool (Spin if all busy) - uint64_t mask = ctx.idle_mask->load(cuda::std::memory_order_acquire); - if (mask == 0) { - QEC_CPU_RELAX(); - continue; // Do NOT advance slot. Wait for worker. - } - - // 3. Allocate worker - int worker_id = __builtin_ffsll(mask) - 1; - - // Mark worker as busy (atomic fetch_and with inverted bit) - ctx.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); - - // 4. Tag the payload with its origin slot for out-of-order return - ctx.inflight_slot_tags[worker_id] = current_slot; - - // 5. Translate Host Ptr to Device Ptr for the GPU Mailbox - void* data_host = reinterpret_cast(rx_value); - ptrdiff_t offset = (uint8_t*)data_host - ctx.rx_data_host; - void* data_dev = (void*)(ctx.rx_data_dev + offset); - - ctx.h_mailbox_bank[worker_id] = data_dev; - __sync_synchronize(); // Full barrier to ensure mailbox write is visible - - // 6. Launch graph on the assigned worker's stream - cudaError_t err = cudaGraphLaunch(ctx.workers[worker_id].graph_exec, ctx.workers[worker_id].stream); - if (err != cudaSuccess) { - uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; - ctx.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); - ctx.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); - } else { - // 6b. Mark slot IN_FLIGHT so producer does not reuse it while GPU/workers use it - ctx.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); - } - - // 7. Consume slot and advance - ctx.rx_flags[current_slot].store(0, cuda::std::memory_order_release); - current_slot = (current_slot + 1) % ctx.num_slots; - + + while (config.shutdown_flag->load(acquire) == 0) { + uint64_t rx_value = config.rx_flags[current_slot].load(acquire); + if (rx_value == 0) { QEC_CPU_RELAX(); continue; } + + void* slot_host = reinterpret_cast(rx_value); + + // Optional: parse RPC header and lookup function table + if (use_function_table) { + ParsedSlot parsed = parse_slot_with_function_table(slot_host, config); + if (parsed.drop) { clear_and_advance(); continue; } + } + + // Wait for an available worker (spin if all busy) + int worker_id = acquire_graph_worker(config, ...); + if (worker_id < 0) { QEC_CPU_RELAX(); continue; } + + // Mark worker busy, tag with origin slot + config.idle_mask->fetch_and(~(1ULL << worker_id), release); + config.inflight_slot_tags[worker_id] = current_slot; + + // Translate host ptr to device ptr, write to mailbox + ptrdiff_t offset = (uint8_t*)slot_host - config.rx_data_host; + void* data_dev = config.rx_data_dev + offset; + config.h_mailbox_bank[worker_id] = data_dev; + __sync_synchronize(); + + // Pre-launch callback: DMA copy input to TRT buffer + if (worker.pre_launch_fn) + worker.pre_launch_fn(worker.pre_launch_data, data_dev, worker.stream); + + // Launch graph + cudaError_t err = cudaGraphLaunch(worker.graph_exec, worker.stream); + if (err != cudaSuccess) { + tx_flags[current_slot].store(0xDEAD|err, release); + idle_mask->fetch_or(1ULL << worker_id, release); } else { - QEC_CPU_RELAX(); // No data, spin on current slot + tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release); } + + // Consume slot and advance + rx_flags[current_slot].store(0, release); + current_slot = (current_slot + 1) % num_slots; } - // Cleanup: Synchronize all streams before exit to prevent illegal memory access - for(auto& w : ctx.workers) cudaStreamSynchronize(w.stream); + for (auto& w : config.workers) cudaStreamSynchronize(w.stream); } ``` --- -## 5. GPU Kernel Modifications +## 5. GPU Graph Composition & Data Transfer + +### 5.1 DMA-Based Data Movement + +Data copies between the ring buffer and TRT inference buffers use the GPU's DMA copy engine rather than SM-based kernels, freeing compute resources for inference. -The predecoder GPU kernels require minimal changes, as the dynamic pooling complexity is handled entirely by the host. +**Input copy (ring buffer -> TRT input)**: Issued by the host dispatcher via `pre_launch_fn` callback as a `cudaMemcpyAsync(DeviceToDevice)` on the worker's stream *before* `cudaGraphLaunch`. The source address is dynamic (determined at dispatch time from the ring buffer slot), so it cannot be baked into the captured graph. -1. **Input Kernel**: Reads `*mailbox_slot_ptr` (mapped pinned) to get the device pointer to the ring buffer data. It copies this to `d_trt_input`. -2. **Output Kernel**: Copies `d_trt_output` to `h_outputs[worker_id]` (mapped pinned). -3. **Completion Signal**: The output kernel signals the CPU polling thread by setting the ready flag: - ```cpp - // Device code - d_ready_flags[worker_id].store(1, cuda::std::memory_order_release); - ``` +**Output copy (TRT output -> host-mapped outputs)**: Captured inside the CUDA graph as a `cudaMemcpyAsync(DeviceToDevice)`. Both source (`d_trt_output_`) and destination (`d_outputs_`) are fixed addresses, so this is captured at graph instantiation time. -*(Note: `cudaGraphInstantiateFlagDeviceLaunch` MUST be removed from graph capture. Use `cudaGraphInstantiate(&graph_exec, graph, 0)`).* +### 5.2 Captured CUDA Graph Contents + +The CUDA graph for each predecoder contains (in order): + +1. **TRT inference** (`context_->enqueueV3(stream)`) -- or `passthrough_copy_kernel` if `SKIP_TRT` is set. +2. **Output DMA copy** (`cudaMemcpyAsync` D2D) -- copies TRT output to host-mapped output buffer. +3. **Signal kernel** (`predecoder_signal_ready_kernel<<<1,1>>>`) -- a single-thread kernel that performs `d_ready_flags[0].store(1, release)` to notify the CPU worker. + +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. The `predecoder_input_kernel` is no longer part of the graph; input data arrives via the pre-launch DMA copy. + +### 5.3 Passthrough Copy Kernel (SKIP_TRT mode) + +When `SKIP_TRT` is set, a vectorized passthrough kernel (`uint4` 16-byte loads/stores, 256 threads) substitutes for TRT inference for benchmarking the infrastructure overhead. --- ## 6. Worker Subsystem (Consumer) -A separate CPU polling thread scans the `ready_flags` array. When a GPU graph finishes, the job is handed to a CPU thread pool for PyMatching decoding. - ### 6.1 Ready-Flag State Machine (Atomic Claiming) -With a single slot per predecoder (queue depth 1), the poller must **claim** each completion exactly once. If the poller only checks `ready_flags[i]==1` and enqueues without claiming, it will enqueue the same job repeatedly until the PyMatching worker calls `release_job`, flooding the thread pool and stalling the pipeline. +With a single slot per predecoder (queue depth 1), the poller must **claim** each completion exactly once. **States** (per-worker ready flag): | Value | State | Meaning | | :--- | :--- | :--- | | 0 | Idle | Waiting for GPU, or worker has called `release_job`. | -| 1 | Ready | GPU finished; output kernel stored 1. | +| 1 | Ready | GPU finished; signal kernel stored 1. | | 2 | Processing | CPU poller claimed the job; PyMatching is running. | **Poller**: Use `compare_exchange_strong(expected=1, desired=2, memory_order_acquire, memory_order_relaxed)`. Only the thread that wins the CAS enqueues the job. Use **relaxed on failure** so spin-polling does not add barriers that delay seeing the GPU's store(1). **Worker**: When PyMatching finishes, call `release_job(slot_idx)` which does `ready_flags[0].store(0, release)` so the slot is Idle for the next launch. -### 6.2 Worker Logic (Pseudocode) +### 6.2 Dedicated Polling/Worker Threads + +Each predecoder has a dedicated polling thread that spins on `poll_next_job()` (the CAS), then runs PyMatching inline on the same thread. This avoids thread pool overhead. + +### 6.3 Worker Logic (Pseudocode) ```cpp -void pymatching_worker_task(WorkerContext& ctx, int worker_id) { - // 1. Read GPU outputs from mapped pinned memory - // ... run PyMatching MWPM ... - - // 2. Lookup origin slot for out-of-order routing - int origin_slot = ctx.inflight_slot_tags[worker_id]; - - // 3. Write response back to the EXACT slot the FPGA expects - uint64_t response_val = format_response(...); - ctx.tx_flags[origin_slot].store(response_val, cuda::std::memory_order_release); - - // 4. Acknowledge GPU read completion (Idle for next launch) - ctx.ready_flags[worker_id].store(0, cuda::std::memory_order_release); // 2 -> 0 - - // 5. FREE THE WORKER: Return this worker back to the dispatcher pool - ctx.idle_mask->fetch_or((1ULL << worker_id), cuda::std::memory_order_release); +void pymatching_worker_task(PreDecoderJob job, int worker_id, + AIPreDecoderService* predecoder, + DecoderContext* ctx, + WorkerPoolContext* pool_ctx) { + // 1. Read GPU outputs from mapped pinned memory (h_outputs_) + const int32_t* residual = static_cast(job.inference_data); + + // 2. Run PyMatching MWPM decode over spatial slices + for (int s = 0; s < ctx->spatial_slices; ++s) { + // ... decode each spatial slice ... + } + + // 3. Write RPC response back to the ring buffer slot + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp_data); + + // 4. Lookup origin slot and signal completion via tx_flags + int origin_slot = job.origin_slot; + pool_ctx->tx_flags[origin_slot].store( + reinterpret_cast(job.ring_buffer_ptr), release); + + // 5. Release GPU predecoder slot (2 -> 0) + predecoder->release_job(job.slot_idx); + + // 6. Return worker to the dispatcher pool + pool_ctx->idle_mask->fetch_or(1ULL << worker_id, release); } ``` --- -## 7. Step-by-Step Data Flow Trace +## 7. Out-of-Order Consumer + +The consumer thread harvests completions **out-of-order** by scanning all active slots on every iteration, rather than waiting for a sequential `next_harvest` counter. This eliminates head-of-line blocking where a slow request in slot N would prevent harvesting faster completions in slot N+1. + +### 7.1 Consumer Logic (Pseudocode) +```cpp +// Consumer scans all slots each iteration +while (!consumer_stop) { + bool found_any = false; + for (uint32_t s = 0; s < NUM_SLOTS; ++s) { + if (slot_request[s] < 0) continue; // no active request in this slot + + cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag(&rb, s, &err); + + if (status == CUDAQ_TX_READY) { + int rid = slot_request[s]; + complete_ts[rid] = now(); + completed[rid] = true; + total_completed++; + + slot_request[s] = -1; // Reset request ID FIRST + __sync_synchronize(); // ARM memory fence + cudaq_host_ringbuffer_clear_slot(&rb, s); // Then clear tx_flags + found_any = true; + } + } + if (!found_any) QEC_CPU_RELAX(); +} +``` + +### 7.2 Consumer-Producer Race Fix + +On ARM's weakly ordered memory model, the consumer must reset `slot_request[s] = -1` **before** clearing `tx_flags[s]` (via `cudaq_host_ringbuffer_clear_slot`), with a `__sync_synchronize()` fence between them. Without this ordering: +1. Consumer clears `tx_flags[s]` (slot appears free to producer) +2. Producer writes new `slot_request[s] = new_rid` +3. Consumer's delayed `slot_request[s] = -1` clobbers the producer's write + +This race caused exactly one request to get "stuck" indefinitely, eventually stalling the entire pipeline through backpressure. + +--- + +## 8. Step-by-Step Data Flow Trace 1. **FPGA** writes INT32 measurements into `rx_data[5]`. 2. **FPGA** sets `rx_flags[5] = host_ptr`. 3. **Host Dispatcher** reads `rx_flags[5]`, sees data. -4. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. -5. **Host Dispatcher** marks bit 2 busy in `idle_mask`. -6. **Host Dispatcher** saves `inflight_slot_tags[2] = 5`. -7. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. -8. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. -9. **Host Dispatcher** sets `tx_flags[5] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[5] = 0` and advances to `current_slot = 6`. -10. **GPU** executes graph on stream 2. Finishes and sets `ready_flags[2] = 1`. -11. **CPU Poller** CAS(1, 2) on `ready_flags[2]`, wins, enqueues job once; PyMatching runs on CPU. -12. **CPU Worker** finishes PyMatching. -13. **CPU Worker** looks up `origin_slot = inflight_slot_tags[2]` (which is 5). -14. **CPU Worker** writes response to `tx_flags[5]` (overwrites 0xEEEE), then `release_job`, then restores bit 2 in `idle_mask`. -15. **Consumer** (harvest thread) sees `tx_flags[5] != 0` and `!= 0xEEEE`, harvests, then clears `tx_flags[5] = 0`. Producer may now reuse slot 5. +4. **Host Dispatcher** parses RPC header, looks up function in the function table. +5. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. +6. **Host Dispatcher** marks bit 2 busy in `idle_mask`. +7. **Host Dispatcher** saves `inflight_slot_tags[2] = 5`. +8. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. +9. **Host Dispatcher** calls `pre_launch_fn`: writes `h_ring_ptrs[0] = dev_ptr`, issues `cudaMemcpyAsync(d_trt_input, dev_ptr + 12, input_size, D2D, stream[2])`. +10. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. +11. **Host Dispatcher** sets `tx_flags[5] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[5] = 0` and advances to `current_slot = 6`. +12. **GPU DMA engine** copies input payload from ring buffer to TRT input buffer. +13. **GPU** executes TRT inference. +14. **GPU DMA engine** copies TRT output to host-mapped `h_outputs_`. +15. **GPU signal kernel** sets `ready_flags[2] = 1` (system-scope atomic release). +16. **CPU Poller** CAS(1, 2) on `ready_flags[2]`, wins, reads `h_ring_ptrs[0]` to get ring buffer address and `h_outputs_` to get inference data. +17. **CPU Worker** runs PyMatching decode over spatial slices. +18. **CPU Worker** writes RPC response into ring buffer slot. +19. **CPU Worker** looks up `origin_slot = inflight_slot_tags[2]` (which is 5). +20. **CPU Worker** writes response address to `tx_flags[5]` (overwrites 0xEEEE). +21. **CPU Worker** calls `release_job` (`ready_flags[0].store(0, release)`), then restores bit 2 in `idle_mask`. +22. **Consumer** scans all slots, sees `tx_flags[5] != 0` and `!= 0xEEEE`, harvests. +23. **Consumer** sets `slot_request[5] = -1`, `__sync_synchronize()`, then clears `tx_flags[5] = 0`. Producer may now reuse slot 5. --- -## 8. Ring Buffer and IN_FLIGHT Sentinel +## 9. Ring Buffer and IN_FLIGHT Sentinel Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot]` immediately after launch. Without a hold, the **producer** (FPGA sim or test) would see `rx_flags[slot]==0` and `tx_flags[slot]==0` (response not written yet) and reuse the slot, overwriting data while the GPU is still reading. **Fix: IN_FLIGHT tag** -1. **Dispatcher**: On successful launch, write `tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release)` **before** clearing `rx_flags[current_slot]`. On launch failure, write the 0xDEAD|err value and restore the worker bit; do not write 0xEEEE. +1. **Dispatcher**: On successful launch, write `tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release)` **before** clearing `rx_flags[current_slot]`. On launch failure, write the 0xDEAD|err value and restore the worker bit; do not write 0xEEEE. Setting `tx_data_host = nullptr` and `tx_data_dev = nullptr` in the config forces the dispatcher to use the `0xEEEE` sentinel rather than a real data address. 2. **Producer**: Reuse a slot only when **both** `rx_flags[slot]==0` **and** `tx_flags[slot]==0`. Thus the producer blocks until the consumer has harvested (tx cleared). 3. **Consumer**: When harvesting, treat only real responses: `tx_flags[slot] != 0` **and** `tx_flags[slot] != 0xEEEEEEEEEEEEEEEEULL`. Ignore 0xEEEE (in-flight). On harvest, clear `tx_flags[slot] = 0`. -**Slot lifecycle**: Idle (rx=0, tx=0) → Written (rx=ptr, tx=0) → In-flight (rx=0, tx=0xEEEE) → Completed (rx=0, tx=response) → Consumer harvests, tx=0 → Idle. +**Slot lifecycle**: Idle (rx=0, tx=0) -> Written (rx=ptr, tx=0) -> In-flight (rx=0, tx=0xEEEE) -> Completed (rx=0, tx=response) -> Consumer harvests, tx=0 -> Idle. + +--- + +## 10. Dynamic Batch Handling for ONNX Models + +When building a TensorRT engine from an ONNX model with dynamic batch dimensions (dim 0 <= 0), `ai_decoder_service.cu` automatically creates an optimization profile that pins all dynamic dimensions to 1. This enables building engines from models like `predecoder_memory_d13_T13_X.onnx` which use a symbolic `batch` dimension. --- -## 9. Shutdown and Grace Period +## 11. Shutdown and Grace Period -- **Grace period**: After the producer thread exits, the main thread may wait up to a bounded time (e.g. 10 s) for `total_completed >= total_submitted`. +- **Grace period**: After the producer thread exits, the main thread waits up to 5 seconds for `total_completed >= total_submitted`. - **Consumer exit**: The consumer thread normally exits when `producer_done && total_completed >= total_submitted`. To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. -- **Diagnostic threads**: A progress reporter (submitted/completed every second) and a watchdog (stall detection every 2 s) are **optional** and should be **disabled by default** (e.g. `kEnableProgressReporter = false`, `kEnableWatchdog = false`). Enable them only when debugging stalls; otherwise they can block shutdown (e.g. watchdog not seeing `producer_done`). +- **Dispatcher shutdown**: Set `shutdown_flag = 1` after the consumer exits, then join the dispatcher thread. The dispatcher synchronizes all worker streams before returning. +- **Debug diagnostics**: If requests are stuck after the grace period, a debug dump prints per-slot rx/tx flags, slot_request state, and per-worker inflight_slot_tags and idle_mask bits. + +--- + +## 12. Performance Results (d=13, 30 µs rate, 10s) + +Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T13_X.onnx` (FP16), 16 workers, 32 slots: + +| Metric | Value | +| :--- | :--- | +| Throughput | 25,331 req/s | +| Mean latency | 122.0 µs | +| p50 latency | 119.3 µs | +| p99 latency | 135.3 µs | +| Per-round (/13) | 9.4 µs/round | +| Stage A (dispatch + GPU) | 109.9 µs | +| Stage B (PyMatching) | 11.8 µs | +| Stage C (consumer lag) | 0.3 µs | +| Raw TRT inference (trtexec) | 69.5 µs | --- -## 10. LLM Implementation Directives (Constraints Checklist) +## 13. LLM Implementation Directives (Constraints Checklist) When generating code from this specification, the LLM **MUST** strictly adhere to the following constraints: - [ ] **NO CUDA STREAM QUERYING**: Do not use `cudaStreamQuery()` for backpressure or completion checking. It incurs severe driver latency. Rely strictly on `idle_mask` and `ready_flags`. - [ ] **NO WEAK ORDERING BUGS**: Do not use `volatile`. Do not use `__threadfence_system()`. You must use `cuda::std::atomic` (or `` with `thread_scope_system`) for all cross-device synchronization. -- [ ] **NO HEAD OF LINE BLOCKING**: The host dispatcher MUST NOT statically map slots to predecoders. It must dynamically allocate via `idle_mask`. +- [ ] **NO HEAD OF LINE BLOCKING**: The host dispatcher MUST NOT statically map slots to predecoders. It must dynamically allocate via `idle_mask`. The consumer MUST harvest out-of-order by scanning all active slots. - [ ] **NO DATA LOSS**: If `idle_mask == 0` (all workers busy), the dispatcher MUST spin on the current slot (`QEC_CPU_RELAX()`). It MUST NOT advance `current_slot` until a worker is allocated and the graph is launched. - [ ] **NO RACE CONDITIONS ON TAGS**: `inflight_slot_tags` does not need to be atomic because index `[worker_id]` is exclusively owned by the active flow once the dispatcher clears the bit in `idle_mask`, until the worker thread restores the bit. - [ ] **READY FLAG CLAIMING**: The CPU poller MUST claim each completion exactly once using compare_exchange_strong(1, 2) on the ready flag; use relaxed memory order on CAS failure. The worker MUST clear the flag (store 0) in `release_job`. -- [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). +- [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. Set `tx_data_host = nullptr` and `tx_data_dev = nullptr` to force the 0xEEEE path. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). +- [ ] **CONSUMER MEMORY ORDERING**: The consumer MUST set `slot_request[s] = -1` BEFORE calling `cudaq_host_ringbuffer_clear_slot`, with a `__sync_synchronize()` fence between them, to prevent the producer-consumer race on ARM. +- [ ] **DMA DATA MOVEMENT**: Use `cudaMemcpyAsync` (DMA engine) for data copies. Input copy is issued via `pre_launch_fn` callback before graph launch. Output copy is captured inside the graph. Do not use SM-based byte-copy kernels for fixed-address transfers. - [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. -- [ ] **DIAGNOSTIC THREADS**: Progress reporter and watchdog threads MUST be optional and disabled by default so they do not block normal shutdown. diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h index 0c9aa709..62cab2e9 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -44,6 +44,8 @@ class AIDecoderService { /// @brief Size of the primary output tensor in bytes (forwarded to CPU) size_t get_output_size() const { return output_size_; } + void* get_trt_input_ptr() const { return d_trt_input_; } + protected: void load_engine(const std::string& path); void build_engine_from_onnx(const std::string& onnx_path, diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index 69f07e21..13bd3c3b 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -57,6 +57,8 @@ class AIPreDecoderService : public AIDecoderService { volatile int* get_host_queue_idx() const { return nullptr; } int get_queue_depth() const { return queue_depth_; } + void** get_host_ring_ptrs() const { return h_ring_ptrs_; } + private: int queue_depth_; // Always 1 diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index f8a47f9c..533f6399 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -134,11 +134,6 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) cudaGraph_t graph; SERVICE_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - predecoder_input_kernel<<<1, 256, 0, stream>>>( - device_mailbox_slot_, - static_cast(d_ready_flags_), - d_ring_ptrs_, d_trt_input_, get_input_size()); - if (skip_trt) { passthrough_copy_kernel<<<1, 256, 0, stream>>>( d_trt_output_, d_trt_input_, get_input_size()); diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index f25370e8..7a1bfc3c 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -156,19 +156,33 @@ namespace realtime_ns = cudaq::realtime; }; } - static PipelineConfig d13_r13() { - return { - "d13_r13_Z", - /*distance=*/13, - /*num_rounds=*/13, - /*meas_qubits=*/252, - /*residual_detectors=*/2184, - "predecoder_memory_d13_T13_X.onnx", - /*slot_size=*/16384, - /*num_predecoders=*/16, - /*num_workers=*/16 - }; - } + static PipelineConfig d13_r13() { + return { + "d13_r13_Z", + /*distance=*/13, + /*num_rounds=*/13, + /*meas_qubits=*/252, + /*residual_detectors=*/2184, + "predecoder_memory_d13_T13_X.onnx", + /*slot_size=*/16384, + /*num_predecoders=*/16, + /*num_workers=*/16 + }; + } + + static PipelineConfig d13_r104() { + return { + "d13_r104_Z", + /*distance=*/13, + /*num_rounds=*/104, + /*meas_qubits=*/252, + /*residual_detectors=*/2184, + "predecoder_memory_d13_T104_X.onnx", + /*slot_size=*/32768, + /*num_predecoders=*/16, + /*num_workers=*/16 + }; + } static PipelineConfig d21_r21() { return { @@ -346,6 +360,20 @@ namespace realtime_ns = cudaq::realtime; int duration_s = 5; // how long to run int warmup_count = 20; // discard first N from latency stats }; + +struct PreLaunchCopyCtx { + void* d_trt_input; + size_t input_size; + void** h_ring_ptrs; +}; + +static void pre_launch_input_copy(void* user_data, void* slot_dev, cudaStream_t stream) { + auto* ctx = static_cast(user_data); + ctx->h_ring_ptrs[0] = slot_dev; + cudaMemcpyAsync(ctx->d_trt_input, + static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, + ctx->input_size, cudaMemcpyDeviceToDevice, stream); +} void run_streaming_test( const PipelineConfig& config, @@ -418,11 +446,20 @@ namespace realtime_ns = cudaq::realtime; disp_cfg.live_dispatched = &live_dispatched; disp_cfg.idle_mask = pool_ctx->idle_mask; disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; + std::vector pre_launch_ctxs(num_workers); + for (int i = 0; i < num_workers; ++i) { + pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); + pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); + pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); + } + disp_cfg.workers.resize(num_workers); for (int i = 0; i < num_workers; ++i) { disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); disp_cfg.workers[i].stream = predecoder_streams[i]; disp_cfg.workers[i].function_id = function_table[i].function_id; + disp_cfg.workers[i].pre_launch_fn = pre_launch_input_copy; + disp_cfg.workers[i].pre_launch_data = &pre_launch_ctxs[i]; } std::thread dispatcher_thread([&disp_cfg]() { @@ -809,18 +846,21 @@ namespace realtime_ns = cudaq::realtime; PipelineConfig config; if (config_name == "d7") { config = PipelineConfig::d7_r7(); - } else if (config_name == "d13") { - config = PipelineConfig::d13_r13(); - } else if (config_name == "d21") { + } else if (config_name == "d13") { + config = PipelineConfig::d13_r13(); + } else if (config_name == "d13_r104") { + config = PipelineConfig::d13_r104(); + } else if (config_name == "d21") { config = PipelineConfig::d21_r21(); } else if (config_name == "d31") { config = PipelineConfig::d31_r31(); } else { - std::cerr << "Usage: " << argv[0] << " [d7|d13|d21|d31] [rate_us] [duration_s]\n" - << " d7 - distance 7, 7 rounds (default)\n" - << " d13 - distance 13, 13 rounds\n" - << " d21 - distance 21, 21 rounds\n" - << " d31 - distance 31, 31 rounds\n" + std::cerr << "Usage: " << argv[0] << " [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d13_r104 - distance 13, 104 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n" << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" << " duration_s - test duration in seconds (default: 5)\n" << "\nExamples:\n" diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h index 43ff3821..2fd1ec1b 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h @@ -36,6 +36,8 @@ struct HostDispatchWorker { cudaGraphExec_t graph_exec; cudaStream_t stream; uint32_t function_id; // matches table entry; used to assign slot to this worker + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; + void* pre_launch_data = nullptr; }; struct HostDispatcherConfig { diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu index abb52d87..7815cd50 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -99,6 +99,8 @@ static void launch_graph_worker(const HostDispatcherConfig& config, __sync_synchronize(); const size_t w = static_cast(worker_id); + if (config.workers[w].pre_launch_fn) + config.workers[w].pre_launch_fn(config.workers[w].pre_launch_data, data_dev, config.workers[w].stream); cudaError_t err = cudaGraphLaunch(config.workers[w].graph_exec, config.workers[w].stream); if (err != cudaSuccess) { @@ -138,6 +140,7 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { uint32_t function_id = 0; const cudaq_function_entry_t* entry = nullptr; + // TODO: Remove non-function-table path; RPC framing is always required. if (use_function_table) { ParsedSlot parsed = parse_slot_with_function_table(slot_host, config); if (parsed.drop) { From 9c544a57c84247351f60cd385b85c6a5a6c50d16 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Mon, 2 Mar 2026 21:55:58 +0000 Subject: [PATCH 22/40] Add RealtimePipeline scaffolding; refactor benchmark to use it Introduce a RealtimePipeline class (pipeline.h, realtime_pipeline.cu) that encapsulates all ring buffer allocation, atomic synchronization, dispatcher wiring, worker thread management, and consumer slot lifecycle behind a callback-driven API. Application code provides a GPU stage factory, a CPU stage callback, and a completion handler -- zero direct atomic access required. Refactor test_realtime_predecoder_w_pymatching.cpp from 1083 lines to ~470 lines by replacing inline atomics, thread management, and slot tracking with pipeline.submit() / pipeline.stop() / pipeline.stats(). Add d13_r104 config (T=104 model, 131K slot size). Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 1561 ++++++----------- libs/qec/unittests/CMakeLists.txt | 1 + realtime/include/cudaq/realtime/pipeline.h | 138 ++ realtime/lib/CMakeLists.txt | 1 + realtime/lib/pipeline/CMakeLists.txt | 38 + realtime/lib/pipeline/realtime_pipeline.cu | 525 ++++++ 6 files changed, 1259 insertions(+), 1005 deletions(-) create mode 100644 realtime/include/cudaq/realtime/pipeline.h create mode 100644 realtime/lib/pipeline/CMakeLists.txt create mode 100644 realtime/lib/pipeline/realtime_pipeline.cu diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 7a1bfc3c..d1573a03 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -1,368 +1,177 @@ /****************************************************************-*- C++ -*-**** * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * * All rights reserved. * - * * + * * * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ /******************************************************************************* - * Hybrid Realtime Pipeline Test with Real ONNX Pre-Decoder + PyMatching + * Hybrid Realtime Pipeline Benchmark with AI Pre-Decoder + PyMatching * - * Supports multiple surface code configurations: + * Uses the RealtimePipeline scaffolding to hide all ring buffer, atomics, + * and thread management. Application code only provides: + * 1. GPU stage factory (AIPreDecoderService instances) + * 2. CPU stage callback (PyMatching decode) + * 3. Completion callback (timestamp recording) * - * d=7 r=7 (model1_d7_r7_unified_Z_batch1.onnx) - * Input: all_measurements [1, 72, 7] INT32 (2016 bytes) - * Output: residual_detectors [1, 336] INT32 (1344 bytes) - * Output: logical_frame [1] INT32 (4 bytes) - * - * d=13 r=13 (model1_d13_r13_unified_Z_batch1.onnx) - * Input: all_measurements [1, 252, 13] INT32 (13104 bytes) - * Output: residual_detectors [1, 2184] INT32 (8736 bytes) - * Output: logical_frame [1] INT32 (4 bytes) - * - * d=21 r=21 (model1_d21_r21_unified_Z_batch1.onnx) - * Input: all_measurements [1, 660, 21] INT32 (55440 bytes) - * Output: residual_detectors [1, 9240] INT32 (36960 bytes) - * Output: logical_frame [1] INT32 (4 bytes) - * - * d=31 r=31 (model1_d31_r31_unified_Z_batch1.onnx) - * Input: all_measurements [1, 1440, 31] INT32 (178560 bytes) - * Output: residual_detectors [1, 29760] INT32 (119040 bytes) - * Output: logical_frame [1] INT32 (4 bytes) - * - * Pipeline: - * 1. Ring Buffer setup - * 2. Dispatcher Kernel -> Nx AIPreDecoderService instances (GPU, TRT from ONNX) - * 3. GPU -> CPU N-Deep Pinned Memory Queue handoff - * 4. Dedicated Polling Thread -> Worker PyMatching Thread Pool - * 5. CPU Workers closing the transaction (Setting TX flags) - * - * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d21|d31] [rate_us] [duration_s] + * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s] ******************************************************************************/ - // Run the test: - // ./build/unittests/test_realtime_predecoder_w_pymatching d13 30 10 - // distance 13, 30 us between requests, 10 seconds - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include -#include -#include - - #include - - #ifndef CUDA_VERSION - #define CUDA_VERSION 13000 - #endif - #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" - #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" - #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" - - #include "cudaq/qec/realtime/ai_decoder_service.h" - #include "cudaq/qec/realtime/ai_predecoder_service.h" - #include - #include "cudaq/qec/utils/pipeline_benchmarks.h" - #include "cudaq/qec/code.h" - #include "cudaq/qec/decoder.h" - +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef CUDA_VERSION +#define CUDA_VERSION 13000 +#endif + +#include "cudaq/realtime/pipeline.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/code.h" +#include "cudaq/qec/decoder.h" + +using namespace cudaq::qec; +namespace realtime_ns = cudaq::realtime; + +// Portable CPU Yield +#ifndef QEC_CPU_RELAX +#if defined(__x86_64__) +#include +#define QEC_CPU_RELAX() _mm_pause() +#elif defined(__aarch64__) +#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") +#else +#define QEC_CPU_RELAX() do { } while(0) +#endif +#endif + #define CUDA_CHECK(call) \ do { \ cudaError_t err = call; \ if (err != cudaSuccess) { \ - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ exit(1); \ } \ } while(0) -// Pin a thread to a specific CPU core (Cores 2-5 = spinning infra, 10+ = workers; 0-1 = OS). -static void pin_thread_to_core(std::thread& t, int core_id) { - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); - if (rc != 0) { - std::cerr << "Warning: Failed to pin thread to core " << core_id << " (Error: " << rc << ")\n"; +// ============================================================================= +// Pipeline Configuration (application-level, no atomics) +// ============================================================================= + +constexpr size_t NUM_SLOTS = 32; + +struct PipelineConfig { + std::string label; + int distance; + int num_rounds; + int meas_qubits; + int residual_detectors; + std::string onnx_filename; + size_t slot_size; + int num_predecoders; + int num_workers; + + int input_elements() const { return meas_qubits * num_rounds; } + size_t input_bytes() const { return input_elements() * sizeof(int32_t); } + + std::string onnx_path() const { + return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; } -} -static void pin_current_thread_to_core(int core_id) { - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - if (rc != 0) { - std::cerr << "Warning: Failed to pin current thread to core " << core_id << " (Error: " << rc << ")\n"; + std::string engine_path() const { + std::string name = onnx_filename; + auto dot = name.rfind('.'); + if (dot != std::string::npos) + name = name.substr(0, dot); + return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; } -} -using namespace cudaq::qec; -namespace realtime_ns = cudaq::realtime; - - // ============================================================================= - // Pipeline Configuration - // ============================================================================= - - constexpr size_t NUM_SLOTS = 32; - - struct PipelineConfig { - std::string label; - int distance; - int num_rounds; - int meas_qubits; // ONNX input shape[1] - int residual_detectors; // ONNX output dim - std::string onnx_filename; - size_t slot_size; // must fit RPC header (CUDAQ_RPC_HEADER_SIZE) + input payload - int num_predecoders; - int num_workers; - - int input_elements() const { return meas_qubits * num_rounds; } - size_t input_bytes() const { return input_elements() * sizeof(int32_t); } - - std::string onnx_path() const { - return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; - } - - std::string engine_path() const { - std::string name = onnx_filename; - auto dot = name.rfind('.'); - if (dot != std::string::npos) - name = name.substr(0, dot); - return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; - } - - static PipelineConfig d7_r7() { - return { - "d7_r7_Z", - /*distance=*/7, - /*num_rounds=*/7, - /*meas_qubits=*/72, - /*residual_detectors=*/336, - "model1_d7_r7_unified_Z_batch1.onnx", - /*slot_size=*/4096, - /*num_predecoders=*/16, - /*num_workers=*/16 - }; - } + static PipelineConfig d7_r7() { + return { + "d7_r7_Z", 7, 7, 72, 336, + "model1_d7_r7_unified_Z_batch1.onnx", + 4096, 16, 16 + }; + } static PipelineConfig d13_r13() { return { - "d13_r13_Z", - /*distance=*/13, - /*num_rounds=*/13, - /*meas_qubits=*/252, - /*residual_detectors=*/2184, + "d13_r13_Z", 13, 13, 252, 2184, "predecoder_memory_d13_T13_X.onnx", - /*slot_size=*/16384, - /*num_predecoders=*/16, - /*num_workers=*/16 + 16384, 16, 16 }; } static PipelineConfig d13_r104() { return { - "d13_r104_Z", - /*distance=*/13, - /*num_rounds=*/104, - /*meas_qubits=*/252, - /*residual_detectors=*/2184, + "d13_r104_Z", 13, 104, 252, 2184, "predecoder_memory_d13_T104_X.onnx", - /*slot_size=*/32768, - /*num_predecoders=*/16, - /*num_workers=*/16 + 131072, 16, 16 }; } - static PipelineConfig d21_r21() { - return { - "d21_r21_Z", - /*distance=*/21, - /*num_rounds=*/21, - /*meas_qubits=*/660, - /*residual_detectors=*/9240, - "model1_d21_r21_unified_X_batch1.onnx", - /*slot_size=*/65536, - /*num_predecoders=*/16, - /*num_workers=*/16 - }; - } - - static PipelineConfig d31_r31() { - return { - "d31_r31_Z", - /*distance=*/31, - /*num_rounds=*/31, - /*meas_qubits=*/1440, - /*residual_detectors=*/29760, - "model1_d31_r31_unified_Z_batch1.onnx", - /*slot_size=*/262144, - /*num_predecoders=*/16, - /*num_workers=*/16 - }; - } - }; - - // Runtime decoder state populated during setup - struct DecoderContext { - std::vector> decoders; - std::atomic next_decoder_idx{0}; - int z_stabilizers = 0; - int spatial_slices = 0; - - cudaq::qec::decoder* acquire_decoder() { - thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); - return decoders[my_idx % decoders.size()].get(); - } - - // Per-worker timing accumulators (lock-free) - std::atomic total_decode_us{0}; - std::atomic total_worker_us{0}; - std::atomic decode_count{0}; - }; - - struct SystemContext { - realtime_ns::atomic_uint64_sys* tx_flags_host = nullptr; - uint8_t* rx_data_host = nullptr; - size_t slot_size = 0; - }; - SystemContext g_sys_ctx; - - /// Context for dynamic worker pool: worker task writes tx_flags[origin_slot] and frees idle_mask. - struct WorkerPoolContext { - realtime_ns::atomic_uint64_sys* tx_flags = nullptr; - realtime_ns::atomic_uint64_sys* idle_mask = nullptr; - int* inflight_slot_tags = nullptr; - uint64_t* debug_poll_ts = nullptr; // when worker poll_next_job succeeded (ns epoch) - uint64_t* debug_worker_done_ts = nullptr; // when worker set tx_flags (ns epoch) - }; - - // ============================================================================= - // Thread Pool Worker (Real PyMatching MWPM Decoder) - // ============================================================================= - - struct __attribute__((packed)) DecodeResponse { - int32_t total_corrections; - int32_t converged; - }; - - void pymatching_worker_task(PreDecoderJob job, int worker_id, - AIPreDecoderService* predecoder, - DecoderContext* ctx, - WorkerPoolContext* pool_ctx) { - nvtxRangePushA("Worker Task"); - using hrclock = std::chrono::high_resolution_clock; - auto worker_start = hrclock::now(); - - if (pool_ctx && pool_ctx->debug_poll_ts) { - pool_ctx->debug_poll_ts[job.origin_slot] = std::chrono::duration_cast( - worker_start.time_since_epoch()).count(); + static PipelineConfig d21_r21() { + return { + "d21_r21_Z", 21, 21, 660, 9240, + "model1_d21_r21_unified_X_batch1.onnx", + 65536, 16, 16 + }; } - int total_corrections = 0; - bool all_converged = true; - - auto decode_start = hrclock::now(); -#if !defined(DISABLE_PYMATCHING) - const int32_t* residual = static_cast(job.inference_data); - auto* my_decoder = ctx->acquire_decoder(); - - nvtxRangePushA("PyMatching Decode"); - - cudaqx::tensor syndrome_tensor({(size_t)ctx->z_stabilizers}); - uint8_t* syn_data = syndrome_tensor.data(); - - for (int s = 0; s < ctx->spatial_slices; ++s) { - const int32_t* slice = residual + s * ctx->z_stabilizers; - for (int i = 0; i < ctx->z_stabilizers; ++i) { - syn_data[i] = static_cast(slice[i]); - } - - auto result = my_decoder->decode(syndrome_tensor); - - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) total_corrections++; + static PipelineConfig d31_r31() { + return { + "d31_r31_Z", 31, 31, 1440, 29760, + "model1_d31_r31_unified_Z_batch1.onnx", + 262144, 16, 16 + }; } - nvtxRangePop(); // PyMatching Decode -#endif - auto decode_end = hrclock::now(); - - DecodeResponse resp_data{total_corrections, all_converged ? 1 : 0}; - - char* response_payload = (char*)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); - std::memcpy(response_payload, &resp_data, sizeof(resp_data)); - - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = realtime_ns::RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp_data); +}; - uint64_t rx_value = reinterpret_cast(job.ring_buffer_ptr); - int origin_slot = job.origin_slot; +// ============================================================================= +// Decoder Context (application-level) +// ============================================================================= - if (pool_ctx && pool_ctx->tx_flags) { - pool_ctx->tx_flags[origin_slot].store(rx_value, cuda::std::memory_order_release); - } else { - size_t slot_idx = ((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size; - g_sys_ctx.tx_flags_host[slot_idx].store(rx_value, cuda::std::memory_order_release); - } +struct DecoderContext { + std::vector> decoders; + std::atomic next_decoder_idx{0}; + int z_stabilizers = 0; + int spatial_slices = 0; - if (pool_ctx && pool_ctx->debug_worker_done_ts) { - pool_ctx->debug_worker_done_ts[origin_slot] = std::chrono::duration_cast( - hrclock::now().time_since_epoch()).count(); + cudaq::qec::decoder* acquire_decoder() { + thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); + return decoders[my_idx % decoders.size()].get(); } - predecoder->release_job(job.slot_idx); - - if (pool_ctx && pool_ctx->idle_mask) { - pool_ctx->idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); - } + std::atomic total_decode_us{0}; + std::atomic total_worker_us{0}; + std::atomic decode_count{0}; +}; - auto worker_end = hrclock::now(); - auto decode_us = std::chrono::duration_cast( - decode_end - decode_start).count(); - auto worker_us = std::chrono::duration_cast( - worker_end - worker_start).count(); - ctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); - ctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); - ctx->decode_count.fetch_add(1, std::memory_order_relaxed); - nvtxRangePop(); // Worker Task -} - - // ============================================================================= - // Generate Realistic Syndrome Data - // ============================================================================= - void fill_measurement_payload(int32_t* payload, int input_elements, - std::mt19937& rng, double error_rate = 0.01) { - std::bernoulli_distribution err_dist(error_rate); - for (int i = 0; i < input_elements; ++i) { - payload[i] = err_dist(rng) ? 1 : 0; - } - } - - // ============================================================================= - // Streaming Test Mode (simulates FPGA continuous syndrome arrival) - // ============================================================================= - - struct StreamingConfig { - int rate_us = 0; // inter-arrival time in us (0 = open-loop) - int duration_s = 5; // how long to run - int warmup_count = 20; // discard first N from latency stats - }; +// ============================================================================= +// Pre-launch DMA copy callback +// ============================================================================= struct PreLaunchCopyCtx { - void* d_trt_input; + void* d_trt_input; size_t input_size; void** h_ring_ptrs; }; @@ -374,710 +183,452 @@ static void pre_launch_input_copy(void* user_data, void* slot_dev, cudaStream_t static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, ctx->input_size, cudaMemcpyDeviceToDevice, stream); } - - void run_streaming_test( - const PipelineConfig& config, - const StreamingConfig& scfg, - uint8_t* rx_data_host, - uint8_t* rx_data_dev, - realtime_ns::atomic_uint64_sys* rx_flags, - realtime_ns::atomic_uint64_sys* tx_flags, - DecoderContext& decoder_ctx, - std::vector>& predecoders, - std::atomic& system_stop, - void** h_mailbox_bank, - std::vector& predecoder_streams, - WorkerPoolContext* pool_ctx, - std::atomic* total_claimed = nullptr) - { - using hrclock = std::chrono::high_resolution_clock; - using atomic_uint64_sys = realtime_ns::atomic_uint64_sys; - using atomic_int_sys = realtime_ns::atomic_int_sys; - - const int num_workers = config.num_predecoders; - const int max_requests = 500000; - const size_t payload_bytes = config.input_bytes(); - - std::vector submit_ts(max_requests); - std::vector complete_ts(max_requests); - std::vector completed(max_requests, false); - std::vector dispatch_ts(max_requests, 0); - std::vector poll_ts(max_requests, 0); - std::vector worker_done_ts(max_requests, 0); - - std::vector slot_request(NUM_SLOTS, -1); - std::vector debug_dispatch_ts_arr(NUM_SLOTS, 0); - - std::atomic total_submitted{0}; - std::atomic total_completed{0}; - std::atomic backpressure_stalls{0}; - std::atomic producer_done{false}; - std::atomic consumer_stop{false}; - - atomic_int_sys shutdown_flag(0); - uint64_t dispatcher_stats = 0; - atomic_uint64_sys live_dispatched(0); - - // Build function table for realtime host dispatcher (lookup by function_id). - std::vector function_table(num_workers); - for (int i = 0; i < num_workers; ++i) { - std::string func_name = "predecode_target_" + std::to_string(i); - function_table[i].function_id = realtime_ns::fnv1a_hash(func_name.c_str()); - function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_table[i].handler.graph_exec = predecoders[i]->get_executable_graph(); - std::memset(&function_table[i].schema, 0, sizeof(function_table[i].schema)); - } - realtime_ns::HostDispatcherConfig disp_cfg; - disp_cfg.rx_flags = rx_flags; - disp_cfg.tx_flags = tx_flags; - disp_cfg.rx_data_host = rx_data_host; - disp_cfg.rx_data_dev = rx_data_dev; - disp_cfg.tx_data_host = nullptr; - disp_cfg.tx_data_dev = nullptr; - disp_cfg.tx_stride_sz = config.slot_size; - disp_cfg.h_mailbox_bank = h_mailbox_bank; - disp_cfg.num_slots = NUM_SLOTS; - disp_cfg.slot_size = config.slot_size; - disp_cfg.function_table = function_table.data(); - disp_cfg.function_table_count = num_workers; - disp_cfg.shutdown_flag = &shutdown_flag; - disp_cfg.stats_counter = &dispatcher_stats; - disp_cfg.live_dispatched = &live_dispatched; - disp_cfg.idle_mask = pool_ctx->idle_mask; - disp_cfg.inflight_slot_tags = pool_ctx->inflight_slot_tags; - std::vector pre_launch_ctxs(num_workers); - for (int i = 0; i < num_workers; ++i) { - pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); - pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); - pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); - } +// ============================================================================= +// Worker context (passed through user_context) +// ============================================================================= - disp_cfg.workers.resize(num_workers); - for (int i = 0; i < num_workers; ++i) { - disp_cfg.workers[i].graph_exec = predecoders[i]->get_executable_graph(); - disp_cfg.workers[i].stream = predecoder_streams[i]; - disp_cfg.workers[i].function_id = function_table[i].function_id; - disp_cfg.workers[i].pre_launch_fn = pre_launch_input_copy; - disp_cfg.workers[i].pre_launch_data = &pre_launch_ctxs[i]; - } +struct WorkerCtx { + AIPreDecoderService* predecoder; + DecoderContext* decoder_ctx; +}; - std::thread dispatcher_thread([&disp_cfg]() { - realtime_ns::host_dispatcher_loop(disp_cfg); - }); - pin_thread_to_core(dispatcher_thread, 2); - - // Ring buffer view for producer/consumer helpers (realtime C API). - cudaq_ringbuffer_t rb{}; - rb.rx_flags = reinterpret_cast(rx_flags); - rb.tx_flags = reinterpret_cast(tx_flags); - rb.rx_data = rx_data_dev; - rb.tx_data = rx_data_dev; - rb.rx_stride_sz = config.slot_size; - rb.tx_stride_sz = config.slot_size; - rb.rx_flags_host = reinterpret_cast(rx_flags); - rb.tx_flags_host = reinterpret_cast(tx_flags); - rb.rx_data_host = rx_data_host; - rb.tx_data_host = rx_data_host; - - auto run_deadline = std::chrono::steady_clock::now() - + std::chrono::seconds(scfg.duration_s); - - std::string rate_label = (scfg.rate_us > 0) - ? std::to_string(scfg.rate_us) + " us" - : "open-loop"; - - std::cout << "\n[Stream] Starting streaming test (" << config.label - << ", HOST dispatcher)\n" - << " Rate: " << rate_label << "\n" - << " Duration: " << scfg.duration_s << " s\n" - << " Warmup: " << scfg.warmup_count << " requests\n" - << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" - << " Max reqs: " << max_requests << "\n\n" - << std::flush; - - // Progress reporter (debug only; set to true to print submitted/completed every second) - constexpr bool kEnableProgressReporter = true; - std::atomic progress_done{false}; - std::thread progress_reporter; - if (kEnableProgressReporter) { - progress_reporter = std::thread([&]() { - while (true) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - if (progress_done.load(std::memory_order_acquire)) break; - bool pdone = producer_done.load(std::memory_order_acquire); - int nsub = total_submitted.load(std::memory_order_acquire); - int ncomp = total_completed.load(std::memory_order_acquire); - uint64_t disp = live_dispatched.load(cuda::std::memory_order_relaxed); - uint64_t claimed = total_claimed ? total_claimed->load(std::memory_order_relaxed) : 0; - uint64_t mask = pool_ctx->idle_mask ? pool_ctx->idle_mask->load(cuda::std::memory_order_relaxed) : 0; - std::cout << " [progress] submitted=" << nsub << " completed=" << ncomp - << " dispatched=" << disp << " claimed=" << claimed - << " idle_mask=0x" << std::hex << mask << std::dec << std::endl; - if (pdone && ncomp >= nsub) break; - } - }); +struct __attribute__((packed)) DecodeResponse { + int32_t total_corrections; + int32_t converged; +}; + +// ============================================================================= +// Data generation +// ============================================================================= + +void fill_measurement_payload(int32_t* payload, int input_elements, + std::mt19937& rng, double error_rate = 0.01) { + std::bernoulli_distribution err_dist(error_rate); + for (int i = 0; i < input_elements; ++i) { + payload[i] = err_dist(rng) ? 1 : 0; } +} - // --- Producer thread (simulates FPGA) --- - std::thread producer([&]() { - std::mt19937 rng(42); - int next_slot = 0; - int req_id = 0; - - while (std::chrono::steady_clock::now() < run_deadline - && req_id < max_requests) { - - int slot = next_slot % (int)NUM_SLOTS; - - while (!cudaq_host_ringbuffer_slot_available(&rb, static_cast(slot))) { - backpressure_stalls.fetch_add(1, std::memory_order_relaxed); - QEC_CPU_RELAX(); - if (std::chrono::steady_clock::now() >= run_deadline) return; - } - - int target = req_id % config.num_predecoders; - std::string func = "predecode_target_" + std::to_string(target); - uint32_t function_id = realtime_ns::fnv1a_hash(func.c_str()); - - uint8_t* slot_data = rx_data_host + (slot * config.slot_size); - int32_t* payload = reinterpret_cast( - slot_data + CUDAQ_RPC_HEADER_SIZE); - fill_measurement_payload(payload, config.input_elements(), rng, 0.01); - - cudaq_host_ringbuffer_write_rpc_request(&rb, static_cast(slot), - function_id, payload, static_cast(payload_bytes)); - - slot_request[slot] = req_id; - submit_ts[req_id] = hrclock::now(); - cudaq_host_ringbuffer_signal_slot(&rb, static_cast(slot)); - total_submitted.fetch_add(1, std::memory_order_release); - - next_slot++; - req_id++; - - if (scfg.rate_us > 0) { - auto target_time = submit_ts[req_id - 1] - + std::chrono::microseconds(scfg.rate_us); - while (hrclock::now() < target_time) - QEC_CPU_RELAX(); - } - } - - producer_done.store(true, std::memory_order_seq_cst); - }); - pin_thread_to_core(producer, 3); - - // --- Consumer thread (harvests completions out-of-order) --- - std::thread consumer([&]() { - while (true) { - if (consumer_stop.load(std::memory_order_acquire)) - break; - bool pdone = producer_done.load(std::memory_order_acquire); - int nsub = total_submitted.load(std::memory_order_acquire); - int ncomp = total_completed.load(std::memory_order_relaxed); - - if (pdone && ncomp >= nsub) - break; - - bool found_any = false; - for (uint32_t s = 0; s < NUM_SLOTS; ++s) { - if (slot_request[s] < 0) continue; - - int cuda_error = 0; - cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag( - &rb, s, &cuda_error); - - if (status == CUDAQ_TX_READY) { - int rid = slot_request[s]; - if (rid >= 0) { - complete_ts[rid] = hrclock::now(); - poll_ts[rid] = pool_ctx->debug_poll_ts ? pool_ctx->debug_poll_ts[s] : 0; - worker_done_ts[rid] = pool_ctx->debug_worker_done_ts ? pool_ctx->debug_worker_done_ts[s] : 0; - completed[rid] = true; - total_completed.fetch_add(1, std::memory_order_relaxed); - } - slot_request[s] = -1; - __sync_synchronize(); - cudaq_host_ringbuffer_clear_slot(&rb, s); - found_any = true; - } else if (status == CUDAQ_TX_ERROR) { - std::cerr << " [FAIL] Slot " << s - << " cudaGraphLaunch error " << cuda_error - << " (" << cudaGetErrorString(static_cast(cuda_error)) - << ")\n"; - total_completed.fetch_add(1, std::memory_order_relaxed); - slot_request[s] = -1; - __sync_synchronize(); - cudaq_host_ringbuffer_clear_slot(&rb, s); - found_any = true; - } - } - if (!found_any) QEC_CPU_RELAX(); - } - }); - pin_thread_to_core(consumer, 4); - - std::cout << " [shutdown] joining producer...\n" << std::flush; - producer.join(); - - // Grace period for in-flight requests - auto grace_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); - while (total_completed.load() < total_submitted.load() - && std::chrono::steady_clock::now() < grace_deadline) { - usleep(1000); - } - - if (total_completed.load() < total_submitted.load()) { - int nsub_dbg = total_submitted.load(); - int ncomp_dbg = total_completed.load(); - std::cerr << " [DEBUG] Stuck: submitted=" << nsub_dbg << " completed=" << ncomp_dbg - << " diff=" << (nsub_dbg - ncomp_dbg) << "\n"; - for (uint32_t s = 0; s < NUM_SLOTS; ++s) { - uint64_t rx_val = reinterpret_cast(rx_flags)[s]; - uint64_t tx_val = reinterpret_cast(tx_flags)[s]; - int rid = slot_request[s]; - if (rx_val != 0 || tx_val != 0 || rid >= 0) { - std::cerr << " slot[" << s << "] rx=0x" << std::hex << rx_val - << " tx=0x" << tx_val << std::dec - << " slot_request=" << rid - << " (completed=" << (rid >= 0 ? (completed[rid] ? "YES" : "NO") : "n/a") - << ")\n"; - } - } - for (int w = 0; w < config.num_predecoders; ++w) { - auto* pd = predecoders[w].get(); - std::cerr << " worker[" << w << "] inflight_slot_tag=" - << pool_ctx->inflight_slot_tags[w] - << " idle=" << ((pool_ctx->idle_mask->load(cuda::std::memory_order_relaxed) >> w) & 1) - << "\n"; - } - } - - consumer_stop.store(true, std::memory_order_release); - - shutdown_flag.store(1, cuda::std::memory_order_release); - std::cout << " [shutdown] joining dispatcher...\n" << std::flush; - dispatcher_thread.join(); - std::cout << " [shutdown] joining consumer...\n" << std::flush; - consumer.join(); - - if (kEnableProgressReporter) { - progress_done.store(true, std::memory_order_release); - progress_reporter.join(); - } - - // ===== Report ===== - auto run_end = std::chrono::steady_clock::now(); - int nsub = total_submitted.load(); - int ncomp = total_completed.load(); - if (ncomp < nsub) - std::cerr << " [WARN] " << (nsub - ncomp) << " in-flight requests did not complete before grace period.\n"; - - // Build PipelineBenchmark from timestamps (skip warmup) - int warmup = std::min(scfg.warmup_count, nsub); - int bench_count = nsub - warmup; - - cudaq::qec::utils::PipelineBenchmark bench( - config.label + " (stream)", bench_count); - bench.start(); - - for (int i = warmup; i < nsub; ++i) { - int bench_id = i - warmup; - bench.mark_submit(bench_id); - } - - std::vector latencies; - latencies.reserve(bench_count); - for (int i = warmup; i < nsub; ++i) { - if (!completed[i]) continue; - auto dt = std::chrono::duration_cast>( - complete_ts[i] - submit_ts[i]); - latencies.push_back(dt.count()); - } - - bench.stop(); - - std::sort(latencies.begin(), latencies.end()); - - auto pct = [&](double p) -> double { - if (latencies.empty()) return 0; - double idx = (p / 100.0) * (latencies.size() - 1); - size_t lo = (size_t)idx; - size_t hi = std::min(lo + 1, latencies.size() - 1); - double frac = idx - lo; - return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; - }; - - double mean = 0; - for (auto v : latencies) mean += v; - mean = latencies.empty() ? 0 : mean / latencies.size(); - - double stddev = 0; - for (auto v : latencies) stddev += (v - mean) * (v - mean); - stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); - - auto wall_us = std::chrono::duration_cast>( - run_end - (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); - double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; - - double actual_rate = (nsub > 1) - ? std::chrono::duration_cast>( - submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) - : 0; - - std::cout << std::fixed; - std::cout << "\n================================================================\n"; - std::cout << " Streaming Benchmark: " << config.label << "\n"; - std::cout << "================================================================\n"; - std::cout << " Submitted: " << nsub << "\n"; - std::cout << " Completed: " << ncomp << "\n"; - if (nsub > ncomp) - std::cout << " Dropped/timeout: " << (nsub - ncomp) << "\n"; - std::cout << std::setprecision(1); - std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; - std::cout << " Throughput: " << throughput << " req/s\n"; - std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; - std::cout << " Backpressure stalls:" << std::setw(8) - << backpressure_stalls.load() << "\n"; - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Latency (us) [steady-state, " << latencies.size() - << " requests after " << warmup << " warmup]\n"; - std::cout << std::setprecision(1); - if (!latencies.empty()) { - std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; - std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; - std::cout << " mean = " << std::setw(10) << mean << "\n"; - std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; - std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; - std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; - std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; - std::cout << " stddev = " << std::setw(10) << stddev << "\n"; - } - std::cout << " ---------------------------------------------------------------\n"; - - // Worker timing breakdown - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - - // Per-request breakdown using submit, poll (worker start), worker_done, complete timestamps. - // Stage A: submit → poll_ts = dispatch + graph launch + GPU execution + poll CAS - // Stage B: poll_ts → worker_done_ts = worker task (decode + response write + tx_flags set) - // Stage C: worker_done_ts → complete_ts = consumer polling delay - double sum_stage_a = 0, sum_stage_b = 0, sum_stage_c = 0; - int count_valid = 0; - std::vector stage_a_samples, stage_b_samples, stage_c_samples; - for (int i = warmup; i < nsub; ++i) { - if (!completed[i] || poll_ts[i] == 0 || worker_done_ts[i] == 0) continue; - uint64_t submit_ns = std::chrono::duration_cast( - submit_ts[i].time_since_epoch()).count(); - uint64_t complete_ns = std::chrono::duration_cast( - complete_ts[i].time_since_epoch()).count(); - if (poll_ts[i] <= submit_ns || worker_done_ts[i] < poll_ts[i] || complete_ns < worker_done_ts[i]) - continue; - double a = (poll_ts[i] - submit_ns) / 1000.0; - double b = (worker_done_ts[i] - poll_ts[i]) / 1000.0; - double c = (complete_ns - worker_done_ts[i]) / 1000.0; - sum_stage_a += a; sum_stage_b += b; sum_stage_c += c; - stage_a_samples.push_back(a); - stage_b_samples.push_back(b); - stage_c_samples.push_back(c); - count_valid++; - } - - auto percentile = [](std::vector& v, double pct) -> double { - if (v.empty()) return 0; - std::sort(v.begin(), v.end()); - size_t idx = std::min((size_t)(pct / 100.0 * v.size()), v.size() - 1); - return v[idx]; - }; - - double avg_a = count_valid > 0 ? sum_stage_a / count_valid : 0; - double avg_b = count_valid > 0 ? sum_stage_b / count_valid : 0; - double avg_c = count_valid > 0 ? sum_stage_c / count_valid : 0; - - std::cout << std::setprecision(1); - std::cout << " Pipeline Timing Breakdown (" << count_valid << " valid samples):\n"; - std::cout << " [A] Submit→Worker poll:" << std::setw(9) << avg_a - << " us (p50=" << percentile(stage_a_samples, 50) - << " p99=" << percentile(stage_a_samples, 99) << ")\n"; - std::cout << " (dispatch + graph launch + GPU exec + CAS)\n"; - std::cout << " [B] Worker task: " << std::setw(9) << avg_b - << " us (p50=" << percentile(stage_b_samples, 50) - << " p99=" << percentile(stage_b_samples, 99) << ")\n"; - std::cout << " (decode + response write + tx_flags set)\n"; - std::cout << " [C] Consumer poll lag: " << std::setw(9) << avg_c - << " us (p50=" << percentile(stage_c_samples, 50) - << " p99=" << percentile(stage_c_samples, 99) << ")\n"; - std::cout << " (tx_flags set → consumer sees it)\n"; - std::cout << " [A+B+C] Sum: " << std::setw(9) << (avg_a + avg_b + avg_c) << " us\n"; - std::cout << " End-to-end mean: " << std::setw(9) << mean << " us\n"; - std::cout << " Per-round (/" << config.num_rounds << "): " - << std::setw(9) << (mean / config.num_rounds) << " us/round\n"; - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; - std::cout << " PyMatching decode: " << std::setw(9) << avg_decode << " us\n"; - std::cout << " Total worker: " << std::setw(9) << avg_worker << " us\n"; - std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us\n"; - } - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Host dispatcher processed " << dispatcher_stats << " packets.\n"; - std::cout << "================================================================\n"; - } - - // ============================================================================= - // Main - // ============================================================================= - int main(int argc, char* argv[]) { - // Parse arguments: [rate_us] [duration_s] - std::string config_name = "d7"; - StreamingConfig stream_cfg; - - if (argc > 1) - config_name = argv[1]; - if (argc > 2 && std::isdigit(argv[2][0])) - stream_cfg.rate_us = std::stoi(argv[2]); - if (argc > 3 && std::isdigit(argv[3][0])) - stream_cfg.duration_s = std::stoi(argv[3]); - - PipelineConfig config; - if (config_name == "d7") { - config = PipelineConfig::d7_r7(); +// ============================================================================= +// Streaming Config +// ============================================================================= + +struct StreamingConfig { + int rate_us = 0; + int duration_s = 5; + int warmup_count = 20; +}; + +// ============================================================================= +// Main +// ============================================================================= + +int main(int argc, char* argv[]) { + using hrclock = std::chrono::high_resolution_clock; + + // --- Parse arguments --- + std::string config_name = "d7"; + StreamingConfig scfg; + + if (argc > 1) + config_name = argv[1]; + if (argc > 2 && std::isdigit(argv[2][0])) + scfg.rate_us = std::stoi(argv[2]); + if (argc > 3 && std::isdigit(argv[3][0])) + scfg.duration_s = std::stoi(argv[3]); + + PipelineConfig config; + if (config_name == "d7") { + config = PipelineConfig::d7_r7(); } else if (config_name == "d13") { config = PipelineConfig::d13_r13(); } else if (config_name == "d13_r104") { config = PipelineConfig::d13_r104(); } else if (config_name == "d21") { - config = PipelineConfig::d21_r21(); - } else if (config_name == "d31") { - config = PipelineConfig::d31_r31(); - } else { + config = PipelineConfig::d21_r21(); + } else if (config_name == "d31") { + config = PipelineConfig::d31_r31(); + } else { std::cerr << "Usage: " << argv[0] << " [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s]\n" << " d7 - distance 7, 7 rounds (default)\n" << " d13 - distance 13, 13 rounds\n" << " d13_r104 - distance 13, 104 rounds\n" << " d21 - distance 21, 21 rounds\n" << " d31 - distance 31, 31 rounds\n" - << " rate_us - inter-arrival time in us (0 = open-loop, default)\n" - << " duration_s - test duration in seconds (default: 5)\n" - << "\nExamples:\n" - << " " << argv[0] << " d13 # open-loop, 5s\n" - << " " << argv[0] << " d13 50 # 50 us between requests, 5s\n" - << " " << argv[0] << " d13 50 10 # 50 us rate, 10s duration\n"; - return 1; - } - - std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" - << config.label << ") ---\n"; - std::cout << "[Config] distance=" << config.distance - << " rounds=" << config.num_rounds - << " meas_qubits=" << config.meas_qubits - << " residual_detectors=" << config.residual_detectors - << " input_bytes=" << config.input_bytes() - << " slot_size=" << config.slot_size << "\n"; - - CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - - std::string engine_file = config.engine_path(); - std::string onnx_file = config.onnx_path(); - std::string model_path; - - std::ifstream engine_probe(engine_file, std::ios::binary); - if (engine_probe.good()) { - engine_probe.close(); - model_path = engine_file; - std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; - } else { - model_path = onnx_file; - std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; - std::cout << "[Setup] Engine will be cached to: " << engine_file << "\n"; - } - - std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance - << " surface code, Z stabilizers)...\n"; - auto surface_code = cudaq::qec::get_code("surface_code", - {{"distance", config.distance}}); - auto H_z = surface_code->get_parity_z(); - - DecoderContext decoder_ctx; - decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); - decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; - std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " - << H_z.shape()[1] << "]" - << " z_stabilizers=" << decoder_ctx.z_stabilizers - << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; - - cudaqx::heterogeneous_map pm_params; - pm_params.insert("merge_strategy", std::string("smallest_weight")); - std::cout << "[Setup] Pre-allocating " << config.num_workers - << " PyMatching decoders (one per worker)...\n"; - for (int i = 0; i < config.num_workers; ++i) - decoder_ctx.decoders.push_back( - cudaq::qec::decoder::get("pymatching", H_z, pm_params)); - std::cout << "[Setup] PyMatching decoder pool ready.\n"; - + << " rate_us - inter-arrival time in us (0 = open-loop)\n" + << " duration_s - test duration in seconds (default: 5)\n"; + return 1; + } + + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" + << config.label << ") ---\n"; + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " meas_qubits=" << config.meas_qubits + << " residual_detectors=" << config.residual_detectors + << " input_bytes=" << config.input_bytes() + << " slot_size=" << config.slot_size << "\n"; + + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + // --- Model path --- + std::string engine_file = config.engine_path(); + std::string onnx_file = config.onnx_path(); + std::string model_path; + + std::ifstream engine_probe(engine_file, std::ios::binary); + if (engine_probe.good()) { + engine_probe.close(); + model_path = engine_file; + std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; + } else { + model_path = onnx_file; + std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; + } + + // --- Create PyMatching decoders --- + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = cudaq::qec::get_code("surface_code", + {{"distance", config.distance}}); + auto H_z = surface_code->get_parity_z(); + + DecoderContext decoder_ctx; + decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); + decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "]" + << " z_stabilizers=" << decoder_ctx.z_stabilizers + << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; + + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + std::cout << "[Setup] Pre-allocating " << config.num_workers + << " PyMatching decoders...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + std::cout << "[Setup] PyMatching decoder pool ready.\n"; + + // --- Create GPU resources (predecoders, streams, mailbox) --- + void** h_mailbox_bank = nullptr; + void** d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, + config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); + + std::vector predecoder_streams; + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); + } + + std::cout << "[Setup] Capturing " << config.num_predecoders + << "x AIPreDecoder Graphs...\n"; + cudaStream_t capture_stream; + CUDA_CHECK(cudaStreamCreate(&capture_stream)); + + std::vector> predecoders; + bool need_save = (model_path == onnx_file); + for (int i = 0; i < config.num_predecoders; ++i) { + std::string save_path = (need_save && i == 0) ? engine_file : ""; + auto pd = std::make_unique( + model_path, d_mailbox_bank + i, 1, save_path); + std::cout << "[Setup] Decoder " << i + << ": input_size=" << pd->get_input_size() + << " output_size=" << pd->get_output_size() << "\n"; + pd->capture_graph(capture_stream, false); + predecoders.push_back(std::move(pd)); + } + + // Pre-launch DMA contexts + std::vector pre_launch_ctxs(config.num_predecoders); + for (int i = 0; i < config.num_predecoders; ++i) { + pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); + pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); + pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); + } + + // Worker contexts (per-worker, application-specific) + std::vector worker_ctxs(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + worker_ctxs[i].predecoder = predecoders[i].get(); + worker_ctxs[i].decoder_ctx = &decoder_ctx; + } + + // Build function table for RPC dispatch + std::vector function_ids(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + std::string func = "predecode_target_" + std::to_string(i); + function_ids[i] = realtime_ns::fnv1a_hash(func.c_str()); + } + // ========================================================================= - // System-Scope Atomics & Ring Buffer Allocation (Replaces volatile setup) + // Create pipeline (all atomics hidden inside) // ========================================================================= - using atomic_uint64_sys = realtime_ns::atomic_uint64_sys; - using atomic_int_sys = realtime_ns::atomic_int_sys; - - void* buf_rx = nullptr; - CUDA_CHECK(cudaHostAlloc(&buf_rx, NUM_SLOTS * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); - atomic_uint64_sys* rx_flags_host = static_cast(buf_rx); - for (size_t i = 0; i < NUM_SLOTS; ++i) new (rx_flags_host + i) atomic_uint64_sys(0); - - void* buf_tx = nullptr; - CUDA_CHECK(cudaHostAlloc(&buf_tx, NUM_SLOTS * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); - atomic_uint64_sys* tx_flags_host = static_cast(buf_tx); - for (size_t i = 0; i < NUM_SLOTS; ++i) new (tx_flags_host + i) atomic_uint64_sys(0); - - uint64_t* rx_flags_dev = nullptr; - uint64_t* tx_flags_dev = nullptr; - CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_flags_dev, buf_rx, 0)); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&tx_flags_dev, buf_tx, 0)); - - uint8_t *rx_data_host, *rx_data_dev; - CUDA_CHECK(cudaHostAlloc(&rx_data_host, NUM_SLOTS * config.slot_size, cudaHostAllocMapped)); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&rx_data_dev, rx_data_host, 0)); - - g_sys_ctx.tx_flags_host = tx_flags_host; - g_sys_ctx.rx_data_host = rx_data_host; - g_sys_ctx.slot_size = config.slot_size; - - // Define the dynamic pool variables HERE so they live until the program exits - // Avoid 1ULL<<64 (UB); for 64 workers use all-ones mask. - uint64_t initial_idle = (config.num_predecoders >= 64) - ? ~0ULL - : ((1ULL << config.num_predecoders) - 1); - atomic_uint64_sys idle_mask(initial_idle); - std::vector inflight_slot_tags(config.num_predecoders, 0); - std::vector debug_poll_ts_arr(NUM_SLOTS, 0); - std::vector debug_worker_done_ts_arr(NUM_SLOTS, 0); - - WorkerPoolContext pool_ctx; - pool_ctx.tx_flags = tx_flags_host; - pool_ctx.idle_mask = &idle_mask; - pool_ctx.inflight_slot_tags = inflight_slot_tags.data(); - pool_ctx.debug_poll_ts = debug_poll_ts_arr.data(); - pool_ctx.debug_worker_done_ts = debug_worker_done_ts_arr.data(); - - // ========================================================================= - // Mailbox & Dispatcher Setup (mode-dependent) - // ========================================================================= - - void** h_mailbox_bank = nullptr; - void** d_mailbox_bank = nullptr; - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); - CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); - - std::vector predecoder_streams; - for (int i = 0; i < config.num_predecoders; ++i) { - cudaStream_t s; - CUDA_CHECK(cudaStreamCreate(&s)); - predecoder_streams.push_back(s); - } - - std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs (host-launch)...\n"; - cudaStream_t capture_stream; - CUDA_CHECK(cudaStreamCreate(&capture_stream)); - - std::vector> predecoders; - bool need_save = (model_path == onnx_file); - const int predecoder_queue_depth = 1; - for (int i = 0; i < config.num_predecoders; ++i) { - std::string save_path = (need_save && i == 0) ? engine_file : ""; - auto pd = std::make_unique(model_path, d_mailbox_bank + i, - predecoder_queue_depth, - save_path); - - std::cout << "[Setup] Decoder " << i - << ": input_size=" << pd->get_input_size() - << " output_size=" << pd->get_output_size() << "\n"; - - pd->capture_graph(capture_stream, false /* host-launch */); - - predecoders.push_back(std::move(pd)); - } - - std::cout << "[Setup] Host-side dispatcher will be launched in streaming test.\n"; - - std::atomic system_stop{false}; - std::atomic total_claimed{0}; - - std::cout << "[Setup] Booting " << config.num_workers << " Dedicated Polling/Worker Threads...\n"; - std::vector worker_threads; - for (int i = 0; i < config.num_workers; ++i) { - worker_threads.emplace_back([i, &predecoders, &decoder_ctx, &system_stop, &pool_ctx, &total_claimed]() { - int target_core = 10 + i; - pin_current_thread_to_core(target_core); - - AIPreDecoderService* pd_ptr = predecoders[i].get(); - - nvtxRangePushA("Worker Loop"); - PreDecoderJob job; - while (!system_stop.load(std::memory_order_relaxed)) { - // Wait for GPU to set ready flag to 1 - if (pd_ptr->poll_next_job(job)) { - nvtxRangePushA("Process Job"); - - total_claimed.fetch_add(1, std::memory_order_relaxed); - - if (pool_ctx.inflight_slot_tags) { - job.origin_slot = pool_ctx.inflight_slot_tags[i]; - } else { - job.origin_slot = static_cast(((uint8_t*)job.ring_buffer_ptr - g_sys_ctx.rx_data_host) / g_sys_ctx.slot_size); - } - - pymatching_worker_task(job, i, pd_ptr, &decoder_ctx, &pool_ctx); - nvtxRangePop(); // Process Job - } else { - QEC_CPU_RELAX(); - } - } - nvtxRangePop(); // Worker Loop - }); + + realtime_ns::PipelineStageConfig stage_cfg; + stage_cfg.num_workers = config.num_workers; + stage_cfg.num_slots = NUM_SLOTS; + stage_cfg.slot_size = config.slot_size; + stage_cfg.cores = {.dispatcher = 2, .consumer = 4, .worker_base = 10}; + + realtime_ns::RealtimePipeline pipeline(stage_cfg); + + // --- GPU stage factory --- + pipeline.set_gpu_stage([&](int w) -> realtime_ns::GpuWorkerResources { + return { + .graph_exec = predecoders[w]->get_executable_graph(), + .stream = predecoder_streams[w], + .pre_launch_fn = pre_launch_input_copy, + .pre_launch_data = &pre_launch_ctxs[w], + .function_id = function_ids[w], + .user_context = &worker_ctxs[w] + }; + }); + + // --- CPU stage callback (poll + PyMatching decode) --- + // Called repeatedly by the pipeline's worker thread. + // Returns 0 if GPU isn't ready, >0 when a job was processed. + pipeline.set_cpu_stage([](const realtime_ns::CpuStageContext& ctx) -> size_t { + auto* wctx = static_cast(ctx.user_context); + auto* pd = wctx->predecoder; + auto* dctx = wctx->decoder_ctx; + + PreDecoderJob job; + if (!pd->poll_next_job(job)) + return 0; // GPU not done yet + + using hrclock = std::chrono::high_resolution_clock; + auto worker_start = hrclock::now(); + + int total_corrections = 0; + bool all_converged = true; + + auto decode_start = hrclock::now(); +#if !defined(DISABLE_PYMATCHING) + const int32_t* residual = static_cast(job.inference_data); + auto* my_decoder = dctx->acquire_decoder(); + + cudaqx::tensor syndrome_tensor({(size_t)dctx->z_stabilizers}); + uint8_t* syn_data = syndrome_tensor.data(); + + for (int s = 0; s < dctx->spatial_slices; ++s) { + const int32_t* slice = residual + s * dctx->z_stabilizers; + for (int i = 0; i < dctx->z_stabilizers; ++i) + syn_data[i] = static_cast(slice[i]); + + auto result = my_decoder->decode(syndrome_tensor); + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) total_corrections++; + } +#endif + auto decode_end = hrclock::now(); + + // Write RPC response into ring buffer slot + DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; + char* response_payload = (char*)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); + std::memcpy(response_payload, &resp, sizeof(resp)); + + auto* header = static_cast(job.ring_buffer_ptr); + header->magic = realtime_ns::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp); + + pd->release_job(job.slot_idx); + + auto worker_end = hrclock::now(); + auto decode_us = std::chrono::duration_cast( + decode_end - decode_start).count(); + auto worker_us = std::chrono::duration_cast( + worker_end - worker_start).count(); + dctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); + dctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); + dctx->decode_count.fetch_add(1, std::memory_order_relaxed); + + return 1; + }); + + // --- Completion callback (record timestamps) --- + const int max_requests = 500000; + std::vector submit_ts(max_requests); + std::vector complete_ts(max_requests); + std::vector completed(max_requests, false); + + pipeline.set_completion_handler([&](const realtime_ns::Completion& c) { + if (c.request_id < static_cast(max_requests)) { + complete_ts[c.request_id] = hrclock::now(); + completed[c.request_id] = c.success; + } + }); + + // ========================================================================= + // Start pipeline and run producer + // ========================================================================= + + std::cout << "[Setup] Starting pipeline...\n"; + pipeline.start(); + + auto run_deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(scfg.duration_s); + + std::string rate_label = (scfg.rate_us > 0) + ? std::to_string(scfg.rate_us) + " us" : "open-loop"; + + std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" + << " Rate: " << rate_label << "\n" + << " Duration: " << scfg.duration_s << " s\n" + << " Warmup: " << scfg.warmup_count << " requests\n" + << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" + << " Max reqs: " << max_requests << "\n\n" << std::flush; + + // --- Producer loop (runs on main thread) --- + std::mt19937 rng(42); + const size_t payload_bytes = std::min( + config.input_bytes(), + config.slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); + std::vector payload_buf(CUDAQ_RPC_HEADER_SIZE + payload_bytes); + int req_id = 0; + int target = 0; + + while (std::chrono::steady_clock::now() < run_deadline + && req_id < max_requests) { + + int32_t* payload = reinterpret_cast( + payload_buf.data() + CUDAQ_RPC_HEADER_SIZE); + int fill_elems = static_cast(payload_bytes / sizeof(int32_t)); + fill_measurement_payload(payload, fill_elems, rng, 0.01); + + std::string func = "predecode_target_" + std::to_string(target); + uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); + + submit_ts[req_id] = hrclock::now(); + pipeline.submit(fid, payload, static_cast(payload_bytes), + static_cast(req_id)); + + target = (target + 1) % config.num_predecoders; + req_id++; + + if (scfg.rate_us > 0) { + auto target_time = submit_ts[req_id - 1] + + std::chrono::microseconds(scfg.rate_us); + while (hrclock::now() < target_time) + QEC_CPU_RELAX(); + } } - - // ========================================================================= - // Streaming test - // ========================================================================= - run_streaming_test(config, stream_cfg, - rx_data_host, rx_data_dev, rx_flags_host, tx_flags_host, - decoder_ctx, predecoders, system_stop, - h_mailbox_bank, predecoder_streams, &pool_ctx, &total_claimed); - - // Teardown - std::cout << "[Teardown] Shutting down...\n"; - system_stop = true; - - for (auto& t : worker_threads) { - if (t.joinable()) t.join(); - } - CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - - for (auto& s : predecoder_streams) { - cudaStreamSynchronize(s); - cudaStreamDestroy(s); - } - - // Explicitly call destructors for libcu++ atomics before freeing memory - for (size_t i = 0; i < NUM_SLOTS; ++i) { - rx_flags_host[i].~atomic_uint64_sys(); - tx_flags_host[i].~atomic_uint64_sys(); - } - - cudaFreeHost(buf_rx); - cudaFreeHost(buf_tx); - cudaFreeHost(rx_data_host); - cudaFreeHost(h_mailbox_bank); - cudaStreamDestroy(capture_stream); - - std::cout << "Done.\n"; - return 0; - } \ No newline at end of file + + // --- Shutdown --- + pipeline.stop(); + + // ========================================================================= + // Report + // ========================================================================= + + auto final_stats = pipeline.stats(); + uint64_t nsub = final_stats.submitted; + uint64_t ncomp = final_stats.completed; + + if (ncomp < nsub) + std::cerr << " [WARN] " << (nsub - ncomp) + << " requests did not complete.\n"; + + int warmup = std::min(scfg.warmup_count, static_cast(nsub)); + std::vector latencies; + latencies.reserve(nsub - warmup); + + for (uint64_t i = warmup; i < nsub; ++i) { + if (!completed[i]) continue; + auto dt = std::chrono::duration_cast>( + complete_ts[i] - submit_ts[i]); + latencies.push_back(dt.count()); + } + + std::sort(latencies.begin(), latencies.end()); + + auto pct = [&](double p) -> double { + if (latencies.empty()) return 0; + double idx = (p / 100.0) * (latencies.size() - 1); + size_t lo = (size_t)idx; + size_t hi = std::min(lo + 1, latencies.size() - 1); + double frac = idx - lo; + return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; + }; + + double mean = 0; + for (auto v : latencies) mean += v; + mean = latencies.empty() ? 0 : mean / latencies.size(); + + double stddev = 0; + for (auto v : latencies) stddev += (v - mean) * (v - mean); + stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); + + auto wall_us = std::chrono::duration_cast>( + std::chrono::steady_clock::now() - + (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); + double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; + + double actual_rate = (nsub > 1) + ? std::chrono::duration_cast>( + submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) + : 0; + + std::cout << std::fixed; + std::cout << "\n================================================================\n"; + std::cout << " Streaming Benchmark: " << config.label << "\n"; + std::cout << "================================================================\n"; + std::cout << " Submitted: " << nsub << "\n"; + std::cout << " Completed: " << ncomp << "\n"; + std::cout << std::setprecision(1); + std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; + std::cout << " Throughput: " << throughput << " req/s\n"; + std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; + std::cout << " Backpressure stalls:" << std::setw(8) + << final_stats.backpressure_stalls << "\n"; + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Latency (us) [steady-state, " << latencies.size() + << " requests after " << warmup << " warmup]\n"; + if (!latencies.empty()) { + std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; + std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; + std::cout << " mean = " << std::setw(10) << mean << "\n"; + std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; + std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; + std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; + std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; + std::cout << " stddev = " << std::setw(10) << stddev << "\n"; + } + + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; + std::cout << " PyMatching decode: " << std::setw(9) << avg_decode << " us\n"; + std::cout << " Total worker: " << std::setw(9) << avg_worker << " us\n"; + std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us\n"; + } + + std::cout << " ---------------------------------------------------------------\n"; + std::cout << " Host dispatcher processed " << final_stats.dispatched << " packets.\n"; + std::cout << "================================================================\n"; + + // --- Cleanup --- + std::cout << "[Teardown] Shutting down...\n"; + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + for (auto& s : predecoder_streams) { + cudaStreamSynchronize(s); + cudaStreamDestroy(s); + } + cudaFreeHost(h_mailbox_bank); + cudaStreamDestroy(capture_stream); + + std::cout << "Done.\n"; + return 0; +} diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index e3c4c1bc..4b5db8bb 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -286,6 +286,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) cudaq-realtime cudaq-realtime-host-dispatch cudaq-realtime-dispatch + cudaq-realtime-pipeline cudaq-qec cudaq::cudaq ) diff --git a/realtime/include/cudaq/realtime/pipeline.h b/realtime/include/cudaq/realtime/pipeline.h new file mode 100644 index 00000000..e3645a56 --- /dev/null +++ b/realtime/include/cudaq/realtime/pipeline.h @@ -0,0 +1,138 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cudaq::realtime { + +// --------------------------------------------------------------------------- +// Configuration +// --------------------------------------------------------------------------- + +struct CorePinning { + int dispatcher = -1; // -1 = no pinning + int consumer = -1; + int worker_base = -1; // workers pin to base, base+1, ... +}; + +struct PipelineStageConfig { + int num_workers = 8; + int num_slots = 32; + size_t slot_size = 16384; + CorePinning cores; +}; + +// --------------------------------------------------------------------------- +// GPU Stage Factory +// --------------------------------------------------------------------------- + +struct GpuWorkerResources { + cudaGraphExec_t graph_exec = nullptr; + cudaStream_t stream = nullptr; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; + void* pre_launch_data = nullptr; + uint32_t function_id = 0; + void* user_context = nullptr; +}; + +/// Called once per worker during start(). Returns GPU resources for that worker. +using GpuStageFactory = std::function; + +// --------------------------------------------------------------------------- +// CPU Stage Callback +// --------------------------------------------------------------------------- + +/// Passed to the user's CPU stage callback on each completed GPU inference. +/// The user reads inference_output, does post-processing, and writes the +/// result into response_buffer. No atomics are exposed. +struct CpuStageContext { + int worker_id; + int origin_slot; + const void* inference_output; + size_t output_size; + void* response_buffer; + size_t max_response_size; + void* user_context; +}; + +/// Returns the number of bytes written into response_buffer. +using CpuStageCallback = std::function; + +// --------------------------------------------------------------------------- +// Completion Callback +// --------------------------------------------------------------------------- + +struct Completion { + uint64_t request_id; + int slot; + bool success; + int cuda_error; // 0 on success +}; + +/// Called by the consumer thread for each completed (or errored) request. +using CompletionCallback = std::function; + +// --------------------------------------------------------------------------- +// Pipeline +// --------------------------------------------------------------------------- + +class RealtimePipeline { +public: + explicit RealtimePipeline(const PipelineStageConfig& config); + ~RealtimePipeline(); + + RealtimePipeline(const RealtimePipeline&) = delete; + RealtimePipeline& operator=(const RealtimePipeline&) = delete; + + /// Register the GPU stage factory (called before start). + void set_gpu_stage(GpuStageFactory factory); + + /// Register the CPU worker callback (called before start). + void set_cpu_stage(CpuStageCallback callback); + + /// Register the completion callback (called before start). + void set_completion_handler(CompletionCallback handler); + + /// Allocate resources, build dispatcher config, spawn all threads. + void start(); + + /// Signal shutdown, join all threads, free resources. + void stop(); + + /// Try to submit a request. Returns true if accepted, false if + /// backpressure (all slots busy). Non-blocking. + bool try_submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id); + + /// Blocking submit: spins until a slot becomes available. + void submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id); + + struct Stats { + uint64_t submitted; + uint64_t completed; + uint64_t dispatched; + uint64_t backpressure_stalls; + }; + + /// Thread-safe, lock-free stats snapshot. + Stats stats() const; + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace cudaq::realtime diff --git a/realtime/lib/CMakeLists.txt b/realtime/lib/CMakeLists.txt index 916f5e39..1f3a26be 100644 --- a/realtime/lib/CMakeLists.txt +++ b/realtime/lib/CMakeLists.txt @@ -15,3 +15,4 @@ install(DIRECTORY ${CUDAQ_REALTIME_INCLUDE_DIR}/cudaq ) add_subdirectory(daemon) +add_subdirectory(pipeline) diff --git a/realtime/lib/pipeline/CMakeLists.txt b/realtime/lib/pipeline/CMakeLists.txt new file mode 100644 index 00000000..7c23beea --- /dev/null +++ b/realtime/lib/pipeline/CMakeLists.txt @@ -0,0 +1,38 @@ +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +if(CUDA_FOUND) + add_library(cudaq-realtime-pipeline SHARED + realtime_pipeline.cu + ) + + target_include_directories(cudaq-realtime-pipeline + PUBLIC + $ + $ + ) + + target_link_libraries(cudaq-realtime-pipeline + PUBLIC + CUDA::cudart_static + PRIVATE + cudaq-realtime + cudaq-realtime-host-dispatch + ) + + set_target_properties(cudaq-realtime-pipeline PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime-pipeline + COMPONENT realtime-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() diff --git a/realtime/lib/pipeline/realtime_pipeline.cu b/realtime/lib/pipeline/realtime_pipeline.cu new file mode 100644 index 00000000..b6dfffed --- /dev/null +++ b/realtime/lib/pipeline/realtime_pipeline.cu @@ -0,0 +1,525 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include "cudaq/realtime/pipeline.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudaq::realtime { + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +#define PIPELINE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "RealtimePipeline CUDA error: " \ + << cudaGetErrorString(err) << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + std::abort(); \ + } \ + } while (0) + +static void pin_thread(std::thread& t, int core) { + if (core < 0) return; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); +} + + +// --------------------------------------------------------------------------- +// RingBufferManager +// --------------------------------------------------------------------------- + +class RingBufferManager { +public: + RingBufferManager(size_t num_slots, size_t slot_size) + : num_slots_(num_slots), slot_size_(slot_size) + { + PIPELINE_CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + PIPELINE_CUDA_CHECK(cudaHostAlloc(&buf_rx_, + num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + rx_flags_ = static_cast(buf_rx_); + for (size_t i = 0; i < num_slots; ++i) + new (rx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostAlloc(&buf_tx_, + num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + tx_flags_ = static_cast(buf_tx_); + for (size_t i = 0; i < num_slots; ++i) + new (tx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_flags_dev_), buf_rx_, 0)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&tx_flags_dev_), buf_tx_, 0)); + + PIPELINE_CUDA_CHECK(cudaHostAlloc( + reinterpret_cast(&rx_data_host_), + num_slots * slot_size, cudaHostAllocMapped)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_data_dev_), rx_data_host_, 0)); + + rb_.rx_flags = reinterpret_cast(rx_flags_); + rb_.tx_flags = reinterpret_cast(tx_flags_); + rb_.rx_data = rx_data_dev_; + rb_.tx_data = rx_data_dev_; + rb_.rx_stride_sz = slot_size; + rb_.tx_stride_sz = slot_size; + rb_.rx_flags_host = reinterpret_cast(rx_flags_); + rb_.tx_flags_host = reinterpret_cast(tx_flags_); + rb_.rx_data_host = rx_data_host_; + rb_.tx_data_host = rx_data_host_; + } + + ~RingBufferManager() { + for (size_t i = 0; i < num_slots_; ++i) { + rx_flags_[i].~atomic_uint64_sys(); + tx_flags_[i].~atomic_uint64_sys(); + } + cudaFreeHost(buf_rx_); + cudaFreeHost(buf_tx_); + cudaFreeHost(rx_data_host_); + } + + bool slot_available(uint32_t slot) const { + return cudaq_host_ringbuffer_slot_available(&rb_, slot) != 0; + } + + void write_and_signal(uint32_t slot, uint32_t function_id, + const void* payload, uint32_t payload_len) { + cudaq_host_ringbuffer_write_rpc_request( + &rb_, slot, function_id, payload, payload_len); + cudaq_host_ringbuffer_signal_slot(&rb_, slot); + } + + cudaq_tx_status_t poll_tx(uint32_t slot, int* cuda_error) const { + return cudaq_host_ringbuffer_poll_tx_flag(&rb_, slot, cuda_error); + } + + void clear_slot(uint32_t slot) { + cudaq_host_ringbuffer_clear_slot(&rb_, slot); + } + + size_t num_slots() const { return num_slots_; } + size_t slot_size() const { return slot_size_; } + + atomic_uint64_sys* rx_flags() { return rx_flags_; } + atomic_uint64_sys* tx_flags() { return tx_flags_; } + uint8_t* rx_data_host() { return rx_data_host_; } + uint8_t* rx_data_dev() { return rx_data_dev_; } + const cudaq_ringbuffer_t& ringbuffer() const { return rb_; } + +private: + size_t num_slots_; + size_t slot_size_; + void* buf_rx_ = nullptr; + void* buf_tx_ = nullptr; + atomic_uint64_sys* rx_flags_ = nullptr; + atomic_uint64_sys* tx_flags_ = nullptr; + uint64_t* rx_flags_dev_ = nullptr; + uint64_t* tx_flags_dev_ = nullptr; + uint8_t* rx_data_host_ = nullptr; + uint8_t* rx_data_dev_ = nullptr; + cudaq_ringbuffer_t rb_{}; +}; + +// --------------------------------------------------------------------------- +// Impl +// --------------------------------------------------------------------------- + +struct RealtimePipeline::Impl { + PipelineStageConfig config; + + GpuStageFactory gpu_factory; + CpuStageCallback cpu_stage; + CompletionCallback completion_handler; + + // Owned infrastructure + std::unique_ptr ring; + void** h_mailbox_bank = nullptr; + void** d_mailbox_bank = nullptr; + + // Dispatcher state (hidden atomics) + atomic_int_sys shutdown_flag{0}; + uint64_t dispatcher_stats = 0; + atomic_uint64_sys live_dispatched{0}; + atomic_uint64_sys idle_mask{0}; + std::vector inflight_slot_tags; + + // Function table + std::vector function_table; + + // Per-worker GPU resources (from factory) + std::vector worker_resources; + + // Slot-to-request mapping (consumer-owned) + std::vector slot_request; + + // Stats (atomic counters) + std::atomic total_submitted{0}; + std::atomic total_completed{0}; + std::atomic backpressure_stalls{0}; + + // Thread coordination + std::atomic producer_stop{false}; + std::atomic consumer_stop{false}; + + // Threads + std::thread dispatcher_thread; + std::thread consumer_thread; + std::vector worker_threads; + + // Producer slot cursor + std::atomic next_slot{0}; + + bool started = false; + + // ----------------------------------------------------------------------- + // Lifecycle + // ----------------------------------------------------------------------- + + void allocate(const PipelineStageConfig& cfg) { + config = cfg; + + ring = std::make_unique( + static_cast(cfg.num_slots), cfg.slot_size); + + PIPELINE_CUDA_CHECK(cudaHostAlloc( + &h_mailbox_bank, cfg.num_workers * sizeof(void*), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, cfg.num_workers * sizeof(void*)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); + + inflight_slot_tags.resize(cfg.num_workers, 0); + slot_request.resize(cfg.num_slots, -1); + } + + void start_threads() { + const int nw = config.num_workers; + + // Build GPU resources via user factory + worker_resources.resize(nw); + function_table.resize(nw); + for (int i = 0; i < nw; ++i) { + worker_resources[i] = gpu_factory(i); + function_table[i].function_id = worker_resources[i].function_id; + function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_table[i].handler.graph_exec = worker_resources[i].graph_exec; + std::memset(&function_table[i].schema, 0, sizeof(function_table[i].schema)); + } + + // Initialize idle_mask with all workers free + uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1); + idle_mask.store(initial_idle, cuda::std::memory_order_release); + + // Build HostDispatcherConfig + HostDispatcherConfig disp_cfg; + disp_cfg.rx_flags = ring->rx_flags(); + disp_cfg.tx_flags = ring->tx_flags(); + disp_cfg.rx_data_host = ring->rx_data_host(); + disp_cfg.rx_data_dev = ring->rx_data_dev(); + disp_cfg.tx_data_host = nullptr; + disp_cfg.tx_data_dev = nullptr; + disp_cfg.tx_stride_sz = config.slot_size; + disp_cfg.h_mailbox_bank = h_mailbox_bank; + disp_cfg.num_slots = static_cast(config.num_slots); + disp_cfg.slot_size = config.slot_size; + disp_cfg.function_table = function_table.data(); + disp_cfg.function_table_count = static_cast(nw); + disp_cfg.shutdown_flag = &shutdown_flag; + disp_cfg.stats_counter = &dispatcher_stats; + disp_cfg.live_dispatched = &live_dispatched; + disp_cfg.idle_mask = &idle_mask; + disp_cfg.inflight_slot_tags = inflight_slot_tags.data(); + + disp_cfg.workers.resize(nw); + for (int i = 0; i < nw; ++i) { + disp_cfg.workers[i].graph_exec = worker_resources[i].graph_exec; + disp_cfg.workers[i].stream = worker_resources[i].stream; + disp_cfg.workers[i].function_id = worker_resources[i].function_id; + disp_cfg.workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; + disp_cfg.workers[i].pre_launch_data = worker_resources[i].pre_launch_data; + } + + // --- Dispatcher thread --- + dispatcher_thread = std::thread([cfg = std::move(disp_cfg)]() { + host_dispatcher_loop(cfg); + }); + pin_thread(dispatcher_thread, config.cores.dispatcher); + + // --- Worker threads --- + worker_threads.resize(nw); + for (int i = 0; i < nw; ++i) { + worker_threads[i] = std::thread([this, i]() { worker_loop(i); }); + int core = (config.cores.worker_base >= 0) + ? config.cores.worker_base + i : -1; + pin_thread(worker_threads[i], core); + } + + // --- Consumer thread --- + consumer_thread = std::thread([this]() { consumer_loop(); }); + pin_thread(consumer_thread, config.cores.consumer); + + started = true; + } + + void stop_all() { + if (!started) return; + + // Signal consumer to finish pending work + producer_stop.store(true, std::memory_order_release); + + // Grace period for in-flight requests + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); + while (total_completed.load(std::memory_order_relaxed) < + total_submitted.load(std::memory_order_relaxed) && + std::chrono::steady_clock::now() < deadline) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + + consumer_stop.store(true, std::memory_order_release); + + // Shut down dispatcher + shutdown_flag.store(1, cuda::std::memory_order_release); + dispatcher_thread.join(); + + // Consumer + consumer_thread.join(); + + // Workers check shutdown via consumer_stop (they spin on ready_flags, + // which will never fire after dispatcher is gone, so we need to break + // them out). We set consumer_stop which doubles as system_stop for + // workers; the user's poll_next_job must eventually return false. + for (auto& t : worker_threads) { + if (t.joinable()) t.join(); + } + + started = false; + } + + void free_resources() { + ring.reset(); + if (h_mailbox_bank) { + cudaFreeHost(h_mailbox_bank); + h_mailbox_bank = nullptr; + } + } + + // ----------------------------------------------------------------------- + // Submit + // ----------------------------------------------------------------------- + + bool try_submit_impl(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id) { + uint32_t slot = next_slot.load(std::memory_order_relaxed) % + static_cast(config.num_slots); + if (!ring->slot_available(slot)) + return false; + + ring->write_and_signal(slot, function_id, payload, + static_cast(payload_size)); + + slot_request[slot] = static_cast(request_id); + next_slot.fetch_add(1, std::memory_order_relaxed); + total_submitted.fetch_add(1, std::memory_order_release); + return true; + } + + // ----------------------------------------------------------------------- + // Worker loop (one per worker thread) + // ----------------------------------------------------------------------- + + void worker_loop(int worker_id) { + auto* wr = &worker_resources[worker_id]; + + // The cpu_stage callback is called in "poll mode" + // (inference_output == nullptr). It polls its own GPU-ready + // mechanism and, if a result is available, processes it and + // writes the RPC response. Returns 0 when nothing was ready, + // >0 when a job was completed. The pipeline then handles all + // atomic signaling (tx_flags, idle_mask). + + while (!consumer_stop.load(std::memory_order_relaxed)) { + CpuStageContext ctx; + ctx.worker_id = worker_id; + ctx.origin_slot = inflight_slot_tags[worker_id]; + ctx.inference_output = nullptr; + ctx.output_size = 0; + ctx.response_buffer = nullptr; + ctx.max_response_size = 0; + ctx.user_context = wr->user_context; + + size_t written = cpu_stage(ctx); + if (written == 0) { + QEC_CPU_RELAX(); + continue; + } + + int origin_slot = inflight_slot_tags[worker_id]; + + uint8_t* slot_host = ring->rx_data_host() + + static_cast(origin_slot) * config.slot_size; + uint64_t rx_value = reinterpret_cast(slot_host); + + ring->tx_flags()[origin_slot].store( + rx_value, cuda::std::memory_order_release); + + idle_mask.fetch_or(1ULL << worker_id, + cuda::std::memory_order_release); + } + } + + // ----------------------------------------------------------------------- + // Consumer loop + // ----------------------------------------------------------------------- + + void consumer_loop() { + const uint32_t ns = static_cast(config.num_slots); + + while (true) { + if (consumer_stop.load(std::memory_order_acquire)) + break; + + bool pdone = producer_stop.load(std::memory_order_acquire); + uint64_t nsub = total_submitted.load(std::memory_order_acquire); + uint64_t ncomp = total_completed.load(std::memory_order_relaxed); + + if (pdone && ncomp >= nsub) + break; + + bool found_any = false; + for (uint32_t s = 0; s < ns; ++s) { + if (slot_request[s] < 0) continue; + + int cuda_error = 0; + cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error); + + if (status == CUDAQ_TX_READY) { + int64_t rid = slot_request[s]; + if (rid >= 0 && completion_handler) { + Completion c; + c.request_id = static_cast(rid); + c.slot = static_cast(s); + c.success = true; + c.cuda_error = 0; + completion_handler(c); + } + total_completed.fetch_add(1, std::memory_order_relaxed); + + // ARM memory ordering: clear slot_request BEFORE + // clearing ring buffer flags, with a fence between. + slot_request[s] = -1; + __sync_synchronize(); + ring->clear_slot(s); + found_any = true; + + } else if (status == CUDAQ_TX_ERROR) { + int64_t rid = slot_request[s]; + if (rid >= 0 && completion_handler) { + Completion c; + c.request_id = static_cast(rid); + c.slot = static_cast(s); + c.success = false; + c.cuda_error = cuda_error; + completion_handler(c); + } + total_completed.fetch_add(1, std::memory_order_relaxed); + slot_request[s] = -1; + __sync_synchronize(); + ring->clear_slot(s); + found_any = true; + } + } + + if (!found_any) + QEC_CPU_RELAX(); + } + } +}; + +// --------------------------------------------------------------------------- +// RealtimePipeline public API +// --------------------------------------------------------------------------- + +RealtimePipeline::RealtimePipeline(const PipelineStageConfig& config) + : impl_(std::make_unique()) +{ + impl_->allocate(config); +} + +RealtimePipeline::~RealtimePipeline() { + if (impl_->started) + impl_->stop_all(); + impl_->free_resources(); +} + +void RealtimePipeline::set_gpu_stage(GpuStageFactory factory) { + impl_->gpu_factory = std::move(factory); +} + +void RealtimePipeline::set_cpu_stage(CpuStageCallback callback) { + impl_->cpu_stage = std::move(callback); +} + +void RealtimePipeline::set_completion_handler(CompletionCallback handler) { + impl_->completion_handler = std::move(handler); +} + +void RealtimePipeline::start() { + if (impl_->started) return; + impl_->start_threads(); +} + +void RealtimePipeline::stop() { + impl_->stop_all(); +} + +bool RealtimePipeline::try_submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id) { + return impl_->try_submit_impl(function_id, payload, payload_size, request_id); +} + +void RealtimePipeline::submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id) { + while (!try_submit(function_id, payload, payload_size, request_id)) { + impl_->backpressure_stalls.fetch_add(1, std::memory_order_relaxed); + QEC_CPU_RELAX(); + } +} + +RealtimePipeline::Stats RealtimePipeline::stats() const { + return { + impl_->total_submitted.load(std::memory_order_relaxed), + impl_->total_completed.load(std::memory_order_relaxed), + impl_->live_dispatched.load(cuda::std::memory_order_relaxed), + impl_->backpressure_stalls.load(std::memory_order_relaxed) + }; +} + +} // namespace cudaq::realtime From b03bf1ee7a4c209b4b386a31d1ea7b14370849fe Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Tue, 3 Mar 2026 20:39:12 +0000 Subject: [PATCH 23/40] Add GTest suite for realtime pipeline with SKIP_TRT identity passthrough 21 tests covering AIDecoderService, AIPreDecoderService, and the host-side dispatcher. Correctness tests push 5,000 random 6.4 KB payloads through the full CUDA graph pipeline and verify bitwise identity. Integration tests exercise multi-predecoder concurrency and sustained throughput (200 requests, regression for the 128-launch limit fix). SKIP_TRT buffer size increased to 1600 floats to match realistic syndrome payload sizes. Signed-off-by: Scott Thornton --- libs/qec/lib/realtime/ai_decoder_service.cu | 4 +- libs/qec/unittests/CMakeLists.txt | 162 +++- libs/qec/unittests/test_realtime_pipeline.cu | 785 +++++++++++++++++++ 3 files changed, 916 insertions(+), 35 deletions(-) create mode 100644 libs/qec/unittests/test_realtime_pipeline.cu diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index 10740236..f6b2155d 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -99,8 +99,8 @@ AIDecoderService::AIDecoderService(const std::string& model_path, void** device_ : device_mailbox_slot_(device_mailbox_slot) { if (std::getenv("SKIP_TRT")) { - input_size_ = 16 * sizeof(float); - output_size_ = 16 * sizeof(float); + input_size_ = 1600 * sizeof(float); + output_size_ = 1600 * sizeof(float); allocate_resources(); } else { std::string ext = model_path.substr(model_path.find_last_of('.')); diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 4b5db8bb..7355f057 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -169,46 +169,142 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) if(_have_realtime_for_tests) - add_executable(test_realtime_decoding - ${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/test_realtime_decoding.cu - ) + # TODO: Re-enable once libcudaq-realtime-host-dispatch.so RPATH is resolved + # add_executable(test_realtime_decoding + # ${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/test_realtime_decoding.cu + # ) + # + # set_target_properties(test_realtime_decoding PROPERTIES + # CUDA_SEPARABLE_COMPILATION ON + # CUDA_RESOLVE_DEVICE_SYMBOLS ON + # CUDA_STANDARD 17 + # ) + # + # target_include_directories(test_realtime_decoding PRIVATE + # ${CUDAToolkit_INCLUDE_DIRS} + # ${CMAKE_CURRENT_SOURCE_DIR}/../include + # ${CMAKE_SOURCE_DIR}/libs/core/include + # ${CUDAQ_REALTIME_INCLUDE_DIR} + # ) + # + # target_compile_definitions(test_realtime_decoding PRIVATE + # TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/data" + # ) + # + # target_link_libraries(test_realtime_decoding PRIVATE + # GTest::gtest_main + # CUDA::cudart + # cudaq-qec-realtime-cudevice + # ${CUDAQ_REALTIME_LIBRARY} + # ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + # ) + # + # get_filename_component(CUDAQ_REALTIME_LIB_DIR "${CUDAQ_REALTIME_LIBRARY}" DIRECTORY) + # set_target_properties(test_realtime_decoding PROPERTIES + # BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + # INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" + # ) + # + # add_dependencies(CUDAQXQECUnitTests test_realtime_decoding) + # gtest_discover_tests(test_realtime_decoding + # TEST_PREFIX "test_realtime_decoding." + # ) - set_target_properties(test_realtime_decoding PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON - CUDA_STANDARD 17 - ) + get_filename_component(CUDAQ_REALTIME_LIB_DIR "${CUDAQ_REALTIME_LIBRARY}" DIRECTORY) - target_include_directories(test_realtime_decoding PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_SOURCE_DIR}/libs/core/include - ${CUDAQ_REALTIME_INCLUDE_DIR} + # ---------------------------------------------------------------- + # Realtime pipeline unit tests (SKIP_TRT passthrough at runtime; + # still needs TRT headers+libs at compile/link time) + # ---------------------------------------------------------------- + find_path(TENSORRT_INCLUDE_DIR_FOR_PIPELINE NvInfer.h + PATHS + ${TENSORRT_ROOT}/include + /usr/include/x86_64-linux-gnu + /usr/local/cuda/include + /usr/local/tensorrt/include + /opt/tensorrt/include + NO_DEFAULT_PATH ) - - target_compile_definitions(test_realtime_decoding PRIVATE - TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/decoders/realtime/data" + find_library(TENSORRT_LIBRARY_FOR_PIPELINE nvinfer + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib ) - - target_link_libraries(test_realtime_decoding PRIVATE - GTest::gtest_main - CUDA::cudart - cudaq-qec-realtime-cudevice - ${CUDAQ_REALTIME_LIBRARY} - ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + find_library(TENSORRT_ONNX_PARSER_FOR_PIPELINE nvonnxparser + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib ) - # Ensure runtime can locate libcudaq-realtime.so - get_filename_component(CUDAQ_REALTIME_LIB_DIR "${CUDAQ_REALTIME_LIBRARY}" DIRECTORY) - set_target_properties(test_realtime_decoding PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR}" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR}" - ) + if(TENSORRT_INCLUDE_DIR_FOR_PIPELINE AND TENSORRT_LIBRARY_FOR_PIPELINE AND TENSORRT_ONNX_PARSER_FOR_PIPELINE) + get_filename_component(_cuda_bin_pipe "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root_pipe "${_cuda_bin_pipe}" DIRECTORY) + set(_cuda_cccl_include_pipe "${_cuda_root_pipe}/include/cccl") + + set(_realtime_pipeline_includes "") + if(NOT _predecoder_use_in_tree_realtime) + set(_realtime_include_pipe "${CMAKE_SOURCE_DIR}/realtime/include") + if(EXISTS "${_realtime_include_pipe}/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h") + list(APPEND _realtime_pipeline_includes "${_realtime_include_pipe}") + endif() + endif() + + add_executable(test_realtime_pipeline + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu + ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu + ${CMAKE_CURRENT_SOURCE_DIR}/test_realtime_pipeline.cu + ) + + set_target_properties(test_realtime_pipeline PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_STANDARD 17 + LINKER_LANGUAGE CUDA + ) + + target_include_directories(test_realtime_pipeline PRIVATE + ${_cuda_cccl_include_pipe} + ${CUDAToolkit_INCLUDE_DIRS} + ${TENSORRT_INCLUDE_DIR_FOR_PIPELINE} + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_SOURCE_DIR}/libs/core/include + ${_realtime_pipeline_includes} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + + if(_predecoder_use_in_tree_realtime) + target_link_libraries(test_realtime_pipeline PRIVATE + GTest::gtest_main + CUDA::cudart + ${TENSORRT_LIBRARY_FOR_PIPELINE} + ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} + cudaq-realtime + cudaq-realtime-host-dispatch + cudaq-realtime-dispatch + ) + set_target_properties(test_realtime_pipeline PROPERTIES + BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" + INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" + ) + else() + target_link_libraries(test_realtime_pipeline PRIVATE + GTest::gtest_main + CUDA::cudart + ${TENSORRT_LIBRARY_FOR_PIPELINE} + ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ) + set_target_properties(test_realtime_pipeline PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) + endif() + + add_dependencies(CUDAQXQECUnitTests test_realtime_pipeline) + gtest_discover_tests(test_realtime_pipeline + TEST_PREFIX "test_realtime_pipeline." + ) + else() + message(WARNING "TensorRT not found. Skipping test_realtime_pipeline (needs NvInfer.h + TRT libs for compile/link).") + endif() - add_dependencies(CUDAQXQECUnitTests test_realtime_decoding) - gtest_discover_tests(test_realtime_decoding - TEST_PREFIX "test_realtime_decoding." - ) # Hybrid AI predecoder + PyMatching pipeline test # Requires TensorRT + ONNX parser for building engines from ONNX models find_path(TENSORRT_INCLUDE_DIR NvInfer.h diff --git a/libs/qec/unittests/test_realtime_pipeline.cu b/libs/qec/unittests/test_realtime_pipeline.cu new file mode 100644 index 00000000..6c25de9e --- /dev/null +++ b/libs/qec/unittests/test_realtime_pipeline.cu @@ -0,0 +1,785 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ + } while (0) + +namespace { + +using namespace cudaq::qec; +namespace rt = cudaq::realtime; + +static constexpr size_t kSkipTrtFloats = 1600; +static constexpr size_t kSkipTrtBytes = kSkipTrtFloats * sizeof(float); +static constexpr size_t kSlotSize = 8192; +static constexpr size_t kNumSlots = 8; +static constexpr uint32_t kTestFunctionId = rt::fnv1a_hash("test_predecoder"); + +// ============================================================================ +// Pre-launch DMA callback (mirrors production code) +// ============================================================================ + +struct PreLaunchCopyCtx { + void* d_trt_input; + size_t input_size; + void** h_ring_ptrs; +}; + +static void pre_launch_input_copy(void* user_data, void* slot_dev, + cudaStream_t stream) { + auto* ctx = static_cast(user_data); + ctx->h_ring_ptrs[0] = slot_dev; + cudaMemcpyAsync(ctx->d_trt_input, + static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, + ctx->input_size, cudaMemcpyDeviceToDevice, stream); +} + +// ============================================================================ +// Ring buffer helpers (mapped pinned memory) +// ============================================================================ + +static bool allocate_mapped_buffer(size_t size, uint8_t** host_out, + uint8_t** dev_out) { + void* h = nullptr; + if (cudaHostAlloc(&h, size, cudaHostAllocMapped) != cudaSuccess) + return false; + void* d = nullptr; + if (cudaHostGetDevicePointer(&d, h, 0) != cudaSuccess) { + cudaFreeHost(h); + return false; + } + std::memset(h, 0, size); + *host_out = static_cast(h); + *dev_out = static_cast(d); + return true; +} + +static void free_mapped_buffer(uint8_t* host_ptr) { + if (host_ptr) + cudaFreeHost(host_ptr); +} + +// ============================================================================ +// Write an RPC request (RPCHeader + payload) into a mapped buffer slot +// ============================================================================ + +static void write_rpc_slot(uint8_t* slot_host, uint32_t function_id, + const void* payload, size_t payload_len) { + rt::RPCHeader hdr; + hdr.magic = rt::RPC_MAGIC_REQUEST; + hdr.function_id = function_id; + hdr.arg_len = static_cast(payload_len); + std::memcpy(slot_host, &hdr, sizeof(hdr)); + if (payload && payload_len > 0) + std::memcpy(slot_host + sizeof(hdr), payload, payload_len); +} + +// ============================================================================ +// Test Fixture +// ============================================================================ + +class RealtimePipelineTest : public ::testing::Test { +protected: + void SetUp() override { + setenv("SKIP_TRT", "1", 1); + + ASSERT_TRUE(allocate_mapped_buffer( + kNumSlots * sizeof(uint64_t), &rx_flags_host_, &rx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer( + kNumSlots * sizeof(uint64_t), &tx_flags_host_, &tx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer( + kNumSlots * kSlotSize, &rx_data_host_, &rx_data_dev_)); + ASSERT_TRUE(allocate_mapped_buffer( + kNumSlots * kSlotSize, &tx_data_host_, &tx_data_dev_)); + + CUDA_CHECK(cudaHostAlloc(&mailbox_bank_host_, + kMaxWorkers * sizeof(void*), + cudaHostAllocMapped)); + std::memset(mailbox_bank_host_, 0, kMaxWorkers * sizeof(void*)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&mailbox_bank_dev_), + mailbox_bank_host_, 0)); + + CUDA_CHECK(cudaStreamCreate(&stream_)); + } + + void TearDown() override { + if (stream_) + cudaStreamDestroy(stream_); + if (mailbox_bank_host_) + cudaFreeHost(mailbox_bank_host_); + free_mapped_buffer(rx_flags_host_); + free_mapped_buffer(tx_flags_host_); + free_mapped_buffer(rx_data_host_); + free_mapped_buffer(tx_data_host_); + unsetenv("SKIP_TRT"); + } + + std::unique_ptr + create_predecoder(int mailbox_idx) { + auto pd = std::make_unique( + "dummy.onnx", + reinterpret_cast(mailbox_bank_dev_ + mailbox_idx), + 1); + pd->capture_graph(stream_, false); + EXPECT_EQ(cudaStreamSynchronize(stream_), cudaSuccess); + return pd; + } + + void submit_rpc_to_slot(size_t slot, uint32_t function_id, + const void* payload, size_t payload_len) { + uint8_t* slot_host = rx_data_host_ + slot * kSlotSize; + write_rpc_slot(slot_host, function_id, payload, payload_len); + auto* flags = reinterpret_cast(rx_flags_host_); + flags[slot].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); + } + + bool wait_ready_flag(AIPreDecoderService* pd, int timeout_ms = 2000) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + auto* flags = pd->get_host_ready_flags(); + int val = flags[0].load(cuda::std::memory_order_acquire); + if (val >= 1) + return true; + usleep(100); + } + return false; + } + + static constexpr size_t kMaxWorkers = 8; + + uint8_t* rx_flags_host_ = nullptr; + uint8_t* rx_flags_dev_ = nullptr; + uint8_t* tx_flags_host_ = nullptr; + uint8_t* tx_flags_dev_ = nullptr; + uint8_t* rx_data_host_ = nullptr; + uint8_t* rx_data_dev_ = nullptr; + uint8_t* tx_data_host_ = nullptr; + uint8_t* tx_data_dev_ = nullptr; + void** mailbox_bank_host_ = nullptr; + void** mailbox_bank_dev_ = nullptr; + cudaStream_t stream_ = nullptr; +}; + +// ============================================================================ +// AIDecoderService Unit Tests (SKIP_TRT) +// ============================================================================ + +TEST_F(RealtimePipelineTest, SkipTrtSizes) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_input_size(), kSkipTrtBytes); + EXPECT_EQ(svc.get_output_size(), kSkipTrtBytes); +} + +TEST_F(RealtimePipelineTest, SkipTrtBuffersAllocated) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_NE(svc.get_trt_input_ptr(), nullptr); +} + +TEST_F(RealtimePipelineTest, SkipTrtGraphExecNull_BeforeCapture) { + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_executable_graph(), nullptr); +} + +// ============================================================================ +// AIPreDecoderService Unit Tests (SKIP_TRT) +// ============================================================================ + +TEST_F(RealtimePipelineTest, PreDecoderConstruction) { + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_host_ready_flags(), nullptr); + EXPECT_NE(pd->get_host_ring_ptrs(), nullptr); + EXPECT_EQ(pd->get_queue_depth(), 1); + EXPECT_EQ(pd->get_input_size(), kSkipTrtBytes); + EXPECT_EQ(pd->get_output_size(), kSkipTrtBytes); +} + +TEST_F(RealtimePipelineTest, PreDecoderGraphCaptured) { + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_executable_graph(), nullptr); +} + +TEST_F(RealtimePipelineTest, PollReturnsFalseWhenIdle) { + auto pd = create_predecoder(0); + PreDecoderJob job{}; + EXPECT_FALSE(pd->poll_next_job(job)); +} + +TEST_F(RealtimePipelineTest, PollAndRelease) { + auto pd = create_predecoder(0); + + auto* flags = pd->get_host_ready_flags(); + flags[0].store(1, cuda::std::memory_order_release); + + PreDecoderJob job{}; + EXPECT_TRUE(pd->poll_next_job(job)); + EXPECT_EQ(job.slot_idx, 0); + EXPECT_NE(job.inference_data, nullptr); + + int val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(val, 2); + + pd->release_job(0); + val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(val, 0); +} + +TEST_F(RealtimePipelineTest, GraphLaunchableFromHost) { + auto pd = create_predecoder(0); + cudaGraphExec_t exec = pd->get_executable_graph(); + ASSERT_NE(exec, nullptr); + + CUDA_CHECK(cudaGraphLaunch(exec, stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); +} + +// ============================================================================ +// Correctness Tests (Identity Passthrough) +// +// Data flow: payload -> (pre_launch DMA to d_trt_input_) -> +// passthrough_copy_kernel (identity) -> d_trt_output_ -> +// cudaMemcpyAsync -> d_outputs_ (mapped pinned) -> +// poll_next_job() -> inference_data +// ============================================================================ + +class CorrectnessTest : public RealtimePipelineTest { +protected: + void run_passthrough(AIPreDecoderService* pd, int mailbox_idx, + const float* payload, size_t num_floats, + float* output) { + size_t payload_bytes = num_floats * sizeof(float); + ASSERT_LE(payload_bytes, kSkipTrtBytes); + + uint8_t* slot_host = rx_data_host_; + write_rpc_slot(slot_host, kTestFunctionId, payload, payload_bytes); + + ptrdiff_t offset = slot_host - rx_data_host_; + void* slot_dev = static_cast(rx_data_dev_ + offset); + + PreLaunchCopyCtx ctx; + ctx.d_trt_input = pd->get_trt_input_ptr(); + ctx.input_size = pd->get_input_size(); + ctx.h_ring_ptrs = pd->get_host_ring_ptrs(); + + pre_launch_input_copy(&ctx, slot_dev, stream_); + CUDA_CHECK(cudaGraphLaunch(pd->get_executable_graph(), stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); + + ASSERT_TRUE(wait_ready_flag(pd)); + + PreDecoderJob job{}; + ASSERT_TRUE(pd->poll_next_job(job)); + std::memcpy(output, job.inference_data, payload_bytes); + pd->release_job(0); + } +}; + +TEST_F(CorrectnessTest, IdentityPassthrough_Zeros) { + auto pd = create_predecoder(0); + float input[kSkipTrtFloats] = {}; + float output[kSkipTrtFloats]; + std::memset(output, 0xFF, sizeof(output)); + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Zero payload should pass through unchanged"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_KnownPattern) { + auto pd = create_predecoder(0); + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = static_cast(i + 1); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Known pattern {1..16} should pass through unchanged"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_RandomData) { + auto pd = create_predecoder(0); + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1e6f, 1e6f); + + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Random payload should pass through bitwise-identical"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_MaxValues) { + auto pd = create_predecoder(0); + std::vector input(kSkipTrtFloats); + const float extremes[] = { + FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, + INFINITY, -INFINITY, NAN, 0.0f, + -0.0f, 1.0f, -1.0f, 1e-38f, + 1e38f, 3.14159265f, 2.71828183f, 0.5f + }; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = extremes[i % (sizeof(extremes) / sizeof(extremes[0]))]; + std::vector output(kSkipTrtFloats, 0.0f); + + run_passthrough(pd.get(), 0, input.data(), kSkipTrtFloats, output.data()); + EXPECT_EQ(std::memcmp(input.data(), output.data(), kSkipTrtBytes), 0) + << "Extreme float values should pass through bitwise-identical"; +} + +TEST_F(CorrectnessTest, IdentityPassthrough_MultipleRequests) { + auto pd = create_predecoder(0); + constexpr int kNumRequests = 5000; + std::mt19937 rng(123); + std::uniform_real_distribution dist(-1e6f, 1e6f); + int failures = 0; + + for (int r = 0; r < kNumRequests; ++r) { + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + if (std::memcmp(input, output, kSkipTrtBytes) != 0) { + failures++; + if (failures <= 5) + ADD_FAILURE() << "Request " << r + << ": output does not match input"; + } + } + EXPECT_EQ(failures, 0) << failures << " of " << kNumRequests + << " requests had mismatched output"; +} + +// ============================================================================ +// Host Dispatcher Unit Tests +// ============================================================================ + +class HostDispatcherTest : public RealtimePipelineTest { +protected: + void SetUp() override { + RealtimePipelineTest::SetUp(); + idle_mask_ = new rt::atomic_uint64_sys(0); + live_dispatched_ = new rt::atomic_uint64_sys(0); + inflight_slot_tags_ = new int[kMaxWorkers](); + shutdown_flag_ = new rt::atomic_int_sys(0); + stats_counter_ = 0; + function_table_ = new cudaq_function_entry_t[kMaxWorkers]; + std::memset(function_table_, 0, + kMaxWorkers * sizeof(cudaq_function_entry_t)); + } + + void TearDown() override { + if (!loop_stopped_) { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + } + for (auto& s : worker_streams_) { + if (s) + cudaStreamDestroy(s); + } + delete idle_mask_; + delete live_dispatched_; + delete[] inflight_slot_tags_; + delete shutdown_flag_; + delete[] function_table_; + RealtimePipelineTest::TearDown(); + } + + void add_worker(uint32_t function_id, cudaGraphExec_t exec, + PreLaunchCopyCtx* plc = nullptr) { + cudaStream_t s = nullptr; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + worker_streams_.push_back(s); + + rt::HostDispatchWorker w; + w.graph_exec = exec; + w.stream = s; + w.function_id = function_id; + w.pre_launch_fn = plc ? pre_launch_input_copy : nullptr; + w.pre_launch_data = plc; + workers_.push_back(w); + + size_t idx = ft_count_; + function_table_[idx].handler.graph_exec = exec; + function_table_[idx].function_id = function_id; + function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + ft_count_++; + } + + void start_loop() { + idle_mask_->store((1ULL << workers_.size()) - 1, + cuda::std::memory_order_release); + + config_.rx_flags = reinterpret_cast( + rx_flags_host_); + config_.tx_flags = reinterpret_cast( + tx_flags_host_); + config_.rx_data_host = rx_data_host_; + config_.rx_data_dev = rx_data_dev_; + config_.tx_data_host = tx_data_host_; + config_.tx_data_dev = tx_data_dev_; + config_.tx_stride_sz = kSlotSize; + config_.h_mailbox_bank = mailbox_bank_host_; + config_.num_slots = kNumSlots; + config_.slot_size = kSlotSize; + config_.workers = workers_; + config_.function_table = function_table_; + config_.function_table_count = ft_count_; + config_.shutdown_flag = shutdown_flag_; + config_.stats_counter = &stats_counter_; + config_.live_dispatched = live_dispatched_; + config_.idle_mask = idle_mask_; + config_.inflight_slot_tags = inflight_slot_tags_; + + loop_thread_ = std::thread(rt::host_dispatcher_loop, config_); + } + + void stop_loop() { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + } + + void restore_worker(int id) { + idle_mask_->fetch_or(1ULL << id, cuda::std::memory_order_release); + } + + bool poll_tx_flag(size_t slot, int timeout_ms = 2000) { + auto* flags = reinterpret_cast(tx_flags_host_); + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + uint64_t val = flags[slot].load(cuda::std::memory_order_acquire); + if (val != 0) + return true; + usleep(100); + } + return false; + } + + void clear_tx_flag(size_t slot) { + auto* flags = reinterpret_cast(tx_flags_host_); + flags[slot].store(0, cuda::std::memory_order_release); + } + + rt::atomic_uint64_sys* idle_mask_ = nullptr; + rt::atomic_uint64_sys* live_dispatched_ = nullptr; + int* inflight_slot_tags_ = nullptr; + rt::atomic_int_sys* shutdown_flag_ = nullptr; + uint64_t stats_counter_ = 0; + bool loop_stopped_ = false; + + cudaq_function_entry_t* function_table_ = nullptr; + size_t ft_count_ = 0; + std::vector workers_; + std::vector worker_streams_; + rt::HostDispatcherConfig config_{}; + std::thread loop_thread_; +}; + +TEST_F(HostDispatcherTest, ShutdownImmediate) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + + shutdown_flag_->store(1, cuda::std::memory_order_release); + start_loop(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + + EXPECT_EQ(stats_counter_, 0u); +} + +TEST_F(HostDispatcherTest, ShutdownClean) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); + usleep(10000); + stop_loop(); + EXPECT_EQ(stats_counter_, 0u); +} + +TEST_F(HostDispatcherTest, StatsCounter) { + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kN = 5; + for (int i = 0; i < kN; ++i) { + size_t slot = static_cast(i % kNumSlots); + if (i > 0) + clear_tx_flag((i - 1) % kNumSlots); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } + + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kN)); +} + +TEST_F(HostDispatcherTest, InvalidMagicDropped) { + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); + + uint8_t* slot_host = rx_data_host_; + rt::RPCHeader bad_hdr; + bad_hdr.magic = 0xDEADBEEF; + bad_hdr.function_id = kTestFunctionId; + bad_hdr.arg_len = 4; + std::memcpy(slot_host, &bad_hdr, sizeof(bad_hdr)); + + auto* flags = reinterpret_cast(rx_flags_host_); + flags[0].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); + + usleep(50000); + + uint64_t rx_val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(rx_val, 0u) << "Invalid magic should be consumed (rx_flag cleared)"; + + stop_loop(); + EXPECT_EQ(stats_counter_, 0u) << "Invalid magic should not count as dispatched"; +} + +TEST_F(HostDispatcherTest, SlotWraparound) { + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kTotal = static_cast(kNumSlots) + 2; + for (int i = 0; i < kTotal; ++i) { + size_t slot = static_cast(i % kNumSlots); + + auto* rx = reinterpret_cast(rx_flags_host_); + while (rx[slot].load(cuda::std::memory_order_acquire) != 0) + usleep(100); + clear_tx_flag(slot); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i + << " (slot " << slot << ")"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } + + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kTotal)); +} + +// ============================================================================ +// Integration Tests +// ============================================================================ + +TEST_F(HostDispatcherTest, SingleRequestRoundTrip) { + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = static_cast(i + 1); + submit_rpc_to_slot(0, kTestFunctionId, input, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(0)) << "Timeout waiting for dispatcher to process"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())) << "Predecoder ready flag not set"; + + PreDecoderJob job{}; + ASSERT_TRUE(pd->poll_next_job(job)); + float output[kSkipTrtFloats]; + std::memcpy(output, job.inference_data, kSkipTrtBytes); + pd->release_job(0); + + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Round-trip data should match (identity passthrough)"; + + stop_loop(); + EXPECT_EQ(stats_counter_, 1u); +} + +TEST_F(HostDispatcherTest, MultiPredecoderConcurrency) { + constexpr int kNPd = 4; + std::vector> pds; + std::vector plcs(kNPd); + std::vector fids; + + for (int i = 0; i < kNPd; ++i) { + pds.push_back(create_predecoder(i)); + std::string name = "predecoder_" + std::to_string(i); + fids.push_back(rt::fnv1a_hash(name.c_str())); + plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); + plcs[i].input_size = pds[i]->get_input_size(); + plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); + add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); + } + start_loop(); + + float inputs[kNPd][kSkipTrtFloats]; + for (int i = 0; i < kNPd; ++i) + for (size_t j = 0; j < kSkipTrtFloats; ++j) + inputs[i][j] = static_cast(i * 100 + j); + + for (int i = 0; i < kNPd; ++i) + submit_rpc_to_slot(static_cast(i), fids[i], + inputs[i], kSkipTrtBytes); + + for (int i = 0; i < kNPd; ++i) + ASSERT_TRUE(poll_tx_flag(static_cast(i))) + << "Timeout on predecoder " << i; + CUDA_CHECK(cudaDeviceSynchronize()); + + for (int i = 0; i < kNPd; ++i) { + ASSERT_TRUE(wait_ready_flag(pds[i].get())) + << "Ready flag not set for predecoder " << i; + PreDecoderJob job{}; + ASSERT_TRUE(pds[i]->poll_next_job(job)); + float output[kSkipTrtFloats]; + std::memcpy(output, job.inference_data, kSkipTrtBytes); + pds[i]->release_job(0); + + EXPECT_EQ(std::memcmp(inputs[i], output, kSkipTrtBytes), 0) + << "Predecoder " << i << ": output should match input"; + } + + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kNPd)); +} + +TEST_F(HostDispatcherTest, SustainedThroughput_200Requests) { + constexpr int kNPd = 2; + constexpr int kTotalRequests = 200; + + std::vector> pds; + std::vector plcs(kNPd); + std::vector fids; + + for (int i = 0; i < kNPd; ++i) { + pds.push_back(create_predecoder(i)); + std::string name = "sustained_pd_" + std::to_string(i); + fids.push_back(rt::fnv1a_hash(name.c_str())); + plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); + plcs[i].input_size = pds[i]->get_input_size(); + plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); + add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); + } + start_loop(); + + std::mt19937 rng(999); + std::uniform_real_distribution dist(-10.0f, 10.0f); + int completed = 0; + + for (int r = 0; r < kTotalRequests; ++r) { + int pd_idx = r % kNPd; + size_t slot = static_cast(r % kNumSlots); + + auto* rx = reinterpret_cast(rx_flags_host_); + auto deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(5); + while (rx[slot].load(cuda::std::memory_order_acquire) != 0) { + if (std::chrono::steady_clock::now() > deadline) + FAIL() << "Timeout waiting for slot " << slot + << " to clear at request " << r; + usleep(100); + } + clear_tx_flag(slot); + + float payload[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + payload[i] = dist(rng); + + submit_rpc_to_slot(slot, fids[pd_idx], payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) + << "Timeout on request " << r << " (slot " << slot << ")"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pds[pd_idx].get())) + << "Ready flag not set for request " << r; + PreDecoderJob job{}; + if (pds[pd_idx]->poll_next_job(job)) + pds[pd_idx]->release_job(0); + + restore_worker(pd_idx); + completed++; + } + + stop_loop(); + EXPECT_EQ(completed, kTotalRequests); + EXPECT_EQ(stats_counter_, static_cast(kTotalRequests)); +} + +} // namespace From b923e8c323721172260c0fc428c1a7f069b2c35f Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Tue, 3 Mar 2026 21:12:26 +0000 Subject: [PATCH 24/40] Remove dead predecoder_input_kernel; update design doc The legacy predecoder_input_kernel and its cudaq::nvqlink includes are no longer used since input data arrives via the pre_launch DMA callback. Design doc updated to reflect current code: removed kernel deletion, RealtimePipeline scaffolding, test suite, and SKIP_TRT buffer size (1600 floats). Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 76 ++++++++++++++++--- .../qec/lib/realtime/ai_predecoder_service.cu | 33 -------- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md index 0f309800..e61ff957 100644 --- a/docs/host_side_dispatcher_design_gemini.md +++ b/docs/host_side_dispatcher_design_gemini.md @@ -7,7 +7,7 @@ **Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher **Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) **Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` -**Last Updated**: 2026-02-26 +**Last Updated**: 2026-03-03 --- @@ -155,11 +155,20 @@ The CUDA graph for each predecoder contains (in order): 2. **Output DMA copy** (`cudaMemcpyAsync` D2D) -- copies TRT output to host-mapped output buffer. 3. **Signal kernel** (`predecoder_signal_ready_kernel<<<1,1>>>`) -- a single-thread kernel that performs `d_ready_flags[0].store(1, release)` to notify the CPU worker. -The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. The `predecoder_input_kernel` is no longer part of the graph; input data arrives via the pre-launch DMA copy. +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. Input data arrives exclusively via the pre-launch DMA copy callback; no input-copy kernel exists in the graph or codebase. -### 5.3 Passthrough Copy Kernel (SKIP_TRT mode) +### 5.3 Source Files -When `SKIP_TRT` is set, a vectorized passthrough kernel (`uint4` 16-byte loads/stores, 256 threads) substitutes for TRT inference for benchmarking the infrastructure overhead. +The `ai_predecoder_service.cu` implementation contains only two device kernels: + +- `predecoder_signal_ready_kernel` -- single-thread kernel that atomically stores `1` to the ready flag with system-scope release semantics. +- `passthrough_copy_kernel` -- vectorized identity copy (`uint4` 16-byte loads/stores, 256 threads) used when `SKIP_TRT` is set, substituting for TRT inference. + +The legacy `predecoder_input_kernel` (which read from the mailbox and copied into `d_trt_input_`) has been removed. The `cudaq::nvqlink` header dependencies are no longer needed by this file. + +### 5.4 Passthrough Copy Kernel (SKIP_TRT mode) + +When `SKIP_TRT` is set, the `passthrough_copy_kernel` substitutes for TRT inference, providing a deterministic identity function for testing and benchmarking the infrastructure overhead. In SKIP_TRT mode, the `AIDecoderService` constructor sets `input_size_ = output_size_ = 1600 * sizeof(float)` (6400 bytes) without loading any model file. --- @@ -261,7 +270,30 @@ This race caused exactly one request to get "stuck" indefinitely, eventually sta --- -## 8. Step-by-Step Data Flow Trace +## 8. RealtimePipeline Scaffolding + +The low-level dispatcher, consumer, and worker threads are wrapped by a higher-level `RealtimePipeline` class (`realtime/include/cudaq/realtime/pipeline.h`) that hides all ring buffer management, atomics, and thread lifecycle. Application code provides three callbacks: + +1. **GPU stage factory** (`GpuStageFactory`): Called once per worker during `start()`. Returns the `cudaGraphExec_t`, `cudaStream_t`, `pre_launch_fn`, `function_id`, and an opaque `user_context` for each worker. +2. **CPU stage callback** (`CpuStageCallback`): Called by each worker thread when GPU inference completes. Receives `CpuStageContext` containing `inference_output`, `output_size`, `response_buffer`, and the `user_context`. Returns the number of bytes written. +3. **Completion callback** (`CompletionCallback`): Called by the consumer thread for each completed (or errored) request with a `Completion` struct. + +```cpp +RealtimePipeline pipeline(config); +pipeline.set_gpu_stage([&](int worker_id) -> GpuWorkerResources { ... }); +pipeline.set_cpu_stage([&](const CpuStageContext& ctx) -> size_t { ... }); +pipeline.set_completion_handler([&](const Completion& c) { ... }); +pipeline.start(); +pipeline.submit(function_id, payload, payload_size, request_id); +// ... +pipeline.stop(); +``` + +The `PipelineStageConfig` allows configuring `num_workers`, `num_slots`, `slot_size`, and optional `CorePinning` for dispatcher, consumer, and worker threads. + +--- + +## 9. Step-by-Step Data Flow Trace 1. **FPGA** writes INT32 measurements into `rx_data[5]`. 2. **FPGA** sets `rx_flags[5] = host_ptr`. @@ -275,7 +307,7 @@ This race caused exactly one request to get "stuck" indefinitely, eventually sta 10. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. 11. **Host Dispatcher** sets `tx_flags[5] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[5] = 0` and advances to `current_slot = 6`. 12. **GPU DMA engine** copies input payload from ring buffer to TRT input buffer. -13. **GPU** executes TRT inference. +13. **GPU** executes TRT inference (or passthrough copy in SKIP_TRT mode). 14. **GPU DMA engine** copies TRT output to host-mapped `h_outputs_`. 15. **GPU signal kernel** sets `ready_flags[2] = 1` (system-scope atomic release). 16. **CPU Poller** CAS(1, 2) on `ready_flags[2]`, wins, reads `h_ring_ptrs[0]` to get ring buffer address and `h_outputs_` to get inference data. @@ -289,7 +321,7 @@ This race caused exactly one request to get "stuck" indefinitely, eventually sta --- -## 9. Ring Buffer and IN_FLIGHT Sentinel +## 10. Ring Buffer and IN_FLIGHT Sentinel Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot]` immediately after launch. Without a hold, the **producer** (FPGA sim or test) would see `rx_flags[slot]==0` and `tx_flags[slot]==0` (response not written yet) and reuse the slot, overwriting data while the GPU is still reading. @@ -303,13 +335,34 @@ Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot] --- -## 10. Dynamic Batch Handling for ONNX Models +## 11. Dynamic Batch Handling for ONNX Models When building a TensorRT engine from an ONNX model with dynamic batch dimensions (dim 0 <= 0), `ai_decoder_service.cu` automatically creates an optimization profile that pins all dynamic dimensions to 1. This enables building engines from models like `predecoder_memory_d13_T13_X.onnx` which use a symbolic `batch` dimension. --- -## 11. Shutdown and Grace Period +## 12. Test Suite + +A GTest-based test suite (`libs/qec/unittests/test_realtime_pipeline.cu`) validates the pipeline using `SKIP_TRT` passthrough mode (no TensorRT dependency at runtime). The tests are organized into three categories: + +### 12.1 Unit Tests (8 tests) +- **AIDecoderService**: Verify SKIP_TRT buffer sizes (1600 floats = 6400 bytes), allocation, and graph capture. +- **AIPreDecoderService**: Verify mapped pinned memory allocation, `poll_next_job` / `release_job` state machine, and host-launchable graph. + +### 12.2 Correctness Tests (5 tests) +Data-integrity tests that verify known payloads survive the full CUDA graph round-trip bitwise-identical (memcmp, not epsilon): +- **Zeros, Known Pattern, Random Data, Extreme Float Values**: Single-request verification with different payload patterns (including `FLT_MAX`, `NaN`, `INFINITY`). +- **Multiple Requests (5,000 iterations)**: Pushes 5,000 random 6.4 KB payloads through the pipeline and verifies bitwise identity on every one. Confirms no cross-contamination or data corruption over sustained use. + +### 12.3 Integration Tests (8 tests) +- **Dispatcher lifecycle**: Shutdown semantics, stats counter accuracy, invalid RPC magic rejection, slot wraparound. +- **Single Request Round-Trip**: Full dispatcher -> graph -> poll -> verify data path. +- **Multi-Predecoder Concurrency**: 4 predecoders on 4 streams, simultaneous dispatch, per-predecoder data verification. +- **Sustained Throughput (200 requests)**: Regression test for the 128-launch-limit fix. Proves indefinite stability of the host-side dispatcher. + +--- + +## 13. Shutdown and Grace Period - **Grace period**: After the producer thread exits, the main thread waits up to 5 seconds for `total_completed >= total_submitted`. - **Consumer exit**: The consumer thread normally exits when `producer_done && total_completed >= total_submitted`. To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. @@ -318,7 +371,7 @@ When building a TensorRT engine from an ONNX model with dynamic batch dimensions --- -## 12. Performance Results (d=13, 30 µs rate, 10s) +## 14. Performance Results (d=13, 30 µs rate, 10s) Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T13_X.onnx` (FP16), 16 workers, 32 slots: @@ -336,7 +389,7 @@ Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T13_X.onnx` (FP1 --- -## 13. LLM Implementation Directives (Constraints Checklist) +## 15. LLM Implementation Directives (Constraints Checklist) When generating code from this specification, the LLM **MUST** strictly adhere to the following constraints: @@ -349,4 +402,5 @@ When generating code from this specification, the LLM **MUST** strictly adhere t - [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. Set `tx_data_host = nullptr` and `tx_data_dev = nullptr` to force the 0xEEEE path. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). - [ ] **CONSUMER MEMORY ORDERING**: The consumer MUST set `slot_request[s] = -1` BEFORE calling `cudaq_host_ringbuffer_clear_slot`, with a `__sync_synchronize()` fence between them, to prevent the producer-consumer race on ARM. - [ ] **DMA DATA MOVEMENT**: Use `cudaMemcpyAsync` (DMA engine) for data copies. Input copy is issued via `pre_launch_fn` callback before graph launch. Output copy is captured inside the graph. Do not use SM-based byte-copy kernels for fixed-address transfers. +- [ ] **NO INPUT KERNEL IN GRAPH**: The captured CUDA graph must NOT contain an input-copy kernel. All input data movement is handled by the `pre_launch_fn` DMA callback issued on the worker stream before `cudaGraphLaunch`. - [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index 533f6399..a519fe33 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -7,8 +7,6 @@ ******************************************************************************/ #include "cudaq/qec/realtime/ai_predecoder_service.h" -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" #include #include #include @@ -31,37 +29,6 @@ using atomic_int_sys = cuda::atomic; // Kernels (single slot 0 only; queue removed for host-side dynamic pool) // ============================================================================= -__global__ void predecoder_input_kernel( - void** mailbox_slot_ptr, - atomic_int_sys* d_ready_flags, - void** d_ring_ptrs, - void* trt_input, - size_t input_size_bytes) -{ - __shared__ void* ring_ptr; - - if (threadIdx.x == 0 && blockIdx.x == 0) { - ring_ptr = *mailbox_slot_ptr; - d_ring_ptrs[0] = ring_ptr; - } - __syncthreads(); - - if (!ring_ptr) return; - - // RPCHeader is 12 bytes (3 x uint32_t), so src is 4-byte aligned. - const uint32_t* src4 = (const uint32_t*)((const char*)ring_ptr + sizeof(cudaq::nvqlink::RPCHeader)); - uint32_t* dst4 = (uint32_t*)trt_input; - size_t n4 = input_size_bytes / sizeof(uint32_t); - for (size_t i = threadIdx.x; i < n4; i += blockDim.x) - dst4[i] = src4[i]; - - size_t done = n4 * sizeof(uint32_t); - const char* src_tail = (const char*)src4 + done; - char* dst_tail = (char*)trt_input + done; - for (size_t i = done + threadIdx.x; i < input_size_bytes; i += blockDim.x) - dst_tail[i - done] = src_tail[i - done]; -} - __global__ void predecoder_signal_ready_kernel(atomic_int_sys* d_ready_flags) { if (threadIdx.x == 0) From e6ea8ef2c204a55dc695a62f0d8967168d025647 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Tue, 3 Mar 2026 21:24:42 +0000 Subject: [PATCH 25/40] Implement roadmap items: GPU-only mode, post_launch_fn, and naming improvements Add GPU-only pipeline mode that skips CPU worker threads when no cpu_stage callback is registered, using cudaLaunchHostFunc for completion signaling instead. Add post_launch_fn/post_launch_data callback to HostDispatchWorker and GpuWorkerResources, called after successful cudaGraphLaunch. Rename CpuStageContext fields to gpu_output/gpu_output_size and AIPreDecoderService buffers to h_predecoder_outputs_/d_predecoder_outputs_ for clarity. Signed-off-by: Scott Thornton --- .../qec/realtime/ai_predecoder_service.h | 4 +- .../qec/lib/realtime/ai_predecoder_service.cu | 14 +-- .../daemon/dispatcher/host_dispatcher.h | 2 + realtime/include/cudaq/realtime/pipeline.h | 10 +- .../lib/daemon/dispatcher/host_dispatcher.cu | 2 + realtime/lib/pipeline/realtime_pipeline.cu | 94 +++++++++++++++++-- 6 files changed, 103 insertions(+), 23 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index 13bd3c3b..eb0e5f41 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -64,11 +64,11 @@ class AIPreDecoderService : public AIDecoderService { cuda::atomic* h_ready_flags_ = nullptr; void** h_ring_ptrs_ = nullptr; - void* h_outputs_ = nullptr; + void* h_predecoder_outputs_ = nullptr; cuda::atomic* d_ready_flags_ = nullptr; void** d_ring_ptrs_ = nullptr; - void* d_outputs_ = nullptr; + void* d_predecoder_outputs_ = nullptr; }; } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index 533f6399..c6b87384 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -96,11 +96,11 @@ AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox new (h_ready_flags_) atomic_int_sys(0); SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, sizeof(void*), cudaHostAllocMapped)); - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_outputs_, get_output_size(), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_predecoder_outputs_, get_output_size(), cudaHostAllocMapped)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ready_flags_, (void*)h_ready_flags_, 0)); SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); - SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_outputs_, (void*)h_outputs_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_predecoder_outputs_, (void*)h_predecoder_outputs_, 0)); } AIPreDecoderService::~AIPreDecoderService() { @@ -114,9 +114,9 @@ AIPreDecoderService::~AIPreDecoderService() { cudaFreeHost(h_ring_ptrs_); h_ring_ptrs_ = nullptr; } - if (h_outputs_) { - cudaFreeHost(h_outputs_); - h_outputs_ = nullptr; + if (h_predecoder_outputs_) { + cudaFreeHost(h_predecoder_outputs_); + h_predecoder_outputs_ = nullptr; } } @@ -142,7 +142,7 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) } SERVICE_CUDA_CHECK(cudaMemcpyAsync( - d_outputs_, d_trt_output_, get_output_size(), + d_predecoder_outputs_, d_trt_output_, get_output_size(), cudaMemcpyDeviceToDevice, stream)); predecoder_signal_ready_kernel<<<1, 1, 0, stream>>>( @@ -183,7 +183,7 @@ bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed)) { out_job.slot_idx = 0; out_job.ring_buffer_ptr = h_ring_ptrs_[0]; - out_job.inference_data = h_outputs_; + out_job.inference_data = h_predecoder_outputs_; return true; } return false; diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h index 2fd1ec1b..67faf832 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h @@ -38,6 +38,8 @@ struct HostDispatchWorker { uint32_t function_id; // matches table entry; used to assign slot to this worker void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; void* pre_launch_data = nullptr; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; + void* post_launch_data = nullptr; }; struct HostDispatcherConfig { diff --git a/realtime/include/cudaq/realtime/pipeline.h b/realtime/include/cudaq/realtime/pipeline.h index e3645a56..e04cf11d 100644 --- a/realtime/include/cudaq/realtime/pipeline.h +++ b/realtime/include/cudaq/realtime/pipeline.h @@ -43,6 +43,8 @@ struct GpuWorkerResources { cudaStream_t stream = nullptr; void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; void* pre_launch_data = nullptr; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; + void* post_launch_data = nullptr; uint32_t function_id = 0; void* user_context = nullptr; }; @@ -54,14 +56,14 @@ using GpuStageFactory = std::function; // CPU Stage Callback // --------------------------------------------------------------------------- -/// Passed to the user's CPU stage callback on each completed GPU inference. -/// The user reads inference_output, does post-processing, and writes the +/// Passed to the user's CPU stage callback on each completed GPU workload. +/// The user reads gpu_output, does post-processing, and writes the /// result into response_buffer. No atomics are exposed. struct CpuStageContext { int worker_id; int origin_slot; - const void* inference_output; - size_t output_size; + const void* gpu_output; + size_t gpu_output_size; void* response_buffer; size_t max_response_size; void* user_context; diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu index 7815cd50..1f1837c1 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -108,6 +108,8 @@ static void launch_graph_worker(const HostDispatcherConfig& config, config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); config.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); } else { + if (config.workers[w].post_launch_fn) + config.workers[w].post_launch_fn(config.workers[w].post_launch_data, data_dev, config.workers[w].stream); uint64_t tx_slot_addr = (config.tx_data_host != nullptr && config.tx_data_dev != nullptr) ? reinterpret_cast(config.tx_data_host + diff --git a/realtime/lib/pipeline/realtime_pipeline.cu b/realtime/lib/pipeline/realtime_pipeline.cu index b6dfffed..0992c6ab 100644 --- a/realtime/lib/pipeline/realtime_pipeline.cu +++ b/realtime/lib/pipeline/realtime_pipeline.cu @@ -49,6 +49,46 @@ static void pin_thread(std::thread& t, int core) { pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); } +// --------------------------------------------------------------------------- +// GPU-only mode: completion signaling via cudaLaunchHostFunc +// --------------------------------------------------------------------------- + +struct GpuOnlyWorkerCtx { + atomic_uint64_sys* tx_flags; + atomic_uint64_sys* idle_mask; + int* inflight_slot_tags; + uint8_t* rx_data_host; + size_t slot_size; + int worker_id; + void (*user_post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* user_post_launch_data; + int origin_slot; + uint64_t tx_value; +}; + +static void gpu_only_host_callback(void* user_data) { + auto* ctx = static_cast(user_data); + ctx->tx_flags[ctx->origin_slot].store( + ctx->tx_value, cuda::std::memory_order_release); + ctx->idle_mask->fetch_or( + 1ULL << ctx->worker_id, cuda::std::memory_order_release); +} + +static void gpu_only_post_launch(void* user_data, void* slot_dev, + cudaStream_t stream) { + auto* ctx = static_cast(user_data); + + if (ctx->user_post_launch_fn) + ctx->user_post_launch_fn(ctx->user_post_launch_data, slot_dev, stream); + + ctx->origin_slot = ctx->inflight_slot_tags[ctx->worker_id]; + uint8_t* slot_host = ctx->rx_data_host + + static_cast(ctx->origin_slot) * ctx->slot_size; + ctx->tx_value = reinterpret_cast(slot_host); + + cudaLaunchHostFunc(stream, gpu_only_host_callback, ctx); +} + // --------------------------------------------------------------------------- // RingBufferManager @@ -177,6 +217,10 @@ struct RealtimePipeline::Impl { // Per-worker GPU resources (from factory) std::vector worker_resources; + // GPU-only mode state + bool gpu_only = false; + std::vector gpu_only_ctxs; + // Slot-to-request mapping (consumer-owned) std::vector slot_request; @@ -222,6 +266,7 @@ struct RealtimePipeline::Impl { void start_threads() { const int nw = config.num_workers; + gpu_only = !cpu_stage; // Build GPU resources via user factory worker_resources.resize(nw); @@ -234,6 +279,25 @@ struct RealtimePipeline::Impl { std::memset(&function_table[i].schema, 0, sizeof(function_table[i].schema)); } + // In GPU-only mode, set up per-worker contexts for cudaLaunchHostFunc + // completion signaling (chains user's post_launch_fn if provided). + if (gpu_only) { + gpu_only_ctxs.resize(nw); + for (int i = 0; i < nw; ++i) { + auto& c = gpu_only_ctxs[i]; + c.tx_flags = ring->tx_flags(); + c.idle_mask = &idle_mask; + c.inflight_slot_tags = inflight_slot_tags.data(); + c.rx_data_host = ring->rx_data_host(); + c.slot_size = config.slot_size; + c.worker_id = i; + c.user_post_launch_fn = worker_resources[i].post_launch_fn; + c.user_post_launch_data = worker_resources[i].post_launch_data; + c.origin_slot = 0; + c.tx_value = 0; + } + } + // Initialize idle_mask with all workers free uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1); idle_mask.store(initial_idle, cuda::std::memory_order_release); @@ -265,6 +329,14 @@ struct RealtimePipeline::Impl { disp_cfg.workers[i].function_id = worker_resources[i].function_id; disp_cfg.workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; disp_cfg.workers[i].pre_launch_data = worker_resources[i].pre_launch_data; + + if (gpu_only) { + disp_cfg.workers[i].post_launch_fn = gpu_only_post_launch; + disp_cfg.workers[i].post_launch_data = &gpu_only_ctxs[i]; + } else { + disp_cfg.workers[i].post_launch_fn = worker_resources[i].post_launch_fn; + disp_cfg.workers[i].post_launch_data = worker_resources[i].post_launch_data; + } } // --- Dispatcher thread --- @@ -273,13 +345,15 @@ struct RealtimePipeline::Impl { }); pin_thread(dispatcher_thread, config.cores.dispatcher); - // --- Worker threads --- - worker_threads.resize(nw); - for (int i = 0; i < nw; ++i) { - worker_threads[i] = std::thread([this, i]() { worker_loop(i); }); - int core = (config.cores.worker_base >= 0) - ? config.cores.worker_base + i : -1; - pin_thread(worker_threads[i], core); + // --- Worker threads (skipped in GPU-only mode) --- + if (!gpu_only) { + worker_threads.resize(nw); + for (int i = 0; i < nw; ++i) { + worker_threads[i] = std::thread([this, i]() { worker_loop(i); }); + int core = (config.cores.worker_base >= 0) + ? config.cores.worker_base + i : -1; + pin_thread(worker_threads[i], core); + } } // --- Consumer thread --- @@ -359,7 +433,7 @@ struct RealtimePipeline::Impl { auto* wr = &worker_resources[worker_id]; // The cpu_stage callback is called in "poll mode" - // (inference_output == nullptr). It polls its own GPU-ready + // (gpu_output == nullptr). It polls its own GPU-ready // mechanism and, if a result is available, processes it and // writes the RPC response. Returns 0 when nothing was ready, // >0 when a job was completed. The pipeline then handles all @@ -369,8 +443,8 @@ struct RealtimePipeline::Impl { CpuStageContext ctx; ctx.worker_id = worker_id; ctx.origin_slot = inflight_slot_tags[worker_id]; - ctx.inference_output = nullptr; - ctx.output_size = 0; + ctx.gpu_output = nullptr; + ctx.gpu_output_size = 0; ctx.response_buffer = nullptr; ctx.max_response_size = 0; ctx.user_context = wr->user_context; From 84af084d6758751bbb5870649218f8fb1f0228aa Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 4 Mar 2026 01:05:49 +0000 Subject: [PATCH 26/40] Added pipeline library to QEC unittests CMake Signed-off-by: Scott Thornton --- libs/qec/unittests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 7355f057..9ffdbf71 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -220,6 +220,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATHS ${TENSORRT_ROOT}/include /usr/include/x86_64-linux-gnu + /usr/include/aarch64-linux-gnu /usr/local/cuda/include /usr/local/tensorrt/include /opt/tensorrt/include @@ -277,6 +278,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) cudaq-realtime cudaq-realtime-host-dispatch cudaq-realtime-dispatch + cudaq-realtime-pipeline ) set_target_properties(test_realtime_pipeline PROPERTIES BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" From 64c0d9fa486fb55e7ada705aabcc3bef630f9c1d Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 4 Mar 2026 02:16:59 +0000 Subject: [PATCH 27/40] Fix critical and major defects from code review Address all critical (C1-C4) and major (M1-M12) defects identified during code review: Critical fixes: - C1: Fix race condition in try_submit via compare_exchange_weak - C2: Use uint64_t + separate occupancy flag for slot_request to support full request_id range (was int64_t with -1 sentinel) - C3: Add __syncthreads() before response header write in gateway_output_kernel to prevent partially-written result reads - C4: Always write IN_FLIGHT sentinel to tx_flags after graph launch Major fixes: - M1: Remove cudaSetDeviceFlags from RingBufferManager (caller's duty) - M2: Use std::atomic load with memory_order_acquire for tx/rx flag reads instead of plain volatile (ARM correctness) - M3: Validate num_workers <= 64 (idle_mask capacity) - M4: Validate gpu_factory is set before start() - M5: Check producer_stop in RingBufferInjector::submit to prevent infinite spin after shutdown - M6: Make started flag std::atomic - M7: Add CUDA error checks in AIDecoderService::capture_graph - M8: Check enqueueV3 return value in both service files - M9: Fix tensor_volume for dynamic-shape dims (was wrapping to SIZE_MAX on dim=-1) - M10: Assert num_workers == num_predecoders in benchmark - M11: Add aarch64 paths to predecoder test's TRT CMake search - M12: Replace vector with vector to avoid concurrent write UB Also extracts submit logic into RingBufferInjector class to separate test infrastructure from pipeline core. Signed-off-by: Scott Thornton --- libs/qec/lib/realtime/ai_decoder_service.cu | 41 +++-- .../qec/lib/realtime/ai_predecoder_service.cu | 6 +- .../test_realtime_predecoder_w_pymatching.cpp | 13 +- libs/qec/unittests/CMakeLists.txt | 7 +- realtime/include/cudaq/realtime/pipeline.h | 45 +++++- .../daemon/dispatcher/cudaq_realtime_api.cpp | 12 +- .../lib/daemon/dispatcher/host_dispatcher.cu | 10 +- realtime/lib/pipeline/realtime_pipeline.cu | 147 ++++++++++++------ 8 files changed, 198 insertions(+), 83 deletions(-) diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index f6b2155d..ab4e0e75 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -13,6 +13,16 @@ #include #include #include +#include +#include + +#define DECODER_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA Error in AIDecoderService: ") + cudaGetErrorString(err)); \ + } \ + } while(0) namespace cudaq::qec { @@ -51,6 +61,8 @@ __global__ void gateway_output_kernel( dst[i] = src[i]; } + __syncthreads(); + if (threadIdx.x == 0 && blockIdx.x == 0) { auto* response = (cudaq::nvqlink::RPCResponse*)ring_buffer_data; response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; @@ -78,7 +90,8 @@ static size_t trt_dtype_size(nvinfer1::DataType dtype) { static size_t tensor_volume(const nvinfer1::Dims& d) { size_t v = 1; - for (int i = 0; i < d.nbDims; ++i) v *= d.d[i]; + for (int i = 0; i < d.nbDims; ++i) + v *= (d.d[i] > 0) ? static_cast(d.d[i]) : 1; return v; } @@ -275,28 +288,36 @@ void AIDecoderService::allocate_resources() { } void AIDecoderService::capture_graph(cudaStream_t stream) { - // Bind all tensors to TRT context for (auto& b : all_bindings_) { context_->setTensorAddress(b.name.c_str(), b.d_buffer); } - context_->enqueueV3(stream); - cudaStreamSynchronize(stream); + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 warmup failed in AIDecoderService"); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); cudaGraph_t graph; - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + DECODER_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_input_, input_size_); - context_->enqueueV3(stream); + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 failed during graph capture in AIDecoderService"); gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_output_, output_size_); - cudaStreamEndCapture(stream, &graph); + DECODER_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); - cudaGraphInstantiateWithFlags(&graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiateWithFlags failed in AIDecoderService: ") + + cudaGetErrorString(inst_err)); + } - cudaGraphUpload(graph_exec_, stream); + DECODER_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); cudaGraphDestroy(graph); - cudaStreamSynchronize(stream); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); } } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index cf9523c1..c539fe1e 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -94,7 +94,8 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) for (auto& b : all_bindings_) { context_->setTensorAddress(b.name.c_str(), b.d_buffer); } - context_->enqueueV3(stream); + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 warmup failed in AIPreDecoderService"); } SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -105,7 +106,8 @@ void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) passthrough_copy_kernel<<<1, 256, 0, stream>>>( d_trt_output_, d_trt_input_, get_input_size()); } else { - context_->enqueueV3(stream); + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 failed during graph capture in AIPreDecoderService"); } SERVICE_CUDA_CHECK(cudaMemcpyAsync( diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index d1573a03..93a0fd3a 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -353,6 +353,14 @@ int main(int argc, char* argv[]) { pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); } + if (config.num_workers != config.num_predecoders) { + throw std::invalid_argument( + "num_workers (" + std::to_string(config.num_workers) + + ") must equal num_predecoders (" + + std::to_string(config.num_predecoders) + + ") in the current benchmark"); + } + // Worker contexts (per-worker, application-specific) std::vector worker_ctxs(config.num_workers); for (int i = 0; i < config.num_workers; ++i) { @@ -458,7 +466,7 @@ int main(int argc, char* argv[]) { const int max_requests = 500000; std::vector submit_ts(max_requests); std::vector complete_ts(max_requests); - std::vector completed(max_requests, false); + std::vector completed(max_requests, 0); pipeline.set_completion_handler([&](const realtime_ns::Completion& c) { if (c.request_id < static_cast(max_requests)) { @@ -472,6 +480,7 @@ int main(int argc, char* argv[]) { // ========================================================================= std::cout << "[Setup] Starting pipeline...\n"; + auto injector = pipeline.create_injector(); pipeline.start(); auto run_deadline = std::chrono::steady_clock::now() @@ -508,7 +517,7 @@ int main(int argc, char* argv[]) { uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); submit_ts[req_id] = hrclock::now(); - pipeline.submit(fid, payload, static_cast(payload_bytes), + injector.submit(fid, payload, static_cast(payload_bytes), static_cast(req_id)); target = (target + 1) % config.num_predecoders; diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 9ffdbf71..cdc104a9 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -227,10 +227,10 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) NO_DEFAULT_PATH ) find_library(TENSORRT_LIBRARY_FOR_PIPELINE nvinfer - PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib ) find_library(TENSORRT_ONNX_PARSER_FOR_PIPELINE nvonnxparser - PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib + PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib ) if(TENSORRT_INCLUDE_DIR_FOR_PIPELINE AND TENSORRT_LIBRARY_FOR_PIPELINE AND TENSORRT_ONNX_PARSER_FOR_PIPELINE) @@ -313,6 +313,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATHS ${TENSORRT_ROOT}/include /usr/include/x86_64-linux-gnu + /usr/include/aarch64-linux-gnu /usr/local/cuda/include /usr/local/tensorrt/include /opt/tensorrt/include @@ -322,6 +323,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib @@ -330,6 +332,7 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATHS ${TENSORRT_ROOT}/lib /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu /usr/local/cuda/lib64 /usr/local/tensorrt/lib /opt/tensorrt/lib diff --git a/realtime/include/cudaq/realtime/pipeline.h b/realtime/include/cudaq/realtime/pipeline.h index e04cf11d..2bdcacd2 100644 --- a/realtime/include/cudaq/realtime/pipeline.h +++ b/realtime/include/cudaq/realtime/pipeline.h @@ -86,6 +86,40 @@ struct Completion { /// Called by the consumer thread for each completed (or errored) request. using CompletionCallback = std::function; +// --------------------------------------------------------------------------- +// Ring Buffer Injector (software-only test/replay data source) +// --------------------------------------------------------------------------- + +/// Writes RPC-framed requests into the pipeline's ring buffer, simulating +/// FPGA DMA deposits. Created via RealtimePipeline::create_injector(). +/// The parent RealtimePipeline must outlive the injector. +class RingBufferInjector { +public: + ~RingBufferInjector(); + RingBufferInjector(RingBufferInjector&&) noexcept; + RingBufferInjector& operator=(RingBufferInjector&&) noexcept; + + RingBufferInjector(const RingBufferInjector&) = delete; + RingBufferInjector& operator=(const RingBufferInjector&) = delete; + + /// Try to submit a request. Returns true if accepted, false if + /// backpressure (all slots busy). Non-blocking. Thread-safe. + bool try_submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id); + + /// Blocking submit: spins until a slot becomes available. + void submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id); + + uint64_t backpressure_stalls() const; + +private: + friend class RealtimePipeline; + struct State; + std::unique_ptr state_; + explicit RingBufferInjector(std::unique_ptr s); +}; + // --------------------------------------------------------------------------- // Pipeline // --------------------------------------------------------------------------- @@ -113,14 +147,9 @@ class RealtimePipeline { /// Signal shutdown, join all threads, free resources. void stop(); - /// Try to submit a request. Returns true if accepted, false if - /// backpressure (all slots busy). Non-blocking. - bool try_submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id); - - /// Blocking submit: spins until a slot becomes available. - void submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id); + /// Create a software injector for testing without FPGA hardware. + /// The pipeline must be constructed but need not be started yet. + RingBufferInjector create_injector(); struct Stats { uint64_t submitted; diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp index 323be95e..b7054235 100644 --- a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp +++ b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp @@ -8,6 +8,7 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include #include #include #include @@ -295,9 +296,15 @@ void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, rb->rx_data_host + slot_idx * rb->rx_stride_sz); } +static inline uint64_t load_acquire(volatile uint64_t *addr) { + auto *a = reinterpret_cast *>( + const_cast(addr)); + return a->load(std::memory_order_acquire); +} + cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( const cudaq_ringbuffer_t *rb, uint32_t slot_idx, int *out_cuda_error) { - uint64_t v = rb->tx_flags_host[slot_idx]; + uint64_t v = load_acquire(&rb->tx_flags_host[slot_idx]); if (v == 0) return CUDAQ_TX_EMPTY; if (v == 0xEEEEEEEEEEEEEEEEULL) @@ -312,7 +319,8 @@ cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, uint32_t slot_idx) { - return rb->rx_flags_host[slot_idx] == 0 && rb->tx_flags_host[slot_idx] == 0; + return load_acquire(&rb->rx_flags_host[slot_idx]) == 0 && + load_acquire(&rb->tx_flags_host[slot_idx]) == 0; } void cudaq_host_ringbuffer_clear_slot(const cudaq_ringbuffer_t *rb, diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu index 1f1837c1..2f0b055f 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -110,12 +110,10 @@ static void launch_graph_worker(const HostDispatcherConfig& config, } else { if (config.workers[w].post_launch_fn) config.workers[w].post_launch_fn(config.workers[w].post_launch_data, data_dev, config.workers[w].stream); - uint64_t tx_slot_addr = - (config.tx_data_host != nullptr && config.tx_data_dev != nullptr) - ? reinterpret_cast(config.tx_data_host + - current_slot * config.tx_stride_sz) - : 0xEEEEEEEEEEEEEEEEULL; - config.tx_flags[current_slot].store(tx_slot_addr, cuda::std::memory_order_release); + // Always write IN_FLIGHT sentinel. The actual READY value is written + // later by the CPU worker thread or the GPU-only cudaLaunchHostFunc + // callback, after the graph has completed. + config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); } } diff --git a/realtime/lib/pipeline/realtime_pipeline.cu b/realtime/lib/pipeline/realtime_pipeline.cu index 0992c6ab..35fce363 100644 --- a/realtime/lib/pipeline/realtime_pipeline.cu +++ b/realtime/lib/pipeline/realtime_pipeline.cu @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -99,8 +101,6 @@ public: RingBufferManager(size_t num_slots, size_t slot_size) : num_slots_(num_slots), slot_size_(slot_size) { - PIPELINE_CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - PIPELINE_CUDA_CHECK(cudaHostAlloc(&buf_rx_, num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); rx_flags_ = static_cast(buf_rx_); @@ -222,7 +222,8 @@ struct RealtimePipeline::Impl { std::vector gpu_only_ctxs; // Slot-to-request mapping (consumer-owned) - std::vector slot_request; + std::vector slot_request; + std::vector slot_occupied; // Stats (atomic counters) std::atomic total_submitted{0}; @@ -238,16 +239,19 @@ struct RealtimePipeline::Impl { std::thread consumer_thread; std::vector worker_threads; - // Producer slot cursor - std::atomic next_slot{0}; - - bool started = false; + std::atomic started{false}; // ----------------------------------------------------------------------- // Lifecycle // ----------------------------------------------------------------------- void allocate(const PipelineStageConfig& cfg) { + if (cfg.num_workers > 64) { + throw std::invalid_argument( + "num_workers (" + std::to_string(cfg.num_workers) + + ") exceeds idle_mask capacity of 64"); + } + config = cfg; ring = std::make_unique( @@ -261,10 +265,16 @@ struct RealtimePipeline::Impl { reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); inflight_slot_tags.resize(cfg.num_workers, 0); - slot_request.resize(cfg.num_slots, -1); + slot_request.resize(cfg.num_slots, 0); + slot_occupied.resize(cfg.num_slots, 0); } void start_threads() { + if (!gpu_factory) { + throw std::logic_error( + "gpu_factory must be set before calling start()"); + } + const int nw = config.num_workers; gpu_only = !cpu_stage; @@ -405,26 +415,6 @@ struct RealtimePipeline::Impl { } } - // ----------------------------------------------------------------------- - // Submit - // ----------------------------------------------------------------------- - - bool try_submit_impl(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id) { - uint32_t slot = next_slot.load(std::memory_order_relaxed) % - static_cast(config.num_slots); - if (!ring->slot_available(slot)) - return false; - - ring->write_and_signal(slot, function_id, payload, - static_cast(payload_size)); - - slot_request[slot] = static_cast(request_id); - next_slot.fetch_add(1, std::memory_order_relaxed); - total_submitted.fetch_add(1, std::memory_order_release); - return true; - } - // ----------------------------------------------------------------------- // Worker loop (one per worker thread) // ----------------------------------------------------------------------- @@ -489,16 +479,15 @@ struct RealtimePipeline::Impl { bool found_any = false; for (uint32_t s = 0; s < ns; ++s) { - if (slot_request[s] < 0) continue; + if (!slot_occupied[s]) continue; int cuda_error = 0; cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error); if (status == CUDAQ_TX_READY) { - int64_t rid = slot_request[s]; - if (rid >= 0 && completion_handler) { + if (completion_handler) { Completion c; - c.request_id = static_cast(rid); + c.request_id = slot_request[s]; c.slot = static_cast(s); c.success = true; c.cuda_error = 0; @@ -506,25 +495,24 @@ struct RealtimePipeline::Impl { } total_completed.fetch_add(1, std::memory_order_relaxed); - // ARM memory ordering: clear slot_request BEFORE + // ARM memory ordering: clear occupancy BEFORE // clearing ring buffer flags, with a fence between. - slot_request[s] = -1; + slot_occupied[s] = 0; __sync_synchronize(); ring->clear_slot(s); found_any = true; } else if (status == CUDAQ_TX_ERROR) { - int64_t rid = slot_request[s]; - if (rid >= 0 && completion_handler) { + if (completion_handler) { Completion c; - c.request_id = static_cast(rid); + c.request_id = slot_request[s]; c.slot = static_cast(s); c.success = false; c.cuda_error = cuda_error; completion_handler(c); } total_completed.fetch_add(1, std::memory_order_relaxed); - slot_request[s] = -1; + slot_occupied[s] = 0; __sync_synchronize(); ring->clear_slot(s); found_any = true; @@ -574,19 +562,6 @@ void RealtimePipeline::stop() { impl_->stop_all(); } -bool RealtimePipeline::try_submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id) { - return impl_->try_submit_impl(function_id, payload, payload_size, request_id); -} - -void RealtimePipeline::submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id) { - while (!try_submit(function_id, payload, payload_size, request_id)) { - impl_->backpressure_stalls.fetch_add(1, std::memory_order_relaxed); - QEC_CPU_RELAX(); - } -} - RealtimePipeline::Stats RealtimePipeline::stats() const { return { impl_->total_submitted.load(std::memory_order_relaxed), @@ -596,4 +571,74 @@ RealtimePipeline::Stats RealtimePipeline::stats() const { }; } +// --------------------------------------------------------------------------- +// RingBufferInjector +// --------------------------------------------------------------------------- + +struct RingBufferInjector::State { + RingBufferManager* ring = nullptr; + std::vector* slot_request = nullptr; + std::vector* slot_occupied = nullptr; + std::atomic* total_submitted = nullptr; + std::atomic* backpressure_stalls = nullptr; + std::atomic* producer_stop = nullptr; + int num_slots = 0; + std::atomic next_slot{0}; +}; + +RingBufferInjector RealtimePipeline::create_injector() { + auto s = std::make_unique(); + s->ring = impl_->ring.get(); + s->slot_request = &impl_->slot_request; + s->slot_occupied = &impl_->slot_occupied; + s->total_submitted = &impl_->total_submitted; + s->backpressure_stalls = &impl_->backpressure_stalls; + s->producer_stop = &impl_->producer_stop; + s->num_slots = impl_->config.num_slots; + return RingBufferInjector(std::move(s)); +} + +RingBufferInjector::RingBufferInjector(std::unique_ptr s) + : state_(std::move(s)) {} + +RingBufferInjector::~RingBufferInjector() = default; +RingBufferInjector::RingBufferInjector(RingBufferInjector&&) noexcept = default; +RingBufferInjector& RingBufferInjector::operator=(RingBufferInjector&&) noexcept = default; + +bool RingBufferInjector::try_submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id) { + uint32_t cur = state_->next_slot.load(std::memory_order_relaxed); + uint32_t slot = cur % static_cast(state_->num_slots); + if (!state_->ring->slot_available(slot)) + return false; + + if (!state_->next_slot.compare_exchange_weak( + cur, cur + 1, + std::memory_order_acq_rel, std::memory_order_relaxed)) + return false; + + state_->ring->write_and_signal(slot, function_id, payload, + static_cast(payload_size)); + + (*state_->slot_request)[slot] = request_id; + (*state_->slot_occupied)[slot] = 1; + state_->total_submitted->fetch_add(1, std::memory_order_release); + return true; +} + +void RingBufferInjector::submit(uint32_t function_id, const void* payload, + size_t payload_size, uint64_t request_id) { + while (!try_submit(function_id, payload, payload_size, request_id)) { + if (state_->producer_stop && + state_->producer_stop->load(std::memory_order_acquire)) + return; + state_->backpressure_stalls->fetch_add(1, std::memory_order_relaxed); + QEC_CPU_RELAX(); + } +} + +uint64_t RingBufferInjector::backpressure_stalls() const { + return state_->backpressure_stalls->load(std::memory_order_relaxed); +} + } // namespace cudaq::realtime From c5ee3c835f10b62aa5fa4a48b3d6e206dd1a0419 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 4 Mar 2026 02:20:30 +0000 Subject: [PATCH 28/40] Formatting Signed-off-by: Scott Thornton --- .../cudaq/qec/realtime/ai_decoder_service.h | 99 +- .../qec/realtime/ai_predecoder_service.h | 80 +- .../cudaq/qec/utils/pipeline_benchmarks.h | 333 +++-- .../qec/include/cudaq/qec/utils/thread_pool.h | 160 ++- libs/qec/lib/realtime/ai_decoder_service.cu | 522 ++++---- .../qec/lib/realtime/ai_predecoder_service.cu | 252 ++-- .../test_realtime_predecoder_w_pymatching.cpp | 1072 +++++++-------- libs/qec/unittests/test_realtime_pipeline.cu | 1160 ++++++++--------- .../daemon/dispatcher/cudaq_realtime.h | 43 +- .../daemon/dispatcher/dispatch_kernel.cuh | 64 +- .../daemon/dispatcher/host_dispatcher.h | 75 +- realtime/include/cudaq/realtime/pipeline.h | 151 +-- .../daemon/dispatcher/cudaq_realtime_api.cpp | 24 +- .../lib/daemon/dispatcher/dispatch_kernel.cu | 404 +++--- .../lib/daemon/dispatcher/host_dispatcher.cu | 96 +- .../daemon/dispatcher/host_dispatcher_capi.cu | 51 +- realtime/lib/pipeline/realtime_pipeline.cu | 990 +++++++------- realtime/unittests/test_dispatch_kernel.cu | 496 +++---- realtime/unittests/test_host_dispatcher.cu | 315 +++-- .../init_rpc_increment_function_table.cu | 2 +- 20 files changed, 3221 insertions(+), 3168 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h index 62cab2e9..ee3e075d 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_decoder_service.h @@ -8,72 +8,73 @@ #pragma once -#include #include -#include -#include +#include #include #include +#include +#include namespace cudaq::qec { class AIDecoderService { public: - class Logger : public nvinfer1::ILogger { - void log(Severity severity, const char* msg) noexcept override; - } static gLogger; + class Logger : public nvinfer1::ILogger { + void log(Severity severity, const char *msg) noexcept override; + } static gLogger; - /// @brief Constructor. Accepts a serialized TRT engine (.engine/.plan) or - /// an ONNX model (.onnx) which will be compiled to a TRT engine. - /// @param model_path Path to the model file - /// @param device_mailbox_slot Pointer to the specific slot in the global mailbox bank - /// @param engine_save_path If non-empty and model_path is .onnx, save the - /// built engine to this path for fast reloading on subsequent runs - AIDecoderService(const std::string& model_path, void** device_mailbox_slot, - const std::string& engine_save_path = ""); + /// @brief Constructor. Accepts a serialized TRT engine (.engine/.plan) or + /// an ONNX model (.onnx) which will be compiled to a TRT engine. + /// @param model_path Path to the model file + /// @param device_mailbox_slot Pointer to the specific slot in the global + /// mailbox bank + /// @param engine_save_path If non-empty and model_path is .onnx, save the + /// built engine to this path for fast reloading on subsequent runs + AIDecoderService(const std::string &model_path, void **device_mailbox_slot, + const std::string &engine_save_path = ""); - virtual ~AIDecoderService(); + virtual ~AIDecoderService(); - virtual void capture_graph(cudaStream_t stream); + virtual void capture_graph(cudaStream_t stream); - cudaGraphExec_t get_executable_graph() const { return graph_exec_; } + cudaGraphExec_t get_executable_graph() const { return graph_exec_; } - /// @brief Size of the primary input tensor in bytes (payload from RPC) - size_t get_input_size() const { return input_size_; } + /// @brief Size of the primary input tensor in bytes (payload from RPC) + size_t get_input_size() const { return input_size_; } - /// @brief Size of the primary output tensor in bytes (forwarded to CPU) - size_t get_output_size() const { return output_size_; } + /// @brief Size of the primary output tensor in bytes (forwarded to CPU) + size_t get_output_size() const { return output_size_; } - void* get_trt_input_ptr() const { return d_trt_input_; } + void *get_trt_input_ptr() const { return d_trt_input_; } protected: - void load_engine(const std::string& path); - void build_engine_from_onnx(const std::string& onnx_path, - const std::string& engine_save_path = ""); - void setup_bindings(); - void allocate_resources(); - - std::unique_ptr runtime_; - std::unique_ptr engine_; - std::unique_ptr context_; - - cudaGraphExec_t graph_exec_ = nullptr; - - void** device_mailbox_slot_; - void* d_trt_input_ = nullptr; // Primary input buffer - void* d_trt_output_ = nullptr; // Primary output buffer (residual_detectors) - std::vector d_aux_buffers_; // Additional I/O buffers TRT needs - - struct TensorBinding { - std::string name; - void* d_buffer = nullptr; - size_t size_bytes = 0; - bool is_input = false; - }; - std::vector all_bindings_; - - size_t input_size_ = 0; - size_t output_size_ = 0; + void load_engine(const std::string &path); + void build_engine_from_onnx(const std::string &onnx_path, + const std::string &engine_save_path = ""); + void setup_bindings(); + void allocate_resources(); + + std::unique_ptr runtime_; + std::unique_ptr engine_; + std::unique_ptr context_; + + cudaGraphExec_t graph_exec_ = nullptr; + + void **device_mailbox_slot_; + void *d_trt_input_ = nullptr; // Primary input buffer + void *d_trt_output_ = nullptr; // Primary output buffer (residual_detectors) + std::vector d_aux_buffers_; // Additional I/O buffers TRT needs + + struct TensorBinding { + std::string name; + void *d_buffer = nullptr; + size_t size_bytes = 0; + bool is_input = false; + }; + std::vector all_bindings_; + + size_t input_size_ = 0; + size_t output_size_ = 0; }; } // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index eb0e5f41..10217a56 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -9,66 +9,74 @@ #pragma once #include "cudaq/qec/realtime/ai_decoder_service.h" -#include #include +#include // Portable CPU Yield Macro for busy-polling #if defined(__x86_64__) - #include - #define QEC_CPU_RELAX() _mm_pause() +#include +#define QEC_CPU_RELAX() _mm_pause() #elif defined(__aarch64__) - #define QEC_CPU_RELAX() asm volatile("yield" ::: "memory") +#define QEC_CPU_RELAX() asm volatile("yield" ::: "memory") #else - #define QEC_CPU_RELAX() std::atomic_thread_fence(std::memory_order_seq_cst) +#define QEC_CPU_RELAX() std::atomic_thread_fence(std::memory_order_seq_cst) #endif namespace cudaq::qec { struct PreDecoderJob { - int slot_idx; ///< Worker/slot index (for release_job; always 0) - int origin_slot; ///< FPGA ring slot for tx_flags routing (dynamic pool) - void* ring_buffer_ptr; - void* inference_data; ///< Points into the pinned output (single slot) - - // Performance Tracking - uint64_t submit_ts_ns; - uint64_t dispatch_ts_ns; - uint64_t poll_ts_ns; + int slot_idx; ///< Worker/slot index (for release_job; always 0) + int origin_slot; ///< FPGA ring slot for tx_flags routing (dynamic pool) + void *ring_buffer_ptr; + void *inference_data; ///< Points into the pinned output (single slot) + + // Performance Tracking + uint64_t submit_ts_ns; + uint64_t dispatch_ts_ns; + uint64_t poll_ts_ns; }; class AIPreDecoderService : public AIDecoderService { public: - AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, - int queue_depth = 1, const std::string& engine_save_path = ""); - virtual ~AIPreDecoderService(); + AIPreDecoderService(const std::string &engine_path, + void **device_mailbox_slot, int queue_depth = 1, + const std::string &engine_save_path = ""); + virtual ~AIPreDecoderService(); - void capture_graph(cudaStream_t stream, bool device_launch); - void capture_graph(cudaStream_t stream) override { capture_graph(stream, true); } + void capture_graph(cudaStream_t stream, bool device_launch); + void capture_graph(cudaStream_t stream) override { + capture_graph(stream, true); + } - bool poll_next_job(PreDecoderJob& out_job); - void release_job(int slot_idx); + bool poll_next_job(PreDecoderJob &out_job); + void release_job(int slot_idx); - /// Stub for device-dispatcher batch path (returns nullptr; streaming uses host dispatcher) - int* get_device_queue_idx() const { return nullptr; } - cuda::atomic* get_device_ready_flags() const { return d_ready_flags_; } - int* get_device_inflight_flag() const { return nullptr; } + /// Stub for device-dispatcher batch path (returns nullptr; streaming uses + /// host dispatcher) + int *get_device_queue_idx() const { return nullptr; } + cuda::atomic *get_device_ready_flags() const { + return d_ready_flags_; + } + int *get_device_inflight_flag() const { return nullptr; } - cuda::atomic* get_host_ready_flags() const { return h_ready_flags_; } - volatile int* get_host_queue_idx() const { return nullptr; } - int get_queue_depth() const { return queue_depth_; } + cuda::atomic *get_host_ready_flags() const { + return h_ready_flags_; + } + volatile int *get_host_queue_idx() const { return nullptr; } + int get_queue_depth() const { return queue_depth_; } - void** get_host_ring_ptrs() const { return h_ring_ptrs_; } + void **get_host_ring_ptrs() const { return h_ring_ptrs_; } private: - int queue_depth_; // Always 1 + int queue_depth_; // Always 1 - cuda::atomic* h_ready_flags_ = nullptr; - void** h_ring_ptrs_ = nullptr; - void* h_predecoder_outputs_ = nullptr; + cuda::atomic *h_ready_flags_ = nullptr; + void **h_ring_ptrs_ = nullptr; + void *h_predecoder_outputs_ = nullptr; - cuda::atomic* d_ready_flags_ = nullptr; - void** d_ring_ptrs_ = nullptr; - void* d_predecoder_outputs_ = nullptr; + cuda::atomic *d_ready_flags_ = nullptr; + void **d_ring_ptrs_ = nullptr; + void *d_predecoder_outputs_ = nullptr; }; } // namespace cudaq::qec diff --git a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h index 4ade0c6b..7075f5d4 100644 --- a/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h +++ b/libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h @@ -34,180 +34,177 @@ namespace cudaq::qec::utils { /// class PipelineBenchmark { public: - using clock = std::chrono::high_resolution_clock; - using time_point = clock::time_point; - using duration_us = std::chrono::duration; - - explicit PipelineBenchmark(const std::string &label = "Pipeline", - size_t expected_requests = 0) - : label_(label), total_submitted_(0) { - if (expected_requests > 0) { - submit_times_.resize(expected_requests); - complete_times_.resize(expected_requests); - completed_.resize(expected_requests, false); - } + using clock = std::chrono::high_resolution_clock; + using time_point = clock::time_point; + using duration_us = std::chrono::duration; + + explicit PipelineBenchmark(const std::string &label = "Pipeline", + size_t expected_requests = 0) + : label_(label), total_submitted_(0) { + if (expected_requests > 0) { + submit_times_.resize(expected_requests); + complete_times_.resize(expected_requests); + completed_.resize(expected_requests, false); } - - void start() { run_start_ = clock::now(); } - void stop() { run_end_ = clock::now(); } - - void mark_submit(int request_id) { - ensure_capacity(request_id); - submit_times_[request_id] = clock::now(); - total_submitted_++; + } + + void start() { run_start_ = clock::now(); } + void stop() { run_end_ = clock::now(); } + + void mark_submit(int request_id) { + ensure_capacity(request_id); + submit_times_[request_id] = clock::now(); + total_submitted_++; + } + + void mark_complete(int request_id) { + ensure_capacity(request_id); + complete_times_[request_id] = clock::now(); + completed_[request_id] = true; + } + + struct Stats { + size_t submitted = 0; + size_t completed = 0; + double min_us = 0, max_us = 0, mean_us = 0; + double p50_us = 0, p90_us = 0, p95_us = 0, p99_us = 0; + double stddev_us = 0; + double total_wall_us = 0; + double throughput_rps = 0; + }; + + /// Return per-request latencies in microseconds (completed requests only). + std::vector latencies_us() const { + size_t n = std::min( + {submit_times_.size(), complete_times_.size(), completed_.size()}); + std::vector lats; + lats.reserve(n); + for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; + auto dt = std::chrono::duration_cast(complete_times_[i] - + submit_times_[i]); + lats.push_back(dt.count()); } - - void mark_complete(int request_id) { - ensure_capacity(request_id); - complete_times_[request_id] = clock::now(); - completed_[request_id] = true; + return lats; + } + + /// Return per-request latency or -1.0 for incomplete (preserves indices). + std::vector all_latencies_us() const { + size_t n = std::min( + {submit_times_.size(), complete_times_.size(), completed_.size()}); + std::vector lats(n, -1.0); + for (size_t i = 0; i < n; ++i) { + if (!completed_[i]) + continue; + auto dt = std::chrono::duration_cast(complete_times_[i] - + submit_times_[i]); + lats[i] = dt.count(); } - - struct Stats { - size_t submitted = 0; - size_t completed = 0; - double min_us = 0, max_us = 0, mean_us = 0; - double p50_us = 0, p90_us = 0, p95_us = 0, p99_us = 0; - double stddev_us = 0; - double total_wall_us = 0; - double throughput_rps = 0; - }; - - /// Return per-request latencies in microseconds (completed requests only). - std::vector latencies_us() const { - size_t n = std::min({submit_times_.size(), complete_times_.size(), - completed_.size()}); - std::vector lats; - lats.reserve(n); - for (size_t i = 0; i < n; ++i) { - if (!completed_[i]) - continue; - auto dt = std::chrono::duration_cast( - complete_times_[i] - submit_times_[i]); - lats.push_back(dt.count()); - } - return lats; - } - - /// Return per-request latency or -1.0 for incomplete (preserves indices). - std::vector all_latencies_us() const { - size_t n = std::min({submit_times_.size(), complete_times_.size(), - completed_.size()}); - std::vector lats(n, -1.0); - for (size_t i = 0; i < n; ++i) { - if (!completed_[i]) - continue; - auto dt = std::chrono::duration_cast( - complete_times_[i] - submit_times_[i]); - lats[i] = dt.count(); - } - return lats; - } - - Stats compute_stats() const { - auto lats = latencies_us(); - Stats s; - s.submitted = total_submitted_; - s.completed = lats.size(); - if (s.completed == 0) - return s; - - std::sort(lats.begin(), lats.end()); - - s.min_us = lats.front(); - s.max_us = lats.back(); - s.mean_us = - std::accumulate(lats.begin(), lats.end(), 0.0) / s.completed; - s.p50_us = percentile(lats, 50.0); - s.p90_us = percentile(lats, 90.0); - s.p95_us = percentile(lats, 95.0); - s.p99_us = percentile(lats, 99.0); - - double sum_sq = 0; - for (auto v : lats) - sum_sq += (v - s.mean_us) * (v - s.mean_us); - s.stddev_us = std::sqrt(sum_sq / s.completed); - - auto wall = - std::chrono::duration_cast(run_end_ - run_start_); - s.total_wall_us = wall.count(); - s.throughput_rps = - (s.total_wall_us > 0) ? (s.completed * 1e6 / s.total_wall_us) : 0; - - return s; - } - - void report(std::ostream &os = std::cout) const { - auto s = compute_stats(); - auto all = all_latencies_us(); - - os << "\n"; - os << "================================================================\n"; - os << " Benchmark: " << label_ << "\n"; - os << "================================================================\n"; - os << std::fixed; - os << " Submitted: " << s.submitted << "\n"; - os << " Completed: " << s.completed << "\n"; - if (s.submitted > s.completed) - os << " Timed out: " << (s.submitted - s.completed) << "\n"; - os << std::setprecision(1); - os << " Wall time: " << s.total_wall_us / 1000.0 << " ms\n"; - os << " Throughput: " << s.throughput_rps << " req/s\n"; - os << " ---------------------------------------------------------------\n"; - os << " Latency (us) [completed requests only]\n"; - os << std::setprecision(1); - os << " min = " << std::setw(10) << s.min_us << "\n"; - os << " p50 = " << std::setw(10) << s.p50_us << "\n"; - os << " mean = " << std::setw(10) << s.mean_us << "\n"; - os << " p90 = " << std::setw(10) << s.p90_us << "\n"; - os << " p95 = " << std::setw(10) << s.p95_us << "\n"; - os << " p99 = " << std::setw(10) << s.p99_us << "\n"; - os << " max = " << std::setw(10) << s.max_us << "\n"; - os << " stddev = " << std::setw(10) << s.stddev_us << "\n"; - os << " ---------------------------------------------------------------\n"; - - // Per-request breakdown: only show for small runs (<=50 requests) - if (!all.empty() && all.size() <= 50) { - os << " Per-request latencies (us):\n"; - for (size_t i = 0; i < all.size(); ++i) { - os << " [" << std::setw(4) << i << "] "; - if (all[i] < 0) - os << " TIMEOUT\n"; - else - os << std::setprecision(1) << std::setw(10) << all[i] - << "\n"; - } - } - os << "================================================================\n"; + return lats; + } + + Stats compute_stats() const { + auto lats = latencies_us(); + Stats s; + s.submitted = total_submitted_; + s.completed = lats.size(); + if (s.completed == 0) + return s; + + std::sort(lats.begin(), lats.end()); + + s.min_us = lats.front(); + s.max_us = lats.back(); + s.mean_us = std::accumulate(lats.begin(), lats.end(), 0.0) / s.completed; + s.p50_us = percentile(lats, 50.0); + s.p90_us = percentile(lats, 90.0); + s.p95_us = percentile(lats, 95.0); + s.p99_us = percentile(lats, 99.0); + + double sum_sq = 0; + for (auto v : lats) + sum_sq += (v - s.mean_us) * (v - s.mean_us); + s.stddev_us = std::sqrt(sum_sq / s.completed); + + auto wall = std::chrono::duration_cast(run_end_ - run_start_); + s.total_wall_us = wall.count(); + s.throughput_rps = + (s.total_wall_us > 0) ? (s.completed * 1e6 / s.total_wall_us) : 0; + + return s; + } + + void report(std::ostream &os = std::cout) const { + auto s = compute_stats(); + auto all = all_latencies_us(); + + os << "\n"; + os << "================================================================\n"; + os << " Benchmark: " << label_ << "\n"; + os << "================================================================\n"; + os << std::fixed; + os << " Submitted: " << s.submitted << "\n"; + os << " Completed: " << s.completed << "\n"; + if (s.submitted > s.completed) + os << " Timed out: " << (s.submitted - s.completed) << "\n"; + os << std::setprecision(1); + os << " Wall time: " << s.total_wall_us / 1000.0 << " ms\n"; + os << " Throughput: " << s.throughput_rps << " req/s\n"; + os << " ---------------------------------------------------------------\n"; + os << " Latency (us) [completed requests only]\n"; + os << std::setprecision(1); + os << " min = " << std::setw(10) << s.min_us << "\n"; + os << " p50 = " << std::setw(10) << s.p50_us << "\n"; + os << " mean = " << std::setw(10) << s.mean_us << "\n"; + os << " p90 = " << std::setw(10) << s.p90_us << "\n"; + os << " p95 = " << std::setw(10) << s.p95_us << "\n"; + os << " p99 = " << std::setw(10) << s.p99_us << "\n"; + os << " max = " << std::setw(10) << s.max_us << "\n"; + os << " stddev = " << std::setw(10) << s.stddev_us << "\n"; + os << " ---------------------------------------------------------------\n"; + + // Per-request breakdown: only show for small runs (<=50 requests) + if (!all.empty() && all.size() <= 50) { + os << " Per-request latencies (us):\n"; + for (size_t i = 0; i < all.size(); ++i) { + os << " [" << std::setw(4) << i << "] "; + if (all[i] < 0) + os << " TIMEOUT\n"; + else + os << std::setprecision(1) << std::setw(10) << all[i] << "\n"; + } } + os << "================================================================\n"; + } private: - std::string label_; - size_t total_submitted_; - time_point run_start_{}, run_end_{}; - std::vector submit_times_; - std::vector complete_times_; - std::vector completed_; - - void ensure_capacity(int id) { - size_t needed = static_cast(id) + 1; - if (submit_times_.size() < needed) - submit_times_.resize(needed); - if (complete_times_.size() < needed) - complete_times_.resize(needed); - if (completed_.size() < needed) - completed_.resize(needed, false); - } - - static double percentile(const std::vector &sorted, double p) { - if (sorted.empty()) - return 0; - double idx = (p / 100.0) * (sorted.size() - 1); - size_t lo = static_cast(idx); - size_t hi = std::min(lo + 1, sorted.size() - 1); - double frac = idx - lo; - return sorted[lo] * (1.0 - frac) + sorted[hi] * frac; - } + std::string label_; + size_t total_submitted_; + time_point run_start_{}, run_end_{}; + std::vector submit_times_; + std::vector complete_times_; + std::vector completed_; + + void ensure_capacity(int id) { + size_t needed = static_cast(id) + 1; + if (submit_times_.size() < needed) + submit_times_.resize(needed); + if (complete_times_.size() < needed) + complete_times_.resize(needed); + if (completed_.size() < needed) + completed_.resize(needed, false); + } + + static double percentile(const std::vector &sorted, double p) { + if (sorted.empty()) + return 0; + double idx = (p / 100.0) * (sorted.size() - 1); + size_t lo = static_cast(idx); + size_t hi = std::min(lo + 1, sorted.size() - 1); + double frac = idx - lo; + return sorted[lo] * (1.0 - frac) + sorted[hi] * frac; + } }; } // namespace cudaq::qec::utils diff --git a/libs/qec/include/cudaq/qec/utils/thread_pool.h b/libs/qec/include/cudaq/qec/utils/thread_pool.h index 237c2b32..8fe3b67e 100644 --- a/libs/qec/include/cudaq/qec/utils/thread_pool.h +++ b/libs/qec/include/cudaq/qec/utils/thread_pool.h @@ -11,13 +11,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #if defined(__linux__) #include @@ -28,120 +28,118 @@ namespace cudaq::qec::utils { class ThreadPool { public: - // Option 1: Standard unpinned thread pool - explicit ThreadPool(size_t threads); + // Option 1: Standard unpinned thread pool + explicit ThreadPool(size_t threads); - // Option 2: Pinned thread pool (1 thread per specified core ID) - explicit ThreadPool(const std::vector& core_ids); + // Option 2: Pinned thread pool (1 thread per specified core ID) + explicit ThreadPool(const std::vector &core_ids); - ~ThreadPool(); + ~ThreadPool(); - // Enqueue a job into the pool. - template - auto enqueue(F&& f, Args&&... args) - -> std::future::type>; + // Enqueue a job into the pool. + template + auto enqueue(F &&f, Args &&...args) + -> std::future::type>; private: - void worker_loop(); + void worker_loop(); - std::vector workers; - std::queue> tasks; + std::vector workers; + std::queue> tasks; - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; }; // --- Implementation --- inline void ThreadPool::worker_loop() { - while(true) { - std::function task; - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait(lock, [this] { - return this->stop || !this->tasks.empty(); - }); - - if(this->stop && this->tasks.empty()) { - return; - } - - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - task(); + while (true) { + std::function task; + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait( + lock, [this] { return this->stop || !this->tasks.empty(); }); + + if (this->stop && this->tasks.empty()) { + return; + } + + task = std::move(this->tasks.front()); + this->tasks.pop(); } + task(); + } } // Constructor 1: Unpinned inline ThreadPool::ThreadPool(size_t threads) : stop(false) { - for(size_t i = 0; i < threads; ++i) { - workers.emplace_back([this] { this->worker_loop(); }); - } + for (size_t i = 0; i < threads; ++i) { + workers.emplace_back([this] { this->worker_loop(); }); + } } // Constructor 2: Pinned to specific cores -inline ThreadPool::ThreadPool(const std::vector& core_ids) : stop(false) { - for(size_t i = 0; i < core_ids.size(); ++i) { - int core_id = core_ids[i]; +inline ThreadPool::ThreadPool(const std::vector &core_ids) : stop(false) { + for (size_t i = 0; i < core_ids.size(); ++i) { + int core_id = core_ids[i]; - workers.emplace_back([this, core_id] { - // Apply Thread Affinity (Linux Only) + workers.emplace_back([this, core_id] { + // Apply Thread Affinity (Linux Only) #if defined(__linux__) - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core_id, &cpuset); - - int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - if (rc != 0) { - std::cerr << "[ThreadPool] Warning: Failed to pin thread to core " - << core_id << " (Error " << rc << ")\n"; - } + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + int rc = + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) { + std::cerr << "[ThreadPool] Warning: Failed to pin thread to core " + << core_id << " (Error " << rc << ")\n"; + } #else - // Silent fallback for non-Linux platforms - (void)core_id; + // Silent fallback for non-Linux platforms + (void)core_id; #endif - // Enter the standard execution loop - this->worker_loop(); - }); - } + // Enter the standard execution loop + this->worker_loop(); + }); + } } -template -auto ThreadPool::enqueue(F&& f, Args&&... args) - -> std::future::type> -{ - using return_type = typename std::invoke_result::type; +template +auto ThreadPool::enqueue(F &&f, Args &&...args) + -> std::future::type> { + using return_type = typename std::invoke_result::type; - auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...) - ); + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - if(stop) { - throw std::runtime_error("enqueue on stopped ThreadPool"); - } - tasks.emplace([task](){ (*task)(); }); + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + if (stop) { + throw std::runtime_error("enqueue on stopped ThreadPool"); } - condition.notify_one(); - return res; + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; } inline ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for(std::thread &worker : workers) { - if (worker.joinable()) { - worker.join(); - } + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread &worker : workers) { + if (worker.joinable()) { + worker.join(); } + } } } // namespace cudaq::qec::utils diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index ab4e0e75..3efd9336 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -6,23 +6,25 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/qec/realtime/ai_decoder_service.h" #include +#include #include #include #include -#include #include #include -#define DECODER_CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("CUDA Error in AIDecoderService: ") + cudaGetErrorString(err)); \ - } \ - } while(0) +#define DECODER_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error( \ + std::string("CUDA Error in AIDecoderService: ") + \ + cudaGetErrorString(err)); \ + } \ + } while (0) namespace cudaq::qec { @@ -30,46 +32,47 @@ namespace cudaq::qec { // Gateway Kernels // ============================================================================= -__global__ void gateway_input_kernel( - void** mailbox_slot_ptr, - void* trt_fixed_input, - size_t copy_size_bytes) -{ - void* ring_buffer_data = *mailbox_slot_ptr; - if (ring_buffer_data == nullptr) return; - - const char* src = (const char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); - char* dst = (char*)trt_fixed_input; - - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; i += blockDim.x * gridDim.x) { - dst[i] = src[i]; - } +__global__ void gateway_input_kernel(void **mailbox_slot_ptr, + void *trt_fixed_input, + size_t copy_size_bytes) { + void *ring_buffer_data = *mailbox_slot_ptr; + if (ring_buffer_data == nullptr) + return; + + const char *src = + (const char *)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + char *dst = (char *)trt_fixed_input; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; + i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } } -__global__ void gateway_output_kernel( - void** mailbox_slot_ptr, - const void* trt_fixed_output, - size_t result_size_bytes) -{ - void* ring_buffer_data = *mailbox_slot_ptr; - if (ring_buffer_data == nullptr) return; - - char* dst = (char*)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); - const char* src = (const char*)trt_fixed_output; - - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < result_size_bytes; i += blockDim.x * gridDim.x) { - dst[i] = src[i]; - } - - __syncthreads(); - - if (threadIdx.x == 0 && blockIdx.x == 0) { - auto* response = (cudaq::nvqlink::RPCResponse*)ring_buffer_data; - response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; - response->status = 0; - response->result_len = static_cast(result_size_bytes); - __threadfence_system(); - } +__global__ void gateway_output_kernel(void **mailbox_slot_ptr, + const void *trt_fixed_output, + size_t result_size_bytes) { + void *ring_buffer_data = *mailbox_slot_ptr; + if (ring_buffer_data == nullptr) + return; + + char *dst = (char *)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + const char *src = (const char *)trt_fixed_output; + + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < result_size_bytes; + i += blockDim.x * gridDim.x) { + dst[i] = src[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0 && blockIdx.x == 0) { + auto *response = (cudaq::nvqlink::RPCResponse *)ring_buffer_data; + response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = static_cast(result_size_bytes); + __threadfence_system(); + } } // ============================================================================= @@ -77,22 +80,29 @@ __global__ void gateway_output_kernel( // ============================================================================= static size_t trt_dtype_size(nvinfer1::DataType dtype) { - switch (dtype) { - case nvinfer1::DataType::kFLOAT: return 4; - case nvinfer1::DataType::kHALF: return 2; - case nvinfer1::DataType::kINT8: return 1; - case nvinfer1::DataType::kINT32: return 4; - case nvinfer1::DataType::kINT64: return 8; - case nvinfer1::DataType::kBOOL: return 1; - default: return 4; - } + switch (dtype) { + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + case nvinfer1::DataType::kINT8: + return 1; + case nvinfer1::DataType::kINT32: + return 4; + case nvinfer1::DataType::kINT64: + return 8; + case nvinfer1::DataType::kBOOL: + return 1; + default: + return 4; + } } -static size_t tensor_volume(const nvinfer1::Dims& d) { - size_t v = 1; - for (int i = 0; i < d.nbDims; ++i) - v *= (d.d[i] > 0) ? static_cast(d.d[i]) : 1; - return v; +static size_t tensor_volume(const nvinfer1::Dims &d) { + size_t v = 1; + for (int i = 0; i < d.nbDims; ++i) + v *= (d.d[i] > 0) ? static_cast(d.d[i]) : 1; + return v; } // ============================================================================= @@ -101,223 +111,251 @@ static size_t tensor_volume(const nvinfer1::Dims& d) { AIDecoderService::Logger AIDecoderService::gLogger; -void AIDecoderService::Logger::log(Severity severity, const char* msg) noexcept { - if (severity <= Severity::kWARNING) { - std::printf("[TensorRT] %s\n", msg); - } +void AIDecoderService::Logger::log(Severity severity, + const char *msg) noexcept { + if (severity <= Severity::kWARNING) { + std::printf("[TensorRT] %s\n", msg); + } } -AIDecoderService::AIDecoderService(const std::string& model_path, void** device_mailbox_slot, - const std::string& engine_save_path) +AIDecoderService::AIDecoderService(const std::string &model_path, + void **device_mailbox_slot, + const std::string &engine_save_path) : device_mailbox_slot_(device_mailbox_slot) { - if (std::getenv("SKIP_TRT")) { - input_size_ = 1600 * sizeof(float); - output_size_ = 1600 * sizeof(float); - allocate_resources(); + if (std::getenv("SKIP_TRT")) { + input_size_ = 1600 * sizeof(float); + output_size_ = 1600 * sizeof(float); + allocate_resources(); + } else { + std::string ext = model_path.substr(model_path.find_last_of('.')); + if (ext == ".onnx") { + build_engine_from_onnx(model_path, engine_save_path); } else { - std::string ext = model_path.substr(model_path.find_last_of('.')); - if (ext == ".onnx") { - build_engine_from_onnx(model_path, engine_save_path); - } else { - load_engine(model_path); - } - setup_bindings(); - allocate_resources(); + load_engine(model_path); } + setup_bindings(); + allocate_resources(); + } } AIDecoderService::~AIDecoderService() { - if (graph_exec_) cudaGraphExecDestroy(graph_exec_); - if (d_trt_input_) cudaFree(d_trt_input_); - if (d_trt_output_) cudaFree(d_trt_output_); - for (auto* buf : d_aux_buffers_) cudaFree(buf); + if (graph_exec_) + cudaGraphExecDestroy(graph_exec_); + if (d_trt_input_) + cudaFree(d_trt_input_); + if (d_trt_output_) + cudaFree(d_trt_output_); + for (auto *buf : d_aux_buffers_) + cudaFree(buf); } -void AIDecoderService::load_engine(const std::string& path) { - std::ifstream file(path, std::ios::binary); - if (!file.good()) throw std::runtime_error("Error opening engine file: " + path); +void AIDecoderService::load_engine(const std::string &path) { + std::ifstream file(path, std::ios::binary); + if (!file.good()) + throw std::runtime_error("Error opening engine file: " + path); - file.seekg(0, file.end); - size_t size = file.tellg(); - file.seekg(0, file.beg); + file.seekg(0, file.end); + size_t size = file.tellg(); + file.seekg(0, file.beg); - std::vector engine_data(size); - file.read(engine_data.data(), size); + std::vector engine_data(size); + file.read(engine_data.data(), size); - runtime_.reset(nvinfer1::createInferRuntime(gLogger)); - engine_.reset(runtime_->deserializeCudaEngine(engine_data.data(), size)); - context_.reset(engine_->createExecutionContext()); + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + engine_.reset(runtime_->deserializeCudaEngine(engine_data.data(), size)); + context_.reset(engine_->createExecutionContext()); } -void AIDecoderService::build_engine_from_onnx(const std::string& onnx_path, - const std::string& engine_save_path) { - runtime_.reset(nvinfer1::createInferRuntime(gLogger)); - - auto builder = std::unique_ptr(nvinfer1::createInferBuilder(gLogger)); - auto network = std::unique_ptr(builder->createNetworkV2(0)); - auto config = std::unique_ptr(builder->createBuilderConfig()); - - // Enable FP16 optimization for Grace Blackwell / Hopper - if (builder->platformHasFastFp16()) { - config->setFlag(nvinfer1::BuilderFlag::kFP16); - std::printf("[TensorRT] FP16 precision enabled.\n"); - } else { - std::printf("[TensorRT] Warning: Platform does not support fast FP16. Using FP32.\n"); - } - - auto parser = std::unique_ptr( - nvonnxparser::createParser(*network, gLogger)); - - if (!parser->parseFromFile(onnx_path.c_str(), - static_cast(nvinfer1::ILogger::Severity::kWARNING))) { - throw std::runtime_error("Failed to parse ONNX file: " + onnx_path); +void AIDecoderService::build_engine_from_onnx( + const std::string &onnx_path, const std::string &engine_save_path) { + runtime_.reset(nvinfer1::createInferRuntime(gLogger)); + + auto builder = std::unique_ptr( + nvinfer1::createInferBuilder(gLogger)); + auto network = std::unique_ptr( + builder->createNetworkV2(0)); + auto config = + std::unique_ptr(builder->createBuilderConfig()); + + // Enable FP16 optimization for Grace Blackwell / Hopper + if (builder->platformHasFastFp16()) { + config->setFlag(nvinfer1::BuilderFlag::kFP16); + std::printf("[TensorRT] FP16 precision enabled.\n"); + } else { + std::printf("[TensorRT] Warning: Platform does not support fast FP16. " + "Using FP32.\n"); + } + + auto parser = std::unique_ptr( + nvonnxparser::createParser(*network, gLogger)); + + if (!parser->parseFromFile( + onnx_path.c_str(), + static_cast(nvinfer1::ILogger::Severity::kWARNING))) { + throw std::runtime_error("Failed to parse ONNX file: " + onnx_path); + } + + bool has_dynamic = false; + for (int i = 0; i < network->getNbInputs(); ++i) { + auto *input = network->getInput(i); + auto dims = input->getDimensions(); + for (int d = 0; d < dims.nbDims; ++d) { + if (dims.d[d] <= 0) { + has_dynamic = true; + break; + } } + if (has_dynamic) + break; + } - bool has_dynamic = false; + if (has_dynamic) { + auto *profile = builder->createOptimizationProfile(); for (int i = 0; i < network->getNbInputs(); ++i) { - auto* input = network->getInput(i); - auto dims = input->getDimensions(); - for (int d = 0; d < dims.nbDims; ++d) { - if (dims.d[d] <= 0) { has_dynamic = true; break; } - } - if (has_dynamic) break; - } - - if (has_dynamic) { - auto* profile = builder->createOptimizationProfile(); - for (int i = 0; i < network->getNbInputs(); ++i) { - auto* input = network->getInput(i); - auto dims = input->getDimensions(); - nvinfer1::Dims fixed = dims; - for (int d = 0; d < fixed.nbDims; ++d) { - if (fixed.d[d] <= 0) fixed.d[d] = 1; - } - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, fixed); - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, fixed); - profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, fixed); - std::printf("[TensorRT] Set dynamic input \"%s\" to batch=1\n", input->getName()); - } - config->addOptimizationProfile(profile); + auto *input = network->getInput(i); + auto dims = input->getDimensions(); + nvinfer1::Dims fixed = dims; + for (int d = 0; d < fixed.nbDims; ++d) { + if (fixed.d[d] <= 0) + fixed.d[d] = 1; + } + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kMIN, fixed); + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kOPT, fixed); + profile->setDimensions(input->getName(), + nvinfer1::OptProfileSelector::kMAX, fixed); + std::printf("[TensorRT] Set dynamic input \"%s\" to batch=1\n", + input->getName()); } - - auto plan = std::unique_ptr( - builder->buildSerializedNetwork(*network, *config)); - if (!plan) throw std::runtime_error("Failed to build TRT engine from ONNX"); - - if (!engine_save_path.empty()) { - std::ofstream out(engine_save_path, std::ios::binary); - if (out.good()) { - out.write(static_cast(plan->data()), plan->size()); - std::printf("[TensorRT] Saved engine to: %s\n", engine_save_path.c_str()); - } else { - std::fprintf(stderr, "[TensorRT] Warning: could not save engine to %s\n", - engine_save_path.c_str()); - } + config->addOptimizationProfile(profile); + } + + auto plan = std::unique_ptr( + builder->buildSerializedNetwork(*network, *config)); + if (!plan) + throw std::runtime_error("Failed to build TRT engine from ONNX"); + + if (!engine_save_path.empty()) { + std::ofstream out(engine_save_path, std::ios::binary); + if (out.good()) { + out.write(static_cast(plan->data()), plan->size()); + std::printf("[TensorRT] Saved engine to: %s\n", engine_save_path.c_str()); + } else { + std::fprintf(stderr, "[TensorRT] Warning: could not save engine to %s\n", + engine_save_path.c_str()); } + } - engine_.reset(runtime_->deserializeCudaEngine(plan->data(), plan->size())); - if (!engine_) throw std::runtime_error("Failed to deserialize built engine"); + engine_.reset(runtime_->deserializeCudaEngine(plan->data(), plan->size())); + if (!engine_) + throw std::runtime_error("Failed to deserialize built engine"); - context_.reset(engine_->createExecutionContext()); + context_.reset(engine_->createExecutionContext()); - std::printf("[TensorRT] Built engine from ONNX: %s\n", onnx_path.c_str()); + std::printf("[TensorRT] Built engine from ONNX: %s\n", onnx_path.c_str()); } void AIDecoderService::setup_bindings() { - int num_io = engine_->getNbIOTensors(); - bool found_input = false; - bool found_output = false; - - for (int i = 0; i < num_io; ++i) { - const char* name = engine_->getIOTensorName(i); - auto mode = engine_->getTensorIOMode(name); - auto dims = engine_->getTensorShape(name); - auto dtype = engine_->getTensorDataType(name); - size_t size_bytes = tensor_volume(dims) * trt_dtype_size(dtype); - - bool is_input = (mode == nvinfer1::TensorIOMode::kINPUT); - - std::printf("[TensorRT] Binding %d: \"%s\" %s, %zu bytes\n", - i, name, is_input ? "INPUT" : "OUTPUT", size_bytes); - - TensorBinding binding{name, nullptr, size_bytes, is_input}; - - if (is_input && !found_input) { - input_size_ = size_bytes; - found_input = true; - } else if (!is_input && !found_output) { - output_size_ = size_bytes; - found_output = true; - } - - all_bindings_.push_back(std::move(binding)); + int num_io = engine_->getNbIOTensors(); + bool found_input = false; + bool found_output = false; + + for (int i = 0; i < num_io; ++i) { + const char *name = engine_->getIOTensorName(i); + auto mode = engine_->getTensorIOMode(name); + auto dims = engine_->getTensorShape(name); + auto dtype = engine_->getTensorDataType(name); + size_t size_bytes = tensor_volume(dims) * trt_dtype_size(dtype); + + bool is_input = (mode == nvinfer1::TensorIOMode::kINPUT); + + std::printf("[TensorRT] Binding %d: \"%s\" %s, %zu bytes\n", i, name, + is_input ? "INPUT" : "OUTPUT", size_bytes); + + TensorBinding binding{name, nullptr, size_bytes, is_input}; + + if (is_input && !found_input) { + input_size_ = size_bytes; + found_input = true; + } else if (!is_input && !found_output) { + output_size_ = size_bytes; + found_output = true; } + + all_bindings_.push_back(std::move(binding)); + } } void AIDecoderService::allocate_resources() { - if (all_bindings_.empty()) { - // SKIP_TRT fallback path - if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) - throw std::runtime_error("Failed to allocate TRT Input"); - if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) - throw std::runtime_error("Failed to allocate TRT Output"); - return; - } - - bool assigned_input = false; - bool assigned_output = false; - - for (auto& b : all_bindings_) { - void* buf = nullptr; - if (cudaMalloc(&buf, b.size_bytes) != cudaSuccess) - throw std::runtime_error("Failed to allocate buffer for " + b.name); - cudaMemset(buf, 0, b.size_bytes); - b.d_buffer = buf; - - if (b.is_input && !assigned_input) { - d_trt_input_ = buf; - assigned_input = true; - } else if (!b.is_input && !assigned_output) { - d_trt_output_ = buf; - assigned_output = true; - } else { - d_aux_buffers_.push_back(buf); - } + if (all_bindings_.empty()) { + // SKIP_TRT fallback path + if (cudaMalloc(&d_trt_input_, input_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Input"); + if (cudaMalloc(&d_trt_output_, output_size_) != cudaSuccess) + throw std::runtime_error("Failed to allocate TRT Output"); + return; + } + + bool assigned_input = false; + bool assigned_output = false; + + for (auto &b : all_bindings_) { + void *buf = nullptr; + if (cudaMalloc(&buf, b.size_bytes) != cudaSuccess) + throw std::runtime_error("Failed to allocate buffer for " + b.name); + cudaMemset(buf, 0, b.size_bytes); + b.d_buffer = buf; + + if (b.is_input && !assigned_input) { + d_trt_input_ = buf; + assigned_input = true; + } else if (!b.is_input && !assigned_output) { + d_trt_output_ = buf; + assigned_output = true; + } else { + d_aux_buffers_.push_back(buf); } + } } void AIDecoderService::capture_graph(cudaStream_t stream) { - for (auto& b : all_bindings_) { - context_->setTensorAddress(b.name.c_str(), b.d_buffer); - } - - if (!context_->enqueueV3(stream)) - throw std::runtime_error("TRT enqueueV3 warmup failed in AIDecoderService"); - DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); - - cudaGraph_t graph; - DECODER_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - - gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_input_, input_size_); - if (!context_->enqueueV3(stream)) - throw std::runtime_error("TRT enqueueV3 failed during graph capture in AIDecoderService"); - gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, d_trt_output_, output_size_); - - DECODER_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); - - cudaError_t inst_err = cudaGraphInstantiateWithFlags( - &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); - if (inst_err != cudaSuccess) { - cudaGraphDestroy(graph); - throw std::runtime_error( - std::string("cudaGraphInstantiateWithFlags failed in AIDecoderService: ") - + cudaGetErrorString(inst_err)); - } - - DECODER_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + for (auto &b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); + } + + if (!context_->enqueueV3(stream)) + throw std::runtime_error("TRT enqueueV3 warmup failed in AIDecoderService"); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); + + cudaGraph_t graph; + DECODER_CUDA_CHECK( + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + + gateway_input_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, + d_trt_input_, input_size_); + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 failed during graph capture in AIDecoderService"); + gateway_output_kernel<<<1, 128, 0, stream>>>(device_mailbox_slot_, + d_trt_output_, output_size_); + + DECODER_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { cudaGraphDestroy(graph); - DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); + throw std::runtime_error( + std::string( + "cudaGraphInstantiateWithFlags failed in AIDecoderService: ") + + cudaGetErrorString(inst_err)); + } + + DECODER_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + cudaGraphDestroy(graph); + DECODER_CUDA_CHECK(cudaStreamSynchronize(stream)); } } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index c539fe1e..b9564a3b 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -7,161 +7,175 @@ ******************************************************************************/ #include "cudaq/qec/realtime/ai_predecoder_service.h" -#include #include +#include #include #include -#define SERVICE_CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - throw std::runtime_error(std::string("CUDA Error in AIPreDecoderService: ") + cudaGetErrorString(err)); \ - } \ - } while(0) +#define SERVICE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + throw std::runtime_error( \ + std::string("CUDA Error in AIPreDecoderService: ") + \ + cudaGetErrorString(err)); \ + } \ + } while (0) namespace cudaq::qec { -// System scope for NVLink/PCIe visibility to host (design: no __threadfence_system) +// System scope for NVLink/PCIe visibility to host (design: no +// __threadfence_system) using atomic_int_sys = cuda::atomic; // ============================================================================= // Kernels (single slot 0 only; queue removed for host-side dynamic pool) // ============================================================================= -__global__ void predecoder_signal_ready_kernel(atomic_int_sys* d_ready_flags) -{ - if (threadIdx.x == 0) - d_ready_flags[0].store(1, cuda::std::memory_order_release); +__global__ void predecoder_signal_ready_kernel(atomic_int_sys *d_ready_flags) { + if (threadIdx.x == 0) + d_ready_flags[0].store(1, cuda::std::memory_order_release); } -__global__ void passthrough_copy_kernel(void* dst, const void* src, size_t num_bytes) { - const uint4* src4 = (const uint4*)src; - uint4* dst4 = (uint4*)dst; - size_t n4 = num_bytes / sizeof(uint4); - for (size_t i = threadIdx.x; i < n4; i += blockDim.x) - dst4[i] = src4[i]; - - size_t done = n4 * sizeof(uint4); - for (size_t i = done + threadIdx.x; i < num_bytes; i += blockDim.x) - ((char*)dst)[i] = ((const char*)src)[i]; +__global__ void passthrough_copy_kernel(void *dst, const void *src, + size_t num_bytes) { + const uint4 *src4 = (const uint4 *)src; + uint4 *dst4 = (uint4 *)dst; + size_t n4 = num_bytes / sizeof(uint4); + for (size_t i = threadIdx.x; i < n4; i += blockDim.x) + dst4[i] = src4[i]; + + size_t done = n4 * sizeof(uint4); + for (size_t i = done + threadIdx.x; i < num_bytes; i += blockDim.x) + ((char *)dst)[i] = ((const char *)src)[i]; } // ============================================================================= // Class Implementation // ============================================================================= -AIPreDecoderService::AIPreDecoderService(const std::string& path, void** mailbox, - int /* queue_depth (ignored; always 1) */, - const std::string& engine_save_path) - : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(1) -{ - void* buf = nullptr; - - SERVICE_CUDA_CHECK(cudaHostAlloc(&buf, sizeof(atomic_int_sys), cudaHostAllocMapped)); - h_ready_flags_ = static_cast(buf); - new (h_ready_flags_) atomic_int_sys(0); - - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_ring_ptrs_, sizeof(void*), cudaHostAllocMapped)); - SERVICE_CUDA_CHECK(cudaHostAlloc(&h_predecoder_outputs_, get_output_size(), cudaHostAllocMapped)); - - SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ready_flags_, (void*)h_ready_flags_, 0)); - SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_ring_ptrs_, (void*)h_ring_ptrs_, 0)); - SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void**)&d_predecoder_outputs_, (void*)h_predecoder_outputs_, 0)); +AIPreDecoderService::AIPreDecoderService( + const std::string &path, void **mailbox, + int /* queue_depth (ignored; always 1) */, + const std::string &engine_save_path) + : AIDecoderService(path, mailbox, engine_save_path), queue_depth_(1) { + void *buf = nullptr; + + SERVICE_CUDA_CHECK( + cudaHostAlloc(&buf, sizeof(atomic_int_sys), cudaHostAllocMapped)); + h_ready_flags_ = static_cast(buf); + new (h_ready_flags_) atomic_int_sys(0); + + SERVICE_CUDA_CHECK( + cudaHostAlloc(&h_ring_ptrs_, sizeof(void *), cudaHostAllocMapped)); + SERVICE_CUDA_CHECK(cudaHostAlloc(&h_predecoder_outputs_, get_output_size(), + cudaHostAllocMapped)); + + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_ready_flags_, + (void *)h_ready_flags_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_ring_ptrs_, + (void *)h_ring_ptrs_, 0)); + SERVICE_CUDA_CHECK(cudaHostGetDevicePointer( + (void **)&d_predecoder_outputs_, (void *)h_predecoder_outputs_, 0)); } AIPreDecoderService::~AIPreDecoderService() { - if (h_ready_flags_) { - h_ready_flags_[0].~atomic_int_sys(); - cudaFreeHost((void*)h_ready_flags_); - h_ready_flags_ = nullptr; - d_ready_flags_ = nullptr; - } - if (h_ring_ptrs_) { - cudaFreeHost(h_ring_ptrs_); - h_ring_ptrs_ = nullptr; - } - if (h_predecoder_outputs_) { - cudaFreeHost(h_predecoder_outputs_); - h_predecoder_outputs_ = nullptr; - } + if (h_ready_flags_) { + h_ready_flags_[0].~atomic_int_sys(); + cudaFreeHost((void *)h_ready_flags_); + h_ready_flags_ = nullptr; + d_ready_flags_ = nullptr; + } + if (h_ring_ptrs_) { + cudaFreeHost(h_ring_ptrs_); + h_ring_ptrs_ = nullptr; + } + if (h_predecoder_outputs_) { + cudaFreeHost(h_predecoder_outputs_); + h_predecoder_outputs_ = nullptr; + } } -void AIPreDecoderService::capture_graph(cudaStream_t stream, bool device_launch) { - bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); +void AIPreDecoderService::capture_graph(cudaStream_t stream, + bool device_launch) { + bool skip_trt = (std::getenv("SKIP_TRT") != nullptr); - if (!skip_trt) { - for (auto& b : all_bindings_) { - context_->setTensorAddress(b.name.c_str(), b.d_buffer); - } - if (!context_->enqueueV3(stream)) - throw std::runtime_error("TRT enqueueV3 warmup failed in AIPreDecoderService"); + if (!skip_trt) { + for (auto &b : all_bindings_) { + context_->setTensorAddress(b.name.c_str(), b.d_buffer); } - SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); - - cudaGraph_t graph; - SERVICE_CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); - - if (skip_trt) { - passthrough_copy_kernel<<<1, 256, 0, stream>>>( - d_trt_output_, d_trt_input_, get_input_size()); - } else { - if (!context_->enqueueV3(stream)) - throw std::runtime_error("TRT enqueueV3 failed during graph capture in AIPreDecoderService"); + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 warmup failed in AIPreDecoderService"); + } + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + cudaGraph_t graph; + SERVICE_CUDA_CHECK( + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + + if (skip_trt) { + passthrough_copy_kernel<<<1, 256, 0, stream>>>(d_trt_output_, d_trt_input_, + get_input_size()); + } else { + if (!context_->enqueueV3(stream)) + throw std::runtime_error( + "TRT enqueueV3 failed during graph capture in AIPreDecoderService"); + } + + SERVICE_CUDA_CHECK(cudaMemcpyAsync(d_predecoder_outputs_, d_trt_output_, + get_output_size(), + cudaMemcpyDeviceToDevice, stream)); + + predecoder_signal_ready_kernel<<<1, 1, 0, stream>>>( + static_cast(d_ready_flags_)); + + SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + if (device_launch) { + cudaError_t inst_err = cudaGraphInstantiateWithFlags( + &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error( + std::string("cudaGraphInstantiateWithFlags (DeviceLaunch) FAILED: ") + + cudaGetErrorString(inst_err)); } - - SERVICE_CUDA_CHECK(cudaMemcpyAsync( - d_predecoder_outputs_, d_trt_output_, get_output_size(), - cudaMemcpyDeviceToDevice, stream)); - - predecoder_signal_ready_kernel<<<1, 1, 0, stream>>>( - static_cast(d_ready_flags_)); - - SERVICE_CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); - - if (device_launch) { - cudaError_t inst_err = cudaGraphInstantiateWithFlags( - &graph_exec_, graph, cudaGraphInstantiateFlagDeviceLaunch); - if (inst_err != cudaSuccess) { - cudaGraphDestroy(graph); - throw std::runtime_error( - std::string("cudaGraphInstantiateWithFlags (DeviceLaunch) FAILED: ") - + cudaGetErrorString(inst_err)); - } - SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); - } else { - cudaError_t inst_err = cudaGraphInstantiate(&graph_exec_, graph, 0); - if (inst_err != cudaSuccess) { - cudaGraphDestroy(graph); - throw std::runtime_error( - std::string("cudaGraphInstantiate FAILED: ") - + cudaGetErrorString(inst_err)); - } + SERVICE_CUDA_CHECK(cudaGraphUpload(graph_exec_, stream)); + } else { + cudaError_t inst_err = cudaGraphInstantiate(&graph_exec_, graph, 0); + if (inst_err != cudaSuccess) { + cudaGraphDestroy(graph); + throw std::runtime_error(std::string("cudaGraphInstantiate FAILED: ") + + cudaGetErrorString(inst_err)); } + } - cudaGraphDestroy(graph); - SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); + cudaGraphDestroy(graph); + SERVICE_CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool AIPreDecoderService::poll_next_job(PreDecoderJob& out_job) { - auto* sys_flags = static_cast(h_ready_flags_); - int expected = 1; - // Atomically claim: 1 (Ready) -> 2 (Processing) so we enqueue the job exactly once. - // Use relaxed on failure so spinning doesn't add barriers that delay seeing GPU's store(1). - if (sys_flags[0].compare_exchange_strong(expected, 2, - cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed)) { - out_job.slot_idx = 0; - out_job.ring_buffer_ptr = h_ring_ptrs_[0]; - out_job.inference_data = h_predecoder_outputs_; - return true; - } - return false; +bool AIPreDecoderService::poll_next_job(PreDecoderJob &out_job) { + auto *sys_flags = static_cast(h_ready_flags_); + int expected = 1; + // Atomically claim: 1 (Ready) -> 2 (Processing) so we enqueue the job exactly + // once. Use relaxed on failure so spinning doesn't add barriers that delay + // seeing GPU's store(1). + if (sys_flags[0].compare_exchange_strong(expected, 2, + cuda::std::memory_order_acquire, + cuda::std::memory_order_relaxed)) { + out_job.slot_idx = 0; + out_job.ring_buffer_ptr = h_ring_ptrs_[0]; + out_job.inference_data = h_predecoder_outputs_; + return true; + } + return false; } void AIPreDecoderService::release_job(int /* slot_idx */) { - auto* sys_flags = static_cast(h_ready_flags_); - // PyMatching done: 2 (Processing) -> 0 (Idle) - sys_flags[0].store(0, cuda::std::memory_order_release); + auto *sys_flags = static_cast(h_ready_flags_); + // PyMatching done: 2 (Processing) -> 0 (Idle) + sys_flags[0].store(0, cuda::std::memory_order_release); } } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 93a0fd3a..9c31cfaf 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -15,22 +15,23 @@ * 2. CPU stage callback (PyMatching decode) * 3. Completion callback (timestamp recording) * - * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s] + * Usage: test_realtime_predecoder_w_pymatching [d7|d13|d13_r104|d21|d31] + *[rate_us] [duration_s] ******************************************************************************/ -#include -#include +#include #include -#include +#include +#include #include -#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include +#include +#include #include @@ -38,15 +39,15 @@ #define CUDA_VERSION 13000 #endif -#include "cudaq/realtime/pipeline.h" #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" +#include "cudaq/realtime/pipeline.h" -#include "cudaq/qec/realtime/ai_decoder_service.h" -#include "cudaq/qec/realtime/ai_predecoder_service.h" #include "cudaq/qec/code.h" #include "cudaq/qec/decoder.h" +#include "cudaq/qec/realtime/ai_decoder_service.h" +#include "cudaq/qec/realtime/ai_predecoder_service.h" using namespace cudaq::qec; namespace realtime_ns = cudaq::realtime; @@ -59,19 +60,21 @@ namespace realtime_ns = cudaq::realtime; #elif defined(__aarch64__) #define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") #else -#define QEC_CPU_RELAX() do { } while(0) +#define QEC_CPU_RELAX() \ + do { \ + } while (0) #endif #endif -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ - << " at line " << __LINE__ << std::endl; \ - exit(1); \ - } \ - } while(0) +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " \ + << __LINE__ << std::endl; \ + exit(1); \ + } \ + } while (0) // ============================================================================= // Pipeline Configuration (application-level, no atomics) @@ -80,70 +83,58 @@ namespace realtime_ns = cudaq::realtime; constexpr size_t NUM_SLOTS = 32; struct PipelineConfig { - std::string label; - int distance; - int num_rounds; - int meas_qubits; - int residual_detectors; - std::string onnx_filename; - size_t slot_size; - int num_predecoders; - int num_workers; - - int input_elements() const { return meas_qubits * num_rounds; } - size_t input_bytes() const { return input_elements() * sizeof(int32_t); } - - std::string onnx_path() const { - return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; - } - - std::string engine_path() const { - std::string name = onnx_filename; - auto dot = name.rfind('.'); - if (dot != std::string::npos) - name = name.substr(0, dot); - return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; - } - - static PipelineConfig d7_r7() { - return { - "d7_r7_Z", 7, 7, 72, 336, - "model1_d7_r7_unified_Z_batch1.onnx", - 4096, 16, 16 - }; - } - - static PipelineConfig d13_r13() { - return { - "d13_r13_Z", 13, 13, 252, 2184, - "predecoder_memory_d13_T13_X.onnx", - 16384, 16, 16 - }; - } - - static PipelineConfig d13_r104() { - return { - "d13_r104_Z", 13, 104, 252, 2184, - "predecoder_memory_d13_T104_X.onnx", - 131072, 16, 16 - }; - } - - static PipelineConfig d21_r21() { - return { - "d21_r21_Z", 21, 21, 660, 9240, - "model1_d21_r21_unified_X_batch1.onnx", - 65536, 16, 16 - }; - } - - static PipelineConfig d31_r31() { - return { - "d31_r31_Z", 31, 31, 1440, 29760, - "model1_d31_r31_unified_Z_batch1.onnx", - 262144, 16, 16 - }; - } + std::string label; + int distance; + int num_rounds; + int meas_qubits; + int residual_detectors; + std::string onnx_filename; + size_t slot_size; + int num_predecoders; + int num_workers; + + int input_elements() const { return meas_qubits * num_rounds; } + size_t input_bytes() const { return input_elements() * sizeof(int32_t); } + + std::string onnx_path() const { + return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; + } + + std::string engine_path() const { + std::string name = onnx_filename; + auto dot = name.rfind('.'); + if (dot != std::string::npos) + name = name.substr(0, dot); + return std::string(ONNX_MODEL_DIR) + "/" + name + ".engine"; + } + + static PipelineConfig d7_r7() { + return {"d7_r7_Z", 7, 7, 72, 336, "model1_d7_r7_unified_Z_batch1.onnx", + 4096, 16, 16}; + } + + static PipelineConfig d13_r13() { + return {"d13_r13_Z", 13, 13, 252, 2184, "predecoder_memory_d13_T13_X.onnx", + 16384, 16, 16}; + } + + static PipelineConfig d13_r104() { + return {"d13_r104_Z", 13, 104, + 252, 2184, "predecoder_memory_d13_T104_X.onnx", + 131072, 16, 16}; + } + + static PipelineConfig d21_r21() { + return {"d21_r21_Z", 21, 21, + 660, 9240, "model1_d21_r21_unified_X_batch1.onnx", + 65536, 16, 16}; + } + + static PipelineConfig d31_r31() { + return {"d31_r31_Z", 31, 31, + 1440, 29760, "model1_d31_r31_unified_Z_batch1.onnx", + 262144, 16, 16}; + } }; // ============================================================================= @@ -151,19 +142,20 @@ struct PipelineConfig { // ============================================================================= struct DecoderContext { - std::vector> decoders; - std::atomic next_decoder_idx{0}; - int z_stabilizers = 0; - int spatial_slices = 0; - - cudaq::qec::decoder* acquire_decoder() { - thread_local int my_idx = next_decoder_idx.fetch_add(1, std::memory_order_relaxed); - return decoders[my_idx % decoders.size()].get(); - } - - std::atomic total_decode_us{0}; - std::atomic total_worker_us{0}; - std::atomic decode_count{0}; + std::vector> decoders; + std::atomic next_decoder_idx{0}; + int z_stabilizers = 0; + int spatial_slices = 0; + + cudaq::qec::decoder *acquire_decoder() { + thread_local int my_idx = + next_decoder_idx.fetch_add(1, std::memory_order_relaxed); + return decoders[my_idx % decoders.size()].get(); + } + + std::atomic total_decode_us{0}; + std::atomic total_worker_us{0}; + std::atomic decode_count{0}; }; // ============================================================================= @@ -171,17 +163,18 @@ struct DecoderContext { // ============================================================================= struct PreLaunchCopyCtx { - void* d_trt_input; - size_t input_size; - void** h_ring_ptrs; + void *d_trt_input; + size_t input_size; + void **h_ring_ptrs; }; -static void pre_launch_input_copy(void* user_data, void* slot_dev, cudaStream_t stream) { - auto* ctx = static_cast(user_data); - ctx->h_ring_ptrs[0] = slot_dev; - cudaMemcpyAsync(ctx->d_trt_input, - static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, - ctx->input_size, cudaMemcpyDeviceToDevice, stream); +static void pre_launch_input_copy(void *user_data, void *slot_dev, + cudaStream_t stream) { + auto *ctx = static_cast(user_data); + ctx->h_ring_ptrs[0] = slot_dev; + cudaMemcpyAsync(ctx->d_trt_input, + static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, + ctx->input_size, cudaMemcpyDeviceToDevice, stream); } // ============================================================================= @@ -189,25 +182,25 @@ static void pre_launch_input_copy(void* user_data, void* slot_dev, cudaStream_t // ============================================================================= struct WorkerCtx { - AIPreDecoderService* predecoder; - DecoderContext* decoder_ctx; + AIPreDecoderService *predecoder; + DecoderContext *decoder_ctx; }; struct __attribute__((packed)) DecodeResponse { - int32_t total_corrections; - int32_t converged; + int32_t total_corrections; + int32_t converged; }; // ============================================================================= // Data generation // ============================================================================= -void fill_measurement_payload(int32_t* payload, int input_elements, - std::mt19937& rng, double error_rate = 0.01) { - std::bernoulli_distribution err_dist(error_rate); - for (int i = 0; i < input_elements; ++i) { - payload[i] = err_dist(rng) ? 1 : 0; - } +void fill_measurement_payload(int32_t *payload, int input_elements, + std::mt19937 &rng, double error_rate = 0.01) { + std::bernoulli_distribution err_dist(error_rate); + for (int i = 0; i < input_elements; ++i) { + payload[i] = err_dist(rng) ? 1 : 0; + } } // ============================================================================= @@ -215,429 +208,458 @@ void fill_measurement_payload(int32_t* payload, int input_elements, // ============================================================================= struct StreamingConfig { - int rate_us = 0; - int duration_s = 5; - int warmup_count = 20; + int rate_us = 0; + int duration_s = 5; + int warmup_count = 20; }; // ============================================================================= // Main // ============================================================================= -int main(int argc, char* argv[]) { - using hrclock = std::chrono::high_resolution_clock; - - // --- Parse arguments --- - std::string config_name = "d7"; - StreamingConfig scfg; - - if (argc > 1) - config_name = argv[1]; - if (argc > 2 && std::isdigit(argv[2][0])) - scfg.rate_us = std::stoi(argv[2]); - if (argc > 3 && std::isdigit(argv[3][0])) - scfg.duration_s = std::stoi(argv[3]); - - PipelineConfig config; - if (config_name == "d7") { - config = PipelineConfig::d7_r7(); - } else if (config_name == "d13") { - config = PipelineConfig::d13_r13(); - } else if (config_name == "d13_r104") { - config = PipelineConfig::d13_r104(); - } else if (config_name == "d21") { - config = PipelineConfig::d21_r21(); - } else if (config_name == "d31") { - config = PipelineConfig::d31_r31(); - } else { - std::cerr << "Usage: " << argv[0] << " [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s]\n" - << " d7 - distance 7, 7 rounds (default)\n" - << " d13 - distance 13, 13 rounds\n" - << " d13_r104 - distance 13, 104 rounds\n" - << " d21 - distance 21, 21 rounds\n" - << " d31 - distance 31, 31 rounds\n" - << " rate_us - inter-arrival time in us (0 = open-loop)\n" - << " duration_s - test duration in seconds (default: 5)\n"; - return 1; - } - - std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" - << config.label << ") ---\n"; - std::cout << "[Config] distance=" << config.distance - << " rounds=" << config.num_rounds - << " meas_qubits=" << config.meas_qubits - << " residual_detectors=" << config.residual_detectors - << " input_bytes=" << config.input_bytes() - << " slot_size=" << config.slot_size << "\n"; - - CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); - - // --- Model path --- - std::string engine_file = config.engine_path(); - std::string onnx_file = config.onnx_path(); - std::string model_path; - - std::ifstream engine_probe(engine_file, std::ios::binary); - if (engine_probe.good()) { - engine_probe.close(); - model_path = engine_file; - std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; - } else { - model_path = onnx_file; - std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file << "\n"; - } - - // --- Create PyMatching decoders --- - std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance - << " surface code, Z stabilizers)...\n"; - auto surface_code = cudaq::qec::get_code("surface_code", - {{"distance", config.distance}}); - auto H_z = surface_code->get_parity_z(); - - DecoderContext decoder_ctx; - decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); - decoder_ctx.spatial_slices = config.residual_detectors / decoder_ctx.z_stabilizers; - std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " - << H_z.shape()[1] << "]" - << " z_stabilizers=" << decoder_ctx.z_stabilizers - << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; - - cudaqx::heterogeneous_map pm_params; - pm_params.insert("merge_strategy", std::string("smallest_weight")); - std::cout << "[Setup] Pre-allocating " << config.num_workers - << " PyMatching decoders...\n"; - for (int i = 0; i < config.num_workers; ++i) - decoder_ctx.decoders.push_back( - cudaq::qec::decoder::get("pymatching", H_z, pm_params)); - std::cout << "[Setup] PyMatching decoder pool ready.\n"; - - // --- Create GPU resources (predecoders, streams, mailbox) --- - void** h_mailbox_bank = nullptr; - void** d_mailbox_bank = nullptr; - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, - config.num_predecoders * sizeof(void*), cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void*)); - CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); - - std::vector predecoder_streams; - for (int i = 0; i < config.num_predecoders; ++i) { - cudaStream_t s; - CUDA_CHECK(cudaStreamCreate(&s)); - predecoder_streams.push_back(s); - } - - std::cout << "[Setup] Capturing " << config.num_predecoders - << "x AIPreDecoder Graphs...\n"; - cudaStream_t capture_stream; - CUDA_CHECK(cudaStreamCreate(&capture_stream)); - - std::vector> predecoders; - bool need_save = (model_path == onnx_file); - for (int i = 0; i < config.num_predecoders; ++i) { - std::string save_path = (need_save && i == 0) ? engine_file : ""; - auto pd = std::make_unique( - model_path, d_mailbox_bank + i, 1, save_path); - std::cout << "[Setup] Decoder " << i - << ": input_size=" << pd->get_input_size() - << " output_size=" << pd->get_output_size() << "\n"; - pd->capture_graph(capture_stream, false); - predecoders.push_back(std::move(pd)); - } - - // Pre-launch DMA contexts - std::vector pre_launch_ctxs(config.num_predecoders); - for (int i = 0; i < config.num_predecoders; ++i) { - pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); - pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); - pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); - } - - if (config.num_workers != config.num_predecoders) { - throw std::invalid_argument( - "num_workers (" + std::to_string(config.num_workers) + - ") must equal num_predecoders (" + - std::to_string(config.num_predecoders) + - ") in the current benchmark"); - } - - // Worker contexts (per-worker, application-specific) - std::vector worker_ctxs(config.num_workers); - for (int i = 0; i < config.num_workers; ++i) { - worker_ctxs[i].predecoder = predecoders[i].get(); - worker_ctxs[i].decoder_ctx = &decoder_ctx; - } - - // Build function table for RPC dispatch - std::vector function_ids(config.num_workers); - for (int i = 0; i < config.num_workers; ++i) { - std::string func = "predecode_target_" + std::to_string(i); - function_ids[i] = realtime_ns::fnv1a_hash(func.c_str()); - } - - // ========================================================================= - // Create pipeline (all atomics hidden inside) - // ========================================================================= +int main(int argc, char *argv[]) { + using hrclock = std::chrono::high_resolution_clock; + + // --- Parse arguments --- + std::string config_name = "d7"; + StreamingConfig scfg; + + if (argc > 1) + config_name = argv[1]; + if (argc > 2 && std::isdigit(argv[2][0])) + scfg.rate_us = std::stoi(argv[2]); + if (argc > 3 && std::isdigit(argv[3][0])) + scfg.duration_s = std::stoi(argv[3]); + + PipelineConfig config; + if (config_name == "d7") { + config = PipelineConfig::d7_r7(); + } else if (config_name == "d13") { + config = PipelineConfig::d13_r13(); + } else if (config_name == "d13_r104") { + config = PipelineConfig::d13_r104(); + } else if (config_name == "d21") { + config = PipelineConfig::d21_r21(); + } else if (config_name == "d31") { + config = PipelineConfig::d31_r31(); + } else { + std::cerr << "Usage: " << argv[0] + << " [d7|d13|d13_r104|d21|d31] [rate_us] [duration_s]\n" + << " d7 - distance 7, 7 rounds (default)\n" + << " d13 - distance 13, 13 rounds\n" + << " d13_r104 - distance 13, 104 rounds\n" + << " d21 - distance 21, 21 rounds\n" + << " d31 - distance 31, 31 rounds\n" + << " rate_us - inter-arrival time in us (0 = open-loop)\n" + << " duration_s - test duration in seconds (default: 5)\n"; + return 1; + } + + std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" << config.label + << ") ---\n"; + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " meas_qubits=" << config.meas_qubits + << " residual_detectors=" << config.residual_detectors + << " input_bytes=" << config.input_bytes() + << " slot_size=" << config.slot_size << "\n"; + + CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); + + // --- Model path --- + std::string engine_file = config.engine_path(); + std::string onnx_file = config.onnx_path(); + std::string model_path; + + std::ifstream engine_probe(engine_file, std::ios::binary); + if (engine_probe.good()) { + engine_probe.close(); + model_path = engine_file; + std::cout << "[Setup] Loading cached TRT engine: " << engine_file << "\n"; + } else { + model_path = onnx_file; + std::cout << "[Setup] Building TRT engines from ONNX: " << onnx_file + << "\n"; + } + + // --- Create PyMatching decoders --- + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = + cudaq::qec::get_code("surface_code", {{"distance", config.distance}}); + auto H_z = surface_code->get_parity_z(); + + DecoderContext decoder_ctx; + decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); + decoder_ctx.spatial_slices = + config.residual_detectors / decoder_ctx.z_stabilizers; + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "]" + << " z_stabilizers=" << decoder_ctx.z_stabilizers + << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; + + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + std::cout << "[Setup] Pre-allocating " << config.num_workers + << " PyMatching decoders...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + std::cout << "[Setup] PyMatching decoder pool ready.\n"; + + // --- Create GPU resources (predecoders, streams, mailbox) --- + void **h_mailbox_bank = nullptr; + void **d_mailbox_bank = nullptr; + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, + config.num_predecoders * sizeof(void *), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, config.num_predecoders * sizeof(void *)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); + + std::vector predecoder_streams; + for (int i = 0; i < config.num_predecoders; ++i) { + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + predecoder_streams.push_back(s); + } + + std::cout << "[Setup] Capturing " << config.num_predecoders + << "x AIPreDecoder Graphs...\n"; + cudaStream_t capture_stream; + CUDA_CHECK(cudaStreamCreate(&capture_stream)); + + std::vector> predecoders; + bool need_save = (model_path == onnx_file); + for (int i = 0; i < config.num_predecoders; ++i) { + std::string save_path = (need_save && i == 0) ? engine_file : ""; + auto pd = std::make_unique( + model_path, d_mailbox_bank + i, 1, save_path); + std::cout << "[Setup] Decoder " << i + << ": input_size=" << pd->get_input_size() + << " output_size=" << pd->get_output_size() << "\n"; + pd->capture_graph(capture_stream, false); + predecoders.push_back(std::move(pd)); + } + + // Pre-launch DMA contexts + std::vector pre_launch_ctxs(config.num_predecoders); + for (int i = 0; i < config.num_predecoders; ++i) { + pre_launch_ctxs[i].d_trt_input = predecoders[i]->get_trt_input_ptr(); + pre_launch_ctxs[i].input_size = predecoders[i]->get_input_size(); + pre_launch_ctxs[i].h_ring_ptrs = predecoders[i]->get_host_ring_ptrs(); + } + + if (config.num_workers != config.num_predecoders) { + throw std::invalid_argument( + "num_workers (" + std::to_string(config.num_workers) + + ") must equal num_predecoders (" + + std::to_string(config.num_predecoders) + ") in the current benchmark"); + } + + // Worker contexts (per-worker, application-specific) + std::vector worker_ctxs(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + worker_ctxs[i].predecoder = predecoders[i].get(); + worker_ctxs[i].decoder_ctx = &decoder_ctx; + } + + // Build function table for RPC dispatch + std::vector function_ids(config.num_workers); + for (int i = 0; i < config.num_workers; ++i) { + std::string func = "predecode_target_" + std::to_string(i); + function_ids[i] = realtime_ns::fnv1a_hash(func.c_str()); + } + + // ========================================================================= + // Create pipeline (all atomics hidden inside) + // ========================================================================= + + realtime_ns::PipelineStageConfig stage_cfg; + stage_cfg.num_workers = config.num_workers; + stage_cfg.num_slots = NUM_SLOTS; + stage_cfg.slot_size = config.slot_size; + stage_cfg.cores = {.dispatcher = 2, .consumer = 4, .worker_base = 10}; + + realtime_ns::RealtimePipeline pipeline(stage_cfg); + + // --- GPU stage factory --- + pipeline.set_gpu_stage([&](int w) -> realtime_ns::GpuWorkerResources { + return {.graph_exec = predecoders[w]->get_executable_graph(), + .stream = predecoder_streams[w], + .pre_launch_fn = pre_launch_input_copy, + .pre_launch_data = &pre_launch_ctxs[w], + .function_id = function_ids[w], + .user_context = &worker_ctxs[w]}; + }); + + // --- CPU stage callback (poll + PyMatching decode) --- + // Called repeatedly by the pipeline's worker thread. + // Returns 0 if GPU isn't ready, >0 when a job was processed. + pipeline.set_cpu_stage([](const realtime_ns::CpuStageContext &ctx) -> size_t { + auto *wctx = static_cast(ctx.user_context); + auto *pd = wctx->predecoder; + auto *dctx = wctx->decoder_ctx; + + PreDecoderJob job; + if (!pd->poll_next_job(job)) + return 0; // GPU not done yet - realtime_ns::PipelineStageConfig stage_cfg; - stage_cfg.num_workers = config.num_workers; - stage_cfg.num_slots = NUM_SLOTS; - stage_cfg.slot_size = config.slot_size; - stage_cfg.cores = {.dispatcher = 2, .consumer = 4, .worker_base = 10}; + using hrclock = std::chrono::high_resolution_clock; + auto worker_start = hrclock::now(); - realtime_ns::RealtimePipeline pipeline(stage_cfg); + int total_corrections = 0; + bool all_converged = true; - // --- GPU stage factory --- - pipeline.set_gpu_stage([&](int w) -> realtime_ns::GpuWorkerResources { - return { - .graph_exec = predecoders[w]->get_executable_graph(), - .stream = predecoder_streams[w], - .pre_launch_fn = pre_launch_input_copy, - .pre_launch_data = &pre_launch_ctxs[w], - .function_id = function_ids[w], - .user_context = &worker_ctxs[w] - }; - }); - - // --- CPU stage callback (poll + PyMatching decode) --- - // Called repeatedly by the pipeline's worker thread. - // Returns 0 if GPU isn't ready, >0 when a job was processed. - pipeline.set_cpu_stage([](const realtime_ns::CpuStageContext& ctx) -> size_t { - auto* wctx = static_cast(ctx.user_context); - auto* pd = wctx->predecoder; - auto* dctx = wctx->decoder_ctx; - - PreDecoderJob job; - if (!pd->poll_next_job(job)) - return 0; // GPU not done yet - - using hrclock = std::chrono::high_resolution_clock; - auto worker_start = hrclock::now(); - - int total_corrections = 0; - bool all_converged = true; - - auto decode_start = hrclock::now(); + auto decode_start = hrclock::now(); #if !defined(DISABLE_PYMATCHING) - const int32_t* residual = static_cast(job.inference_data); - auto* my_decoder = dctx->acquire_decoder(); - - cudaqx::tensor syndrome_tensor({(size_t)dctx->z_stabilizers}); - uint8_t* syn_data = syndrome_tensor.data(); - - for (int s = 0; s < dctx->spatial_slices; ++s) { - const int32_t* slice = residual + s * dctx->z_stabilizers; - for (int i = 0; i < dctx->z_stabilizers; ++i) - syn_data[i] = static_cast(slice[i]); - - auto result = my_decoder->decode(syndrome_tensor); - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) total_corrections++; - } + const int32_t *residual = static_cast(job.inference_data); + auto *my_decoder = dctx->acquire_decoder(); + + cudaqx::tensor syndrome_tensor({(size_t)dctx->z_stabilizers}); + uint8_t *syn_data = syndrome_tensor.data(); + + for (int s = 0; s < dctx->spatial_slices; ++s) { + const int32_t *slice = residual + s * dctx->z_stabilizers; + for (int i = 0; i < dctx->z_stabilizers; ++i) + syn_data[i] = static_cast(slice[i]); + + auto result = my_decoder->decode(syndrome_tensor); + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } #endif - auto decode_end = hrclock::now(); - - // Write RPC response into ring buffer slot - DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; - char* response_payload = (char*)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); - std::memcpy(response_payload, &resp, sizeof(resp)); - - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = realtime_ns::RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp); - - pd->release_job(job.slot_idx); - - auto worker_end = hrclock::now(); - auto decode_us = std::chrono::duration_cast( - decode_end - decode_start).count(); - auto worker_us = std::chrono::duration_cast( - worker_end - worker_start).count(); - dctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); - dctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); - dctx->decode_count.fetch_add(1, std::memory_order_relaxed); - - return 1; - }); - - // --- Completion callback (record timestamps) --- - const int max_requests = 500000; - std::vector submit_ts(max_requests); - std::vector complete_ts(max_requests); - std::vector completed(max_requests, 0); - - pipeline.set_completion_handler([&](const realtime_ns::Completion& c) { - if (c.request_id < static_cast(max_requests)) { - complete_ts[c.request_id] = hrclock::now(); - completed[c.request_id] = c.success; - } - }); - - // ========================================================================= - // Start pipeline and run producer - // ========================================================================= - - std::cout << "[Setup] Starting pipeline...\n"; - auto injector = pipeline.create_injector(); - pipeline.start(); - - auto run_deadline = std::chrono::steady_clock::now() - + std::chrono::seconds(scfg.duration_s); - - std::string rate_label = (scfg.rate_us > 0) - ? std::to_string(scfg.rate_us) + " us" : "open-loop"; - - std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" - << " Rate: " << rate_label << "\n" - << " Duration: " << scfg.duration_s << " s\n" - << " Warmup: " << scfg.warmup_count << " requests\n" - << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" - << " Max reqs: " << max_requests << "\n\n" << std::flush; - - // --- Producer loop (runs on main thread) --- - std::mt19937 rng(42); - const size_t payload_bytes = std::min( - config.input_bytes(), - config.slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); - std::vector payload_buf(CUDAQ_RPC_HEADER_SIZE + payload_bytes); - int req_id = 0; - int target = 0; - - while (std::chrono::steady_clock::now() < run_deadline - && req_id < max_requests) { - - int32_t* payload = reinterpret_cast( - payload_buf.data() + CUDAQ_RPC_HEADER_SIZE); - int fill_elems = static_cast(payload_bytes / sizeof(int32_t)); - fill_measurement_payload(payload, fill_elems, rng, 0.01); - - std::string func = "predecode_target_" + std::to_string(target); - uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); - - submit_ts[req_id] = hrclock::now(); - injector.submit(fid, payload, static_cast(payload_bytes), - static_cast(req_id)); - - target = (target + 1) % config.num_predecoders; - req_id++; - - if (scfg.rate_us > 0) { - auto target_time = submit_ts[req_id - 1] - + std::chrono::microseconds(scfg.rate_us); - while (hrclock::now() < target_time) - QEC_CPU_RELAX(); - } + auto decode_end = hrclock::now(); + + // Write RPC response into ring buffer slot + DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; + char *response_payload = + (char *)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); + std::memcpy(response_payload, &resp, sizeof(resp)); + + auto *header = static_cast(job.ring_buffer_ptr); + header->magic = realtime_ns::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp); + + pd->release_job(job.slot_idx); + + auto worker_end = hrclock::now(); + auto decode_us = std::chrono::duration_cast( + decode_end - decode_start) + .count(); + auto worker_us = std::chrono::duration_cast( + worker_end - worker_start) + .count(); + dctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); + dctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); + dctx->decode_count.fetch_add(1, std::memory_order_relaxed); + + return 1; + }); + + // --- Completion callback (record timestamps) --- + const int max_requests = 500000; + std::vector submit_ts(max_requests); + std::vector complete_ts(max_requests); + std::vector completed(max_requests, 0); + + pipeline.set_completion_handler([&](const realtime_ns::Completion &c) { + if (c.request_id < static_cast(max_requests)) { + complete_ts[c.request_id] = hrclock::now(); + completed[c.request_id] = c.success; + } + }); + + // ========================================================================= + // Start pipeline and run producer + // ========================================================================= + + std::cout << "[Setup] Starting pipeline...\n"; + auto injector = pipeline.create_injector(); + pipeline.start(); + + auto run_deadline = + std::chrono::steady_clock::now() + std::chrono::seconds(scfg.duration_s); + + std::string rate_label = + (scfg.rate_us > 0) ? std::to_string(scfg.rate_us) + " us" : "open-loop"; + + std::cout << "\n[Stream] Starting streaming test (" << config.label << ")\n" + << " Rate: " << rate_label << "\n" + << " Duration: " << scfg.duration_s << " s\n" + << " Warmup: " << scfg.warmup_count << " requests\n" + << " Predecoders:" << config.num_predecoders + << " (dedicated streams)\n" + << " Max reqs: " << max_requests << "\n\n" + << std::flush; + + // --- Producer loop (runs on main thread) --- + std::mt19937 rng(42); + const size_t payload_bytes = + std::min(config.input_bytes(), + config.slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); + std::vector payload_buf(CUDAQ_RPC_HEADER_SIZE + payload_bytes); + int req_id = 0; + int target = 0; + + while (std::chrono::steady_clock::now() < run_deadline && + req_id < max_requests) { + + int32_t *payload = + reinterpret_cast(payload_buf.data() + CUDAQ_RPC_HEADER_SIZE); + int fill_elems = static_cast(payload_bytes / sizeof(int32_t)); + fill_measurement_payload(payload, fill_elems, rng, 0.01); + + std::string func = "predecode_target_" + std::to_string(target); + uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); + + submit_ts[req_id] = hrclock::now(); + injector.submit(fid, payload, static_cast(payload_bytes), + static_cast(req_id)); + + target = (target + 1) % config.num_predecoders; + req_id++; + + if (scfg.rate_us > 0) { + auto target_time = + submit_ts[req_id - 1] + std::chrono::microseconds(scfg.rate_us); + while (hrclock::now() < target_time) + QEC_CPU_RELAX(); } + } - // --- Shutdown --- - pipeline.stop(); + // --- Shutdown --- + pipeline.stop(); - // ========================================================================= - // Report - // ========================================================================= + // ========================================================================= + // Report + // ========================================================================= - auto final_stats = pipeline.stats(); - uint64_t nsub = final_stats.submitted; - uint64_t ncomp = final_stats.completed; + auto final_stats = pipeline.stats(); + uint64_t nsub = final_stats.submitted; + uint64_t ncomp = final_stats.completed; - if (ncomp < nsub) - std::cerr << " [WARN] " << (nsub - ncomp) - << " requests did not complete.\n"; + if (ncomp < nsub) + std::cerr << " [WARN] " << (nsub - ncomp) + << " requests did not complete.\n"; - int warmup = std::min(scfg.warmup_count, static_cast(nsub)); - std::vector latencies; - latencies.reserve(nsub - warmup); + int warmup = std::min(scfg.warmup_count, static_cast(nsub)); + std::vector latencies; + latencies.reserve(nsub - warmup); - for (uint64_t i = warmup; i < nsub; ++i) { - if (!completed[i]) continue; - auto dt = std::chrono::duration_cast>( + for (uint64_t i = warmup; i < nsub; ++i) { + if (!completed[i]) + continue; + auto dt = + std::chrono::duration_cast>( complete_ts[i] - submit_ts[i]); - latencies.push_back(dt.count()); - } - - std::sort(latencies.begin(), latencies.end()); - - auto pct = [&](double p) -> double { - if (latencies.empty()) return 0; - double idx = (p / 100.0) * (latencies.size() - 1); - size_t lo = (size_t)idx; - size_t hi = std::min(lo + 1, latencies.size() - 1); - double frac = idx - lo; - return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; - }; - - double mean = 0; - for (auto v : latencies) mean += v; - mean = latencies.empty() ? 0 : mean / latencies.size(); - - double stddev = 0; - for (auto v : latencies) stddev += (v - mean) * (v - mean); - stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); - - auto wall_us = std::chrono::duration_cast>( - std::chrono::steady_clock::now() - - (run_deadline - std::chrono::seconds(scfg.duration_s))).count(); - double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; - - double actual_rate = (nsub > 1) - ? std::chrono::duration_cast>( - submit_ts[nsub - 1] - submit_ts[0]).count() / (nsub - 1) - : 0; - - std::cout << std::fixed; - std::cout << "\n================================================================\n"; - std::cout << " Streaming Benchmark: " << config.label << "\n"; - std::cout << "================================================================\n"; - std::cout << " Submitted: " << nsub << "\n"; - std::cout << " Completed: " << ncomp << "\n"; - std::cout << std::setprecision(1); - std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; - std::cout << " Throughput: " << throughput << " req/s\n"; - std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate << " us/req\n"; - std::cout << " Backpressure stalls:" << std::setw(8) - << final_stats.backpressure_stalls << "\n"; - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Latency (us) [steady-state, " << latencies.size() - << " requests after " << warmup << " warmup]\n"; - if (!latencies.empty()) { - std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; - std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; - std::cout << " mean = " << std::setw(10) << mean << "\n"; - std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; - std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; - std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; - std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; - std::cout << " stddev = " << std::setw(10) << stddev << "\n"; - } - - int n_decoded = decoder_ctx.decode_count.load(); - if (n_decoded > 0) { - double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; - double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; - double avg_overhead = avg_worker - avg_decode; - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; - std::cout << " PyMatching decode: " << std::setw(9) << avg_decode << " us\n"; - std::cout << " Total worker: " << std::setw(9) << avg_worker << " us\n"; - std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us\n"; - } - - std::cout << " ---------------------------------------------------------------\n"; - std::cout << " Host dispatcher processed " << final_stats.dispatched << " packets.\n"; - std::cout << "================================================================\n"; - - // --- Cleanup --- - std::cout << "[Teardown] Shutting down...\n"; - CUDA_CHECK(cudaStreamSynchronize(capture_stream)); - for (auto& s : predecoder_streams) { - cudaStreamSynchronize(s); - cudaStreamDestroy(s); - } - cudaFreeHost(h_mailbox_bank); - cudaStreamDestroy(capture_stream); - - std::cout << "Done.\n"; - return 0; + latencies.push_back(dt.count()); + } + + std::sort(latencies.begin(), latencies.end()); + + auto pct = [&](double p) -> double { + if (latencies.empty()) + return 0; + double idx = (p / 100.0) * (latencies.size() - 1); + size_t lo = (size_t)idx; + size_t hi = std::min(lo + 1, latencies.size() - 1); + double frac = idx - lo; + return latencies[lo] * (1.0 - frac) + latencies[hi] * frac; + }; + + double mean = 0; + for (auto v : latencies) + mean += v; + mean = latencies.empty() ? 0 : mean / latencies.size(); + + double stddev = 0; + for (auto v : latencies) + stddev += (v - mean) * (v - mean); + stddev = latencies.empty() ? 0 : std::sqrt(stddev / latencies.size()); + + auto wall_us = + std::chrono::duration_cast>( + std::chrono::steady_clock::now() - + (run_deadline - std::chrono::seconds(scfg.duration_s))) + .count(); + double throughput = (wall_us > 0) ? (ncomp * 1e6 / wall_us) : 0; + + double actual_rate = (nsub > 1) + ? std::chrono::duration_cast< + std::chrono::duration>( + submit_ts[nsub - 1] - submit_ts[0]) + .count() / + (nsub - 1) + : 0; + + std::cout << std::fixed; + std::cout + << "\n================================================================\n"; + std::cout << " Streaming Benchmark: " << config.label << "\n"; + std::cout + << "================================================================\n"; + std::cout << " Submitted: " << nsub << "\n"; + std::cout << " Completed: " << ncomp << "\n"; + std::cout << std::setprecision(1); + std::cout << " Wall time: " << wall_us / 1000.0 << " ms\n"; + std::cout << " Throughput: " << throughput << " req/s\n"; + std::cout << " Actual arrival rate:" << std::setw(8) << actual_rate + << " us/req\n"; + std::cout << " Backpressure stalls:" << std::setw(8) + << final_stats.backpressure_stalls << "\n"; + std::cout + << " ---------------------------------------------------------------\n"; + std::cout << " Latency (us) [steady-state, " << latencies.size() + << " requests after " << warmup << " warmup]\n"; + if (!latencies.empty()) { + std::cout << " min = " << std::setw(10) << latencies.front() << "\n"; + std::cout << " p50 = " << std::setw(10) << pct(50) << "\n"; + std::cout << " mean = " << std::setw(10) << mean << "\n"; + std::cout << " p90 = " << std::setw(10) << pct(90) << "\n"; + std::cout << " p95 = " << std::setw(10) << pct(95) << "\n"; + std::cout << " p99 = " << std::setw(10) << pct(99) << "\n"; + std::cout << " max = " << std::setw(10) << latencies.back() << "\n"; + std::cout << " stddev = " << std::setw(10) << stddev << "\n"; + } + + int n_decoded = decoder_ctx.decode_count.load(); + if (n_decoded > 0) { + double avg_decode = (double)decoder_ctx.total_decode_us.load() / n_decoded; + double avg_worker = (double)decoder_ctx.total_worker_us.load() / n_decoded; + double avg_overhead = avg_worker - avg_decode; + std::cout + << " " + "---------------------------------------------------------------\n"; + std::cout << " Worker-level averages (" << n_decoded << " completed):\n"; + std::cout << " PyMatching decode: " << std::setw(9) << avg_decode + << " us\n"; + std::cout << " Total worker: " << std::setw(9) << avg_worker + << " us\n"; + std::cout << " Worker overhead: " << std::setw(9) << avg_overhead + << " us\n"; + } + + std::cout + << " ---------------------------------------------------------------\n"; + std::cout << " Host dispatcher processed " << final_stats.dispatched + << " packets.\n"; + std::cout + << "================================================================\n"; + + // --- Cleanup --- + std::cout << "[Teardown] Shutting down...\n"; + CUDA_CHECK(cudaStreamSynchronize(capture_stream)); + for (auto &s : predecoder_streams) { + cudaStreamSynchronize(s); + cudaStreamDestroy(s); + } + cudaFreeHost(h_mailbox_bank); + cudaStreamDestroy(capture_stream); + + std::cout << "Done.\n"; + return 0; } diff --git a/libs/qec/unittests/test_realtime_pipeline.cu b/libs/qec/unittests/test_realtime_pipeline.cu index 6c25de9e..04f03be1 100644 --- a/libs/qec/unittests/test_realtime_pipeline.cu +++ b/libs/qec/unittests/test_realtime_pipeline.cu @@ -6,19 +6,19 @@ * the terms of the Apache License 2.0 which accompanies this distribution. ******************************************************************************/ -#include -#include -#include +#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include #include -#include -#include -#include #include +#include #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" @@ -48,58 +48,58 @@ static constexpr uint32_t kTestFunctionId = rt::fnv1a_hash("test_predecoder"); // ============================================================================ struct PreLaunchCopyCtx { - void* d_trt_input; - size_t input_size; - void** h_ring_ptrs; + void *d_trt_input; + size_t input_size; + void **h_ring_ptrs; }; -static void pre_launch_input_copy(void* user_data, void* slot_dev, +static void pre_launch_input_copy(void *user_data, void *slot_dev, cudaStream_t stream) { - auto* ctx = static_cast(user_data); - ctx->h_ring_ptrs[0] = slot_dev; - cudaMemcpyAsync(ctx->d_trt_input, - static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, - ctx->input_size, cudaMemcpyDeviceToDevice, stream); + auto *ctx = static_cast(user_data); + ctx->h_ring_ptrs[0] = slot_dev; + cudaMemcpyAsync(ctx->d_trt_input, + static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, + ctx->input_size, cudaMemcpyDeviceToDevice, stream); } // ============================================================================ // Ring buffer helpers (mapped pinned memory) // ============================================================================ -static bool allocate_mapped_buffer(size_t size, uint8_t** host_out, - uint8_t** dev_out) { - void* h = nullptr; - if (cudaHostAlloc(&h, size, cudaHostAllocMapped) != cudaSuccess) - return false; - void* d = nullptr; - if (cudaHostGetDevicePointer(&d, h, 0) != cudaSuccess) { - cudaFreeHost(h); - return false; - } - std::memset(h, 0, size); - *host_out = static_cast(h); - *dev_out = static_cast(d); - return true; +static bool allocate_mapped_buffer(size_t size, uint8_t **host_out, + uint8_t **dev_out) { + void *h = nullptr; + if (cudaHostAlloc(&h, size, cudaHostAllocMapped) != cudaSuccess) + return false; + void *d = nullptr; + if (cudaHostGetDevicePointer(&d, h, 0) != cudaSuccess) { + cudaFreeHost(h); + return false; + } + std::memset(h, 0, size); + *host_out = static_cast(h); + *dev_out = static_cast(d); + return true; } -static void free_mapped_buffer(uint8_t* host_ptr) { - if (host_ptr) - cudaFreeHost(host_ptr); +static void free_mapped_buffer(uint8_t *host_ptr) { + if (host_ptr) + cudaFreeHost(host_ptr); } // ============================================================================ // Write an RPC request (RPCHeader + payload) into a mapped buffer slot // ============================================================================ -static void write_rpc_slot(uint8_t* slot_host, uint32_t function_id, - const void* payload, size_t payload_len) { - rt::RPCHeader hdr; - hdr.magic = rt::RPC_MAGIC_REQUEST; - hdr.function_id = function_id; - hdr.arg_len = static_cast(payload_len); - std::memcpy(slot_host, &hdr, sizeof(hdr)); - if (payload && payload_len > 0) - std::memcpy(slot_host + sizeof(hdr), payload, payload_len); +static void write_rpc_slot(uint8_t *slot_host, uint32_t function_id, + const void *payload, size_t payload_len) { + rt::RPCHeader hdr; + hdr.magic = rt::RPC_MAGIC_REQUEST; + hdr.function_id = function_id; + hdr.arg_len = static_cast(payload_len); + std::memcpy(slot_host, &hdr, sizeof(hdr)); + if (payload && payload_len > 0) + std::memcpy(slot_host + sizeof(hdr), payload, payload_len); } // ============================================================================ @@ -108,87 +108,83 @@ static void write_rpc_slot(uint8_t* slot_host, uint32_t function_id, class RealtimePipelineTest : public ::testing::Test { protected: - void SetUp() override { - setenv("SKIP_TRT", "1", 1); - - ASSERT_TRUE(allocate_mapped_buffer( - kNumSlots * sizeof(uint64_t), &rx_flags_host_, &rx_flags_dev_)); - ASSERT_TRUE(allocate_mapped_buffer( - kNumSlots * sizeof(uint64_t), &tx_flags_host_, &tx_flags_dev_)); - ASSERT_TRUE(allocate_mapped_buffer( - kNumSlots * kSlotSize, &rx_data_host_, &rx_data_dev_)); - ASSERT_TRUE(allocate_mapped_buffer( - kNumSlots * kSlotSize, &tx_data_host_, &tx_data_dev_)); - - CUDA_CHECK(cudaHostAlloc(&mailbox_bank_host_, - kMaxWorkers * sizeof(void*), - cudaHostAllocMapped)); - std::memset(mailbox_bank_host_, 0, kMaxWorkers * sizeof(void*)); - CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&mailbox_bank_dev_), - mailbox_bank_host_, 0)); - - CUDA_CHECK(cudaStreamCreate(&stream_)); - } - - void TearDown() override { - if (stream_) - cudaStreamDestroy(stream_); - if (mailbox_bank_host_) - cudaFreeHost(mailbox_bank_host_); - free_mapped_buffer(rx_flags_host_); - free_mapped_buffer(tx_flags_host_); - free_mapped_buffer(rx_data_host_); - free_mapped_buffer(tx_data_host_); - unsetenv("SKIP_TRT"); - } - - std::unique_ptr - create_predecoder(int mailbox_idx) { - auto pd = std::make_unique( - "dummy.onnx", - reinterpret_cast(mailbox_bank_dev_ + mailbox_idx), - 1); - pd->capture_graph(stream_, false); - EXPECT_EQ(cudaStreamSynchronize(stream_), cudaSuccess); - return pd; + void SetUp() override { + setenv("SKIP_TRT", "1", 1); + + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * sizeof(uint64_t), + &rx_flags_host_, &rx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * sizeof(uint64_t), + &tx_flags_host_, &tx_flags_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * kSlotSize, &rx_data_host_, + &rx_data_dev_)); + ASSERT_TRUE(allocate_mapped_buffer(kNumSlots * kSlotSize, &tx_data_host_, + &tx_data_dev_)); + + CUDA_CHECK(cudaHostAlloc(&mailbox_bank_host_, kMaxWorkers * sizeof(void *), + cudaHostAllocMapped)); + std::memset(mailbox_bank_host_, 0, kMaxWorkers * sizeof(void *)); + CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&mailbox_bank_dev_), mailbox_bank_host_, 0)); + + CUDA_CHECK(cudaStreamCreate(&stream_)); + } + + void TearDown() override { + if (stream_) + cudaStreamDestroy(stream_); + if (mailbox_bank_host_) + cudaFreeHost(mailbox_bank_host_); + free_mapped_buffer(rx_flags_host_); + free_mapped_buffer(tx_flags_host_); + free_mapped_buffer(rx_data_host_); + free_mapped_buffer(tx_data_host_); + unsetenv("SKIP_TRT"); + } + + std::unique_ptr create_predecoder(int mailbox_idx) { + auto pd = std::make_unique( + "dummy.onnx", + reinterpret_cast(mailbox_bank_dev_ + mailbox_idx), 1); + pd->capture_graph(stream_, false); + EXPECT_EQ(cudaStreamSynchronize(stream_), cudaSuccess); + return pd; + } + + void submit_rpc_to_slot(size_t slot, uint32_t function_id, + const void *payload, size_t payload_len) { + uint8_t *slot_host = rx_data_host_ + slot * kSlotSize; + write_rpc_slot(slot_host, function_id, payload, payload_len); + auto *flags = reinterpret_cast(rx_flags_host_); + flags[slot].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); + } + + bool wait_ready_flag(AIPreDecoderService *pd, int timeout_ms = 2000) { + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + auto *flags = pd->get_host_ready_flags(); + int val = flags[0].load(cuda::std::memory_order_acquire); + if (val >= 1) + return true; + usleep(100); } - - void submit_rpc_to_slot(size_t slot, uint32_t function_id, - const void* payload, size_t payload_len) { - uint8_t* slot_host = rx_data_host_ + slot * kSlotSize; - write_rpc_slot(slot_host, function_id, payload, payload_len); - auto* flags = reinterpret_cast(rx_flags_host_); - flags[slot].store(reinterpret_cast(slot_host), - cuda::std::memory_order_release); - } - - bool wait_ready_flag(AIPreDecoderService* pd, int timeout_ms = 2000) { - auto deadline = std::chrono::steady_clock::now() + - std::chrono::milliseconds(timeout_ms); - while (std::chrono::steady_clock::now() < deadline) { - auto* flags = pd->get_host_ready_flags(); - int val = flags[0].load(cuda::std::memory_order_acquire); - if (val >= 1) - return true; - usleep(100); - } - return false; - } - - static constexpr size_t kMaxWorkers = 8; - - uint8_t* rx_flags_host_ = nullptr; - uint8_t* rx_flags_dev_ = nullptr; - uint8_t* tx_flags_host_ = nullptr; - uint8_t* tx_flags_dev_ = nullptr; - uint8_t* rx_data_host_ = nullptr; - uint8_t* rx_data_dev_ = nullptr; - uint8_t* tx_data_host_ = nullptr; - uint8_t* tx_data_dev_ = nullptr; - void** mailbox_bank_host_ = nullptr; - void** mailbox_bank_dev_ = nullptr; - cudaStream_t stream_ = nullptr; + return false; + } + + static constexpr size_t kMaxWorkers = 8; + + uint8_t *rx_flags_host_ = nullptr; + uint8_t *rx_flags_dev_ = nullptr; + uint8_t *tx_flags_host_ = nullptr; + uint8_t *tx_flags_dev_ = nullptr; + uint8_t *rx_data_host_ = nullptr; + uint8_t *rx_data_dev_ = nullptr; + uint8_t *tx_data_host_ = nullptr; + uint8_t *tx_data_dev_ = nullptr; + void **mailbox_bank_host_ = nullptr; + void **mailbox_bank_dev_ = nullptr; + cudaStream_t stream_ = nullptr; }; // ============================================================================ @@ -196,19 +192,19 @@ protected: // ============================================================================ TEST_F(RealtimePipelineTest, SkipTrtSizes) { - AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); - EXPECT_EQ(svc.get_input_size(), kSkipTrtBytes); - EXPECT_EQ(svc.get_output_size(), kSkipTrtBytes); + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_input_size(), kSkipTrtBytes); + EXPECT_EQ(svc.get_output_size(), kSkipTrtBytes); } TEST_F(RealtimePipelineTest, SkipTrtBuffersAllocated) { - AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); - EXPECT_NE(svc.get_trt_input_ptr(), nullptr); + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_NE(svc.get_trt_input_ptr(), nullptr); } TEST_F(RealtimePipelineTest, SkipTrtGraphExecNull_BeforeCapture) { - AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); - EXPECT_EQ(svc.get_executable_graph(), nullptr); + AIDecoderService svc("dummy.onnx", mailbox_bank_dev_); + EXPECT_EQ(svc.get_executable_graph(), nullptr); } // ============================================================================ @@ -216,51 +212,51 @@ TEST_F(RealtimePipelineTest, SkipTrtGraphExecNull_BeforeCapture) { // ============================================================================ TEST_F(RealtimePipelineTest, PreDecoderConstruction) { - auto pd = create_predecoder(0); - EXPECT_NE(pd->get_host_ready_flags(), nullptr); - EXPECT_NE(pd->get_host_ring_ptrs(), nullptr); - EXPECT_EQ(pd->get_queue_depth(), 1); - EXPECT_EQ(pd->get_input_size(), kSkipTrtBytes); - EXPECT_EQ(pd->get_output_size(), kSkipTrtBytes); + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_host_ready_flags(), nullptr); + EXPECT_NE(pd->get_host_ring_ptrs(), nullptr); + EXPECT_EQ(pd->get_queue_depth(), 1); + EXPECT_EQ(pd->get_input_size(), kSkipTrtBytes); + EXPECT_EQ(pd->get_output_size(), kSkipTrtBytes); } TEST_F(RealtimePipelineTest, PreDecoderGraphCaptured) { - auto pd = create_predecoder(0); - EXPECT_NE(pd->get_executable_graph(), nullptr); + auto pd = create_predecoder(0); + EXPECT_NE(pd->get_executable_graph(), nullptr); } TEST_F(RealtimePipelineTest, PollReturnsFalseWhenIdle) { - auto pd = create_predecoder(0); - PreDecoderJob job{}; - EXPECT_FALSE(pd->poll_next_job(job)); + auto pd = create_predecoder(0); + PreDecoderJob job{}; + EXPECT_FALSE(pd->poll_next_job(job)); } TEST_F(RealtimePipelineTest, PollAndRelease) { - auto pd = create_predecoder(0); + auto pd = create_predecoder(0); - auto* flags = pd->get_host_ready_flags(); - flags[0].store(1, cuda::std::memory_order_release); + auto *flags = pd->get_host_ready_flags(); + flags[0].store(1, cuda::std::memory_order_release); - PreDecoderJob job{}; - EXPECT_TRUE(pd->poll_next_job(job)); - EXPECT_EQ(job.slot_idx, 0); - EXPECT_NE(job.inference_data, nullptr); + PreDecoderJob job{}; + EXPECT_TRUE(pd->poll_next_job(job)); + EXPECT_EQ(job.slot_idx, 0); + EXPECT_NE(job.inference_data, nullptr); - int val = flags[0].load(cuda::std::memory_order_acquire); - EXPECT_EQ(val, 2); + int val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(val, 2); - pd->release_job(0); - val = flags[0].load(cuda::std::memory_order_acquire); - EXPECT_EQ(val, 0); + pd->release_job(0); + val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(val, 0); } TEST_F(RealtimePipelineTest, GraphLaunchableFromHost) { - auto pd = create_predecoder(0); - cudaGraphExec_t exec = pd->get_executable_graph(); - ASSERT_NE(exec, nullptr); + auto pd = create_predecoder(0); + cudaGraphExec_t exec = pd->get_executable_graph(); + ASSERT_NE(exec, nullptr); - CUDA_CHECK(cudaGraphLaunch(exec, stream_)); - CUDA_CHECK(cudaStreamSynchronize(stream_)); + CUDA_CHECK(cudaGraphLaunch(exec, stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); } // ============================================================================ @@ -274,115 +270,111 @@ TEST_F(RealtimePipelineTest, GraphLaunchableFromHost) { class CorrectnessTest : public RealtimePipelineTest { protected: - void run_passthrough(AIPreDecoderService* pd, int mailbox_idx, - const float* payload, size_t num_floats, - float* output) { - size_t payload_bytes = num_floats * sizeof(float); - ASSERT_LE(payload_bytes, kSkipTrtBytes); + void run_passthrough(AIPreDecoderService *pd, int mailbox_idx, + const float *payload, size_t num_floats, float *output) { + size_t payload_bytes = num_floats * sizeof(float); + ASSERT_LE(payload_bytes, kSkipTrtBytes); - uint8_t* slot_host = rx_data_host_; - write_rpc_slot(slot_host, kTestFunctionId, payload, payload_bytes); + uint8_t *slot_host = rx_data_host_; + write_rpc_slot(slot_host, kTestFunctionId, payload, payload_bytes); - ptrdiff_t offset = slot_host - rx_data_host_; - void* slot_dev = static_cast(rx_data_dev_ + offset); + ptrdiff_t offset = slot_host - rx_data_host_; + void *slot_dev = static_cast(rx_data_dev_ + offset); - PreLaunchCopyCtx ctx; - ctx.d_trt_input = pd->get_trt_input_ptr(); - ctx.input_size = pd->get_input_size(); - ctx.h_ring_ptrs = pd->get_host_ring_ptrs(); + PreLaunchCopyCtx ctx; + ctx.d_trt_input = pd->get_trt_input_ptr(); + ctx.input_size = pd->get_input_size(); + ctx.h_ring_ptrs = pd->get_host_ring_ptrs(); - pre_launch_input_copy(&ctx, slot_dev, stream_); - CUDA_CHECK(cudaGraphLaunch(pd->get_executable_graph(), stream_)); - CUDA_CHECK(cudaStreamSynchronize(stream_)); + pre_launch_input_copy(&ctx, slot_dev, stream_); + CUDA_CHECK(cudaGraphLaunch(pd->get_executable_graph(), stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); - ASSERT_TRUE(wait_ready_flag(pd)); + ASSERT_TRUE(wait_ready_flag(pd)); - PreDecoderJob job{}; - ASSERT_TRUE(pd->poll_next_job(job)); - std::memcpy(output, job.inference_data, payload_bytes); - pd->release_job(0); - } + PreDecoderJob job{}; + ASSERT_TRUE(pd->poll_next_job(job)); + std::memcpy(output, job.inference_data, payload_bytes); + pd->release_job(0); + } }; TEST_F(CorrectnessTest, IdentityPassthrough_Zeros) { - auto pd = create_predecoder(0); - float input[kSkipTrtFloats] = {}; - float output[kSkipTrtFloats]; - std::memset(output, 0xFF, sizeof(output)); - - run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); - EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) - << "Zero payload should pass through unchanged"; + auto pd = create_predecoder(0); + float input[kSkipTrtFloats] = {}; + float output[kSkipTrtFloats]; + std::memset(output, 0xFF, sizeof(output)); + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Zero payload should pass through unchanged"; } TEST_F(CorrectnessTest, IdentityPassthrough_KnownPattern) { - auto pd = create_predecoder(0); - float input[kSkipTrtFloats]; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - input[i] = static_cast(i + 1); - float output[kSkipTrtFloats] = {}; - - run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); - EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) - << "Known pattern {1..16} should pass through unchanged"; + auto pd = create_predecoder(0); + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = static_cast(i + 1); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Known pattern {1..16} should pass through unchanged"; } TEST_F(CorrectnessTest, IdentityPassthrough_RandomData) { - auto pd = create_predecoder(0); - std::mt19937 rng(42); - std::uniform_real_distribution dist(-1e6f, 1e6f); - - float input[kSkipTrtFloats]; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - input[i] = dist(rng); - float output[kSkipTrtFloats] = {}; - - run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); - EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) - << "Random payload should pass through bitwise-identical"; + auto pd = create_predecoder(0); + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1e6f, 1e6f); + + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Random payload should pass through bitwise-identical"; } TEST_F(CorrectnessTest, IdentityPassthrough_MaxValues) { - auto pd = create_predecoder(0); - std::vector input(kSkipTrtFloats); - const float extremes[] = { - FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, - INFINITY, -INFINITY, NAN, 0.0f, - -0.0f, 1.0f, -1.0f, 1e-38f, - 1e38f, 3.14159265f, 2.71828183f, 0.5f - }; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - input[i] = extremes[i % (sizeof(extremes) / sizeof(extremes[0]))]; - std::vector output(kSkipTrtFloats, 0.0f); - - run_passthrough(pd.get(), 0, input.data(), kSkipTrtFloats, output.data()); - EXPECT_EQ(std::memcmp(input.data(), output.data(), kSkipTrtBytes), 0) - << "Extreme float values should pass through bitwise-identical"; + auto pd = create_predecoder(0); + std::vector input(kSkipTrtFloats); + const float extremes[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, + INFINITY, -INFINITY, NAN, 0.0f, + -0.0f, 1.0f, -1.0f, 1e-38f, + 1e38f, 3.14159265f, 2.71828183f, 0.5f}; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = extremes[i % (sizeof(extremes) / sizeof(extremes[0]))]; + std::vector output(kSkipTrtFloats, 0.0f); + + run_passthrough(pd.get(), 0, input.data(), kSkipTrtFloats, output.data()); + EXPECT_EQ(std::memcmp(input.data(), output.data(), kSkipTrtBytes), 0) + << "Extreme float values should pass through bitwise-identical"; } TEST_F(CorrectnessTest, IdentityPassthrough_MultipleRequests) { - auto pd = create_predecoder(0); - constexpr int kNumRequests = 5000; - std::mt19937 rng(123); - std::uniform_real_distribution dist(-1e6f, 1e6f); - int failures = 0; - - for (int r = 0; r < kNumRequests; ++r) { - float input[kSkipTrtFloats]; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - input[i] = dist(rng); - float output[kSkipTrtFloats] = {}; - - run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); - if (std::memcmp(input, output, kSkipTrtBytes) != 0) { - failures++; - if (failures <= 5) - ADD_FAILURE() << "Request " << r - << ": output does not match input"; - } + auto pd = create_predecoder(0); + constexpr int kNumRequests = 5000; + std::mt19937 rng(123); + std::uniform_real_distribution dist(-1e6f, 1e6f); + int failures = 0; + + for (int r = 0; r < kNumRequests; ++r) { + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = dist(rng); + float output[kSkipTrtFloats] = {}; + + run_passthrough(pd.get(), 0, input, kSkipTrtFloats, output); + if (std::memcmp(input, output, kSkipTrtBytes) != 0) { + failures++; + if (failures <= 5) + ADD_FAILURE() << "Request " << r << ": output does not match input"; } - EXPECT_EQ(failures, 0) << failures << " of " << kNumRequests - << " requests had mismatched output"; + } + EXPECT_EQ(failures, 0) << failures << " of " << kNumRequests + << " requests had mismatched output"; } // ============================================================================ @@ -391,248 +383,249 @@ TEST_F(CorrectnessTest, IdentityPassthrough_MultipleRequests) { class HostDispatcherTest : public RealtimePipelineTest { protected: - void SetUp() override { - RealtimePipelineTest::SetUp(); - idle_mask_ = new rt::atomic_uint64_sys(0); - live_dispatched_ = new rt::atomic_uint64_sys(0); - inflight_slot_tags_ = new int[kMaxWorkers](); - shutdown_flag_ = new rt::atomic_int_sys(0); - stats_counter_ = 0; - function_table_ = new cudaq_function_entry_t[kMaxWorkers]; - std::memset(function_table_, 0, - kMaxWorkers * sizeof(cudaq_function_entry_t)); - } - - void TearDown() override { - if (!loop_stopped_) { - shutdown_flag_->store(1, cuda::std::memory_order_release); - __sync_synchronize(); - if (loop_thread_.joinable()) - loop_thread_.join(); - } - for (auto& s : worker_streams_) { - if (s) - cudaStreamDestroy(s); - } - delete idle_mask_; - delete live_dispatched_; - delete[] inflight_slot_tags_; - delete shutdown_flag_; - delete[] function_table_; - RealtimePipelineTest::TearDown(); - } - - void add_worker(uint32_t function_id, cudaGraphExec_t exec, - PreLaunchCopyCtx* plc = nullptr) { - cudaStream_t s = nullptr; - ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); - worker_streams_.push_back(s); - - rt::HostDispatchWorker w; - w.graph_exec = exec; - w.stream = s; - w.function_id = function_id; - w.pre_launch_fn = plc ? pre_launch_input_copy : nullptr; - w.pre_launch_data = plc; - workers_.push_back(w); - - size_t idx = ft_count_; - function_table_[idx].handler.graph_exec = exec; - function_table_[idx].function_id = function_id; - function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - ft_count_++; - } - - void start_loop() { - idle_mask_->store((1ULL << workers_.size()) - 1, - cuda::std::memory_order_release); - - config_.rx_flags = reinterpret_cast( - rx_flags_host_); - config_.tx_flags = reinterpret_cast( - tx_flags_host_); - config_.rx_data_host = rx_data_host_; - config_.rx_data_dev = rx_data_dev_; - config_.tx_data_host = tx_data_host_; - config_.tx_data_dev = tx_data_dev_; - config_.tx_stride_sz = kSlotSize; - config_.h_mailbox_bank = mailbox_bank_host_; - config_.num_slots = kNumSlots; - config_.slot_size = kSlotSize; - config_.workers = workers_; - config_.function_table = function_table_; - config_.function_table_count = ft_count_; - config_.shutdown_flag = shutdown_flag_; - config_.stats_counter = &stats_counter_; - config_.live_dispatched = live_dispatched_; - config_.idle_mask = idle_mask_; - config_.inflight_slot_tags = inflight_slot_tags_; - - loop_thread_ = std::thread(rt::host_dispatcher_loop, config_); - } - - void stop_loop() { - shutdown_flag_->store(1, cuda::std::memory_order_release); - __sync_synchronize(); - if (loop_thread_.joinable()) - loop_thread_.join(); - loop_stopped_ = true; + void SetUp() override { + RealtimePipelineTest::SetUp(); + idle_mask_ = new rt::atomic_uint64_sys(0); + live_dispatched_ = new rt::atomic_uint64_sys(0); + inflight_slot_tags_ = new int[kMaxWorkers](); + shutdown_flag_ = new rt::atomic_int_sys(0); + stats_counter_ = 0; + function_table_ = new cudaq_function_entry_t[kMaxWorkers]; + std::memset(function_table_, 0, + kMaxWorkers * sizeof(cudaq_function_entry_t)); + } + + void TearDown() override { + if (!loop_stopped_) { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); } - - void restore_worker(int id) { - idle_mask_->fetch_or(1ULL << id, cuda::std::memory_order_release); + for (auto &s : worker_streams_) { + if (s) + cudaStreamDestroy(s); } - - bool poll_tx_flag(size_t slot, int timeout_ms = 2000) { - auto* flags = reinterpret_cast(tx_flags_host_); - auto deadline = std::chrono::steady_clock::now() + - std::chrono::milliseconds(timeout_ms); - while (std::chrono::steady_clock::now() < deadline) { - uint64_t val = flags[slot].load(cuda::std::memory_order_acquire); - if (val != 0) - return true; - usleep(100); - } - return false; - } - - void clear_tx_flag(size_t slot) { - auto* flags = reinterpret_cast(tx_flags_host_); - flags[slot].store(0, cuda::std::memory_order_release); + delete idle_mask_; + delete live_dispatched_; + delete[] inflight_slot_tags_; + delete shutdown_flag_; + delete[] function_table_; + RealtimePipelineTest::TearDown(); + } + + void add_worker(uint32_t function_id, cudaGraphExec_t exec, + PreLaunchCopyCtx *plc = nullptr) { + cudaStream_t s = nullptr; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + worker_streams_.push_back(s); + + rt::HostDispatchWorker w; + w.graph_exec = exec; + w.stream = s; + w.function_id = function_id; + w.pre_launch_fn = plc ? pre_launch_input_copy : nullptr; + w.pre_launch_data = plc; + workers_.push_back(w); + + size_t idx = ft_count_; + function_table_[idx].handler.graph_exec = exec; + function_table_[idx].function_id = function_id; + function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + ft_count_++; + } + + void start_loop() { + idle_mask_->store((1ULL << workers_.size()) - 1, + cuda::std::memory_order_release); + + config_.rx_flags = + reinterpret_cast(rx_flags_host_); + config_.tx_flags = + reinterpret_cast(tx_flags_host_); + config_.rx_data_host = rx_data_host_; + config_.rx_data_dev = rx_data_dev_; + config_.tx_data_host = tx_data_host_; + config_.tx_data_dev = tx_data_dev_; + config_.tx_stride_sz = kSlotSize; + config_.h_mailbox_bank = mailbox_bank_host_; + config_.num_slots = kNumSlots; + config_.slot_size = kSlotSize; + config_.workers = workers_; + config_.function_table = function_table_; + config_.function_table_count = ft_count_; + config_.shutdown_flag = shutdown_flag_; + config_.stats_counter = &stats_counter_; + config_.live_dispatched = live_dispatched_; + config_.idle_mask = idle_mask_; + config_.inflight_slot_tags = inflight_slot_tags_; + + loop_thread_ = std::thread(rt::host_dispatcher_loop, config_); + } + + void stop_loop() { + shutdown_flag_->store(1, cuda::std::memory_order_release); + __sync_synchronize(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; + } + + void restore_worker(int id) { + idle_mask_->fetch_or(1ULL << id, cuda::std::memory_order_release); + } + + bool poll_tx_flag(size_t slot, int timeout_ms = 2000) { + auto *flags = reinterpret_cast(tx_flags_host_); + auto deadline = std::chrono::steady_clock::now() + + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + uint64_t val = flags[slot].load(cuda::std::memory_order_acquire); + if (val != 0) + return true; + usleep(100); } - - rt::atomic_uint64_sys* idle_mask_ = nullptr; - rt::atomic_uint64_sys* live_dispatched_ = nullptr; - int* inflight_slot_tags_ = nullptr; - rt::atomic_int_sys* shutdown_flag_ = nullptr; - uint64_t stats_counter_ = 0; - bool loop_stopped_ = false; - - cudaq_function_entry_t* function_table_ = nullptr; - size_t ft_count_ = 0; - std::vector workers_; - std::vector worker_streams_; - rt::HostDispatcherConfig config_{}; - std::thread loop_thread_; + return false; + } + + void clear_tx_flag(size_t slot) { + auto *flags = reinterpret_cast(tx_flags_host_); + flags[slot].store(0, cuda::std::memory_order_release); + } + + rt::atomic_uint64_sys *idle_mask_ = nullptr; + rt::atomic_uint64_sys *live_dispatched_ = nullptr; + int *inflight_slot_tags_ = nullptr; + rt::atomic_int_sys *shutdown_flag_ = nullptr; + uint64_t stats_counter_ = 0; + bool loop_stopped_ = false; + + cudaq_function_entry_t *function_table_ = nullptr; + size_t ft_count_ = 0; + std::vector workers_; + std::vector worker_streams_; + rt::HostDispatcherConfig config_{}; + std::thread loop_thread_; }; TEST_F(HostDispatcherTest, ShutdownImmediate) { - auto pd = create_predecoder(0); - add_worker(kTestFunctionId, pd->get_executable_graph()); + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); - shutdown_flag_->store(1, cuda::std::memory_order_release); - start_loop(); - if (loop_thread_.joinable()) - loop_thread_.join(); - loop_stopped_ = true; + shutdown_flag_->store(1, cuda::std::memory_order_release); + start_loop(); + if (loop_thread_.joinable()) + loop_thread_.join(); + loop_stopped_ = true; - EXPECT_EQ(stats_counter_, 0u); + EXPECT_EQ(stats_counter_, 0u); } TEST_F(HostDispatcherTest, ShutdownClean) { - auto pd = create_predecoder(0); - add_worker(kTestFunctionId, pd->get_executable_graph()); - start_loop(); - usleep(10000); - stop_loop(); - EXPECT_EQ(stats_counter_, 0u); + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); + usleep(10000); + stop_loop(); + EXPECT_EQ(stats_counter_, 0u); } TEST_F(HostDispatcherTest, StatsCounter) { - auto pd = create_predecoder(0); - PreLaunchCopyCtx plc; - plc.d_trt_input = pd->get_trt_input_ptr(); - plc.input_size = pd->get_input_size(); - plc.h_ring_ptrs = pd->get_host_ring_ptrs(); - add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); - start_loop(); - - constexpr int kN = 5; - for (int i = 0; i < kN; ++i) { - size_t slot = static_cast(i % kNumSlots); - if (i > 0) - clear_tx_flag((i - 1) % kNumSlots); - - float payload[kSkipTrtFloats] = {}; - payload[0] = static_cast(i); - submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); - - ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i; - CUDA_CHECK(cudaDeviceSynchronize()); - - ASSERT_TRUE(wait_ready_flag(pd.get())); - PreDecoderJob job{}; - if (pd->poll_next_job(job)) - pd->release_job(0); - - restore_worker(0); - } + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kN = 5; + for (int i = 0; i < kN; ++i) { + size_t slot = static_cast(i % kNumSlots); + if (i > 0) + clear_tx_flag((i - 1) % kNumSlots); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } - stop_loop(); - EXPECT_EQ(stats_counter_, static_cast(kN)); + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kN)); } TEST_F(HostDispatcherTest, InvalidMagicDropped) { - auto pd = create_predecoder(0); - add_worker(kTestFunctionId, pd->get_executable_graph()); - start_loop(); + auto pd = create_predecoder(0); + add_worker(kTestFunctionId, pd->get_executable_graph()); + start_loop(); - uint8_t* slot_host = rx_data_host_; - rt::RPCHeader bad_hdr; - bad_hdr.magic = 0xDEADBEEF; - bad_hdr.function_id = kTestFunctionId; - bad_hdr.arg_len = 4; - std::memcpy(slot_host, &bad_hdr, sizeof(bad_hdr)); + uint8_t *slot_host = rx_data_host_; + rt::RPCHeader bad_hdr; + bad_hdr.magic = 0xDEADBEEF; + bad_hdr.function_id = kTestFunctionId; + bad_hdr.arg_len = 4; + std::memcpy(slot_host, &bad_hdr, sizeof(bad_hdr)); - auto* flags = reinterpret_cast(rx_flags_host_); - flags[0].store(reinterpret_cast(slot_host), - cuda::std::memory_order_release); + auto *flags = reinterpret_cast(rx_flags_host_); + flags[0].store(reinterpret_cast(slot_host), + cuda::std::memory_order_release); - usleep(50000); + usleep(50000); - uint64_t rx_val = flags[0].load(cuda::std::memory_order_acquire); - EXPECT_EQ(rx_val, 0u) << "Invalid magic should be consumed (rx_flag cleared)"; + uint64_t rx_val = flags[0].load(cuda::std::memory_order_acquire); + EXPECT_EQ(rx_val, 0u) << "Invalid magic should be consumed (rx_flag cleared)"; - stop_loop(); - EXPECT_EQ(stats_counter_, 0u) << "Invalid magic should not count as dispatched"; + stop_loop(); + EXPECT_EQ(stats_counter_, 0u) + << "Invalid magic should not count as dispatched"; } TEST_F(HostDispatcherTest, SlotWraparound) { - auto pd = create_predecoder(0); - PreLaunchCopyCtx plc; - plc.d_trt_input = pd->get_trt_input_ptr(); - plc.input_size = pd->get_input_size(); - plc.h_ring_ptrs = pd->get_host_ring_ptrs(); - add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); - start_loop(); - - constexpr int kTotal = static_cast(kNumSlots) + 2; - for (int i = 0; i < kTotal; ++i) { - size_t slot = static_cast(i % kNumSlots); - - auto* rx = reinterpret_cast(rx_flags_host_); - while (rx[slot].load(cuda::std::memory_order_acquire) != 0) - usleep(100); - clear_tx_flag(slot); - - float payload[kSkipTrtFloats] = {}; - payload[0] = static_cast(i); - submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); - - ASSERT_TRUE(poll_tx_flag(slot)) << "Timeout on request " << i - << " (slot " << slot << ")"; - CUDA_CHECK(cudaDeviceSynchronize()); - - ASSERT_TRUE(wait_ready_flag(pd.get())); - PreDecoderJob job{}; - if (pd->poll_next_job(job)) - pd->release_job(0); - - restore_worker(0); - } + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + constexpr int kTotal = static_cast(kNumSlots) + 2; + for (int i = 0; i < kTotal; ++i) { + size_t slot = static_cast(i % kNumSlots); + + auto *rx = reinterpret_cast(rx_flags_host_); + while (rx[slot].load(cuda::std::memory_order_acquire) != 0) + usleep(100); + clear_tx_flag(slot); + + float payload[kSkipTrtFloats] = {}; + payload[0] = static_cast(i); + submit_rpc_to_slot(slot, kTestFunctionId, payload, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(slot)) + << "Timeout on request " << i << " (slot " << slot << ")"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())); + PreDecoderJob job{}; + if (pd->poll_next_job(job)) + pd->release_job(0); + + restore_worker(0); + } - stop_loop(); - EXPECT_EQ(stats_counter_, static_cast(kTotal)); + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kTotal)); } // ============================================================================ @@ -640,146 +633,145 @@ TEST_F(HostDispatcherTest, SlotWraparound) { // ============================================================================ TEST_F(HostDispatcherTest, SingleRequestRoundTrip) { - auto pd = create_predecoder(0); - PreLaunchCopyCtx plc; - plc.d_trt_input = pd->get_trt_input_ptr(); - plc.input_size = pd->get_input_size(); - plc.h_ring_ptrs = pd->get_host_ring_ptrs(); - add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); - start_loop(); - - float input[kSkipTrtFloats]; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - input[i] = static_cast(i + 1); - submit_rpc_to_slot(0, kTestFunctionId, input, kSkipTrtBytes); - - ASSERT_TRUE(poll_tx_flag(0)) << "Timeout waiting for dispatcher to process"; - CUDA_CHECK(cudaDeviceSynchronize()); - - ASSERT_TRUE(wait_ready_flag(pd.get())) << "Predecoder ready flag not set"; + auto pd = create_predecoder(0); + PreLaunchCopyCtx plc; + plc.d_trt_input = pd->get_trt_input_ptr(); + plc.input_size = pd->get_input_size(); + plc.h_ring_ptrs = pd->get_host_ring_ptrs(); + add_worker(kTestFunctionId, pd->get_executable_graph(), &plc); + start_loop(); + + float input[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + input[i] = static_cast(i + 1); + submit_rpc_to_slot(0, kTestFunctionId, input, kSkipTrtBytes); + + ASSERT_TRUE(poll_tx_flag(0)) << "Timeout waiting for dispatcher to process"; + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(wait_ready_flag(pd.get())) << "Predecoder ready flag not set"; + + PreDecoderJob job{}; + ASSERT_TRUE(pd->poll_next_job(job)); + float output[kSkipTrtFloats]; + std::memcpy(output, job.inference_data, kSkipTrtBytes); + pd->release_job(0); + + EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) + << "Round-trip data should match (identity passthrough)"; + + stop_loop(); + EXPECT_EQ(stats_counter_, 1u); +} +TEST_F(HostDispatcherTest, MultiPredecoderConcurrency) { + constexpr int kNPd = 4; + std::vector> pds; + std::vector plcs(kNPd); + std::vector fids; + + for (int i = 0; i < kNPd; ++i) { + pds.push_back(create_predecoder(i)); + std::string name = "predecoder_" + std::to_string(i); + fids.push_back(rt::fnv1a_hash(name.c_str())); + plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); + plcs[i].input_size = pds[i]->get_input_size(); + plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); + add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); + } + start_loop(); + + float inputs[kNPd][kSkipTrtFloats]; + for (int i = 0; i < kNPd; ++i) + for (size_t j = 0; j < kSkipTrtFloats; ++j) + inputs[i][j] = static_cast(i * 100 + j); + + for (int i = 0; i < kNPd; ++i) + submit_rpc_to_slot(static_cast(i), fids[i], inputs[i], + kSkipTrtBytes); + + for (int i = 0; i < kNPd; ++i) + ASSERT_TRUE(poll_tx_flag(static_cast(i))) + << "Timeout on predecoder " << i; + CUDA_CHECK(cudaDeviceSynchronize()); + + for (int i = 0; i < kNPd; ++i) { + ASSERT_TRUE(wait_ready_flag(pds[i].get())) + << "Ready flag not set for predecoder " << i; PreDecoderJob job{}; - ASSERT_TRUE(pd->poll_next_job(job)); + ASSERT_TRUE(pds[i]->poll_next_job(job)); float output[kSkipTrtFloats]; std::memcpy(output, job.inference_data, kSkipTrtBytes); - pd->release_job(0); + pds[i]->release_job(0); - EXPECT_EQ(std::memcmp(input, output, kSkipTrtBytes), 0) - << "Round-trip data should match (identity passthrough)"; + EXPECT_EQ(std::memcmp(inputs[i], output, kSkipTrtBytes), 0) + << "Predecoder " << i << ": output should match input"; + } - stop_loop(); - EXPECT_EQ(stats_counter_, 1u); + stop_loop(); + EXPECT_EQ(stats_counter_, static_cast(kNPd)); } -TEST_F(HostDispatcherTest, MultiPredecoderConcurrency) { - constexpr int kNPd = 4; - std::vector> pds; - std::vector plcs(kNPd); - std::vector fids; - - for (int i = 0; i < kNPd; ++i) { - pds.push_back(create_predecoder(i)); - std::string name = "predecoder_" + std::to_string(i); - fids.push_back(rt::fnv1a_hash(name.c_str())); - plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); - plcs[i].input_size = pds[i]->get_input_size(); - plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); - add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); +TEST_F(HostDispatcherTest, SustainedThroughput_200Requests) { + constexpr int kNPd = 2; + constexpr int kTotalRequests = 200; + + std::vector> pds; + std::vector plcs(kNPd); + std::vector fids; + + for (int i = 0; i < kNPd; ++i) { + pds.push_back(create_predecoder(i)); + std::string name = "sustained_pd_" + std::to_string(i); + fids.push_back(rt::fnv1a_hash(name.c_str())); + plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); + plcs[i].input_size = pds[i]->get_input_size(); + plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); + add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); + } + start_loop(); + + std::mt19937 rng(999); + std::uniform_real_distribution dist(-10.0f, 10.0f); + int completed = 0; + + for (int r = 0; r < kTotalRequests; ++r) { + int pd_idx = r % kNPd; + size_t slot = static_cast(r % kNumSlots); + + auto *rx = reinterpret_cast(rx_flags_host_); + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); + while (rx[slot].load(cuda::std::memory_order_acquire) != 0) { + if (std::chrono::steady_clock::now() > deadline) + FAIL() << "Timeout waiting for slot " << slot << " to clear at request " + << r; + usleep(100); } - start_loop(); + clear_tx_flag(slot); - float inputs[kNPd][kSkipTrtFloats]; - for (int i = 0; i < kNPd; ++i) - for (size_t j = 0; j < kSkipTrtFloats; ++j) - inputs[i][j] = static_cast(i * 100 + j); + float payload[kSkipTrtFloats]; + for (size_t i = 0; i < kSkipTrtFloats; ++i) + payload[i] = dist(rng); - for (int i = 0; i < kNPd; ++i) - submit_rpc_to_slot(static_cast(i), fids[i], - inputs[i], kSkipTrtBytes); + submit_rpc_to_slot(slot, fids[pd_idx], payload, kSkipTrtBytes); - for (int i = 0; i < kNPd; ++i) - ASSERT_TRUE(poll_tx_flag(static_cast(i))) - << "Timeout on predecoder " << i; + ASSERT_TRUE(poll_tx_flag(slot)) + << "Timeout on request " << r << " (slot " << slot << ")"; CUDA_CHECK(cudaDeviceSynchronize()); - for (int i = 0; i < kNPd; ++i) { - ASSERT_TRUE(wait_ready_flag(pds[i].get())) - << "Ready flag not set for predecoder " << i; - PreDecoderJob job{}; - ASSERT_TRUE(pds[i]->poll_next_job(job)); - float output[kSkipTrtFloats]; - std::memcpy(output, job.inference_data, kSkipTrtBytes); - pds[i]->release_job(0); - - EXPECT_EQ(std::memcmp(inputs[i], output, kSkipTrtBytes), 0) - << "Predecoder " << i << ": output should match input"; - } - - stop_loop(); - EXPECT_EQ(stats_counter_, static_cast(kNPd)); -} + ASSERT_TRUE(wait_ready_flag(pds[pd_idx].get())) + << "Ready flag not set for request " << r; + PreDecoderJob job{}; + if (pds[pd_idx]->poll_next_job(job)) + pds[pd_idx]->release_job(0); -TEST_F(HostDispatcherTest, SustainedThroughput_200Requests) { - constexpr int kNPd = 2; - constexpr int kTotalRequests = 200; - - std::vector> pds; - std::vector plcs(kNPd); - std::vector fids; - - for (int i = 0; i < kNPd; ++i) { - pds.push_back(create_predecoder(i)); - std::string name = "sustained_pd_" + std::to_string(i); - fids.push_back(rt::fnv1a_hash(name.c_str())); - plcs[i].d_trt_input = pds[i]->get_trt_input_ptr(); - plcs[i].input_size = pds[i]->get_input_size(); - plcs[i].h_ring_ptrs = pds[i]->get_host_ring_ptrs(); - add_worker(fids[i], pds[i]->get_executable_graph(), &plcs[i]); - } - start_loop(); - - std::mt19937 rng(999); - std::uniform_real_distribution dist(-10.0f, 10.0f); - int completed = 0; - - for (int r = 0; r < kTotalRequests; ++r) { - int pd_idx = r % kNPd; - size_t slot = static_cast(r % kNumSlots); - - auto* rx = reinterpret_cast(rx_flags_host_); - auto deadline = std::chrono::steady_clock::now() + - std::chrono::seconds(5); - while (rx[slot].load(cuda::std::memory_order_acquire) != 0) { - if (std::chrono::steady_clock::now() > deadline) - FAIL() << "Timeout waiting for slot " << slot - << " to clear at request " << r; - usleep(100); - } - clear_tx_flag(slot); - - float payload[kSkipTrtFloats]; - for (size_t i = 0; i < kSkipTrtFloats; ++i) - payload[i] = dist(rng); - - submit_rpc_to_slot(slot, fids[pd_idx], payload, kSkipTrtBytes); - - ASSERT_TRUE(poll_tx_flag(slot)) - << "Timeout on request " << r << " (slot " << slot << ")"; - CUDA_CHECK(cudaDeviceSynchronize()); - - ASSERT_TRUE(wait_ready_flag(pds[pd_idx].get())) - << "Ready flag not set for request " << r; - PreDecoderJob job{}; - if (pds[pd_idx]->poll_next_job(job)) - pds[pd_idx]->release_job(0); - - restore_worker(pd_idx); - completed++; - } + restore_worker(pd_idx); + completed++; + } - stop_loop(); - EXPECT_EQ(completed, kTotalRequests); - EXPECT_EQ(stats_counter_, static_cast(kTotalRequests)); + stop_loop(); + EXPECT_EQ(completed, kTotalRequests); + EXPECT_EQ(stats_counter_, static_cast(kTotalRequests)); } } // namespace diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h index cf8eaacb..e484a69c 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h @@ -43,9 +43,9 @@ typedef enum { } cudaq_tx_status_t; // RPC wire-format constants (must match dispatch_kernel_launch.h). -#define CUDAQ_RPC_MAGIC_REQUEST 0x43555152u /* 'CUQR' */ +#define CUDAQ_RPC_MAGIC_REQUEST 0x43555152u /* 'CUQR' */ #define CUDAQ_RPC_MAGIC_RESPONSE 0x43555153u /* 'CUQS' */ -#define CUDAQ_RPC_HEADER_SIZE 12u /* 3 x uint32_t */ +#define CUDAQ_RPC_HEADER_SIZE 12u /* 3 x uint32_t */ // Kernel synchronization type typedef enum { @@ -102,8 +102,8 @@ typedef struct { uint32_t slot_size; // bytes per slot uint32_t vp_id; // virtual port ID cudaq_kernel_type_t kernel_type; // regular/cooperative kernel - cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch - cudaq_backend_t backend; // device kernel or host loop (default DEVICE_KERNEL) + cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch + cudaq_backend_t backend; // device kernel or host loop (default DEVICE_KERNEL) } cudaq_dispatcher_config_t; // GPU ring buffer pointers. For device backend use device pointers only. @@ -116,23 +116,25 @@ typedef struct { uint8_t *tx_data; // device pointer to TX data buffer size_t rx_stride_sz; // size of each RX slot in bytes size_t tx_stride_sz; // size of each TX slot in bytes - // Host-side view (required when backend == CUDAQ_BACKEND_HOST_LOOP; NULL otherwise) + // Host-side view (required when backend == CUDAQ_BACKEND_HOST_LOOP; NULL + // otherwise) volatile uint64_t *rx_flags_host; volatile uint64_t *tx_flags_host; uint8_t *rx_data_host; uint8_t *tx_data_host; } cudaq_ringbuffer_t; -// Host RPC callback: reads RPCHeader + args from slot, writes RPCResponse + result. -// slot_host is the host pointer to the slot (same layout as device slot). +// Host RPC callback: reads RPCHeader + args from slot, writes RPCResponse + +// result. slot_host is the host pointer to the slot (same layout as device +// slot). typedef void (*cudaq_host_rpc_fn_t)(void *slot_host, size_t slot_size); // Unified function table entry with schema typedef struct { union { - void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL - cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH - cudaq_host_rpc_fn_t host_fn; // for CUDAQ_DISPATCH_HOST_CALL + void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL + cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH + cudaq_host_rpc_fn_t host_fn; // for CUDAQ_DISPATCH_HOST_CALL } handler; uint32_t function_id; // hash of function name (FNV-1a) uint8_t dispatch_mode; // cudaq_dispatch_mode_t value @@ -275,16 +277,14 @@ cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, typedef struct cudaq_host_dispatcher_handle cudaq_host_dispatcher_handle_t; -// Start the host dispatcher loop in a new thread. Call from cudaq_dispatcher_start -// when backend is CUDAQ_BACKEND_HOST_LOOP. Returns a handle for stop, or NULL on error. -// If external_mailbox is non-NULL, uses it instead of allocating internally. +// Start the host dispatcher loop in a new thread. Call from +// cudaq_dispatcher_start when backend is CUDAQ_BACKEND_HOST_LOOP. Returns a +// handle for stop, or NULL on error. If external_mailbox is non-NULL, uses it +// instead of allocating internally. cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread( - const cudaq_ringbuffer_t *ringbuffer, - const cudaq_function_table_t *table, - const cudaq_dispatcher_config_t *config, - volatile int *shutdown_flag, - uint64_t *stats, - void **external_mailbox); + const cudaq_ringbuffer_t *ringbuffer, const cudaq_function_table_t *table, + const cudaq_dispatcher_config_t *config, volatile int *shutdown_flag, + uint64_t *stats, void **external_mailbox); // Stop the host dispatcher thread and free resources. void cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t *handle); @@ -315,8 +315,9 @@ void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, // Poll tx_flags_host[slot_idx] and classify the result. // If status == CUDAQ_TX_ERROR and out_cuda_error is non-NULL, the CUDA error // code is written there. -cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( - const cudaq_ringbuffer_t *rb, uint32_t slot_idx, int *out_cuda_error); +cudaq_tx_status_t +cudaq_host_ringbuffer_poll_tx_flag(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx, int *out_cuda_error); // Check whether a slot is available for reuse (both rx and tx flags are 0). int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh index 3b3be6dc..1ebef291 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh @@ -17,66 +17,46 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" -#include #include +#include namespace cudaq::realtime { //============================================================================== // Kernel Launch Function Declarations (with schema-driven function table) //============================================================================== -// These declarations match the extern "C" functions defined in dispatch_kernel.cu -// and cudaq_realtime.h +// These declarations match the extern "C" functions defined in +// dispatch_kernel.cu and cudaq_realtime.h /// @brief Inline wrapper for regular kernel (schema-aware). inline void launch_dispatch_kernel_regular_inline( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, rx_data, tx_data, - rx_stride_sz, tx_stride_sz, - function_table, func_count, - shutdown_flag, stats, num_slots, - num_blocks, threads_per_block, stream); + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); } /// @brief Inline wrapper for cooperative kernel (schema-aware). inline void launch_dispatch_kernel_cooperative_inline( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_cooperative( - rx_flags, tx_flags, rx_data, tx_data, - rx_stride_sz, tx_stride_sz, - function_table, func_count, - shutdown_flag, stats, num_slots, - num_blocks, threads_per_block, stream); + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); } } // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h index 67faf832..9b7c5ca6 100644 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h @@ -10,10 +10,10 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include -#include #include #include +#include +#include #include #ifndef QEC_CPU_RELAX @@ -23,7 +23,9 @@ #elif defined(__aarch64__) #define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") #else -#define QEC_CPU_RELAX() do { } while (0) +#define QEC_CPU_RELAX() \ + do { \ + } while (0) #endif #endif @@ -33,43 +35,50 @@ using atomic_uint64_sys = cuda::std::atomic; using atomic_int_sys = cuda::std::atomic; struct HostDispatchWorker { - cudaGraphExec_t graph_exec; - cudaStream_t stream; - uint32_t function_id; // matches table entry; used to assign slot to this worker - void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; - void* pre_launch_data = nullptr; - void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; - void* post_launch_data = nullptr; + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t + function_id; // matches table entry; used to assign slot to this worker + void (*pre_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *pre_launch_data = nullptr; + void (*post_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *post_launch_data = nullptr; }; struct HostDispatcherConfig { - atomic_uint64_sys* rx_flags; - atomic_uint64_sys* tx_flags; - uint8_t* rx_data_host; - uint8_t* rx_data_dev; - uint8_t* tx_data_host; - uint8_t* tx_data_dev; - size_t tx_stride_sz; - void** h_mailbox_bank; - size_t num_slots; - size_t slot_size; - std::vector workers; - /// Host-visible function table for lookup by function_id (GRAPH_LAUNCH only; others dropped). - cudaq_function_entry_t* function_table = nullptr; - size_t function_table_count = 0; - atomic_int_sys* shutdown_flag; - uint64_t* stats_counter; - /// Optional: atomic counter incremented on each dispatch (for progress diagnostics). - atomic_uint64_sys* live_dispatched = nullptr; + atomic_uint64_sys *rx_flags; + atomic_uint64_sys *tx_flags; + uint8_t *rx_data_host; + uint8_t *rx_data_dev; + uint8_t *tx_data_host; + uint8_t *tx_data_dev; + size_t tx_stride_sz; + void **h_mailbox_bank; + size_t num_slots; + size_t slot_size; + std::vector workers; + /// Host-visible function table for lookup by function_id (GRAPH_LAUNCH only; + /// others dropped). + cudaq_function_entry_t *function_table = nullptr; + size_t function_table_count = 0; + atomic_int_sys *shutdown_flag; + uint64_t *stats_counter; + /// Optional: atomic counter incremented on each dispatch (for progress + /// diagnostics). + atomic_uint64_sys *live_dispatched = nullptr; - /// Dynamic worker pool (graph workers only) - atomic_uint64_sys* idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id - int* inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags routing + /// Dynamic worker pool (graph workers only) + atomic_uint64_sys *idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id + int *inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags + ///< routing }; /// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag /// becomes non-zero. Call from a dedicated thread. -/// Uses dynamic worker pool: allocates via idle_mask, tags with inflight_slot_tags. -void host_dispatcher_loop(const HostDispatcherConfig& config); +/// Uses dynamic worker pool: allocates via idle_mask, tags with +/// inflight_slot_tags. +void host_dispatcher_loop(const HostDispatcherConfig &config); } // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/pipeline.h b/realtime/include/cudaq/realtime/pipeline.h index 2bdcacd2..310bae61 100644 --- a/realtime/include/cudaq/realtime/pipeline.h +++ b/realtime/include/cudaq/realtime/pipeline.h @@ -8,9 +8,9 @@ #pragma once -#include #include #include +#include #include #include #include @@ -22,16 +22,16 @@ namespace cudaq::realtime { // --------------------------------------------------------------------------- struct CorePinning { - int dispatcher = -1; // -1 = no pinning - int consumer = -1; - int worker_base = -1; // workers pin to base, base+1, ... + int dispatcher = -1; // -1 = no pinning + int consumer = -1; + int worker_base = -1; // workers pin to base, base+1, ... }; struct PipelineStageConfig { - int num_workers = 8; - int num_slots = 32; - size_t slot_size = 16384; - CorePinning cores; + int num_workers = 8; + int num_slots = 32; + size_t slot_size = 16384; + CorePinning cores; }; // --------------------------------------------------------------------------- @@ -39,17 +39,20 @@ struct PipelineStageConfig { // --------------------------------------------------------------------------- struct GpuWorkerResources { - cudaGraphExec_t graph_exec = nullptr; - cudaStream_t stream = nullptr; - void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; - void* pre_launch_data = nullptr; - void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; - void* post_launch_data = nullptr; - uint32_t function_id = 0; - void* user_context = nullptr; + cudaGraphExec_t graph_exec = nullptr; + cudaStream_t stream = nullptr; + void (*pre_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *pre_launch_data = nullptr; + void (*post_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream) = nullptr; + void *post_launch_data = nullptr; + uint32_t function_id = 0; + void *user_context = nullptr; }; -/// Called once per worker during start(). Returns GPU resources for that worker. +/// Called once per worker during start(). Returns GPU resources for that +/// worker. using GpuStageFactory = std::function; // --------------------------------------------------------------------------- @@ -60,31 +63,31 @@ using GpuStageFactory = std::function; /// The user reads gpu_output, does post-processing, and writes the /// result into response_buffer. No atomics are exposed. struct CpuStageContext { - int worker_id; - int origin_slot; - const void* gpu_output; - size_t gpu_output_size; - void* response_buffer; - size_t max_response_size; - void* user_context; + int worker_id; + int origin_slot; + const void *gpu_output; + size_t gpu_output_size; + void *response_buffer; + size_t max_response_size; + void *user_context; }; /// Returns the number of bytes written into response_buffer. -using CpuStageCallback = std::function; +using CpuStageCallback = std::function; // --------------------------------------------------------------------------- // Completion Callback // --------------------------------------------------------------------------- struct Completion { - uint64_t request_id; - int slot; - bool success; - int cuda_error; // 0 on success + uint64_t request_id; + int slot; + bool success; + int cuda_error; // 0 on success }; /// Called by the consumer thread for each completed (or errored) request. -using CompletionCallback = std::function; +using CompletionCallback = std::function; // --------------------------------------------------------------------------- // Ring Buffer Injector (software-only test/replay data source) @@ -95,29 +98,29 @@ using CompletionCallback = std::function; /// The parent RealtimePipeline must outlive the injector. class RingBufferInjector { public: - ~RingBufferInjector(); - RingBufferInjector(RingBufferInjector&&) noexcept; - RingBufferInjector& operator=(RingBufferInjector&&) noexcept; + ~RingBufferInjector(); + RingBufferInjector(RingBufferInjector &&) noexcept; + RingBufferInjector &operator=(RingBufferInjector &&) noexcept; - RingBufferInjector(const RingBufferInjector&) = delete; - RingBufferInjector& operator=(const RingBufferInjector&) = delete; + RingBufferInjector(const RingBufferInjector &) = delete; + RingBufferInjector &operator=(const RingBufferInjector &) = delete; - /// Try to submit a request. Returns true if accepted, false if - /// backpressure (all slots busy). Non-blocking. Thread-safe. - bool try_submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id); + /// Try to submit a request. Returns true if accepted, false if + /// backpressure (all slots busy). Non-blocking. Thread-safe. + bool try_submit(uint32_t function_id, const void *payload, + size_t payload_size, uint64_t request_id); - /// Blocking submit: spins until a slot becomes available. - void submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id); + /// Blocking submit: spins until a slot becomes available. + void submit(uint32_t function_id, const void *payload, size_t payload_size, + uint64_t request_id); - uint64_t backpressure_stalls() const; + uint64_t backpressure_stalls() const; private: - friend class RealtimePipeline; - struct State; - std::unique_ptr state_; - explicit RingBufferInjector(std::unique_ptr s); + friend class RealtimePipeline; + struct State; + std::unique_ptr state_; + explicit RingBufferInjector(std::unique_ptr s); }; // --------------------------------------------------------------------------- @@ -126,44 +129,44 @@ class RingBufferInjector { class RealtimePipeline { public: - explicit RealtimePipeline(const PipelineStageConfig& config); - ~RealtimePipeline(); + explicit RealtimePipeline(const PipelineStageConfig &config); + ~RealtimePipeline(); - RealtimePipeline(const RealtimePipeline&) = delete; - RealtimePipeline& operator=(const RealtimePipeline&) = delete; + RealtimePipeline(const RealtimePipeline &) = delete; + RealtimePipeline &operator=(const RealtimePipeline &) = delete; - /// Register the GPU stage factory (called before start). - void set_gpu_stage(GpuStageFactory factory); + /// Register the GPU stage factory (called before start). + void set_gpu_stage(GpuStageFactory factory); - /// Register the CPU worker callback (called before start). - void set_cpu_stage(CpuStageCallback callback); + /// Register the CPU worker callback (called before start). + void set_cpu_stage(CpuStageCallback callback); - /// Register the completion callback (called before start). - void set_completion_handler(CompletionCallback handler); + /// Register the completion callback (called before start). + void set_completion_handler(CompletionCallback handler); - /// Allocate resources, build dispatcher config, spawn all threads. - void start(); + /// Allocate resources, build dispatcher config, spawn all threads. + void start(); - /// Signal shutdown, join all threads, free resources. - void stop(); + /// Signal shutdown, join all threads, free resources. + void stop(); - /// Create a software injector for testing without FPGA hardware. - /// The pipeline must be constructed but need not be started yet. - RingBufferInjector create_injector(); + /// Create a software injector for testing without FPGA hardware. + /// The pipeline must be constructed but need not be started yet. + RingBufferInjector create_injector(); - struct Stats { - uint64_t submitted; - uint64_t completed; - uint64_t dispatched; - uint64_t backpressure_stalls; - }; + struct Stats { + uint64_t submitted; + uint64_t completed; + uint64_t dispatched; + uint64_t backpressure_stalls; + }; - /// Thread-safe, lock-free stats snapshot. - Stats stats() const; + /// Thread-safe, lock-free stats snapshot. + Stats stats() const; private: - struct Impl; - std::unique_ptr impl_; + struct Impl; + std::unique_ptr impl_; }; } // namespace cudaq::realtime diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp index b7054235..3b8ba1d8 100644 --- a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp +++ b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp @@ -64,8 +64,10 @@ static cudaq_status_t validate_dispatcher(cudaq_dispatcher_t *dispatcher) { return CUDAQ_ERR_INVALID_ARG; if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { - if (!dispatcher->ringbuffer.rx_flags_host || !dispatcher->ringbuffer.tx_flags_host || - !dispatcher->ringbuffer.rx_data_host || !dispatcher->ringbuffer.tx_data_host) + if (!dispatcher->ringbuffer.rx_flags_host || + !dispatcher->ringbuffer.tx_flags_host || + !dispatcher->ringbuffer.rx_data_host || + !dispatcher->ringbuffer.tx_data_host) return CUDAQ_ERR_INVALID_ARG; return CUDAQ_OK; } @@ -156,7 +158,8 @@ cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, cudaq_dispatch_launch_fn_t launch_fn) { if (!dispatcher) return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && launch_fn != nullptr) + if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && + launch_fn != nullptr) return CUDAQ_ERR_INVALID_ARG; if (dispatcher->config.backend != CUDAQ_BACKEND_HOST_LOOP && !launch_fn) return CUDAQ_ERR_INVALID_ARG; @@ -291,19 +294,20 @@ cudaq_status_t cudaq_host_ringbuffer_write_rpc_request( void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, uint32_t slot_idx) { __sync_synchronize(); - const_cast( - rb->rx_flags_host)[slot_idx] = reinterpret_cast( - rb->rx_data_host + slot_idx * rb->rx_stride_sz); + const_cast(rb->rx_flags_host)[slot_idx] = + reinterpret_cast(rb->rx_data_host + + slot_idx * rb->rx_stride_sz); } static inline uint64_t load_acquire(volatile uint64_t *addr) { - auto *a = reinterpret_cast *>( - const_cast(addr)); + auto *a = + reinterpret_cast *>(const_cast(addr)); return a->load(std::memory_order_acquire); } -cudaq_tx_status_t cudaq_host_ringbuffer_poll_tx_flag( - const cudaq_ringbuffer_t *rb, uint32_t slot_idx, int *out_cuda_error) { +cudaq_tx_status_t +cudaq_host_ringbuffer_poll_tx_flag(const cudaq_ringbuffer_t *rb, + uint32_t slot_idx, int *out_cuda_error) { uint64_t v = load_acquire(&rb->tx_flags_host[slot_idx]); if (v == 0) return CUDAQ_TX_EMPTY; diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu index dceac063..0500929f 100644 --- a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu +++ b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu @@ -7,14 +7,14 @@ ******************************************************************************/ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" #include "cudaq/realtime/daemon/dispatcher/kernel_types.h" -#include -#include #include +#include +#include namespace cudaq::realtime { @@ -23,10 +23,10 @@ namespace cudaq::realtime { //============================================================================== /// @brief Lookup function entry in table by function_id. -__device__ inline const cudaq_function_entry_t* dispatch_lookup_entry( - std::uint32_t function_id, - cudaq_function_entry_t* entries, - std::size_t entry_count) { +__device__ inline const cudaq_function_entry_t * +dispatch_lookup_entry(std::uint32_t function_id, + cudaq_function_entry_t *entries, + std::size_t entry_count) { for (std::size_t i = 0; i < entry_count; ++i) { if (entries[i].function_id == function_id) { return &entries[i]; @@ -51,15 +51,10 @@ __device__ inline const cudaq_function_entry_t* dispatch_lookup_entry( /// then all threads call the handler after a grid.sync(). template __global__ void dispatch_kernel_device_call_only( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* tx_data, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *tx_data, std::size_t tx_stride_sz, + cudaq_function_entry_t *function_table, std::size_t func_count, + volatile int *shutdown_flag, std::uint64_t *stats, std::size_t num_slots) { int tid = threadIdx.x + blockIdx.x * blockDim.x; std::uint64_t local_packet_count = 0; std::size_t current_slot = 0; @@ -73,21 +68,21 @@ __global__ void dispatch_kernel_device_call_only( // read the device-memory copies after the grid barrier. //========================================================================== __shared__ DeviceRPCFunction s_func; - __shared__ void* s_arg_buffer; - __shared__ std::uint8_t* s_output_buffer; - __shared__ std::uint32_t s_arg_len; - __shared__ std::uint32_t s_max_result_len; - __shared__ bool s_have_work; + __shared__ void *s_arg_buffer; + __shared__ std::uint8_t *s_output_buffer; + __shared__ std::uint32_t s_arg_len; + __shared__ std::uint32_t s_max_result_len; + __shared__ bool s_have_work; // Device-memory work descriptor visible to all blocks after grid.sync. // We use a single set since the cooperative kernel processes one RPC at // a time (all threads participate, so no pipelining). __device__ static DeviceRPCFunction d_func; - __device__ static void* d_arg_buffer; - __device__ static std::uint8_t* d_output_buffer; - __device__ static std::uint32_t d_arg_len; - __device__ static std::uint32_t d_max_result_len; - __device__ static bool d_have_work; + __device__ static void *d_arg_buffer; + __device__ static std::uint8_t *d_output_buffer; + __device__ static std::uint32_t d_arg_len; + __device__ static std::uint32_t d_max_result_len; + __device__ static bool d_have_work; while (!(*shutdown_flag)) { // --- Phase 1: Thread 0 polls and parses --- @@ -95,30 +90,30 @@ __global__ void dispatch_kernel_device_call_only( s_have_work = false; std::uint64_t rx_value = rx_flags[current_slot]; if (rx_value != 0) { - void* rx_slot = reinterpret_cast(rx_value); - RPCHeader* header = static_cast(rx_slot); + void *rx_slot = reinterpret_cast(rx_value); + RPCHeader *header = static_cast(rx_slot); if (header->magic == RPC_MAGIC_REQUEST) { - const cudaq_function_entry_t* entry = dispatch_lookup_entry( + const cudaq_function_entry_t *entry = dispatch_lookup_entry( header->function_id, function_table, func_count); if (entry != nullptr && entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - s_func = reinterpret_cast( + s_func = reinterpret_cast( entry->handler.device_fn_ptr); - s_arg_buffer = static_cast(header + 1); + s_arg_buffer = static_cast(header + 1); s_output_buffer = tx_slot + sizeof(RPCResponse); - s_arg_len = header->arg_len; + s_arg_len = header->arg_len; s_max_result_len = tx_stride_sz - sizeof(RPCResponse); - s_have_work = true; + s_have_work = true; // Publish to device memory for other blocks - d_func = s_func; - d_arg_buffer = s_arg_buffer; - d_output_buffer = s_output_buffer; - d_arg_len = s_arg_len; + d_func = s_func; + d_arg_buffer = s_arg_buffer; + d_output_buffer = s_output_buffer; + d_arg_len = s_arg_len; d_max_result_len = s_max_result_len; - d_have_work = true; + d_have_work = true; } } if (!s_have_work) { @@ -135,23 +130,23 @@ __global__ void dispatch_kernel_device_call_only( // Non-block-0 threads read from device memory bool have_work; DeviceRPCFunction func; - void* arg_buffer; - std::uint8_t* output_buffer; + void *arg_buffer; + std::uint8_t *output_buffer; std::uint32_t arg_len; std::uint32_t max_result_len; if (blockIdx.x == 0) { - have_work = s_have_work; - func = s_func; - arg_buffer = s_arg_buffer; - output_buffer = s_output_buffer; - arg_len = s_arg_len; + have_work = s_have_work; + func = s_func; + arg_buffer = s_arg_buffer; + output_buffer = s_output_buffer; + arg_len = s_arg_len; max_result_len = s_max_result_len; } else { - have_work = d_have_work; - func = d_func; - arg_buffer = d_arg_buffer; - output_buffer = d_output_buffer; - arg_len = d_arg_len; + have_work = d_have_work; + func = d_func; + arg_buffer = d_arg_buffer; + output_buffer = d_output_buffer; + arg_len = d_arg_len; max_result_len = d_max_result_len; } @@ -159,16 +154,16 @@ __global__ void dispatch_kernel_device_call_only( std::uint32_t result_len = 0; int status = 0; if (have_work) { - status = func(arg_buffer, output_buffer, arg_len, - max_result_len, &result_len); + status = func(arg_buffer, output_buffer, arg_len, max_result_len, + &result_len); } // --- Phase 4: Sync, then thread 0 writes response --- KernelType::sync(); if (tid == 0 && have_work) { - std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; - RPCResponse* response = reinterpret_cast(tx_slot); + std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; + RPCResponse *response = reinterpret_cast(tx_slot); response->magic = RPC_MAGIC_RESPONSE; response->status = status; response->result_len = result_len; @@ -203,8 +198,8 @@ __global__ void dispatch_kernel_device_call_only( if (rx_value != 0) { // RX data address comes from rx_flags (set by Hololink RX kernel // or host test harness to the address of the RX data slot) - void* rx_slot = reinterpret_cast(rx_value); - RPCHeader* header = static_cast(rx_slot); + void *rx_slot = reinterpret_cast(rx_value); + RPCHeader *header = static_cast(rx_slot); if (header->magic != RPC_MAGIC_REQUEST) { __threadfence_system(); rx_flags[current_slot] = 0; @@ -213,33 +208,36 @@ __global__ void dispatch_kernel_device_call_only( std::uint32_t function_id = header->function_id; std::uint32_t arg_len = header->arg_len; - void* arg_buffer = static_cast(header + 1); + void *arg_buffer = static_cast(header + 1); - const cudaq_function_entry_t* entry = dispatch_lookup_entry( - function_id, function_table, func_count); + const cudaq_function_entry_t *entry = + dispatch_lookup_entry(function_id, function_table, func_count); - if (entry != nullptr && entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = - reinterpret_cast(entry->handler.device_fn_ptr); + if (entry != nullptr && + entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { + DeviceRPCFunction func = reinterpret_cast( + entry->handler.device_fn_ptr); // Compute TX slot address from symmetric TX data buffer - std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - // Handler writes results directly to TX slot (after response header) - std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse); + // Handler writes results directly to TX slot (after response + // header) + std::uint8_t *output_buffer = tx_slot + sizeof(RPCResponse); std::uint32_t result_len = 0; std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); int status = func(arg_buffer, output_buffer, arg_len, max_result_len, &result_len); // Write RPC response header to TX slot - RPCResponse* response = reinterpret_cast(tx_slot); + RPCResponse *response = reinterpret_cast(tx_slot); response->magic = RPC_MAGIC_RESPONSE; response->status = status; response->result_len = result_len; __threadfence_system(); - // Signal TX with the TX slot address (symmetric with Hololink TX kernel) + // Signal TX with the TX slot address (symmetric with Hololink TX + // kernel) tx_flags[current_slot] = reinterpret_cast(tx_slot); } @@ -259,27 +257,24 @@ __global__ void dispatch_kernel_device_call_only( } if (tid == 0) { - atomicAdd(reinterpret_cast(stats), local_packet_count); + atomicAdd(reinterpret_cast(stats), + local_packet_count); } } /// @brief Dispatch kernel supporting both DEVICE_CALL and GRAPH_LAUNCH modes. -/// This kernel includes device-side graph launch code and requires compute capability >= 9.0. -/// NOTE: Graph launch code is conditionally compiled based on __CUDA_ARCH__. +/// This kernel includes device-side graph launch code and requires compute +/// capability >= 9.0. NOTE: Graph launch code is conditionally compiled based +/// on __CUDA_ARCH__. /// /// Supports symmetric RX/TX data buffers for Hololink compatibility. template __global__ void dispatch_kernel_with_graph( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* tx_data, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - GraphIOContext* graph_io_ctx, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *tx_data, std::size_t tx_stride_sz, + cudaq_function_entry_t *function_table, std::size_t func_count, + GraphIOContext *graph_io_ctx, volatile int *shutdown_flag, + std::uint64_t *stats, std::size_t num_slots) { int tid = threadIdx.x + blockIdx.x * blockDim.x; std::uint64_t local_packet_count = 0; std::size_t current_slot = 0; @@ -288,8 +283,8 @@ __global__ void dispatch_kernel_with_graph( if (tid == 0) { std::uint64_t rx_value = rx_flags[current_slot]; if (rx_value != 0) { - void* rx_slot = reinterpret_cast(rx_value); - RPCHeader* header = static_cast(rx_slot); + void *rx_slot = reinterpret_cast(rx_value); + RPCHeader *header = static_cast(rx_slot); if (header->magic != RPC_MAGIC_REQUEST) { __threadfence_system(); rx_flags[current_slot] = 0; @@ -298,28 +293,29 @@ __global__ void dispatch_kernel_with_graph( std::uint32_t function_id = header->function_id; std::uint32_t arg_len = header->arg_len; - void* arg_buffer = static_cast(header + 1); + void *arg_buffer = static_cast(header + 1); + + const cudaq_function_entry_t *entry = + dispatch_lookup_entry(function_id, function_table, func_count); - const cudaq_function_entry_t* entry = dispatch_lookup_entry( - function_id, function_table, func_count); - // Compute TX slot address from symmetric TX data buffer - std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz; + std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; if (entry != nullptr) { if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = - reinterpret_cast(entry->handler.device_fn_ptr); + DeviceRPCFunction func = reinterpret_cast( + entry->handler.device_fn_ptr); - // Handler writes results directly to TX slot (after response header) - std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse); + // Handler writes results directly to TX slot (after response + // header) + std::uint8_t *output_buffer = tx_slot + sizeof(RPCResponse); std::uint32_t result_len = 0; std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); int status = func(arg_buffer, output_buffer, arg_len, max_result_len, &result_len); // Write RPC response to TX slot - RPCResponse* response = reinterpret_cast(tx_slot); + RPCResponse *response = reinterpret_cast(tx_slot); response->magic = RPC_MAGIC_RESPONSE; response->status = status; response->result_len = result_len; @@ -366,7 +362,8 @@ __global__ void dispatch_kernel_with_graph( } if (tid == 0) { - atomicAdd(reinterpret_cast(stats), local_packet_count); + atomicAdd(reinterpret_cast(stats), + local_packet_count); } } @@ -378,90 +375,80 @@ __global__ void dispatch_kernel_with_graph( // Force eager CUDA module loading for the dispatch kernel. // Call before launching persistent kernels to avoid lazy-loading deadlocks. -extern "C" cudaError_t cudaq_dispatch_kernel_query_occupancy( - int* out_blocks, uint32_t threads_per_block) { +extern "C" cudaError_t +cudaq_dispatch_kernel_query_occupancy(int *out_blocks, + uint32_t threads_per_block) { int num_blocks = 0; cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, - cudaq::realtime::dispatch_kernel_device_call_only, + cudaq::realtime::dispatch_kernel_device_call_only< + cudaq::realtime::RegularKernel>, threads_per_block, 0); - if (err != cudaSuccess) return err; - if (out_blocks) *out_blocks = num_blocks; + if (err != cudaSuccess) + return err; + if (out_blocks) + *out_blocks = num_blocks; return cudaSuccess; } -extern "C" cudaError_t cudaq_dispatch_kernel_cooperative_query_occupancy( - int* out_blocks, uint32_t threads_per_block) { +extern "C" cudaError_t +cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks, + uint32_t threads_per_block) { int num_blocks = 0; cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, cudaq::realtime::dispatch_kernel_device_call_only< cudaq::realtime::CooperativeKernel>, threads_per_block, 0); - if (err != cudaSuccess) return err; - if (out_blocks) *out_blocks = num_blocks; + if (err != cudaSuccess) + return err; + if (out_blocks) + *out_blocks = num_blocks; return cudaSuccess; } extern "C" void cudaq_launch_dispatch_kernel_regular( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { // Use device-call-only kernel (no graph launch support) // Note: rx_data/rx_stride_sz are available in the ringbuffer struct but // not passed to the kernel since it reads RX addresses from rx_flags. (void)rx_data; (void)rx_stride_sz; - cudaq::realtime::dispatch_kernel_device_call_only + cudaq::realtime::dispatch_kernel_device_call_only< + cudaq::realtime::RegularKernel> <<>>( - rx_flags, tx_flags, tx_data, tx_stride_sz, - function_table, func_count, + rx_flags, tx_flags, tx_data, tx_stride_sz, function_table, func_count, shutdown_flag, stats, num_slots); } extern "C" void cudaq_launch_dispatch_kernel_cooperative( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { (void)rx_data; (void)rx_stride_sz; - void* kernel_args[] = { - const_cast(&rx_flags), - const_cast(&tx_flags), - &tx_data, - &tx_stride_sz, - &function_table, - &func_count, - const_cast(&shutdown_flag), - &stats, - &num_slots - }; + void *kernel_args[] = {const_cast(&rx_flags), + const_cast(&tx_flags), + &tx_data, + &tx_stride_sz, + &function_table, + &func_count, + const_cast(&shutdown_flag), + &stats, + &num_slots}; cudaLaunchCooperativeKernel( - reinterpret_cast( - cudaq::realtime::dispatch_kernel_device_call_only), + reinterpret_cast( + cudaq::realtime::dispatch_kernel_device_call_only< + cudaq::realtime::CooperativeKernel>), dim3(num_blocks), dim3(threads_per_block), kernel_args, 0, stream); } @@ -471,8 +458,9 @@ extern "C" void cudaq_launch_dispatch_kernel_cooperative( // // To use device-side cudaGraphLaunch(), the dispatch kernel itself must be // running inside a graph execution context. These functions create a graph -// containing the dispatch kernel, instantiate it with cudaGraphInstantiateFlagDeviceLaunch, -// and provide proper launch/cleanup functions. +// containing the dispatch kernel, instantiate it with +// cudaGraphInstantiateFlagDeviceLaunch, and provide proper launch/cleanup +// functions. // Internal storage for graph-based dispatch context // Parameters must be stored persistently since the graph may execute after @@ -482,46 +470,37 @@ struct cudaq_dispatch_graph_context { cudaGraphExec_t graph_exec; cudaGraphNode_t kernel_node; bool is_valid; - + // Persistent storage for kernel parameters (must outlive graph execution) - volatile std::uint64_t* rx_flags; - volatile std::uint64_t* tx_flags; - std::uint8_t* tx_data; + volatile std::uint64_t *rx_flags; + volatile std::uint64_t *tx_flags; + std::uint8_t *tx_data; std::size_t tx_stride_sz; - cudaq_function_entry_t* function_table; + cudaq_function_entry_t *function_table; std::size_t func_count; - cudaq::realtime::GraphIOContext* graph_io_ctx; - volatile int* shutdown_flag; - std::uint64_t* stats; + cudaq::realtime::GraphIOContext *graph_io_ctx; + volatile int *shutdown_flag; + std::uint64_t *stats; std::size_t num_slots; }; extern "C" cudaError_t cudaq_create_dispatch_graph_regular( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - void* graph_io_ctx_raw, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream, - cudaq_dispatch_graph_context** out_context) { - + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, void *graph_io_ctx_raw, volatile int *shutdown_flag, + std::uint64_t *stats, std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream, + cudaq_dispatch_graph_context **out_context) { + (void)rx_data; (void)rx_stride_sz; cudaError_t err; - + // Allocate context with persistent parameter storage - cudaq_dispatch_graph_context* ctx = new cudaq_dispatch_graph_context(); + cudaq_dispatch_graph_context *ctx = new cudaq_dispatch_graph_context(); ctx->is_valid = false; - + // Store parameters persistently in the context ctx->rx_flags = rx_flags; ctx->tx_flags = tx_flags; @@ -530,58 +509,53 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( ctx->function_table = function_table; ctx->func_count = func_count; ctx->graph_io_ctx = - static_cast(graph_io_ctx_raw); + static_cast(graph_io_ctx_raw); ctx->shutdown_flag = shutdown_flag; ctx->stats = stats; ctx->num_slots = num_slots; - + // Create graph err = cudaGraphCreate(&ctx->graph, 0); if (err != cudaSuccess) { delete ctx; return err; } - + // Set up kernel parameters - point to persistent storage in context cudaKernelNodeParams kernel_params = {}; - void* kernel_args[] = { - &ctx->rx_flags, - &ctx->tx_flags, - &ctx->tx_data, - &ctx->tx_stride_sz, - &ctx->function_table, - &ctx->func_count, - &ctx->graph_io_ctx, - &ctx->shutdown_flag, - &ctx->stats, - &ctx->num_slots - }; - - kernel_params.func = reinterpret_cast( - cudaq::realtime::dispatch_kernel_with_graph); + void *kernel_args[] = {&ctx->rx_flags, &ctx->tx_flags, + &ctx->tx_data, &ctx->tx_stride_sz, + &ctx->function_table, &ctx->func_count, + &ctx->graph_io_ctx, &ctx->shutdown_flag, + &ctx->stats, &ctx->num_slots}; + + kernel_params.func = + reinterpret_cast(cudaq::realtime::dispatch_kernel_with_graph< + cudaq::realtime::RegularKernel>); kernel_params.gridDim = dim3(num_blocks, 1, 1); kernel_params.blockDim = dim3(threads_per_block, 1, 1); kernel_params.sharedMemBytes = 0; kernel_params.kernelParams = kernel_args; kernel_params.extra = nullptr; - + // Add kernel node to graph - err = cudaGraphAddKernelNode(&ctx->kernel_node, ctx->graph, nullptr, 0, &kernel_params); + err = cudaGraphAddKernelNode(&ctx->kernel_node, ctx->graph, nullptr, 0, + &kernel_params); if (err != cudaSuccess) { cudaGraphDestroy(ctx->graph); delete ctx; return err; } - + // Instantiate with device launch flag - THIS IS THE KEY! - err = cudaGraphInstantiate(&ctx->graph_exec, ctx->graph, - cudaGraphInstantiateFlagDeviceLaunch); + err = cudaGraphInstantiate(&ctx->graph_exec, ctx->graph, + cudaGraphInstantiateFlagDeviceLaunch); if (err != cudaSuccess) { cudaGraphDestroy(ctx->graph); delete ctx; return err; } - + // Upload graph to device (required before device-side launch) err = cudaGraphUpload(ctx->graph_exec, stream); if (err != cudaSuccess) { @@ -590,7 +564,7 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( delete ctx; return err; } - + // Synchronize to ensure upload completes err = cudaStreamSynchronize(stream); if (err != cudaSuccess) { @@ -599,38 +573,40 @@ extern "C" cudaError_t cudaq_create_dispatch_graph_regular( delete ctx; return err; } - + ctx->is_valid = true; *out_context = ctx; return cudaSuccess; } -extern "C" cudaError_t cudaq_launch_dispatch_graph( - cudaq_dispatch_graph_context* context, - cudaStream_t stream) { +extern "C" cudaError_t +cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, + cudaStream_t stream) { if (context == nullptr || !context->is_valid) { return cudaErrorInvalidValue; } - + // Launch the graph - now device-side cudaGraphLaunch will work! return cudaGraphLaunch(context->graph_exec, stream); } -extern "C" cudaError_t cudaq_destroy_dispatch_graph( - cudaq_dispatch_graph_context* context) { +extern "C" cudaError_t +cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context) { if (context == nullptr) { return cudaErrorInvalidValue; } - + cudaError_t err = cudaSuccess; - + if (context->is_valid) { cudaError_t err1 = cudaGraphExecDestroy(context->graph_exec); cudaError_t err2 = cudaGraphDestroy(context->graph); - if (err1 != cudaSuccess) err = err1; - else if (err2 != cudaSuccess) err = err2; + if (err1 != cudaSuccess) + err = err1; + else if (err2 != cudaSuccess) + err = err2; } - + delete context; return err; } diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu index 2f0b055f..0b96e673 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher.cu @@ -6,8 +6,8 @@ * the terms of the Apache License 2.0 which accompanies this distribution. ******************************************************************************/ -#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" namespace cudaq::realtime { @@ -15,9 +15,9 @@ namespace cudaq::realtime { // Helpers: function table lookup //----------------------------------------------------------------------------- -static const cudaq_function_entry_t* lookup_function(cudaq_function_entry_t* table, - size_t count, - uint32_t function_id) { +static const cudaq_function_entry_t * +lookup_function(cudaq_function_entry_t *table, size_t count, + uint32_t function_id) { for (size_t i = 0; i < count; ++i) { if (table[i].function_id == function_id) return &table[i]; @@ -25,12 +25,14 @@ static const cudaq_function_entry_t* lookup_function(cudaq_function_entry_t* tab return nullptr; } -static int find_idle_graph_worker_for_function(const HostDispatcherConfig& config, - uint32_t function_id) { +static int +find_idle_graph_worker_for_function(const HostDispatcherConfig &config, + uint32_t function_id) { uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); while (mask != 0) { int worker_id = __builtin_ffsll(static_cast(mask)) - 1; - if (config.workers[static_cast(worker_id)].function_id == function_id) + if (config.workers[static_cast(worker_id)].function_id == + function_id) return worker_id; mask &= ~(1ULL << worker_id); } @@ -40,31 +42,32 @@ static int find_idle_graph_worker_for_function(const HostDispatcherConfig& confi /// Result of parsing the slot when a function table is in use. struct ParsedSlot { uint32_t function_id = 0; - const cudaq_function_entry_t* entry = nullptr; - bool drop = false; // true => invalid magic or unknown function_id; clear slot and advance + const cudaq_function_entry_t *entry = nullptr; + bool drop = false; // true => invalid magic or unknown function_id; clear slot + // and advance }; -static ParsedSlot parse_slot_with_function_table(void* slot_host, - const HostDispatcherConfig& config) { +static ParsedSlot +parse_slot_with_function_table(void *slot_host, + const HostDispatcherConfig &config) { ParsedSlot out; - const RPCHeader* header = static_cast(slot_host); + const RPCHeader *header = static_cast(slot_host); if (header->magic != RPC_MAGIC_REQUEST) { out.drop = true; return out; } out.function_id = header->function_id; - out.entry = lookup_function(config.function_table, config.function_table_count, - out.function_id); + out.entry = lookup_function(config.function_table, + config.function_table_count, out.function_id); if (!out.entry) out.drop = true; return out; } /// Clear rx_flag for this slot, increment stats, advance slot index. -static void finish_slot_and_advance(const HostDispatcherConfig& config, - size_t& current_slot, - size_t num_slots, - uint64_t& packets_dispatched) { +static void finish_slot_and_advance(const HostDispatcherConfig &config, + size_t ¤t_slot, size_t num_slots, + uint64_t &packets_dispatched) { config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); packets_dispatched++; if (config.live_dispatched) @@ -72,12 +75,14 @@ static void finish_slot_and_advance(const HostDispatcherConfig& config, current_slot = (current_slot + 1) % num_slots; } -/// Acquire a graph worker (by function_id if table in use, else any idle worker). -static int acquire_graph_worker(const HostDispatcherConfig& config, +/// Acquire a graph worker (by function_id if table in use, else any idle +/// worker). +static int acquire_graph_worker(const HostDispatcherConfig &config, bool use_function_table, - const cudaq_function_entry_t* entry, + const cudaq_function_entry_t *entry, uint32_t function_id) { - if (use_function_table && entry && entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) + if (use_function_table && entry && + entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) return find_idle_graph_worker_for_function(config, function_id); uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); if (mask == 0) @@ -86,34 +91,40 @@ static int acquire_graph_worker(const HostDispatcherConfig& config, } /// Launch the graph for the given worker; set tx_flags on success or error. -static void launch_graph_worker(const HostDispatcherConfig& config, - int worker_id, - void* slot_host, +static void launch_graph_worker(const HostDispatcherConfig &config, + int worker_id, void *slot_host, size_t current_slot) { - config.idle_mask->fetch_and(~(1ULL << worker_id), cuda::std::memory_order_release); + config.idle_mask->fetch_and(~(1ULL << worker_id), + cuda::std::memory_order_release); config.inflight_slot_tags[worker_id] = static_cast(current_slot); - ptrdiff_t offset = static_cast(slot_host) - config.rx_data_host; - void* data_dev = static_cast(config.rx_data_dev + offset); + ptrdiff_t offset = static_cast(slot_host) - config.rx_data_host; + void *data_dev = static_cast(config.rx_data_dev + offset); config.h_mailbox_bank[worker_id] = data_dev; __sync_synchronize(); const size_t w = static_cast(worker_id); if (config.workers[w].pre_launch_fn) - config.workers[w].pre_launch_fn(config.workers[w].pre_launch_data, data_dev, config.workers[w].stream); - cudaError_t err = cudaGraphLaunch(config.workers[w].graph_exec, config.workers[w].stream); + config.workers[w].pre_launch_fn(config.workers[w].pre_launch_data, data_dev, + config.workers[w].stream); + cudaError_t err = + cudaGraphLaunch(config.workers[w].graph_exec, config.workers[w].stream); if (err != cudaSuccess) { uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; - config.tx_flags[current_slot].store(error_val, cuda::std::memory_order_release); - config.idle_mask->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + config.tx_flags[current_slot].store(error_val, + cuda::std::memory_order_release); + config.idle_mask->fetch_or(1ULL << worker_id, + cuda::std::memory_order_release); } else { if (config.workers[w].post_launch_fn) - config.workers[w].post_launch_fn(config.workers[w].post_launch_data, data_dev, config.workers[w].stream); + config.workers[w].post_launch_fn(config.workers[w].post_launch_data, + data_dev, config.workers[w].stream); // Always write IN_FLIGHT sentinel. The actual READY value is written // later by the CPU worker thread or the GPU-only cudaLaunchHostFunc // callback, after the graph has completed. - config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, cuda::std::memory_order_release); + config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, + cuda::std::memory_order_release); } } @@ -121,7 +132,7 @@ static void launch_graph_worker(const HostDispatcherConfig& config, // Main loop //----------------------------------------------------------------------------- -void host_dispatcher_loop(const HostDispatcherConfig& config) { +void host_dispatcher_loop(const HostDispatcherConfig &config) { size_t current_slot = 0; const size_t num_slots = config.num_slots; uint64_t packets_dispatched = 0; @@ -129,16 +140,17 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { (config.function_table != nullptr && config.function_table_count > 0); while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { - uint64_t rx_value = config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); + uint64_t rx_value = + config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); if (rx_value == 0) { QEC_CPU_RELAX(); continue; } - void* slot_host = reinterpret_cast(rx_value); + void *slot_host = reinterpret_cast(rx_value); uint32_t function_id = 0; - const cudaq_function_entry_t* entry = nullptr; + const cudaq_function_entry_t *entry = nullptr; // TODO: Remove non-function-table path; RPC framing is always required. if (use_function_table) { @@ -159,17 +171,19 @@ void host_dispatcher_loop(const HostDispatcherConfig& config) { continue; } - int worker_id = acquire_graph_worker(config, use_function_table, entry, function_id); + int worker_id = + acquire_graph_worker(config, use_function_table, entry, function_id); if (worker_id < 0) { QEC_CPU_RELAX(); continue; } launch_graph_worker(config, worker_id, slot_host, current_slot); - finish_slot_and_advance(config, current_slot, num_slots, packets_dispatched); + finish_slot_and_advance(config, current_slot, num_slots, + packets_dispatched); } - for (const auto& w : config.workers) { + for (const auto &w : config.workers) { cudaStreamSynchronize(w.stream); } diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu index e9c5be95..109fb79d 100644 --- a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu +++ b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu @@ -18,14 +18,14 @@ struct cudaq_host_dispatcher_handle { std::thread thread; std::vector workers; - cudaq::realtime::atomic_uint64_sys* idle_mask = nullptr; - int* inflight_slot_tags = nullptr; - void** h_mailbox_bank = nullptr; + cudaq::realtime::atomic_uint64_sys *idle_mask = nullptr; + int *inflight_slot_tags = nullptr; + void **h_mailbox_bank = nullptr; bool owns_mailbox = false; size_t num_workers = 0; }; -static size_t count_graph_launch_workers(const cudaq_function_table_t* table) { +static size_t count_graph_launch_workers(const cudaq_function_table_t *table) { size_t n = 0; for (uint32_t i = 0; i < table->count; ++i) { if (table->entries[i].dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) @@ -34,13 +34,10 @@ static size_t count_graph_launch_workers(const cudaq_function_table_t* table) { return n; } -extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( - const cudaq_ringbuffer_t* ringbuffer, - const cudaq_function_table_t* table, - const cudaq_dispatcher_config_t* config, - volatile int* shutdown_flag, - uint64_t* stats, - void** external_mailbox) { +extern "C" cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread( + const cudaq_ringbuffer_t *ringbuffer, const cudaq_function_table_t *table, + const cudaq_dispatcher_config_t *config, volatile int *shutdown_flag, + uint64_t *stats, void **external_mailbox) { if (!ringbuffer || !table || !config || !shutdown_flag || !stats) return nullptr; if (!ringbuffer->rx_flags_host || !ringbuffer->tx_flags_host || @@ -55,7 +52,7 @@ extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( if (num_workers == 0) return nullptr; - auto* handle = new (std::nothrow) cudaq_host_dispatcher_handle(); + auto *handle = new (std::nothrow) cudaq_host_dispatcher_handle(); if (!handle) return nullptr; @@ -65,10 +62,11 @@ extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( handle->h_mailbox_bank = external_mailbox; handle->owns_mailbox = false; } else { - handle->h_mailbox_bank = new (std::nothrow) void*[num_workers]; + handle->h_mailbox_bank = new (std::nothrow) void *[num_workers]; handle->owns_mailbox = true; } - if (!handle->idle_mask || !handle->inflight_slot_tags || !handle->h_mailbox_bank) { + if (!handle->idle_mask || !handle->inflight_slot_tags || + !handle->h_mailbox_bank) { delete handle->idle_mask; delete[] handle->inflight_slot_tags; if (handle->owns_mailbox) @@ -85,7 +83,7 @@ extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( continue; cudaStream_t stream = nullptr; if (cudaStreamCreate(&stream) != cudaSuccess) { - for (auto& w : handle->workers) + for (auto &w : handle->workers) cudaStreamDestroy(w.stream); delete handle->idle_mask; delete[] handle->inflight_slot_tags; @@ -105,10 +103,10 @@ extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( cuda::std::memory_order_release); cudaq::realtime::HostDispatcherConfig host_config; - host_config.rx_flags = - (cudaq::realtime::atomic_uint64_sys*)(uintptr_t)ringbuffer->rx_flags_host; - host_config.tx_flags = - (cudaq::realtime::atomic_uint64_sys*)(uintptr_t)ringbuffer->tx_flags_host; + host_config.rx_flags = (cudaq::realtime::atomic_uint64_sys *)(uintptr_t) + ringbuffer->rx_flags_host; + host_config.tx_flags = (cudaq::realtime::atomic_uint64_sys *)(uintptr_t) + ringbuffer->tx_flags_host; host_config.rx_data_host = ringbuffer->rx_data_host; host_config.rx_data_dev = ringbuffer->rx_data; host_config.tx_data_host = ringbuffer->tx_data_host; @@ -121,18 +119,20 @@ extern "C" cudaq_host_dispatcher_handle_t* cudaq_host_dispatcher_start_thread( host_config.function_table = table->entries; host_config.function_table_count = table->count; host_config.shutdown_flag = - (cudaq::realtime::atomic_int_sys*)(uintptr_t)shutdown_flag; + (cudaq::realtime::atomic_int_sys *)(uintptr_t)shutdown_flag; host_config.stats_counter = stats; host_config.live_dispatched = nullptr; host_config.idle_mask = handle->idle_mask; host_config.inflight_slot_tags = handle->inflight_slot_tags; - handle->thread = std::thread(cudaq::realtime::host_dispatcher_loop, host_config); + handle->thread = + std::thread(cudaq::realtime::host_dispatcher_loop, host_config); return handle; } -extern "C" cudaq_status_t cudaq_host_dispatcher_release_worker( - cudaq_host_dispatcher_handle_t* handle, int worker_id) { +extern "C" cudaq_status_t +cudaq_host_dispatcher_release_worker(cudaq_host_dispatcher_handle_t *handle, + int worker_id) { if (!handle || !handle->idle_mask) return CUDAQ_ERR_INVALID_ARG; if (worker_id < 0 || static_cast(worker_id) >= handle->num_workers) @@ -142,12 +142,13 @@ extern "C" cudaq_status_t cudaq_host_dispatcher_release_worker( return CUDAQ_OK; } -extern "C" void cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t* handle) { +extern "C" void +cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t *handle) { if (!handle) return; if (handle->thread.joinable()) handle->thread.join(); - for (auto& w : handle->workers) + for (auto &w : handle->workers) cudaStreamDestroy(w.stream); delete handle->idle_mask; delete[] handle->inflight_slot_tags; diff --git a/realtime/lib/pipeline/realtime_pipeline.cu b/realtime/lib/pipeline/realtime_pipeline.cu index 35fce363..586cd250 100644 --- a/realtime/lib/pipeline/realtime_pipeline.cu +++ b/realtime/lib/pipeline/realtime_pipeline.cu @@ -6,12 +6,12 @@ * the terms of the Apache License 2.0 which accompanies this distribution. ******************************************************************************/ -#include "cudaq/realtime/pipeline.h" #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" +#include "cudaq/realtime/pipeline.h" -#include #include +#include #include #include @@ -36,19 +36,19 @@ namespace cudaq::realtime { do { \ cudaError_t err = (call); \ if (err != cudaSuccess) { \ - std::cerr << "RealtimePipeline CUDA error: " \ - << cudaGetErrorString(err) << " at " << __FILE__ << ":" \ - << __LINE__ << std::endl; \ + std::cerr << "RealtimePipeline CUDA error: " << cudaGetErrorString(err) \ + << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ std::abort(); \ } \ } while (0) -static void pin_thread(std::thread& t, int core) { - if (core < 0) return; - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(core, &cpuset); - pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); +static void pin_thread(std::thread &t, int core) { + if (core < 0) + return; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); } // --------------------------------------------------------------------------- @@ -56,136 +56,135 @@ static void pin_thread(std::thread& t, int core) { // --------------------------------------------------------------------------- struct GpuOnlyWorkerCtx { - atomic_uint64_sys* tx_flags; - atomic_uint64_sys* idle_mask; - int* inflight_slot_tags; - uint8_t* rx_data_host; - size_t slot_size; - int worker_id; - void (*user_post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); - void* user_post_launch_data; - int origin_slot; - uint64_t tx_value; + atomic_uint64_sys *tx_flags; + atomic_uint64_sys *idle_mask; + int *inflight_slot_tags; + uint8_t *rx_data_host; + size_t slot_size; + int worker_id; + void (*user_post_launch_fn)(void *user_data, void *slot_dev, + cudaStream_t stream); + void *user_post_launch_data; + int origin_slot; + uint64_t tx_value; }; -static void gpu_only_host_callback(void* user_data) { - auto* ctx = static_cast(user_data); - ctx->tx_flags[ctx->origin_slot].store( - ctx->tx_value, cuda::std::memory_order_release); - ctx->idle_mask->fetch_or( - 1ULL << ctx->worker_id, cuda::std::memory_order_release); +static void gpu_only_host_callback(void *user_data) { + auto *ctx = static_cast(user_data); + ctx->tx_flags[ctx->origin_slot].store(ctx->tx_value, + cuda::std::memory_order_release); + ctx->idle_mask->fetch_or(1ULL << ctx->worker_id, + cuda::std::memory_order_release); } -static void gpu_only_post_launch(void* user_data, void* slot_dev, +static void gpu_only_post_launch(void *user_data, void *slot_dev, cudaStream_t stream) { - auto* ctx = static_cast(user_data); + auto *ctx = static_cast(user_data); - if (ctx->user_post_launch_fn) - ctx->user_post_launch_fn(ctx->user_post_launch_data, slot_dev, stream); + if (ctx->user_post_launch_fn) + ctx->user_post_launch_fn(ctx->user_post_launch_data, slot_dev, stream); - ctx->origin_slot = ctx->inflight_slot_tags[ctx->worker_id]; - uint8_t* slot_host = ctx->rx_data_host + - static_cast(ctx->origin_slot) * ctx->slot_size; - ctx->tx_value = reinterpret_cast(slot_host); + ctx->origin_slot = ctx->inflight_slot_tags[ctx->worker_id]; + uint8_t *slot_host = ctx->rx_data_host + + static_cast(ctx->origin_slot) * ctx->slot_size; + ctx->tx_value = reinterpret_cast(slot_host); - cudaLaunchHostFunc(stream, gpu_only_host_callback, ctx); + cudaLaunchHostFunc(stream, gpu_only_host_callback, ctx); } - // --------------------------------------------------------------------------- // RingBufferManager // --------------------------------------------------------------------------- class RingBufferManager { public: - RingBufferManager(size_t num_slots, size_t slot_size) - : num_slots_(num_slots), slot_size_(slot_size) - { - PIPELINE_CUDA_CHECK(cudaHostAlloc(&buf_rx_, - num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); - rx_flags_ = static_cast(buf_rx_); - for (size_t i = 0; i < num_slots; ++i) - new (rx_flags_ + i) atomic_uint64_sys(0); - - PIPELINE_CUDA_CHECK(cudaHostAlloc(&buf_tx_, - num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); - tx_flags_ = static_cast(buf_tx_); - for (size_t i = 0; i < num_slots; ++i) - new (tx_flags_ + i) atomic_uint64_sys(0); - - PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&rx_flags_dev_), buf_rx_, 0)); - PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&tx_flags_dev_), buf_tx_, 0)); - - PIPELINE_CUDA_CHECK(cudaHostAlloc( - reinterpret_cast(&rx_data_host_), - num_slots * slot_size, cudaHostAllocMapped)); - PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&rx_data_dev_), rx_data_host_, 0)); - - rb_.rx_flags = reinterpret_cast(rx_flags_); - rb_.tx_flags = reinterpret_cast(tx_flags_); - rb_.rx_data = rx_data_dev_; - rb_.tx_data = rx_data_dev_; - rb_.rx_stride_sz = slot_size; - rb_.tx_stride_sz = slot_size; - rb_.rx_flags_host = reinterpret_cast(rx_flags_); - rb_.tx_flags_host = reinterpret_cast(tx_flags_); - rb_.rx_data_host = rx_data_host_; - rb_.tx_data_host = rx_data_host_; + RingBufferManager(size_t num_slots, size_t slot_size) + : num_slots_(num_slots), slot_size_(slot_size) { + PIPELINE_CUDA_CHECK(cudaHostAlloc( + &buf_rx_, num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + rx_flags_ = static_cast(buf_rx_); + for (size_t i = 0; i < num_slots; ++i) + new (rx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostAlloc( + &buf_tx_, num_slots * sizeof(atomic_uint64_sys), cudaHostAllocMapped)); + tx_flags_ = static_cast(buf_tx_); + for (size_t i = 0; i < num_slots; ++i) + new (tx_flags_ + i) atomic_uint64_sys(0); + + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_flags_dev_), buf_rx_, 0)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&tx_flags_dev_), buf_tx_, 0)); + + PIPELINE_CUDA_CHECK(cudaHostAlloc(reinterpret_cast(&rx_data_host_), + num_slots * slot_size, + cudaHostAllocMapped)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&rx_data_dev_), rx_data_host_, 0)); + + rb_.rx_flags = reinterpret_cast(rx_flags_); + rb_.tx_flags = reinterpret_cast(tx_flags_); + rb_.rx_data = rx_data_dev_; + rb_.tx_data = rx_data_dev_; + rb_.rx_stride_sz = slot_size; + rb_.tx_stride_sz = slot_size; + rb_.rx_flags_host = reinterpret_cast(rx_flags_); + rb_.tx_flags_host = reinterpret_cast(tx_flags_); + rb_.rx_data_host = rx_data_host_; + rb_.tx_data_host = rx_data_host_; + } + + ~RingBufferManager() { + for (size_t i = 0; i < num_slots_; ++i) { + rx_flags_[i].~atomic_uint64_sys(); + tx_flags_[i].~atomic_uint64_sys(); } - - ~RingBufferManager() { - for (size_t i = 0; i < num_slots_; ++i) { - rx_flags_[i].~atomic_uint64_sys(); - tx_flags_[i].~atomic_uint64_sys(); - } - cudaFreeHost(buf_rx_); - cudaFreeHost(buf_tx_); - cudaFreeHost(rx_data_host_); - } - - bool slot_available(uint32_t slot) const { - return cudaq_host_ringbuffer_slot_available(&rb_, slot) != 0; - } - - void write_and_signal(uint32_t slot, uint32_t function_id, - const void* payload, uint32_t payload_len) { - cudaq_host_ringbuffer_write_rpc_request( - &rb_, slot, function_id, payload, payload_len); - cudaq_host_ringbuffer_signal_slot(&rb_, slot); - } - - cudaq_tx_status_t poll_tx(uint32_t slot, int* cuda_error) const { - return cudaq_host_ringbuffer_poll_tx_flag(&rb_, slot, cuda_error); - } - - void clear_slot(uint32_t slot) { - cudaq_host_ringbuffer_clear_slot(&rb_, slot); - } - - size_t num_slots() const { return num_slots_; } - size_t slot_size() const { return slot_size_; } - - atomic_uint64_sys* rx_flags() { return rx_flags_; } - atomic_uint64_sys* tx_flags() { return tx_flags_; } - uint8_t* rx_data_host() { return rx_data_host_; } - uint8_t* rx_data_dev() { return rx_data_dev_; } - const cudaq_ringbuffer_t& ringbuffer() const { return rb_; } + cudaFreeHost(buf_rx_); + cudaFreeHost(buf_tx_); + cudaFreeHost(rx_data_host_); + } + + bool slot_available(uint32_t slot) const { + return cudaq_host_ringbuffer_slot_available(&rb_, slot) != 0; + } + + void write_and_signal(uint32_t slot, uint32_t function_id, + const void *payload, uint32_t payload_len) { + cudaq_host_ringbuffer_write_rpc_request(&rb_, slot, function_id, payload, + payload_len); + cudaq_host_ringbuffer_signal_slot(&rb_, slot); + } + + cudaq_tx_status_t poll_tx(uint32_t slot, int *cuda_error) const { + return cudaq_host_ringbuffer_poll_tx_flag(&rb_, slot, cuda_error); + } + + void clear_slot(uint32_t slot) { + cudaq_host_ringbuffer_clear_slot(&rb_, slot); + } + + size_t num_slots() const { return num_slots_; } + size_t slot_size() const { return slot_size_; } + + atomic_uint64_sys *rx_flags() { return rx_flags_; } + atomic_uint64_sys *tx_flags() { return tx_flags_; } + uint8_t *rx_data_host() { return rx_data_host_; } + uint8_t *rx_data_dev() { return rx_data_dev_; } + const cudaq_ringbuffer_t &ringbuffer() const { return rb_; } private: - size_t num_slots_; - size_t slot_size_; - void* buf_rx_ = nullptr; - void* buf_tx_ = nullptr; - atomic_uint64_sys* rx_flags_ = nullptr; - atomic_uint64_sys* tx_flags_ = nullptr; - uint64_t* rx_flags_dev_ = nullptr; - uint64_t* tx_flags_dev_ = nullptr; - uint8_t* rx_data_host_ = nullptr; - uint8_t* rx_data_dev_ = nullptr; - cudaq_ringbuffer_t rb_{}; + size_t num_slots_; + size_t slot_size_; + void *buf_rx_ = nullptr; + void *buf_tx_ = nullptr; + atomic_uint64_sys *rx_flags_ = nullptr; + atomic_uint64_sys *tx_flags_ = nullptr; + uint64_t *rx_flags_dev_ = nullptr; + uint64_t *tx_flags_dev_ = nullptr; + uint8_t *rx_data_host_ = nullptr; + uint8_t *rx_data_dev_ = nullptr; + cudaq_ringbuffer_t rb_{}; }; // --------------------------------------------------------------------------- @@ -193,382 +192,380 @@ private: // --------------------------------------------------------------------------- struct RealtimePipeline::Impl { - PipelineStageConfig config; - - GpuStageFactory gpu_factory; - CpuStageCallback cpu_stage; - CompletionCallback completion_handler; - - // Owned infrastructure - std::unique_ptr ring; - void** h_mailbox_bank = nullptr; - void** d_mailbox_bank = nullptr; - - // Dispatcher state (hidden atomics) - atomic_int_sys shutdown_flag{0}; - uint64_t dispatcher_stats = 0; - atomic_uint64_sys live_dispatched{0}; - atomic_uint64_sys idle_mask{0}; - std::vector inflight_slot_tags; - - // Function table - std::vector function_table; - - // Per-worker GPU resources (from factory) - std::vector worker_resources; - - // GPU-only mode state - bool gpu_only = false; - std::vector gpu_only_ctxs; - - // Slot-to-request mapping (consumer-owned) - std::vector slot_request; - std::vector slot_occupied; - - // Stats (atomic counters) - std::atomic total_submitted{0}; - std::atomic total_completed{0}; - std::atomic backpressure_stalls{0}; - - // Thread coordination - std::atomic producer_stop{false}; - std::atomic consumer_stop{false}; - - // Threads - std::thread dispatcher_thread; - std::thread consumer_thread; - std::vector worker_threads; - - std::atomic started{false}; - - // ----------------------------------------------------------------------- - // Lifecycle - // ----------------------------------------------------------------------- - - void allocate(const PipelineStageConfig& cfg) { - if (cfg.num_workers > 64) { - throw std::invalid_argument( - "num_workers (" + std::to_string(cfg.num_workers) + - ") exceeds idle_mask capacity of 64"); - } + PipelineStageConfig config; + + GpuStageFactory gpu_factory; + CpuStageCallback cpu_stage; + CompletionCallback completion_handler; + + // Owned infrastructure + std::unique_ptr ring; + void **h_mailbox_bank = nullptr; + void **d_mailbox_bank = nullptr; + + // Dispatcher state (hidden atomics) + atomic_int_sys shutdown_flag{0}; + uint64_t dispatcher_stats = 0; + atomic_uint64_sys live_dispatched{0}; + atomic_uint64_sys idle_mask{0}; + std::vector inflight_slot_tags; + + // Function table + std::vector function_table; + + // Per-worker GPU resources (from factory) + std::vector worker_resources; + + // GPU-only mode state + bool gpu_only = false; + std::vector gpu_only_ctxs; + + // Slot-to-request mapping (consumer-owned) + std::vector slot_request; + std::vector slot_occupied; + + // Stats (atomic counters) + std::atomic total_submitted{0}; + std::atomic total_completed{0}; + std::atomic backpressure_stalls{0}; + + // Thread coordination + std::atomic producer_stop{false}; + std::atomic consumer_stop{false}; + + // Threads + std::thread dispatcher_thread; + std::thread consumer_thread; + std::vector worker_threads; + + std::atomic started{false}; + + // ----------------------------------------------------------------------- + // Lifecycle + // ----------------------------------------------------------------------- + + void allocate(const PipelineStageConfig &cfg) { + if (cfg.num_workers > 64) { + throw std::invalid_argument("num_workers (" + + std::to_string(cfg.num_workers) + + ") exceeds idle_mask capacity of 64"); + } + + config = cfg; - config = cfg; + ring = std::make_unique( + static_cast(cfg.num_slots), cfg.slot_size); - ring = std::make_unique( - static_cast(cfg.num_slots), cfg.slot_size); + PIPELINE_CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, + cfg.num_workers * sizeof(void *), + cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, cfg.num_workers * sizeof(void *)); + PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( + reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); - PIPELINE_CUDA_CHECK(cudaHostAlloc( - &h_mailbox_bank, cfg.num_workers * sizeof(void*), - cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, cfg.num_workers * sizeof(void*)); - PIPELINE_CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&d_mailbox_bank), h_mailbox_bank, 0)); + inflight_slot_tags.resize(cfg.num_workers, 0); + slot_request.resize(cfg.num_slots, 0); + slot_occupied.resize(cfg.num_slots, 0); + } - inflight_slot_tags.resize(cfg.num_workers, 0); - slot_request.resize(cfg.num_slots, 0); - slot_occupied.resize(cfg.num_slots, 0); + void start_threads() { + if (!gpu_factory) { + throw std::logic_error("gpu_factory must be set before calling start()"); } - void start_threads() { - if (!gpu_factory) { - throw std::logic_error( - "gpu_factory must be set before calling start()"); - } + const int nw = config.num_workers; + gpu_only = !cpu_stage; + + // Build GPU resources via user factory + worker_resources.resize(nw); + function_table.resize(nw); + for (int i = 0; i < nw; ++i) { + worker_resources[i] = gpu_factory(i); + function_table[i].function_id = worker_resources[i].function_id; + function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + function_table[i].handler.graph_exec = worker_resources[i].graph_exec; + std::memset(&function_table[i].schema, 0, + sizeof(function_table[i].schema)); + } - const int nw = config.num_workers; - gpu_only = !cpu_stage; - - // Build GPU resources via user factory - worker_resources.resize(nw); - function_table.resize(nw); - for (int i = 0; i < nw; ++i) { - worker_resources[i] = gpu_factory(i); - function_table[i].function_id = worker_resources[i].function_id; - function_table[i].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_table[i].handler.graph_exec = worker_resources[i].graph_exec; - std::memset(&function_table[i].schema, 0, sizeof(function_table[i].schema)); - } + // In GPU-only mode, set up per-worker contexts for cudaLaunchHostFunc + // completion signaling (chains user's post_launch_fn if provided). + if (gpu_only) { + gpu_only_ctxs.resize(nw); + for (int i = 0; i < nw; ++i) { + auto &c = gpu_only_ctxs[i]; + c.tx_flags = ring->tx_flags(); + c.idle_mask = &idle_mask; + c.inflight_slot_tags = inflight_slot_tags.data(); + c.rx_data_host = ring->rx_data_host(); + c.slot_size = config.slot_size; + c.worker_id = i; + c.user_post_launch_fn = worker_resources[i].post_launch_fn; + c.user_post_launch_data = worker_resources[i].post_launch_data; + c.origin_slot = 0; + c.tx_value = 0; + } + } - // In GPU-only mode, set up per-worker contexts for cudaLaunchHostFunc - // completion signaling (chains user's post_launch_fn if provided). - if (gpu_only) { - gpu_only_ctxs.resize(nw); - for (int i = 0; i < nw; ++i) { - auto& c = gpu_only_ctxs[i]; - c.tx_flags = ring->tx_flags(); - c.idle_mask = &idle_mask; - c.inflight_slot_tags = inflight_slot_tags.data(); - c.rx_data_host = ring->rx_data_host(); - c.slot_size = config.slot_size; - c.worker_id = i; - c.user_post_launch_fn = worker_resources[i].post_launch_fn; - c.user_post_launch_data = worker_resources[i].post_launch_data; - c.origin_slot = 0; - c.tx_value = 0; - } - } + // Initialize idle_mask with all workers free + uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1); + idle_mask.store(initial_idle, cuda::std::memory_order_release); + + // Build HostDispatcherConfig + HostDispatcherConfig disp_cfg; + disp_cfg.rx_flags = ring->rx_flags(); + disp_cfg.tx_flags = ring->tx_flags(); + disp_cfg.rx_data_host = ring->rx_data_host(); + disp_cfg.rx_data_dev = ring->rx_data_dev(); + disp_cfg.tx_data_host = nullptr; + disp_cfg.tx_data_dev = nullptr; + disp_cfg.tx_stride_sz = config.slot_size; + disp_cfg.h_mailbox_bank = h_mailbox_bank; + disp_cfg.num_slots = static_cast(config.num_slots); + disp_cfg.slot_size = config.slot_size; + disp_cfg.function_table = function_table.data(); + disp_cfg.function_table_count = static_cast(nw); + disp_cfg.shutdown_flag = &shutdown_flag; + disp_cfg.stats_counter = &dispatcher_stats; + disp_cfg.live_dispatched = &live_dispatched; + disp_cfg.idle_mask = &idle_mask; + disp_cfg.inflight_slot_tags = inflight_slot_tags.data(); + + disp_cfg.workers.resize(nw); + for (int i = 0; i < nw; ++i) { + disp_cfg.workers[i].graph_exec = worker_resources[i].graph_exec; + disp_cfg.workers[i].stream = worker_resources[i].stream; + disp_cfg.workers[i].function_id = worker_resources[i].function_id; + disp_cfg.workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; + disp_cfg.workers[i].pre_launch_data = worker_resources[i].pre_launch_data; + + if (gpu_only) { + disp_cfg.workers[i].post_launch_fn = gpu_only_post_launch; + disp_cfg.workers[i].post_launch_data = &gpu_only_ctxs[i]; + } else { + disp_cfg.workers[i].post_launch_fn = worker_resources[i].post_launch_fn; + disp_cfg.workers[i].post_launch_data = + worker_resources[i].post_launch_data; + } + } - // Initialize idle_mask with all workers free - uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1); - idle_mask.store(initial_idle, cuda::std::memory_order_release); - - // Build HostDispatcherConfig - HostDispatcherConfig disp_cfg; - disp_cfg.rx_flags = ring->rx_flags(); - disp_cfg.tx_flags = ring->tx_flags(); - disp_cfg.rx_data_host = ring->rx_data_host(); - disp_cfg.rx_data_dev = ring->rx_data_dev(); - disp_cfg.tx_data_host = nullptr; - disp_cfg.tx_data_dev = nullptr; - disp_cfg.tx_stride_sz = config.slot_size; - disp_cfg.h_mailbox_bank = h_mailbox_bank; - disp_cfg.num_slots = static_cast(config.num_slots); - disp_cfg.slot_size = config.slot_size; - disp_cfg.function_table = function_table.data(); - disp_cfg.function_table_count = static_cast(nw); - disp_cfg.shutdown_flag = &shutdown_flag; - disp_cfg.stats_counter = &dispatcher_stats; - disp_cfg.live_dispatched = &live_dispatched; - disp_cfg.idle_mask = &idle_mask; - disp_cfg.inflight_slot_tags = inflight_slot_tags.data(); - - disp_cfg.workers.resize(nw); - for (int i = 0; i < nw; ++i) { - disp_cfg.workers[i].graph_exec = worker_resources[i].graph_exec; - disp_cfg.workers[i].stream = worker_resources[i].stream; - disp_cfg.workers[i].function_id = worker_resources[i].function_id; - disp_cfg.workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; - disp_cfg.workers[i].pre_launch_data = worker_resources[i].pre_launch_data; - - if (gpu_only) { - disp_cfg.workers[i].post_launch_fn = gpu_only_post_launch; - disp_cfg.workers[i].post_launch_data = &gpu_only_ctxs[i]; - } else { - disp_cfg.workers[i].post_launch_fn = worker_resources[i].post_launch_fn; - disp_cfg.workers[i].post_launch_data = worker_resources[i].post_launch_data; - } - } + // --- Dispatcher thread --- + dispatcher_thread = std::thread( + [cfg = std::move(disp_cfg)]() { host_dispatcher_loop(cfg); }); + pin_thread(dispatcher_thread, config.cores.dispatcher); + + // --- Worker threads (skipped in GPU-only mode) --- + if (!gpu_only) { + worker_threads.resize(nw); + for (int i = 0; i < nw; ++i) { + worker_threads[i] = std::thread([this, i]() { worker_loop(i); }); + int core = + (config.cores.worker_base >= 0) ? config.cores.worker_base + i : -1; + pin_thread(worker_threads[i], core); + } + } - // --- Dispatcher thread --- - dispatcher_thread = std::thread([cfg = std::move(disp_cfg)]() { - host_dispatcher_loop(cfg); - }); - pin_thread(dispatcher_thread, config.cores.dispatcher); - - // --- Worker threads (skipped in GPU-only mode) --- - if (!gpu_only) { - worker_threads.resize(nw); - for (int i = 0; i < nw; ++i) { - worker_threads[i] = std::thread([this, i]() { worker_loop(i); }); - int core = (config.cores.worker_base >= 0) - ? config.cores.worker_base + i : -1; - pin_thread(worker_threads[i], core); - } - } + // --- Consumer thread --- + consumer_thread = std::thread([this]() { consumer_loop(); }); + pin_thread(consumer_thread, config.cores.consumer); - // --- Consumer thread --- - consumer_thread = std::thread([this]() { consumer_loop(); }); - pin_thread(consumer_thread, config.cores.consumer); + started = true; + } - started = true; - } + void stop_all() { + if (!started) + return; - void stop_all() { - if (!started) return; + // Signal consumer to finish pending work + producer_stop.store(true, std::memory_order_release); - // Signal consumer to finish pending work - producer_stop.store(true, std::memory_order_release); + // Grace period for in-flight requests + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); + while (total_completed.load(std::memory_order_relaxed) < + total_submitted.load(std::memory_order_relaxed) && + std::chrono::steady_clock::now() < deadline) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } - // Grace period for in-flight requests - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); - while (total_completed.load(std::memory_order_relaxed) < - total_submitted.load(std::memory_order_relaxed) && - std::chrono::steady_clock::now() < deadline) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } + consumer_stop.store(true, std::memory_order_release); - consumer_stop.store(true, std::memory_order_release); + // Shut down dispatcher + shutdown_flag.store(1, cuda::std::memory_order_release); + dispatcher_thread.join(); - // Shut down dispatcher - shutdown_flag.store(1, cuda::std::memory_order_release); - dispatcher_thread.join(); + // Consumer + consumer_thread.join(); - // Consumer - consumer_thread.join(); + // Workers check shutdown via consumer_stop (they spin on ready_flags, + // which will never fire after dispatcher is gone, so we need to break + // them out). We set consumer_stop which doubles as system_stop for + // workers; the user's poll_next_job must eventually return false. + for (auto &t : worker_threads) { + if (t.joinable()) + t.join(); + } - // Workers check shutdown via consumer_stop (they spin on ready_flags, - // which will never fire after dispatcher is gone, so we need to break - // them out). We set consumer_stop which doubles as system_stop for - // workers; the user's poll_next_job must eventually return false. - for (auto& t : worker_threads) { - if (t.joinable()) t.join(); - } + started = false; + } - started = false; + void free_resources() { + ring.reset(); + if (h_mailbox_bank) { + cudaFreeHost(h_mailbox_bank); + h_mailbox_bank = nullptr; } + } + + // ----------------------------------------------------------------------- + // Worker loop (one per worker thread) + // ----------------------------------------------------------------------- + + void worker_loop(int worker_id) { + auto *wr = &worker_resources[worker_id]; + + // The cpu_stage callback is called in "poll mode" + // (gpu_output == nullptr). It polls its own GPU-ready + // mechanism and, if a result is available, processes it and + // writes the RPC response. Returns 0 when nothing was ready, + // >0 when a job was completed. The pipeline then handles all + // atomic signaling (tx_flags, idle_mask). + + while (!consumer_stop.load(std::memory_order_relaxed)) { + CpuStageContext ctx; + ctx.worker_id = worker_id; + ctx.origin_slot = inflight_slot_tags[worker_id]; + ctx.gpu_output = nullptr; + ctx.gpu_output_size = 0; + ctx.response_buffer = nullptr; + ctx.max_response_size = 0; + ctx.user_context = wr->user_context; + + size_t written = cpu_stage(ctx); + if (written == 0) { + QEC_CPU_RELAX(); + continue; + } - void free_resources() { - ring.reset(); - if (h_mailbox_bank) { - cudaFreeHost(h_mailbox_bank); - h_mailbox_bank = nullptr; - } - } + int origin_slot = inflight_slot_tags[worker_id]; - // ----------------------------------------------------------------------- - // Worker loop (one per worker thread) - // ----------------------------------------------------------------------- - - void worker_loop(int worker_id) { - auto* wr = &worker_resources[worker_id]; - - // The cpu_stage callback is called in "poll mode" - // (gpu_output == nullptr). It polls its own GPU-ready - // mechanism and, if a result is available, processes it and - // writes the RPC response. Returns 0 when nothing was ready, - // >0 when a job was completed. The pipeline then handles all - // atomic signaling (tx_flags, idle_mask). - - while (!consumer_stop.load(std::memory_order_relaxed)) { - CpuStageContext ctx; - ctx.worker_id = worker_id; - ctx.origin_slot = inflight_slot_tags[worker_id]; - ctx.gpu_output = nullptr; - ctx.gpu_output_size = 0; - ctx.response_buffer = nullptr; - ctx.max_response_size = 0; - ctx.user_context = wr->user_context; - - size_t written = cpu_stage(ctx); - if (written == 0) { - QEC_CPU_RELAX(); - continue; - } - - int origin_slot = inflight_slot_tags[worker_id]; - - uint8_t* slot_host = ring->rx_data_host() + - static_cast(origin_slot) * config.slot_size; - uint64_t rx_value = reinterpret_cast(slot_host); - - ring->tx_flags()[origin_slot].store( - rx_value, cuda::std::memory_order_release); - - idle_mask.fetch_or(1ULL << worker_id, - cuda::std::memory_order_release); - } - } + uint8_t *slot_host = ring->rx_data_host() + + static_cast(origin_slot) * config.slot_size; + uint64_t rx_value = reinterpret_cast(slot_host); - // ----------------------------------------------------------------------- - // Consumer loop - // ----------------------------------------------------------------------- - - void consumer_loop() { - const uint32_t ns = static_cast(config.num_slots); - - while (true) { - if (consumer_stop.load(std::memory_order_acquire)) - break; - - bool pdone = producer_stop.load(std::memory_order_acquire); - uint64_t nsub = total_submitted.load(std::memory_order_acquire); - uint64_t ncomp = total_completed.load(std::memory_order_relaxed); - - if (pdone && ncomp >= nsub) - break; - - bool found_any = false; - for (uint32_t s = 0; s < ns; ++s) { - if (!slot_occupied[s]) continue; - - int cuda_error = 0; - cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error); - - if (status == CUDAQ_TX_READY) { - if (completion_handler) { - Completion c; - c.request_id = slot_request[s]; - c.slot = static_cast(s); - c.success = true; - c.cuda_error = 0; - completion_handler(c); - } - total_completed.fetch_add(1, std::memory_order_relaxed); - - // ARM memory ordering: clear occupancy BEFORE - // clearing ring buffer flags, with a fence between. - slot_occupied[s] = 0; - __sync_synchronize(); - ring->clear_slot(s); - found_any = true; - - } else if (status == CUDAQ_TX_ERROR) { - if (completion_handler) { - Completion c; - c.request_id = slot_request[s]; - c.slot = static_cast(s); - c.success = false; - c.cuda_error = cuda_error; - completion_handler(c); - } - total_completed.fetch_add(1, std::memory_order_relaxed); - slot_occupied[s] = 0; - __sync_synchronize(); - ring->clear_slot(s); - found_any = true; - } - } - - if (!found_any) - QEC_CPU_RELAX(); + ring->tx_flags()[origin_slot].store(rx_value, + cuda::std::memory_order_release); + + idle_mask.fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + } + } + + // ----------------------------------------------------------------------- + // Consumer loop + // ----------------------------------------------------------------------- + + void consumer_loop() { + const uint32_t ns = static_cast(config.num_slots); + + while (true) { + if (consumer_stop.load(std::memory_order_acquire)) + break; + + bool pdone = producer_stop.load(std::memory_order_acquire); + uint64_t nsub = total_submitted.load(std::memory_order_acquire); + uint64_t ncomp = total_completed.load(std::memory_order_relaxed); + + if (pdone && ncomp >= nsub) + break; + + bool found_any = false; + for (uint32_t s = 0; s < ns; ++s) { + if (!slot_occupied[s]) + continue; + + int cuda_error = 0; + cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error); + + if (status == CUDAQ_TX_READY) { + if (completion_handler) { + Completion c; + c.request_id = slot_request[s]; + c.slot = static_cast(s); + c.success = true; + c.cuda_error = 0; + completion_handler(c); + } + total_completed.fetch_add(1, std::memory_order_relaxed); + + // ARM memory ordering: clear occupancy BEFORE + // clearing ring buffer flags, with a fence between. + slot_occupied[s] = 0; + __sync_synchronize(); + ring->clear_slot(s); + found_any = true; + + } else if (status == CUDAQ_TX_ERROR) { + if (completion_handler) { + Completion c; + c.request_id = slot_request[s]; + c.slot = static_cast(s); + c.success = false; + c.cuda_error = cuda_error; + completion_handler(c); + } + total_completed.fetch_add(1, std::memory_order_relaxed); + slot_occupied[s] = 0; + __sync_synchronize(); + ring->clear_slot(s); + found_any = true; } + } + + if (!found_any) + QEC_CPU_RELAX(); } + } }; // --------------------------------------------------------------------------- // RealtimePipeline public API // --------------------------------------------------------------------------- -RealtimePipeline::RealtimePipeline(const PipelineStageConfig& config) - : impl_(std::make_unique()) -{ - impl_->allocate(config); +RealtimePipeline::RealtimePipeline(const PipelineStageConfig &config) + : impl_(std::make_unique()) { + impl_->allocate(config); } RealtimePipeline::~RealtimePipeline() { - if (impl_->started) - impl_->stop_all(); - impl_->free_resources(); + if (impl_->started) + impl_->stop_all(); + impl_->free_resources(); } void RealtimePipeline::set_gpu_stage(GpuStageFactory factory) { - impl_->gpu_factory = std::move(factory); + impl_->gpu_factory = std::move(factory); } void RealtimePipeline::set_cpu_stage(CpuStageCallback callback) { - impl_->cpu_stage = std::move(callback); + impl_->cpu_stage = std::move(callback); } void RealtimePipeline::set_completion_handler(CompletionCallback handler) { - impl_->completion_handler = std::move(handler); + impl_->completion_handler = std::move(handler); } void RealtimePipeline::start() { - if (impl_->started) return; - impl_->start_threads(); + if (impl_->started) + return; + impl_->start_threads(); } -void RealtimePipeline::stop() { - impl_->stop_all(); -} +void RealtimePipeline::stop() { impl_->stop_all(); } RealtimePipeline::Stats RealtimePipeline::stats() const { - return { - impl_->total_submitted.load(std::memory_order_relaxed), - impl_->total_completed.load(std::memory_order_relaxed), - impl_->live_dispatched.load(cuda::std::memory_order_relaxed), - impl_->backpressure_stalls.load(std::memory_order_relaxed) - }; + return {impl_->total_submitted.load(std::memory_order_relaxed), + impl_->total_completed.load(std::memory_order_relaxed), + impl_->live_dispatched.load(cuda::std::memory_order_relaxed), + impl_->backpressure_stalls.load(std::memory_order_relaxed)}; } // --------------------------------------------------------------------------- @@ -576,69 +573,70 @@ RealtimePipeline::Stats RealtimePipeline::stats() const { // --------------------------------------------------------------------------- struct RingBufferInjector::State { - RingBufferManager* ring = nullptr; - std::vector* slot_request = nullptr; - std::vector* slot_occupied = nullptr; - std::atomic* total_submitted = nullptr; - std::atomic* backpressure_stalls = nullptr; - std::atomic* producer_stop = nullptr; - int num_slots = 0; - std::atomic next_slot{0}; + RingBufferManager *ring = nullptr; + std::vector *slot_request = nullptr; + std::vector *slot_occupied = nullptr; + std::atomic *total_submitted = nullptr; + std::atomic *backpressure_stalls = nullptr; + std::atomic *producer_stop = nullptr; + int num_slots = 0; + std::atomic next_slot{0}; }; RingBufferInjector RealtimePipeline::create_injector() { - auto s = std::make_unique(); - s->ring = impl_->ring.get(); - s->slot_request = &impl_->slot_request; - s->slot_occupied = &impl_->slot_occupied; - s->total_submitted = &impl_->total_submitted; - s->backpressure_stalls = &impl_->backpressure_stalls; - s->producer_stop = &impl_->producer_stop; - s->num_slots = impl_->config.num_slots; - return RingBufferInjector(std::move(s)); + auto s = std::make_unique(); + s->ring = impl_->ring.get(); + s->slot_request = &impl_->slot_request; + s->slot_occupied = &impl_->slot_occupied; + s->total_submitted = &impl_->total_submitted; + s->backpressure_stalls = &impl_->backpressure_stalls; + s->producer_stop = &impl_->producer_stop; + s->num_slots = impl_->config.num_slots; + return RingBufferInjector(std::move(s)); } RingBufferInjector::RingBufferInjector(std::unique_ptr s) : state_(std::move(s)) {} RingBufferInjector::~RingBufferInjector() = default; -RingBufferInjector::RingBufferInjector(RingBufferInjector&&) noexcept = default; -RingBufferInjector& RingBufferInjector::operator=(RingBufferInjector&&) noexcept = default; - -bool RingBufferInjector::try_submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id) { - uint32_t cur = state_->next_slot.load(std::memory_order_relaxed); - uint32_t slot = cur % static_cast(state_->num_slots); - if (!state_->ring->slot_available(slot)) - return false; - - if (!state_->next_slot.compare_exchange_weak( - cur, cur + 1, - std::memory_order_acq_rel, std::memory_order_relaxed)) - return false; - - state_->ring->write_and_signal(slot, function_id, payload, - static_cast(payload_size)); - - (*state_->slot_request)[slot] = request_id; - (*state_->slot_occupied)[slot] = 1; - state_->total_submitted->fetch_add(1, std::memory_order_release); - return true; +RingBufferInjector::RingBufferInjector(RingBufferInjector &&) noexcept = + default; +RingBufferInjector & +RingBufferInjector::operator=(RingBufferInjector &&) noexcept = default; + +bool RingBufferInjector::try_submit(uint32_t function_id, const void *payload, + size_t payload_size, uint64_t request_id) { + uint32_t cur = state_->next_slot.load(std::memory_order_relaxed); + uint32_t slot = cur % static_cast(state_->num_slots); + if (!state_->ring->slot_available(slot)) + return false; + + if (!state_->next_slot.compare_exchange_weak( + cur, cur + 1, std::memory_order_acq_rel, std::memory_order_relaxed)) + return false; + + state_->ring->write_and_signal(slot, function_id, payload, + static_cast(payload_size)); + + (*state_->slot_request)[slot] = request_id; + (*state_->slot_occupied)[slot] = 1; + state_->total_submitted->fetch_add(1, std::memory_order_release); + return true; } -void RingBufferInjector::submit(uint32_t function_id, const void* payload, - size_t payload_size, uint64_t request_id) { - while (!try_submit(function_id, payload, payload_size, request_id)) { - if (state_->producer_stop && - state_->producer_stop->load(std::memory_order_acquire)) - return; - state_->backpressure_stalls->fetch_add(1, std::memory_order_relaxed); - QEC_CPU_RELAX(); - } +void RingBufferInjector::submit(uint32_t function_id, const void *payload, + size_t payload_size, uint64_t request_id) { + while (!try_submit(function_id, payload, payload_size, request_id)) { + if (state_->producer_stop && + state_->producer_stop->load(std::memory_order_acquire)) + return; + state_->backpressure_stalls->fetch_add(1, std::memory_order_relaxed); + QEC_CPU_RELAX(); + } } uint64_t RingBufferInjector::backpressure_stalls() const { - return state_->backpressure_stalls->load(std::memory_order_relaxed); + return state_->backpressure_stalls->load(std::memory_order_relaxed); } } // namespace cudaq::realtime diff --git a/realtime/unittests/test_dispatch_kernel.cu b/realtime/unittests/test_dispatch_kernel.cu index bef7e049..05df4f96 100644 --- a/realtime/unittests/test_dispatch_kernel.cu +++ b/realtime/unittests/test_dispatch_kernel.cu @@ -6,18 +6,18 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include -#include #include -#include #include -#include +#include +#include #include +#include +#include #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" // Helper macro for CUDA error checking #define CUDA_CHECK(call) \ @@ -33,12 +33,12 @@ namespace { //============================================================================== /// @brief Test handler that adds 1 to each byte. -__device__ int increment_handler(const void* input, void* output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t* result_len) { - const std::uint8_t* in_data = static_cast(input); - std::uint8_t* out_data = static_cast(output); +__device__ int increment_handler(const void *input, void *output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len) { + const std::uint8_t *in_data = static_cast(input); + std::uint8_t *out_data = static_cast(output); for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { out_data[i] = in_data[i] + 1; } @@ -53,12 +53,12 @@ __device__ int increment_handler(const void* input, void* output, constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = cudaq::realtime::fnv1a_hash("rpc_increment"); -__device__ int rpc_increment_handler(const void* input, void* output, +__device__ int rpc_increment_handler(const void *input, void *output, std::uint32_t arg_len, std::uint32_t max_result_len, - std::uint32_t* result_len) { - const std::uint8_t* in_data = static_cast(input); - std::uint8_t* out_data = static_cast(output); + std::uint32_t *result_len) { + const std::uint8_t *in_data = static_cast(input); + std::uint8_t *out_data = static_cast(output); for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { out_data[i] = static_cast(in_data[i] + 1); } @@ -66,15 +66,16 @@ __device__ int rpc_increment_handler(const void* input, void* output, return 0; } -__global__ void init_rpc_function_table(cudaq_function_entry_t* entries) { +__global__ void init_rpc_function_table(cudaq_function_entry_t *entries) { if (threadIdx.x == 0 && blockIdx.x == 0) { - entries[0].handler.device_fn_ptr = reinterpret_cast(&rpc_increment_handler); + entries[0].handler.device_fn_ptr = + reinterpret_cast(&rpc_increment_handler); entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; entries[0].reserved[0] = 0; entries[0].reserved[1] = 0; entries[0].reserved[2] = 0; - + // Schema: 1 array argument (uint8), 1 array result (uint8) entries[0].schema.num_args = 1; entries[0].schema.num_results = 1; @@ -83,46 +84,44 @@ __global__ void init_rpc_function_table(cudaq_function_entry_t* entries) { entries[0].schema.args[0].reserved[0] = 0; entries[0].schema.args[0].reserved[1] = 0; entries[0].schema.args[0].reserved[2] = 0; - entries[0].schema.args[0].size_bytes = 0; // Variable size + entries[0].schema.args[0].size_bytes = 0; // Variable size entries[0].schema.args[0].num_elements = 0; // Variable size entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; entries[0].schema.results[0].reserved[0] = 0; entries[0].schema.results[0].reserved[1] = 0; entries[0].schema.results[0].reserved[2] = 0; - entries[0].schema.results[0].size_bytes = 0; // Variable size + entries[0].schema.results[0].size_bytes = 0; // Variable size entries[0].schema.results[0].num_elements = 0; // Variable size } } bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, - volatile uint64_t** host_flags_out, - volatile uint64_t** device_flags_out, - std::uint8_t** host_data_out, - std::uint8_t** device_data_out) { - void* host_flags_ptr = nullptr; - cudaError_t err = cudaHostAlloc(&host_flags_ptr, - num_slots * sizeof(uint64_t), + volatile uint64_t **host_flags_out, + volatile uint64_t **device_flags_out, + std::uint8_t **host_data_out, + std::uint8_t **device_data_out) { + void *host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, num_slots * sizeof(uint64_t), cudaHostAllocMapped); if (err != cudaSuccess) return false; - void* device_flags_ptr = nullptr; + void *device_flags_ptr = nullptr; err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); return false; } - void* host_data_ptr = nullptr; - err = cudaHostAlloc(&host_data_ptr, - num_slots * slot_size, - cudaHostAllocMapped); + void *host_data_ptr = nullptr; + err = + cudaHostAlloc(&host_data_ptr, num_slots * slot_size, cudaHostAllocMapped); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); return false; } - void* device_data_ptr = nullptr; + void *device_data_ptr = nullptr; err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); @@ -132,65 +131,53 @@ bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); - *host_flags_out = static_cast(host_flags_ptr); - *device_flags_out = static_cast(device_flags_ptr); - *host_data_out = static_cast(host_data_ptr); - *device_data_out = static_cast(device_data_ptr); + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); return true; } -void free_ring_buffer(volatile uint64_t* host_flags, - std::uint8_t* host_data) { +void free_ring_buffer(volatile uint64_t *host_flags, std::uint8_t *host_data) { if (host_flags) - cudaFreeHost(const_cast(host_flags)); + cudaFreeHost(const_cast(host_flags)); if (host_data) cudaFreeHost(host_data); } extern "C" void launch_dispatch_kernel_wrapper( - volatile std::uint64_t* rx_flags, - volatile std::uint64_t* tx_flags, - std::uint8_t* rx_data, - std::uint8_t* tx_data, - std::size_t rx_stride_sz, - std::size_t tx_stride_sz, - cudaq_function_entry_t* function_table, - std::size_t func_count, - volatile int* shutdown_flag, - std::uint64_t* stats, - std::size_t num_slots, - std::uint32_t num_blocks, - std::uint32_t threads_per_block, - cudaStream_t stream) { + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { cudaq_launch_dispatch_kernel_regular( rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, - function_table, func_count, - shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); } //============================================================================== // Test Kernel for DeviceCallMode //============================================================================== -using HandlerFunc = int (*)(const void*, void*, std::uint32_t, std::uint32_t, std::uint32_t*); +using HandlerFunc = int (*)(const void *, void *, std::uint32_t, std::uint32_t, + std::uint32_t *); __device__ HandlerFunc d_increment_handler = increment_handler; /// @brief Test kernel that dispatches to a handler using DeviceCallMode. template -__global__ void test_dispatch_kernel( - HandlerFunc handler, - const void* input, - void* output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t* result_len, - int* status) { - +__global__ void test_dispatch_kernel(HandlerFunc handler, const void *input, + void *output, std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len, int *status) { + if (threadIdx.x == 0 && blockIdx.x == 0) { *status = handler(input, output, arg_len, max_result_len, result_len); } - + KernelType::sync(); } @@ -205,16 +192,19 @@ protected: CUDA_CHECK(cudaMalloc(&d_result_len_, sizeof(std::uint32_t))); CUDA_CHECK(cudaMalloc(&d_status_, sizeof(int))); } - + void TearDown() override { - if (d_buffer_) cudaFree(d_buffer_); - if (d_result_len_) cudaFree(d_result_len_); - if (d_status_) cudaFree(d_status_); + if (d_buffer_) + cudaFree(d_buffer_); + if (d_result_len_) + cudaFree(d_result_len_); + if (d_status_) + cudaFree(d_status_); } - - void* d_buffer_ = nullptr; - std::uint32_t* d_result_len_ = nullptr; - int* d_status_ = nullptr; + + void *d_buffer_ = nullptr; + std::uint32_t *d_result_len_ = nullptr; + int *d_status_ = nullptr; }; //============================================================================== @@ -226,35 +216,37 @@ TEST_F(DispatchKernelTest, IncrementHandlerBasic) { std::vector input = {0, 1, 2, 3, 4}; std::vector expected = {1, 2, 3, 4, 5}; - void* d_input = nullptr; + void *d_input = nullptr; CUDA_CHECK(cudaMalloc(&d_input, 1024)); - CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), - cudaMemcpyHostToDevice)); - + CUDA_CHECK( + cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); + // Get device function pointer HandlerFunc h_handler; - CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, - sizeof(HandlerFunc))); - + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + // Launch kernel with separate input/output buffers - test_dispatch_kernel<<<1, 32>>>( - h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + test_dispatch_kernel + <<<1, 32>>>(h_handler, d_input, d_buffer_, input.size(), 1024, + d_result_len_, d_status_); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); - + // Check results int status; std::uint32_t result_len; - CUDA_CHECK(cudaMemcpy(&status, d_status_, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + CUDA_CHECK( + cudaMemcpy(&status, d_status_, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), cudaMemcpyDeviceToHost)); - + EXPECT_EQ(status, 0) << "Handler should return success"; EXPECT_EQ(result_len, input.size()) << "Result length should match input"; - + // Verify output buffer has incremented data std::vector output(input.size()); - CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), cudaMemcpyDeviceToHost)); EXPECT_EQ(expected, output) << "Increment handler should add 1 to each byte"; @@ -274,31 +266,32 @@ TEST_F(DispatchKernelTest, LargeBuffer) { for (std::size_t i = 0; i < size; ++i) { input[i] = static_cast(i & 0xFF); } - - void* d_input = nullptr; + + void *d_input = nullptr; CUDA_CHECK(cudaMalloc(&d_input, 1024)); - CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), - cudaMemcpyHostToDevice)); - + CUDA_CHECK( + cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); + HandlerFunc h_handler; - CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, - sizeof(HandlerFunc))); - - test_dispatch_kernel<<<1, 256>>>( - h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + + test_dispatch_kernel + <<<1, 256>>>(h_handler, d_input, d_buffer_, input.size(), 1024, + d_result_len_, d_status_); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); - + std::uint32_t result_len; - CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), cudaMemcpyDeviceToHost)); EXPECT_EQ(result_len, size) << "Should process all bytes"; - + // Verify all bytes incremented in output buffer std::vector output(size); - CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), cudaMemcpyDeviceToHost)); - + for (std::size_t i = 0; i < size; ++i) { uint8_t expected = static_cast((i + 1) & 0xFF); EXPECT_EQ(output[i], expected) << "Mismatch at index " << i; @@ -315,21 +308,22 @@ protected: ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, &tx_flags_, &tx_data_host_, &tx_data_)); - void* tmp_shutdown = nullptr; + void *tmp_shutdown = nullptr; CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); - shutdown_flag_ = static_cast(tmp_shutdown); - void* tmp_d_shutdown = nullptr; + shutdown_flag_ = static_cast(tmp_shutdown); + void *tmp_d_shutdown = nullptr; CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); - d_shutdown_flag_ = static_cast(tmp_d_shutdown); + d_shutdown_flag_ = static_cast(tmp_d_shutdown); *shutdown_flag_ = 0; int zero = 0; - CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, + CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, sizeof(int), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc(&d_stats_, sizeof(uint64_t))); CUDA_CHECK(cudaMemset(d_stats_, 0, sizeof(uint64_t))); - CUDA_CHECK(cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); + CUDA_CHECK( + cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); init_rpc_function_table<<<1, 1>>>(d_function_entries_); CUDA_CHECK(cudaDeviceSynchronize()); func_count_ = 1; @@ -344,7 +338,8 @@ protected: config.vp_id = 0; config.kernel_type = CUDAQ_KERNEL_REGULAR; config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), + CUDAQ_OK); cudaq_ringbuffer_t ringbuffer{}; ringbuffer.rx_flags = rx_flags_; @@ -353,12 +348,14 @@ protected: ringbuffer.tx_data = tx_data_; ringbuffer.rx_stride_sz = slot_size_; ringbuffer.tx_stride_sz = slot_size_; - ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), + CUDAQ_OK); cudaq_function_table_t table{}; table.entries = d_function_entries_; table.count = func_count_; - ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), + CUDAQ_OK); ASSERT_EQ( cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_), @@ -387,7 +384,7 @@ protected: free_ring_buffer(tx_flags_host_, tx_data_host_); if (shutdown_flag_) - cudaFreeHost(const_cast(shutdown_flag_)); + cudaFreeHost(const_cast(shutdown_flag_)); if (d_stats_) cudaFree(d_stats_); if (d_function_entries_) @@ -395,10 +392,10 @@ protected: } void write_rpc_request(std::size_t slot, - const std::vector& payload) { - std::uint8_t* slot_data = - const_cast(rx_data_host_) + slot * slot_size_; - auto* header = reinterpret_cast(slot_data); + const std::vector &payload) { + std::uint8_t *slot_data = + const_cast(rx_data_host_) + slot * slot_size_; + auto *header = reinterpret_cast(slot_data); header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; header->function_id = RPC_INCREMENT_FUNCTION_ID; header->arg_len = static_cast(payload.size()); @@ -406,16 +403,15 @@ protected: payload.size()); } - bool read_rpc_response(std::size_t slot, - std::vector& payload, - std::int32_t* status_out = nullptr, - std::uint32_t* result_len_out = nullptr) { + bool read_rpc_response(std::size_t slot, std::vector &payload, + std::int32_t *status_out = nullptr, + std::uint32_t *result_len_out = nullptr) { __sync_synchronize(); // Read from TX buffer (dispatch kernel writes response to symmetric TX) - const std::uint8_t* slot_data = - const_cast(tx_data_host_) + slot * slot_size_; - auto* response = - reinterpret_cast(slot_data); + const std::uint8_t *slot_data = + const_cast(tx_data_host_) + slot * slot_size_; + auto *response = + reinterpret_cast(slot_data); if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) return false; @@ -427,32 +423,31 @@ protected: return false; payload.resize(response->result_len); - memcpy(payload.data(), - slot_data + sizeof(cudaq::realtime::RPCResponse), + memcpy(payload.data(), slot_data + sizeof(cudaq::realtime::RPCResponse), response->result_len); return true; } static constexpr std::size_t num_slots_ = 2; std::size_t slot_size_ = 256; - volatile uint64_t* rx_flags_host_ = nullptr; - volatile uint64_t* tx_flags_host_ = nullptr; - volatile uint64_t* rx_flags_ = nullptr; - volatile uint64_t* tx_flags_ = nullptr; - std::uint8_t* rx_data_host_ = nullptr; - std::uint8_t* tx_data_host_ = nullptr; - std::uint8_t* rx_data_ = nullptr; - std::uint8_t* tx_data_ = nullptr; - - volatile int* shutdown_flag_ = nullptr; - volatile int* d_shutdown_flag_ = nullptr; - uint64_t* d_stats_ = nullptr; - - cudaq_function_entry_t* d_function_entries_ = nullptr; + volatile uint64_t *rx_flags_host_ = nullptr; + volatile uint64_t *tx_flags_host_ = nullptr; + volatile uint64_t *rx_flags_ = nullptr; + volatile uint64_t *tx_flags_ = nullptr; + std::uint8_t *rx_data_host_ = nullptr; + std::uint8_t *tx_data_host_ = nullptr; + std::uint8_t *rx_data_ = nullptr; + std::uint8_t *tx_data_ = nullptr; + + volatile int *shutdown_flag_ = nullptr; + volatile int *d_shutdown_flag_ = nullptr; + uint64_t *d_stats_ = nullptr; + + cudaq_function_entry_t *d_function_entries_ = nullptr; std::size_t func_count_ = 0; - cudaq_dispatch_manager_t* manager_ = nullptr; - cudaq_dispatcher_t* dispatcher_ = nullptr; + cudaq_dispatch_manager_t *manager_ = nullptr; + cudaq_dispatcher_t *dispatcher_ = nullptr; }; TEST_F(HostApiDispatchTest, RpcIncrementHandler) { @@ -460,7 +455,7 @@ TEST_F(HostApiDispatchTest, RpcIncrementHandler) { write_rpc_request(0, payload); __sync_synchronize(); - const_cast(rx_flags_host_)[0] = + const_cast(rx_flags_host_)[0] = reinterpret_cast(rx_data_); int timeout = 50; @@ -485,22 +480,24 @@ TEST_F(HostApiDispatchTest, RpcIncrementHandler) { //============================================================================== // Graph kernel that processes RPC buffer via pointer indirection -__global__ void graph_increment_kernel(void** buffer_ptr) { +__global__ void graph_increment_kernel(void **buffer_ptr) { if (threadIdx.x == 0 && blockIdx.x == 0) { - void* buffer = *buffer_ptr; - cudaq::realtime::RPCHeader* header = static_cast(buffer); - + void *buffer = *buffer_ptr; + cudaq::realtime::RPCHeader *header = + static_cast(buffer); + std::uint32_t arg_len = header->arg_len; - void* arg_buffer = static_cast(header + 1); - std::uint8_t* data = static_cast(arg_buffer); - + void *arg_buffer = static_cast(header + 1); + std::uint8_t *data = static_cast(arg_buffer); + // Increment each byte for (std::uint32_t i = 0; i < arg_len; ++i) { data[i] = data[i] + 1; } - + // Write response - cudaq::realtime::RPCResponse* response = static_cast(buffer); + cudaq::realtime::RPCResponse *response = + static_cast(buffer); response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = arg_len; @@ -510,7 +507,7 @@ __global__ void graph_increment_kernel(void** buffer_ptr) { constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = cudaq::realtime::fnv1a_hash("rpc_graph_increment"); -__global__ void init_graph_function_table(cudaq_function_entry_t* entries, +__global__ void init_graph_function_table(cudaq_function_entry_t *entries, cudaGraphExec_t graph_exec) { if (threadIdx.x == 0 && blockIdx.x == 0) { entries[0].handler.graph_exec = graph_exec; @@ -528,195 +525,206 @@ TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { CUDA_CHECK(cudaGetDevice(&device)); cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - + if (prop.major < 9) { - GTEST_SKIP() << "Graph device launch requires compute capability 9.0+, found " - << prop.major << "." << prop.minor; + GTEST_SKIP() + << "Graph device launch requires compute capability 9.0+, found " + << prop.major << "." << prop.minor; } - + // Allocate graph buffer pointer (for pointer indirection pattern) - void** d_graph_buffer_ptr; - CUDA_CHECK(cudaMalloc(&d_graph_buffer_ptr, sizeof(void*))); - CUDA_CHECK(cudaMemset(d_graph_buffer_ptr, 0, sizeof(void*))); - + void **d_graph_buffer_ptr; + CUDA_CHECK(cudaMalloc(&d_graph_buffer_ptr, sizeof(void *))); + CUDA_CHECK(cudaMemset(d_graph_buffer_ptr, 0, sizeof(void *))); + // Allocate test buffer constexpr size_t buffer_size = 1024; - void* d_buffer; + void *d_buffer; CUDA_CHECK(cudaMalloc(&d_buffer, buffer_size)); - + // Create the child graph (the one that will be launched from device) cudaGraph_t child_graph; cudaGraphExec_t child_graph_exec; - + CUDA_CHECK(cudaGraphCreate(&child_graph, 0)); - + // Add kernel node to child graph cudaKernelNodeParams kernel_params = {}; - void* kernel_args[] = {&d_graph_buffer_ptr}; - kernel_params.func = reinterpret_cast(&graph_increment_kernel); + void *kernel_args[] = {&d_graph_buffer_ptr}; + kernel_params.func = reinterpret_cast(&graph_increment_kernel); kernel_params.gridDim = dim3(1, 1, 1); kernel_params.blockDim = dim3(32, 1, 1); kernel_params.sharedMemBytes = 0; kernel_params.kernelParams = kernel_args; kernel_params.extra = nullptr; - + cudaGraphNode_t kernel_node; - CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, child_graph, nullptr, 0, &kernel_params)); - + CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, child_graph, nullptr, 0, + &kernel_params)); + // Instantiate CHILD graph with DEVICE LAUNCH FLAG - CUDA_CHECK(cudaGraphInstantiate(&child_graph_exec, child_graph, - cudaGraphInstantiateFlagDeviceLaunch)); - + CUDA_CHECK(cudaGraphInstantiate(&child_graph_exec, child_graph, + cudaGraphInstantiateFlagDeviceLaunch)); + // Create stream for operations cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - + // Upload the child graph to device CUDA_CHECK(cudaGraphUpload(child_graph_exec, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); - + // Set up function table with graph launch entry - cudaq_function_entry_t* d_function_entries; + cudaq_function_entry_t *d_function_entries; CUDA_CHECK(cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t))); init_graph_function_table<<<1, 1>>>(d_function_entries, child_graph_exec); CUDA_CHECK(cudaDeviceSynchronize()); - + // Set up RPC buffer on host - std::uint8_t* h_buffer = new std::uint8_t[buffer_size]; - cudaq::realtime::RPCHeader* h_header = reinterpret_cast(h_buffer); + std::uint8_t *h_buffer = new std::uint8_t[buffer_size]; + cudaq::realtime::RPCHeader *h_header = + reinterpret_cast(h_buffer); h_header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; h_header->function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; h_header->arg_len = 4; - - std::uint8_t* h_data = h_buffer + sizeof(cudaq::realtime::RPCHeader); + + std::uint8_t *h_data = h_buffer + sizeof(cudaq::realtime::RPCHeader); h_data[0] = 0; h_data[1] = 1; h_data[2] = 2; h_data[3] = 3; - + // Copy to device - CUDA_CHECK(cudaMemcpy(d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice)); - + CUDA_CHECK( + cudaMemcpy(d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice)); + // Set up fake RX/TX flags for single-shot test - volatile uint64_t* d_rx_flags; - volatile uint64_t* d_tx_flags; + volatile uint64_t *d_rx_flags; + volatile uint64_t *d_tx_flags; CUDA_CHECK(cudaMalloc(&d_rx_flags, sizeof(uint64_t))); CUDA_CHECK(cudaMalloc(&d_tx_flags, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset((void*)d_rx_flags, 0, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset((void*)d_tx_flags, 0, sizeof(uint64_t))); - + CUDA_CHECK(cudaMemset((void *)d_rx_flags, 0, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset((void *)d_tx_flags, 0, sizeof(uint64_t))); + // Set RX flag to point to our buffer (simulating incoming RPC) uint64_t buffer_addr = reinterpret_cast(d_buffer); - CUDA_CHECK(cudaMemcpy((void*)d_rx_flags, &buffer_addr, sizeof(uint64_t), cudaMemcpyHostToDevice)); - + CUDA_CHECK(cudaMemcpy((void *)d_rx_flags, &buffer_addr, sizeof(uint64_t), + cudaMemcpyHostToDevice)); + // Set up shutdown flag using pinned mapped memory so the dispatch kernel // can see host updates immediately - volatile int* h_shutdown; - volatile int* d_shutdown; + volatile int *h_shutdown; + volatile int *d_shutdown; { - void* tmp_shutdown; + void *tmp_shutdown; CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); - h_shutdown = static_cast(tmp_shutdown); + h_shutdown = static_cast(tmp_shutdown); *h_shutdown = 0; - - void* tmp_d_shutdown; + + void *tmp_d_shutdown; CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); - d_shutdown = static_cast(tmp_d_shutdown); + d_shutdown = static_cast(tmp_d_shutdown); } - + // Set up stats - uint64_t* d_stats; + uint64_t *d_stats; CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - + // Create dispatch graph context - THIS WRAPS THE DISPATCH KERNEL IN A GRAPH // so that device-side cudaGraphLaunch() can work! - cudaq_dispatch_graph_context* dispatch_ctx = nullptr; + cudaq_dispatch_graph_context *dispatch_ctx = nullptr; cudaError_t err = cudaq_create_dispatch_graph_regular( d_rx_flags, d_tx_flags, - reinterpret_cast(d_buffer), // rx_data - reinterpret_cast(d_buffer), // tx_data (same buffer for single-slot test) - buffer_size, // rx_stride_sz - buffer_size, // tx_stride_sz - d_function_entries, 1, - d_graph_buffer_ptr, d_shutdown, d_stats, 1, - 1, 32, stream, &dispatch_ctx); - + reinterpret_cast(d_buffer), // rx_data + reinterpret_cast( + d_buffer), // tx_data (same buffer for single-slot test) + buffer_size, // rx_stride_sz + buffer_size, // tx_stride_sz + d_function_entries, 1, d_graph_buffer_ptr, d_shutdown, d_stats, 1, 1, 32, + stream, &dispatch_ctx); + if (err != cudaSuccess) { - GTEST_SKIP() << "Device-side graph launch not supported: " + GTEST_SKIP() << "Device-side graph launch not supported: " << cudaGetErrorString(err) << " (" << err << ")"; } - + // Launch dispatch graph - now device-side cudaGraphLaunch will work! CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, stream)); - + // Poll for the response using pinned memory and async operations // The child graph runs asynchronously (fire-and-forget) so we need to poll - std::uint8_t* h_poll_buffer; - CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::realtime::RPCResponse), cudaHostAllocDefault)); + std::uint8_t *h_poll_buffer; + CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::realtime::RPCResponse), + cudaHostAllocDefault)); memset(h_poll_buffer, 0, sizeof(cudaq::realtime::RPCResponse)); - + cudaStream_t poll_stream; CUDA_CHECK(cudaStreamCreate(&poll_stream)); - + int timeout_ms = 5000; int poll_interval_ms = 100; bool got_response = false; - + for (int elapsed = 0; elapsed < timeout_ms; elapsed += poll_interval_ms) { - CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, sizeof(cudaq::realtime::RPCResponse), - cudaMemcpyDeviceToHost, poll_stream)); + CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, + sizeof(cudaq::realtime::RPCResponse), + cudaMemcpyDeviceToHost, poll_stream)); CUDA_CHECK(cudaStreamSynchronize(poll_stream)); - - cudaq::realtime::RPCResponse* peek = reinterpret_cast(h_poll_buffer); + + cudaq::realtime::RPCResponse *peek = + reinterpret_cast(h_poll_buffer); if (peek->magic == cudaq::realtime::RPC_MAGIC_RESPONSE) { got_response = true; break; } - + usleep(poll_interval_ms * 1000); } - + // Signal shutdown to allow kernel to exit *h_shutdown = 1; __sync_synchronize(); usleep(100000); // Give kernel time to see shutdown flag - + // Copy final results - CUDA_CHECK(cudaMemcpyAsync(h_buffer, d_buffer, buffer_size, cudaMemcpyDeviceToHost, poll_stream)); + CUDA_CHECK(cudaMemcpyAsync(h_buffer, d_buffer, buffer_size, + cudaMemcpyDeviceToHost, poll_stream)); CUDA_CHECK(cudaStreamSynchronize(poll_stream)); - - // Clean up poll resources + + // Clean up poll resources CUDA_CHECK(cudaStreamDestroy(poll_stream)); cudaFreeHost(h_poll_buffer); - + // Sync main stream (dispatch kernel should have exited) CUDA_CHECK(cudaStreamSynchronize(stream)); - - ASSERT_TRUE(got_response) << "Timeout waiting for device-side graph launch response"; - + + ASSERT_TRUE(got_response) + << "Timeout waiting for device-side graph launch response"; + // Verify response - cudaq::realtime::RPCResponse* h_response = reinterpret_cast(h_buffer); - EXPECT_EQ(h_response->magic, cudaq::realtime::RPC_MAGIC_RESPONSE) + cudaq::realtime::RPCResponse *h_response = + reinterpret_cast(h_buffer); + EXPECT_EQ(h_response->magic, cudaq::realtime::RPC_MAGIC_RESPONSE) << "Expected RPC_MAGIC_RESPONSE, got 0x" << std::hex << h_response->magic; EXPECT_EQ(h_response->status, 0) << "Handler returned error status"; EXPECT_EQ(h_response->result_len, 4u) << "Unexpected result length"; - + // Verify data was incremented by graph kernel launched from dispatch kernel - std::uint8_t* h_result = h_buffer + sizeof(cudaq::realtime::RPCResponse); + std::uint8_t *h_result = h_buffer + sizeof(cudaq::realtime::RPCResponse); EXPECT_EQ(h_result[0], 1) << "Expected h_result[0]=1"; EXPECT_EQ(h_result[1], 2) << "Expected h_result[1]=2"; EXPECT_EQ(h_result[2], 3) << "Expected h_result[2]=3"; EXPECT_EQ(h_result[3], 4) << "Expected h_result[3]=4"; - + // Cleanup delete[] h_buffer; CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(d_stats)); - CUDA_CHECK(cudaFreeHost(const_cast(h_shutdown))); // Free mapped memory - CUDA_CHECK(cudaFree((void*)d_tx_flags)); - CUDA_CHECK(cudaFree((void*)d_rx_flags)); + CUDA_CHECK(cudaFreeHost(const_cast(h_shutdown))); // Free mapped memory + CUDA_CHECK(cudaFree((void *)d_tx_flags)); + CUDA_CHECK(cudaFree((void *)d_rx_flags)); CUDA_CHECK(cudaFree(d_function_entries)); CUDA_CHECK(cudaGraphExecDestroy(child_graph_exec)); CUDA_CHECK(cudaGraphDestroy(child_graph)); diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu index 7d79c5b3..f955554e 100644 --- a/realtime/unittests/test_host_dispatcher.cu +++ b/realtime/unittests/test_host_dispatcher.cu @@ -6,10 +6,10 @@ * the terms of the Apache License 2.0 which accompanies this distribution. ******************************************************************************/ -#include -#include #include #include +#include +#include #include #include #include @@ -31,33 +31,32 @@ namespace { //============================================================================== bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, - volatile uint64_t** host_flags_out, - volatile uint64_t** device_flags_out, - std::uint8_t** host_data_out, - std::uint8_t** device_data_out) { - void* host_flags_ptr = nullptr; - cudaError_t err = cudaHostAlloc(&host_flags_ptr, - num_slots * sizeof(uint64_t), + volatile uint64_t **host_flags_out, + volatile uint64_t **device_flags_out, + std::uint8_t **host_data_out, + std::uint8_t **device_data_out) { + void *host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, num_slots * sizeof(uint64_t), cudaHostAllocMapped); if (err != cudaSuccess) return false; - void* device_flags_ptr = nullptr; + void *device_flags_ptr = nullptr; err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); return false; } - void* host_data_ptr = nullptr; - err = cudaHostAlloc(&host_data_ptr, num_slots * slot_size, - cudaHostAllocMapped); + void *host_data_ptr = nullptr; + err = + cudaHostAlloc(&host_data_ptr, num_slots * slot_size, cudaHostAllocMapped); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); return false; } - void* device_data_ptr = nullptr; + void *device_data_ptr = nullptr; err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); if (err != cudaSuccess) { cudaFreeHost(host_flags_ptr); @@ -67,16 +66,16 @@ bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, std::memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); - *host_flags_out = static_cast(host_flags_ptr); - *device_flags_out = static_cast(device_flags_ptr); - *host_data_out = static_cast(host_data_ptr); - *device_data_out = static_cast(device_data_ptr); + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); return true; } -void free_ring_buffer(volatile uint64_t* host_flags, std::uint8_t* host_data) { +void free_ring_buffer(volatile uint64_t *host_flags, std::uint8_t *host_data) { if (host_flags) - cudaFreeHost(const_cast(host_flags)); + cudaFreeHost(const_cast(host_flags)); if (host_data) cudaFreeHost(host_data); } @@ -89,14 +88,14 @@ __global__ void noop_kernel() {} // Creates a minimal executable graph and returns it. Caller must destroy with // cudaGraphExecDestroy and cudaGraphDestroy. -bool create_dummy_graph(cudaGraph_t* graph_out, cudaGraphExec_t* exec_out) { +bool create_dummy_graph(cudaGraph_t *graph_out, cudaGraphExec_t *exec_out) { cudaGraph_t graph = nullptr; if (cudaGraphCreate(&graph, 0) != cudaSuccess) return false; cudaKernelNodeParams params = {}; - void* args[] = {}; - params.func = reinterpret_cast(noop_kernel); + void *args[] = {}; + params.func = reinterpret_cast(noop_kernel); params.gridDim = dim3(1, 1, 1); params.blockDim = dim3(1, 1, 1); params.sharedMemBytes = 0; @@ -126,18 +125,18 @@ bool create_dummy_graph(cudaGraph_t* graph_out, cudaGraphExec_t* exec_out) { // in-place (same buffer as request; use single ring buffer for rx/tx). //============================================================================== -__global__ void graph_increment_kernel(void** mailbox_slot_ptr) { +__global__ void graph_increment_kernel(void **mailbox_slot_ptr) { if (threadIdx.x == 0 && blockIdx.x == 0) { - void* buffer = *mailbox_slot_ptr; - cudaq::realtime::RPCHeader* header = - static_cast(buffer); + void *buffer = *mailbox_slot_ptr; + cudaq::realtime::RPCHeader *header = + static_cast(buffer); std::uint32_t arg_len = header->arg_len; - void* arg_buffer = static_cast(header + 1); - std::uint8_t* data = static_cast(arg_buffer); + void *arg_buffer = static_cast(header + 1); + std::uint8_t *data = static_cast(arg_buffer); for (std::uint32_t i = 0; i < arg_len; ++i) data[i] = data[i] + 1; - cudaq::realtime::RPCResponse* response = - static_cast(buffer); + cudaq::realtime::RPCResponse *response = + static_cast(buffer); response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = arg_len; @@ -150,8 +149,8 @@ constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = /// Creates an executable graph that runs graph_increment_kernel with /// kernel arg = d_mailbox_bank (device pointer to first mailbox slot). /// Caller must cudaGraphExecDestroy / cudaGraphDestroy. -bool create_increment_graph(void** d_mailbox_bank, cudaGraph_t* graph_out, - cudaGraphExec_t* exec_out) { +bool create_increment_graph(void **d_mailbox_bank, cudaGraph_t *graph_out, + cudaGraphExec_t *exec_out) { cudaGraph_t graph = nullptr; if (cudaGraphCreate(&graph, 0) != cudaSuccess) return false; @@ -159,8 +158,8 @@ bool create_increment_graph(void** d_mailbox_bank, cudaGraph_t* graph_out, // kernelParams[i] must be a *pointer to* the i-th argument value. // The kernel takes void** so we pass &d_mailbox_bank (a void***). cudaKernelNodeParams params = {}; - void* kernel_args[] = {&d_mailbox_bank}; - params.func = reinterpret_cast(graph_increment_kernel); + void *kernel_args[] = {&d_mailbox_bank}; + params.func = reinterpret_cast(graph_increment_kernel); params.gridDim = dim3(1, 1, 1); params.blockDim = dim3(32, 1, 1); params.sharedMemBytes = 0; @@ -190,18 +189,18 @@ bool create_increment_graph(void** d_mailbox_bank, cudaGraph_t* graph_out, // in-place (for function_id routing differentiation vs increment kernel). //============================================================================== -__global__ void graph_double_kernel(void** mailbox_slot_ptr) { +__global__ void graph_double_kernel(void **mailbox_slot_ptr) { if (threadIdx.x == 0 && blockIdx.x == 0) { - void* buffer = *mailbox_slot_ptr; - cudaq::realtime::RPCHeader* header = - static_cast(buffer); + void *buffer = *mailbox_slot_ptr; + cudaq::realtime::RPCHeader *header = + static_cast(buffer); std::uint32_t arg_len = header->arg_len; - void* arg_buffer = static_cast(header + 1); - std::uint8_t* data = static_cast(arg_buffer); + void *arg_buffer = static_cast(header + 1); + std::uint8_t *data = static_cast(arg_buffer); for (std::uint32_t i = 0; i < arg_len; ++i) data[i] = data[i] * 2; - cudaq::realtime::RPCResponse* response = - static_cast(buffer); + cudaq::realtime::RPCResponse *response = + static_cast(buffer); response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = arg_len; @@ -211,15 +210,15 @@ __global__ void graph_double_kernel(void** mailbox_slot_ptr) { constexpr std::uint32_t RPC_GRAPH_DOUBLE_FUNCTION_ID = cudaq::realtime::fnv1a_hash("rpc_graph_double"); -bool create_double_graph(void** d_mailbox_slot, cudaGraph_t* graph_out, - cudaGraphExec_t* exec_out) { +bool create_double_graph(void **d_mailbox_slot, cudaGraph_t *graph_out, + cudaGraphExec_t *exec_out) { cudaGraph_t graph = nullptr; if (cudaGraphCreate(&graph, 0) != cudaSuccess) return false; cudaKernelNodeParams params = {}; - void* kernel_args[] = {&d_mailbox_slot}; - params.func = reinterpret_cast(graph_double_kernel); + void *kernel_args[] = {&d_mailbox_slot}; + params.func = reinterpret_cast(graph_double_kernel); params.gridDim = dim3(1, 1, 1); params.blockDim = dim3(32, 1, 1); params.sharedMemBytes = 0; @@ -261,12 +260,11 @@ protected: &tx_flags_dev_, &tx_data_host_, &tx_data_dev_)); - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank_, - kMaxWorkers * sizeof(void*), + CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank_, kMaxWorkers * sizeof(void *), cudaHostAllocMapped)); - std::memset(h_mailbox_bank_, 0, kMaxWorkers * sizeof(void*)); + std::memset(h_mailbox_bank_, 0, kMaxWorkers * sizeof(void *)); CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&d_mailbox_bank_), h_mailbox_bank_, 0)); + reinterpret_cast(&d_mailbox_bank_), h_mailbox_bank_, 0)); idle_mask_ = new cudaq::realtime::atomic_uint64_sys(0); live_dispatched_ = new cudaq::realtime::atomic_uint64_sys(0); @@ -275,7 +273,8 @@ protected: stats_counter_ = 0; function_table_ = new cudaq_function_entry_t[kMaxWorkers]; - std::memset(function_table_, 0, kMaxWorkers * sizeof(cudaq_function_entry_t)); + std::memset(function_table_, 0, + kMaxWorkers * sizeof(cudaq_function_entry_t)); std::memset(&ringbuffer_, 0, sizeof(ringbuffer_)); ringbuffer_.rx_flags = rx_flags_dev_; @@ -298,7 +297,7 @@ protected: loop_thread_.join(); } - for (auto& w : worker_info_) { + for (auto &w : worker_info_) { if (w.stream) cudaStreamDestroy(w.stream); if (w.graph_exec) @@ -347,12 +346,10 @@ protected: idle_mask_->store((1ULL << workers_.size()) - 1, cuda::std::memory_order_release); - config_.rx_flags = - reinterpret_cast( - const_cast(rx_flags_host_)); - config_.tx_flags = - reinterpret_cast( - const_cast(tx_flags_host_)); + config_.rx_flags = reinterpret_cast( + const_cast(rx_flags_host_)); + config_.tx_flags = reinterpret_cast( + const_cast(tx_flags_host_)); config_.rx_data_host = rx_data_host_; config_.rx_data_dev = rx_data_dev_; config_.tx_data_host = tx_data_host_; @@ -374,7 +371,7 @@ protected: } void WriteRpcRequest(std::size_t slot, std::uint32_t function_id, - const std::uint8_t* payload, std::size_t len) { + const std::uint8_t *payload, std::size_t len) { ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( &ringbuffer_, static_cast(slot), function_id, payload, static_cast(len)), @@ -382,7 +379,8 @@ protected: } void SignalSlot(std::size_t slot) { - cudaq_host_ringbuffer_signal_slot(&ringbuffer_, static_cast(slot)); + cudaq_host_ringbuffer_signal_slot(&ringbuffer_, + static_cast(slot)); } bool PollTxFlag(std::size_t slot, int timeout_ms = 2000) { @@ -393,9 +391,9 @@ protected: return true; usleep(200); } - return cudaq_host_ringbuffer_poll_tx_flag( - &ringbuffer_, static_cast(slot), nullptr) != - CUDAQ_TX_EMPTY; + return cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, + static_cast(slot), + nullptr) != CUDAQ_TX_EMPTY; } void StopLoop() { @@ -415,52 +413,51 @@ protected: std::memset(rx_data_host_ + slot * slot_size_, 0, slot_size_); } - void VerifyResponse(std::size_t slot, const std::uint8_t* expected, + void VerifyResponse(std::size_t slot, const std::uint8_t *expected, std::size_t len) { int cuda_err = 0; cudaq_tx_status_t st = cudaq_host_ringbuffer_poll_tx_flag( &ringbuffer_, static_cast(slot), &cuda_err); - ASSERT_EQ(st, CUDAQ_TX_READY) << "slot " << slot - << ": tx_flag not READY (status=" << st << " cuda_err=" << cuda_err << ")"; + ASSERT_EQ(st, CUDAQ_TX_READY) + << "slot " << slot << ": tx_flag not READY (status=" << st + << " cuda_err=" << cuda_err << ")"; - std::uint8_t* slot_data = rx_data_host_ + slot * slot_size_; - auto* resp = - reinterpret_cast(slot_data); + std::uint8_t *slot_data = rx_data_host_ + slot * slot_size_; + auto *resp = reinterpret_cast(slot_data); ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) << "slot " << slot << ": expected response magic"; ASSERT_EQ(resp->status, 0) << "slot " << slot << ": non-zero status"; ASSERT_EQ(resp->result_len, static_cast(len)) << "slot " << slot << ": wrong result_len"; - std::uint8_t* result = slot_data + sizeof(cudaq::realtime::RPCResponse); + std::uint8_t *result = slot_data + sizeof(cudaq::realtime::RPCResponse); for (std::size_t i = 0; i < len; ++i) { - EXPECT_EQ(result[i], expected[i]) - << "slot " << slot << " byte " << i; + EXPECT_EQ(result[i], expected[i]) << "slot " << slot << " byte " << i; } } std::size_t num_slots_ = 4; std::size_t slot_size_ = 256; - volatile uint64_t* rx_flags_host_ = nullptr; - volatile uint64_t* tx_flags_host_ = nullptr; - volatile uint64_t* rx_flags_dev_ = nullptr; - volatile uint64_t* tx_flags_dev_ = nullptr; - std::uint8_t* rx_data_host_ = nullptr; - std::uint8_t* tx_data_host_ = nullptr; - std::uint8_t* rx_data_dev_ = nullptr; - std::uint8_t* tx_data_dev_ = nullptr; - - void** h_mailbox_bank_ = nullptr; - void** d_mailbox_bank_ = nullptr; - - cudaq::realtime::atomic_uint64_sys* idle_mask_ = nullptr; - cudaq::realtime::atomic_uint64_sys* live_dispatched_ = nullptr; - int* inflight_slot_tags_ = nullptr; - cudaq::realtime::atomic_int_sys* shutdown_flag_ = nullptr; + volatile uint64_t *rx_flags_host_ = nullptr; + volatile uint64_t *tx_flags_host_ = nullptr; + volatile uint64_t *rx_flags_dev_ = nullptr; + volatile uint64_t *tx_flags_dev_ = nullptr; + std::uint8_t *rx_data_host_ = nullptr; + std::uint8_t *tx_data_host_ = nullptr; + std::uint8_t *rx_data_dev_ = nullptr; + std::uint8_t *tx_data_dev_ = nullptr; + + void **h_mailbox_bank_ = nullptr; + void **d_mailbox_bank_ = nullptr; + + cudaq::realtime::atomic_uint64_sys *idle_mask_ = nullptr; + cudaq::realtime::atomic_uint64_sys *live_dispatched_ = nullptr; + int *inflight_slot_tags_ = nullptr; + cudaq::realtime::atomic_int_sys *shutdown_flag_ = nullptr; uint64_t stats_counter_ = 0; bool loop_stopped_ = false; - cudaq_function_entry_t* function_table_ = nullptr; + cudaq_function_entry_t *function_table_ = nullptr; std::size_t function_table_count_ = 0; std::vector workers_; std::vector worker_info_; @@ -530,9 +527,8 @@ protected: ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK); - ASSERT_EQ( - cudaq_dispatcher_set_control(dispatcher_, shutdown_flag_, stats_), - CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher_, shutdown_flag_, stats_), + CUDAQ_OK); ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); } @@ -575,24 +571,24 @@ protected: static constexpr std::size_t num_slots_ = 2; std::size_t slot_size_ = 256; - volatile uint64_t* rx_flags_host_ = nullptr; - volatile uint64_t* tx_flags_host_ = nullptr; - volatile uint64_t* rx_flags_ = nullptr; - volatile uint64_t* tx_flags_ = nullptr; - std::uint8_t* rx_data_host_ = nullptr; - std::uint8_t* tx_data_host_ = nullptr; - std::uint8_t* rx_data_ = nullptr; - std::uint8_t* tx_data_ = nullptr; - - int* shutdown_flag_ = nullptr; - uint64_t* stats_ = nullptr; - cudaq_function_entry_t* host_table_ = nullptr; + volatile uint64_t *rx_flags_host_ = nullptr; + volatile uint64_t *tx_flags_host_ = nullptr; + volatile uint64_t *rx_flags_ = nullptr; + volatile uint64_t *tx_flags_ = nullptr; + std::uint8_t *rx_data_host_ = nullptr; + std::uint8_t *tx_data_host_ = nullptr; + std::uint8_t *rx_data_ = nullptr; + std::uint8_t *tx_data_ = nullptr; + + int *shutdown_flag_ = nullptr; + uint64_t *stats_ = nullptr; + cudaq_function_entry_t *host_table_ = nullptr; cudaGraph_t dummy_graph_ = nullptr; cudaGraphExec_t dummy_graph_exec_ = nullptr; cudaq_ringbuffer_t ringbuffer_{}; - cudaq_dispatch_manager_t* manager_ = nullptr; - cudaq_dispatcher_t* dispatcher_ = nullptr; + cudaq_dispatch_manager_t *manager_ = nullptr; + cudaq_dispatcher_t *dispatcher_ = nullptr; }; TEST_F(HostDispatcherSmokeTest, DropsSlotWithUnknownFunctionId) { @@ -643,18 +639,17 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { // Separate flag arrays for RX and TX: the dispatcher clears rx_flags[slot] // right after setting tx_flags[slot], so sharing would clobber the signal. // Data buffers are shared (graph writes response in-place to the RX slot). - volatile uint64_t* rx_flags_host = nullptr; - volatile uint64_t* rx_flags_dev = nullptr; - std::uint8_t* rx_data_host = nullptr; - std::uint8_t* rx_data_dev = nullptr; - volatile uint64_t* tx_flags_host = nullptr; - volatile uint64_t* tx_flags_dev = nullptr; - std::uint8_t* tx_data_host_unused = nullptr; - std::uint8_t* tx_data_dev_unused = nullptr; + volatile uint64_t *rx_flags_host = nullptr; + volatile uint64_t *rx_flags_dev = nullptr; + std::uint8_t *rx_data_host = nullptr; + std::uint8_t *rx_data_dev = nullptr; + volatile uint64_t *tx_flags_host = nullptr; + volatile uint64_t *tx_flags_dev = nullptr; + std::uint8_t *tx_data_host_unused = nullptr; + std::uint8_t *tx_data_dev_unused = nullptr; ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &rx_flags_host, - &rx_flags_dev, &rx_data_host, - &rx_data_dev)); + &rx_flags_dev, &rx_data_host, &rx_data_dev)); ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &tx_flags_host, &tx_flags_dev, &tx_data_host_unused, &tx_data_dev_unused)); @@ -663,13 +658,13 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { // cudaHostAllocMapped gives us host + device views of the same memory. // The host dispatcher writes the slot device pointer to h_mailbox_bank[0]; // the graph reads it from d_mailbox_bank[0] (same physical location). - void** h_mailbox_bank = nullptr; - void** d_mailbox_bank = nullptr; - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank, sizeof(void*), - cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, sizeof(void*)); + void **h_mailbox_bank = nullptr; + void **d_mailbox_bank = nullptr; + CUDA_CHECK( + cudaHostAlloc(&h_mailbox_bank, sizeof(void *), cudaHostAllocMapped)); + std::memset(h_mailbox_bank, 0, sizeof(void *)); CUDA_CHECK( - cudaHostGetDevicePointer((void**)&d_mailbox_bank, h_mailbox_bank, 0)); + cudaHostGetDevicePointer((void **)&d_mailbox_bank, h_mailbox_bank, 0)); // --- Graph --- // Capture graph_increment_kernel with d_mailbox_bank baked in as the @@ -677,8 +672,7 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { // the slot, so different slots can be processed on each launch. cudaGraph_t graph = nullptr; cudaGraphExec_t graph_exec = nullptr; - ASSERT_TRUE( - create_increment_graph(d_mailbox_bank, &graph, &graph_exec)); + ASSERT_TRUE(create_increment_graph(d_mailbox_bank, &graph, &graph_exec)); // --- Function table (one GRAPH_LAUNCH entry) --- cudaq_function_entry_t host_table[1]; @@ -688,7 +682,7 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { host_table[0].handler.graph_exec = graph_exec; // --- C API: create manager + dispatcher --- - cudaq_dispatch_manager_t* manager = nullptr; + cudaq_dispatch_manager_t *manager = nullptr; ASSERT_EQ(cudaq_dispatch_manager_create(&manager), CUDAQ_OK); cudaq_dispatcher_config_t disp_config{}; @@ -697,7 +691,7 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { disp_config.slot_size = static_cast(slot_size); disp_config.backend = CUDAQ_BACKEND_HOST_LOOP; - cudaq_dispatcher_t* dispatcher = nullptr; + cudaq_dispatcher_t *dispatcher = nullptr; ASSERT_EQ(cudaq_dispatcher_create(manager, &disp_config, &dispatcher), CUDAQ_OK); @@ -713,25 +707,22 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { ringbuffer.tx_flags_host = tx_flags_host; ringbuffer.rx_data_host = rx_data_host; ringbuffer.tx_data_host = rx_data_host; - ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer), - CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer), CUDAQ_OK); cudaq_function_table_t table{}; table.entries = host_table; table.count = 1; - ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher, &table), - CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher, &table), CUDAQ_OK); int shutdown_flag = 0; uint64_t stats_counter = 0; - ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher, &shutdown_flag, - &stats_counter), - CUDAQ_OK); + ASSERT_EQ( + cudaq_dispatcher_set_control(dispatcher, &shutdown_flag, &stats_counter), + CUDAQ_OK); // Provide the caller-allocated pinned mailbox so the dispatcher uses it // instead of allocating plain host memory (which the graph can't read). - ASSERT_EQ(cudaq_dispatcher_set_mailbox(dispatcher, h_mailbox_bank), - CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_mailbox(dispatcher, h_mailbox_bank), CUDAQ_OK); // --- Start --- ASSERT_EQ(cudaq_dispatcher_start(dispatcher), CUDAQ_OK); @@ -758,13 +749,13 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { CUDA_CHECK(cudaDeviceSynchronize()); // --- Verify: graph wrote correct response in-place --- - std::uint8_t* slot_data = rx_data_host + 0 * slot_size; - auto* resp = reinterpret_cast(slot_data); + std::uint8_t *slot_data = rx_data_host + 0 * slot_size; + auto *resp = reinterpret_cast(slot_data); ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) << "Expected response magic (graph in-place write)"; ASSERT_EQ(resp->status, 0); ASSERT_EQ(resp->result_len, 4u); - std::uint8_t* result = slot_data + sizeof(cudaq::realtime::RPCResponse); + std::uint8_t *result = slot_data + sizeof(cudaq::realtime::RPCResponse); EXPECT_EQ(result[0], 1); EXPECT_EQ(result[1], 2); EXPECT_EQ(result[2], 3); @@ -796,7 +787,8 @@ TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { TEST_F(HostDispatcherLoopTest, MultiWorkerFunctionIdRouting) { cudaGraph_t inc_graph = nullptr; cudaGraphExec_t inc_exec = nullptr; - ASSERT_TRUE(create_increment_graph(d_mailbox_bank_ + 0, &inc_graph, &inc_exec)); + ASSERT_TRUE( + create_increment_graph(d_mailbox_bank_ + 0, &inc_graph, &inc_exec)); AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, inc_exec, inc_graph); cudaGraph_t dbl_graph = nullptr; @@ -931,21 +923,20 @@ TEST_F(HostDispatcherLoopTest, StatsCounterAccuracy) { if (i >= static_cast(num_slots_)) ClearSlot(slot); - std::uint8_t payload[] = { - static_cast(i * 10), - static_cast(i * 10 + 1), - static_cast(i * 10 + 2), - static_cast(i * 10 + 3)}; + std::uint8_t payload[] = {static_cast(i * 10), + static_cast(i * 10 + 1), + static_cast(i * 10 + 2), + static_cast(i * 10 + 3)}; WriteRpcRequest(slot, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); SignalSlot(slot); - ASSERT_TRUE(PollTxFlag(slot)) << "Timeout on RPC " << i << " (slot " << slot << ")"; + ASSERT_TRUE(PollTxFlag(slot)) + << "Timeout on RPC " << i << " (slot " << slot << ")"; ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - std::uint8_t expected[] = { - static_cast(i * 10 + 1), - static_cast(i * 10 + 2), - static_cast(i * 10 + 3), - static_cast(i * 10 + 4)}; + std::uint8_t expected[] = {static_cast(i * 10 + 1), + static_cast(i * 10 + 2), + static_cast(i * 10 + 3), + static_cast(i * 10 + 4)}; VerifyResponse(slot, expected, 4); RestoreWorker(0); @@ -970,19 +961,18 @@ TEST_F(HostDispatcherLoopTest, MultiSlotRoundRobin) { cudaGraph_t graphs[kNumSlots]; cudaGraphExec_t execs[kNumSlots]; for (int i = 0; i < kNumSlots; ++i) { - ASSERT_TRUE(create_increment_graph(d_mailbox_bank_ + i, &graphs[i], - &execs[i])); + ASSERT_TRUE( + create_increment_graph(d_mailbox_bank_ + i, &graphs[i], &execs[i])); AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, execs[i], graphs[i]); } StartLoop(); for (int i = 0; i < kNumSlots; ++i) { - std::uint8_t payload[] = { - static_cast(i * 4 + 1), - static_cast(i * 4 + 2), - static_cast(i * 4 + 3), - static_cast(i * 4 + 4)}; + std::uint8_t payload[] = {static_cast(i * 4 + 1), + static_cast(i * 4 + 2), + static_cast(i * 4 + 3), + static_cast(i * 4 + 4)}; WriteRpcRequest(static_cast(i), RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); } @@ -997,11 +987,10 @@ TEST_F(HostDispatcherLoopTest, MultiSlotRoundRobin) { ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); for (int i = 0; i < kNumSlots; ++i) { - std::uint8_t expected[] = { - static_cast(i * 4 + 2), - static_cast(i * 4 + 3), - static_cast(i * 4 + 4), - static_cast(i * 4 + 5)}; + std::uint8_t expected[] = {static_cast(i * 4 + 2), + static_cast(i * 4 + 3), + static_cast(i * 4 + 4), + static_cast(i * 4 + 5)}; VerifyResponse(static_cast(i), expected, 4); } diff --git a/realtime/unittests/utils/init_rpc_increment_function_table.cu b/realtime/unittests/utils/init_rpc_increment_function_table.cu index 5365bcb4..dde181cf 100644 --- a/realtime/unittests/utils/init_rpc_increment_function_table.cu +++ b/realtime/unittests/utils/init_rpc_increment_function_table.cu @@ -18,8 +18,8 @@ #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" -#include #include +#include namespace { From c81935420de7b5c6d9ce4b406687ce5e5e6a9374 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 4 Mar 2026 17:39:40 +0000 Subject: [PATCH 29/40] Added mermaid documentation Signed-off-by: Scott Thornton --- docs/realtime_pipeline_architecture.md | 343 +++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 docs/realtime_pipeline_architecture.md diff --git a/docs/realtime_pipeline_architecture.md b/docs/realtime_pipeline_architecture.md new file mode 100644 index 00000000..dadf7033 --- /dev/null +++ b/docs/realtime_pipeline_architecture.md @@ -0,0 +1,343 @@ +# Realtime Pipeline Architecture + +## 1. Component Overview + +```mermaid +classDiagram + class RealtimePipeline { + -Impl* impl_ + +set_gpu_stage(GpuStageFactory) + +set_cpu_stage(CpuStageCallback) + +set_completion_handler(CompletionCallback) + +start() + +stop() + +create_injector() RingBufferInjector + +stats() Stats + } + + class RingBufferInjector { + -State* state_ + +try_submit(fid, payload, size, rid) bool + +submit(fid, payload, size, rid) + +backpressure_stalls() uint64_t + } + + class RingBufferManager { + -rx_flags_ : atomic_uint64[N] + -tx_flags_ : atomic_uint64[N] + -rx_data_host_ : uint8_t* + +slot_available(slot) bool + +write_and_signal(slot, fid, payload, len) + +poll_tx(slot, err) cudaq_tx_status_t + +clear_slot(slot) + } + + class HostDispatcherConfig { + +rx_flags : atomic_uint64* + +tx_flags : atomic_uint64* + +idle_mask : atomic_uint64* + +inflight_slot_tags : int* + +h_mailbox_bank : void** + +workers : HostDispatchWorker[] + +function_table : cudaq_function_entry_t* + +shutdown_flag : atomic_int* + } + + class AIPreDecoderService { + -h_ready_flags_ : atomic_int* + -h_predecoder_outputs_ : void* + -graph_exec_ : cudaGraphExec_t + +capture_graph(stream, device_launch) + +poll_next_job(job) bool + +release_job(slot) + } + + RealtimePipeline *-- RingBufferManager : owns + RealtimePipeline *-- HostDispatcherConfig : builds + RealtimePipeline --> RingBufferInjector : creates + RingBufferInjector --> RingBufferManager : writes to + HostDispatcherConfig --> AIPreDecoderService : launches graph +``` + +## 2. Thread Model + +The pipeline spawns three categories of threads, each pinnable to a specific CPU core: + +```mermaid +flowchart LR + subgraph "Producer (main thread or FPGA DMA)" + P["RingBufferInjector::submit()"] + end + + subgraph "Dispatcher Thread (core 2)" + D["host_dispatcher_loop()"] + end + + subgraph "Worker Threads (cores 4..4+N)" + W0["worker_loop(0)"] + W1["worker_loop(1)"] + Wn["worker_loop(N-1)"] + end + + subgraph "Consumer Thread (core 3)" + C["consumer_loop()"] + end + + subgraph "GPU Streams" + G0["stream[0]: CUDA Graph"] + G1["stream[1]: CUDA Graph"] + Gn["stream[N-1]: CUDA Graph"] + end + + P -->|"rx_flags[slot]"| D + D -->|"cudaGraphLaunch"| G0 + D -->|"cudaGraphLaunch"| G1 + D -->|"cudaGraphLaunch"| Gn + G0 -->|"ready_flags[0] = 1"| W0 + G1 -->|"ready_flags[0] = 1"| W1 + Gn -->|"ready_flags[0] = 1"| Wn + W0 -->|"tx_flags[slot]"| C + W1 -->|"tx_flags[slot]"| C + Wn -->|"tx_flags[slot]"| C + C -->|"clear_slot()"| P +``` + +## 3. Sequence Diagram: Single Syndrome Through the Pipeline + +This traces one syndrome request from submission to completion, showing every +atomic operation and the thread/device boundary crossings. + +```mermaid +sequenceDiagram + participant Prod as Producer
(main thread) + participant RB as Ring Buffer
(shared memory) + participant Disp as Dispatcher
(dedicated thread) + participant GPU as GPU Stream[w]
(CUDA Graph) + participant Work as Worker Thread[w]
(CPU) + participant Cons as Consumer
(dedicated thread) + participant App as Application
(completion handler) + + Note over Prod,App: === PHASE 1: Injection === + + Prod->>Prod: CAS next_slot (acq_rel)
claim slot S + Prod->>RB: memcpy payload → rx_data[S] + Prod->>RB: write RPCHeader {magic, function_id} + Prod->>RB: rx_flags[S].store(host_ptr, release) + Prod->>Prod: slot_occupied[S] = 1
slot_request[S] = request_id + Prod->>Prod: total_submitted.fetch_add(1, release) + + Note over Prod,App: === PHASE 2: Dispatch === + + Disp->>RB: rx_flags[S].load(acquire)
sees non-zero → slot S ready + Disp->>Disp: parse RPCHeader → function_id + Disp->>Disp: idle_mask.load(acquire)
find worker W via __builtin_ffsll + Disp->>Disp: idle_mask.fetch_and(~(1<mark W busy + Disp->>Disp: inflight_slot_tags[W] = S + Disp->>RB: h_mailbox_bank[W] = dev_ptr + Disp->>Disp: __sync_synchronize() + + opt pre_launch_fn configured + Disp->>GPU: pre_launch_fn: cudaMemcpyAsync
DMA syndrome → TRT input buffer + end + + Disp->>GPU: cudaGraphLaunch(graph_exec[W], stream[W]) + Disp->>RB: tx_flags[S].store(0xEEEE..., release)
IN_FLIGHT sentinel + Disp->>RB: rx_flags[S].store(0, release)
free rx slot, advance + + Note over Prod,App: === PHASE 3: GPU Inference === + + GPU->>GPU: gateway_input_kernel:
copy ring buffer → TRT input + GPU->>GPU: TRT enqueueV3:
AI predecoder inference + GPU->>GPU: cudaMemcpyAsync:
TRT output → h_predecoder_outputs + GPU->>GPU: predecoder_signal_ready_kernel:
ready_flags[0].store(1, release) + + Note over Prod,App: === PHASE 4: CPU Post-Processing === + + Work->>Work: poll_next_job():
ready_flags[0].CAS(1→2, acquire) + Work->>Work: Read h_predecoder_outputs
Run PyMatching MWPM decoder + Work->>Work: Write RPC response to ring buffer slot + Work->>Work: release_job():
ready_flags[0].store(0, release) + Work->>RB: tx_flags[S].store(slot_host_addr, release)
marks READY + Work->>Disp: idle_mask.fetch_or(1<worker W free again + + Note over Prod,App: === PHASE 5: Completion === + + Cons->>RB: poll_tx(S): tx_flags[S].load(acquire)
sees valid host addr → READY + Cons->>App: completion_handler({request_id, slot, success}) + Cons->>Cons: total_completed.fetch_add(1, relaxed) + Cons->>Cons: slot_occupied[S] = 0 + Cons->>Cons: __sync_synchronize() + Cons->>RB: clear_slot(S):
rx_flags[S] = 0, tx_flags[S] = 0 + Note over Prod: Slot S now available
for next submission +``` + +## 4. Atomic Variables Reference + +Every atomic used in the pipeline, its scope, who writes it, who reads it, +and the memory ordering used. + +### Ring Buffer Flags + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `rx_flags[slot]` | `cuda::atomic` | Producer ↔ Dispatcher | Producer (signal), Dispatcher (clear), Consumer (clear) | Dispatcher (poll) | store: `release`, load: `acquire` | +| `tx_flags[slot]` | `cuda::atomic` | Dispatcher ↔ Worker ↔ Consumer | Dispatcher (IN_FLIGHT), Worker (READY/addr) | Consumer (poll) | store: `release`, load: `acquire` | + +### Worker Pool Scheduling + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `idle_mask` | `cuda::atomic` | Dispatcher ↔ Workers | Dispatcher (clear bit), Worker (set bit) | Dispatcher (find free worker) | fetch_and/fetch_or: `release`, load: `acquire` | + +### GPU ↔ CPU Handoff (per AIPreDecoderService) + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `ready_flags[0]` | `cuda::atomic` | GPU kernel ↔ Worker thread | GPU kernel (0→1), Worker (CAS 1→2), Worker (2→0) | Worker (CAS poll) | store: `release`, CAS success: `acquire`, CAS fail: `relaxed` | + +### Pipeline Lifecycle + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `shutdown_flag` | `cuda::atomic` | Main ↔ Dispatcher | Main thread | Dispatcher loop | store: `release`, load: `acquire` | +| `producer_stop` | `std::atomic` | Main ↔ Consumer/Injector | Main thread | Consumer, Injector | store: `release`, load: `acquire` | +| `consumer_stop` | `std::atomic` | Main ↔ Consumer/Workers | Main thread | Consumer, Workers | store: `release`, load: `acquire` | +| `total_submitted` | `std::atomic` | Injector ↔ Consumer | Injector | Consumer | fetch_add: `release`, load: `acquire` | +| `total_completed` | `std::atomic` | Consumer ↔ Main | Consumer | Main (stats) | fetch_add: `relaxed`, load: `relaxed` | +| `backpressure_stalls` | `std::atomic` | Injector ↔ Main | Injector | Main (stats) | fetch_add: `relaxed`, load: `relaxed` | +| `started` | `std::atomic` | Main thread | start()/stop() | destructor, start() | implicit seq_cst | + +### Injector Slot Claiming + +| Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | +|--------|------|-------|-----------|-----------|----------| +| `next_slot` | `std::atomic` | Injector-internal | try_submit (CAS) | try_submit | CAS: `acq_rel` / `relaxed` | + +## 5. Ring Buffer Slot State Machine + +Each of the N ring buffer slots transitions through these states. The +transitions are driven by atomic flag writes from different threads. + +```mermaid +stateDiagram-v2 + [*] --> FREE : initialization + + FREE --> RX_SIGNALED : Producer writes rx_flags[S] = host_ptr + note right of RX_SIGNALED : rx_flags ≠ 0, tx_flags = 0\nPayload + RPCHeader in rx_data[S] + + RX_SIGNALED --> IN_FLIGHT : Dispatcher reads rx_flags,\nlaunches graph,\nwrites tx_flags = 0xEEEE...,\nclears rx_flags = 0 + note right of IN_FLIGHT : rx_flags = 0, tx_flags = 0xEEEE...\nGPU processing in progress + + IN_FLIGHT --> TX_READY : Worker writes tx_flags[S] = slot_host_addr\n(after GPU done + PyMatching done) + note right of TX_READY : rx_flags = 0, tx_flags = valid addr\nResult available for consumer + + TX_READY --> FREE : Consumer reads result,\ncalls clear_slot():\nrx_flags = 0, tx_flags = 0 + + IN_FLIGHT --> TX_ERROR : cudaGraphLaunch failed\ntx_flags = 0xDEAD... | err + TX_ERROR --> FREE : Consumer reads error,\ncalls clear_slot() +``` + +**`tx_flags` value encoding:** + +| Value | Meaning | +|-------|---------| +| `0` | Slot is free (no pending result) | +| `0xEEEEEEEEEEEEEEEE` | IN_FLIGHT — graph launched, result not yet ready | +| `0xDEAD____XXXXXXXX` | ERROR — upper 16 bits = `0xDEAD`, lower 32 = cudaError_t | +| Any other non-zero | READY — value is host pointer to slot data containing result | + +## 6. CUDA Graph Structure (per Worker) + +Each worker has a pre-captured CUDA graph that executes on its dedicated stream. +The graph is instantiated once at startup and replayed for every syndrome. + +```mermaid +flowchart TD + subgraph "CUDA Graph (AIPreDecoderService)" + A["TRT enqueueV3\n(AI predecoder inference)"] --> B["cudaMemcpyAsync\nTRT output → h_predecoder_outputs\n(host-mapped)"] + B --> C["predecoder_signal_ready_kernel\nready_flags[0].store(1, release)"] + end + + subgraph "Pre-Launch Callback (host-side, before graph)" + P["pre_launch_fn:\ncudaMemcpyAsync\nring buffer slot → TRT input\n(DMA copy engine)"] + end + + subgraph "Post-Graph (Worker Thread)" + D["poll_next_job():\nready_flags CAS 1→2"] + E["PyMatching MWPM decode"] + F["Write RPC response"] + G["release_job():\nready_flags store 0"] + H["tx_flags[S].store(addr, release)"] + I["idle_mask.fetch_or(1< E --> F --> G --> H --> I + end + + P --> A + C -.->|"GPU signals\nready_flags = 1"| D +``` + +## 7. Backpressure and Flow Control + +The pipeline uses implicit backpressure through slot availability: + +```mermaid +flowchart TD + subgraph "Flow Control" + Submit["Injector::try_submit()"] + Check{"slot_available(S)?\nrx_flags=0 AND tx_flags=0"} + CAS{"CAS next_slot\ncur → cur+1"} + Write["Write payload + signal"] + Stall["backpressure_stalls++\nQEC_CPU_RELAX()"] + Retry["Retry"] + + Submit --> Check + Check -->|yes| CAS + Check -->|no| Stall + CAS -->|success| Write + CAS -->|fail (contention)| Stall + Stall --> Retry --> Submit + end +``` + +**Capacity:** With `num_slots = 32` and `num_workers = 16`, up to 32 syndromes +can be in various stages of processing simultaneously. When all 32 slots are +occupied (either waiting for dispatch, in-flight on GPU, or awaiting consumer +pickup), the injector stalls until the consumer frees a slot. + +## 8. ARM Memory Ordering Considerations + +The pipeline runs on NVIDIA Grace (ARM aarch64) which has a weakly-ordered +memory model. Key ordering guarantees: + +1. **Producer → Dispatcher:** `rx_flags[S].store(release)` pairs with + `rx_flags[S].load(acquire)`. The dispatcher sees all payload bytes written + before the flag. + +2. **Dispatcher → Worker (via GPU):** The CUDA graph launch is ordered by + `cudaGraphLaunch` semantics. The `ready_flags` store inside the GPU kernel + uses `cuda::thread_scope_system` + `memory_order_release`, paired with the + worker's `compare_exchange_strong(acquire)`. + +3. **Worker → Consumer:** `tx_flags[S].store(release)` pairs with + `tx_flags[S].load(acquire)` in `poll_tx_flag()`. Consumer sees PyMatching + results before the ready flag. + +4. **Consumer → Producer (slot recycling):** `slot_occupied[S] = 0` followed + by `__sync_synchronize()` (full barrier) before `clear_slot()` ensures the + producer cannot see a free slot while the consumer is still accessing + slot_request metadata. + +```mermaid +flowchart LR + subgraph "Release/Acquire Pairs" + A["rx_flags store\n(release)"] -->|"paired with"| B["rx_flags load\n(acquire)"] + C["tx_flags store\n(release)"] -->|"paired with"| D["tx_flags load\n(acquire)"] + E["ready_flags store(1)\n(release, system scope)"] -->|"paired with"| F["ready_flags CAS\n(acquire)"] + G["idle_mask fetch_or\n(release)"] -->|"paired with"| H["idle_mask load\n(acquire)"] + end + + subgraph "Full Barriers" + I["__sync_synchronize()\nbetween slot_occupied=0\nand clear_slot()"] + J["__sync_synchronize()\nbetween mailbox_bank write\nand cudaGraphLaunch"] + end +``` From ac8277c0a06a3ede5693f8965277fc5877f58e89 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 4 Mar 2026 17:55:20 +0000 Subject: [PATCH 30/40] Fixed errors in mermaid diagram Signed-off-by: Scott Thornton --- docs/realtime_pipeline_architecture.md | 201 +++++++++++++------------ 1 file changed, 105 insertions(+), 96 deletions(-) diff --git a/docs/realtime_pipeline_architecture.md b/docs/realtime_pipeline_architecture.md index dadf7033..4ec03d5c 100644 --- a/docs/realtime_pipeline_architecture.md +++ b/docs/realtime_pipeline_architecture.md @@ -5,7 +5,7 @@ ```mermaid classDiagram class RealtimePipeline { - -Impl* impl_ + -impl_ : Impl~ptr~ +set_gpu_stage(GpuStageFactory) +set_cpu_stage(CpuStageCallback) +set_completion_handler(CompletionCallback) @@ -16,16 +16,16 @@ classDiagram } class RingBufferInjector { - -State* state_ + -state_ : State~ptr~ +try_submit(fid, payload, size, rid) bool +submit(fid, payload, size, rid) +backpressure_stalls() uint64_t } class RingBufferManager { - -rx_flags_ : atomic_uint64[N] - -tx_flags_ : atomic_uint64[N] - -rx_data_host_ : uint8_t* + -rx_flags_ : atomic_uint64~N~ + -tx_flags_ : atomic_uint64~N~ + -rx_data_host_ : uint8_t~ptr~ +slot_available(slot) bool +write_and_signal(slot, fid, payload, len) +poll_tx(slot, err) cudaq_tx_status_t @@ -33,19 +33,19 @@ classDiagram } class HostDispatcherConfig { - +rx_flags : atomic_uint64* - +tx_flags : atomic_uint64* - +idle_mask : atomic_uint64* - +inflight_slot_tags : int* - +h_mailbox_bank : void** - +workers : HostDispatchWorker[] - +function_table : cudaq_function_entry_t* - +shutdown_flag : atomic_int* + +rx_flags : atomic_uint64~ptr~ + +tx_flags : atomic_uint64~ptr~ + +idle_mask : atomic_uint64~ptr~ + +inflight_slot_tags : int~ptr~ + +h_mailbox_bank : void~ptrptr~ + +workers : HostDispatchWorker~list~ + +function_table : cudaq_function_entry_t~ptr~ + +shutdown_flag : atomic_int~ptr~ } class AIPreDecoderService { - -h_ready_flags_ : atomic_int* - -h_predecoder_outputs_ : void* + -h_ready_flags_ : atomic_int~ptr~ + -h_predecoder_outputs_ : void~ptr~ -graph_exec_ : cudaGraphExec_t +capture_graph(stream, device_launch) +poll_next_job(job) bool @@ -84,22 +84,22 @@ flowchart LR end subgraph "GPU Streams" - G0["stream[0]: CUDA Graph"] - G1["stream[1]: CUDA Graph"] - Gn["stream[N-1]: CUDA Graph"] + G0["stream 0: CUDA Graph"] + G1["stream 1: CUDA Graph"] + Gn["stream N-1: CUDA Graph"] end - P -->|"rx_flags[slot]"| D + P -->|"rx_flags signal"| D D -->|"cudaGraphLaunch"| G0 D -->|"cudaGraphLaunch"| G1 D -->|"cudaGraphLaunch"| Gn - G0 -->|"ready_flags[0] = 1"| W0 - G1 -->|"ready_flags[0] = 1"| W1 - Gn -->|"ready_flags[0] = 1"| Wn - W0 -->|"tx_flags[slot]"| C - W1 -->|"tx_flags[slot]"| C - Wn -->|"tx_flags[slot]"| C - C -->|"clear_slot()"| P + G0 -->|"ready_flags = 1"| W0 + G1 -->|"ready_flags = 1"| W1 + Gn -->|"ready_flags = 1"| Wn + W0 -->|"tx_flags signal"| C + W1 -->|"tx_flags signal"| C + Wn -->|"tx_flags signal"| C + C -->|"clear_slot"| P ``` ## 3. Sequence Diagram: Single Syndrome Through the Pipeline @@ -109,66 +109,66 @@ atomic operation and the thread/device boundary crossings. ```mermaid sequenceDiagram - participant Prod as Producer
(main thread) - participant RB as Ring Buffer
(shared memory) - participant Disp as Dispatcher
(dedicated thread) - participant GPU as GPU Stream[w]
(CUDA Graph) - participant Work as Worker Thread[w]
(CPU) - participant Cons as Consumer
(dedicated thread) - participant App as Application
(completion handler) + participant Prod as Producer
(main thread) + participant RB as Ring Buffer
(shared memory) + participant Disp as Dispatcher
(dedicated thread) + participant GPU as GPU Stream w
(CUDA Graph) + participant Work as Worker Thread w
(CPU) + participant Cons as Consumer
(dedicated thread) + participant App as Application
(completion handler) Note over Prod,App: === PHASE 1: Injection === - Prod->>Prod: CAS next_slot (acq_rel)
claim slot S - Prod->>RB: memcpy payload → rx_data[S] - Prod->>RB: write RPCHeader {magic, function_id} - Prod->>RB: rx_flags[S].store(host_ptr, release) - Prod->>Prod: slot_occupied[S] = 1
slot_request[S] = request_id - Prod->>Prod: total_submitted.fetch_add(1, release) + Prod->>Prod: CAS next_slot acq_rel, claim slot S + Prod->>RB: memcpy payload to rx_data S + Prod->>RB: write RPCHeader magic+function_id + Prod->>RB: rx_flags S .store host_ptr, release + Prod->>Prod: slot_occupied S = 1, slot_request S = request_id + Prod->>Prod: total_submitted.fetch_add 1, release Note over Prod,App: === PHASE 2: Dispatch === - Disp->>RB: rx_flags[S].load(acquire)
sees non-zero → slot S ready - Disp->>Disp: parse RPCHeader → function_id - Disp->>Disp: idle_mask.load(acquire)
find worker W via __builtin_ffsll - Disp->>Disp: idle_mask.fetch_and(~(1<mark W busy - Disp->>Disp: inflight_slot_tags[W] = S - Disp->>RB: h_mailbox_bank[W] = dev_ptr - Disp->>Disp: __sync_synchronize() + Disp->>RB: rx_flags S .load acquire, sees non-zero slot S ready + Disp->>Disp: parse RPCHeader to function_id + Disp->>Disp: idle_mask.load acquire, find worker W via ffsll + Disp->>Disp: idle_mask.fetch_and ~1 shl W, release, mark W busy + Disp->>Disp: inflight_slot_tags W = S + Disp->>RB: h_mailbox_bank W = dev_ptr + Disp->>Disp: __sync_synchronize opt pre_launch_fn configured - Disp->>GPU: pre_launch_fn: cudaMemcpyAsync
DMA syndrome → TRT input buffer + Disp->>GPU: pre_launch_fn cudaMemcpyAsync DMA syndrome to TRT input end - Disp->>GPU: cudaGraphLaunch(graph_exec[W], stream[W]) - Disp->>RB: tx_flags[S].store(0xEEEE..., release)
IN_FLIGHT sentinel - Disp->>RB: rx_flags[S].store(0, release)
free rx slot, advance + Disp->>GPU: cudaGraphLaunch graph_exec W, stream W + Disp->>RB: tx_flags S .store 0xEEEE, release, IN_FLIGHT sentinel + Disp->>RB: rx_flags S .store 0, release, free rx slot Note over Prod,App: === PHASE 3: GPU Inference === - GPU->>GPU: gateway_input_kernel:
copy ring buffer → TRT input - GPU->>GPU: TRT enqueueV3:
AI predecoder inference - GPU->>GPU: cudaMemcpyAsync:
TRT output → h_predecoder_outputs - GPU->>GPU: predecoder_signal_ready_kernel:
ready_flags[0].store(1, release) + GPU->>GPU: gateway_input_kernel: copy ring buffer to TRT input + GPU->>GPU: TRT enqueueV3: AI predecoder inference + GPU->>GPU: cudaMemcpyAsync: TRT output to h_predecoder_outputs + GPU->>GPU: predecoder_signal_ready_kernel: ready_flags.store 1, release Note over Prod,App: === PHASE 4: CPU Post-Processing === - Work->>Work: poll_next_job():
ready_flags[0].CAS(1→2, acquire) - Work->>Work: Read h_predecoder_outputs
Run PyMatching MWPM decoder + Work->>Work: poll_next_job: ready_flags CAS 1 to 2, acquire + Work->>Work: Read h_predecoder_outputs, run PyMatching MWPM decoder Work->>Work: Write RPC response to ring buffer slot - Work->>Work: release_job():
ready_flags[0].store(0, release) - Work->>RB: tx_flags[S].store(slot_host_addr, release)
marks READY - Work->>Disp: idle_mask.fetch_or(1<worker W free again + Work->>Work: release_job: ready_flags.store 0, release + Work->>RB: tx_flags S .store slot_host_addr, release, marks READY + Work->>Disp: idle_mask.fetch_or 1 shl W, release, worker W free Note over Prod,App: === PHASE 5: Completion === - Cons->>RB: poll_tx(S): tx_flags[S].load(acquire)
sees valid host addr → READY - Cons->>App: completion_handler({request_id, slot, success}) - Cons->>Cons: total_completed.fetch_add(1, relaxed) - Cons->>Cons: slot_occupied[S] = 0 - Cons->>Cons: __sync_synchronize() - Cons->>RB: clear_slot(S):
rx_flags[S] = 0, tx_flags[S] = 0 - Note over Prod: Slot S now available
for next submission + Cons->>RB: poll_tx S: tx_flags S .load acquire, sees valid addr READY + Cons->>App: completion_handler request_id, slot, success + Cons->>Cons: total_completed.fetch_add 1, relaxed + Cons->>Cons: slot_occupied S = 0 + Cons->>Cons: __sync_synchronize + Cons->>RB: clear_slot S: rx_flags = 0, tx_flags = 0 + Note over Prod: Slot S now available for next submission ``` ## 4. Atomic Variables Reference @@ -223,18 +223,27 @@ stateDiagram-v2 [*] --> FREE : initialization FREE --> RX_SIGNALED : Producer writes rx_flags[S] = host_ptr - note right of RX_SIGNALED : rx_flags ≠ 0, tx_flags = 0\nPayload + RPCHeader in rx_data[S] - - RX_SIGNALED --> IN_FLIGHT : Dispatcher reads rx_flags,\nlaunches graph,\nwrites tx_flags = 0xEEEE...,\nclears rx_flags = 0 - note right of IN_FLIGHT : rx_flags = 0, tx_flags = 0xEEEE...\nGPU processing in progress - - IN_FLIGHT --> TX_READY : Worker writes tx_flags[S] = slot_host_addr\n(after GPU done + PyMatching done) - note right of TX_READY : rx_flags = 0, tx_flags = valid addr\nResult available for consumer - - TX_READY --> FREE : Consumer reads result,\ncalls clear_slot():\nrx_flags = 0, tx_flags = 0 - - IN_FLIGHT --> TX_ERROR : cudaGraphLaunch failed\ntx_flags = 0xDEAD... | err - TX_ERROR --> FREE : Consumer reads error,\ncalls clear_slot() + note right of RX_SIGNALED + rx_flags != 0, tx_flags = 0 + Payload + RPCHeader in rx_data + end note + + RX_SIGNALED --> IN_FLIGHT : Dispatcher reads rx_flags, launches graph, sets tx_flags IN_FLIGHT, clears rx_flags + note right of IN_FLIGHT + rx_flags = 0, tx_flags = 0xEEEE + GPU processing in progress + end note + + IN_FLIGHT --> TX_READY : Worker writes tx_flags = slot_host_addr after GPU + PyMatching done + note right of TX_READY + rx_flags = 0, tx_flags = valid addr + Result available for consumer + end note + + TX_READY --> FREE : Consumer reads result, calls clear_slot + + IN_FLIGHT --> TX_ERROR : cudaGraphLaunch failed, tx_flags = 0xDEAD | err + TX_ERROR --> FREE : Consumer reads error, calls clear_slot ``` **`tx_flags` value encoding:** @@ -254,26 +263,26 @@ The graph is instantiated once at startup and replayed for every syndrome. ```mermaid flowchart TD subgraph "CUDA Graph (AIPreDecoderService)" - A["TRT enqueueV3\n(AI predecoder inference)"] --> B["cudaMemcpyAsync\nTRT output → h_predecoder_outputs\n(host-mapped)"] - B --> C["predecoder_signal_ready_kernel\nready_flags[0].store(1, release)"] + A["TRT enqueueV3
(AI predecoder inference)"] --> B["cudaMemcpyAsync
TRT output to h_predecoder_outputs
(host-mapped)"] + B --> C["predecoder_signal_ready_kernel
ready_flags.store(1, release)"] end subgraph "Pre-Launch Callback (host-side, before graph)" - P["pre_launch_fn:\ncudaMemcpyAsync\nring buffer slot → TRT input\n(DMA copy engine)"] + P["pre_launch_fn:
cudaMemcpyAsync
ring buffer slot to TRT input
(DMA copy engine)"] end subgraph "Post-Graph (Worker Thread)" - D["poll_next_job():\nready_flags CAS 1→2"] + D["poll_next_job():
ready_flags CAS 1 to 2"] E["PyMatching MWPM decode"] F["Write RPC response"] - G["release_job():\nready_flags store 0"] - H["tx_flags[S].store(addr, release)"] - I["idle_mask.fetch_or(1<ready_flags store 0"] + H["tx_flags.store(addr, release)"] + I["idle_mask.fetch_or(1 shl W, release)"] D --> E --> F --> G --> H --> I end P --> A - C -.->|"GPU signals\nready_flags = 1"| D + C -.->|"GPU signals ready_flags = 1"| D ``` ## 7. Backpressure and Flow Control @@ -284,17 +293,17 @@ The pipeline uses implicit backpressure through slot availability: flowchart TD subgraph "Flow Control" Submit["Injector::try_submit()"] - Check{"slot_available(S)?\nrx_flags=0 AND tx_flags=0"} - CAS{"CAS next_slot\ncur → cur+1"} + Check{"slot_available(S)?
rx_flags=0 AND tx_flags=0"} + CAS{"CAS next_slot
cur to cur+1"} Write["Write payload + signal"] - Stall["backpressure_stalls++\nQEC_CPU_RELAX()"] + Stall["backpressure_stalls++
QEC_CPU_RELAX()"] Retry["Retry"] Submit --> Check Check -->|yes| CAS Check -->|no| Stall CAS -->|success| Write - CAS -->|fail (contention)| Stall + CAS -->|"fail contention"| Stall Stall --> Retry --> Submit end ``` @@ -330,14 +339,14 @@ memory model. Key ordering guarantees: ```mermaid flowchart LR subgraph "Release/Acquire Pairs" - A["rx_flags store\n(release)"] -->|"paired with"| B["rx_flags load\n(acquire)"] - C["tx_flags store\n(release)"] -->|"paired with"| D["tx_flags load\n(acquire)"] - E["ready_flags store(1)\n(release, system scope)"] -->|"paired with"| F["ready_flags CAS\n(acquire)"] - G["idle_mask fetch_or\n(release)"] -->|"paired with"| H["idle_mask load\n(acquire)"] + A["rx_flags store
(release)"] -->|"paired with"| B["rx_flags load
(acquire)"] + C["tx_flags store
(release)"] -->|"paired with"| D["tx_flags load
(acquire)"] + E["ready_flags store(1)
(release, system scope)"] -->|"paired with"| F["ready_flags CAS
(acquire)"] + G["idle_mask fetch_or
(release)"] -->|"paired with"| H["idle_mask load
(acquire)"] end subgraph "Full Barriers" - I["__sync_synchronize()\nbetween slot_occupied=0\nand clear_slot()"] - J["__sync_synchronize()\nbetween mailbox_bank write\nand cudaGraphLaunch"] + I["__sync_synchronize()
between slot_occupied=0
and clear_slot()"] + J["__sync_synchronize()
between mailbox_bank write
and cudaGraphLaunch"] end ``` From 9e183df8b2f8bdfb3d926d967ac8ca4a0f0e957e Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 6 Mar 2026 01:46:42 +0000 Subject: [PATCH 31/40] Remove in-tree realtime/ directory; use pre-installed cudaq-realtime exclusively The realtime/ source tree is removed from the build. All CMake targets (cudaq-realtime-pipeline, test_realtime_pipeline, and test_realtime_predecoder_w_pymatching) now discover headers and libraries from the CUDAQ_REALTIME_ROOT install prefix via find_path/find_library. - Remove add_subdirectory(realtime) from top-level CMakeLists.txt - Move pipeline.h to libs/qec/include/cudaq/qec/realtime/pipeline.h - Move realtime_pipeline.cu to libs/qec/lib/realtime/ - Rewrite cudaq-realtime-pipeline target to link against installed libs - Remove all in-tree TARGET cudaq-realtime branches from unittests CMake - Migrate cudaq::nvqlink:: namespace references to cudaq::realtime:: - Update #include paths from cudaq/nvqlink/ to cudaq/realtime/ - Delete the entire realtime/ source tree (13.5k lines) Signed-off-by: Scott Thornton --- CMakeLists.txt | 7 - .../include/cudaq/qec}/realtime/pipeline.h | 0 libs/qec/lib/realtime/CMakeLists.txt | 66 +- libs/qec/lib/realtime/ai_decoder_service.cu | 10 +- libs/qec/lib/realtime/mock_decode_handler.cu | 10 +- .../qec/lib/realtime}/realtime_pipeline.cu | 2 +- .../test_realtime_predecoder_w_pymatching.cpp | 2 +- libs/qec/unittests/CMakeLists.txt | 127 +- .../realtime/test_realtime_decoding.cu | 26 +- realtime/.clang-format | 12 - realtime/.gitignore | 99 - realtime/CMakeLists.txt | 119 - realtime/README.md | 36 - realtime/docs/cudaq_realtime_host_api.html | 2945 ----------------- .../docs/cudaq_realtime_message_protocol.html | 2513 -------------- realtime/docs/nvqlink_latency_demo.md | 232 -- .../daemon/dispatcher/cudaq_realtime.h | 346 -- .../daemon/dispatcher/dispatch_kernel.cuh | 62 - .../dispatcher/dispatch_kernel_launch.h | 132 - .../daemon/dispatcher/dispatch_modes.h | 64 - .../daemon/dispatcher/host_dispatcher.h | 84 - .../realtime/daemon/dispatcher/kernel_types.h | 39 - .../cudaq/realtime/hololink_bridge_common.h | 502 --- realtime/lib/CMakeLists.txt | 18 - realtime/lib/daemon/CMakeLists.txt | 110 - .../daemon/dispatcher/cudaq_realtime_api.cpp | 345 -- .../lib/daemon/dispatcher/dispatch_kernel.cu | 612 ---- .../lib/daemon/dispatcher/host_dispatcher.cu | 195 -- .../daemon/dispatcher/host_dispatcher_capi.cu | 158 - realtime/lib/pipeline/CMakeLists.txt | 38 - realtime/scripts/install_dev_prerequisites.sh | 53 - realtime/unittests/CMakeLists.txt | 104 - realtime/unittests/test_dispatch_kernel.cu | 735 ---- realtime/unittests/test_host_dispatcher.cu | 1004 ------ realtime/unittests/utils/CMakeLists.txt | 264 -- realtime/unittests/utils/hololink_bridge.cpp | 124 - .../utils/hololink_fpga_emulator.cpp | 1210 ------- .../utils/hololink_fpga_playback.cpp | 534 --- realtime/unittests/utils/hololink_test.sh | 408 --- realtime/unittests/utils/hololink_wrapper.cpp | 216 -- realtime/unittests/utils/hololink_wrapper.h | 142 - .../init_rpc_increment_function_table.cu | 92 - 42 files changed, 124 insertions(+), 13673 deletions(-) rename {realtime/include/cudaq => libs/qec/include/cudaq/qec}/realtime/pipeline.h (100%) rename {realtime/lib/pipeline => libs/qec/lib/realtime}/realtime_pipeline.cu (99%) delete mode 100644 realtime/.clang-format delete mode 100644 realtime/.gitignore delete mode 100644 realtime/CMakeLists.txt delete mode 100644 realtime/README.md delete mode 100644 realtime/docs/cudaq_realtime_host_api.html delete mode 100644 realtime/docs/cudaq_realtime_message_protocol.html delete mode 100644 realtime/docs/nvqlink_latency_demo.md delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h delete mode 100644 realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h delete mode 100644 realtime/include/cudaq/realtime/hololink_bridge_common.h delete mode 100644 realtime/lib/CMakeLists.txt delete mode 100644 realtime/lib/daemon/CMakeLists.txt delete mode 100644 realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp delete mode 100644 realtime/lib/daemon/dispatcher/dispatch_kernel.cu delete mode 100644 realtime/lib/daemon/dispatcher/host_dispatcher.cu delete mode 100644 realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu delete mode 100644 realtime/lib/pipeline/CMakeLists.txt delete mode 100755 realtime/scripts/install_dev_prerequisites.sh delete mode 100644 realtime/unittests/CMakeLists.txt delete mode 100644 realtime/unittests/test_dispatch_kernel.cu delete mode 100644 realtime/unittests/test_host_dispatcher.cu delete mode 100644 realtime/unittests/utils/CMakeLists.txt delete mode 100644 realtime/unittests/utils/hololink_bridge.cpp delete mode 100644 realtime/unittests/utils/hololink_fpga_emulator.cpp delete mode 100644 realtime/unittests/utils/hololink_fpga_playback.cpp delete mode 100755 realtime/unittests/utils/hololink_test.sh delete mode 100644 realtime/unittests/utils/hololink_wrapper.cpp delete mode 100644 realtime/unittests/utils/hololink_wrapper.h delete mode 100644 realtime/unittests/utils/init_rpc_increment_function_table.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fbc9e4d..020b8c4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,13 +286,6 @@ if (CUDAQX_INCLUDE_DOCS) add_subdirectory(docs) endif() -# In-tree realtime (optional): provides cudaq-realtime and host-dispatcher for QEC tests -if(EXISTS "${CMAKE_SOURCE_DIR}/realtime/CMakeLists.txt" AND CMAKE_CUDA_COMPILER) - set(CUDAQ_REALTIME_STANDALONE_BUILD FALSE) - add_subdirectory(realtime) - set(CUDAQX_BUILD_REALTIME_IN_TREE TRUE) -endif() - foreach(lib ${CUDAQX_ENABLE_LIBS}) add_subdirectory(libs/${lib}) endforeach() diff --git a/realtime/include/cudaq/realtime/pipeline.h b/libs/qec/include/cudaq/qec/realtime/pipeline.h similarity index 100% rename from realtime/include/cudaq/realtime/pipeline.h rename to libs/qec/include/cudaq/qec/realtime/pipeline.h diff --git a/libs/qec/lib/realtime/CMakeLists.txt b/libs/qec/lib/realtime/CMakeLists.txt index 31056201..1486b746 100644 --- a/libs/qec/lib/realtime/CMakeLists.txt +++ b/libs/qec/lib/realtime/CMakeLists.txt @@ -24,10 +24,17 @@ if(CMAKE_CUDA_COMPILER) endif() find_path(CUDAQ_REALTIME_INCLUDE_DIR - NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h PATHS ${_cudaq_realtime_prefixes} - PATH_SUFFIXES include ../include + PATH_SUFFIXES include ) + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include ../include + ) + endif() if(CUDAQ_REALTIME_INCLUDE_DIR) message(STATUS "Found cuda-quantum realtime headers at ${CUDAQ_REALTIME_INCLUDE_DIR}") @@ -115,5 +122,60 @@ install(TARGETS cudaq-qec-realtime-decoding LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +# --------------------------------------------------------------------------- +# RealtimePipeline shared library +# Requires pre-installed cudaq-realtime (set CUDAQ_REALTIME_ROOT) +# --------------------------------------------------------------------------- +if(CMAKE_CUDA_COMPILER AND CUDAQ_REALTIME_INCLUDE_DIR) + find_library(_CUDAQ_RT_LIB cudaq-realtime + PATHS ${_cudaq_realtime_prefixes} PATH_SUFFIXES lib) + find_library(_CUDAQ_RT_HD_LIB cudaq-realtime-host-dispatch + PATHS ${_cudaq_realtime_prefixes} PATH_SUFFIXES lib) + + if(_CUDAQ_RT_LIB AND _CUDAQ_RT_HD_LIB) + message(STATUS "RealtimePipeline: building with CUDAQ_REALTIME_INCLUDE_DIR=${CUDAQ_REALTIME_INCLUDE_DIR}") + + add_library(cudaq-realtime-pipeline SHARED + realtime_pipeline.cu + ) + + get_filename_component(_cuda_bin_pl "${CMAKE_CUDA_COMPILER}" DIRECTORY) + get_filename_component(_cuda_root_pl "${_cuda_bin_pl}" DIRECTORY) + set(_cuda_cccl_include_pl "${_cuda_root_pl}/include/cccl") + + target_include_directories(cudaq-realtime-pipeline + PUBLIC + $ + $ + $ + $ + ) + + target_link_libraries(cudaq-realtime-pipeline + PUBLIC CUDA::cudart_static + PRIVATE ${_CUDAQ_RT_LIB} ${_CUDAQ_RT_HD_LIB} + ) + + get_filename_component(_CUDAQ_RT_LIB_DIR "${_CUDAQ_RT_LIB}" DIRECTORY) + set_target_properties(cudaq-realtime-pipeline PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + BUILD_RPATH "${_CUDAQ_RT_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) + + install(TARGETS cudaq-realtime-pipeline + COMPONENT qec-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + message(STATUS "RealtimePipeline: skipping (cudaq-realtime or cudaq-realtime-host-dispatch not found)") + endif() +else() + if(CMAKE_CUDA_COMPILER) + message(STATUS "RealtimePipeline: skipping (CUDAQ_REALTIME_INCLUDE_DIR not set)") + endif() +endif() + add_subdirectory(quantinuum) add_subdirectory(simulation) diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index 3efd9336..90f18c24 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/qec/realtime/ai_decoder_service.h" #include #include @@ -40,7 +40,7 @@ __global__ void gateway_input_kernel(void **mailbox_slot_ptr, return; const char *src = - (const char *)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + (const char *)ring_buffer_data + sizeof(cudaq::realtime::RPCHeader); char *dst = (char *)trt_fixed_input; for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < copy_size_bytes; @@ -56,7 +56,7 @@ __global__ void gateway_output_kernel(void **mailbox_slot_ptr, if (ring_buffer_data == nullptr) return; - char *dst = (char *)ring_buffer_data + sizeof(cudaq::nvqlink::RPCHeader); + char *dst = (char *)ring_buffer_data + sizeof(cudaq::realtime::RPCHeader); const char *src = (const char *)trt_fixed_output; for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < result_size_bytes; @@ -67,8 +67,8 @@ __global__ void gateway_output_kernel(void **mailbox_slot_ptr, __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { - auto *response = (cudaq::nvqlink::RPCResponse *)ring_buffer_data; - response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + auto *response = (cudaq::realtime::RPCResponse *)ring_buffer_data; + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = static_cast(result_size_bytes); __threadfence_system(); diff --git a/libs/qec/lib/realtime/mock_decode_handler.cu b/libs/qec/lib/realtime/mock_decode_handler.cu index a8224520..318cb4c2 100644 --- a/libs/qec/lib/realtime/mock_decode_handler.cu +++ b/libs/qec/lib/realtime/mock_decode_handler.cu @@ -6,7 +6,7 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/qec/realtime/mock_decode_handler.cuh" namespace cudaq::qec::realtime { @@ -98,10 +98,10 @@ __global__ void mock_decode_graph_kernel(void **buffer_ptr) { return; // Parse RPC header - auto *header = static_cast(data_buffer); + auto *header = static_cast(data_buffer); void *arg_buffer = static_cast(header + 1); - auto *response = static_cast(data_buffer); + auto *response = static_cast(data_buffer); if (g_mock_decoder != nullptr) { uint8_t *measurements = static_cast(arg_buffer); @@ -112,12 +112,12 @@ __global__ void mock_decode_graph_kernel(void **buffer_ptr) { ctx.num_observables); // Write response - response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = 0; response->result_len = static_cast(ctx.num_observables); } else { // Error: decoder not set - response->magic = cudaq::nvqlink::RPC_MAGIC_RESPONSE; + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; response->status = -1; response->result_len = 0; } diff --git a/realtime/lib/pipeline/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu similarity index 99% rename from realtime/lib/pipeline/realtime_pipeline.cu rename to libs/qec/lib/realtime/realtime_pipeline.cu index 586cd250..13c20f26 100644 --- a/realtime/lib/pipeline/realtime_pipeline.cu +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -8,7 +8,7 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" -#include "cudaq/realtime/pipeline.h" +#include "cudaq/qec/realtime/pipeline.h" #include #include diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 9c31cfaf..72f1bd53 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -42,7 +42,7 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" -#include "cudaq/realtime/pipeline.h" +#include "cudaq/qec/realtime/pipeline.h" #include "cudaq/qec/code.h" #include "cudaq/qec/decoder.h" diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index cdc104a9..4807a274 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -149,12 +149,11 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) PATH_SUFFIXES lib ) - # In-tree realtime (built from top-level add_subdirectory(realtime)) provides new API - set(_predecoder_use_in_tree_realtime FALSE) - if(TARGET cudaq-realtime) - set(_predecoder_use_in_tree_realtime TRUE) - message(STATUS "Using in-tree realtime (cudaq-realtime) for predecoder test") - endif() + find_library(CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY + NAMES cudaq-realtime-host-dispatch + PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES lib + ) set(_have_realtime_for_tests FALSE) if(CUDAQ_REALTIME_INCLUDE_DIR AND CUDAQ_REALTIME_LIBRARY AND CUDAQ_REALTIME_DISPATCH_LIBRARY) @@ -163,9 +162,6 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) message(STATUS "Found cuda-quantum realtime library at ${CUDAQ_REALTIME_LIBRARY}") message(STATUS "Found cuda-quantum realtime dispatch library at ${CUDAQ_REALTIME_DISPATCH_LIBRARY}") endif() - if(TARGET cudaq-realtime) - set(_have_realtime_for_tests TRUE) - endif() if(_have_realtime_for_tests) @@ -238,14 +234,6 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) get_filename_component(_cuda_root_pipe "${_cuda_bin_pipe}" DIRECTORY) set(_cuda_cccl_include_pipe "${_cuda_root_pipe}/include/cccl") - set(_realtime_pipeline_includes "") - if(NOT _predecoder_use_in_tree_realtime) - set(_realtime_include_pipe "${CMAKE_SOURCE_DIR}/realtime/include") - if(EXISTS "${_realtime_include_pipe}/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h") - list(APPEND _realtime_pipeline_includes "${_realtime_include_pipe}") - endif() - endif() - add_executable(test_realtime_pipeline ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_decoder_service.cu ${CMAKE_SOURCE_DIR}/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -265,39 +253,23 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ${TENSORRT_INCLUDE_DIR_FOR_PIPELINE} ${CMAKE_CURRENT_SOURCE_DIR}/../include ${CMAKE_SOURCE_DIR}/libs/core/include - ${_realtime_pipeline_includes} ${CUDAQ_REALTIME_INCLUDE_DIR} ) - if(_predecoder_use_in_tree_realtime) - target_link_libraries(test_realtime_pipeline PRIVATE - GTest::gtest_main - CUDA::cudart - ${TENSORRT_LIBRARY_FOR_PIPELINE} - ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} - cudaq-realtime - cudaq-realtime-host-dispatch - cudaq-realtime-dispatch - cudaq-realtime-pipeline - ) - set_target_properties(test_realtime_pipeline PROPERTIES - BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" - INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" - ) - else() - target_link_libraries(test_realtime_pipeline PRIVATE - GTest::gtest_main - CUDA::cudart - ${TENSORRT_LIBRARY_FOR_PIPELINE} - ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} - ${CUDAQ_REALTIME_LIBRARY} - ${CUDAQ_REALTIME_DISPATCH_LIBRARY} - ) - set_target_properties(test_realtime_pipeline PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - ) - endif() + target_link_libraries(test_realtime_pipeline PRIVATE + GTest::gtest_main + CUDA::cudart + ${TENSORRT_LIBRARY_FOR_PIPELINE} + ${TENSORRT_ONNX_PARSER_FOR_PIPELINE} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ${CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY} + cudaq-realtime-pipeline + ) + set_target_properties(test_realtime_pipeline PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) add_dependencies(CUDAQXQECUnitTests test_realtime_pipeline) gtest_discover_tests(test_realtime_pipeline @@ -361,58 +333,29 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) get_filename_component(_cuda_root "${_cuda_bin}" DIRECTORY) set(_cuda_cccl_include "${_cuda_root}/include/cccl") - # Includes: in-tree realtime target brings include; else in-repo or install dir - set(_realtime_predecoder_includes "") - if(NOT _predecoder_use_in_tree_realtime) - set(_realtime_include "${CMAKE_SOURCE_DIR}/realtime/include") - if(EXISTS "${_realtime_include}/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h") - list(APPEND _realtime_predecoder_includes "${_realtime_include}") - endif() - endif() target_include_directories(test_realtime_predecoder_w_pymatching PRIVATE ${_cuda_cccl_include} ${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../include ${CMAKE_SOURCE_DIR}/libs/core/include - ${_realtime_predecoder_includes} ${CUDAQ_REALTIME_INCLUDE_DIR} ) - if(_predecoder_use_in_tree_realtime) - target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE - CUDA::cudart - ${TENSORRT_LIBRARY} - ${TENSORRT_ONNX_PARSER_LIBRARY} - cudaq-realtime - cudaq-realtime-host-dispatch - cudaq-realtime-dispatch - cudaq-realtime-pipeline - cudaq-qec - cudaq::cudaq - ) - set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES - BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" - INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CMAKE_BINARY_DIR}/realtime/lib" - ) - else() - target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE - CUDA::cudart - ${TENSORRT_LIBRARY} - ${TENSORRT_ONNX_PARSER_LIBRARY} - ${CUDAQ_REALTIME_LIBRARY} - ${CUDAQ_REALTIME_DISPATCH_LIBRARY} - cudaq-qec - cudaq::cudaq - ) - target_link_directories(test_realtime_predecoder_w_pymatching PRIVATE - ${CMAKE_BINARY_DIR}/lib - ) - set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - ) - endif() + target_link_libraries(test_realtime_predecoder_w_pymatching PRIVATE + CUDA::cudart + ${TENSORRT_LIBRARY} + ${TENSORRT_ONNX_PARSER_LIBRARY} + ${CUDAQ_REALTIME_LIBRARY} + ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + cudaq-realtime-pipeline + cudaq-qec + cudaq::cudaq + ) + set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES + BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + ) add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) else() @@ -421,8 +364,8 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) else() message(WARNING "cuda-quantum realtime dependency not found. " - "Set CUDAQ_REALTIME_ROOT or build with in-tree realtime to enable " - "test_realtime_decoding and test_realtime_predecoder_w_pymatching.") + "Set CUDAQ_REALTIME_ROOT to enable " + "test_realtime_pipeline and test_realtime_predecoder_w_pymatching.") endif() endif() diff --git a/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu b/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu index 48e5992a..3afdd977 100644 --- a/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu +++ b/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu @@ -27,13 +27,13 @@ #include // cuda-quantum host API -#include "cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" // cuda-quantum RPC types/hash helper -#include "cudaq/nvqlink/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" // cuda-quantum kernel types for graph-aware dispatch -#include "cudaq/nvqlink/daemon/dispatcher/kernel_types.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" // cudaqx mock decoder #include "cudaq/qec/realtime/mock_decode_handler.cuh" @@ -53,7 +53,7 @@ namespace { // The dispatch kernel uses function_id to find the handler constexpr std::uint32_t MOCK_DECODE_FUNCTION_ID = - cudaq::nvqlink::fnv1a_hash("mock_decode"); + cudaq::realtime::fnv1a_hash("mock_decode"); //============================================================================== // Hololink-Style Ring Buffer @@ -378,7 +378,7 @@ protected: cudaq::qec::realtime::set_mock_decoder(d_decoder_); // Allocate ring buffers (with space for RPCHeader) - slot_size_ = sizeof(cudaq::nvqlink::RPCHeader) + + slot_size_ = sizeof(cudaq::realtime::RPCHeader) + std::max(syndrome_size_, static_cast(256)); ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, &rx_flags_, &rx_data_host_, &rx_data_)); @@ -560,14 +560,14 @@ protected: const_cast(rx_data_host_) + slot * slot_size_; // Write RPCHeader - cudaq::nvqlink::RPCHeader *header = - reinterpret_cast(slot_data); - header->magic = cudaq::nvqlink::RPC_MAGIC_REQUEST; + cudaq::realtime::RPCHeader *header = + reinterpret_cast(slot_data); + header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; header->function_id = MOCK_DECODE_FUNCTION_ID; header->arg_len = static_cast(measurements.size()); // Write measurement data after header - memcpy(slot_data + sizeof(cudaq::nvqlink::RPCHeader), measurements.data(), + memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader), measurements.data(), measurements.size()); } @@ -580,10 +580,10 @@ protected: const_cast(rx_data_host_) + slot * slot_size_; // Read RPCResponse - const cudaq::nvqlink::RPCResponse *response = - reinterpret_cast(slot_data); + const cudaq::realtime::RPCResponse *response = + reinterpret_cast(slot_data); - if (response->magic != cudaq::nvqlink::RPC_MAGIC_RESPONSE) { + if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) { return false; } if (status_out) @@ -596,7 +596,7 @@ protected: } // Read correction data after response header - correction = *(slot_data + sizeof(cudaq::nvqlink::RPCResponse)); + correction = *(slot_data + sizeof(cudaq::realtime::RPCResponse)); return true; } diff --git a/realtime/.clang-format b/realtime/.clang-format deleted file mode 100644 index 4c6382a7..00000000 --- a/realtime/.clang-format +++ /dev/null @@ -1,12 +0,0 @@ -BasedOnStyle: LLVM -AlwaysBreakTemplateDeclarations: Yes -IncludeCategories: - - Regex: '^<' - Priority: 4 - - Regex: '^"cudaq/' - Priority: 3 - - Regex: '^"(realtime|\.\.)/' - Priority: 2 - - Regex: '.*' - Priority: 1 -InsertNewlineAtEOF: Yes diff --git a/realtime/.gitignore b/realtime/.gitignore deleted file mode 100644 index ccec909e..00000000 --- a/realtime/.gitignore +++ /dev/null @@ -1,99 +0,0 @@ -# Editor backup files -*~ - -# Patch files -*.orig -*.rej - -# Compiled Object files -*.slo -*.lo -*.o -*.obj -*.x -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -**/Output/ -**/.lit*.txt - -# Executables -*.exe -*.out -*.app -**/out/ -/*build*/ -/*Build/ -/plugins/ -/other_library_builds/ -/.cproject -/.project -/.settings/ -**/*.jar -**/.ptp* -*.ab -/dist/ -/*egg*/ -/python/*egg* -/*tmp*/ -/wheelhouse/ -**/.ipynb_checkpoints -compile_commands.json -**/*.dat -**/.antlr -__pycache__/ - -# IDE files -.vscode/* -.theia/* - -# Container files -**/.docker/* - -# LSP files -.cache/* - -# LLVM/MLIR files -*.ll -*.bc - -# Build results -[Bb]in/ -[Oo]bj/ -*.bson -*.csv -*.bin -docs/sphinx/_doxygen -docs/sphinx/_mdgen -**/_build/* -**/_skbuild/* -_version.py - -# third party integrations -simulators/ -apps/ - -# macOS -.DS_Store - -# JetBrains IDE files -.idea - -# vim files -*.tmp diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt deleted file mode 100644 index f5a78407..00000000 --- a/realtime/CMakeLists.txt +++ /dev/null @@ -1,119 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -# Requiring the same version as the others. -cmake_minimum_required(VERSION 3.28 FATAL_ERROR) - -include(FetchContent) - -# Set a default build type if none was specified. Must set this before -# project(). -set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") - -# Set a default install prefix if none was specified. -set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING - "Install path prefix, prepended onto install directories") - -# Project setup -# ============================================================================== - -# Check if built as standalone (not as subdirectory of cudaqx). -project(cudaq-realtime) -if(NOT DEFINED CUDAQ_REALTIME_STANDALONE_BUILD) - set(CUDAQ_REALTIME_STANDALONE_BUILD TRUE) -endif() - -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -# The following must go after `project(...)` -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED TRUE) -set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - -set(CUDAQ_REALTIME_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -set(CUDAQ_REALTIME_INCLUDE_DIR ${CUDAQ_REALTIME_SOURCE_DIR}/include) - -# Add cmake directory to module path for custom Find modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - -# Options -# ============================================================================== - -option(CUDAQ_REALTIME_BUILD_TESTS - "Generate build targets for the CUDAQ real-time unit tests" ON) -option(CUDAQ_REALTIME_BUILD_EXAMPLES - "Generate build targets for the CUDAQ real-time example programs" ON) -option(CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS - "Build Hololink bridge/emulator/playback tools (requires hololink)." - OFF) - -# Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt) -# ============================================================================== -include(CheckLanguage) -check_language(CUDA) -set(CUDA_FOUND FALSE) -# Generate -gencode arch=compute_XX,code=sm_XX for list of supported -# arch values. -# List should be sorted in increasing order. -function(CUDA_get_gencode_args out_args_string arch_values) - # allow the user to pass the list like a normal variable - set(arch_list ${arch_values} ${ARGN}) - set(out "") - foreach(arch IN LISTS arch_list) - set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}") - endforeach(arch) - - # Repeat the last one as to ensure the generation of PTX for most - # recent virtual architecture for forward compatibility - list(GET arch_list -1 last_arch) - set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}") - set(${out_args_string} ${out} PARENT_SCOPE) -endfunction() - -if(CMAKE_CUDA_COMPILER) - if (NOT CUDA_TARGET_ARCHS) - # Ampere, Hopper - set(CUDA_TARGET_ARCHS "80;90") - endif() - CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options -fPIC") - - enable_language(CUDA) - set(CUDA_FOUND TRUE) - set(CMAKE_CUDA_STANDARD 17) - set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) - find_package(CUDAToolkit REQUIRED) - message(STATUS "Cuda language found.") -endif() - -# External Dependencies -# ============================================================================== - -find_package(Threads REQUIRED) - -add_subdirectory(lib) - -if (CUDAQ_REALTIME_BUILD_EXAMPLES) - message(STATUS "RoCE/DOCA examples removed for RPC dispatch workflow.") -endif() - -if (CUDAQ_REALTIME_BUILD_TESTS AND CUDAQ_REALTIME_STANDALONE_BUILD) - add_custom_target(CudaqRealtimeUnitTests) - include(CTest) - - add_custom_target(run_tests - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH="${CUDAQ_INSTALL_DIR}:${CMAKE_BINARY_DIR}/python" - ${CMAKE_CTEST_COMMAND} --output-on-failure - DEPENDS CudaqRealtimeUnitTests - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) - add_subdirectory(unittests) -endif() - diff --git a/realtime/README.md b/realtime/README.md deleted file mode 100644 index 5ebdd7db..00000000 --- a/realtime/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# CUDA-Q Realtime Library - -CUDA-Q Realtime is a library for tightly coupling GPU accelerated compute -to the control system of a quantum processor. - -It fulfills two primary responsibilities: - -1. It provides the low-level basis of realtime coprocessing -between FPGA and CPU-GPU systems. - -2. It provides the low latency networking stack of the NVQLink architecture, -enabling system integrators to achieve few-microsecond -data round trips between FPGA and GPU. - -> [!WARNING] -> This library is currently in early access / alpha stage -> and will continue to rapidly evolve as we build interactively with collaborators. - - - -> [!NOTE] -> While the library is in early access, instructions to reproduce the FPGA-GPU latency -> round trip on third party systems can be found at [docs/nvqlink_latency_demo.md](docs/nvqlink_latency_demo.md). - -## Getting Started - -```bash -# Configure, need cmake 3.28+ -cmake -G Ninja .. -DCUDAQ_REALTIME_BUILD_TESTS=ON -# Build -ninja -# Test -ctest -``` - -Check out the tests in the `unittests` folder for examples. diff --git a/realtime/docs/cudaq_realtime_host_api.html b/realtime/docs/cudaq_realtime_host_api.html deleted file mode 100644 index 0338ec07..00000000 --- a/realtime/docs/cudaq_realtime_host_api.html +++ /dev/null @@ -1,2945 +0,0 @@ - - - - - CUDA-Q Realtime Host API (Draft) - - - - - - - - - - - - - - - -
-

-

CUDA-Q Realtime Host API (Draft)

-

Published Proposal, -

-
-
-
Editor: -
(NVIDIA) -
Issue Tracking: -
GitHub -
-
-
-
-
-
-

Abstract

-

Host API, wiring, and usage for CUDA-Q realtime dispatch.

-
-
- -
-

1. CUDA-Q Realtime Host API

-

This document explains the C host API for realtime dispatch, the RPC wire -protocol, and complete wiring examples. It is written for external partners -integrating CUDA-QX decoders with their own transport mechanisms. The API and -protocol are transport-agnostic and support multiple data transport options, -including NVIDIA Hololink (RDMA via ConnectX NICs), libibverbs, and proprietary -transport layers. Handlers can execute on GPU (via CUDA kernels) or CPU (via -host threads). Examples in this document use Hololink’s 3-kernel workflow (RX -kernel/dispatch/TX kernel) for illustration, but the same principles apply to -other transport mechanisms.

- -

Hololink is NVIDIA’s low-latency sensor bridge framework that enables -direct GPU memory access from external devices (FPGAs, sensors) over Ethernet -using RDMA (Remote Direct Memory Access) via ConnectX NICs. In the context of -quantum error correction, Hololink is one example of a transport mechanism that -connects the quantum control system (typically an FPGA) to GPU-based decoders.

-

Repository: nvidia-holoscan/holoscan-sensor-bridge (nvqlink branch)

-

Hololink handles:

-
    -
  • -

    RX (Receive): RX kernel receives data from the FPGA directly into GPU memory via RDMA

    -
  • -

    TX (Transmit): TX kernel sends results back to the FPGA via RDMA

    -
  • -

    RDMA transport: Zero-copy data movement using ConnectX-7 NICs with GPUDirect support

    -
-

The CUDA-Q Realtime Host API provides the middle component (dispatch kernel or thread) that -sits between the transport’s RX and TX components, executing the actual decoder logic.

-

1.2. Transport Mechanisms # {#transport-mechanisms}

-

The realtime dispatch API is designed to work with multiple transport mechanisms -that move data between the quantum control system (FPGA) and the decoder. The -transport mechanism handles getting RPC messages into RX ring buffer slots and -sending responses from TX ring buffer slots back to the FPGA.

-

1.2.1. Supported Transport Options

-

Hololink (GPU-based with GPUDirect):

-
    -
  • -

    Uses ConnectX-7 NICs with RDMA for zero-copy data movement

    -
  • -

    RX and TX are persistent GPU kernels that directly access GPU memory

    -
  • -

    Requires GPUDirect support

    -
  • -

    Lowest latency option for GPU-based decoders

    -
-

libibverbs (CPU-based):

-
    -
  • -

    Standard InfiniBand Verbs API for RDMA on the CPU

    -
  • -

    RX and TX are host threads that poll CPU-accessible memory

    -
  • -

    Works with CPU-based dispatchers

    -
  • -

    Ring buffers reside in host memory (cudaHostAlloc or regular malloc)

    -
-

Proprietary Transport Mechanisms:

-
    -
  • -

    Custom implementations with or without GPUDirect support

    -
  • -

    May use different networking technologies or memory transfer methods

    -
  • -

    Must implement the ring buffer + flag protocol defined in this document

    -
  • -

    Can target either GPU (with suitable memory access) or CPU execution

    -
-

The key requirement is that the transport mechanism implements the ring buffer -slot + flag protocol: writing RPC messages to RX slots and setting rx_flags, -then reading TX slots after tx_flags are set.

-

1.3. The 3-Kernel Architecture (Hololink Example) # {#three-kernel-architecture}

-

The Hololink workflow separates concerns into three persistent GPU kernels that -communicate via shared ring buffers:

-

3-kernel architecture

-

1.3.1. Data Flow Summary # {#data-flow-summary}

- - - - - - - - - - -
Step - Component - Action -
1-2 - FPGA → ConnectX - Detection event data sent over Ethernet, RDMA writes to GPU memory -
3 - RX Kernel - Frames detection events into RPC message, sets rx_flags[slot] (see Message completion note) -
4-5 - Dispatch Kernel - Polls for ready slots, looks up handler by function_id, executes decoder -
6 - Dispatch Kernel - Writes RPCResponse + correction, sets tx_flags[slot] -
7-8 - TX Kernel - Polls for responses, triggers RDMA send back to FPGA -
9 - ConnectX → FPGA - Correction delivered to quantum controller -
-

1.3.2. Why 3 Kernels? # {#why-3-kernels}

-
    -
  1. -

    Separation of concerns: Transport (RX/TX kernels) vs. compute (dispatch) are decoupled

    -
  2. -

    Reusability: Same dispatch kernel works with any decoder handler

    -
  3. -

    Testability: Dispatch kernel can be tested without Hololink hardware

    -
  4. -

    Flexibility: RX/TX kernels can be replaced with different transport mechanisms

    -
  5. -

    Transport independence: The protocol works with Hololink, libibverbs, or proprietary transports

    -
-

1.4. What This API Does (In One Paragraph) # {#what-this-does}

-

The host API wires a dispatcher (GPU kernel or CPU thread) to shared ring buffers. -The transport mechanism (e.g., Hololink RX/TX kernels, libibverbs threads, or -proprietary transport) places incoming RPC messages into RX slots and retrieves -responses from TX slots. -The dispatcher polls RX flags (see Message completion note), looks up a -handler by function_id, executes it on the GPU, and writes a response into the -same slot. Hololink’s RX/TX kernels handle device I/O; the dispatch kernel sits -in the middle and runs the decoder handler.

-

1.5. Scope # {#scope}

-
    -
  • -

    C host API in cudaq_realtime.h

    -
  • -

    RPC messaging protocol (header + payload + response)

    -
  • -

    End-to-end example using the mock decoder in cudaqx

    -
  • -

    NIC-free testing path

    -
-

1.6. Terms and Components # {#terms}

-
    -
  • -

    Ring buffer: Fixed-size slots holding RPC messages (see Message completion note). Each slot has an RX flag and a TX flag.

    -
  • -

    RX flag: Nonzero means a slot is ready to be processed.

    -
  • -

    TX flag: Nonzero means a response is ready to send.

    -
  • -

    Dispatcher: Component that processes RPC messages (GPU kernel or CPU thread).

    -
  • -

    Handler: Function registered in the function table that processes specific message types.

    -
  • -

    Function table: Array of handler function pointers + IDs + schemas.

    -
-

1.7. Schema Data Structures # {#schema-structures}

-

Each handler registered in the function table includes a schema that describes -its argument and result types.

-

1.7.1. Type Descriptors

-
// Standardized payload type identifiersenum PayloadTypeID : uint8_t {  TYPE_UINT8           = 0x10,  TYPE_INT32           = 0x11,  TYPE_INT64           = 0x12,  TYPE_FLOAT32         = 0x13,  TYPE_FLOAT64         = 0x14,  TYPE_ARRAY_UINT8     = 0x20,  TYPE_ARRAY_INT32     = 0x21,  TYPE_ARRAY_FLOAT32   = 0x22,  TYPE_ARRAY_FLOAT64   = 0x23,  TYPE_BIT_PACKED      = 0x30   // Bit-packed data (LSB-first)};struct cudaq_type_desc_t {  uint8_t  type_id;       // PayloadTypeID value  uint8_t  reserved[3];  uint32_t size_bytes;    // Total size in bytes  uint32_t num_elements;  // Interpretation depends on type_id};
-

The num_elements field interpretation:

-
    -
  • -

    Scalar types (TYPE_UINT8, TYPE_INT32, etc.): unused, set to 1

    -
  • -

    Array types (TYPE_ARRAY_*): number of array elements

    -
  • -

    TYPE_BIT_PACKED: number of bits (not bytes)

    -
-

1.7.2. Handler Schema

-
struct cudaq_handler_schema_t {  uint8_t  num_args;              // Number of input arguments  uint8_t  num_results;           // Number of return values  uint16_t reserved;  cudaq_type_desc_t args[8];      // Argument type descriptors  cudaq_type_desc_t results[4];   // Result type descriptors};
-

Limits:

-
    -
  • -

    Maximum 8 arguments per handler

    -
  • -

    Maximum 4 results per handler

    -
  • -

    Total payload size must fit in slot: slot_size - sizeof(RPCHeader)

    -
-

1.8. RPC Messaging Protocol # {#rpc-protocol}

-

Each RX ring buffer slot contains an RPC request. The dispatcher writes the -response to the corresponding TX ring buffer slot.

-
RX Slot: | RPCHeader | request payload bytes |TX Slot: | RPCResponse | response payload bytes |
-

Payload encoding details (type system, multi-argument encoding, bit-packing, -and QEC-specific examples) are defined in cudaq_realtime_message_protocol.bs.

-

Magic values (little-endian 32-bit):

-
    -
  • -

    RPC_MAGIC_REQUEST = 0x43555152 ('CUQR')

    -
  • -

    RPC_MAGIC_RESPONSE = 0x43555153 ('CUQS')

    -
-
// Wire format (byte layout must match dispatch_kernel.cuh)struct RPCHeader {  uint32_t magic;        // RPC_MAGIC_REQUEST  uint32_t function_id;  // fnv1a_hash("handler_name")  uint32_t arg_len;      // payload bytes following this header};struct RPCResponse {  uint32_t magic;        // RPC_MAGIC_RESPONSE  int32_t  status;       // 0 = success  uint32_t result_len;   // bytes of response payload};
-

Payload conventions:

-
    -
  • -

    Request payload: argument data as specified by handler schema.

    -
  • -

    Response payload: result data as specified by handler schema.

    -
  • -

    Size limit: payload must fit in one slot. max_payload_bytes = slot_size - sizeof(RPCHeader).

    -
  • -

    Multi-argument encoding: arguments concatenated in schema order (see message protocol doc).

    -
-

1.9. Host API Overview # {#api-overview}

-

Header: realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h

-

1.10. Manager and Dispatcher Topology # {#manager-dispatcher}

-

The manager is a lightweight owner for one or more dispatchers. Each dispatcher -is configured independently (e.g., vp_id, kernel_type, dispatch_mode) and -can target different workloads.

-

Manager and dispatcher topology

-

1.11. Host API Functions # {#api-functions}

-

Function usage:

-

cudaq_dispatch_manager_create creates the top-level manager that owns -dispatchers.

-

Parameters:

-
    -
  • -

    out_mgr: receives the created manager handle.

    -
-

Call this once near program startup and keep the manager alive for the -lifetime of the dispatch subsystem.

-

cudaq_dispatch_manager_destroy releases the manager and any internal -resources.

-

Parameters:

-
    -
  • -

    mgr: manager handle to destroy.

    -
-

Call this after all dispatchers have been destroyed and the program is -shutting down.

-

cudaq_dispatcher_create allocates a dispatcher instance and validates the -configuration.

-

Parameters:

-
    -
  • -

    mgr: owning manager.

    -
  • -

    config: filled cudaq_dispatcher_config_t with:

    -
  • -

    device_id (default 0): selects the CUDA device for the dispatcher

    -
  • -

    num_blocks (default 1)

    -
  • -

    threads_per_block (default 32)

    -
  • -

    num_slots (required)

    -
  • -

    slot_size (required)

    -
  • -

    vp_id (default 0): tags a dispatcher to a transport channel. Queue pair selection and NIC port/IP binding are configured in Hololink, not in this API.

    -
  • -

    kernel_type (default CUDAQ_KERNEL_REGULAR)

    -
      -
    • -

      CUDAQ_KERNEL_REGULAR: standard kernel launch

      -
    • -

      CUDAQ_KERNEL_COOPERATIVE: cooperative launch (grid.sync() capable)

      -
    -
  • -

    dispatch_mode (default CUDAQ_DISPATCH_DEVICE_CALL)

    -
      -
    • -

      CUDAQ_DISPATCH_DEVICE_CALL: direct __device__ handler call (lowest latency)

      -
    • -

      CUDAQ_DISPATCH_GRAPH_LAUNCH: CUDA graph launch from device code (requires sm_90+, Hopper or later GPUs)

      -
    -
  • -

    out_dispatcher: receives the created dispatcher handle.

    -
-

Call this before wiring ring buffers, function tables, or control state.

-

cudaq_dispatcher_destroy releases a dispatcher after it has been stopped.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle to destroy.

    -
-

Call this when the dispatcher is no longer needed.

-

cudaq_dispatcher_set_ringbuffer provides the RX/TX flag and data -pointers the dispatch kernel will poll and use for request/response slots.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
  • -

    ringbuffer: cudaq_ringbuffer_t with:

    -
  • -

    rx_flags: device-visible pointer to RX flags.

    -
  • -

    tx_flags: device-visible pointer to TX flags.

    -
  • -

    rx_data: device-visible pointer to RX slot data (request payloads).

    -
  • -

    tx_data: device-visible pointer to TX slot data (response payloads).

    -
  • -

    rx_stride_sz: size in bytes of each RX slot.

    -
  • -

    tx_stride_sz: size in bytes of each TX slot.

    -
-

Call this before cudaq_dispatcher_start, after allocating mapped host memory -or device memory for the ring buffers.

-

cudaq_dispatcher_set_function_table supplies the function table -containing handler pointers, IDs, and schemas.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
  • -

    table: cudaq_function_table_t with:

    -
  • -

    entries: device pointer to array of cudaq_function_entry_t.

    -
  • -

    count: number of entries in the table.

    -
-
// Unified function table entry with schemastruct cudaq_function_entry_t {  union {    void*           device_fn_ptr;   // for CUDAQ_DISPATCH_DEVICE_CALL    cudaGraphExec_t graph_exec;      // for CUDAQ_DISPATCH_GRAPH_LAUNCH  } handler;  uint32_t                function_id;  uint8_t                 dispatch_mode;   // Per-handler dispatch mode  uint8_t                 reserved[3];  cudaq_handler_schema_t  schema;          // Handler interface schema};struct cudaq_function_table_t {  cudaq_function_entry_t* entries;   // Device pointer to entry array  uint32_t                count;     // Number of entries};
-

Call this after initializing the device-side function table entries. -Each entry contains a handler pointer (or graph), function_id, dispatch mode, -and schema describing the handler’s interface.

-

Function ID semantics:

-
    -
  • -

    function_id is the 32-bit FNV-1a hash of the handler name string.

    -
  • -

    The handler name is the string you hash when populating entries; there is no separate runtime registration call.

    -
  • -

    If no entry matches, the dispatcher clears the slot without a response.

    -
  • -

    Suggested: use stable, human-readable handler names (e.g., "mock_decode").

    -
-

cudaq_dispatcher_set_control supplies the shutdown flag and stats buffer -the dispatch kernel uses for termination and bookkeeping.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
  • -

    shutdown_flag: device-visible flag used to signal shutdown.

    -
  • -

    stats: device-visible stats buffer.

    -
-

Call this before starting the dispatcher; both buffers must remain valid for -the dispatcher’s lifetime.

-

cudaq_dispatcher_set_launch_fn provides the host-side launch wrapper that -invokes the dispatch kernel with the correct grid/block dimensions.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
  • -

    launch_fn: host launch function pointer.

    -
-

Call this once during setup. Typically you pass one of the provided launch functions:

-
    -
  • -

    cudaq_launch_dispatch_kernel_regular - for CUDAQ_KERNEL_REGULAR mode

    -
  • -

    cudaq_launch_dispatch_kernel_cooperative - for CUDAQ_KERNEL_COOPERATIVE mode

    -
-

cudaq_dispatcher_start launches the persistent dispatch kernel and begins -processing slots.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
-

Call this only after ring buffers, function table, control buffers, and launch -function are set.

-

cudaq_dispatcher_stop signals the dispatch kernel to exit and waits for it -to shut down.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
-

Call this during teardown before destroying the dispatcher.

-

cudaq_dispatcher_get_processed reads the processed‑packet counter from the -stats buffer to support debugging or throughput tracking.

-

Parameters:

-
    -
  • -

    dispatcher: dispatcher handle.

    -
  • -

    out_packets: receives the processed packet count.

    -
-

1.11.1. Occupancy Query and Eager Module Loading # {#occupancy-query}

-

Before calling cudaq_dispatcher_start, call the appropriate occupancy query -to force eager loading of the dispatch kernel module. This avoids lazy-load -deadlocks when the dispatch kernel and transport kernels (e.g., Hololink RX/TX) -run as persistent kernels.

-

cudaq_dispatch_kernel_query_occupancy returns the -maximum number of active blocks per multiprocessor for the regular dispatch -kernel.

-

Parameters:

-
    -
  • -

    out_blocks: receives the max blocks per SM (or 0 on error).

    -
  • -

    threads_per_block: block size used for the occupancy calculation.

    -
-

Returns cudaSuccess on success. Call this when kernel_type is -CUDAQ_KERNEL_REGULAR.

-

cudaq_dispatch_kernel_cooperative_query_occupancy -returns the maximum number of active blocks per multiprocessor for the -cooperative dispatch kernel.

-

Parameters:

-
    -
  • -

    out_blocks: receives the max blocks per SM (or 0 on error).

    -
  • -

    threads_per_block: block size used for the occupancy calculation (e.g., 128 for cooperative decoders).

    -
-

Returns cudaSuccess on success. Call this when kernel_type is -CUDAQ_KERNEL_COOPERATIVE. Use the same threads_per_block value that will -be passed to the dispatcher config and launch function.

-

Call the occupancy function that matches the dispatcher’s kernel_type once -before cudaq_dispatcher_start; the result can be used to size the dispatch -grid (e.g., to reserve SMs for transport kernels).

-

Lifetime/ownership:

-
    -
  • -

    All resources are assumed to live for the program lifetime.

    -
  • -

    The API does not take ownership of host-allocated memory.

    -
-

Threading:

-
    -
  • -

    Single-threaded host usage; create/wire/start/stop from one thread.

    -
-

Error handling:

-
    -
  • -

    All calls return cudaq_status_t.

    -
  • -

    CUDAQ_ERR_INVALID_ARG for missing pointers or invalid config.

    -
  • -

    CUDAQ_ERR_CUDA for CUDA API failures during start/stop.

    -
-

1.11.2. Graph-Based Dispatch Functions

-

The following functions are only available when using CUDAQ_DISPATCH_GRAPH_LAUNCH mode with sm_90+ GPUs:

-

cudaq_create_dispatch_graph_regular creates a graph-based dispatch context that enables device-side graph launching.

-

Parameters:

-
    -
  • -

    rx_flags: device-visible pointer to RX ring buffer flags

    -
  • -

    tx_flags: device-visible pointer to TX ring buffer flags

    -
  • -

    function_table: device pointer to function table entries

    -
  • -

    func_count: number of function table entries

    -
  • -

    graph_buffer_ptr: device pointer for graph buffer communication

    -
  • -

    shutdown_flag: device-visible shutdown flag

    -
  • -

    stats: device-visible stats buffer

    -
  • -

    num_slots: number of ring buffer slots

    -
  • -

    num_blocks: grid size for dispatch kernel

    -
  • -

    threads_per_block: block size for dispatch kernel

    -
  • -

    stream: CUDA stream for graph operations

    -
  • -

    out_context: receives the created graph context handle

    -
-

Returns cudaSuccess on success, or CUDA error code on failure.

-

This function creates a graph containing the dispatch kernel, instantiates it with cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. The resulting graph context enables device-side cudaGraphLaunch() calls from within handlers.

-

cudaq_launch_dispatch_graph launches the dispatch graph to begin processing RPC messages.

-

Parameters:

-
    -
  • -

    context: graph context handle from cudaq_create_dispatch_graph_regular

    -
  • -

    stream: CUDA stream for graph launch

    -
-

Returns cudaSuccess on success, or CUDA error code on failure.

-

Call this to start the persistent dispatch kernel. The kernel will continue running until the shutdown flag is set.

-

cudaq_destroy_dispatch_graph destroys the graph context and releases all associated resources.

-

Parameters:

-
    -
  • -

    context: graph context handle to destroy

    -
-

Returns cudaSuccess on success, or CUDA error code on failure.

-

Call this after the dispatch kernel has exited (shutdown flag was set) to clean up graph resources.

-

1.11.3. Kernel Launch Helper Functions

-

The following helper functions are provided for use with cudaq_dispatcher_set_launch_fn():

-

cudaq_launch_dispatch_kernel_regular launches the dispatch kernel in regular (non-cooperative) mode.

-

Parameters:

-
    -
  • -

    rx_flags: device-visible pointer to RX ring buffer flags

    -
  • -

    tx_flags: device-visible pointer to TX ring buffer flags

    -
  • -

    function_table: device pointer to function table entries

    -
  • -

    func_count: number of function table entries

    -
  • -

    shutdown_flag: device-visible shutdown flag

    -
  • -

    stats: device-visible stats buffer

    -
  • -

    num_slots: number of ring buffer slots

    -
  • -

    num_blocks: grid size for dispatch kernel

    -
  • -

    threads_per_block: block size for dispatch kernel

    -
  • -

    stream: CUDA stream for kernel launch

    -
-

Use this when kernel_type is set to CUDAQ_KERNEL_REGULAR in the dispatcher configuration.

-

cudaq_launch_dispatch_kernel_cooperative launches the dispatch kernel in cooperative mode.

-

Parameters: Same as cudaq_launch_dispatch_kernel_regular.

-

Use this when kernel_type is set to CUDAQ_KERNEL_COOPERATIVE in the dispatcher configuration. This enables the dispatch kernel and handlers to use grid-wide synchronization via cooperative_groups::this_grid().sync().

-

1.12. Memory Layout and Ring Buffer Wiring # {#memory-layout}

-

Each slot is a fixed-size byte region:

-
| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) |
-

Unused padding is the remaining bytes in the fixed-size slot after the header -and payload.

-

Flags (both are uint64_t arrays of slot flags):

-
    -
  • -

    rx_flags[slot] is set by the producer to a non-zero value when a slot is ready.

    -
  • -

    tx_flags[slot] is set by the dispatch kernel to a non-zero value when the response is ready.

    -
-

Message completion note: -An RPC message may be delivered as multiple RDMA writes into a single slot. -Completion is signaled only after the final write (often an RDMA write with -immediate) sets rx_flags[slot] to a non-zero value. The dispatch kernel treats -the slot as complete only after the flag is set.

-

In the NIC-free path, flags and data are allocated with -cudaHostAllocMapped so the device and host see the same memory.

-

1.13. Step-by-Step: Wiring the Host API (Minimal) # {#wiring}

-

The snippet below is real code from -cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu:

-
// Host API wiringASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK);cudaq_dispatcher_config_t config{};config.device_id = 0;config.num_blocks = 1;config.threads_per_block = 32;config.num_slots = static_cast<uint32_t>(num_slots_);config.slot_size = static_cast<uint32_t>(slot_size_);config.vp_id = 0;config.kernel_type = CUDAQ_KERNEL_REGULAR;config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK);cudaq_ringbuffer_t ringbuffer{};ringbuffer.rx_flags = rx_flags_;ringbuffer.tx_flags = tx_flags_;ringbuffer.rx_data = rx_data_;ringbuffer.tx_data = tx_data_;ringbuffer.rx_stride_sz = slot_size_;ringbuffer.tx_stride_sz = slot_size_;ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK);// Allocate and initialize function table entriescudaq_function_entry_t* d_entries;cudaMalloc(&d_entries, func_count_ * sizeof(cudaq_function_entry_t));// Initialize entries on device (including schemas)init_function_table<<<1, 1>>>(d_entries);cudaDeviceSynchronize();cudaq_function_table_t table{};table.entries = d_entries;table.count = func_count_;ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_),          CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_set_launch_fn(dispatcher_, &launch_dispatch_kernel_wrapper),          CUDAQ_OK);ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK);
-

1.14. Device Handler and Function ID # {#device-handler}

-

Real code from test_realtime_decoding.cu:

-
// The dispatcher uses function_id to find the handlerconstexpr std::uint32_t MOCK_DECODE_FUNCTION_ID =    cudaq::realtime::fnv1a_hash("mock_decode");/// @brief Initialize the device function table with schema__global__ void init_function_table(cudaq_function_entry_t* entries) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    // Entry 0: Mock decoder    entries[0].handler.device_fn_ptr =         reinterpret_cast<void*>(&cudaq::qec::realtime::mock_decode_rpc);    entries[0].function_id = MOCK_DECODE_FUNCTION_ID;    entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;    // Schema: 1 arg (bit-packed detection events), 1 result (correction byte)    entries[0].schema.num_args = 1;    entries[0].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};  // 128 bits    entries[0].schema.num_results = 1;    entries[0].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
-

1.14.1. Multi-Argument Handler Example

-
constexpr std::uint32_t ADVANCED_DECODE_FUNCTION_ID =    cudaq::realtime::fnv1a_hash("advanced_decode");__global__ void init_advanced_handler(cudaq_function_entry_t* entries,                                        uint32_t index) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    entries[index].handler.device_fn_ptr =         reinterpret_cast<void*>(&advanced_decode_rpc);    entries[index].function_id = ADVANCED_DECODE_FUNCTION_ID;    entries[index].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;    // Schema: 2 args (detection events + calibration), 1 result    entries[index].schema.num_args = 2;    entries[index].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};    entries[index].schema.args[1] = {TYPE_ARRAY_FLOAT32, {0}, 64, 16};  // 16 floats    entries[index].schema.num_results = 1;    entries[index].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
-

1.15. CUDA Graph Dispatch Mode # {#graph-dispatch}

-

The CUDAQ_DISPATCH_GRAPH_LAUNCH mode enables handlers to be executed as pre-captured CUDA graphs launched from device code. This is useful for complex multi-kernel workflows that benefit from graph optimization and can reduce kernel launch overhead for sophisticated decoders.

-

1.15.1. Requirements

-
    -
  • -

    GPU Architecture: Compute capability 9.0 or higher (Hopper H100 or later)

    -
  • -

    CUDA Version: CUDA 12.0+ with device-side graph launch support

    -
  • -

    Graph Setup: Handler graphs must be captured and instantiated with cudaGraphInstantiateFlagDeviceLaunch

    -
-

1.15.2. Graph-Based Dispatch API

-

The API provides functions to properly wrap the dispatch kernel in a graph context that enables device-side cudaGraphLaunch():

-
// Opaque handle for graph-based dispatch contexttypedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context;// Create a graph-based dispatch contextcudaError_t cudaq_create_dispatch_graph_regular(    volatile uint64_t *rx_flags, volatile uint64_t *tx_flags,    cudaq_function_entry_t *function_table, size_t func_count,    void **graph_buffer_ptr, volatile int *shutdown_flag, uint64_t *stats,    size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block,    cudaStream_t stream, cudaq_dispatch_graph_context **out_context);// Launch the dispatch graphcudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context,                                        cudaStream_t stream);// Destroy the dispatch graph contextcudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context);
-

1.15.3. Graph Handler Setup Example

-
/// @brief Initialize function table with CUDA graph handler__global__ void init_function_table_graph(cudaq_function_entry_t* entries) {  if (threadIdx.x == 0 && blockIdx.x == 0) {    entries[0].handler.graph_exec = /* pre-captured cudaGraphExec_t */;    entries[0].function_id = DECODE_FUNCTION_ID;    entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;    // Schema: same as device call mode    entries[0].schema.num_args = 1;    entries[0].schema.args[0] = {TYPE_BIT_PACKED, {0}, 16, 128};    entries[0].schema.num_results = 1;    entries[0].schema.results[0] = {TYPE_UINT8, {0}, 1, 1};  }}
-

1.15.4. Graph Capture and Instantiation

-

Handler graphs must be captured and instantiated with the device launch flag:

-
cudaStream_t capture_stream;cudaStreamCreate(&capture_stream);// Capture the decoder kernel(s) into a graphcudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeGlobal);decode_kernel<<<blocks, threads, 0, capture_stream>>>(args...);cudaStreamEndCapture(capture_stream, &graph);// Instantiate with device launch flag (required for device-side cudaGraphLaunch)cudaGraphExec_t graph_exec;cudaGraphInstantiateWithFlags(&graph_exec, graph,                               cudaGraphInstantiateFlagDeviceLaunch);// Upload graph to devicecudaGraphUpload(graph_exec, capture_stream);cudaStreamSynchronize(capture_stream);cudaStreamDestroy(capture_stream);
-

1.15.5. When to Use Graph Dispatch

-

Use CUDAQ_DISPATCH_GRAPH_LAUNCH mode with the graph-based dispatch API when handlers need to launch CUDA graphs from device code. The graph-based dispatch API (cudaq_create_dispatch_graph_regular() + cudaq_launch_dispatch_graph()) wraps the dispatch kernel in a graph execution context, enabling device-side cudaGraphLaunch() calls from within handlers.

-

1.15.6. Graph vs Device Call Dispatch

-

Device Call Mode (CUDAQ_DISPATCH_DEVICE_CALL):

-
    -
  • -

    Lowest latency for simple handlers

    -
  • -

    Direct __device__ function call from dispatcher

    -
  • -

    Suitable for lightweight decoders and data transformations

    -
  • -

    No special hardware requirements

    -
-

Graph Launch Mode (CUDAQ_DISPATCH_GRAPH_LAUNCH):

-
    -
  • -

    Enables complex multi-kernel workflows

    -
  • -

    Benefits from CUDA graph optimizations

    -
  • -

    Requires sm_90+ hardware (Hopper or later)

    -
  • -

    Higher setup overhead but can reduce per-invocation latency for complex pipelines

    -
-

1.16. Building and Sending an RPC Message # {#build-rpc}

-

Real code from test_realtime_decoding.cu:

-

Note: this host-side snippet emulates what the external device/FPGA would do -when populating RX slots in a Hololink deployment.

-
/// @brief Write detection events to RX buffer in RPC format.void write_rpc_request(std::size_t slot, const std::vector<uint8_t>& measurements) {  uint8_t* slot_data = const_cast<uint8_t*>(rx_data_host_) + slot * slot_size_;  // Write RPCHeader  cudaq::realtime::RPCHeader* header =      reinterpret_cast<cudaq::realtime::RPCHeader*>(slot_data);  header->magic = cudaq::realtime::RPC_MAGIC_REQUEST;  header->function_id = MOCK_DECODE_FUNCTION_ID;  header->arg_len = static_cast<std::uint32_t>(measurements.size());  // Write measurement data after header  memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader),         measurements.data(), measurements.size());}
-

1.17. Reading the Response # {#read-response}

-

Real code from test_realtime_decoding.cu:

-

Note: this host-side snippet emulates what the external device/FPGA would do -when consuming TX slots in a Hololink deployment.

-
/// @brief Read response from TX buffer./// Responses are written by the dispatch kernel to the TX ring buffer; read from tx_data, not rx_data.bool read_rpc_response(std::size_t slot, uint8_t& correction,                       std::int32_t* status_out = nullptr,                       std::uint32_t* result_len_out = nullptr) {  __sync_synchronize();  const uint8_t* slot_data = const_cast<uint8_t*>(tx_data_host_) + slot * slot_size_;  // Read RPCResponse  const cudaq::realtime::RPCResponse* response =      reinterpret_cast<const cudaq::realtime::RPCResponse*>(slot_data);  if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) {    return false;  }  if (status_out)    *status_out = response->status;  if (result_len_out)    *result_len_out = response->result_len;  if (response->status != 0) {    return false;  }  // Read correction data after response header  correction = *(slot_data + sizeof(cudaq::realtime::RPCResponse));  return true;}
-

1.18. Schema-Driven Argument Parsing # {#schema-parsing}

-

The dispatcher uses the handler schema to interpret the typeless payload bytes. -This example shows conceptual parsing logic:

-
__device__ void parse_args_from_payload(    const uint8_t* payload,    const cudaq_handler_schema_t& schema,    void** arg_ptrs) {  uint32_t offset = 0;  for (uint8_t i = 0; i < schema.num_args; i++) {    arg_ptrs[i] = const_cast<uint8_t*>(payload + offset);    offset += schema.args[i].size_bytes;  }}__device__ void dispatch_with_schema(    uint8_t* slot_data,    const cudaq_function_entry_t& entry) {  RPCHeader* hdr = reinterpret_cast<RPCHeader*>(slot_data);  uint8_t* payload = slot_data + sizeof(RPCHeader);  // Parse arguments using schema  void* arg_ptrs[8];  parse_args_from_payload(payload, entry.schema, arg_ptrs);  // Call handler with parsed arguments  if (entry.dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {    auto handler = reinterpret_cast<HandlerFn>(entry.handler.device_fn_ptr);    handler(arg_ptrs, entry.schema.num_args, /* result buffer */);  }  // ... graph launch path uses same parsed args}
-

For multi-argument payloads, arguments are concatenated in schema order:

-
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |             ^            ^            ^             offset=0     offset=16    offset=80
-

The schema specifies the size of each argument, allowing the dispatcher to -compute offsets.

- -

See the 3-Kernel Architecture diagram above for -the complete data flow. The key integration points are:

-

Ring buffer handoff (RX → Dispatch):

-
// Hololink RX kernel sets this after writing detection event datarx_flags[slot] = device_ptr_to_slot_data;
-

Ring buffer handoff (Dispatch → TX):

-
// Dispatch kernel sets this after writing RPCResponsetx_flags[slot] = device_ptr_to_slot_data;
-

Latency path: The critical path is:

-
    -
  1. -

    RDMA write completes → RX kernel signals → Dispatch polls and processes → TX kernel polls and sends → RDMA read completes

    -
-

All three kernels are persistent (launched once, run indefinitely), so -there is no kernel launch overhead in the hot path.

-

1.20. NIC-Free Testing (No Hololink / No ConnectX-7) # {#nic-free}

-

Emulate RX/TX with mapped host memory:

-
    -
  • -

    cudaqx mock-decoder test:

    -
  • -

    libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    -
  • -

    cuda-quantum host API test:

    -
  • -

    realtime/unittests/test_dispatch_kernel.cu

    -
-

Detection event file convention used by the tests:

-
    -
  • -

    Each ROUND_START block represents one decoding round.

    -
  • -

    Only the numeric detection event values are encoded into the payload (do not send the ROUND_START tokens).

    -
-

Note: Existing test files may use SHOT_START for backwards compatibility; this should be interpreted as ROUND_START in the context of realtime decoding.

-

1.21. Mock Decoder Example (cudaqx) # {#mock-decoder}

-

The mock decoder is registered as an RPC handler and invoked by the dispatch -kernel. The tests show end-to-end wiring with detection events loaded from -the detection event file.

-

See:

-
    -
  • -

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    -
-

1.22. Troubleshooting # {#troubleshooting}

-
    -
  • -

    Timeout waiting for TX: ensure the RX flag points to device-mapped memory.

    -
  • -

    Invalid arg: check slot_size, num_slots, function table pointers.

    -
  • -

    CUDA errors: verify device_id, and that CUDA is initialized.

    -
-

1.23. References # {#references}

-
    -
  • -

    cuda-quantum/realtime/unittests/test_dispatch_kernel.cu

    -
  • -

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    -
-
- \ No newline at end of file diff --git a/realtime/docs/cudaq_realtime_message_protocol.html b/realtime/docs/cudaq_realtime_message_protocol.html deleted file mode 100644 index 2e9e98df..00000000 --- a/realtime/docs/cudaq_realtime_message_protocol.html +++ /dev/null @@ -1,2513 +0,0 @@ - - - - - CUDA-Q Realtime Messaging Protocol (Draft) - - - - - - - - - - - - - - - -
-

-

CUDA-Q Realtime Messaging Protocol (Draft)

-

Published Proposal, -

-
-
-
Editor: -
(NVIDIA) -
Issue Tracking: -
GitHub -
-
-
-
-
-
-

Abstract

-

RPC payload encoding and message conventions for realtime dispatch.

-
-
- -
-

1. CUDA-Q Realtime Messaging Protocol

-

This document defines the RPC (Remote Procedure Call) payload encoding used by the realtime dispatch kernel for processing data and returning results. It complements -cudaq_realtime_host_api.bs, which focuses on wiring and API usage.

-

1.1. Scope # {#scope}

-
    -
  • -

    RPC header/response wire format

    -
  • -

    Payload encoding and type system

    -
  • -

    Schema contract and payload interpretation

    -
  • -

    Function dispatch semantics

    -
-

Note: This protocol is hardware-agnostic. While the companion document -cudaq_realtime_host_api.bs provides implementation details for both GPU and -CPU-based dispatchers, the wire format and encoding rules specified here apply -universally.

-

1.2. RPC Header / Response # {#rpc-header}

-

Each ring-buffer slot is interpreted as:

-
| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) |
-
struct RPCHeader {  uint32_t magic;        // RPC_MAGIC_REQUEST  uint32_t function_id;  // fnv1a_hash("handler_name")  uint32_t arg_len;      // payload bytes following this header};struct RPCResponse {  uint32_t magic;        // RPC_MAGIC_RESPONSE  int32_t  status;       // 0 = success  uint32_t result_len;   // bytes of response payload};
-

Magic values (little-endian 32-bit):

-
    -
  • -

    RPC_MAGIC_REQUEST = 0x43555152 ('CUQR')

    -
  • -

    RPC_MAGIC_RESPONSE = 0x43555153 ('CUQS')

    -
-

1.3. Function ID Semantics # {#function-id}

-

function_id selects which handler the dispatcher invokes for a given RPC -message. The dispatcher performs a lookup in the function table (array of -function pointers + IDs) and calls the matching entry.

-

See cudaq_realtime_host_api.bs for function ID hashing, handler naming, and function -table registration details.

-

1.4. Schema and Payload Interpretation # {#schema-interpretation}

-

The RPC payload is typeless on the wire. The bytes following RPCHeader -are an opaque blob from the protocol’s perspective.

-

Payload interpretation is defined by the handler schema, which is registered -in the dispatcher’s function table during setup (see cudaq_realtime_host_api.bs). -The schema specifies:

-
    -
  • -

    Number of arguments

    -
  • -

    Type and size of each argument

    -
  • -

    Number of return values

    -
  • -

    Type and size of each return value

    -
-

Out-of-band contract: The client (e.g., FPGA) firmware and dispatcher function -table must agree on the schema for each function_id. Schema mismatches are detected -during integration testing, not at runtime.

-

For handlers with multiple arguments, the payload is a concatenation of -argument data in schema order:

-
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
-

The dispatcher uses the schema to determine where each argument begins and ends within -the payload.

-

1.4.1. Type System # {#type-system}

-

Standardized payload type identifiers used in handler schemas:

-
enum PayloadTypeID : uint8_t {  TYPE_UINT8           = 0x10,  TYPE_INT32           = 0x11,  TYPE_INT64           = 0x12,  TYPE_FLOAT32         = 0x13,  TYPE_FLOAT64         = 0x14,  TYPE_ARRAY_UINT8     = 0x20,  TYPE_ARRAY_INT32     = 0x21,  TYPE_ARRAY_FLOAT32   = 0x22,  TYPE_ARRAY_FLOAT64   = 0x23,  TYPE_BIT_PACKED      = 0x30   // Bit-packed data (LSB-first)};
-

Schema type descriptor (see cudaq_realtime_host_api.bs for full definition):

-
struct cudaq_type_desc_t {  uint8_t  type_id;       // PayloadTypeID value  uint8_t  reserved[3];  uint32_t size_bytes;    // Total size in bytes  uint32_t num_elements;  // Interpretation depends on type_id};
-

The num_elements field interpretation:

-
    -
  • -

    Scalar types (TYPE_UINT8, TYPE_INT32, etc.): unused, set to 1

    -
  • -

    Array types (TYPE_ARRAY_*): number of array elements

    -
  • -

    TYPE_BIT_PACKED: number of bits (not bytes)

    -
-

Note: For arbitrary binary data or vendor-specific formats, use TYPE_ARRAY_UINT8.

-

Encoding rules:

-
    -
  • -

    All multi-byte integers: little-endian

    -
  • -

    Floating-point: IEEE 754 format

    -
  • -

    Arrays: tightly packed elements (no padding)

    -
  • -

    Bit-packed data: LSB-first within each byte, size_bytes = ceil(num_elements / 8)

    -
-

1.5. Payload Encoding # {#payload-encoding}

-

The payload contains the argument data for the handler function. The encoding -depends on the argument types specified in the handler schema.

-

1.5.1. Single-Argument Payloads

-

For handlers with one argument, the payload contains the argument data directly:

-
| RPCHeader | argument_bytes |
-

1.5.2. Multi-Argument Payloads

-

For handlers with multiple arguments, arguments are concatenated in schema order -with no padding or delimiters:

-
| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
-

The schema specifies the size of each argument, allowing the dispatcher to compute offsets.

-

1.5.3. Size Constraints

-

The total payload must fit in a single ring-buffer slot:

-
total_size = sizeof(RPCHeader) + arg_lenslot_sizemax_payload_bytes = slot_size - sizeof(RPCHeader)
-

1.5.4. Encoding Examples

-

Example 1: Handler with signature void process(int32_t count, float threshold)

-

Schema:

-
    -
  • -

    arg0: TYPE_INT32, 4 bytes

    -
  • -

    arg1: TYPE_FLOAT32, 4 bytes

    -
-

Wire encoding:

-
Offset | Content-------|--------0-11   | RPCHeader { magic, function_id, arg_len=8 }12-15  | count (int32_t, little-endian)16-19  | threshold (float, IEEE 754)
-

Example 2: Handler with signature void decode(const uint8_t* bits, uint32_t num_bits)

-

Schema:

-
    -
  • -

    arg0: TYPE_BIT_PACKED, size_bytes=16, num_elements=128

    -
  • -

    arg1: TYPE_UINT32, size_bytes=4, num_elements=1

    -
-

Wire encoding:

-
Offset | Content-------|--------0-11   | RPCHeader { magic, function_id, arg_len=20 }12-27  | bits (bit-packed, LSB-first, 128 bits)28-31  | num_bits=128 (uint32_t, little-endian)
-

1.5.5. Bit-Packed Data Encoding

-

For TYPE_BIT_PACKED arguments:

-
    -
  • -

    Bits are packed LSB-first within each byte

    -
  • -

    Payload length: size_bytes = ceil(num_elements / 8) bytes

    -
  • -

    The schema specifies both size_bytes (storage) and num_elements (actual bit count)

    -
-

Example for 10 bits (size_bytes=2, num_elements=10):

-
bits:    b0 b1 b2 b3 b4 b5 b6 b7 b8 b9byte[0]: b0 b1 b2 b3 b4 b5 b6 b7   (LSB-first)byte[1]: b8 b9 0  0  0  0  0  0    (unused bits set to zero)
-

The handler can use num_elements from the schema to determine how many bits -are valid, avoiding the need to pass bit count as a separate argument (though -some handlers may still choose to do so for flexibility).

-

Use case: TYPE_BIT_PACKED is suitable for binary measurements where -each measurement result is 0 or 1 (1 bit per measurement).

-

1.5.6. Multi-Bit Measurement Encoding

-

For applications requiring richer measurement data (e.g., soft readout, leakage -detection), use array types instead of TYPE_BIT_PACKED:

-

4-bit soft readout (confidence values 0-15):

-

Use TYPE_ARRAY_UINT8 with custom packing (2 measurements per byte):

-
    -
  • -

    Schema: TYPE_ARRAY_UINT8, size_bytes = ceil(num_measurements / 2), num_elements = num_measurements

    -
  • -

    Encoding: Low nibble = measurement[0], high nibble = measurement[1], etc.

    -
-

8-bit soft readout (confidence values 0-255):

-

Use TYPE_ARRAY_UINT8 with one byte per measurement:

-
    -
  • -

    Schema: TYPE_ARRAY_UINT8, size_bytes = num_measurements, num_elements = num_measurements

    -
  • -

    Encoding: byte[i] = measurement[i]

    -
-

Floating-point confidence values:

-

Use TYPE_ARRAY_FLOAT32:

-
    -
  • -

    Schema: TYPE_ARRAY_FLOAT32, size_bytes = num_measurements × 4, num_elements = num_measurements

    -
  • -

    Encoding: IEEE 754 single-precision floats, tightly packed

    -
-

Leakage/erasure-resolving readout (values beyond binary):

-

Use TYPE_ARRAY_UINT8 or TYPE_ARRAY_INT32 depending on the range of measurement outcomes (e.g., 0=ground, 1=excited, 2=leakage state).

-

1.6. Response Encoding # {#response-encoding}

-

The response is written to the TX ring buffer slot (separate from the RX buffer -that contains the request):

-
| RPCResponse | result_bytes |
-

Like the request payload, the response payload encoding is defined by the -handler schema. The schema’s results[] array specifies the type and size -of each return value.

-

1.6.1. Single-Result Response

-

For handlers returning one value, the result is written directly after the -response header.

-

Example response for a handler returning a single uint8_t:

-

Schema:

-
    -
  • -

    result0: TYPE_UINT8, size_bytes=1, num_elements=1

    -
-

Wire encoding:

-
Offset | Content                                    | Value (hex)-------|--------------------------------------------|--------------0-3    | magic (RPC_MAGIC_RESPONSE)                 | 53 51 55 434-7    | status (0 = success)                       | 00 00 00 008-11   | result_len                                 | 01 00 00 0012     | result value (uint8_t)                     | 0313-... | unused padding                             | XX XX XX XX
-

1.6.2. Multi-Result Response

-

For handlers returning multiple values, results are concatenated in schema order -(same pattern as multi-argument requests):

-
| RPCResponse | result0_bytes | result1_bytes | ... |
-

Example: Handler returning correction (uint8_t) + confidence (float)

-

Schema:

-
    -
  • -

    result0: TYPE_UINT8, size_bytes=1, num_elements=1

    -
  • -

    result1: TYPE_FLOAT32, size_bytes=4, num_elements=1

    -
-

Wire encoding:

-
Offset | Content-------|--------0-11   | RPCResponse { magic, status=0, result_len=5 }12     | correction (uint8_t)13-16  | confidence (float32, IEEE 754)
-

1.6.3. Status Codes

-
    -
  • -

    status = 0: Success

    -
  • -

    status > 0: Handler-specific error

    -
  • -

    status < 0: Protocol-level error

    -
-

1.7. QEC-Specific Usage Example # {#qec-example}

-

This section shows how the realtime messaging protocol is used for quantum -error correction (QEC) decoding. This is one application of the protocol; -other use cases follow the same pattern.

-

1.7.1. QEC Terminology

-

In QEC applications, the following terminology applies:

-
    -
  • -

    Measurement result: Raw readout value from a QPU measurement (0 or 1 for binary readout)

    -
  • -

    Detection event: XOR’d measurement results as dictated by the parity check (stabilizer) matrix

    -
  • -

    Syndrome: The full history or set of detection events used by the decoder

    -
-

The decoder consumes detection events (often called "syndrome data" colloquially) -and produces corrections.

-

1.7.2. QEC Decoder Handler

-

Typical QEC decoder signature:

-
void qec_decode(const uint8_t* detection_events, uint32_t num_events,                 uint8_t* correction);
-

Schema:

-
    -
  • -

    arg0: TYPE_BIT_PACKED, variable size (detection events, 1 bit per event)

    -
  • -

    arg1: TYPE_UINT32, 4 bytes (number of detection events)

    -
  • -

    result0: TYPE_UINT8, 1 byte (correction bit-packed)

    -
-

1.7.3. Decoding Rounds

-

For QEC applications, one RPC message typically corresponds to one decoding round -(one invocation of the decoder with a set of detection events). The boundaries of -each decoding round are determined by the quantum control system (e.g., FPGA) when -building RPC messages.

-

Note: The term "shot" is often used in quantum computing to mean one full execution -of a quantum program (repeated num_shots times for statistics). In the context -of realtime decoding, we use "decoding round" to avoid confusion, as there may be -many RPC invocations during a single quantum program execution.

-

1.7.4. Testing with Detection Event Files

-

The mock-decoder tests in cudaqx use a text file format for testing:

-
NUM_DATA <N>NUM_LOGICAL <M>ROUND_START 0<detection event bits, one per line>ROUND_START 1<detection event bits, one per line>...CORRECTIONS_START<expected corrections, one per line>CORRECTIONS_END
-

Only the numeric detection event values are encoded into RPC payloads. The -ROUND_START markers and other metadata are not transmitted on the wire.

-

Note: Existing test files may use SHOT_START for backwards compatibility; this -should be interpreted as ROUND_START in the context of realtime decoding.

-

1.8. References # {#references}

-
    -
  • -

    cudaqx/libs/qec/unittests/decoders/realtime/test_realtime_decoding.cu

    -
  • -

    cudaqx/libs/qec/unittests/decoders/realtime/data/syndromes_multi_err_lut.txt

    -
-
- \ No newline at end of file diff --git a/realtime/docs/nvqlink_latency_demo.md b/realtime/docs/nvqlink_latency_demo.md deleted file mode 100644 index c96f8a45..00000000 --- a/realtime/docs/nvqlink_latency_demo.md +++ /dev/null @@ -1,232 +0,0 @@ -# Steps to execute the NVQLink latency demo - -The source Verilog code can be found at: - - -More details about how the Holoscan Sensor Bridge (HSB) IP can be incorporated can be found at: - - -Furthermore, for this experiment, we need the Integrated Logic Analyzer (ILA) to keep the captured measurements. See the "Hololink IP: Connecting an APB ILA for Debug" section below. - -# Steps to do the experiment - -1. Load the bitfile into the FPGA. -2. Setup the host to run the experiment. Mainly the IP address of the NIC needs to be set to `192.168.0.101`. More details can be found at the *Data Channel Enumeration and IP Address Configuration* section of: - -3. Download the accompanying software from: - - - Then generate the docker: - ```sh - sudo sh ./docker/build.sh --dgpu - sudo sh ./docker/demo.sh - ``` - -To run the test, here is an example for 32B messages reported in the paper: -```sh -python3 ./examples/gpunetio_loopback.py --frame-size=32 --hololink=192.168.0.2 --rx-ibv-name=mlx5_0 --tx-ibv-name=mlx5_0 --mtu=256 -``` - -Then to capture the data from the experiment and run the latency calculation: -```sh -python3 ila.py -python3 latency_analysis.py -``` -(These two python scripts can be found next to the Verilog source code). - -# Hololink IP: Connecting an APB ILA for Debug - -This guide describes how to attach an Integrated Logic Analyzer (ILA) to one of the Hololink IP's APB register interfaces for real-time signal capture and debugging over Ethernet. - -## Overview - -The Hololink IP exposes multiple APB register interfaces via the `REG_INST` parameter (defined in `HOLOLINK_def.svh`). These interfaces can be used to connect custom user logic, including ILAs, for monitoring internal signals. - -In this example, we connect the `s_apb_ila` module to **APB[2]** and configure it to capture PTP timestamps, frame information, and other debug signals. - -## APB Interface Signals from Hololink - -The Hololink IP provides the following APB signals for user register interfaces: - -```systemverilog -// From HOLOLINK_top outputs -logic [`REG_INST-1:0] apb_psel; // Per-interface select -logic apb_penable; // Common enable -logic [31:0] apb_paddr; // Common address bus -logic [31:0] apb_pwdata; // Common write data -logic apb_pwrite; // Common write enable - -// To HOLOLINK_top inputs -logic [`REG_INST-1:0] apb_pready; // Per-interface ready -logic [31:0] apb_prdata [`REG_INST-1:0]; // Per-interface read data -logic [`REG_INST-1:0] apb_pserr; // Per-interface error -``` - -## Step 1: Tie Off Unused APB Interfaces - -For any APB interfaces not in use, tie off the signals appropriately: - -```systemverilog -// Tie off unused APB bus signals -assign apb_pserr[7:3] = '0; -assign apb_pserr[1:0] = '0; -assign apb_pready[7:3] = '1; -assign apb_pready[1:0] = '0; -``` - -> **Note:** APB[2] is left unassigned here since it will be connected to the ILA. - ---- - -## Step 2: Create APB Interface Structs for the ILA - -The `s_apb_ila` module uses the `apb_m2s` and `apb_s2m` struct types from `apb_pkg`. Declare the interface signals: - -```systemverilog -import apb_pkg::*; - -apb_m2s ila_apb_m2s; -apb_s2m ila_apb_s2m; -``` - ---- - -## Step 3: Instantiate the s_apb_ila Module - -The `s_apb_ila` module is part of the Hololink IP library (`lib_apb/s_apb_ila.sv`). - -```systemverilog -localparam ILA_DATA_WIDTH = 256; - -s_apb_ila #( - .DEPTH ( 65536 ), - .W_DATA ( ILA_DATA_WIDTH ) -) u_apb_ila ( - // APB Interface (slow clock domain) - .i_aclk ( apb_clk ), - .i_arst ( apb_rst ), - .i_apb_m2s ( ila_apb_m2s ), - .o_apb_s2m ( ila_apb_s2m ), - - // User Capture Interface (fast clock domain) - .i_pclk ( hif_clk ), - .i_prst ( hif_rst ), - .i_trigger ( '1 ), // Always triggered - .i_enable ( '1 ), // Always enabled - .i_wr_data ( ila_wr_data ), // Data to capture - .i_wr_en ( ptp_ts_en ), // Write enable - .o_ctrl_reg ( ) // Optional control output -); -``` - ---- - -## Step 4: Connect APB[2] to the ILA - -Map the Hololink APB signals to the ILA's struct interface: - -```systemverilog -// APB Master-to-Slave signals (from Hololink to ILA) -assign ila_apb_m2s.psel = apb_psel[2]; // Select APB interface 2 -assign ila_apb_m2s.penable = apb_penable; -assign ila_apb_m2s.paddr = apb_paddr; -assign ila_apb_m2s.pwdata = apb_pwdata; -assign ila_apb_m2s.pwrite = apb_pwrite; - -// APB Slave-to-Master signals (from ILA back to Hololink) -assign apb_pready[2] = ila_apb_s2m.pready; -assign apb_prdata[2] = ila_apb_s2m.prdata; -assign apb_pserr[2] = ila_apb_s2m.pserr; -``` - ---- - -## Step 5: Define the Write Data Vector - -Structure the `ila_wr_data` signal to capture the signals of interest. Here's the example configuration used: - -```systemverilog -localparam ILA_DATA_WIDTH = 256; -logic [ILA_DATA_WIDTH-1:0] ila_wr_data; - -// Bit assignments -assign ila_wr_data[63:0] = ptp_ts[63:0]; // PTP timestamp from sensor frame -assign ila_wr_data[127:64] = {ptp_sec_sync_usr[31:0], // Synchronized PTP seconds - ptp_nsec_sync_usr[31:0]}; // Synchronized PTP nanoseconds -assign ila_wr_data[139:128] = frame_cnt; // 12-bit frame counter -assign ila_wr_data[140] = sof; // Start of frame -assign ila_wr_data[141] = eof; // End of frame -assign ila_wr_data[255:142] = 'h123456789ABCDEF; // Debug pattern (filler) -``` - -### Write Data Bit Map Summary - -| Bits | Width | Signal | Description | -|------|-------|--------|-------------| -| [63:0] | 64 | `ptp_ts` | PTP timestamp extracted from sensor TX data | -| [127:64] | 64 | `{ptp_sec, ptp_nsec}` | Synchronized PTP time (seconds + nanoseconds) from Hololink | -| [139:128] | 12 | `frame_cnt` | Frame counter extracted from sensor TX data | -| [140] | 1 | `sof` | Start of frame indicator | -| [141] | 1 | `eof` | End of frame indicator | -| [255:142] | 114 | Debug pattern | Fixed pattern for debugging | - -> **Note:** `ptp_sec_sync_usr` and `ptp_nsec_sync_usr` are the PTP time outputs from Hololink (`o_ptp_sec`, `o_ptp_nanosec`) synchronized to the host interface clock domain. - ---- - -## Step 6: Supporting Logic - -### Frame Detection - -```systemverilog -logic sof, eof; -assign sof = sif_tx_axis_tvalid[0]; // SOF on first valid -assign eof = sif_tx_axis_tlast[0]; // EOF on last -``` - -### Timestamp Capture - -```systemverilog -logic [79:0] ptp_ts; -logic ptp_ts_en; -logic [11:0] frame_cnt; - -always_ff @(posedge hif_clk) begin - if (hif_rst) begin - ptp_ts <= '0; - ptp_ts_en <= '0; - frame_cnt <= '0; - end - else begin - ptp_ts <= (sof) ? sif_tx_axis_tdata[0][79:0] : ptp_ts; - frame_cnt <= (sof) ? sif_tx_axis_tdata[0][91:80] : frame_cnt; - ptp_ts_en <= sof; - end -end -``` - ---- - -## Sensor RX Interface Tie-Off - -In this configuration, only the **Sensor TX interface** is used (for receiving data from the host). The Sensor RX interface is not used and should be tied off as follows: - -```systemverilog -// Sensor Rx Streaming Interface - Tie off (not used) -.i_sif_axis_tvalid ( '0 ), -.i_sif_axis_tlast ( '0 ), -.i_sif_axis_tdata ( '{default:0} ), -.i_sif_axis_tkeep ( '{default:0} ), -.i_sif_axis_tuser ( '{default:0} ), -.o_sif_axis_tready ( ), // Leave unconnected -``` - -The Sensor TX interface (`o_sif_axis_*`) should have `i_sif_axis_tready` tied high to always accept data: - -```systemverilog -.i_sif_axis_tready ( '1 ), -``` - ---- - -Once integrated, the ILA data can be accessed via APB register reads from the host over Ethernet using the Hololink control plane. diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h deleted file mode 100644 index e484a69c..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h +++ /dev/null @@ -1,346 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Opaque handles -typedef struct cudaq_dispatch_manager_t cudaq_dispatch_manager_t; -typedef struct cudaq_dispatcher_t cudaq_dispatcher_t; - -// Error codes -typedef enum { - CUDAQ_OK = 0, - CUDAQ_ERR_INVALID_ARG = 1, - CUDAQ_ERR_INTERNAL = 2, - CUDAQ_ERR_CUDA = 3 -} cudaq_status_t; - -// Dispatcher backend: device persistent kernel vs host-side loop -typedef enum { - CUDAQ_BACKEND_DEVICE_KERNEL = 0, - CUDAQ_BACKEND_HOST_LOOP = 1 -} cudaq_backend_t; - -// TX flag status returned by cudaq_host_ringbuffer_poll_tx_flag. -typedef enum { - CUDAQ_TX_EMPTY = 0, - CUDAQ_TX_IN_FLIGHT = 1, - CUDAQ_TX_ERROR = 2, - CUDAQ_TX_READY = 3 -} cudaq_tx_status_t; - -// RPC wire-format constants (must match dispatch_kernel_launch.h). -#define CUDAQ_RPC_MAGIC_REQUEST 0x43555152u /* 'CUQR' */ -#define CUDAQ_RPC_MAGIC_RESPONSE 0x43555153u /* 'CUQS' */ -#define CUDAQ_RPC_HEADER_SIZE 12u /* 3 x uint32_t */ - -// Kernel synchronization type -typedef enum { - CUDAQ_KERNEL_REGULAR = 0, - CUDAQ_KERNEL_COOPERATIVE = 1 -} cudaq_kernel_type_t; - -// Dispatch invocation mode. -// For CUDAQ_BACKEND_HOST_LOOP only GRAPH_LAUNCH is dispatched; DEVICE_CALL and -// HOST_CALL table entries are dropped (slot cleared and advanced). -typedef enum { - CUDAQ_DISPATCH_DEVICE_CALL = 0, - CUDAQ_DISPATCH_GRAPH_LAUNCH = 1, - CUDAQ_DISPATCH_HOST_CALL = 2 -} cudaq_dispatch_mode_t; - -// Payload type identifiers (matching PayloadTypeID in dispatch_kernel_launch.h) -typedef enum { - CUDAQ_TYPE_UINT8 = 0x10, - CUDAQ_TYPE_INT32 = 0x11, - CUDAQ_TYPE_INT64 = 0x12, - CUDAQ_TYPE_FLOAT32 = 0x13, - CUDAQ_TYPE_FLOAT64 = 0x14, - CUDAQ_TYPE_ARRAY_UINT8 = 0x20, - CUDAQ_TYPE_ARRAY_INT32 = 0x21, - CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, - CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, - CUDAQ_TYPE_BIT_PACKED = 0x30 -} cudaq_payload_type_t; - -// Type descriptor for arguments/results -typedef struct { - uint8_t type_id; // cudaq_payload_type_t value - uint8_t reserved[3]; // padding - uint32_t size_bytes; // total size in bytes - uint32_t num_elements; // number of elements (for arrays) -} cudaq_type_desc_t; - -// Handler schema describing function signature -typedef struct { - uint8_t num_args; // number of arguments - uint8_t num_results; // number of results - uint16_t reserved; // padding - cudaq_type_desc_t args[8]; // argument descriptors (max 8) - cudaq_type_desc_t results[4]; // result descriptors (max 4) -} cudaq_handler_schema_t; - -// Dispatcher configuration -typedef struct { - int device_id; // GPU device ID (>=0) - uint32_t num_blocks; // grid size - uint32_t threads_per_block; // block size - uint32_t num_slots; // ring buffer slots - uint32_t slot_size; // bytes per slot - uint32_t vp_id; // virtual port ID - cudaq_kernel_type_t kernel_type; // regular/cooperative kernel - cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch - cudaq_backend_t backend; // device kernel or host loop (default DEVICE_KERNEL) -} cudaq_dispatcher_config_t; - -// GPU ring buffer pointers. For device backend use device pointers only. -// For CUDAQ_BACKEND_HOST_LOOP, also set the _host pointers (same pinned -// mapped allocation); the host loop polls rx_flags_host and uses host data. -typedef struct { - volatile uint64_t *rx_flags; // device pointer - volatile uint64_t *tx_flags; // device pointer - uint8_t *rx_data; // device pointer to RX data buffer - uint8_t *tx_data; // device pointer to TX data buffer - size_t rx_stride_sz; // size of each RX slot in bytes - size_t tx_stride_sz; // size of each TX slot in bytes - // Host-side view (required when backend == CUDAQ_BACKEND_HOST_LOOP; NULL - // otherwise) - volatile uint64_t *rx_flags_host; - volatile uint64_t *tx_flags_host; - uint8_t *rx_data_host; - uint8_t *tx_data_host; -} cudaq_ringbuffer_t; - -// Host RPC callback: reads RPCHeader + args from slot, writes RPCResponse + -// result. slot_host is the host pointer to the slot (same layout as device -// slot). -typedef void (*cudaq_host_rpc_fn_t)(void *slot_host, size_t slot_size); - -// Unified function table entry with schema -typedef struct { - union { - void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL - cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH - cudaq_host_rpc_fn_t host_fn; // for CUDAQ_DISPATCH_HOST_CALL - } handler; - uint32_t function_id; // hash of function name (FNV-1a) - uint8_t dispatch_mode; // cudaq_dispatch_mode_t value - uint8_t reserved[3]; // padding - cudaq_handler_schema_t schema; // function signature schema -} cudaq_function_entry_t; - -// Function table for device-side dispatch -typedef struct { - cudaq_function_entry_t *entries; // device pointer to array of entries - uint32_t count; // number of entries -} cudaq_function_table_t; - -// Host launch function pointer type -typedef void (*cudaq_dispatch_launch_fn_t)( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, - uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a) -void cudaq_launch_dispatch_kernel_regular( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, - uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -void cudaq_launch_dispatch_kernel_cooperative( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, - uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, - cudaq_function_entry_t *function_table, size_t func_count, - volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, - uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); - -// Graph-enabled dispatch kernels (requires compute capability 9.0+, sm_90+) -// These functions are only available when compiled for sm_90 or higher -#if defined(__CUDACC__) || defined(CUDA_VERSION) - -//============================================================================== -// Graph-Based Dispatch API (Proper Device-Side Graph Launch Support) -//============================================================================== -// -// These functions properly support device-side cudaGraphLaunch() by wrapping -// the dispatch kernel in a graph that is instantiated with -// cudaGraphInstantiateFlagDeviceLaunch. -// -// Usage: -// 1. Allocate a GraphIOContext on the device (cudaMalloc) -// 2. Call cudaq_create_dispatch_graph_regular() to create the graph context -// 3. Call cudaq_launch_dispatch_graph() to launch the dispatch kernel -// 4. When done, call cudaq_destroy_dispatch_graph() to cleanup -// -// The dispatch kernel fills the GraphIOContext before each fire-and-forget -// graph launch. The graph kernel reads input from io_ctx->rx_slot, writes -// the RPCResponse to io_ctx->tx_slot, and signals completion by writing -// io_ctx->tx_flag_value to *io_ctx->tx_flag after a __threadfence_system(). - -// Forward declaration for GraphIOContext (defined in dispatch_kernel_launch.h) -struct cudaq_graph_io_context; - -// Opaque handle for graph-based dispatch context -typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; - -// Create a graph-based dispatch context for the regular kernel type. -// This creates a graph containing the dispatch kernel, instantiates it with -// cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. -// -// graph_io_ctx: Device pointer to a GraphIOContext struct. The dispatch -// kernel fills this before each fire-and-forget child graph launch so -// the graph kernel knows where to read input and write output. -// -// Returns cudaSuccess on success, or an error code on failure. -cudaError_t cudaq_create_dispatch_graph_regular( - volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, - uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, - cudaq_function_entry_t *function_table, size_t func_count, - void *graph_io_ctx, volatile int *shutdown_flag, uint64_t *stats, - size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, - cudaStream_t stream, cudaq_dispatch_graph_context **out_context); - -// Launch the dispatch graph. The dispatch kernel inside this graph can call -// cudaGraphLaunch() to launch child graphs from device code. -cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, - cudaStream_t stream); - -// Destroy the dispatch graph context and release all resources. -cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); - -#endif - -// Manager lifecycle -cudaq_status_t -cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr); -cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr); - -// Dispatcher lifecycle -cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *mgr, - const cudaq_dispatcher_config_t *config, - cudaq_dispatcher_t **out_dispatcher); -cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher); - -// Wiring inputs -cudaq_status_t -cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, - const cudaq_ringbuffer_t *ringbuffer); -cudaq_status_t -cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, - const cudaq_function_table_t *table); -cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, - volatile int *shutdown_flag, - uint64_t *stats); -cudaq_status_t -cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, - cudaq_dispatch_launch_fn_t launch_fn); - -// Optional: provide a caller-managed pinned mailbox for GRAPH_LAUNCH workers. -// h_mailbox_bank must be allocated with cudaHostAlloc(..., cudaHostAllocMapped) -// and sized to at least (num_graph_launch_entries * sizeof(void*)). -// If set, the dispatcher uses this mailbox instead of allocating its own. -// The caller retains ownership and must free it after cudaq_dispatcher_destroy. -cudaq_status_t cudaq_dispatcher_set_mailbox(cudaq_dispatcher_t *dispatcher, - void **h_mailbox_bank); - -// Start/stop -cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher); -cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher); - -// Stats -cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, - uint64_t *out_packets); - -//============================================================================== -// Host dispatcher backend (CUDAQ_BACKEND_HOST_LOOP) -//============================================================================== -// When config.backend == CUDAQ_BACKEND_HOST_LOOP, start() uses these instead -// of launch_fn. The realtime lib calls them; implementation is in -// libcudaq-realtime-host-dispatch. - -typedef struct cudaq_host_dispatcher_handle cudaq_host_dispatcher_handle_t; - -// Start the host dispatcher loop in a new thread. Call from -// cudaq_dispatcher_start when backend is CUDAQ_BACKEND_HOST_LOOP. Returns a -// handle for stop, or NULL on error. If external_mailbox is non-NULL, uses it -// instead of allocating internally. -cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread( - const cudaq_ringbuffer_t *ringbuffer, const cudaq_function_table_t *table, - const cudaq_dispatcher_config_t *config, volatile int *shutdown_flag, - uint64_t *stats, void **external_mailbox); - -// Stop the host dispatcher thread and free resources. -void cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t *handle); - -// Release a worker back to the idle pool (handle-level, called by API layer). -cudaq_status_t -cudaq_host_dispatcher_release_worker(cudaq_host_dispatcher_handle_t *handle, - int worker_id); - -//============================================================================== -// Ring buffer slot helpers (producer / consumer side) -//============================================================================== -// These encapsulate the RPC wire format and flag-signalling protocol so that -// producers and consumers don't need to know about magic constants, the -// "address-as-flag" convention, or the tx_flags state machine. - -// Write an RPC request (RPCHeader + payload) into slot `slot_idx`. -// payload_len must satisfy CUDAQ_RPC_HEADER_SIZE + payload_len <= rx_stride_sz. -cudaq_status_t cudaq_host_ringbuffer_write_rpc_request( - const cudaq_ringbuffer_t *rb, uint32_t slot_idx, uint32_t function_id, - const void *payload, uint32_t payload_len); - -// Signal that slot `slot_idx` has data ready for the dispatcher. -// Stores the host address of the slot into rx_flags_host[slot_idx]. -void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx); - -// Poll tx_flags_host[slot_idx] and classify the result. -// If status == CUDAQ_TX_ERROR and out_cuda_error is non-NULL, the CUDA error -// code is written there. -cudaq_tx_status_t -cudaq_host_ringbuffer_poll_tx_flag(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx, int *out_cuda_error); - -// Check whether a slot is available for reuse (both rx and tx flags are 0). -int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx); - -// Clear tx_flags_host[slot_idx] after consuming the response. -void cudaq_host_ringbuffer_clear_slot(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx); - -// Release a worker back to the idle pool after the graph has completed. -// This is the consumer-side counterpart to the dispatcher's internal -// idle_mask acquisition — without this call the worker stays "busy" forever. -cudaq_status_t cudaq_host_release_worker(cudaq_dispatcher_t *dispatcher, - int worker_id); - -// Force eager CUDA module loading for dispatch kernels (occupancy query). -// Call before cudaq_dispatcher_start() to avoid lazy-loading deadlocks. -cudaError_t cudaq_dispatch_kernel_query_occupancy(int *out_blocks, - uint32_t threads_per_block); -cudaError_t -cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks, - uint32_t threads_per_block); - -#ifdef __cplusplus -} -#endif diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh deleted file mode 100644 index 1ebef291..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh +++ /dev/null @@ -1,62 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -/// @file dispatch_kernel.cuh -/// @brief Dispatch kernel declarations for external projects. -/// -/// The dispatch kernel implementation now lives in a separate CUDA TU -/// (dispatch_kernel.cu) and is linked into libcudaq-realtime.so. This header -/// provides declarations and inline wrappers for the launch functions. - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" -#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" - -#include -#include - -namespace cudaq::realtime { - -//============================================================================== -// Kernel Launch Function Declarations (with schema-driven function table) -//============================================================================== -// These declarations match the extern "C" functions defined in -// dispatch_kernel.cu and cudaq_realtime.h - -/// @brief Inline wrapper for regular kernel (schema-aware). -inline void launch_dispatch_kernel_regular_inline( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, - function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, - threads_per_block, stream); -} - -/// @brief Inline wrapper for cooperative kernel (schema-aware). -inline void launch_dispatch_kernel_cooperative_inline( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - cudaq_launch_dispatch_kernel_cooperative( - rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, - function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, - threads_per_block, stream); -} - -} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h deleted file mode 100644 index d5eaf6bf..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h +++ /dev/null @@ -1,132 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include -#include - -namespace cudaq::realtime { - -//============================================================================== -// RPC Protocol Structures (Wire Format) -//============================================================================== - -/// @brief RPC request header - wire format for function dispatch. -/// Must be wire-compatible with cuda-quantum RPC protocol. -struct __attribute__((packed)) RPCHeader { - std::uint32_t magic; ///< Magic value to validate message framing - std::uint32_t function_id; ///< Hash of function name (FNV-1a) - std::uint32_t arg_len; ///< Length of argument data in bytes -}; - -/// @brief RPC response header - returned to caller. -struct __attribute__((packed)) RPCResponse { - std::uint32_t magic; ///< Magic value to validate message framing - std::int32_t status; ///< Return status (0 = success) - std::uint32_t result_len; ///< Length of result data in bytes -}; - -//============================================================================== -// Device Function Type -//============================================================================== - -/// @brief Device RPC function signature. -/// -/// The handler reads arguments from the input buffer and writes results -/// directly to the output buffer. The two buffers never overlap, which -/// enables the dispatch kernel to point `output` straight into the TX -/// ring-buffer slot, eliminating a post-handler copy. -/// -/// @param input Pointer to argument data (RX buffer, read-only) -/// @param output Pointer to result buffer (TX buffer, write-only) -/// @param arg_len Length of argument data in bytes -/// @param max_result_len Maximum result buffer size in bytes -/// @param result_len Output: actual result length written -/// @return Status code (0 = success) -using DeviceRPCFunction = int (*)(const void *input, void *output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t *result_len); - -//============================================================================== -// Function ID Hashing -//============================================================================== - -/// @brief Compute FNV-1a hash of a string (for function_id). -/// @param str Null-terminated string to hash -/// @return 32-bit hash value -constexpr std::uint32_t fnv1a_hash(const char *str) { - std::uint32_t hash = 2166136261u; - while (*str) { - hash ^= static_cast(*str++); - hash *= 16777619u; - } - return hash; -} - -// RPC framing magic values (ASCII: CUQ?). -constexpr std::uint32_t RPC_MAGIC_REQUEST = 0x43555152; // 'CUQR' -constexpr std::uint32_t RPC_MAGIC_RESPONSE = 0x43555153; // 'CUQS' - -//============================================================================== -// Graph IO Context (for CUDAQ_DISPATCH_GRAPH_LAUNCH) -//============================================================================== - -/// @brief IO context passed to graph-launched RPC handlers via pointer -/// indirection. -/// -/// The dispatch kernel fills this context before each fire-and-forget graph -/// launch so the graph kernel knows where to read input, where to write the -/// response, and how to signal completion. The graph kernel is responsible -/// for writing the RPCResponse header to `tx_slot` and then setting -/// `*tx_flag = tx_flag_value` after a `__threadfence_system()`. -struct GraphIOContext { - void *rx_slot; ///< Input: RX slot (RPCHeader + `args`) - std::uint8_t *tx_slot; ///< Output: TX slot for RPCResponse - volatile std::uint64_t *tx_flag; ///< Pointer to TX flag for this slot - std::uint64_t tx_flag_value; ///< Value to write to tx_flag when done - std::size_t tx_stride_sz; ///< TX slot size (for max_result_len) -}; - -//============================================================================== -// Schema-Driven Type System -//============================================================================== - -/// @brief Standardized payload type identifiers for RPC arguments/results. -enum PayloadTypeID : std::uint8_t { - TYPE_UINT8 = 0x10, - TYPE_INT32 = 0x11, - TYPE_INT64 = 0x12, - TYPE_FLOAT32 = 0x13, - TYPE_FLOAT64 = 0x14, - TYPE_ARRAY_UINT8 = 0x20, - TYPE_ARRAY_INT32 = 0x21, - TYPE_ARRAY_FLOAT32 = 0x22, - TYPE_ARRAY_FLOAT64 = 0x23, - TYPE_BIT_PACKED = 0x30 -}; - -/// @brief Type descriptor for a single argument or result. -struct __attribute__((packed)) cudaq_type_desc_t { - std::uint8_t type_id; ///< PayloadTypeID value - std::uint8_t reserved[3]; ///< Padding for alignment - std::uint32_t size_bytes; ///< Total size in bytes - std::uint32_t num_elements; ///< Number of elements (for arrays) -}; - -/// @brief Handler schema describing argument and result types. -struct __attribute__((packed)) cudaq_handler_schema_t { - std::uint8_t num_args; ///< Number of arguments - std::uint8_t num_results; ///< Number of results - std::uint16_t reserved; ///< Padding for alignment - cudaq_type_desc_t args[8]; ///< Argument type descriptors (max 8) - cudaq_type_desc_t results[4]; ///< Result type descriptors (max 4) -}; - -} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h deleted file mode 100644 index d34c0b83..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h +++ /dev/null @@ -1,64 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include - -namespace cudaq::realtime { - -/// @brief Device call dispatch mode - direct __device__ function call. -/// -/// The handler function is called directly from within the dispatch kernel. -/// This is the simplest and lowest-latency dispatch mode, suitable for -/// lightweight handlers like simple decoders or data transformations. -struct DeviceCallMode { - /// @brief Dispatch to handler via direct device function call. - /// - /// @tparam HandlerFunc Function pointer type - /// @tparam ContextType Context structure type - /// @tparam Args Additional argument types - /// @param handler The __device__ function to call - /// @param ctx Handler context (matrices, dimensions, etc.) - /// @param args Additional arguments - template - __device__ static void dispatch(HandlerFunc handler, ContextType &ctx, - Args... args) { - handler(ctx, args...); - } -}; - -/// @brief Graph launch dispatch mode - launches a CUDA graph from device. -/// -/// The handler is a pre-captured CUDA graph that gets launched from the -/// persistent kernel. This is suitable for complex multi-kernel workflows -/// that benefit from graph optimization. -/// -/// NOTE: Requires the graph to be captured and stored in the context at -/// initialization time. The context must contain graph_exec handle. -struct GraphLaunchMode { - /// @brief Dispatch via CUDA graph launch from device. - /// - /// @tparam ContextType Context structure type (must have graph_exec member) - /// @param ctx Handler context containing the graph executable - template - __device__ static void dispatch(ContextType &ctx) { -// Device graph launch requires CUDA 12.0+ and appropriate context setup -// The graph_exec must be a cudaGraphExec_t captured at initialization -#if __CUDA_ARCH__ >= 900 - // cudaGraphLaunch is available from device code on Hopper+ - // Note: This is a placeholder - actual implementation requires - // the graph_exec to be properly set up in the context - if (ctx.graph_exec != nullptr) { - cudaGraphLaunch(ctx.graph_exec, ctx.stream); - } -#endif - } -}; - -} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h b/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h deleted file mode 100644 index 9b7c5ca6..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/host_dispatcher.h +++ /dev/null @@ -1,84 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#pragma once - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" - -#include -#include -#include -#include -#include - -#ifndef QEC_CPU_RELAX -#if defined(__x86_64__) -#include -#define QEC_CPU_RELAX() _mm_pause() -#elif defined(__aarch64__) -#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") -#else -#define QEC_CPU_RELAX() \ - do { \ - } while (0) -#endif -#endif - -namespace cudaq::realtime { - -using atomic_uint64_sys = cuda::std::atomic; -using atomic_int_sys = cuda::std::atomic; - -struct HostDispatchWorker { - cudaGraphExec_t graph_exec; - cudaStream_t stream; - uint32_t - function_id; // matches table entry; used to assign slot to this worker - void (*pre_launch_fn)(void *user_data, void *slot_dev, - cudaStream_t stream) = nullptr; - void *pre_launch_data = nullptr; - void (*post_launch_fn)(void *user_data, void *slot_dev, - cudaStream_t stream) = nullptr; - void *post_launch_data = nullptr; -}; - -struct HostDispatcherConfig { - atomic_uint64_sys *rx_flags; - atomic_uint64_sys *tx_flags; - uint8_t *rx_data_host; - uint8_t *rx_data_dev; - uint8_t *tx_data_host; - uint8_t *tx_data_dev; - size_t tx_stride_sz; - void **h_mailbox_bank; - size_t num_slots; - size_t slot_size; - std::vector workers; - /// Host-visible function table for lookup by function_id (GRAPH_LAUNCH only; - /// others dropped). - cudaq_function_entry_t *function_table = nullptr; - size_t function_table_count = 0; - atomic_int_sys *shutdown_flag; - uint64_t *stats_counter; - /// Optional: atomic counter incremented on each dispatch (for progress - /// diagnostics). - atomic_uint64_sys *live_dispatched = nullptr; - - /// Dynamic worker pool (graph workers only) - atomic_uint64_sys *idle_mask; ///< 1 = free, 0 = busy; bit index = worker_id - int *inflight_slot_tags; ///< worker_id -> origin FPGA slot for tx_flags - ///< routing -}; - -/// Run the host-side dispatcher loop. Blocks until *config.shutdown_flag -/// becomes non-zero. Call from a dedicated thread. -/// Uses dynamic worker pool: allocates via idle_mask, tags with -/// inflight_slot_tags. -void host_dispatcher_loop(const HostDispatcherConfig &config); - -} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h b/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h deleted file mode 100644 index b7efcac1..00000000 --- a/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h +++ /dev/null @@ -1,39 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -#include -#include - -namespace cudaq::realtime { - -/// @brief Regular kernel synchronization using __syncthreads(). -/// -/// Use this for single-block kernels or when only block-level synchronization -/// is needed. Suitable for simple decode handlers that don't require -/// grid-wide coordination. -struct RegularKernel { - /// @brief Not a cooperative kernel -- handler is called by thread 0 only. - static constexpr bool is_cooperative = false; - /// @brief Synchronize threads within a block. - __device__ static void sync() { __syncthreads(); } -}; - -/// @brief Cooperative kernel synchronization using grid.sync(). -/// -/// Use this for multi-block kernels that need grid-wide synchronization, -/// such as complex decoders with data dependencies across blocks. -/// Requires kernel to be launched with cudaLaunchCooperativeKernel. -struct CooperativeKernel { - /// @brief Cooperative kernel -- handler is called by ALL threads. - static constexpr bool is_cooperative = true; - __device__ static void sync() { cooperative_groups::this_grid().sync(); } -}; - -} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h deleted file mode 100644 index d5fb254a..00000000 --- a/realtime/include/cudaq/realtime/hololink_bridge_common.h +++ /dev/null @@ -1,502 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#pragma once - -/// @file hololink_bridge_common.h -/// @brief Header-only bridge skeleton for Hololink-based RPC dispatch. -/// -/// Provides common infrastructure used by all Hololink bridge tools: -/// - Command-line argument parsing for IB device, peer IP, QP, etc. -/// - Hololink transceiver creation and QP connection -/// - Dispatch kernel wiring via the cudaq host API -/// - Main run loop with diagnostics -/// - Graceful shutdown -/// -/// Each concrete bridge tool (generic increment, mock decoder, real decoder) -/// implements a small main() that: -/// 1. Parses any tool-specific arguments -/// 2. Sets up its RPC function table on the GPU -/// 3. Calls bridge_run() with a BridgeConfig struct -/// -/// This header is compiled by a standard C++ compiler; all CUDA and Hololink -/// calls go through C interfaces (cudaq_realtime.h, hololink_wrapper.h). - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" - -// Hololink C wrapper (link against hololink_wrapper_bridge static library) -#include "hololink_wrapper.h" - -namespace cudaq::realtime { - -//============================================================================== -// CUDA Error Checking -//============================================================================== - -#ifndef BRIDGE_CUDA_CHECK -#define BRIDGE_CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \ - << cudaGetErrorString(err) << std::endl; \ - return 1; \ - } \ - } while (0) -#endif - -//============================================================================== -// Global Signal Handler -//============================================================================== - -namespace detail { -inline std::atomic &bridge_shutdown_flag() { - static std::atomic flag{false}; - return flag; -} -inline void bridge_signal_handler(int) { bridge_shutdown_flag() = true; } -} // namespace detail - -//============================================================================== -// Bridge Configuration -//============================================================================== - -/// @brief Configuration for the bridge's Hololink and dispatch kernel setup. -struct BridgeConfig { - // IB / network - std::string device = "rocep1s0f0"; ///< IB device name - std::string peer_ip = "10.0.0.2"; ///< FPGA/emulator IP - uint32_t remote_qp = 0x2; ///< Remote QP number (FPGA default: 2) - int gpu_id = 0; ///< GPU device ID - int timeout_sec = 60; ///< Runtime timeout in seconds - - // Ring buffer sizing - size_t frame_size = 256; ///< Minimum frame size (RPCHeader + payload) - size_t page_size = - 384; ///< Ring buffer slot size (>= frame_size, 128-aligned) - unsigned num_pages = 64; ///< Number of ring buffer slots - - // QP exchange (emulator mode) - bool exchange_qp = false; ///< Use QP exchange protocol - int exchange_port = 12345; ///< TCP port for QP exchange - - // Dispatch kernel config - cudaq_function_entry_t *d_function_entries = nullptr; ///< GPU function table - size_t func_count = 0; ///< Number of entries - - /// @brief Dispatch kernel grid configuration. - /// Defaults match the regular (non-cooperative) kernel. - cudaq_kernel_type_t kernel_type = CUDAQ_KERNEL_REGULAR; - uint32_t num_blocks = 1; - uint32_t threads_per_block = 32; - - /// @brief Pointer to the dispatch kernel launch function. - /// Default: cudaq_launch_dispatch_kernel_regular - cudaq_dispatch_launch_fn_t launch_fn = nullptr; - - /// @brief Optional cleanup callback invoked during shutdown. - std::function cleanup_fn; -}; - -//============================================================================== -// Common Argument Parsing -//============================================================================== - -/// @brief Parse common bridge arguments from the command line. -/// -/// Recognized flags: `--device=`, `--peer-ip=`, `--remote-qp=`, `--gpu=`, -/// `--timeout=`, `--page-size=`, `--num-pages=`, `--exchange-qp`, -/// `--exchange-port=`. Unknown flags are silently ignored (so tool-specific -/// flags can co-exist). -/// -/// @param argc Argument count -/// @param argv Argument vector -/// @param [out] config Bridge configuration to populate -inline void parse_bridge_args(int argc, char *argv[], BridgeConfig &config) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - if (arg.find("--device=") == 0) - config.device = arg.substr(9); - else if (arg.find("--peer-ip=") == 0) - config.peer_ip = arg.substr(10); - else if (arg.find("--remote-qp=") == 0) - config.remote_qp = std::stoul(arg.substr(12), nullptr, 0); - else if (arg.find("--gpu=") == 0) - config.gpu_id = std::stoi(arg.substr(6)); - else if (arg.find("--timeout=") == 0) - config.timeout_sec = std::stoi(arg.substr(10)); - else if (arg.find("--page-size=") == 0) - config.page_size = std::stoull(arg.substr(12)); - else if (arg.find("--num-pages=") == 0) - config.num_pages = std::stoul(arg.substr(12)); - else if (arg == "--exchange-qp") - config.exchange_qp = true; - else if (arg.find("--exchange-port=") == 0) - config.exchange_port = std::stoi(arg.substr(16)); - } -} - -//============================================================================== -// Bridge Run Function -//============================================================================== - -/// @brief Run the Hololink bridge with the given configuration. -/// -/// This function: -/// 1. Initialises CUDA on the configured GPU -/// 2. Creates the Hololink transceiver and connects the QP -/// 3. Forces eager CUDA module loading -/// 4. Wires the cudaq dispatch kernel to the Hololink ring buffers -/// 5. Launches Hololink RX+TX kernels -/// 6. Runs the main diagnostic loop until timeout or signal -/// 7. Performs orderly shutdown -/// -/// The caller must set config.d_function_entries and config.func_count -/// before calling this function. -/// -/// @param config Fully-populated bridge configuration -/// @return 0 on success, non-zero on error -inline int bridge_run(BridgeConfig &config) { - signal(SIGINT, detail::bridge_signal_handler); - signal(SIGTERM, detail::bridge_signal_handler); - - auto &g_shutdown = detail::bridge_shutdown_flag(); - - //============================================================================ - // [1] Initialize CUDA - //============================================================================ - std::cout << "\n[1/5] Initializing CUDA..." << std::endl; - BRIDGE_CUDA_CHECK(cudaSetDevice(config.gpu_id)); - - cudaDeviceProp prop; - BRIDGE_CUDA_CHECK(cudaGetDeviceProperties(&prop, config.gpu_id)); - std::cout << " GPU: " << prop.name << std::endl; - - //============================================================================ - // [2] Create Hololink transceiver - //============================================================================ - std::cout << "\n[2/5] Creating Hololink transceiver..." << std::endl; - - // Ensure page_size >= frame_size - if (config.page_size < config.frame_size) { - std::cout << " Adjusting page_size from " << config.page_size << " to " - << config.frame_size << " to fit frame" << std::endl; - config.page_size = config.frame_size; - } - - std::cout << " Frame size: " << config.frame_size << " bytes" << std::endl; - std::cout << " Page size: " << config.page_size << " bytes" << std::endl; - std::cout << " Num pages: " << config.num_pages << std::endl; - - hololink_transceiver_t transceiver = hololink_create_transceiver( - config.device.c_str(), 1, // ib_port - config.frame_size, config.page_size, config.num_pages, - "0.0.0.0", // deferred connection - 0, // forward = false - 1, // rx_only = true - 1 // tx_only = true - ); - - if (!transceiver) { - std::cerr << "ERROR: Failed to create Hololink transceiver" << std::endl; - return 1; - } - - if (!hololink_start(transceiver)) { - std::cerr << "ERROR: Failed to start Hololink transceiver" << std::endl; - hololink_destroy_transceiver(transceiver); - return 1; - } - - // Connect QP to remote peer - { - uint8_t remote_gid[16] = {}; - remote_gid[10] = 0xff; - remote_gid[11] = 0xff; - inet_pton(AF_INET, config.peer_ip.c_str(), &remote_gid[12]); - - std::cout << " Connecting QP to remote QP 0x" << std::hex - << config.remote_qp << std::dec << " at " << config.peer_ip - << "..." << std::endl; - - if (!hololink_reconnect_qp(transceiver, remote_gid, config.remote_qp)) { - std::cerr << "ERROR: Failed to connect QP to remote peer" << std::endl; - hololink_destroy_transceiver(transceiver); - return 1; - } - std::cout << " QP connected to remote peer" << std::endl; - } - - uint32_t our_qp = hololink_get_qp_number(transceiver); - uint32_t our_rkey = hololink_get_rkey(transceiver); - uint64_t our_buffer = hololink_get_buffer_addr(transceiver); - - std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; - std::cout << " RKey: " << our_rkey << std::endl; - std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec - << std::endl; - - // Ring buffer pointers - uint8_t *rx_ring_data = - reinterpret_cast(hololink_get_rx_ring_data_addr(transceiver)); - uint64_t *rx_ring_flag = hololink_get_rx_ring_flag_addr(transceiver); - uint8_t *tx_ring_data = - reinterpret_cast(hololink_get_tx_ring_data_addr(transceiver)); - uint64_t *tx_ring_flag = hololink_get_tx_ring_flag_addr(transceiver); - - if (!rx_ring_data || !rx_ring_flag || !tx_ring_data || !tx_ring_flag) { - std::cerr << "ERROR: Failed to get ring buffer pointers" << std::endl; - hololink_destroy_transceiver(transceiver); - return 1; - } - - //============================================================================ - // [3] Force eager CUDA module loading - //============================================================================ - std::cout << "\n[3/5] Forcing CUDA module loading..." << std::endl; - { - int dispatch_blocks = 0; - cudaError_t occ_err; - if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) { - occ_err = cudaq_dispatch_kernel_cooperative_query_occupancy( - &dispatch_blocks, config.threads_per_block); - } else { - occ_err = cudaq_dispatch_kernel_query_occupancy(&dispatch_blocks, 1); - } - if (occ_err != cudaSuccess) { - std::cerr << "ERROR: Dispatch kernel occupancy query failed: " - << cudaGetErrorString(occ_err) << std::endl; - return 1; - } - std::cout << " Dispatch kernel occupancy: " << dispatch_blocks - << " blocks/SM" << std::endl; - - if (!hololink_query_kernel_occupancy()) { - std::cerr << "ERROR: Hololink kernel occupancy query failed" << std::endl; - return 1; - } - } - - //============================================================================ - // [4] Wire dispatch kernel to Hololink ring buffers - //============================================================================ - std::cout << "\n[4/5] Wiring dispatch kernel..." << std::endl; - - // Allocate control variables - void *tmp_shutdown = nullptr; - BRIDGE_CUDA_CHECK( - cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); - volatile int *shutdown_flag = static_cast(tmp_shutdown); - void *tmp_d_shutdown = nullptr; - BRIDGE_CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); - volatile int *d_shutdown_flag = static_cast(tmp_d_shutdown); - *shutdown_flag = 0; - int zero = 0; - BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag), &zero, - sizeof(int), cudaMemcpyHostToDevice)); - - uint64_t *d_stats = nullptr; - BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); - BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - - // Host API wiring - cudaq_dispatch_manager_t *manager = nullptr; - cudaq_dispatcher_t *dispatcher = nullptr; - - if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to create dispatch manager" << std::endl; - return 1; - } - - cudaq_dispatcher_config_t dconfig{}; - dconfig.device_id = config.gpu_id; - dconfig.num_blocks = config.num_blocks; - dconfig.threads_per_block = config.threads_per_block; - dconfig.num_slots = static_cast(config.num_pages); - dconfig.slot_size = static_cast(config.page_size); - dconfig.vp_id = 0; - dconfig.kernel_type = config.kernel_type; - dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - - if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to create dispatcher" << std::endl; - return 1; - } - - cudaq_ringbuffer_t ringbuffer{}; - ringbuffer.rx_flags = reinterpret_cast(rx_ring_flag); - ringbuffer.tx_flags = reinterpret_cast(tx_ring_flag); - ringbuffer.rx_data = rx_ring_data; - ringbuffer.tx_data = tx_ring_data; - ringbuffer.rx_stride_sz = config.page_size; - ringbuffer.tx_stride_sz = config.page_size; - - if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to set ringbuffer" << std::endl; - return 1; - } - - cudaq_function_table_t table{}; - table.entries = config.d_function_entries; - table.count = config.func_count; - if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to set function table" << std::endl; - return 1; - } - - if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) != - CUDAQ_OK) { - std::cerr << "ERROR: Failed to set control" << std::endl; - return 1; - } - - // Use provided launch function, or default to regular dispatch - cudaq_dispatch_launch_fn_t launch_fn = config.launch_fn; - if (!launch_fn) { - launch_fn = &cudaq_launch_dispatch_kernel_regular; - } - if (cudaq_dispatcher_set_launch_fn(dispatcher, launch_fn) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to set launch function" << std::endl; - return 1; - } - - if (cudaq_dispatcher_start(dispatcher) != CUDAQ_OK) { - std::cerr << "ERROR: Failed to start dispatcher" << std::endl; - return 1; - } - std::cout << " Dispatch kernel launched" << std::endl; - - //============================================================================ - // [5] Launch Hololink kernels and run - //============================================================================ - std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl; - - std::thread hololink_thread( - [transceiver]() { hololink_blocking_monitor(transceiver); }); - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - std::cout << " Hololink RX+TX kernels started" << std::endl; - - // Print QP info for FPGA stimulus tool - std::cout << "\n=== Bridge Ready ===" << std::endl; - std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; - std::cout << " RKey: " << our_rkey << std::endl; - std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec - << std::endl; - std::cout << "\nWaiting for data (Ctrl+C to stop, timeout=" - << config.timeout_sec << "s)..." << std::endl; - - //============================================================================ - // Main run loop - //============================================================================ - cudaStream_t diag_stream = nullptr; - BRIDGE_CUDA_CHECK( - cudaStreamCreateWithFlags(&diag_stream, cudaStreamNonBlocking)); - - auto start_time = std::chrono::steady_clock::now(); - uint64_t last_processed = 0; - - while (!g_shutdown) { - auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time) - .count(); - if (elapsed > config.timeout_sec) { - std::cout << "\nTimeout reached (" << config.timeout_sec << "s)" - << std::endl; - break; - } - - // Progress report every 5 seconds - if (elapsed > 0 && elapsed % 5 == 0) { - uint64_t processed = 0; - cudaMemcpyAsync(&processed, d_stats, sizeof(uint64_t), - cudaMemcpyDeviceToHost, diag_stream); - cudaStreamSynchronize(diag_stream); - if (processed != last_processed) { - std::cout << " [" << elapsed << "s] Processed " << processed - << " packets" << std::endl; - last_processed = processed; - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - - //============================================================================ - // Shutdown - //============================================================================ - std::cout << "\n=== Shutting down ===" << std::endl; - - if (diag_stream) { - cudaStreamDestroy(diag_stream); - diag_stream = nullptr; - } - - *shutdown_flag = 1; - __sync_synchronize(); - cudaq_dispatcher_stop(dispatcher); - - uint64_t total_processed = 0; - cudaq_dispatcher_get_processed(dispatcher, &total_processed); - std::cout << " Total packets processed: " << total_processed << std::endl; - - hololink_close(transceiver); - if (hololink_thread.joinable()) - hololink_thread.join(); - - cudaq_dispatcher_destroy(dispatcher); - cudaq_dispatch_manager_destroy(manager); - hololink_destroy_transceiver(transceiver); - - if (shutdown_flag) - cudaFreeHost(const_cast(shutdown_flag)); - if (d_stats) - cudaFree(d_stats); - - // Call tool-specific cleanup - if (config.cleanup_fn) - config.cleanup_fn(); - - std::cout << "\n*** Bridge shutdown complete ***" << std::endl; - return 0; -} - -/// @brief Default dispatch kernel launch wrapper. -/// -/// Matches cudaq_dispatch_launch_fn_t signature; delegates to -/// cudaq_launch_dispatch_kernel_regular from libcudaq-realtime. -inline void bridge_launch_dispatch_kernel( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, - function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, - threads_per_block, stream); -} - -} // namespace cudaq::realtime diff --git a/realtime/lib/CMakeLists.txt b/realtime/lib/CMakeLists.txt deleted file mode 100644 index 1f3a26be..00000000 --- a/realtime/lib/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -include(GNUInstallDirs) - -install(DIRECTORY ${CUDAQ_REALTIME_INCLUDE_DIR}/cudaq - COMPONENT cudaq-realtime-headers - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - FILES_MATCHING PATTERN "*.h" -) - -add_subdirectory(daemon) -add_subdirectory(pipeline) diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt deleted file mode 100644 index 95d67ddc..00000000 --- a/realtime/lib/daemon/CMakeLists.txt +++ /dev/null @@ -1,110 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -# ============================================================================== -# Shared library for external consumers (libcudaq-realtime.so) -# ============================================================================== -# This shared library exports a C-compatible host API for wiring dispatchers -# and includes the GPU dispatch kernel device code. - -if(CUDA_FOUND) - set(CUDAQ_REALTIME_SOURCES - dispatcher/cudaq_realtime_api.cpp - ) - - add_library(cudaq-realtime SHARED ${CUDAQ_REALTIME_SOURCES}) - - target_include_directories(cudaq-realtime - PUBLIC - $ - $ - ) - - target_link_libraries(cudaq-realtime - PUBLIC - CUDA::cudart_static - PRIVATE - cudaq-realtime-host-dispatch - ) - - target_compile_definitions(cudaq-realtime PUBLIC CUDAQ_REALTIME_HAVE_CUDA) - - set_target_properties(cudaq-realtime PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - POSITION_INDEPENDENT_CODE ON - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ) - - install(TARGETS cudaq-realtime - COMPONENT realtime-lib - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - - add_library(cudaq-realtime-dispatch STATIC dispatcher/dispatch_kernel.cu) - - target_include_directories(cudaq-realtime-dispatch - PUBLIC - $ - $ - ) - - # Link CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) - find_library(CUDADEVRT_LIBRARY cudadevrt - HINTS ${CUDAToolkit_LIBRARY_DIR} - REQUIRED - ) - - target_link_libraries(cudaq-realtime-dispatch - PUBLIC - CUDA::cudart_static - ${CUDADEVRT_LIBRARY} - ) - - set_target_properties(cudaq-realtime-dispatch PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - POSITION_INDEPENDENT_CODE ON - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ) - - install(TARGETS cudaq-realtime-dispatch - COMPONENT realtime-lib - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - - # ============================================================================ - # Host-side graph dispatcher (optional, for Grace Hopper / Grace Blackwell etc.) - # ============================================================================ - # Compiled with nvcc so libcu++ () works without extra - # include paths. Host-only code; no device code in this TU. - add_library(cudaq-realtime-host-dispatch SHARED - dispatcher/host_dispatcher.cu - dispatcher/host_dispatcher_capi.cu - ) - - target_include_directories(cudaq-realtime-host-dispatch - PUBLIC - $ - $ - ) - - target_link_libraries(cudaq-realtime-host-dispatch - PUBLIC - CUDA::cudart_static - ) - - set_target_properties(cudaq-realtime-host-dispatch PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - POSITION_INDEPENDENT_CODE ON - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ) - - install(TARGETS cudaq-realtime-host-dispatch - COMPONENT realtime-lib - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) -endif() diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp deleted file mode 100644 index 3b8ba1d8..00000000 --- a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp +++ /dev/null @@ -1,345 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" - -#include -#include -#include -#include - -struct cudaq_dispatch_manager_t { - int reserved = 0; -}; - -struct cudaq_dispatcher_t { - cudaq_dispatcher_config_t config{}; - cudaq_ringbuffer_t ringbuffer{}; - cudaq_function_table_t table{}; - cudaq_dispatch_launch_fn_t launch_fn = nullptr; - volatile int *shutdown_flag = nullptr; - uint64_t *stats = nullptr; - cudaStream_t stream = nullptr; - bool running = false; - cudaq_host_dispatcher_handle_t *host_handle = nullptr; - void **h_mailbox_bank = nullptr; -}; - -static bool is_valid_kernel_type(cudaq_kernel_type_t kernel_type) { - switch (kernel_type) { - case CUDAQ_KERNEL_REGULAR: - case CUDAQ_KERNEL_COOPERATIVE: - return true; - default: - return false; - } -} - -static bool is_valid_dispatch_mode(cudaq_dispatch_mode_t dispatch_mode) { - switch (dispatch_mode) { - case CUDAQ_DISPATCH_DEVICE_CALL: - case CUDAQ_DISPATCH_GRAPH_LAUNCH: - case CUDAQ_DISPATCH_HOST_CALL: - return true; - default: - return false; - } -} - -static cudaq_status_t validate_dispatcher(cudaq_dispatcher_t *dispatcher) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - if (!dispatcher->shutdown_flag || !dispatcher->stats) - return CUDAQ_ERR_INVALID_ARG; - if (!dispatcher->ringbuffer.rx_flags || !dispatcher->ringbuffer.tx_flags) - return CUDAQ_ERR_INVALID_ARG; - if (!dispatcher->table.entries || dispatcher->table.count == 0) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.num_slots == 0 || dispatcher->config.slot_size == 0) - return CUDAQ_ERR_INVALID_ARG; - - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { - if (!dispatcher->ringbuffer.rx_flags_host || - !dispatcher->ringbuffer.tx_flags_host || - !dispatcher->ringbuffer.rx_data_host || - !dispatcher->ringbuffer.tx_data_host) - return CUDAQ_ERR_INVALID_ARG; - return CUDAQ_OK; - } - - if (!dispatcher->launch_fn) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.num_blocks == 0 || - dispatcher->config.threads_per_block == 0) - return CUDAQ_ERR_INVALID_ARG; - if (!is_valid_kernel_type(dispatcher->config.kernel_type) || - !is_valid_dispatch_mode(dispatcher->config.dispatch_mode)) - return CUDAQ_ERR_INVALID_ARG; - return CUDAQ_OK; -} - -cudaq_status_t -cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr) { - if (!out_mgr) - return CUDAQ_ERR_INVALID_ARG; - auto *mgr = new (std::nothrow) cudaq_dispatch_manager_t(); - if (!mgr) - return CUDAQ_ERR_INTERNAL; - *out_mgr = mgr; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr) { - if (mgr) - delete mgr; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *, - const cudaq_dispatcher_config_t *config, - cudaq_dispatcher_t **out_dispatcher) { - if (!config || !out_dispatcher) - return CUDAQ_ERR_INVALID_ARG; - auto *dispatcher = new (std::nothrow) cudaq_dispatcher_t(); - if (!dispatcher) - return CUDAQ_ERR_INTERNAL; - dispatcher->config = *config; - *out_dispatcher = dispatcher; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->running && dispatcher->host_handle) { - *dispatcher->shutdown_flag = 1; - cudaq_host_dispatcher_stop(dispatcher->host_handle); - dispatcher->host_handle = nullptr; - } - delete dispatcher; - return CUDAQ_OK; -} - -cudaq_status_t -cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, - const cudaq_ringbuffer_t *ringbuffer) { - if (!dispatcher || !ringbuffer) - return CUDAQ_ERR_INVALID_ARG; - dispatcher->ringbuffer = *ringbuffer; - return CUDAQ_OK; -} - -cudaq_status_t -cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, - const cudaq_function_table_t *table) { - if (!dispatcher || !table) - return CUDAQ_ERR_INVALID_ARG; - dispatcher->table = *table; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, - volatile int *shutdown_flag, - uint64_t *stats) { - if (!dispatcher || !shutdown_flag || !stats) - return CUDAQ_ERR_INVALID_ARG; - dispatcher->shutdown_flag = shutdown_flag; - dispatcher->stats = stats; - return CUDAQ_OK; -} - -cudaq_status_t -cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, - cudaq_dispatch_launch_fn_t launch_fn) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && - launch_fn != nullptr) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.backend != CUDAQ_BACKEND_HOST_LOOP && !launch_fn) - return CUDAQ_ERR_INVALID_ARG; - dispatcher->launch_fn = launch_fn; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_set_mailbox(cudaq_dispatcher_t *dispatcher, - void **h_mailbox_bank) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - dispatcher->h_mailbox_bank = h_mailbox_bank; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { - auto status = validate_dispatcher(dispatcher); - if (status != CUDAQ_OK) - return status; - if (dispatcher->running) - return CUDAQ_OK; - - int device_id = dispatcher->config.device_id; - if (device_id < 0) - device_id = 0; - if (cudaSetDevice(device_id) != cudaSuccess) - return CUDAQ_ERR_CUDA; - - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { - dispatcher->host_handle = cudaq_host_dispatcher_start_thread( - &dispatcher->ringbuffer, &dispatcher->table, &dispatcher->config, - dispatcher->shutdown_flag, dispatcher->stats, - dispatcher->h_mailbox_bank); - if (!dispatcher->host_handle) - return CUDAQ_ERR_INTERNAL; - dispatcher->running = true; - return CUDAQ_OK; - } - - if (cudaStreamCreate(&dispatcher->stream) != cudaSuccess) - return CUDAQ_ERR_CUDA; - - dispatcher->launch_fn( - dispatcher->ringbuffer.rx_flags, dispatcher->ringbuffer.tx_flags, - dispatcher->ringbuffer.rx_data, dispatcher->ringbuffer.tx_data, - dispatcher->ringbuffer.rx_stride_sz, dispatcher->ringbuffer.tx_stride_sz, - dispatcher->table.entries, dispatcher->table.count, - dispatcher->shutdown_flag, dispatcher->stats, - dispatcher->config.num_slots, dispatcher->config.num_blocks, - dispatcher->config.threads_per_block, dispatcher->stream); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "CUDA error in dispatcher launch: %s (%d)\n", - cudaGetErrorString(err), err); - cudaStreamDestroy(dispatcher->stream); - dispatcher->stream = nullptr; - return CUDAQ_ERR_CUDA; - } - - dispatcher->running = true; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - if (!dispatcher->running) - return CUDAQ_OK; - - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP && - dispatcher->host_handle) { - *dispatcher->shutdown_flag = 1; - cudaq_host_dispatcher_stop(dispatcher->host_handle); - dispatcher->host_handle = nullptr; - dispatcher->running = false; - return CUDAQ_OK; - } - - int shutdown = 1; - if (cudaMemcpy(const_cast(dispatcher->shutdown_flag), &shutdown, - sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) - return CUDAQ_ERR_CUDA; - cudaStreamSynchronize(dispatcher->stream); - cudaStreamDestroy(dispatcher->stream); - dispatcher->stream = nullptr; - dispatcher->running = false; - return CUDAQ_OK; -} - -cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, - uint64_t *out_packets) { - if (!dispatcher || !out_packets || !dispatcher->stats) - return CUDAQ_ERR_INVALID_ARG; - - if (dispatcher->config.backend == CUDAQ_BACKEND_HOST_LOOP) { - *out_packets = *dispatcher->stats; - return CUDAQ_OK; - } - - if (cudaMemcpy(out_packets, dispatcher->stats, sizeof(uint64_t), - cudaMemcpyDeviceToHost) != cudaSuccess) - return CUDAQ_ERR_CUDA; - - return CUDAQ_OK; -} - -//============================================================================== -// Ring buffer slot helpers -//============================================================================== - -cudaq_status_t cudaq_host_ringbuffer_write_rpc_request( - const cudaq_ringbuffer_t *rb, uint32_t slot_idx, uint32_t function_id, - const void *payload, uint32_t payload_len) { - if (!rb || !rb->rx_data_host) - return CUDAQ_ERR_INVALID_ARG; - if (CUDAQ_RPC_HEADER_SIZE + payload_len > rb->rx_stride_sz) - return CUDAQ_ERR_INVALID_ARG; - - uint8_t *slot = rb->rx_data_host + slot_idx * rb->rx_stride_sz; - uint32_t *hdr = reinterpret_cast(slot); - hdr[0] = CUDAQ_RPC_MAGIC_REQUEST; - hdr[1] = function_id; - hdr[2] = payload_len; - - if (payload && payload_len > 0) - std::memcpy(slot + CUDAQ_RPC_HEADER_SIZE, payload, payload_len); - - return CUDAQ_OK; -} - -void cudaq_host_ringbuffer_signal_slot(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx) { - __sync_synchronize(); - const_cast(rb->rx_flags_host)[slot_idx] = - reinterpret_cast(rb->rx_data_host + - slot_idx * rb->rx_stride_sz); -} - -static inline uint64_t load_acquire(volatile uint64_t *addr) { - auto *a = - reinterpret_cast *>(const_cast(addr)); - return a->load(std::memory_order_acquire); -} - -cudaq_tx_status_t -cudaq_host_ringbuffer_poll_tx_flag(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx, int *out_cuda_error) { - uint64_t v = load_acquire(&rb->tx_flags_host[slot_idx]); - if (v == 0) - return CUDAQ_TX_EMPTY; - if (v == 0xEEEEEEEEEEEEEEEEULL) - return CUDAQ_TX_IN_FLIGHT; - if ((v >> 48) == 0xDEAD) { - if (out_cuda_error) - *out_cuda_error = static_cast(v & 0xFFFF); - return CUDAQ_TX_ERROR; - } - return CUDAQ_TX_READY; -} - -int cudaq_host_ringbuffer_slot_available(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx) { - return load_acquire(&rb->rx_flags_host[slot_idx]) == 0 && - load_acquire(&rb->tx_flags_host[slot_idx]) == 0; -} - -void cudaq_host_ringbuffer_clear_slot(const cudaq_ringbuffer_t *rb, - uint32_t slot_idx) { - const_cast(rb->tx_flags_host)[slot_idx] = 0; - __sync_synchronize(); -} - -cudaq_status_t cudaq_host_release_worker(cudaq_dispatcher_t *dispatcher, - int worker_id) { - if (!dispatcher) - return CUDAQ_ERR_INVALID_ARG; - if (dispatcher->config.backend != CUDAQ_BACKEND_HOST_LOOP || - !dispatcher->host_handle) - return CUDAQ_ERR_INVALID_ARG; - return cudaq_host_dispatcher_release_worker(dispatcher->host_handle, - worker_id); -} diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu deleted file mode 100644 index 0500929f..00000000 --- a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu +++ /dev/null @@ -1,612 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" -#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" - -#include -#include -#include - -namespace cudaq::realtime { - -//============================================================================== -// Dispatch Kernel Implementation (compiled into libcudaq-realtime.so) -//============================================================================== - -/// @brief Lookup function entry in table by function_id. -__device__ inline const cudaq_function_entry_t * -dispatch_lookup_entry(std::uint32_t function_id, - cudaq_function_entry_t *entries, - std::size_t entry_count) { - for (std::size_t i = 0; i < entry_count; ++i) { - if (entries[i].function_id == function_id) { - return &entries[i]; - } - } - return nullptr; -} - -/// @brief Dispatch kernel for DEVICE_CALL mode only (no graph launch support). -/// This kernel does not contain any device-side graph launch code, avoiding -/// compatibility issues on systems where cudaGraphLaunch is not supported. -/// -/// Supports symmetric RX/TX data buffers for Hololink compatibility: -/// - RX data address comes from rx_flags[slot] (set by Hololink RX kernel) -/// - TX response is written to tx_data + slot * tx_stride_sz -/// - tx_flags[slot] is set to the TX slot address -/// -/// When KernelType::is_cooperative is true, the kernel is launched via -/// cudaLaunchCooperativeKernel and ALL threads participate in calling the -/// RPC handler (needed for multi-block cooperative decode kernels like BP). -/// Thread 0 polls/parses the header, broadcasts work via shared memory, -/// then all threads call the handler after a grid.sync(). -template -__global__ void dispatch_kernel_device_call_only( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *tx_data, std::size_t tx_stride_sz, - cudaq_function_entry_t *function_table, std::size_t func_count, - volatile int *shutdown_flag, std::uint64_t *stats, std::size_t num_slots) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - std::uint64_t local_packet_count = 0; - std::size_t current_slot = 0; - - if constexpr (KernelType::is_cooperative) { - //========================================================================== - // Cooperative path: ALL threads call the handler. - // - // Work descriptor in shared memory (block 0 broadcasts via grid.sync). - // Only block 0 needs shared memory for the descriptor; other blocks - // read the device-memory copies after the grid barrier. - //========================================================================== - __shared__ DeviceRPCFunction s_func; - __shared__ void *s_arg_buffer; - __shared__ std::uint8_t *s_output_buffer; - __shared__ std::uint32_t s_arg_len; - __shared__ std::uint32_t s_max_result_len; - __shared__ bool s_have_work; - - // Device-memory work descriptor visible to all blocks after grid.sync. - // We use a single set since the cooperative kernel processes one RPC at - // a time (all threads participate, so no pipelining). - __device__ static DeviceRPCFunction d_func; - __device__ static void *d_arg_buffer; - __device__ static std::uint8_t *d_output_buffer; - __device__ static std::uint32_t d_arg_len; - __device__ static std::uint32_t d_max_result_len; - __device__ static bool d_have_work; - - while (!(*shutdown_flag)) { - // --- Phase 1: Thread 0 polls and parses --- - if (tid == 0) { - s_have_work = false; - std::uint64_t rx_value = rx_flags[current_slot]; - if (rx_value != 0) { - void *rx_slot = reinterpret_cast(rx_value); - RPCHeader *header = static_cast(rx_slot); - if (header->magic == RPC_MAGIC_REQUEST) { - const cudaq_function_entry_t *entry = dispatch_lookup_entry( - header->function_id, function_table, func_count); - if (entry != nullptr && - entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - - s_func = reinterpret_cast( - entry->handler.device_fn_ptr); - s_arg_buffer = static_cast(header + 1); - s_output_buffer = tx_slot + sizeof(RPCResponse); - s_arg_len = header->arg_len; - s_max_result_len = tx_stride_sz - sizeof(RPCResponse); - s_have_work = true; - - // Publish to device memory for other blocks - d_func = s_func; - d_arg_buffer = s_arg_buffer; - d_output_buffer = s_output_buffer; - d_arg_len = s_arg_len; - d_max_result_len = s_max_result_len; - d_have_work = true; - } - } - if (!s_have_work) { - // Bad magic or unsupported mode -- discard - __threadfence_system(); - rx_flags[current_slot] = 0; - } - } - } - - // --- Phase 2: Broadcast to all threads --- - KernelType::sync(); - - // Non-block-0 threads read from device memory - bool have_work; - DeviceRPCFunction func; - void *arg_buffer; - std::uint8_t *output_buffer; - std::uint32_t arg_len; - std::uint32_t max_result_len; - if (blockIdx.x == 0) { - have_work = s_have_work; - func = s_func; - arg_buffer = s_arg_buffer; - output_buffer = s_output_buffer; - arg_len = s_arg_len; - max_result_len = s_max_result_len; - } else { - have_work = d_have_work; - func = d_func; - arg_buffer = d_arg_buffer; - output_buffer = d_output_buffer; - arg_len = d_arg_len; - max_result_len = d_max_result_len; - } - - // --- Phase 3: ALL threads call the handler --- - std::uint32_t result_len = 0; - int status = 0; - if (have_work) { - status = func(arg_buffer, output_buffer, arg_len, max_result_len, - &result_len); - } - - // --- Phase 4: Sync, then thread 0 writes response --- - KernelType::sync(); - - if (tid == 0 && have_work) { - std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - RPCResponse *response = reinterpret_cast(tx_slot); - response->magic = RPC_MAGIC_RESPONSE; - response->status = status; - response->result_len = result_len; - - __threadfence_system(); - tx_flags[current_slot] = reinterpret_cast(tx_slot); - - __threadfence_system(); - rx_flags[current_slot] = 0; - local_packet_count++; - current_slot = (current_slot + 1) % num_slots; - } - - // Reset device-memory work flag for next iteration - if (tid == 0) { - d_have_work = false; - } - - KernelType::sync(); - - if ((local_packet_count & 0xFF) == 0) { - __threadfence_system(); - } - } - } else { - //========================================================================== - // Regular path: only thread 0 calls the handler (unchanged). - //========================================================================== - while (!(*shutdown_flag)) { - if (tid == 0) { - std::uint64_t rx_value = rx_flags[current_slot]; - if (rx_value != 0) { - // RX data address comes from rx_flags (set by Hololink RX kernel - // or host test harness to the address of the RX data slot) - void *rx_slot = reinterpret_cast(rx_value); - RPCHeader *header = static_cast(rx_slot); - if (header->magic != RPC_MAGIC_REQUEST) { - __threadfence_system(); - rx_flags[current_slot] = 0; - continue; - } - - std::uint32_t function_id = header->function_id; - std::uint32_t arg_len = header->arg_len; - void *arg_buffer = static_cast(header + 1); - - const cudaq_function_entry_t *entry = - dispatch_lookup_entry(function_id, function_table, func_count); - - if (entry != nullptr && - entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = reinterpret_cast( - entry->handler.device_fn_ptr); - - // Compute TX slot address from symmetric TX data buffer - std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - - // Handler writes results directly to TX slot (after response - // header) - std::uint8_t *output_buffer = tx_slot + sizeof(RPCResponse); - std::uint32_t result_len = 0; - std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); - int status = func(arg_buffer, output_buffer, arg_len, - max_result_len, &result_len); - - // Write RPC response header to TX slot - RPCResponse *response = reinterpret_cast(tx_slot); - response->magic = RPC_MAGIC_RESPONSE; - response->status = status; - response->result_len = result_len; - - __threadfence_system(); - // Signal TX with the TX slot address (symmetric with Hololink TX - // kernel) - tx_flags[current_slot] = reinterpret_cast(tx_slot); - } - - __threadfence_system(); - rx_flags[current_slot] = 0; - local_packet_count++; - current_slot = (current_slot + 1) % num_slots; - } - } - - KernelType::sync(); - - if ((local_packet_count & 0xFF) == 0) { - __threadfence_system(); - } - } - } - - if (tid == 0) { - atomicAdd(reinterpret_cast(stats), - local_packet_count); - } -} - -/// @brief Dispatch kernel supporting both DEVICE_CALL and GRAPH_LAUNCH modes. -/// This kernel includes device-side graph launch code and requires compute -/// capability >= 9.0. NOTE: Graph launch code is conditionally compiled based -/// on __CUDA_ARCH__. -/// -/// Supports symmetric RX/TX data buffers for Hololink compatibility. -template -__global__ void dispatch_kernel_with_graph( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *tx_data, std::size_t tx_stride_sz, - cudaq_function_entry_t *function_table, std::size_t func_count, - GraphIOContext *graph_io_ctx, volatile int *shutdown_flag, - std::uint64_t *stats, std::size_t num_slots) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - std::uint64_t local_packet_count = 0; - std::size_t current_slot = 0; - - while (!(*shutdown_flag)) { - if (tid == 0) { - std::uint64_t rx_value = rx_flags[current_slot]; - if (rx_value != 0) { - void *rx_slot = reinterpret_cast(rx_value); - RPCHeader *header = static_cast(rx_slot); - if (header->magic != RPC_MAGIC_REQUEST) { - __threadfence_system(); - rx_flags[current_slot] = 0; - continue; - } - - std::uint32_t function_id = header->function_id; - std::uint32_t arg_len = header->arg_len; - void *arg_buffer = static_cast(header + 1); - - const cudaq_function_entry_t *entry = - dispatch_lookup_entry(function_id, function_table, func_count); - - // Compute TX slot address from symmetric TX data buffer - std::uint8_t *tx_slot = tx_data + current_slot * tx_stride_sz; - - if (entry != nullptr) { - if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) { - DeviceRPCFunction func = reinterpret_cast( - entry->handler.device_fn_ptr); - - // Handler writes results directly to TX slot (after response - // header) - std::uint8_t *output_buffer = tx_slot + sizeof(RPCResponse); - std::uint32_t result_len = 0; - std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse); - int status = func(arg_buffer, output_buffer, arg_len, - max_result_len, &result_len); - - // Write RPC response to TX slot - RPCResponse *response = reinterpret_cast(tx_slot); - response->magic = RPC_MAGIC_RESPONSE; - response->status = status; - response->result_len = result_len; - - __threadfence_system(); - tx_flags[current_slot] = reinterpret_cast(tx_slot); - } -#if __CUDA_ARCH__ >= 900 - else if (entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) { - // Fill IO context so the graph kernel can read input from - // rx_slot, write the RPCResponse to tx_slot, and signal - // completion by setting *tx_flag = tx_flag_value. - if (graph_io_ctx != nullptr) { - graph_io_ctx->rx_slot = rx_slot; - graph_io_ctx->tx_slot = tx_slot; - graph_io_ctx->tx_flag = &tx_flags[current_slot]; - graph_io_ctx->tx_flag_value = - reinterpret_cast(tx_slot); - graph_io_ctx->tx_stride_sz = tx_stride_sz; - __threadfence_system(); - } - - // Launch pre-created graph (fire-and-forget is async; the - // graph kernel is responsible for writing the response and - // signaling tx_flag when done). - cudaGraphLaunch(entry->handler.graph_exec, - cudaStreamGraphFireAndForget); - } -#endif // __CUDA_ARCH__ >= 900 - } - - __threadfence_system(); - rx_flags[current_slot] = 0; - local_packet_count++; - current_slot = (current_slot + 1) % num_slots; - } - } - - KernelType::sync(); - - if ((local_packet_count & 0xFF) == 0) { - __threadfence_system(); - } - } - - if (tid == 0) { - atomicAdd(reinterpret_cast(stats), - local_packet_count); - } -} - -} // namespace cudaq::realtime - -//============================================================================== -// Host Launch Functions -//============================================================================== - -// Force eager CUDA module loading for the dispatch kernel. -// Call before launching persistent kernels to avoid lazy-loading deadlocks. -extern "C" cudaError_t -cudaq_dispatch_kernel_query_occupancy(int *out_blocks, - uint32_t threads_per_block) { - int num_blocks = 0; - cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks, - cudaq::realtime::dispatch_kernel_device_call_only< - cudaq::realtime::RegularKernel>, - threads_per_block, 0); - if (err != cudaSuccess) - return err; - if (out_blocks) - *out_blocks = num_blocks; - return cudaSuccess; -} - -extern "C" cudaError_t -cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks, - uint32_t threads_per_block) { - int num_blocks = 0; - cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks, - cudaq::realtime::dispatch_kernel_device_call_only< - cudaq::realtime::CooperativeKernel>, - threads_per_block, 0); - if (err != cudaSuccess) - return err; - if (out_blocks) - *out_blocks = num_blocks; - return cudaSuccess; -} - -extern "C" void cudaq_launch_dispatch_kernel_regular( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - // Use device-call-only kernel (no graph launch support) - // Note: rx_data/rx_stride_sz are available in the ringbuffer struct but - // not passed to the kernel since it reads RX addresses from rx_flags. - (void)rx_data; - (void)rx_stride_sz; - cudaq::realtime::dispatch_kernel_device_call_only< - cudaq::realtime::RegularKernel> - <<>>( - rx_flags, tx_flags, tx_data, tx_stride_sz, function_table, func_count, - shutdown_flag, stats, num_slots); -} - -extern "C" void cudaq_launch_dispatch_kernel_cooperative( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - (void)rx_data; - (void)rx_stride_sz; - void *kernel_args[] = {const_cast(&rx_flags), - const_cast(&tx_flags), - &tx_data, - &tx_stride_sz, - &function_table, - &func_count, - const_cast(&shutdown_flag), - &stats, - &num_slots}; - - cudaLaunchCooperativeKernel( - reinterpret_cast( - cudaq::realtime::dispatch_kernel_device_call_only< - cudaq::realtime::CooperativeKernel>), - dim3(num_blocks), dim3(threads_per_block), kernel_args, 0, stream); -} - -//============================================================================== -// Graph-Based Dispatch (Proper Device-Side Graph Launch Support) -//============================================================================== -// -// To use device-side cudaGraphLaunch(), the dispatch kernel itself must be -// running inside a graph execution context. These functions create a graph -// containing the dispatch kernel, instantiate it with -// cudaGraphInstantiateFlagDeviceLaunch, and provide proper launch/cleanup -// functions. - -// Internal storage for graph-based dispatch context -// Parameters must be stored persistently since the graph may execute after -// the create function returns. -struct cudaq_dispatch_graph_context { - cudaGraph_t graph; - cudaGraphExec_t graph_exec; - cudaGraphNode_t kernel_node; - bool is_valid; - - // Persistent storage for kernel parameters (must outlive graph execution) - volatile std::uint64_t *rx_flags; - volatile std::uint64_t *tx_flags; - std::uint8_t *tx_data; - std::size_t tx_stride_sz; - cudaq_function_entry_t *function_table; - std::size_t func_count; - cudaq::realtime::GraphIOContext *graph_io_ctx; - volatile int *shutdown_flag; - std::uint64_t *stats; - std::size_t num_slots; -}; - -extern "C" cudaError_t cudaq_create_dispatch_graph_regular( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, void *graph_io_ctx_raw, volatile int *shutdown_flag, - std::uint64_t *stats, std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream, - cudaq_dispatch_graph_context **out_context) { - - (void)rx_data; - (void)rx_stride_sz; - cudaError_t err; - - // Allocate context with persistent parameter storage - cudaq_dispatch_graph_context *ctx = new cudaq_dispatch_graph_context(); - ctx->is_valid = false; - - // Store parameters persistently in the context - ctx->rx_flags = rx_flags; - ctx->tx_flags = tx_flags; - ctx->tx_data = tx_data; - ctx->tx_stride_sz = tx_stride_sz; - ctx->function_table = function_table; - ctx->func_count = func_count; - ctx->graph_io_ctx = - static_cast(graph_io_ctx_raw); - ctx->shutdown_flag = shutdown_flag; - ctx->stats = stats; - ctx->num_slots = num_slots; - - // Create graph - err = cudaGraphCreate(&ctx->graph, 0); - if (err != cudaSuccess) { - delete ctx; - return err; - } - - // Set up kernel parameters - point to persistent storage in context - cudaKernelNodeParams kernel_params = {}; - void *kernel_args[] = {&ctx->rx_flags, &ctx->tx_flags, - &ctx->tx_data, &ctx->tx_stride_sz, - &ctx->function_table, &ctx->func_count, - &ctx->graph_io_ctx, &ctx->shutdown_flag, - &ctx->stats, &ctx->num_slots}; - - kernel_params.func = - reinterpret_cast(cudaq::realtime::dispatch_kernel_with_graph< - cudaq::realtime::RegularKernel>); - kernel_params.gridDim = dim3(num_blocks, 1, 1); - kernel_params.blockDim = dim3(threads_per_block, 1, 1); - kernel_params.sharedMemBytes = 0; - kernel_params.kernelParams = kernel_args; - kernel_params.extra = nullptr; - - // Add kernel node to graph - err = cudaGraphAddKernelNode(&ctx->kernel_node, ctx->graph, nullptr, 0, - &kernel_params); - if (err != cudaSuccess) { - cudaGraphDestroy(ctx->graph); - delete ctx; - return err; - } - - // Instantiate with device launch flag - THIS IS THE KEY! - err = cudaGraphInstantiate(&ctx->graph_exec, ctx->graph, - cudaGraphInstantiateFlagDeviceLaunch); - if (err != cudaSuccess) { - cudaGraphDestroy(ctx->graph); - delete ctx; - return err; - } - - // Upload graph to device (required before device-side launch) - err = cudaGraphUpload(ctx->graph_exec, stream); - if (err != cudaSuccess) { - cudaGraphExecDestroy(ctx->graph_exec); - cudaGraphDestroy(ctx->graph); - delete ctx; - return err; - } - - // Synchronize to ensure upload completes - err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) { - cudaGraphExecDestroy(ctx->graph_exec); - cudaGraphDestroy(ctx->graph); - delete ctx; - return err; - } - - ctx->is_valid = true; - *out_context = ctx; - return cudaSuccess; -} - -extern "C" cudaError_t -cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, - cudaStream_t stream) { - if (context == nullptr || !context->is_valid) { - return cudaErrorInvalidValue; - } - - // Launch the graph - now device-side cudaGraphLaunch will work! - return cudaGraphLaunch(context->graph_exec, stream); -} - -extern "C" cudaError_t -cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context) { - if (context == nullptr) { - return cudaErrorInvalidValue; - } - - cudaError_t err = cudaSuccess; - - if (context->is_valid) { - cudaError_t err1 = cudaGraphExecDestroy(context->graph_exec); - cudaError_t err2 = cudaGraphDestroy(context->graph); - if (err1 != cudaSuccess) - err = err1; - else if (err2 != cudaSuccess) - err = err2; - } - - delete context; - return err; -} diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher.cu b/realtime/lib/daemon/dispatcher/host_dispatcher.cu deleted file mode 100644 index 0b96e673..00000000 --- a/realtime/lib/daemon/dispatcher/host_dispatcher.cu +++ /dev/null @@ -1,195 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" - -namespace cudaq::realtime { - -//----------------------------------------------------------------------------- -// Helpers: function table lookup -//----------------------------------------------------------------------------- - -static const cudaq_function_entry_t * -lookup_function(cudaq_function_entry_t *table, size_t count, - uint32_t function_id) { - for (size_t i = 0; i < count; ++i) { - if (table[i].function_id == function_id) - return &table[i]; - } - return nullptr; -} - -static int -find_idle_graph_worker_for_function(const HostDispatcherConfig &config, - uint32_t function_id) { - uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); - while (mask != 0) { - int worker_id = __builtin_ffsll(static_cast(mask)) - 1; - if (config.workers[static_cast(worker_id)].function_id == - function_id) - return worker_id; - mask &= ~(1ULL << worker_id); - } - return -1; -} - -/// Result of parsing the slot when a function table is in use. -struct ParsedSlot { - uint32_t function_id = 0; - const cudaq_function_entry_t *entry = nullptr; - bool drop = false; // true => invalid magic or unknown function_id; clear slot - // and advance -}; - -static ParsedSlot -parse_slot_with_function_table(void *slot_host, - const HostDispatcherConfig &config) { - ParsedSlot out; - const RPCHeader *header = static_cast(slot_host); - if (header->magic != RPC_MAGIC_REQUEST) { - out.drop = true; - return out; - } - out.function_id = header->function_id; - out.entry = lookup_function(config.function_table, - config.function_table_count, out.function_id); - if (!out.entry) - out.drop = true; - return out; -} - -/// Clear rx_flag for this slot, increment stats, advance slot index. -static void finish_slot_and_advance(const HostDispatcherConfig &config, - size_t ¤t_slot, size_t num_slots, - uint64_t &packets_dispatched) { - config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); - packets_dispatched++; - if (config.live_dispatched) - config.live_dispatched->fetch_add(1, cuda::std::memory_order_relaxed); - current_slot = (current_slot + 1) % num_slots; -} - -/// Acquire a graph worker (by function_id if table in use, else any idle -/// worker). -static int acquire_graph_worker(const HostDispatcherConfig &config, - bool use_function_table, - const cudaq_function_entry_t *entry, - uint32_t function_id) { - if (use_function_table && entry && - entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) - return find_idle_graph_worker_for_function(config, function_id); - uint64_t mask = config.idle_mask->load(cuda::std::memory_order_acquire); - if (mask == 0) - return -1; - return __builtin_ffsll(static_cast(mask)) - 1; -} - -/// Launch the graph for the given worker; set tx_flags on success or error. -static void launch_graph_worker(const HostDispatcherConfig &config, - int worker_id, void *slot_host, - size_t current_slot) { - config.idle_mask->fetch_and(~(1ULL << worker_id), - cuda::std::memory_order_release); - config.inflight_slot_tags[worker_id] = static_cast(current_slot); - - ptrdiff_t offset = static_cast(slot_host) - config.rx_data_host; - void *data_dev = static_cast(config.rx_data_dev + offset); - config.h_mailbox_bank[worker_id] = data_dev; - __sync_synchronize(); - - const size_t w = static_cast(worker_id); - if (config.workers[w].pre_launch_fn) - config.workers[w].pre_launch_fn(config.workers[w].pre_launch_data, data_dev, - config.workers[w].stream); - cudaError_t err = - cudaGraphLaunch(config.workers[w].graph_exec, config.workers[w].stream); - - if (err != cudaSuccess) { - uint64_t error_val = (uint64_t)0xDEAD << 48 | (uint64_t)err; - config.tx_flags[current_slot].store(error_val, - cuda::std::memory_order_release); - config.idle_mask->fetch_or(1ULL << worker_id, - cuda::std::memory_order_release); - } else { - if (config.workers[w].post_launch_fn) - config.workers[w].post_launch_fn(config.workers[w].post_launch_data, - data_dev, config.workers[w].stream); - // Always write IN_FLIGHT sentinel. The actual READY value is written - // later by the CPU worker thread or the GPU-only cudaLaunchHostFunc - // callback, after the graph has completed. - config.tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, - cuda::std::memory_order_release); - } -} - -//----------------------------------------------------------------------------- -// Main loop -//----------------------------------------------------------------------------- - -void host_dispatcher_loop(const HostDispatcherConfig &config) { - size_t current_slot = 0; - const size_t num_slots = config.num_slots; - uint64_t packets_dispatched = 0; - const bool use_function_table = - (config.function_table != nullptr && config.function_table_count > 0); - - while (config.shutdown_flag->load(cuda::std::memory_order_acquire) == 0) { - uint64_t rx_value = - config.rx_flags[current_slot].load(cuda::std::memory_order_acquire); - - if (rx_value == 0) { - QEC_CPU_RELAX(); - continue; - } - - void *slot_host = reinterpret_cast(rx_value); - uint32_t function_id = 0; - const cudaq_function_entry_t *entry = nullptr; - - // TODO: Remove non-function-table path; RPC framing is always required. - if (use_function_table) { - ParsedSlot parsed = parse_slot_with_function_table(slot_host, config); - if (parsed.drop) { - config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); - current_slot = (current_slot + 1) % num_slots; - continue; - } - function_id = parsed.function_id; - entry = parsed.entry; - } - - // Only GRAPH_LAUNCH is dispatched; HOST_CALL and DEVICE_CALL are dropped. - if (entry && entry->dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH) { - config.rx_flags[current_slot].store(0, cuda::std::memory_order_release); - current_slot = (current_slot + 1) % num_slots; - continue; - } - - int worker_id = - acquire_graph_worker(config, use_function_table, entry, function_id); - if (worker_id < 0) { - QEC_CPU_RELAX(); - continue; - } - - launch_graph_worker(config, worker_id, slot_host, current_slot); - finish_slot_and_advance(config, current_slot, num_slots, - packets_dispatched); - } - - for (const auto &w : config.workers) { - cudaStreamSynchronize(w.stream); - } - - if (config.stats_counter) { - *config.stats_counter = packets_dispatched; - } -} - -} // namespace cudaq::realtime diff --git a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu b/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu deleted file mode 100644 index 109fb79d..00000000 --- a/realtime/lib/daemon/dispatcher/host_dispatcher_capi.cu +++ /dev/null @@ -1,158 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" - -#include -#include -#include -#include -#include - -struct cudaq_host_dispatcher_handle { - std::thread thread; - std::vector workers; - cudaq::realtime::atomic_uint64_sys *idle_mask = nullptr; - int *inflight_slot_tags = nullptr; - void **h_mailbox_bank = nullptr; - bool owns_mailbox = false; - size_t num_workers = 0; -}; - -static size_t count_graph_launch_workers(const cudaq_function_table_t *table) { - size_t n = 0; - for (uint32_t i = 0; i < table->count; ++i) { - if (table->entries[i].dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) - ++n; - } - return n; -} - -extern "C" cudaq_host_dispatcher_handle_t *cudaq_host_dispatcher_start_thread( - const cudaq_ringbuffer_t *ringbuffer, const cudaq_function_table_t *table, - const cudaq_dispatcher_config_t *config, volatile int *shutdown_flag, - uint64_t *stats, void **external_mailbox) { - if (!ringbuffer || !table || !config || !shutdown_flag || !stats) - return nullptr; - if (!ringbuffer->rx_flags_host || !ringbuffer->tx_flags_host || - !ringbuffer->rx_data_host || !ringbuffer->tx_data_host) - return nullptr; - if (!table->entries || table->count == 0) - return nullptr; - if (config->num_slots == 0 || config->slot_size == 0) - return nullptr; - - const size_t num_workers = count_graph_launch_workers(table); - if (num_workers == 0) - return nullptr; - - auto *handle = new (std::nothrow) cudaq_host_dispatcher_handle(); - if (!handle) - return nullptr; - - handle->idle_mask = new (std::nothrow) cudaq::realtime::atomic_uint64_sys(0); - handle->inflight_slot_tags = new (std::nothrow) int[num_workers]; - if (external_mailbox) { - handle->h_mailbox_bank = external_mailbox; - handle->owns_mailbox = false; - } else { - handle->h_mailbox_bank = new (std::nothrow) void *[num_workers]; - handle->owns_mailbox = true; - } - if (!handle->idle_mask || !handle->inflight_slot_tags || - !handle->h_mailbox_bank) { - delete handle->idle_mask; - delete[] handle->inflight_slot_tags; - if (handle->owns_mailbox) - delete[] handle->h_mailbox_bank; - delete handle; - return nullptr; - } - - std::memset(handle->inflight_slot_tags, 0, num_workers * sizeof(int)); - - handle->workers.reserve(num_workers); - for (uint32_t i = 0; i < table->count; ++i) { - if (table->entries[i].dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH) - continue; - cudaStream_t stream = nullptr; - if (cudaStreamCreate(&stream) != cudaSuccess) { - for (auto &w : handle->workers) - cudaStreamDestroy(w.stream); - delete handle->idle_mask; - delete[] handle->inflight_slot_tags; - delete[] handle->h_mailbox_bank; - delete handle; - return nullptr; - } - cudaq::realtime::HostDispatchWorker w; - w.graph_exec = table->entries[i].handler.graph_exec; - w.stream = stream; - w.function_id = table->entries[i].function_id; - handle->workers.push_back(w); - } - handle->num_workers = num_workers; - - handle->idle_mask->store((1ULL << num_workers) - 1, - cuda::std::memory_order_release); - - cudaq::realtime::HostDispatcherConfig host_config; - host_config.rx_flags = (cudaq::realtime::atomic_uint64_sys *)(uintptr_t) - ringbuffer->rx_flags_host; - host_config.tx_flags = (cudaq::realtime::atomic_uint64_sys *)(uintptr_t) - ringbuffer->tx_flags_host; - host_config.rx_data_host = ringbuffer->rx_data_host; - host_config.rx_data_dev = ringbuffer->rx_data; - host_config.tx_data_host = ringbuffer->tx_data_host; - host_config.tx_data_dev = ringbuffer->tx_data; - host_config.tx_stride_sz = ringbuffer->tx_stride_sz; - host_config.h_mailbox_bank = handle->h_mailbox_bank; - host_config.num_slots = config->num_slots; - host_config.slot_size = config->slot_size; - host_config.workers = handle->workers; - host_config.function_table = table->entries; - host_config.function_table_count = table->count; - host_config.shutdown_flag = - (cudaq::realtime::atomic_int_sys *)(uintptr_t)shutdown_flag; - host_config.stats_counter = stats; - host_config.live_dispatched = nullptr; - host_config.idle_mask = handle->idle_mask; - host_config.inflight_slot_tags = handle->inflight_slot_tags; - - handle->thread = - std::thread(cudaq::realtime::host_dispatcher_loop, host_config); - return handle; -} - -extern "C" cudaq_status_t -cudaq_host_dispatcher_release_worker(cudaq_host_dispatcher_handle_t *handle, - int worker_id) { - if (!handle || !handle->idle_mask) - return CUDAQ_ERR_INVALID_ARG; - if (worker_id < 0 || static_cast(worker_id) >= handle->num_workers) - return CUDAQ_ERR_INVALID_ARG; - handle->idle_mask->fetch_or(1ULL << worker_id, - cuda::std::memory_order_release); - return CUDAQ_OK; -} - -extern "C" void -cudaq_host_dispatcher_stop(cudaq_host_dispatcher_handle_t *handle) { - if (!handle) - return; - if (handle->thread.joinable()) - handle->thread.join(); - for (auto &w : handle->workers) - cudaStreamDestroy(w.stream); - delete handle->idle_mask; - delete[] handle->inflight_slot_tags; - if (handle->owns_mailbox) - delete[] handle->h_mailbox_bank; - delete handle; -} diff --git a/realtime/lib/pipeline/CMakeLists.txt b/realtime/lib/pipeline/CMakeLists.txt deleted file mode 100644 index 7c23beea..00000000 --- a/realtime/lib/pipeline/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -if(CUDA_FOUND) - add_library(cudaq-realtime-pipeline SHARED - realtime_pipeline.cu - ) - - target_include_directories(cudaq-realtime-pipeline - PUBLIC - $ - $ - ) - - target_link_libraries(cudaq-realtime-pipeline - PUBLIC - CUDA::cudart_static - PRIVATE - cudaq-realtime - cudaq-realtime-host-dispatch - ) - - set_target_properties(cudaq-realtime-pipeline PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - POSITION_INDEPENDENT_CODE ON - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib - ) - - install(TARGETS cudaq-realtime-pipeline - COMPONENT realtime-lib - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) -endif() diff --git a/realtime/scripts/install_dev_prerequisites.sh b/realtime/scripts/install_dev_prerequisites.sh deleted file mode 100755 index bf8c57f4..00000000 --- a/realtime/scripts/install_dev_prerequisites.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# ============================================================================ # -# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -# Usage: -# This script builds and installs a minimal set of dependencies needed to build -# CUDA-Q realtime from source. -# -# Usage: -# bash install_dev_prerequisites.sh - - -if [ -x "$(command -v apt-get)" ]; then - # [libibverbs] - echo "Installing libibverbs..." - apt-get update && apt-get install -y --no-install-recommends libibverbs-dev - - # [DOCA Host] - - if [ ! -x "$(command -v curl)" ]; then - apt-get update && apt-get install -y --no-install-recommends curl - fi - - DOCA_VERSION=3.2.1 - echo "Installing DOCA version $DOCA_VERSION..." - arch=$(uname -m) - distro=$(. /etc/os-release && echo ${ID}${VERSION_ID}) # e.g., ubuntu24.04 - export DOCA_URL="https://linux.mellanox.com/public/repo/doca/$DOCA_VERSION/$distro/$arch/" - echo "Using DOCA_REPO_LINK=${DOCA_URL}" - curl https://linux.mellanox.com/public/repo/doca/GPG-KEY-Mellanox.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub - echo "deb [signed-by=/etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub] $DOCA_URL ./" > /etc/apt/sources.list.d/doca.list - apt-get update - DEBIAN_FRONTEND=noninteractive apt-get -y install doca-all - - # [Holoscan SDK] - CUDA_MAJOR_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p') - if [ -z "$CUDA_MAJOR_VERSION" ]; then - echo "Could not determine CUDA version from nvcc. Is the CUDA toolkit installed?" >&2 - exit 1 - fi - apt-get update && apt-get install -y --no-install-recommends holoscan-cuda-$CUDA_MAJOR_VERSION - -elif [ -x "$(command -v dnf)" ]; then - echo "TODO: Support RHEL." >&2 -else - echo "No supported package manager detected." >&2 -fi diff --git a/realtime/unittests/CMakeLists.txt b/realtime/unittests/CMakeLists.txt deleted file mode 100644 index 048f8e88..00000000 --- a/realtime/unittests/CMakeLists.txt +++ /dev/null @@ -1,104 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -# External Dependencies -# ============================================================================== - -FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.17.0 - EXCLUDE_FROM_ALL -) -FetchContent_MakeAvailable(googletest) - -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - -# Bug in GCC 12 leads to spurious warnings (-Wrestrict) -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105329 -if (CMAKE_COMPILER_IS_GNUCXX - AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0.0 - AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0.0) - target_compile_options(gtest PUBLIC --param=evrp-mode=legacy) -endif() -include(GoogleTest) - - -add_compile_options(-Wno-attributes) - -# ============================================================================== -# GPU Dispatch Kernel Tests -# ============================================================================== - -find_package(CUDAToolkit) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - - add_executable(test_dispatch_kernel test_dispatch_kernel.cu) - - set_target_properties(test_dispatch_kernel PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - ) - - target_include_directories(test_dispatch_kernel PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} - ${CUDAQ_REALTIME_INCLUDE_DIR} - ) - - # Find CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) - find_library(CUDADEVRT_LIBRARY cudadevrt - HINTS ${CUDAToolkit_LIBRARY_DIR} - REQUIRED - ) - - target_link_libraries(test_dispatch_kernel PRIVATE - GTest::gtest_main - CUDA::cudart - cudaq-realtime - cudaq-realtime-dispatch - ${CUDADEVRT_LIBRARY} - ) - - add_dependencies(CudaqRealtimeUnitTests test_dispatch_kernel) - gtest_discover_tests(test_dispatch_kernel - TEST_PREFIX "test_dispatch_kernel." - ) - - message(STATUS " - test_dispatch_kernel (GPU dispatch infrastructure)") - - # Host dispatcher tests (CUDAQ_BACKEND_HOST_LOOP) - add_executable(test_host_dispatcher test_host_dispatcher.cu) - set_target_properties(test_host_dispatcher PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - ) - target_include_directories(test_host_dispatcher PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} - ${CUDAQ_REALTIME_INCLUDE_DIR} - ) - target_link_libraries(test_host_dispatcher PRIVATE - GTest::gtest_main - CUDA::cudart - cudaq-realtime - cudaq-realtime-host-dispatch - ) - add_dependencies(CudaqRealtimeUnitTests test_host_dispatcher) - gtest_discover_tests(test_host_dispatcher - TEST_PREFIX "test_host_dispatcher." - ) - message(STATUS " - test_host_dispatcher (host dispatcher loop)") -endif() - -# ============================================================================== -# Hololink bridge/emulator/playback tools (optional, not CI) -# ============================================================================== - -if (CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS) - add_subdirectory(utils) -endif() diff --git a/realtime/unittests/test_dispatch_kernel.cu b/realtime/unittests/test_dispatch_kernel.cu deleted file mode 100644 index 05df4f96..00000000 --- a/realtime/unittests/test_dispatch_kernel.cu +++ /dev/null @@ -1,735 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" -#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" -#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" - -// Helper macro for CUDA error checking -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ - } while (0) - -namespace { - -//============================================================================== -// Test Handler: Simple noop that copies input to output -//============================================================================== - -/// @brief Test handler that adds 1 to each byte. -__device__ int increment_handler(const void *input, void *output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t *result_len) { - const std::uint8_t *in_data = static_cast(input); - std::uint8_t *out_data = static_cast(output); - for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { - out_data[i] = in_data[i] + 1; - } - *result_len = arg_len; - return 0; -} - -//============================================================================== -// Host API Dispatch Kernel Test Helpers -//============================================================================== - -constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("rpc_increment"); - -__device__ int rpc_increment_handler(const void *input, void *output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t *result_len) { - const std::uint8_t *in_data = static_cast(input); - std::uint8_t *out_data = static_cast(output); - for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { - out_data[i] = static_cast(in_data[i] + 1); - } - *result_len = arg_len; - return 0; -} - -__global__ void init_rpc_function_table(cudaq_function_entry_t *entries) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - entries[0].handler.device_fn_ptr = - reinterpret_cast(&rpc_increment_handler); - entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; - entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - entries[0].reserved[0] = 0; - entries[0].reserved[1] = 0; - entries[0].reserved[2] = 0; - - // Schema: 1 array argument (uint8), 1 array result (uint8) - entries[0].schema.num_args = 1; - entries[0].schema.num_results = 1; - entries[0].schema.reserved = 0; - entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; - entries[0].schema.args[0].reserved[0] = 0; - entries[0].schema.args[0].reserved[1] = 0; - entries[0].schema.args[0].reserved[2] = 0; - entries[0].schema.args[0].size_bytes = 0; // Variable size - entries[0].schema.args[0].num_elements = 0; // Variable size - entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; - entries[0].schema.results[0].reserved[0] = 0; - entries[0].schema.results[0].reserved[1] = 0; - entries[0].schema.results[0].reserved[2] = 0; - entries[0].schema.results[0].size_bytes = 0; // Variable size - entries[0].schema.results[0].num_elements = 0; // Variable size - } -} - -bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, - volatile uint64_t **host_flags_out, - volatile uint64_t **device_flags_out, - std::uint8_t **host_data_out, - std::uint8_t **device_data_out) { - void *host_flags_ptr = nullptr; - cudaError_t err = cudaHostAlloc(&host_flags_ptr, num_slots * sizeof(uint64_t), - cudaHostAllocMapped); - if (err != cudaSuccess) - return false; - - void *device_flags_ptr = nullptr; - err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - return false; - } - - void *host_data_ptr = nullptr; - err = - cudaHostAlloc(&host_data_ptr, num_slots * slot_size, cudaHostAllocMapped); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - return false; - } - - void *device_data_ptr = nullptr; - err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - cudaFreeHost(host_data_ptr); - return false; - } - - memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); - - *host_flags_out = static_cast(host_flags_ptr); - *device_flags_out = static_cast(device_flags_ptr); - *host_data_out = static_cast(host_data_ptr); - *device_data_out = static_cast(device_data_ptr); - return true; -} - -void free_ring_buffer(volatile uint64_t *host_flags, std::uint8_t *host_data) { - if (host_flags) - cudaFreeHost(const_cast(host_flags)); - if (host_data) - cudaFreeHost(host_data); -} - -extern "C" void launch_dispatch_kernel_wrapper( - volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, - std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, - std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, - std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, - std::size_t num_slots, std::uint32_t num_blocks, - std::uint32_t threads_per_block, cudaStream_t stream) { - cudaq_launch_dispatch_kernel_regular( - rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, - function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, - threads_per_block, stream); -} - -//============================================================================== -// Test Kernel for DeviceCallMode -//============================================================================== - -using HandlerFunc = int (*)(const void *, void *, std::uint32_t, std::uint32_t, - std::uint32_t *); - -__device__ HandlerFunc d_increment_handler = increment_handler; - -/// @brief Test kernel that dispatches to a handler using DeviceCallMode. -template -__global__ void test_dispatch_kernel(HandlerFunc handler, const void *input, - void *output, std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t *result_len, int *status) { - - if (threadIdx.x == 0 && blockIdx.x == 0) { - *status = handler(input, output, arg_len, max_result_len, result_len); - } - - KernelType::sync(); -} - -//============================================================================== -// Test Fixture -//============================================================================== - -class DispatchKernelTest : public ::testing::Test { -protected: - void SetUp() override { - CUDA_CHECK(cudaMalloc(&d_buffer_, 1024)); - CUDA_CHECK(cudaMalloc(&d_result_len_, sizeof(std::uint32_t))); - CUDA_CHECK(cudaMalloc(&d_status_, sizeof(int))); - } - - void TearDown() override { - if (d_buffer_) - cudaFree(d_buffer_); - if (d_result_len_) - cudaFree(d_result_len_); - if (d_status_) - cudaFree(d_status_); - } - - void *d_buffer_ = nullptr; - std::uint32_t *d_result_len_ = nullptr; - int *d_status_ = nullptr; -}; - -//============================================================================== -// Tests -//============================================================================== - -TEST_F(DispatchKernelTest, IncrementHandlerBasic) { - // Prepare test data - separate input and output buffers - std::vector input = {0, 1, 2, 3, 4}; - std::vector expected = {1, 2, 3, 4, 5}; - - void *d_input = nullptr; - CUDA_CHECK(cudaMalloc(&d_input, 1024)); - CUDA_CHECK( - cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); - - // Get device function pointer - HandlerFunc h_handler; - CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, - sizeof(HandlerFunc))); - - // Launch kernel with separate input/output buffers - test_dispatch_kernel - <<<1, 32>>>(h_handler, d_input, d_buffer_, input.size(), 1024, - d_result_len_, d_status_); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaDeviceSynchronize()); - - // Check results - int status; - std::uint32_t result_len; - CUDA_CHECK( - cudaMemcpy(&status, d_status_, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), - cudaMemcpyDeviceToHost)); - - EXPECT_EQ(status, 0) << "Handler should return success"; - EXPECT_EQ(result_len, input.size()) << "Result length should match input"; - - // Verify output buffer has incremented data - std::vector output(input.size()); - CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), - cudaMemcpyDeviceToHost)); - EXPECT_EQ(expected, output) << "Increment handler should add 1 to each byte"; - - // Verify input buffer is unchanged - std::vector input_readback(input.size()); - CUDA_CHECK(cudaMemcpy(input_readback.data(), d_input, input.size(), - cudaMemcpyDeviceToHost)); - EXPECT_EQ(input, input_readback) << "Input buffer should be unchanged"; - - cudaFree(d_input); -} - -TEST_F(DispatchKernelTest, LargeBuffer) { - // Test with larger data - separate input/output buffers - const std::size_t size = 512; - std::vector input(size); - for (std::size_t i = 0; i < size; ++i) { - input[i] = static_cast(i & 0xFF); - } - - void *d_input = nullptr; - CUDA_CHECK(cudaMalloc(&d_input, 1024)); - CUDA_CHECK( - cudaMemcpy(d_input, input.data(), input.size(), cudaMemcpyHostToDevice)); - - HandlerFunc h_handler; - CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, - sizeof(HandlerFunc))); - - test_dispatch_kernel - <<<1, 256>>>(h_handler, d_input, d_buffer_, input.size(), 1024, - d_result_len_, d_status_); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaDeviceSynchronize()); - - std::uint32_t result_len; - CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), - cudaMemcpyDeviceToHost)); - EXPECT_EQ(result_len, size) << "Should process all bytes"; - - // Verify all bytes incremented in output buffer - std::vector output(size); - CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), - cudaMemcpyDeviceToHost)); - - for (std::size_t i = 0; i < size; ++i) { - uint8_t expected = static_cast((i + 1) & 0xFF); - EXPECT_EQ(output[i], expected) << "Mismatch at index " << i; - } - - cudaFree(d_input); -} - -class HostApiDispatchTest : public ::testing::Test { -protected: - void SetUp() override { - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, - &rx_flags_, &rx_data_host_, &rx_data_)); - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, - &tx_flags_, &tx_data_host_, &tx_data_)); - - void *tmp_shutdown = nullptr; - CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); - shutdown_flag_ = static_cast(tmp_shutdown); - void *tmp_d_shutdown = nullptr; - CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); - d_shutdown_flag_ = static_cast(tmp_d_shutdown); - *shutdown_flag_ = 0; - int zero = 0; - CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, - sizeof(int), cudaMemcpyHostToDevice)); - - CUDA_CHECK(cudaMalloc(&d_stats_, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset(d_stats_, 0, sizeof(uint64_t))); - - CUDA_CHECK( - cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); - init_rpc_function_table<<<1, 1>>>(d_function_entries_); - CUDA_CHECK(cudaDeviceSynchronize()); - func_count_ = 1; - - ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); - cudaq_dispatcher_config_t config{}; - config.device_id = 0; - config.num_blocks = 1; - config.threads_per_block = 64; - config.num_slots = static_cast(num_slots_); - config.slot_size = static_cast(slot_size_); - config.vp_id = 0; - config.kernel_type = CUDAQ_KERNEL_REGULAR; - config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), - CUDAQ_OK); - - cudaq_ringbuffer_t ringbuffer{}; - ringbuffer.rx_flags = rx_flags_; - ringbuffer.tx_flags = tx_flags_; - ringbuffer.rx_data = rx_data_; - ringbuffer.tx_data = tx_data_; - ringbuffer.rx_stride_sz = slot_size_; - ringbuffer.tx_stride_sz = slot_size_; - ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), - CUDAQ_OK); - - cudaq_function_table_t table{}; - table.entries = d_function_entries_; - table.count = func_count_; - ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), - CUDAQ_OK); - - ASSERT_EQ( - cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_), - CUDAQ_OK); - ASSERT_EQ(cudaq_dispatcher_set_launch_fn(dispatcher_, - &launch_dispatch_kernel_wrapper), - CUDAQ_OK); - ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); - } - - void TearDown() override { - if (shutdown_flag_) { - *shutdown_flag_ = 1; - __sync_synchronize(); - } - if (dispatcher_) { - cudaq_dispatcher_stop(dispatcher_); - cudaq_dispatcher_destroy(dispatcher_); - dispatcher_ = nullptr; - } - if (manager_) { - cudaq_dispatch_manager_destroy(manager_); - manager_ = nullptr; - } - free_ring_buffer(rx_flags_host_, rx_data_host_); - free_ring_buffer(tx_flags_host_, tx_data_host_); - - if (shutdown_flag_) - cudaFreeHost(const_cast(shutdown_flag_)); - if (d_stats_) - cudaFree(d_stats_); - if (d_function_entries_) - cudaFree(d_function_entries_); - } - - void write_rpc_request(std::size_t slot, - const std::vector &payload) { - std::uint8_t *slot_data = - const_cast(rx_data_host_) + slot * slot_size_; - auto *header = reinterpret_cast(slot_data); - header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; - header->function_id = RPC_INCREMENT_FUNCTION_ID; - header->arg_len = static_cast(payload.size()); - memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader), payload.data(), - payload.size()); - } - - bool read_rpc_response(std::size_t slot, std::vector &payload, - std::int32_t *status_out = nullptr, - std::uint32_t *result_len_out = nullptr) { - __sync_synchronize(); - // Read from TX buffer (dispatch kernel writes response to symmetric TX) - const std::uint8_t *slot_data = - const_cast(tx_data_host_) + slot * slot_size_; - auto *response = - reinterpret_cast(slot_data); - - if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) - return false; - if (status_out) - *status_out = response->status; - if (result_len_out) - *result_len_out = response->result_len; - if (response->status != 0) - return false; - - payload.resize(response->result_len); - memcpy(payload.data(), slot_data + sizeof(cudaq::realtime::RPCResponse), - response->result_len); - return true; - } - - static constexpr std::size_t num_slots_ = 2; - std::size_t slot_size_ = 256; - volatile uint64_t *rx_flags_host_ = nullptr; - volatile uint64_t *tx_flags_host_ = nullptr; - volatile uint64_t *rx_flags_ = nullptr; - volatile uint64_t *tx_flags_ = nullptr; - std::uint8_t *rx_data_host_ = nullptr; - std::uint8_t *tx_data_host_ = nullptr; - std::uint8_t *rx_data_ = nullptr; - std::uint8_t *tx_data_ = nullptr; - - volatile int *shutdown_flag_ = nullptr; - volatile int *d_shutdown_flag_ = nullptr; - uint64_t *d_stats_ = nullptr; - - cudaq_function_entry_t *d_function_entries_ = nullptr; - std::size_t func_count_ = 0; - - cudaq_dispatch_manager_t *manager_ = nullptr; - cudaq_dispatcher_t *dispatcher_ = nullptr; -}; - -TEST_F(HostApiDispatchTest, RpcIncrementHandler) { - std::vector payload = {0, 1, 2, 3}; - write_rpc_request(0, payload); - - __sync_synchronize(); - const_cast(rx_flags_host_)[0] = - reinterpret_cast(rx_data_); - - int timeout = 50; - while (tx_flags_host_[0] == 0 && timeout-- > 0) { - usleep(1000); - } - ASSERT_GT(timeout, 0) << "Timeout waiting for dispatch kernel response"; - - std::vector response; - std::int32_t status = -1; - std::uint32_t result_len = 0; - ASSERT_TRUE(read_rpc_response(0, response, &status, &result_len)); - EXPECT_EQ(status, 0); - ASSERT_EQ(result_len, payload.size()); - - std::vector expected = {1, 2, 3, 4}; - EXPECT_EQ(response, expected); -} - -//============================================================================== -// Graph Launch Test -//============================================================================== - -// Graph kernel that processes RPC buffer via pointer indirection -__global__ void graph_increment_kernel(void **buffer_ptr) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - void *buffer = *buffer_ptr; - cudaq::realtime::RPCHeader *header = - static_cast(buffer); - - std::uint32_t arg_len = header->arg_len; - void *arg_buffer = static_cast(header + 1); - std::uint8_t *data = static_cast(arg_buffer); - - // Increment each byte - for (std::uint32_t i = 0; i < arg_len; ++i) { - data[i] = data[i] + 1; - } - - // Write response - cudaq::realtime::RPCResponse *response = - static_cast(buffer); - response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; - response->status = 0; - response->result_len = arg_len; - } -} - -constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("rpc_graph_increment"); - -__global__ void init_graph_function_table(cudaq_function_entry_t *entries, - cudaGraphExec_t graph_exec) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - entries[0].handler.graph_exec = graph_exec; - entries[0].function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; - entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - entries[0].reserved[0] = 0; - entries[0].reserved[1] = 0; - entries[0].reserved[2] = 0; - } -} - -TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { - // Check compute capability - int device; - CUDA_CHECK(cudaGetDevice(&device)); - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - - if (prop.major < 9) { - GTEST_SKIP() - << "Graph device launch requires compute capability 9.0+, found " - << prop.major << "." << prop.minor; - } - - // Allocate graph buffer pointer (for pointer indirection pattern) - void **d_graph_buffer_ptr; - CUDA_CHECK(cudaMalloc(&d_graph_buffer_ptr, sizeof(void *))); - CUDA_CHECK(cudaMemset(d_graph_buffer_ptr, 0, sizeof(void *))); - - // Allocate test buffer - constexpr size_t buffer_size = 1024; - void *d_buffer; - CUDA_CHECK(cudaMalloc(&d_buffer, buffer_size)); - - // Create the child graph (the one that will be launched from device) - cudaGraph_t child_graph; - cudaGraphExec_t child_graph_exec; - - CUDA_CHECK(cudaGraphCreate(&child_graph, 0)); - - // Add kernel node to child graph - cudaKernelNodeParams kernel_params = {}; - void *kernel_args[] = {&d_graph_buffer_ptr}; - kernel_params.func = reinterpret_cast(&graph_increment_kernel); - kernel_params.gridDim = dim3(1, 1, 1); - kernel_params.blockDim = dim3(32, 1, 1); - kernel_params.sharedMemBytes = 0; - kernel_params.kernelParams = kernel_args; - kernel_params.extra = nullptr; - - cudaGraphNode_t kernel_node; - CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, child_graph, nullptr, 0, - &kernel_params)); - - // Instantiate CHILD graph with DEVICE LAUNCH FLAG - CUDA_CHECK(cudaGraphInstantiate(&child_graph_exec, child_graph, - cudaGraphInstantiateFlagDeviceLaunch)); - - // Create stream for operations - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - // Upload the child graph to device - CUDA_CHECK(cudaGraphUpload(child_graph_exec, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Set up function table with graph launch entry - cudaq_function_entry_t *d_function_entries; - CUDA_CHECK(cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t))); - init_graph_function_table<<<1, 1>>>(d_function_entries, child_graph_exec); - CUDA_CHECK(cudaDeviceSynchronize()); - - // Set up RPC buffer on host - std::uint8_t *h_buffer = new std::uint8_t[buffer_size]; - cudaq::realtime::RPCHeader *h_header = - reinterpret_cast(h_buffer); - h_header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; - h_header->function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; - h_header->arg_len = 4; - - std::uint8_t *h_data = h_buffer + sizeof(cudaq::realtime::RPCHeader); - h_data[0] = 0; - h_data[1] = 1; - h_data[2] = 2; - h_data[3] = 3; - - // Copy to device - CUDA_CHECK( - cudaMemcpy(d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice)); - - // Set up fake RX/TX flags for single-shot test - volatile uint64_t *d_rx_flags; - volatile uint64_t *d_tx_flags; - CUDA_CHECK(cudaMalloc(&d_rx_flags, sizeof(uint64_t))); - CUDA_CHECK(cudaMalloc(&d_tx_flags, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset((void *)d_rx_flags, 0, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset((void *)d_tx_flags, 0, sizeof(uint64_t))); - - // Set RX flag to point to our buffer (simulating incoming RPC) - uint64_t buffer_addr = reinterpret_cast(d_buffer); - CUDA_CHECK(cudaMemcpy((void *)d_rx_flags, &buffer_addr, sizeof(uint64_t), - cudaMemcpyHostToDevice)); - - // Set up shutdown flag using pinned mapped memory so the dispatch kernel - // can see host updates immediately - volatile int *h_shutdown; - volatile int *d_shutdown; - { - void *tmp_shutdown; - CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); - h_shutdown = static_cast(tmp_shutdown); - *h_shutdown = 0; - - void *tmp_d_shutdown; - CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); - d_shutdown = static_cast(tmp_d_shutdown); - } - - // Set up stats - uint64_t *d_stats; - CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); - CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); - - // Create dispatch graph context - THIS WRAPS THE DISPATCH KERNEL IN A GRAPH - // so that device-side cudaGraphLaunch() can work! - cudaq_dispatch_graph_context *dispatch_ctx = nullptr; - cudaError_t err = cudaq_create_dispatch_graph_regular( - d_rx_flags, d_tx_flags, - reinterpret_cast(d_buffer), // rx_data - reinterpret_cast( - d_buffer), // tx_data (same buffer for single-slot test) - buffer_size, // rx_stride_sz - buffer_size, // tx_stride_sz - d_function_entries, 1, d_graph_buffer_ptr, d_shutdown, d_stats, 1, 1, 32, - stream, &dispatch_ctx); - - if (err != cudaSuccess) { - GTEST_SKIP() << "Device-side graph launch not supported: " - << cudaGetErrorString(err) << " (" << err << ")"; - } - - // Launch dispatch graph - now device-side cudaGraphLaunch will work! - CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, stream)); - - // Poll for the response using pinned memory and async operations - // The child graph runs asynchronously (fire-and-forget) so we need to poll - std::uint8_t *h_poll_buffer; - CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::realtime::RPCResponse), - cudaHostAllocDefault)); - memset(h_poll_buffer, 0, sizeof(cudaq::realtime::RPCResponse)); - - cudaStream_t poll_stream; - CUDA_CHECK(cudaStreamCreate(&poll_stream)); - - int timeout_ms = 5000; - int poll_interval_ms = 100; - bool got_response = false; - - for (int elapsed = 0; elapsed < timeout_ms; elapsed += poll_interval_ms) { - CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, - sizeof(cudaq::realtime::RPCResponse), - cudaMemcpyDeviceToHost, poll_stream)); - CUDA_CHECK(cudaStreamSynchronize(poll_stream)); - - cudaq::realtime::RPCResponse *peek = - reinterpret_cast(h_poll_buffer); - if (peek->magic == cudaq::realtime::RPC_MAGIC_RESPONSE) { - got_response = true; - break; - } - - usleep(poll_interval_ms * 1000); - } - - // Signal shutdown to allow kernel to exit - *h_shutdown = 1; - __sync_synchronize(); - usleep(100000); // Give kernel time to see shutdown flag - - // Copy final results - CUDA_CHECK(cudaMemcpyAsync(h_buffer, d_buffer, buffer_size, - cudaMemcpyDeviceToHost, poll_stream)); - CUDA_CHECK(cudaStreamSynchronize(poll_stream)); - - // Clean up poll resources - CUDA_CHECK(cudaStreamDestroy(poll_stream)); - cudaFreeHost(h_poll_buffer); - - // Sync main stream (dispatch kernel should have exited) - CUDA_CHECK(cudaStreamSynchronize(stream)); - - ASSERT_TRUE(got_response) - << "Timeout waiting for device-side graph launch response"; - - // Verify response - cudaq::realtime::RPCResponse *h_response = - reinterpret_cast(h_buffer); - EXPECT_EQ(h_response->magic, cudaq::realtime::RPC_MAGIC_RESPONSE) - << "Expected RPC_MAGIC_RESPONSE, got 0x" << std::hex << h_response->magic; - EXPECT_EQ(h_response->status, 0) << "Handler returned error status"; - EXPECT_EQ(h_response->result_len, 4u) << "Unexpected result length"; - - // Verify data was incremented by graph kernel launched from dispatch kernel - std::uint8_t *h_result = h_buffer + sizeof(cudaq::realtime::RPCResponse); - EXPECT_EQ(h_result[0], 1) << "Expected h_result[0]=1"; - EXPECT_EQ(h_result[1], 2) << "Expected h_result[1]=2"; - EXPECT_EQ(h_result[2], 3) << "Expected h_result[2]=3"; - EXPECT_EQ(h_result[3], 4) << "Expected h_result[3]=4"; - - // Cleanup - delete[] h_buffer; - CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(d_stats)); - CUDA_CHECK(cudaFreeHost(const_cast(h_shutdown))); // Free mapped memory - CUDA_CHECK(cudaFree((void *)d_tx_flags)); - CUDA_CHECK(cudaFree((void *)d_rx_flags)); - CUDA_CHECK(cudaFree(d_function_entries)); - CUDA_CHECK(cudaGraphExecDestroy(child_graph_exec)); - CUDA_CHECK(cudaGraphDestroy(child_graph)); - CUDA_CHECK(cudaFree(d_graph_buffer_ptr)); - CUDA_CHECK(cudaFree(d_buffer)); -} - -} // namespace diff --git a/realtime/unittests/test_host_dispatcher.cu b/realtime/unittests/test_host_dispatcher.cu deleted file mode 100644 index f955554e..00000000 --- a/realtime/unittests/test_host_dispatcher.cu +++ /dev/null @@ -1,1004 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. - * All rights reserved. - * - * This source code and the accompanying materials are made available under - * the terms of the Apache License 2.0 which accompanies this distribution. - ******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" - -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ - } while (0) - -namespace { - -//============================================================================== -// Ring buffer helpers (same pattern as test_dispatch_kernel.cu) -//============================================================================== - -bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, - volatile uint64_t **host_flags_out, - volatile uint64_t **device_flags_out, - std::uint8_t **host_data_out, - std::uint8_t **device_data_out) { - void *host_flags_ptr = nullptr; - cudaError_t err = cudaHostAlloc(&host_flags_ptr, num_slots * sizeof(uint64_t), - cudaHostAllocMapped); - if (err != cudaSuccess) - return false; - - void *device_flags_ptr = nullptr; - err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - return false; - } - - void *host_data_ptr = nullptr; - err = - cudaHostAlloc(&host_data_ptr, num_slots * slot_size, cudaHostAllocMapped); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - return false; - } - - void *device_data_ptr = nullptr; - err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); - if (err != cudaSuccess) { - cudaFreeHost(host_flags_ptr); - cudaFreeHost(host_data_ptr); - return false; - } - - std::memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); - - *host_flags_out = static_cast(host_flags_ptr); - *device_flags_out = static_cast(device_flags_ptr); - *host_data_out = static_cast(host_data_ptr); - *device_data_out = static_cast(device_data_ptr); - return true; -} - -void free_ring_buffer(volatile uint64_t *host_flags, std::uint8_t *host_data) { - if (host_flags) - cudaFreeHost(const_cast(host_flags)); - if (host_data) - cudaFreeHost(host_data); -} - -//============================================================================== -// Minimal graph for dummy GRAPH_LAUNCH entry (so C API starts the host thread) -//============================================================================== - -__global__ void noop_kernel() {} - -// Creates a minimal executable graph and returns it. Caller must destroy with -// cudaGraphExecDestroy and cudaGraphDestroy. -bool create_dummy_graph(cudaGraph_t *graph_out, cudaGraphExec_t *exec_out) { - cudaGraph_t graph = nullptr; - if (cudaGraphCreate(&graph, 0) != cudaSuccess) - return false; - - cudaKernelNodeParams params = {}; - void *args[] = {}; - params.func = reinterpret_cast(noop_kernel); - params.gridDim = dim3(1, 1, 1); - params.blockDim = dim3(1, 1, 1); - params.sharedMemBytes = 0; - params.kernelParams = args; - params.extra = nullptr; - - cudaGraphNode_t node = nullptr; - if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != - cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - cudaGraphExec_t exec = nullptr; - if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - *graph_out = graph; - *exec_out = exec; - return true; -} - -//============================================================================== -// Graph launch test: kernel that reads slot from mailbox and writes response -// in-place (same buffer as request; use single ring buffer for rx/tx). -//============================================================================== - -__global__ void graph_increment_kernel(void **mailbox_slot_ptr) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - void *buffer = *mailbox_slot_ptr; - cudaq::realtime::RPCHeader *header = - static_cast(buffer); - std::uint32_t arg_len = header->arg_len; - void *arg_buffer = static_cast(header + 1); - std::uint8_t *data = static_cast(arg_buffer); - for (std::uint32_t i = 0; i < arg_len; ++i) - data[i] = data[i] + 1; - cudaq::realtime::RPCResponse *response = - static_cast(buffer); - response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; - response->status = 0; - response->result_len = arg_len; - } -} - -constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("rpc_graph_increment"); - -/// Creates an executable graph that runs graph_increment_kernel with -/// kernel arg = d_mailbox_bank (device pointer to first mailbox slot). -/// Caller must cudaGraphExecDestroy / cudaGraphDestroy. -bool create_increment_graph(void **d_mailbox_bank, cudaGraph_t *graph_out, - cudaGraphExec_t *exec_out) { - cudaGraph_t graph = nullptr; - if (cudaGraphCreate(&graph, 0) != cudaSuccess) - return false; - - // kernelParams[i] must be a *pointer to* the i-th argument value. - // The kernel takes void** so we pass &d_mailbox_bank (a void***). - cudaKernelNodeParams params = {}; - void *kernel_args[] = {&d_mailbox_bank}; - params.func = reinterpret_cast(graph_increment_kernel); - params.gridDim = dim3(1, 1, 1); - params.blockDim = dim3(32, 1, 1); - params.sharedMemBytes = 0; - params.kernelParams = kernel_args; - params.extra = nullptr; - - cudaGraphNode_t node = nullptr; - if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != - cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - cudaGraphExec_t exec = nullptr; - if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - *graph_out = graph; - *exec_out = exec; - return true; -} - -//============================================================================== -// Graph launch test: kernel that reads slot from mailbox and doubles payload -// in-place (for function_id routing differentiation vs increment kernel). -//============================================================================== - -__global__ void graph_double_kernel(void **mailbox_slot_ptr) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - void *buffer = *mailbox_slot_ptr; - cudaq::realtime::RPCHeader *header = - static_cast(buffer); - std::uint32_t arg_len = header->arg_len; - void *arg_buffer = static_cast(header + 1); - std::uint8_t *data = static_cast(arg_buffer); - for (std::uint32_t i = 0; i < arg_len; ++i) - data[i] = data[i] * 2; - cudaq::realtime::RPCResponse *response = - static_cast(buffer); - response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; - response->status = 0; - response->result_len = arg_len; - } -} - -constexpr std::uint32_t RPC_GRAPH_DOUBLE_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("rpc_graph_double"); - -bool create_double_graph(void **d_mailbox_slot, cudaGraph_t *graph_out, - cudaGraphExec_t *exec_out) { - cudaGraph_t graph = nullptr; - if (cudaGraphCreate(&graph, 0) != cudaSuccess) - return false; - - cudaKernelNodeParams params = {}; - void *kernel_args[] = {&d_mailbox_slot}; - params.func = reinterpret_cast(graph_double_kernel); - params.gridDim = dim3(1, 1, 1); - params.blockDim = dim3(32, 1, 1); - params.sharedMemBytes = 0; - params.kernelParams = kernel_args; - params.extra = nullptr; - - cudaGraphNode_t node = nullptr; - if (cudaGraphAddKernelNode(&node, graph, nullptr, 0, ¶ms) != - cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - cudaGraphExec_t exec = nullptr; - if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) != cudaSuccess) { - cudaGraphDestroy(graph); - return false; - } - - *graph_out = graph; - *exec_out = exec; - return true; -} - -//============================================================================== -// Test fixture: drives host_dispatcher_loop directly (not C API) for full -// control over idle_mask, enabling worker recycling and backpressure tests. -//============================================================================== - -static constexpr std::size_t kMaxWorkers = 8; - -class HostDispatcherLoopTest : public ::testing::Test { -protected: - void SetUp() override { - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, - &rx_flags_dev_, &rx_data_host_, - &rx_data_dev_)); - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, - &tx_flags_dev_, &tx_data_host_, - &tx_data_dev_)); - - CUDA_CHECK(cudaHostAlloc(&h_mailbox_bank_, kMaxWorkers * sizeof(void *), - cudaHostAllocMapped)); - std::memset(h_mailbox_bank_, 0, kMaxWorkers * sizeof(void *)); - CUDA_CHECK(cudaHostGetDevicePointer( - reinterpret_cast(&d_mailbox_bank_), h_mailbox_bank_, 0)); - - idle_mask_ = new cudaq::realtime::atomic_uint64_sys(0); - live_dispatched_ = new cudaq::realtime::atomic_uint64_sys(0); - inflight_slot_tags_ = new int[kMaxWorkers](); - shutdown_flag_ = new cudaq::realtime::atomic_int_sys(0); - stats_counter_ = 0; - - function_table_ = new cudaq_function_entry_t[kMaxWorkers]; - std::memset(function_table_, 0, - kMaxWorkers * sizeof(cudaq_function_entry_t)); - - std::memset(&ringbuffer_, 0, sizeof(ringbuffer_)); - ringbuffer_.rx_flags = rx_flags_dev_; - ringbuffer_.tx_flags = tx_flags_dev_; - ringbuffer_.rx_data = rx_data_dev_; - ringbuffer_.tx_data = tx_data_dev_; - ringbuffer_.rx_stride_sz = slot_size_; - ringbuffer_.tx_stride_sz = slot_size_; - ringbuffer_.rx_flags_host = rx_flags_host_; - ringbuffer_.tx_flags_host = tx_flags_host_; - ringbuffer_.rx_data_host = rx_data_host_; - ringbuffer_.tx_data_host = tx_data_host_; - } - - void TearDown() override { - if (!loop_stopped_) { - shutdown_flag_->store(1, cuda::std::memory_order_release); - __sync_synchronize(); - if (loop_thread_.joinable()) - loop_thread_.join(); - } - - for (auto &w : worker_info_) { - if (w.stream) - cudaStreamDestroy(w.stream); - if (w.graph_exec) - cudaGraphExecDestroy(w.graph_exec); - if (w.graph) - cudaGraphDestroy(w.graph); - } - - free_ring_buffer(rx_flags_host_, rx_data_host_); - free_ring_buffer(tx_flags_host_, tx_data_host_); - if (h_mailbox_bank_) - cudaFreeHost(h_mailbox_bank_); - delete idle_mask_; - delete live_dispatched_; - delete[] inflight_slot_tags_; - delete shutdown_flag_; - delete[] function_table_; - } - - struct WorkerInfo { - cudaGraphExec_t graph_exec = nullptr; - cudaGraph_t graph = nullptr; - cudaStream_t stream = nullptr; - }; - - void AddWorker(std::uint32_t function_id, cudaGraphExec_t exec, - cudaGraph_t graph) { - cudaStream_t stream = nullptr; - ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); - - cudaq::realtime::HostDispatchWorker w; - w.graph_exec = exec; - w.stream = stream; - w.function_id = function_id; - workers_.push_back(w); - worker_info_.push_back({exec, graph, stream}); - - std::size_t idx = function_table_count_; - function_table_[idx].handler.graph_exec = exec; - function_table_[idx].function_id = function_id; - function_table_[idx].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - function_table_count_++; - } - - void StartLoop() { - idle_mask_->store((1ULL << workers_.size()) - 1, - cuda::std::memory_order_release); - - config_.rx_flags = reinterpret_cast( - const_cast(rx_flags_host_)); - config_.tx_flags = reinterpret_cast( - const_cast(tx_flags_host_)); - config_.rx_data_host = rx_data_host_; - config_.rx_data_dev = rx_data_dev_; - config_.tx_data_host = tx_data_host_; - config_.tx_data_dev = tx_data_dev_; - config_.tx_stride_sz = slot_size_; - config_.h_mailbox_bank = h_mailbox_bank_; - config_.num_slots = num_slots_; - config_.slot_size = slot_size_; - config_.workers = workers_; - config_.function_table = function_table_; - config_.function_table_count = function_table_count_; - config_.shutdown_flag = shutdown_flag_; - config_.stats_counter = &stats_counter_; - config_.live_dispatched = live_dispatched_; - config_.idle_mask = idle_mask_; - config_.inflight_slot_tags = inflight_slot_tags_; - - loop_thread_ = std::thread(cudaq::realtime::host_dispatcher_loop, config_); - } - - void WriteRpcRequest(std::size_t slot, std::uint32_t function_id, - const std::uint8_t *payload, std::size_t len) { - ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( - &ringbuffer_, static_cast(slot), function_id, - payload, static_cast(len)), - CUDAQ_OK); - } - - void SignalSlot(std::size_t slot) { - cudaq_host_ringbuffer_signal_slot(&ringbuffer_, - static_cast(slot)); - } - - bool PollTxFlag(std::size_t slot, int timeout_ms = 2000) { - for (int waited = 0; waited < timeout_ms * 1000; waited += 200) { - cudaq_tx_status_t st = cudaq_host_ringbuffer_poll_tx_flag( - &ringbuffer_, static_cast(slot), nullptr); - if (st != CUDAQ_TX_EMPTY) - return true; - usleep(200); - } - return cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, - static_cast(slot), - nullptr) != CUDAQ_TX_EMPTY; - } - - void StopLoop() { - shutdown_flag_->store(1, cuda::std::memory_order_release); - __sync_synchronize(); - if (loop_thread_.joinable()) - loop_thread_.join(); - loop_stopped_ = true; - } - - void RestoreWorker(int worker_id) { - idle_mask_->fetch_or(1ULL << worker_id, cuda::std::memory_order_release); - } - - void ClearSlot(std::size_t slot) { - cudaq_host_ringbuffer_clear_slot(&ringbuffer_, static_cast(slot)); - std::memset(rx_data_host_ + slot * slot_size_, 0, slot_size_); - } - - void VerifyResponse(std::size_t slot, const std::uint8_t *expected, - std::size_t len) { - int cuda_err = 0; - cudaq_tx_status_t st = cudaq_host_ringbuffer_poll_tx_flag( - &ringbuffer_, static_cast(slot), &cuda_err); - ASSERT_EQ(st, CUDAQ_TX_READY) - << "slot " << slot << ": tx_flag not READY (status=" << st - << " cuda_err=" << cuda_err << ")"; - - std::uint8_t *slot_data = rx_data_host_ + slot * slot_size_; - auto *resp = reinterpret_cast(slot_data); - ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) - << "slot " << slot << ": expected response magic"; - ASSERT_EQ(resp->status, 0) << "slot " << slot << ": non-zero status"; - ASSERT_EQ(resp->result_len, static_cast(len)) - << "slot " << slot << ": wrong result_len"; - std::uint8_t *result = slot_data + sizeof(cudaq::realtime::RPCResponse); - for (std::size_t i = 0; i < len; ++i) { - EXPECT_EQ(result[i], expected[i]) << "slot " << slot << " byte " << i; - } - } - - std::size_t num_slots_ = 4; - std::size_t slot_size_ = 256; - - volatile uint64_t *rx_flags_host_ = nullptr; - volatile uint64_t *tx_flags_host_ = nullptr; - volatile uint64_t *rx_flags_dev_ = nullptr; - volatile uint64_t *tx_flags_dev_ = nullptr; - std::uint8_t *rx_data_host_ = nullptr; - std::uint8_t *tx_data_host_ = nullptr; - std::uint8_t *rx_data_dev_ = nullptr; - std::uint8_t *tx_data_dev_ = nullptr; - - void **h_mailbox_bank_ = nullptr; - void **d_mailbox_bank_ = nullptr; - - cudaq::realtime::atomic_uint64_sys *idle_mask_ = nullptr; - cudaq::realtime::atomic_uint64_sys *live_dispatched_ = nullptr; - int *inflight_slot_tags_ = nullptr; - cudaq::realtime::atomic_int_sys *shutdown_flag_ = nullptr; - uint64_t stats_counter_ = 0; - bool loop_stopped_ = false; - - cudaq_function_entry_t *function_table_ = nullptr; - std::size_t function_table_count_ = 0; - std::vector workers_; - std::vector worker_info_; - - cudaq_ringbuffer_t ringbuffer_{}; - cudaq::realtime::HostDispatcherConfig config_{}; - std::thread loop_thread_; -}; - -//============================================================================== -// Test 1: Smoke test — host loop starts and drops slot with unknown function_id -//============================================================================== - -constexpr std::uint32_t DUMMY_GRAPH_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("dummy_graph"); -// Use a different function_id in the slot so the host loop does not find it. -constexpr std::uint32_t UNKNOWN_FUNCTION_ID = 0xdeadbeefu; - -class HostDispatcherSmokeTest : public ::testing::Test { -protected: - void SetUp() override { - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, - &rx_flags_, &rx_data_host_, &rx_data_)); - ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, - &tx_flags_, &tx_data_host_, &tx_data_)); - - shutdown_flag_ = new (std::nothrow) int(0); - stats_ = new (std::nothrow) uint64_t(0); - ASSERT_NE(shutdown_flag_, nullptr); - ASSERT_NE(stats_, nullptr); - - ASSERT_TRUE(create_dummy_graph(&dummy_graph_, &dummy_graph_exec_)); - - host_table_ = new (std::nothrow) cudaq_function_entry_t[1]; - ASSERT_NE(host_table_, nullptr); - std::memset(host_table_, 0, sizeof(cudaq_function_entry_t)); - host_table_[0].handler.graph_exec = dummy_graph_exec_; - host_table_[0].function_id = DUMMY_GRAPH_FUNCTION_ID; - host_table_[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - - ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); - cudaq_dispatcher_config_t config{}; - config.device_id = 0; - config.num_slots = static_cast(num_slots_); - config.slot_size = static_cast(slot_size_); - config.backend = CUDAQ_BACKEND_HOST_LOOP; - ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), - CUDAQ_OK); - - std::memset(&ringbuffer_, 0, sizeof(ringbuffer_)); - ringbuffer_.rx_flags = rx_flags_; - ringbuffer_.tx_flags = tx_flags_; - ringbuffer_.rx_data = rx_data_; - ringbuffer_.tx_data = tx_data_; - ringbuffer_.rx_stride_sz = slot_size_; - ringbuffer_.tx_stride_sz = slot_size_; - ringbuffer_.rx_flags_host = rx_flags_host_; - ringbuffer_.tx_flags_host = tx_flags_host_; - ringbuffer_.rx_data_host = rx_data_host_; - ringbuffer_.tx_data_host = tx_data_host_; - ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer_), - CUDAQ_OK); - - cudaq_function_table_t table{}; - table.entries = host_table_; - table.count = 1; - ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), - CUDAQ_OK); - - ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher_, shutdown_flag_, stats_), - CUDAQ_OK); - ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); - } - - void TearDown() override { - if (shutdown_flag_) { - *shutdown_flag_ = 1; - __sync_synchronize(); - } - if (dispatcher_) { - cudaq_dispatcher_stop(dispatcher_); - cudaq_dispatcher_destroy(dispatcher_); - dispatcher_ = nullptr; - } - if (manager_) { - cudaq_dispatch_manager_destroy(manager_); - manager_ = nullptr; - } - free_ring_buffer(rx_flags_host_, rx_data_host_); - free_ring_buffer(tx_flags_host_, tx_data_host_); - if (shutdown_flag_) - delete shutdown_flag_; - if (stats_) - delete stats_; - if (host_table_) - delete[] host_table_; - if (dummy_graph_exec_) - cudaGraphExecDestroy(dummy_graph_exec_); - if (dummy_graph_) - cudaGraphDestroy(dummy_graph_); - } - - void write_rpc_request_unknown_function(std::size_t slot) { - const std::uint8_t payload[] = {0, 1, 2, 3}; - ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( - &ringbuffer_, static_cast(slot), - UNKNOWN_FUNCTION_ID, payload, 4), - CUDAQ_OK); - } - - static constexpr std::size_t num_slots_ = 2; - std::size_t slot_size_ = 256; - - volatile uint64_t *rx_flags_host_ = nullptr; - volatile uint64_t *tx_flags_host_ = nullptr; - volatile uint64_t *rx_flags_ = nullptr; - volatile uint64_t *tx_flags_ = nullptr; - std::uint8_t *rx_data_host_ = nullptr; - std::uint8_t *tx_data_host_ = nullptr; - std::uint8_t *rx_data_ = nullptr; - std::uint8_t *tx_data_ = nullptr; - - int *shutdown_flag_ = nullptr; - uint64_t *stats_ = nullptr; - cudaq_function_entry_t *host_table_ = nullptr; - cudaGraph_t dummy_graph_ = nullptr; - cudaGraphExec_t dummy_graph_exec_ = nullptr; - - cudaq_ringbuffer_t ringbuffer_{}; - cudaq_dispatch_manager_t *manager_ = nullptr; - cudaq_dispatcher_t *dispatcher_ = nullptr; -}; - -TEST_F(HostDispatcherSmokeTest, DropsSlotWithUnknownFunctionId) { - write_rpc_request_unknown_function(0); - cudaq_host_ringbuffer_signal_slot(&ringbuffer_, 0); - - for (int i = 0; i < 50; ++i) { - usleep(1000); - cudaq_tx_status_t st = - cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, 0, nullptr); - if (st != CUDAQ_TX_EMPTY) - break; - } - - cudaq_tx_status_t final_st = - cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer_, 0, nullptr); - EXPECT_EQ(final_st, CUDAQ_TX_EMPTY) - << "Host loop should drop slot with unknown function_id (no response)"; -} - -//============================================================================== -// Test 2: GRAPH_LAUNCH via host loop (full RPC round-trip) using the C API -// -// End-to-end test of: RPC in ring buffer → C API dispatcher → CUDA graph -// launch via pinned mailbox → in-place response. -// -// Flow: -// 1. Allocate pinned ring buffers and pinned mailbox (cudaHostAllocMapped). -// 2. Capture graph_increment_kernel with d_mailbox_bank baked in. -// 3. Build function table with one GRAPH_LAUNCH entry. -// 4. Wire the C API: manager → dispatcher → ringbuffer, function table, -// control, mailbox → start. -// 5. Write an RPC request {0,1,2,3} into slot 0 and signal rx_flags. -// 6. The dispatcher picks up the slot, matches function_id → GRAPH_LAUNCH, -// acquires the idle worker, writes the slot device pointer into the -// pinned mailbox, and launches the graph. -// 7. The graph reads the slot pointer from the mailbox, increments each -// payload byte, and writes an RPCResponse header in-place. -// 8. Test polls tx_flags, syncs device, then asserts the response is -// {1,2,3,4} with correct magic/status/result_len. -//============================================================================== - -TEST(HostDispatcherGraphLaunchTest, FullRpcRoundTripViaPinnedMailbox) { - constexpr std::size_t num_slots = 2; - constexpr std::size_t slot_size = 256; - - // --- Ring buffers --- - // Separate flag arrays for RX and TX: the dispatcher clears rx_flags[slot] - // right after setting tx_flags[slot], so sharing would clobber the signal. - // Data buffers are shared (graph writes response in-place to the RX slot). - volatile uint64_t *rx_flags_host = nullptr; - volatile uint64_t *rx_flags_dev = nullptr; - std::uint8_t *rx_data_host = nullptr; - std::uint8_t *rx_data_dev = nullptr; - volatile uint64_t *tx_flags_host = nullptr; - volatile uint64_t *tx_flags_dev = nullptr; - std::uint8_t *tx_data_host_unused = nullptr; - std::uint8_t *tx_data_dev_unused = nullptr; - - ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &rx_flags_host, - &rx_flags_dev, &rx_data_host, &rx_data_dev)); - ASSERT_TRUE(allocate_ring_buffer(num_slots, slot_size, &tx_flags_host, - &tx_flags_dev, &tx_data_host_unused, - &tx_data_dev_unused)); - - // --- Pinned mailbox --- - // cudaHostAllocMapped gives us host + device views of the same memory. - // The host dispatcher writes the slot device pointer to h_mailbox_bank[0]; - // the graph reads it from d_mailbox_bank[0] (same physical location). - void **h_mailbox_bank = nullptr; - void **d_mailbox_bank = nullptr; - CUDA_CHECK( - cudaHostAlloc(&h_mailbox_bank, sizeof(void *), cudaHostAllocMapped)); - std::memset(h_mailbox_bank, 0, sizeof(void *)); - CUDA_CHECK( - cudaHostGetDevicePointer((void **)&d_mailbox_bank, h_mailbox_bank, 0)); - - // --- Graph --- - // Capture graph_increment_kernel with d_mailbox_bank baked in as the - // kernel arg. At runtime the kernel reads *d_mailbox_bank to find - // the slot, so different slots can be processed on each launch. - cudaGraph_t graph = nullptr; - cudaGraphExec_t graph_exec = nullptr; - ASSERT_TRUE(create_increment_graph(d_mailbox_bank, &graph, &graph_exec)); - - // --- Function table (one GRAPH_LAUNCH entry) --- - cudaq_function_entry_t host_table[1]; - std::memset(host_table, 0, sizeof(host_table)); - host_table[0].function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; - host_table[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; - host_table[0].handler.graph_exec = graph_exec; - - // --- C API: create manager + dispatcher --- - cudaq_dispatch_manager_t *manager = nullptr; - ASSERT_EQ(cudaq_dispatch_manager_create(&manager), CUDAQ_OK); - - cudaq_dispatcher_config_t disp_config{}; - disp_config.device_id = 0; - disp_config.num_slots = static_cast(num_slots); - disp_config.slot_size = static_cast(slot_size); - disp_config.backend = CUDAQ_BACKEND_HOST_LOOP; - - cudaq_dispatcher_t *dispatcher = nullptr; - ASSERT_EQ(cudaq_dispatcher_create(manager, &disp_config, &dispatcher), - CUDAQ_OK); - - // --- Wire ring buffer (rx/tx flags separate, data shared for in-place) --- - cudaq_ringbuffer_t ringbuffer{}; - ringbuffer.rx_flags = rx_flags_dev; - ringbuffer.tx_flags = tx_flags_dev; - ringbuffer.rx_data = rx_data_dev; - ringbuffer.tx_data = rx_data_dev; - ringbuffer.rx_stride_sz = slot_size; - ringbuffer.tx_stride_sz = slot_size; - ringbuffer.rx_flags_host = rx_flags_host; - ringbuffer.tx_flags_host = tx_flags_host; - ringbuffer.rx_data_host = rx_data_host; - ringbuffer.tx_data_host = rx_data_host; - ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer), CUDAQ_OK); - - cudaq_function_table_t table{}; - table.entries = host_table; - table.count = 1; - ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher, &table), CUDAQ_OK); - - int shutdown_flag = 0; - uint64_t stats_counter = 0; - ASSERT_EQ( - cudaq_dispatcher_set_control(dispatcher, &shutdown_flag, &stats_counter), - CUDAQ_OK); - - // Provide the caller-allocated pinned mailbox so the dispatcher uses it - // instead of allocating plain host memory (which the graph can't read). - ASSERT_EQ(cudaq_dispatcher_set_mailbox(dispatcher, h_mailbox_bank), CUDAQ_OK); - - // --- Start --- - ASSERT_EQ(cudaq_dispatcher_start(dispatcher), CUDAQ_OK); - - // --- Send RPC request (simulates FPGA / producer) --- - const std::uint8_t payload[] = {0, 1, 2, 3}; - ASSERT_EQ(cudaq_host_ringbuffer_write_rpc_request( - &ringbuffer, 0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4), - CUDAQ_OK); - cudaq_host_ringbuffer_signal_slot(&ringbuffer, 0); - - // --- Verify: dispatcher picked up slot and launched graph --- - int cuda_err = 0; - cudaq_tx_status_t st = CUDAQ_TX_EMPTY; - for (int i = 0; i < 5000 && st == CUDAQ_TX_EMPTY; ++i) { - usleep(200); - st = cudaq_host_ringbuffer_poll_tx_flag(&ringbuffer, 0, &cuda_err); - } - ASSERT_NE(st, CUDAQ_TX_EMPTY) << "Timeout waiting for tx flag"; - ASSERT_NE(st, CUDAQ_TX_ERROR) - << "Dispatcher reported graph launch error (cuda_err=" << cuda_err << ")"; - - // cudaGraphLaunch is async; sync device so the in-place response is visible - CUDA_CHECK(cudaDeviceSynchronize()); - - // --- Verify: graph wrote correct response in-place --- - std::uint8_t *slot_data = rx_data_host + 0 * slot_size; - auto *resp = reinterpret_cast(slot_data); - ASSERT_EQ(resp->magic, CUDAQ_RPC_MAGIC_RESPONSE) - << "Expected response magic (graph in-place write)"; - ASSERT_EQ(resp->status, 0); - ASSERT_EQ(resp->result_len, 4u); - std::uint8_t *result = slot_data + sizeof(cudaq::realtime::RPCResponse); - EXPECT_EQ(result[0], 1); - EXPECT_EQ(result[1], 2); - EXPECT_EQ(result[2], 3); - EXPECT_EQ(result[3], 4); - - // --- Teardown (C API handles thread join) --- - shutdown_flag = 1; - __sync_synchronize(); - cudaq_dispatcher_stop(dispatcher); - cudaq_dispatcher_destroy(dispatcher); - cudaq_dispatch_manager_destroy(manager); - - cudaGraphExecDestroy(graph_exec); - cudaGraphDestroy(graph); - cudaFreeHost(h_mailbox_bank); - free_ring_buffer(rx_flags_host, rx_data_host); - free_ring_buffer(tx_flags_host, tx_data_host_unused); -} - -//============================================================================== -// Test 3: Multiple workers with function_id routing (internal API) -// -// Two workers: worker 0 runs graph_increment_kernel (func_id A), -// worker 1 runs graph_double_kernel (func_id B). Sends one RPC per worker -// and verifies each graph produced the expected output, confirming the -// dispatcher routed by function_id. -//============================================================================== - -TEST_F(HostDispatcherLoopTest, MultiWorkerFunctionIdRouting) { - cudaGraph_t inc_graph = nullptr; - cudaGraphExec_t inc_exec = nullptr; - ASSERT_TRUE( - create_increment_graph(d_mailbox_bank_ + 0, &inc_graph, &inc_exec)); - AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, inc_exec, inc_graph); - - cudaGraph_t dbl_graph = nullptr; - cudaGraphExec_t dbl_exec = nullptr; - ASSERT_TRUE(create_double_graph(d_mailbox_bank_ + 1, &dbl_graph, &dbl_exec)); - AddWorker(RPC_GRAPH_DOUBLE_FUNCTION_ID, dbl_exec, dbl_graph); - - StartLoop(); - - const std::uint8_t payload[] = {1, 2, 3, 4}; - WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); - WriteRpcRequest(1, RPC_GRAPH_DOUBLE_FUNCTION_ID, payload, 4); - SignalSlot(0); - SignalSlot(1); - - ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0 (increment)"; - ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 (double)"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - const std::uint8_t expected_inc[] = {2, 3, 4, 5}; - const std::uint8_t expected_dbl[] = {2, 4, 6, 8}; - VerifyResponse(0, expected_inc, 4); - VerifyResponse(1, expected_dbl, 4); -} - -//============================================================================== -// Test 4: Worker recycling — idle_mask round-trip (internal API) -// -// One worker, two sequential RPCs to the same slot. The second dispatch -// can only proceed after the test restores idle_mask (simulating the -// external worker thread that returns the worker to the pool). -//============================================================================== - -TEST_F(HostDispatcherLoopTest, WorkerRecycling) { - cudaGraph_t graph = nullptr; - cudaGraphExec_t exec = nullptr; - ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); - AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); - - StartLoop(); - - // RPC 1 on slot 0 — after dispatch, current_slot advances to 1. - const std::uint8_t payload1[] = {0, 1, 2, 3}; - WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload1, 4); - SignalSlot(0); - ASSERT_TRUE(PollTxFlag(0)) << "Timeout on first RPC"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - const std::uint8_t expected1[] = {1, 2, 3, 4}; - VerifyResponse(0, expected1, 4); - - RestoreWorker(0); - - // RPC 2 on slot 1 — the dispatcher is now polling slot 1. - // This can only dispatch if idle_mask was properly restored above. - const std::uint8_t payload2[] = {10, 11, 12, 13}; - WriteRpcRequest(1, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload2, 4); - SignalSlot(1); - ASSERT_TRUE(PollTxFlag(1)) << "Timeout on second RPC (worker not recycled?)"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - const std::uint8_t expected2[] = {11, 12, 13, 14}; - VerifyResponse(1, expected2, 4); -} - -//============================================================================== -// Test 5: Backpressure — dispatcher stalls when all workers are busy -// -// One worker, two slots signalled simultaneously. Slot 0 dispatches -// immediately; slot 1 stalls until the test restores idle_mask. -//============================================================================== - -TEST_F(HostDispatcherLoopTest, BackpressureWhenAllBusy) { - cudaGraph_t graph = nullptr; - cudaGraphExec_t exec = nullptr; - ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); - AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); - - StartLoop(); - - const std::uint8_t payload0[] = {0, 1, 2, 3}; - const std::uint8_t payload1[] = {10, 11, 12, 13}; - WriteRpcRequest(0, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload0, 4); - WriteRpcRequest(1, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload1, 4); - SignalSlot(0); - SignalSlot(1); - - ASSERT_TRUE(PollTxFlag(0)) << "Timeout on slot 0"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - // Slot 1 should still be pending — worker is busy. - EXPECT_EQ(tx_flags_host_[1], 0u) - << "Slot 1 should stall while worker is busy"; - - RestoreWorker(0); - - ASSERT_TRUE(PollTxFlag(1)) << "Timeout on slot 1 after restoring worker"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - const std::uint8_t expected0[] = {1, 2, 3, 4}; - const std::uint8_t expected1[] = {11, 12, 13, 14}; - VerifyResponse(0, expected0, 4); - VerifyResponse(1, expected1, 4); - - EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), 2u); - - StopLoop(); - EXPECT_EQ(stats_counter_, 2u); -} - -//============================================================================== -// Test 6: Stats counter accuracy (internal API) -// -// Sends 5 sequential RPCs through a single worker (recycling between each) -// and verifies stats_counter == 5 at the end. -//============================================================================== - -TEST_F(HostDispatcherLoopTest, StatsCounterAccuracy) { - cudaGraph_t graph = nullptr; - cudaGraphExec_t exec = nullptr; - ASSERT_TRUE(create_increment_graph(d_mailbox_bank_, &graph, &exec)); - AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, exec, graph); - - StartLoop(); - - // Sequential RPCs through slots 0,1,2,3,0 — the dispatcher advances - // current_slot after each dispatch, so each RPC must target the next slot. - // When wrapping back to slot 0 for the 5th RPC, clear its tx_flags first. - constexpr int kNumRpcs = 5; - for (int i = 0; i < kNumRpcs; ++i) { - std::size_t slot = static_cast(i % num_slots_); - if (i >= static_cast(num_slots_)) - ClearSlot(slot); - - std::uint8_t payload[] = {static_cast(i * 10), - static_cast(i * 10 + 1), - static_cast(i * 10 + 2), - static_cast(i * 10 + 3)}; - WriteRpcRequest(slot, RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); - SignalSlot(slot); - ASSERT_TRUE(PollTxFlag(slot)) - << "Timeout on RPC " << i << " (slot " << slot << ")"; - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - std::uint8_t expected[] = {static_cast(i * 10 + 1), - static_cast(i * 10 + 2), - static_cast(i * 10 + 3), - static_cast(i * 10 + 4)}; - VerifyResponse(slot, expected, 4); - - RestoreWorker(0); - } - - EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), - static_cast(kNumRpcs)); - - StopLoop(); - EXPECT_EQ(stats_counter_, static_cast(kNumRpcs)); -} - -//============================================================================== -// Test 7: Multi-slot round-robin dispatch (internal API) -// -// 4 slots, 4 workers (all same function_id). All slots signalled at once; -// the dispatcher processes them 0 → 1 → 2 → 3 using one worker each. -//============================================================================== - -TEST_F(HostDispatcherLoopTest, MultiSlotRoundRobin) { - constexpr int kNumSlots = 4; - cudaGraph_t graphs[kNumSlots]; - cudaGraphExec_t execs[kNumSlots]; - for (int i = 0; i < kNumSlots; ++i) { - ASSERT_TRUE( - create_increment_graph(d_mailbox_bank_ + i, &graphs[i], &execs[i])); - AddWorker(RPC_GRAPH_INCREMENT_FUNCTION_ID, execs[i], graphs[i]); - } - - StartLoop(); - - for (int i = 0; i < kNumSlots; ++i) { - std::uint8_t payload[] = {static_cast(i * 4 + 1), - static_cast(i * 4 + 2), - static_cast(i * 4 + 3), - static_cast(i * 4 + 4)}; - WriteRpcRequest(static_cast(i), - RPC_GRAPH_INCREMENT_FUNCTION_ID, payload, 4); - } - - for (int i = 0; i < kNumSlots; ++i) - SignalSlot(static_cast(i)); - - for (int i = 0; i < kNumSlots; ++i) { - ASSERT_TRUE(PollTxFlag(static_cast(i))) - << "Timeout on slot " << i; - } - ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); - - for (int i = 0; i < kNumSlots; ++i) { - std::uint8_t expected[] = {static_cast(i * 4 + 2), - static_cast(i * 4 + 3), - static_cast(i * 4 + 4), - static_cast(i * 4 + 5)}; - VerifyResponse(static_cast(i), expected, 4); - } - - EXPECT_EQ(live_dispatched_->load(cuda::std::memory_order_acquire), - static_cast(kNumSlots)); - - StopLoop(); - EXPECT_EQ(stats_counter_, static_cast(kNumSlots)); -} - -} // namespace diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt deleted file mode 100644 index d6811a1f..00000000 --- a/realtime/unittests/utils/CMakeLists.txt +++ /dev/null @@ -1,264 +0,0 @@ -# ============================================================================ # -# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # - -# Hololink bridge and playback tools -# ============================================================================== -# These targets are gated by CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS and require -# a pre-built hololink (holoscan-sensor-bridge) with DOCA support. -# They are NOT CI tests -- they need FPGA hardware or an FPGA emulator. - -if (NOT HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR) - message(FATAL_ERROR - "HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR must be set when building hololink tools.") -endif() -if (NOT HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR) - message(FATAL_ERROR - "HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR must be set when building hololink tools.") -endif() - -find_package(Threads REQUIRED) -find_package(CUDAToolkit REQUIRED) - -# --------------------------------------------------------------------------- # -# Find Hololink core library -# --------------------------------------------------------------------------- # - -find_library(HOLOLINK_CORE_LIB - NAMES hololink_core - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/core" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - -if (NOT HOLOLINK_CORE_LIB) - message(FATAL_ERROR - "Could not find hololink_core library under ${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}.") -endif() - -# --------------------------------------------------------------------------- # -# Find GPU RoCE Transceiver library -# --------------------------------------------------------------------------- # - -find_library(GPU_ROCE_TRANSCEIVER_LIB - NAMES gpu_roce_transceiver - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/gpu_roce_transceiver" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - -if (NOT GPU_ROCE_TRANSCEIVER_LIB) - message(WARNING - "Could not find gpu_roce_transceiver library. " - "hololink_bridge will not be built.") -endif() - -# --------------------------------------------------------------------------- # -# Find transitive Hololink libraries -# --------------------------------------------------------------------------- # - -find_library(HOLOLINK_COMMON_LIB - NAMES hololink - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/common" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - -find_library(ROCE_RECEIVER_LIB - NAMES roce_receiver - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - -find_library(BASE_RECEIVER_OP_LIB - NAMES base_receiver_op - PATHS - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators" - "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" - NO_DEFAULT_PATH) - -find_library(IBVERBS_LIB NAMES ibverbs) - -# --------------------------------------------------------------------------- # -# Find DOCA libraries -# --------------------------------------------------------------------------- # - -set(DOCA_PATH "/opt/mellanox/doca") - -if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)") - set(DOCA_LIB_DIR "${DOCA_PATH}/lib/x86_64-linux-gnu") -elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") - set(DOCA_LIB_DIR "${DOCA_PATH}/lib/aarch64-linux-gnu") -else() - set(DOCA_LIB_DIR "${DOCA_PATH}/lib") -endif() - -find_path(DOCA_INCLUDE_DIR doca_verbs.h - PATHS ${DOCA_PATH}/include - NO_DEFAULT_PATH) - -find_library(DOCA_VERBS_LIB doca_verbs - PATHS ${DOCA_LIB_DIR} - NO_DEFAULT_PATH) - -find_library(DOCA_GPUNETIO_LIB doca_gpunetio - PATHS ${DOCA_LIB_DIR} - NO_DEFAULT_PATH) - -find_library(DOCA_COMMON_LIB doca_common - PATHS ${DOCA_LIB_DIR} - NO_DEFAULT_PATH) - -# --------------------------------------------------------------------------- # -# Find Holoscan (required by gpu_roce_transceiver -> holoscan::core) -# --------------------------------------------------------------------------- # - -find_package(holoscan QUIET) - -# --------------------------------------------------------------------------- # -# Find fmt (transitive dependency of hololink logging) -# --------------------------------------------------------------------------- # - -find_path(FMT_INCLUDE_DIR - NAMES fmt/format.h - PATHS /opt/nvidia/holoscan /usr/local/cudaq /usr /usr/local - PATH_SUFFIXES include - NO_DEFAULT_PATH) - -# =========================================================================== # -# hololink_fpga_playback (no GPU / DOCA dependency) -# =========================================================================== # - -add_executable(hololink_fpga_playback - hololink_fpga_playback.cpp) - -target_include_directories(hololink_fpga_playback - PRIVATE ${CUDAQ_REALTIME_INCLUDE_DIR}) - -target_link_libraries(hololink_fpga_playback - PRIVATE Threads::Threads) - -# =========================================================================== # -# hololink_bridge (generic increment bridge) -# =========================================================================== # - -if (GPU_ROCE_TRANSCEIVER_LIB AND - DOCA_INCLUDE_DIR AND DOCA_VERBS_LIB AND DOCA_COMMON_LIB AND - DOCA_GPUNETIO_LIB) - - message(STATUS "Building hololink_bridge (generic increment)") - message(STATUS " GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}") - - # Hololink wrapper static library (compiled by g++, isolates fmt) - add_library(hololink_wrapper_generic STATIC - hololink_wrapper.cpp) - - target_include_directories(hololink_wrapper_generic - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src" - ${DOCA_INCLUDE_DIR} - ${CUDAToolkit_INCLUDE_DIRS} - ${FMT_INCLUDE_DIR}) - - target_link_libraries(hololink_wrapper_generic - PRIVATE ${GPU_ROCE_TRANSCEIVER_LIB}) - - target_compile_options(hololink_wrapper_generic PRIVATE -Wno-deprecated-declarations) - - # Increment function table (compiled by nvcc) - add_library(rpc_increment_ft STATIC - init_rpc_increment_function_table.cu) - - set_target_properties(rpc_increment_ft PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17) - - target_include_directories(rpc_increment_ft PRIVATE - ${CUDAQ_REALTIME_INCLUDE_DIR} - ${CUDAToolkit_INCLUDE_DIRS}) - - # Bridge executable (.cpp, linked with CUDA) - add_executable(hololink_bridge - hololink_bridge.cpp) - - set_target_properties(hololink_bridge PROPERTIES - LINKER_LANGUAGE CUDA - CUDA_SEPARABLE_COMPILATION ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON) - - target_include_directories(hololink_bridge - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} - ${CUDAQ_REALTIME_INCLUDE_DIR} - ${CUDAToolkit_INCLUDE_DIRS}) - - # Link order: static archives first, then shared - target_link_libraries(hololink_bridge - PRIVATE - rpc_increment_ft - cudaq-realtime-dispatch - hololink_wrapper_generic - ${GPU_ROCE_TRANSCEIVER_LIB} - ${ROCE_RECEIVER_LIB} - ${BASE_RECEIVER_OP_LIB} - ${HOLOLINK_CORE_LIB} - ${HOLOLINK_COMMON_LIB} - cudaq-realtime - CUDA::cudart - CUDA::cuda_driver - ${DOCA_VERBS_LIB} - ${DOCA_GPUNETIO_LIB} - ${DOCA_COMMON_LIB} - ${IBVERBS_LIB} - Threads::Threads - ${CMAKE_DL_LIBS}) - - if (holoscan_FOUND) - target_link_libraries(hololink_bridge PRIVATE holoscan::core) - target_link_libraries(hololink_wrapper_generic PRIVATE holoscan::core) - endif() - - # Set RPATH for shared libraries - set_target_properties(hololink_bridge PROPERTIES - BUILD_RPATH "${DOCA_LIB_DIR}" - INSTALL_RPATH "${DOCA_LIB_DIR}") - -else() - if (NOT GPU_ROCE_TRANSCEIVER_LIB) - message(WARNING "gpu_roce_transceiver library not found. " - "hololink_bridge will not be built.") - endif() - if (NOT DOCA_INCLUDE_DIR OR NOT DOCA_VERBS_LIB) - message(WARNING "DOCA libraries not found. " - "hololink_bridge requires DOCA.") - endif() -endif() - -# =========================================================================== # -# hololink_fpga_emulator (software FPGA, libibverbs only) -# =========================================================================== # - -if (IBVERBS_LIB) - message(STATUS "Building hololink_fpga_emulator") - - add_executable(hololink_fpga_emulator - hololink_fpga_emulator.cpp) - - target_link_libraries(hololink_fpga_emulator - PRIVATE - ${IBVERBS_LIB} - Threads::Threads) -else() - message(WARNING "libibverbs not found. hololink_fpga_emulator will not be built.") -endif() diff --git a/realtime/unittests/utils/hololink_bridge.cpp b/realtime/unittests/utils/hololink_bridge.cpp deleted file mode 100644 index 0f10caa9..00000000 --- a/realtime/unittests/utils/hololink_bridge.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file hololink_bridge.cpp -/// @brief Generic Hololink bridge tool for testing libcudaq-realtime dispatch. -/// -/// Registers a simple increment RPC handler (adds 1 to each byte) and wires -/// it through the Hololink GPU-RoCE Transceiver. No QEC or decoder dependency. -/// -/// Usage: -/// ./hololink_bridge \ -/// --device=rocep1s0f0 \ -/// --peer-ip=10.0.0.2 \ -/// --remote-qp=0x2 \ -/// --gpu=0 \ -/// --timeout=60 - -#include -#include -#include -#include - -#include - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/hololink_bridge_common.h" - -//============================================================================== -// Increment RPC Handler Function Table -//============================================================================== - -// The actual __device__ rpc_increment_handler lives in -// init_rpc_increment_function_table.cu (compiled by nvcc). We declare the -// host-callable setup function here so this .cpp can be compiled by g++. - -extern "C" void -setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries); - -//============================================================================== -// Main -//============================================================================== - -int main(int argc, char *argv[]) { - // Check for help - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - if (arg == "--help" || arg == "-h") { - std::cout - << "Usage: " << argv[0] << " [options]\n" - << "\n" - << "Generic Hololink bridge for testing libcudaq-realtime dispatch.\n" - << "Registers increment handler (adds 1 to each byte of the RPC " - "payload).\n" - << "\n" - << "Options:\n" - << " --device=NAME IB device (default: rocep1s0f0)\n" - << " --peer-ip=ADDR FPGA/emulator IP (default: 10.0.0.2)\n" - << " --remote-qp=N Remote QP number (default: 0x2)\n" - << " --gpu=N GPU device ID (default: 0)\n" - << " --timeout=N Timeout in seconds (default: 60)\n" - << " --page-size=N Ring buffer slot size (default: 384)\n" - << " --num-pages=N Number of ring buffer slots (default: " - "64)\n" - << " --exchange-qp Enable QP exchange protocol\n" - << " --exchange-port=N TCP port for QP exchange (default: " - "12345)\n"; - return 0; - } - } - - try { - std::cout << "=== Hololink Generic Bridge ===" << std::endl; - - // Parse common bridge args - cudaq::realtime::BridgeConfig config; - cudaq::realtime::parse_bridge_args(argc, argv, config); - - // Frame size: RPCHeader + 256 bytes payload - config.frame_size = sizeof(cudaq::realtime::RPCHeader) + 256; - - std::cout << "Device: " << config.device << std::endl; - std::cout << "Peer IP: " << config.peer_ip << std::endl; - std::cout << "Remote QP: 0x" << std::hex << config.remote_qp << std::dec - << std::endl; - std::cout << "GPU: " << config.gpu_id << std::endl; - - // Initialize CUDA early to allocate function table - cudaError_t err = cudaSetDevice(config.gpu_id); - if (err != cudaSuccess) { - std::cerr << "ERROR: cudaSetDevice failed: " << cudaGetErrorString(err) - << std::endl; - return 1; - } - - // Set up increment RPC function table on GPU - cudaq_function_entry_t *d_function_entries = nullptr; - err = cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t)); - if (err != cudaSuccess) { - std::cerr << "ERROR: cudaMalloc failed: " << cudaGetErrorString(err) - << std::endl; - return 1; - } - setup_rpc_increment_function_table(d_function_entries); - - config.d_function_entries = d_function_entries; - config.func_count = 1; - config.launch_fn = &cudaq::realtime::bridge_launch_dispatch_kernel; - config.cleanup_fn = [d_function_entries]() { - cudaFree(d_function_entries); - }; - - return cudaq::realtime::bridge_run(config); - - } catch (const std::exception &e) { - std::cerr << "ERROR: " << e.what() << std::endl; - return 1; - } -} diff --git a/realtime/unittests/utils/hololink_fpga_emulator.cpp b/realtime/unittests/utils/hololink_fpga_emulator.cpp deleted file mode 100644 index 284fff87..00000000 --- a/realtime/unittests/utils/hololink_fpga_emulator.cpp +++ /dev/null @@ -1,1210 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file hololink_fpga_emulator.cpp -/// @brief Software FPGA emulator for Hololink RPC testing. -/// -/// Emulates the FPGA's role in the RPC pipeline: -/// 1. Hololink UDP control plane server (register read/write) -/// 2. Playback BRAM (receives payloads from playback tool) -/// 3. RDMA transmit (sends RPC requests to bridge) -/// 4. RDMA receive (receives RPC responses from bridge) -/// 5. ILA capture RAM (stores responses for verification readback) -/// -/// Three-tool workflow: -/// 1. Start this emulator (prints QP number) -/// 2. Start hololink_mock_decoder_bridge with --remote-qp= -/// 3. Start hololink_fpga_syndrome_playback --control-port= -/// with bridge's QP/RKEY/buffer-addr -/// -/// The playback tool drives the emulator via UDP just as it would a real FPGA. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -//============================================================================== -// Global shutdown flag -//============================================================================== - -static std::atomic g_shutdown{false}; -static void signal_handler(int) { g_shutdown = true; } - -//============================================================================== -// Hololink Protocol Constants -//============================================================================== - -static constexpr uint8_t WR_DWORD = 0x04; -static constexpr uint8_t WR_BLOCK = 0x09; -static constexpr uint8_t RD_DWORD = 0x14; -static constexpr uint8_t RD_BLOCK = 0x19; - -static constexpr uint8_t REQUEST_FLAGS_ACK_REQUEST = 0x01; -static constexpr uint8_t RESPONSE_SUCCESS = 0x00; - -// VP register offsets (relative to vp_address) -static constexpr uint32_t DP_QP = 0x00; -static constexpr uint32_t DP_RKEY = 0x04; -static constexpr uint32_t DP_PAGE_LSB = 0x08; -static constexpr uint32_t DP_PAGE_MSB = 0x0C; -static constexpr uint32_t DP_PAGE_INC = 0x10; -static constexpr uint32_t DP_MAX_BUFF = 0x14; -static constexpr uint32_t DP_BUFFER_LENGTH = 0x18; - -// HIF register offsets (relative to hif_address) -static constexpr uint32_t DP_VP_MASK = 0x0C; - -// Player registers -static constexpr uint32_t PLAYER_BASE = 0x50000000; -static constexpr uint32_t PLAYER_ENABLE = PLAYER_BASE + 0x04; -static constexpr uint32_t PLAYER_TIMER = PLAYER_BASE + 0x08; -static constexpr uint32_t PLAYER_WIN_SIZE = PLAYER_BASE + 0x0C; -static constexpr uint32_t PLAYER_WIN_NUM = PLAYER_BASE + 0x10; - -// Playback BRAM -static constexpr uint32_t RAM_BASE = 0x50100000; -static constexpr int BRAM_NUM_BANKS = 16; -static constexpr int BRAM_W_SAMPLE_ADDR = 9; // log2(512 entries) -static constexpr int BRAM_BANK_STRIDE = 1 << (BRAM_W_SAMPLE_ADDR + 2); // 2048 - -// ILA capture -static constexpr uint32_t ILA_BASE = 0x40000000; -static constexpr uint32_t ILA_CTRL = ILA_BASE + 0x00; -static constexpr uint32_t ILA_STATUS = ILA_BASE + 0x80; -static constexpr uint32_t ILA_SAMPLE_ADDR = ILA_BASE + 0x84; -static constexpr uint32_t ILA_DATA_BASE = 0x40100000; -static constexpr int ILA_NUM_BANKS = 17; -static constexpr int ILA_W_ADDR = 13; // log2(8192 entries) -static constexpr int ILA_BANK_STRIDE = 1 << (ILA_W_ADDR + 2); // 32768 - -// Ring buffer -static constexpr int NUM_BUFFERS = 64; - -//============================================================================== -// RDMA Context (adapted from cuda-qx rdma_utils.hpp) -//============================================================================== - -class RdmaContext { -public: - ~RdmaContext() { cleanup(); } - - bool open(const std::string &device_name, int port = 1) { - int num_devices; - ibv_device **devices = ibv_get_device_list(&num_devices); - if (!devices || num_devices == 0) - return false; - - ibv_device *target = nullptr; - for (int i = 0; i < num_devices; i++) { - if (device_name == ibv_get_device_name(devices[i])) { - target = devices[i]; - break; - } - } - if (!target) { - ibv_free_device_list(devices); - return false; - } - - ctx_ = ibv_open_device(target); - ibv_free_device_list(devices); - if (!ctx_) - return false; - - port_ = port; - pd_ = ibv_alloc_pd(ctx_); - if (!pd_) { - cleanup(); - return false; - } - - if (ibv_query_port(ctx_, port_, &port_attr_) != 0) { - cleanup(); - return false; - } - - gid_index_ = find_roce_v2_gid_index(); - return true; - } - - ibv_cq *create_cq(int size) { - return ibv_create_cq(ctx_, size, nullptr, nullptr, 0); - } - - ibv_mr *register_memory(void *addr, size_t size, - int access = IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE) { - return ibv_reg_mr(pd_, addr, size, access); - } - - ibv_qp *create_qp(ibv_cq *send_cq, ibv_cq *recv_cq, uint32_t max_send_wr = 64, - uint32_t max_recv_wr = 64) { - ibv_qp_init_attr init_attr{}; - init_attr.qp_type = IBV_QPT_UC; // Unreliable Connected - matches FPGA - init_attr.send_cq = send_cq; - init_attr.recv_cq = recv_cq; - init_attr.cap.max_send_wr = max_send_wr; - init_attr.cap.max_recv_wr = max_recv_wr; - init_attr.cap.max_send_sge = 1; - init_attr.cap.max_recv_sge = 1; - return ibv_create_qp(pd_, &init_attr); - } - - bool qp_to_init(ibv_qp *qp) { - ibv_qp_attr attr{}; - attr.qp_state = IBV_QPS_INIT; - attr.port_num = port_; - attr.pkey_index = 0; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; - return ibv_modify_qp(qp, &attr, - IBV_QP_STATE | IBV_QP_PORT | IBV_QP_PKEY_INDEX | - IBV_QP_ACCESS_FLAGS) == 0; - } - - bool qp_to_rtr(ibv_qp *qp, const ibv_gid &remote_gid, uint32_t remote_qp_num, - uint32_t psn = 0) { - ibv_qp_attr attr{}; - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = port_attr_.active_mtu; - attr.dest_qp_num = remote_qp_num; - attr.rq_psn = psn; - attr.ah_attr.is_global = 1; - attr.ah_attr.grh.dgid = remote_gid; - attr.ah_attr.grh.sgid_index = gid_index_; - attr.ah_attr.grh.hop_limit = 64; - attr.ah_attr.grh.traffic_class = 0; - attr.ah_attr.dlid = 0; - attr.ah_attr.sl = 0; - attr.ah_attr.src_path_bits = 0; - attr.ah_attr.port_num = port_; - return ibv_modify_qp(qp, &attr, - IBV_QP_STATE | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | IBV_QP_AV) == 0; - } - - bool qp_to_rts(ibv_qp *qp, uint32_t psn = 0) { - ibv_qp_attr attr{}; - attr.qp_state = IBV_QPS_RTS; - attr.sq_psn = psn; - return ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN) == 0; - } - - bool post_recv(ibv_qp *qp, uint64_t wr_id, void *addr, uint32_t length, - uint32_t lkey) { - ibv_sge sge{}; - sge.addr = reinterpret_cast(addr); - sge.length = length; - sge.lkey = lkey; - - ibv_recv_wr wr{}; - wr.wr_id = wr_id; - wr.sg_list = &sge; - wr.num_sge = 1; - wr.next = nullptr; - - ibv_recv_wr *bad_wr = nullptr; - return ibv_post_recv(qp, &wr, &bad_wr) == 0; - } - - bool post_rdma_write_imm(ibv_qp *qp, uint64_t wr_id, void *local_addr, - uint32_t length, uint32_t lkey, uint64_t remote_addr, - uint32_t rkey, uint32_t imm_data) { - ibv_sge sge{}; - sge.addr = reinterpret_cast(local_addr); - sge.length = length; - sge.lkey = lkey; - - ibv_send_wr wr{}; - wr.wr_id = wr_id; - wr.sg_list = &sge; - wr.num_sge = 1; - wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - wr.send_flags = IBV_SEND_SIGNALED; - wr.imm_data = htonl(imm_data); - wr.wr.rdma.remote_addr = remote_addr; - wr.wr.rdma.rkey = rkey; - wr.next = nullptr; - - ibv_send_wr *bad_wr = nullptr; - return ibv_post_send(qp, &wr, &bad_wr) == 0; - } - - int poll_cq(ibv_cq *cq, ibv_wc *wc, int max_wc = 1) { - return ibv_poll_cq(cq, max_wc, wc); - } - - int get_gid_index() const { return gid_index_; } - -private: - void cleanup() { - if (pd_) { - ibv_dealloc_pd(pd_); - pd_ = nullptr; - } - if (ctx_) { - ibv_close_device(ctx_); - ctx_ = nullptr; - } - } - - int find_roce_v2_gid_index() { - int best_gid = -1; - for (int i = 0; i < port_attr_.gid_tbl_len; i++) { - ibv_gid gid; - if (ibv_query_gid(ctx_, port_, i, &gid) == 0) { - if (gid.raw[10] == 0xff && gid.raw[11] == 0xff) { - best_gid = i; // Last match = RoCE v2 - } - } - } - return (best_gid >= 0) ? best_gid : 0; - } - - ibv_context *ctx_ = nullptr; - ibv_pd *pd_ = nullptr; - ibv_port_attr port_attr_{}; - int port_ = 1; - int gid_index_ = 0; -}; - -//============================================================================== -// RDMA Buffer -//============================================================================== - -class RdmaBuffer { -public: - ~RdmaBuffer() { release(); } - - bool allocate(RdmaContext &ctx, size_t size) { - size_t page_size = 4096; - size_t aligned = ((size + page_size - 1) / page_size) * page_size; - data_ = aligned_alloc(page_size, aligned); - if (!data_) - return false; - size_ = size; - memset(data_, 0, aligned); - mr_ = ctx.register_memory(data_, aligned, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - if (!mr_) { - ::free(data_); - data_ = nullptr; - return false; - } - return true; - } - - void release() { - if (mr_) { - ibv_dereg_mr(mr_); - mr_ = nullptr; - } - if (data_) { - ::free(data_); - data_ = nullptr; - } - } - - void *data() const { return data_; } - size_t size() const { return size_; } - uint32_t lkey() const { return mr_ ? mr_->lkey : 0; } - uint32_t rkey() const { return mr_ ? mr_->rkey : 0; } - -private: - void *data_ = nullptr; - size_t size_ = 0; - ibv_mr *mr_ = nullptr; -}; - -//============================================================================== -// Emulated Register File -//============================================================================== - -class RegisterFile { -public: - void write(uint32_t addr, uint32_t value) { - std::lock_guard lock(mu_); - regs_[addr] = value; - } - - uint32_t read(uint32_t addr) const { - std::lock_guard lock(mu_); - auto it = regs_.find(addr); - return (it != regs_.end()) ? it->second : 0; - } - - /// Batch write (for BRAM loading efficiency). - void write_batch(const std::vector> &writes) { - std::lock_guard lock(mu_); - for (auto &[addr, val] : writes) { - regs_[addr] = val; - } - } - - /// Read a range of contiguous 32-bit registers. - std::vector read_range(uint32_t base_addr, uint32_t count) const { - std::lock_guard lock(mu_); - std::vector result(count); - for (uint32_t i = 0; i < count; i++) { - auto it = regs_.find(base_addr + i * 4); - result[i] = (it != regs_.end()) ? it->second : 0; - } - return result; - } - -private: - mutable std::mutex mu_; - std::unordered_map regs_; -}; - -//============================================================================== -// RDMA Target Config (decoded from VP register writes) -//============================================================================== - -struct RdmaTargetConfig { - uint32_t qp_number = 0; - uint32_t rkey = 0; - uint64_t buffer_addr = 0; - uint32_t page_inc = 0; // bytes - uint32_t max_buff = 0; // max buffer index - uint32_t buffer_length = 0; - - // Temporary storage for two-part address - uint32_t page_lsb = 0; - uint32_t page_msb = 0; - - // Track whether key fields were explicitly set (buffer_addr=0 is valid - // when Hololink uses IOVA with dmabuf). - bool qp_set = false; - bool rkey_set = false; - - void update_addr() { - // Hololink encodes: PAGE_LSB = addr >> 7, PAGE_MSB = addr >> 32 - // Reconstruct: addr = (MSB << 32) | (LSB << 7) - buffer_addr = (static_cast(page_msb) << 32) | - (static_cast(page_lsb) << 7); - } - - bool is_complete() const { - // buffer_addr=0 is valid (Hololink IOVA/dmabuf), so we only check - // that QP and RKEY were explicitly set. - return qp_set && rkey_set; - } - - void print() const { - std::cout << " RDMA Target Config:" << std::endl; - std::cout << " QP: 0x" << std::hex << qp_number << std::dec << std::endl; - std::cout << " RKEY: 0x" << std::hex << rkey << std::dec << std::endl; - std::cout << " Buffer addr: 0x" << std::hex << buffer_addr << std::dec - << std::endl; - std::cout << " Page inc: " << page_inc << " bytes" << std::endl; - std::cout << " Max buff: " << max_buff << std::endl; - } -}; - -//============================================================================== -// UDP Control Plane Server -//============================================================================== - -class ControlPlaneServer { -public: - ControlPlaneServer(uint16_t port, uint32_t vp_address, uint32_t hif_address, - RegisterFile ®s) - : port_(port), vp_addr_(vp_address), hif_addr_(hif_address), regs_(regs) { - } - - ~ControlPlaneServer() { stop(); } - - void set_my_qp(uint32_t qp) { my_qp_ = qp; } - - bool start() { - fd_ = socket(AF_INET, SOCK_DGRAM, 0); - if (fd_ < 0) - return false; - - int opt = 1; - setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); - - sockaddr_in addr{}; - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = INADDR_ANY; - addr.sin_port = htons(port_); - if (bind(fd_, reinterpret_cast(&addr), sizeof(addr)) < 0) { - ::close(fd_); - fd_ = -1; - return false; - } - - running_ = true; - thread_ = std::thread(&ControlPlaneServer::run, this); - return true; - } - - void stop() { - running_ = false; - if (fd_ >= 0) { - shutdown(fd_, SHUT_RDWR); - ::close(fd_); - fd_ = -1; - } - if (thread_.joinable()) - thread_.join(); - } - - /// Block until RDMA config is complete or timeout. - bool wait_for_config(int timeout_ms = 60000) { - auto start = std::chrono::steady_clock::now(); - while (!target_.is_complete() && !g_shutdown) { - auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - start) - .count(); - if (elapsed >= timeout_ms) - return false; - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - return target_.is_complete(); - } - - const RdmaTargetConfig &target() const { return target_; } - - /// Check if player_enable was set to 1. - bool playback_triggered() const { return playback_triggered_.load(); } - void clear_playback_trigger() { playback_triggered_ = false; } - - /// Get player config. - uint32_t window_size() const { return regs_.read(PLAYER_WIN_SIZE); } - uint32_t window_number() const { return regs_.read(PLAYER_WIN_NUM); } - uint32_t timer_spacing() const { return regs_.read(PLAYER_TIMER); } - -private: - void run() { - std::vector buf(4096); - while (running_ && !g_shutdown) { - fd_set fds; - FD_ZERO(&fds); - FD_SET(fd_, &fds); - timeval tv{0, 100000}; // 100ms - - int ready = select(fd_ + 1, &fds, nullptr, nullptr, &tv); - if (ready <= 0) - continue; - - sockaddr_in client{}; - socklen_t clen = sizeof(client); - ssize_t len = recvfrom(fd_, buf.data(), buf.size(), 0, - reinterpret_cast(&client), &clen); - if (len < 6) - continue; - - handle_packet(buf.data(), static_cast(len), client); - } - } - - // --- Packet helpers --- - - static uint32_t read_be32(const uint8_t *p) { - return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | - (uint32_t(p[2]) << 8) | p[3]; - } - - static uint16_t read_be16(const uint8_t *p) { - return (uint16_t(p[0]) << 8) | p[1]; - } - - static void write_be32(uint8_t *p, uint32_t v) { - p[0] = (v >> 24) & 0xFF; - p[1] = (v >> 16) & 0xFF; - p[2] = (v >> 8) & 0xFF; - p[3] = v & 0xFF; - } - - static void write_be16(uint8_t *p, uint16_t v) { - p[0] = (v >> 8) & 0xFF; - p[1] = v & 0xFF; - } - - // --- Handle incoming packet --- - - void handle_packet(const uint8_t *data, size_t len, - const sockaddr_in &client) { - uint8_t opcode = data[0]; - uint8_t flags = data[1]; - uint16_t seq = read_be16(data + 2); - - switch (opcode) { - case WR_DWORD: - if (len >= 14) - handle_wr_dword(data, flags, seq, client); - break; - case WR_BLOCK: - handle_wr_block(data, len, flags, seq, client); - break; - case RD_DWORD: - if (len >= 10) - handle_rd_dword(data, flags, seq, client); - break; - case RD_BLOCK: - handle_rd_block(data, len, flags, seq, client); - break; - default: - // Unknown opcode - send error ACK - if (flags & REQUEST_FLAGS_ACK_REQUEST) - send_write_ack(client, opcode, flags, seq); - break; - } - } - - void handle_wr_dword(const uint8_t *data, uint8_t flags, uint16_t seq, - const sockaddr_in &client) { - uint32_t addr = read_be32(data + 6); - uint32_t val = read_be32(data + 10); - process_register_write(addr, val); - if (flags & REQUEST_FLAGS_ACK_REQUEST) - send_write_ack(client, WR_DWORD, flags, seq); - } - - void handle_wr_block(const uint8_t *data, size_t len, uint8_t flags, - uint16_t seq, const sockaddr_in &client) { - // Pairs start at offset 6, each pair is 8 bytes - size_t offset = 6; - std::vector> batch; - while (offset + 8 <= len) { - uint32_t addr = read_be32(data + offset); - uint32_t val = read_be32(data + offset + 4); - batch.push_back({addr, val}); - offset += 8; - } - - // Batch write to register file - regs_.write_batch(batch); - - // Process VP register updates - for (auto &[addr, val] : batch) { - process_vp_update(addr, val); - check_player_enable(addr, val); - } - - if (flags & REQUEST_FLAGS_ACK_REQUEST) - send_write_ack(client, WR_BLOCK, flags, seq); - } - - void handle_rd_dword(const uint8_t *data, uint8_t flags, uint16_t seq, - const sockaddr_in &client) { - uint32_t addr = read_be32(data + 6); - uint32_t val = regs_.read(addr); - - // Response: cmd(1) + flags(1) + seq(2) + response_code(1) + reserved(1) + - // addr(4) + value(4) + latched_seq(2) = 16 bytes - uint8_t resp[16]; - resp[0] = RD_DWORD; - resp[1] = flags; - write_be16(resp + 2, seq); - resp[4] = RESPONSE_SUCCESS; - resp[5] = 0; // reserved - write_be32(resp + 6, addr); - write_be32(resp + 10, val); - write_be16(resp + 14, seq); // latched sequence - - sendto(fd_, resp, sizeof(resp), 0, - reinterpret_cast(&client), sizeof(client)); - } - - void handle_rd_block(const uint8_t *data, size_t len, uint8_t flags, - uint16_t seq, const sockaddr_in &client) { - // Parse addresses from request - std::vector addrs; - size_t offset = 6; - while (offset + 8 <= len) { - addrs.push_back(read_be32(data + offset)); - offset += 8; - } - - // Build response: cmd(1) + flags(1) + seq(2) + rc(1) + reserved(1) + - // N*(addr(4)+value(4)) + latched_seq(2) - size_t resp_len = 6 + addrs.size() * 8 + 2; - std::vector resp(resp_len); - resp[0] = RD_BLOCK; - resp[1] = flags; - write_be16(resp.data() + 2, seq); - resp[4] = RESPONSE_SUCCESS; - resp[5] = 0; - - size_t roff = 6; - for (uint32_t a : addrs) { - uint32_t val = regs_.read(a); - write_be32(resp.data() + roff, a); - write_be32(resp.data() + roff + 4, val); - roff += 8; - } - write_be16(resp.data() + roff, seq); // latched sequence - - sendto(fd_, resp.data(), resp.size(), 0, - reinterpret_cast(&client), sizeof(client)); - } - - // --- Write ACK for WR_DWORD / WR_BLOCK --- - - void send_write_ack(const sockaddr_in &client, uint8_t cmd, uint8_t flags, - uint16_t seq) { - uint8_t resp[5]; - resp[0] = cmd; - resp[1] = flags; - write_be16(resp + 2, seq); - resp[4] = RESPONSE_SUCCESS; - sendto(fd_, resp, sizeof(resp), 0, - reinterpret_cast(&client), sizeof(client)); - } - - // --- Register write processing --- - - void process_register_write(uint32_t addr, uint32_t val) { - regs_.write(addr, val); - process_vp_update(addr, val); - check_player_enable(addr, val); - } - - void process_vp_update(uint32_t addr, uint32_t val) { - // Check if this is a VP register (relative to vp_addr_) - if (addr < vp_addr_ || addr >= vp_addr_ + 0x100) - return; - - uint32_t offset = addr - vp_addr_; - switch (offset) { - case DP_QP: - target_.qp_number = val; - target_.qp_set = true; - break; - case DP_RKEY: - target_.rkey = val; - target_.rkey_set = true; - break; - case DP_PAGE_LSB: - target_.page_lsb = val; - target_.update_addr(); - break; - case DP_PAGE_MSB: - target_.page_msb = val; - target_.update_addr(); - break; - case DP_PAGE_INC: - target_.page_inc = val << 7; // PAGES encoding: value * 128 - break; - case DP_MAX_BUFF: - target_.max_buff = val; - break; - case DP_BUFFER_LENGTH: - target_.buffer_length = val; - break; - } - } - - void check_player_enable(uint32_t addr, uint32_t val) { - if (addr == PLAYER_ENABLE && val == 1) { - playback_triggered_ = true; - } - } - - uint16_t port_; - uint32_t vp_addr_; - uint32_t hif_addr_; - RegisterFile ®s_; - int fd_ = -1; - std::atomic running_{false}; - std::thread thread_; - uint32_t my_qp_ = 0; - RdmaTargetConfig target_; - std::atomic playback_triggered_{false}; -}; - -//============================================================================== -// BRAM Reassembly -//============================================================================== - -/// Reassemble one window from the 16-bank BRAM layout. -/// Each 64-byte beat is spread across 16 banks (4 bytes each). -/// @param regs Register file to read from -/// @param window_index Window number -/// @param cycles_per_window Number of 64-byte beats per window -/// @return Reassembled window payload -static std::vector reassemble_window(const RegisterFile ®s, - uint32_t window_index, - uint32_t cycles_per_window) { - std::vector payload(cycles_per_window * 64, 0); - for (uint32_t cycle = 0; cycle < cycles_per_window; cycle++) { - uint32_t sample_index = window_index * cycles_per_window + cycle; - for (int bank = 0; bank < BRAM_NUM_BANKS; bank++) { - uint32_t addr = - RAM_BASE + (bank << (BRAM_W_SAMPLE_ADDR + 2)) + (sample_index * 4); - uint32_t val = regs.read(addr); - // Store as little-endian (matching FPGA BRAM word order) - size_t byte_offset = cycle * 64 + bank * 4; - memcpy(&payload[byte_offset], &val, 4); - } - } - return payload; -} - -//============================================================================== -// ILA Capture Storage -//============================================================================== - -/// Store a correction response into the ILA capture register file. -/// The ILA stores each sample across 17 banks of 32-bit words. -/// Banks 0-15 = 512-bit AXI data bus (raw correction bytes). -/// Bank 16 = control signals: -/// bit 0 = tvalid (bit 512 of the captured word) -/// bit 1 = tlast (bit 513) -/// bits [8:2] = wr_tcnt (bits 520:514, 7-bit write transaction count) -static void store_ila_sample(RegisterFile ®s, uint32_t sample_index, - const uint8_t *data, size_t data_len) { - // Spread the data across banks 0-15 (the 512-bit AXI data bus). - for (int bank = 0; bank < ILA_NUM_BANKS - 1; bank++) { - uint32_t addr = - ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (sample_index * 4); - uint32_t val = 0; - size_t byte_offset = bank * 4; - if (byte_offset < data_len) { - size_t copy_len = std::min(4, data_len - byte_offset); - memcpy(&val, data + byte_offset, copy_len); - } - regs.write(addr, val); - } - - // Bank 16: set control signals (tvalid=1, tlast=1, wr_tcnt=1) - { - uint32_t ctrl_addr = ILA_DATA_BASE + - ((ILA_NUM_BANKS - 1) << (ILA_W_ADDR + 2)) + - (sample_index * 4); - uint32_t ctrl_val = 0; - ctrl_val |= (1u << 0); // tvalid (bit 512) - ctrl_val |= (1u << 1); // tlast (bit 513) - ctrl_val |= (1u << 2); // wr_tcnt = 1 (bits 514+, value 1 in 7-bit field) - regs.write(ctrl_addr, ctrl_val); - } - - // Update sample count - regs.write(ILA_SAMPLE_ADDR, sample_index + 1); -} - -//============================================================================== -// Command-Line Arguments -//============================================================================== - -struct EmulatorArgs { - std::string device = "rocep1s0f0"; - int ib_port = 1; - uint16_t control_port = 8193; - std::string bridge_ip = ""; // Bridge IP (for GID, auto-detect if empty) - uint32_t vp_address = 0x1000; - uint32_t hif_address = 0x0800; - size_t page_size = 256; // Default slot size for responses RX -}; - -static void print_usage(const char *prog) { - std::cout - << "Usage: " << prog << " [options]\n" - << "\nFPGA emulator for QEC decode loop testing.\n" - << "\nOptions:\n" - << " --device=NAME IB device name (default: rocep1s0f0)\n" - << " --ib-port=N IB port number (default: 1)\n" - << " --port=N UDP control plane port (default: 8193)\n" - << " --bridge-ip=ADDR Bridge tool IP for GID (default: auto)\n" - << " --vp-address=ADDR VP register base (default: 0x1000)\n" - << " --hif-address=ADDR HIF register base (default: 0x0800)\n" - << " --page-size=N Slot size for correction RX (default: 256)\n" - << " --help Show this help\n"; -} - -static EmulatorArgs parse_args(int argc, char *argv[]) { - EmulatorArgs args; - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - if (arg.find("--device=") == 0) - args.device = arg.substr(9); - else if (arg.find("--ib-port=") == 0) - args.ib_port = std::stoi(arg.substr(10)); - else if (arg.find("--port=") == 0) - args.control_port = std::stoi(arg.substr(7)); - else if (arg.find("--bridge-ip=") == 0) - args.bridge_ip = arg.substr(12); - else if (arg.find("--vp-address=") == 0) - args.vp_address = std::stoul(arg.substr(13), nullptr, 0); - else if (arg.find("--hif-address=") == 0) - args.hif_address = std::stoul(arg.substr(14), nullptr, 0); - else if (arg.find("--page-size=") == 0) - args.page_size = std::stoull(arg.substr(12)); - else if (arg == "--help" || arg == "-h") { - print_usage(argv[0]); - exit(0); - } - } - return args; -} - -//============================================================================== -// MAIN -//============================================================================== - -int main(int argc, char *argv[]) { - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - - try { - auto args = parse_args(argc, argv); - - std::cout << "=== Hololink FPGA Emulator ===" << std::endl; - std::cout << "IB Device: " << args.device << std::endl; - std::cout << "Control port: " << args.control_port << std::endl; - std::cout << "VP address: 0x" << std::hex << args.vp_address << std::dec - << std::endl; - - //========================================================================== - // [1/4] Initialize RDMA - //========================================================================== - std::cout << "\n[1/4] Initializing RDMA..." << std::endl; - - RdmaContext rdma; - if (!rdma.open(args.device, args.ib_port)) { - std::cerr << "ERROR: Failed to open RDMA device: " << args.device - << std::endl; - return 1; - } - std::cout << " GID index: " << rdma.get_gid_index() << std::endl; - - // TX buffer for outgoing syndromes - RdmaBuffer tx_buffer; - if (!tx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { - std::cerr << "ERROR: Failed to allocate TX buffer" << std::endl; - return 1; - } - - // RX buffer for incoming responses (same page_size as bridge for - // symmetry) - RdmaBuffer rx_buffer; - if (!rx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { - std::cerr << "ERROR: Failed to allocate RX buffer" << std::endl; - return 1; - } - - // Create CQs and QP - ibv_cq *tx_cq = rdma.create_cq(NUM_BUFFERS * 2); - ibv_cq *rx_cq = rdma.create_cq(NUM_BUFFERS * 2); - if (!tx_cq || !rx_cq) { - std::cerr << "ERROR: Failed to create CQs" << std::endl; - return 1; - } - - ibv_qp *qp = rdma.create_qp(tx_cq, rx_cq, NUM_BUFFERS, NUM_BUFFERS); - if (!qp) { - std::cerr << "ERROR: Failed to create QP" << std::endl; - return 1; - } - if (!rdma.qp_to_init(qp)) { - std::cerr << "ERROR: Failed to set QP to INIT" << std::endl; - return 1; - } - - std::cout << " QP Number: 0x" << std::hex << qp->qp_num << std::dec - << std::endl; - std::cout << " TX buffer: " << tx_buffer.size() << " bytes" << std::endl; - std::cout << " RX buffer: " << rx_buffer.size() << " bytes" << std::endl; - - //========================================================================== - // [2/4] Start UDP control plane server - //========================================================================== - std::cout << "\n[2/4] Starting control plane server..." << std::endl; - - RegisterFile regs; - ControlPlaneServer server(args.control_port, args.vp_address, - args.hif_address, regs); - server.set_my_qp(qp->qp_num); - - if (!server.start()) { - std::cerr << "ERROR: Failed to start control plane server" << std::endl; - return 1; - } - std::cout << " Listening on UDP port " << args.control_port << std::endl; - std::cout << " Emulator QP: 0x" << std::hex << qp->qp_num << std::dec - << std::endl; - - //========================================================================== - // [3/4] Wait for RDMA config from playback tool - //========================================================================== - std::cout << "\n[3/4] Waiting for RDMA configuration..." << std::endl; - std::cout << " (Start bridge tool, then playback tool with " - "--control-port=" - << args.control_port << ")" << std::endl; - - if (!server.wait_for_config(300000)) { // 5 minute timeout - std::cerr << "ERROR: Timeout waiting for RDMA configuration" << std::endl; - return 1; - } - - auto &target = server.target(); - target.print(); - - // Connect QP to bridge - ibv_gid remote_gid{}; - if (!args.bridge_ip.empty()) { - // Use provided IP - remote_gid.raw[10] = 0xff; - remote_gid.raw[11] = 0xff; - inet_pton(AF_INET, args.bridge_ip.c_str(), &remote_gid.raw[12]); - } else { - // Derive from VP HOST_IP register if available - uint32_t host_ip = regs.read(args.vp_address + 0x28); // DP_HOST_IP - if (host_ip != 0) { - remote_gid.raw[10] = 0xff; - remote_gid.raw[11] = 0xff; - // DP_HOST_IP is in network byte order from inet_network() - memcpy(&remote_gid.raw[12], &host_ip, 4); - } else { - std::cerr << "ERROR: No bridge IP available. Use --bridge-ip or ensure " - "configure_roce sets HOST_IP." - << std::endl; - return 1; - } - } - - std::cout << " Connecting QP to bridge QP 0x" << std::hex - << target.qp_number << std::dec << "..." << std::endl; - - if (!rdma.qp_to_rtr(qp, remote_gid, target.qp_number, 0)) { - std::cerr << "ERROR: Failed QP -> RTR" << std::endl; - return 1; - } - if (!rdma.qp_to_rts(qp, 0)) { - std::cerr << "ERROR: Failed QP -> RTS" << std::endl; - return 1; - } - std::cout << " QP connected!" << std::endl; - - // Post receive WQEs for responses - for (size_t i = 0; i < NUM_BUFFERS; i++) { - void *addr = - static_cast(rx_buffer.data()) + (i * args.page_size); - if (!rdma.post_recv(qp, i, addr, args.page_size, rx_buffer.lkey())) { - std::cerr << "ERROR: Failed to post receive WQE " << i << std::endl; - return 1; - } - } - std::cout << " Posted " << NUM_BUFFERS << " receive WQEs" << std::endl; - - //========================================================================== - // [4/4] Wait for playback trigger, then run - //========================================================================== - std::cout << "\n[4/4] Waiting for playback trigger..." << std::endl; - - while (!server.playback_triggered() && !g_shutdown) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - if (g_shutdown) { - std::cout << "Shutdown requested" << std::endl; - return 0; - } - - std::cout << "\n=== Playback triggered ===" << std::endl; - - uint32_t win_size = server.window_size(); - uint32_t win_num = server.window_number(); - uint32_t timer = server.timer_spacing(); - uint32_t cycles_per_window = (win_size + 63) / 64; // 64 bytes per beat - - std::cout << " Window size: " << win_size << " bytes" << std::endl; - std::cout << " Window count: " << win_num << std::endl; - std::cout << " Timer spacing: " << timer << " (raw)" << std::endl; - std::cout << " Cycles per window: " << cycles_per_window << std::endl; - - // Compute pacing interval from timer register (timer = 322 * microseconds) - int pacing_us = (timer > 0) ? (timer / 322) : 10; - - // Check if ILA is armed - bool ila_armed = (regs.read(ILA_CTRL) & 0x01) != 0; - std::cout << " ILA capture: " << (ila_armed ? "armed" : "not armed") - << std::endl; - - // Determine page_size for RDMA addressing from target config - uint32_t rdma_page_size = - (target.page_inc > 0) ? target.page_inc : args.page_size; - uint32_t num_pages = target.max_buff + 1; - - std::cout << "\n=== Starting syndrome transmission ===" << std::endl; - - auto start_time = std::chrono::high_resolution_clock::now(); - uint32_t responses_received = 0; - uint32_t send_errors = 0; - uint32_t recv_timeouts = 0; - - for (uint32_t window = 0; window < win_num && !g_shutdown; window++) { - uint32_t slot = window % num_pages; - - // Reassemble syndrome payload from BRAM - auto payload = reassemble_window(regs, window, cycles_per_window); - - // Copy to RDMA TX buffer slot - uint8_t *tx_addr = - static_cast(tx_buffer.data()) + (slot * rdma_page_size); - size_t copy_len = std::min(payload.size(), rdma_page_size); - memcpy(tx_addr, payload.data(), copy_len); - - // RDMA WRITE to bridge's ring buffer - uint64_t remote_addr = target.buffer_addr + (slot * rdma_page_size); - if (!rdma.post_rdma_write_imm(qp, window, tx_addr, copy_len, - tx_buffer.lkey(), remote_addr, target.rkey, - slot)) { - std::cerr << "ERROR: RDMA WRITE failed for window " << window - << std::endl; - send_errors++; - continue; - } - - // Wait for send completion - bool send_ok = false; - auto t0 = std::chrono::steady_clock::now(); - while (!send_ok && !g_shutdown) { - ibv_wc wc; - int n = rdma.poll_cq(tx_cq, &wc, 1); - if (n > 0) { - send_ok = (wc.status == IBV_WC_SUCCESS); - if (!send_ok) { - std::cerr << "ERROR: Send CQE error: " - << ibv_wc_status_str(wc.status) << std::endl; - send_errors++; - } - break; - } - auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - t0) - .count(); - if (elapsed > 5000) { - std::cerr << "ERROR: Send timeout for window " << window << std::endl; - recv_timeouts++; - break; - } - } - if (!send_ok) - continue; - - // Wait for correction response (natural pacing) - bool corr_ok = false; - t0 = std::chrono::steady_clock::now(); - while (!corr_ok && !g_shutdown) { - ibv_wc wc; - int n = rdma.poll_cq(rx_cq, &wc, 1); - if (n > 0) { - if (wc.status == IBV_WC_SUCCESS) { - corr_ok = true; - responses_received++; - - // Store in ILA capture if armed - if (ila_armed) { - uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; - uint8_t *resp_data = static_cast(rx_buffer.data()) + - (rx_slot * args.page_size); - store_ila_sample(regs, window, resp_data, wc.byte_len); - } - - // Re-post receive WQE - uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; - void *rx_addr = static_cast(rx_buffer.data()) + - (rx_slot * args.page_size); - rdma.post_recv(qp, rx_slot, rx_addr, args.page_size, - rx_buffer.lkey()); - } else { - std::cerr << "ERROR: Recv CQE error: " - << ibv_wc_status_str(wc.status) << std::endl; - } - break; - } - auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - t0) - .count(); - if (elapsed > 10000) { - std::cerr << "ERROR: Correction timeout for window " << window - << std::endl; - recv_timeouts++; - break; - } - } - - // Progress - if ((window + 1) % 10 == 0 || window == win_num - 1) { - std::cout << " Window " << (window + 1) << "/" << win_num - << " (responses: " << responses_received - << ", errors: " << send_errors << ")" << std::endl; - } - - // Pacing delay - if (pacing_us > 0 && window + 1 < win_num) { - std::this_thread::sleep_for(std::chrono::microseconds(pacing_us)); - } - } - - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time); - - // Mark ILA as done - if (ila_armed) { - regs.write(ILA_STATUS, regs.read(ILA_STATUS) | 0x02); // done bit - } - - // Report results - std::cout << "\n=== Emulator Results ===" << std::endl; - std::cout << " Windows sent: " << win_num << std::endl; - std::cout << " Responses received: " << responses_received << std::endl; - std::cout << " Send errors: " << send_errors << std::endl; - std::cout << " Timeouts: " << recv_timeouts << std::endl; - std::cout << " Duration: " << duration.count() << " ms" << std::endl; - - // Keep running to allow playback tool to read ILA capture data - if (ila_armed) { - std::cout << "\nWaiting for ILA readback (Ctrl+C to stop)..." - << std::endl; - while (!g_shutdown) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - - // Cleanup - server.stop(); - ibv_destroy_qp(qp); - ibv_destroy_cq(tx_cq); - ibv_destroy_cq(rx_cq); - - if (send_errors == 0 && recv_timeouts == 0 && - responses_received == win_num) { - std::cout << "\n*** EMULATOR: ALL WINDOWS PROCESSED ***" << std::endl; - return 0; - } else { - std::cout << "\n*** EMULATOR: ERRORS DETECTED ***" << std::endl; - return 1; - } - - } catch (const std::exception &e) { - std::cerr << "ERROR: " << e.what() << std::endl; - return 1; - } -} diff --git a/realtime/unittests/utils/hololink_fpga_playback.cpp b/realtime/unittests/utils/hololink_fpga_playback.cpp deleted file mode 100644 index c98d346f..00000000 --- a/realtime/unittests/utils/hololink_fpga_playback.cpp +++ /dev/null @@ -1,534 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file hololink_fpga_playback.cpp -/// @brief Generic RPC playback tool for Hololink FPGA / emulator testing. -/// -/// Sends RPC messages to the FPGA (or emulator) via the Hololink UDP control -/// plane, triggering RDMA transmission to the bridge. After playback, reads -/// back responses from the ILA capture RAM and verifies them. -/// -/// For the generic bridge, the payload is a sequence of ascending bytes and -/// the expected response is each byte incremented by 1. -/// -/// Usage: -/// ./hololink_fpga_playback \ -/// --control-ip=10.0.0.2 --control-port=8193 \ -/// --bridge-qp=0x5 --bridge-rkey=12345 --bridge-buffer=0x7f... \ -/// --page-size=384 --num-pages=64 --num-shots=100 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" - -//============================================================================== -// Hololink Control Plane Protocol -//============================================================================== - -static constexpr uint8_t WR_DWORD = 0x04; -static constexpr uint8_t WR_BLOCK = 0x09; -static constexpr uint8_t RD_DWORD = 0x14; -static constexpr uint8_t RD_BLOCK = 0x19; -static constexpr uint8_t REQUEST_FLAGS_ACK_REQUEST = 0x01; -static constexpr uint8_t RESPONSE_SUCCESS = 0x00; - -// VP register offsets -static constexpr uint32_t DP_QP = 0x00; -static constexpr uint32_t DP_RKEY = 0x04; -static constexpr uint32_t DP_PAGE_LSB = 0x08; -static constexpr uint32_t DP_PAGE_MSB = 0x0C; -static constexpr uint32_t DP_PAGE_INC = 0x10; -static constexpr uint32_t DP_MAX_BUFF = 0x14; -static constexpr uint32_t DP_BUFFER_LENGTH = 0x18; -static constexpr uint32_t DP_HOST_IP = 0x28; - -// HIF register offsets -static constexpr uint32_t DP_VP_MASK = 0x0C; - -// Player registers -static constexpr uint32_t PLAYER_BASE = 0x50000000; -static constexpr uint32_t PLAYER_ENABLE = PLAYER_BASE + 0x04; -static constexpr uint32_t PLAYER_TIMER = PLAYER_BASE + 0x08; -static constexpr uint32_t PLAYER_WIN_SIZE = PLAYER_BASE + 0x0C; -static constexpr uint32_t PLAYER_WIN_NUM = PLAYER_BASE + 0x10; - -// Playback BRAM -static constexpr uint32_t RAM_BASE = 0x50100000; -static constexpr int BRAM_NUM_BANKS = 16; -static constexpr int BRAM_W_SAMPLE_ADDR = 9; -static constexpr int BRAM_BANK_STRIDE = 1 << (BRAM_W_SAMPLE_ADDR + 2); - -// ILA capture -static constexpr uint32_t ILA_BASE = 0x40000000; -static constexpr uint32_t ILA_CTRL = ILA_BASE + 0x00; -static constexpr uint32_t ILA_STATUS = ILA_BASE + 0x80; -static constexpr uint32_t ILA_SAMPLE_ADDR = ILA_BASE + 0x84; -static constexpr uint32_t ILA_DATA_BASE = 0x40100000; -static constexpr int ILA_NUM_BANKS = 17; -static constexpr int ILA_W_ADDR = 13; -static constexpr int ILA_BANK_STRIDE = 1 << (ILA_W_ADDR + 2); - -// Hololink page encoding -static constexpr int PAGE_SHIFT = 7; // 128-byte pages - -//============================================================================== -// UDP helpers -//============================================================================== - -static void write_be32(uint8_t *p, uint32_t v) { - p[0] = (v >> 24) & 0xFF; - p[1] = (v >> 16) & 0xFF; - p[2] = (v >> 8) & 0xFF; - p[3] = v & 0xFF; -} - -static void write_be16(uint8_t *p, uint16_t v) { - p[0] = (v >> 8) & 0xFF; - p[1] = v & 0xFF; -} - -static uint32_t read_be32(const uint8_t *p) { - return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | - (uint32_t(p[2]) << 8) | p[3]; -} - -//============================================================================== -// Control plane client -//============================================================================== - -class ControlPlaneClient { -public: - bool connect(const std::string &ip, uint16_t port) { - fd_ = socket(AF_INET, SOCK_DGRAM, 0); - if (fd_ < 0) - return false; - - addr_.sin_family = AF_INET; - addr_.sin_port = htons(port); - inet_pton(AF_INET, ip.c_str(), &addr_.sin_addr); - - // Set receive timeout - timeval tv{2, 0}; - setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); - return true; - } - - ~ControlPlaneClient() { - if (fd_ >= 0) - ::close(fd_); - } - - bool write_dword(uint32_t addr, uint32_t value) { - uint8_t pkt[14]; - pkt[0] = WR_DWORD; - pkt[1] = REQUEST_FLAGS_ACK_REQUEST; - write_be16(pkt + 2, seq_++); - pkt[4] = 0; - pkt[5] = 0; - write_be32(pkt + 6, addr); - write_be32(pkt + 10, value); - - sendto(fd_, pkt, sizeof(pkt), 0, reinterpret_cast(&addr_), - sizeof(addr_)); - - // Wait for ACK - uint8_t resp[16]; - ssize_t n = recv(fd_, resp, sizeof(resp), 0); - return (n >= 5 && resp[4] == RESPONSE_SUCCESS); - } - - bool write_block(const std::vector> &pairs) { - std::vector pkt(6 + pairs.size() * 8); - pkt[0] = WR_BLOCK; - pkt[1] = REQUEST_FLAGS_ACK_REQUEST; - write_be16(pkt.data() + 2, seq_++); - pkt[4] = 0; - pkt[5] = 0; - - size_t off = 6; - for (auto &[addr, val] : pairs) { - write_be32(pkt.data() + off, addr); - write_be32(pkt.data() + off + 4, val); - off += 8; - } - - sendto(fd_, pkt.data(), pkt.size(), 0, reinterpret_cast(&addr_), - sizeof(addr_)); - - uint8_t resp[16]; - ssize_t n = recv(fd_, resp, sizeof(resp), 0); - return (n >= 5 && resp[4] == RESPONSE_SUCCESS); - } - - uint32_t read_dword(uint32_t addr) { - uint8_t pkt[10]; - pkt[0] = RD_DWORD; - pkt[1] = REQUEST_FLAGS_ACK_REQUEST; - write_be16(pkt + 2, seq_++); - pkt[4] = 0; - pkt[5] = 0; - write_be32(pkt + 6, addr); - - sendto(fd_, pkt, sizeof(pkt), 0, reinterpret_cast(&addr_), - sizeof(addr_)); - - uint8_t resp[32]; - ssize_t n = recv(fd_, resp, sizeof(resp), 0); - if (n >= 14) - return read_be32(resp + 10); - return 0; - } - -private: - int fd_ = -1; - sockaddr_in addr_{}; - uint16_t seq_ = 0; -}; - -//============================================================================== -// Arguments -//============================================================================== - -struct PlaybackArgs { - std::string control_ip = "10.0.0.2"; - uint16_t control_port = 8193; - uint32_t bridge_qp = 0; - uint32_t bridge_rkey = 0; - uint64_t bridge_buffer = 0; - size_t page_size = 384; - unsigned num_pages = 64; - uint32_t num_shots = 100; - uint32_t payload_size = 8; // bytes of RPC argument data - uint32_t vp_address = 0x1000; - uint32_t hif_address = 0x0800; - std::string bridge_ip = "10.0.0.1"; - bool verify = true; -}; - -static PlaybackArgs parse_args(int argc, char *argv[]) { - PlaybackArgs args; - for (int i = 1; i < argc; i++) { - std::string a = argv[i]; - if (a.find("--control-ip=") == 0) - args.control_ip = a.substr(13); - else if (a.find("--control-port=") == 0) - args.control_port = std::stoi(a.substr(15)); - else if (a.find("--bridge-qp=") == 0) - args.bridge_qp = std::stoul(a.substr(12), nullptr, 0); - else if (a.find("--bridge-rkey=") == 0) - args.bridge_rkey = std::stoul(a.substr(14), nullptr, 0); - else if (a.find("--bridge-buffer=") == 0) - args.bridge_buffer = std::stoull(a.substr(16), nullptr, 0); - else if (a.find("--page-size=") == 0) - args.page_size = std::stoull(a.substr(12)); - else if (a.find("--num-pages=") == 0) - args.num_pages = std::stoul(a.substr(12)); - else if (a.find("--num-shots=") == 0) - args.num_shots = std::stoul(a.substr(12)); - else if (a.find("--payload-size=") == 0) - args.payload_size = std::stoul(a.substr(15)); - else if (a.find("--vp-address=") == 0) - args.vp_address = std::stoul(a.substr(13), nullptr, 0); - else if (a.find("--hif-address=") == 0) - args.hif_address = std::stoul(a.substr(14), nullptr, 0); - else if (a.find("--bridge-ip=") == 0) - args.bridge_ip = a.substr(12); - else if (a == "--no-verify") - args.verify = false; - else if (a == "--help" || a == "-h") { - std::cout - << "Usage: hololink_fpga_playback [options]\n" - << "\nGeneric RPC playback tool for Hololink FPGA/emulator.\n" - << "\nOptions:\n" - << " --control-ip=ADDR Emulator/FPGA IP (default: 10.0.0.2)\n" - << " --control-port=N UDP control port (default: 8193)\n" - << " --bridge-qp=N Bridge QP number\n" - << " --bridge-rkey=N Bridge RKey\n" - << " --bridge-buffer=ADDR Bridge buffer address\n" - << " --page-size=N Ring buffer slot size (default: 384)\n" - << " --num-pages=N Number of ring buffer slots (default: " - "64)\n" - << " --num-shots=N Number of RPC messages (default: 100)\n" - << " --payload-size=N Bytes per RPC payload (default: 8)\n" - << " --vp-address=ADDR VP register base (default: 0x1000)\n" - << " --hif-address=ADDR HIF register base (default: 0x0800)\n" - << " --bridge-ip=ADDR Bridge IP for FPGA (default: 10.0.0.1)\n" - << " --no-verify Skip ILA correction verification\n"; - exit(0); - } - } - return args; -} - -//============================================================================== -// BRAM loading -//============================================================================== - -/// Build one RPC message for the increment handler. -/// Format: RPCHeader + ascending byte payload. -static std::vector build_rpc_message(uint32_t shot_index, - uint32_t payload_size) { - using cudaq::realtime::fnv1a_hash; - using cudaq::realtime::RPCHeader; - - constexpr uint32_t FUNC_ID = fnv1a_hash("rpc_increment"); - - std::vector msg(sizeof(RPCHeader) + payload_size, 0); - auto *hdr = reinterpret_cast(msg.data()); - hdr->magic = cudaq::realtime::RPC_MAGIC_REQUEST; - hdr->function_id = FUNC_ID; - hdr->arg_len = payload_size; - - uint8_t *payload = msg.data() + sizeof(RPCHeader); - for (uint32_t i = 0; i < payload_size; i++) { - payload[i] = static_cast((shot_index + i) & 0xFF); - } - return msg; -} - -/// Spread a message across 16 BRAM banks (64-byte beats). -static void load_message_to_bram(ControlPlaneClient &ctrl, - const std::vector &msg, - uint32_t window_index, - uint32_t cycles_per_window) { - std::vector> batch; - - for (uint32_t cycle = 0; cycle < cycles_per_window; cycle++) { - uint32_t sample = window_index * cycles_per_window + cycle; - for (int bank = 0; bank < BRAM_NUM_BANKS; bank++) { - uint32_t addr = - RAM_BASE + (bank << (BRAM_W_SAMPLE_ADDR + 2)) + (sample * 4); - uint32_t val = 0; - size_t byte_off = cycle * 64 + bank * 4; - if (byte_off < msg.size()) { - size_t copy_len = std::min(4, msg.size() - byte_off); - memcpy(&val, msg.data() + byte_off, copy_len); - } - batch.push_back({addr, val}); - } - - // Send in chunks to stay within UDP MTU - if (batch.size() >= 64) { - ctrl.write_block(batch); - batch.clear(); - } - } - - if (!batch.empty()) - ctrl.write_block(batch); -} - -//============================================================================== -// Main -//============================================================================== - -int main(int argc, char *argv[]) { - auto args = parse_args(argc, argv); - - std::cout << "=== Hololink Generic RPC Playback ===" << std::endl; - std::cout << "Control: " << args.control_ip << ":" << args.control_port - << std::endl; - std::cout << "Shots: " << args.num_shots << std::endl; - std::cout << "Payload size: " << args.payload_size << " bytes" << std::endl; - - ControlPlaneClient ctrl; - if (!ctrl.connect(args.control_ip, args.control_port)) { - std::cerr << "ERROR: Failed to connect to control plane" << std::endl; - return 1; - } - - //============================================================================ - // Configure RDMA target (bridge's QP/RKEY/buffer) - //============================================================================ - std::cout << "\n[1/4] Configuring RDMA target..." << std::endl; - - uint32_t vp = args.vp_address; - ctrl.write_dword(vp + DP_QP, args.bridge_qp); - ctrl.write_dword(vp + DP_RKEY, args.bridge_rkey); - ctrl.write_dword(vp + DP_PAGE_LSB, - static_cast(args.bridge_buffer >> PAGE_SHIFT)); - ctrl.write_dword(vp + DP_PAGE_MSB, - static_cast(args.bridge_buffer >> 32)); - ctrl.write_dword(vp + DP_PAGE_INC, - static_cast(args.page_size >> PAGE_SHIFT)); - ctrl.write_dword(vp + DP_MAX_BUFF, args.num_pages - 1); - - size_t frame_size = sizeof(cudaq::realtime::RPCHeader) + args.payload_size; - ctrl.write_dword(vp + DP_BUFFER_LENGTH, static_cast(frame_size)); - - // Set bridge IP for emulator GID derivation - { - in_addr a; - inet_pton(AF_INET, args.bridge_ip.c_str(), &a); - ctrl.write_dword(vp + DP_HOST_IP, a.s_addr); - } - - // Enable VP mask - ctrl.write_dword(args.hif_address + DP_VP_MASK, 0x01); - - std::cout << " Bridge QP: 0x" << std::hex << args.bridge_qp << std::dec - << std::endl; - std::cout << " Bridge RKey: " << args.bridge_rkey << std::endl; - std::cout << " Bridge Buffer: 0x" << std::hex << args.bridge_buffer - << std::dec << std::endl; - - //============================================================================ - // Load RPC messages into BRAM - //============================================================================ - std::cout << "\n[2/4] Loading RPC messages into BRAM..." << std::endl; - - uint32_t window_size = static_cast(frame_size); - uint32_t cycles_per_window = (window_size + 63) / 64; - - for (uint32_t shot = 0; shot < args.num_shots; shot++) { - auto msg = build_rpc_message(shot, args.payload_size); - load_message_to_bram(ctrl, msg, shot, cycles_per_window); - - if ((shot + 1) % 10 == 0) - std::cout << " Loaded " << (shot + 1) << "/" << args.num_shots - << std::endl; - } - - //============================================================================ - // Arm ILA and trigger playback - //============================================================================ - std::cout << "\n[3/4] Triggering playback..." << std::endl; - - // Arm ILA capture - if (args.verify) { - ctrl.write_dword(ILA_CTRL, 0x01); - } - - // Set player registers - ctrl.write_dword(PLAYER_WIN_SIZE, window_size); - ctrl.write_dword(PLAYER_WIN_NUM, args.num_shots); - ctrl.write_dword(PLAYER_TIMER, 322 * 100); // 100 us spacing - - // Trigger - ctrl.write_dword(PLAYER_ENABLE, 1); - std::cout << " Playback triggered for " << args.num_shots << " shots" - << std::endl; - - //============================================================================ - // Wait and verify ILA capture - //============================================================================ - if (args.verify) { - std::cout << "\n[4/4] Verifying responses..." << std::endl; - - // Wait for ILA to indicate done (bit 1 of ILA_STATUS) - int timeout = 120; // seconds - bool done = false; - for (int i = 0; i < timeout * 10 && !done; i++) { - uint32_t status = ctrl.read_dword(ILA_STATUS); - if (status & 0x02) - done = true; - else - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - if (!done) { - std::cerr << "ERROR: ILA capture timeout" << std::endl; - return 1; - } - - uint32_t sample_count = ctrl.read_dword(ILA_SAMPLE_ADDR); - std::cout << " ILA captured " << sample_count << " samples" << std::endl; - - // Read back and verify each response - uint32_t matched = 0; - uint32_t check_count = std::min(sample_count, args.num_shots); - - for (uint32_t i = 0; i < check_count; i++) { - // Read response from ILA banks (the first bytes are RPCResponse header) - std::vector response_bytes(64, 0); - for (int bank = 0; bank < std::min(ILA_NUM_BANKS - 1, 16); bank++) { - uint32_t addr = ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (i * 4); - uint32_t val = ctrl.read_dword(addr); - size_t byte_off = bank * 4; - if (byte_off + 4 <= response_bytes.size()) - memcpy(response_bytes.data() + byte_off, &val, 4); - } - - // Check control signals (bank 16): tvalid must be set - uint32_t ctrl_addr = - ILA_DATA_BASE + ((ILA_NUM_BANKS - 1) << (ILA_W_ADDR + 2)) + (i * 4); - uint32_t ctrl_val = ctrl.read_dword(ctrl_addr); - bool tvalid = (ctrl_val & 0x01) != 0; - - if (!tvalid) { - std::cerr << " Shot " << i << ": tvalid=0 (no response)" << std::endl; - continue; - } - - // Parse RPCResponse - auto *resp = reinterpret_cast( - response_bytes.data()); - - if (resp->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) { - std::cerr << " Shot " << i << ": bad magic 0x" << std::hex - << resp->magic << std::dec << std::endl; - continue; - } - - if (resp->status != 0) { - std::cerr << " Shot " << i << ": error status " << resp->status - << std::endl; - continue; - } - - // Verify increment: each byte should be (shot_index + byte_index + 1) - const uint8_t *result_data = - response_bytes.data() + sizeof(cudaq::realtime::RPCResponse); - bool ok = true; - uint32_t check_len = std::min(resp->result_len, args.payload_size); - for (uint32_t j = 0; j < check_len && ok; j++) { - uint8_t expected = static_cast(((i + j) & 0xFF) + 1); - if (result_data[j] != expected) { - std::cerr << " Shot " << i << " byte " << j << ": expected " - << (int)expected << " got " << (int)result_data[j] - << std::endl; - ok = false; - } - } - if (ok) - matched++; - } - - std::cout << "\n=== Verification Results ===" << std::endl; - std::cout << " RPC responses matched: " << matched << " / " << check_count - << std::endl; - - if (matched == check_count) { - std::cout << "\n*** ALL RESPONSES VERIFIED ***" << std::endl; - return 0; - } else { - std::cout << "\n*** VERIFICATION FAILED ***" << std::endl; - return 1; - } - } else { - std::cout << "\n[4/4] Verification skipped (--no-verify)" << std::endl; - // Wait a bit for playback to complete - std::this_thread::sleep_for(std::chrono::seconds(10)); - std::cout << "\n*** PLAYBACK COMPLETE ***" << std::endl; - return 0; - } -} diff --git a/realtime/unittests/utils/hololink_test.sh b/realtime/unittests/utils/hololink_test.sh deleted file mode 100755 index bafdb29b..00000000 --- a/realtime/unittests/utils/hololink_test.sh +++ /dev/null @@ -1,408 +0,0 @@ -#!/bin/bash -# ============================================================================ # -# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # -# All rights reserved. # -# # -# This source code and the accompanying materials are made available under # -# the terms of the Apache License 2.0 which accompanies this distribution. # -# ============================================================================ # -# -# hololink_test.sh -# -# Orchestration script for end-to-end Hololink RPC dispatch testing. -# Tests libcudaq-realtime dispatch kernel over Hololink RDMA with a -# simple increment RPC handler (no QEC or decoder dependency). -# -# Modes: -# Default (FPGA): bridge + playback (requires real FPGA) -# --emulate: emulator + bridge + playback (no FPGA needed) -# -# Actions (can be combined): -# --build Build all required tools -# --setup-network Configure ConnectX interfaces -# (run is implicit unless only --build / --setup-network are given) -# -# Examples: -# # Full emulated test: build, configure network, run -# ./hololink_test.sh --emulate --build --setup-network -# -# # Just run with real FPGA (tools already built, network already set up) -# ./hololink_test.sh --fpga-ip 192.168.0.2 -# -# # Build only -# ./hololink_test.sh --build --no-run -# -set -euo pipefail - -# ============================================================================ -# Defaults -# ============================================================================ - -EMULATE=false -DO_BUILD=false -DO_SETUP_NETWORK=false -DO_RUN=true -VERIFY=true - -# Directory defaults -HOLOLINK_DIR="/workspaces/cuda-qx/hololink" -CUDA_QUANTUM_DIR="/workspaces/cuda-quantum" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Network defaults -IB_DEVICE="" # auto-detect -BRIDGE_IP="10.0.0.1" -EMULATOR_IP="10.0.0.2" -FPGA_IP="192.168.0.2" -MTU=4096 - -# Run defaults -GPU_ID=0 -TIMEOUT=60 -NUM_SHOTS=100 -PAYLOAD_SIZE=8 -PAGE_SIZE=384 -NUM_PAGES=64 -CONTROL_PORT=8193 - -# Build parallelism -JOBS=$(nproc 2>/dev/null || echo 8) - -# ============================================================================ -# Argument Parsing -# ============================================================================ - -print_usage() { - cat <<'EOF' -Usage: hololink_test.sh [options] - -Modes: - --emulate Use FPGA emulator (3-tool mode, no FPGA needed) - Default: FPGA mode (2-tool, requires real FPGA) - -Actions: - --build Build all required tools before running - --setup-network Configure ConnectX network interfaces - --no-run Skip running the test (useful with --build) - -Build options: - --hololink-dir DIR Hololink source directory - (default: /workspaces/cuda-qx/hololink) - --cuda-quantum-dir DIR cuda-quantum source directory - (default: /workspaces/cuda-quantum) - --jobs N Parallel build jobs (default: nproc) - -Network options: - --device DEV ConnectX IB device name (default: auto-detect) - --bridge-ip ADDR Bridge tool IP (default: 10.0.0.1) - --emulator-ip ADDR Emulator IP (default: 10.0.0.2) - --fpga-ip ADDR FPGA IP for non-emulate mode (default: 192.168.0.2) - --mtu N MTU size (default: 4096) - -Run options: - --gpu N GPU device ID (default: 0) - --timeout N Timeout in seconds (default: 60) - --no-verify Skip ILA correction verification - --num-shots N Number of RPC messages (default: 100) - --payload-size N Bytes per RPC payload (default: 8) - --page-size N Ring buffer slot size in bytes (default: 384) - --num-pages N Number of ring buffer slots (default: 64) - --control-port N UDP control port for emulator (default: 8193) - - --help, -h Show this help -EOF -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --emulate) EMULATE=true ;; - --build) DO_BUILD=true ;; - --setup-network) DO_SETUP_NETWORK=true ;; - --no-run) DO_RUN=false ;; - --no-verify) VERIFY=false ;; - --hololink-dir) HOLOLINK_DIR="$2"; shift ;; - --cuda-quantum-dir) CUDA_QUANTUM_DIR="$2"; shift ;; - --jobs) JOBS="$2"; shift ;; - --device) IB_DEVICE="$2"; shift ;; - --bridge-ip) BRIDGE_IP="$2"; shift ;; - --emulator-ip) EMULATOR_IP="$2"; shift ;; - --fpga-ip) FPGA_IP="$2"; shift ;; - --mtu) MTU="$2"; shift ;; - --gpu) GPU_ID="$2"; shift ;; - --timeout) TIMEOUT="$2"; shift ;; - --num-shots) NUM_SHOTS="$2"; shift ;; - --payload-size) PAYLOAD_SIZE="$2"; shift ;; - --page-size) PAGE_SIZE="$2"; shift ;; - --num-pages) NUM_PAGES="$2"; shift ;; - --control-port) CONTROL_PORT="$2"; shift ;; - --help|-h) print_usage; exit 0 ;; - *) - echo "ERROR: Unknown option: $1" >&2 - print_usage >&2 - exit 1 - ;; - esac - shift -done - -# ============================================================================ -# Auto-detect IB device -# ============================================================================ - -detect_ib_device() { - if [[ -n "$IB_DEVICE" ]]; then - echo "$IB_DEVICE" - return - fi - local dev - dev=$(ibstat -l 2>/dev/null | head -1 || true) - if [[ -z "$dev" ]]; then - dev=$(ls /sys/class/infiniband/ 2>/dev/null | head -1 || true) - fi - if [[ -z "$dev" ]]; then - echo "ERROR: Could not auto-detect IB device. Use --device." >&2 - exit 1 - fi - echo "$dev" -} - -# ============================================================================ -# Network interface name from IB device -# ============================================================================ - -get_netdev() { - local ib_dev=$1 - local netdev - netdev=$(ls "/sys/class/infiniband/$ib_dev/device/net/" 2>/dev/null | head -1 || true) - echo "$netdev" -} - -# ============================================================================ -# Build -# ============================================================================ - -do_build() { - echo "=== Building tools ===" - - local realtime_dir="$CUDA_QUANTUM_DIR/realtime" - local realtime_build="$realtime_dir/build" - local hololink_build="$HOLOLINK_DIR/build" - - # Detect target arch - local arch - arch=$(uname -m) - local target_arch="amd64" - if [[ "$arch" == "aarch64" ]]; then - target_arch="arm64" - fi - - # Build hololink (only the two libraries we need) - echo "--- Building hololink ($target_arch) ---" - cmake -G Ninja -S "$HOLOLINK_DIR" -B "$hololink_build" \ - -DCMAKE_BUILD_TYPE=Release \ - -DTARGETARCH="$target_arch" \ - -DHOLOLINK_BUILD_ONLY_NATIVE=OFF \ - -DHOLOLINK_BUILD_PYTHON=OFF \ - -DHOLOLINK_BUILD_TESTS=OFF \ - -DHOLOLINK_BUILD_TOOLS=OFF \ - -DHOLOLINK_BUILD_EXAMPLES=OFF \ - -DHOLOLINK_BUILD_EMULATOR=OFF - cmake --build "$hololink_build" -j"$JOBS" \ - --target gpu_roce_transceiver hololink_core - - # Build cuda-quantum/realtime with hololink tools enabled - echo "--- Building cuda-quantum/realtime ---" - cmake -G Ninja -S "$realtime_dir" -B "$realtime_build" \ - -DCMAKE_BUILD_TYPE=Release \ - -DCUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS=ON \ - -DHOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR="$HOLOLINK_DIR" \ - -DHOLOSCAN_SENSOR_BRIDGE_BUILD_DIR="$hololink_build" - cmake --build "$realtime_build" -j"$JOBS" \ - --target hololink_bridge hololink_fpga_emulator hololink_fpga_playback - - echo "=== Build complete ===" -} - -# ============================================================================ -# Network setup -# ============================================================================ - -do_setup_network() { - IB_DEVICE=$(detect_ib_device) - local netdev - netdev=$(get_netdev "$IB_DEVICE") - - echo "=== Setting up network ===" - echo " IB device: $IB_DEVICE" - echo " Net device: $netdev" - - if [[ -z "$netdev" ]]; then - echo "ERROR: No network device found for $IB_DEVICE" >&2 - exit 1 - fi - - sudo ip link set "$netdev" up mtu "$MTU" || true - sudo ip addr add "$BRIDGE_IP/24" dev "$netdev" 2>/dev/null || true - - if $EMULATE; then - sudo ip addr add "$EMULATOR_IP/24" dev "$netdev" 2>/dev/null || true - # Add static ARP entries - sudo ip neigh replace "$BRIDGE_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true - sudo ip neigh replace "$EMULATOR_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true - fi - - echo "=== Network setup complete ===" -} - -# ============================================================================ -# Run -# ============================================================================ - -cleanup_pids() { - for pid in "${PIDS[@]}"; do - if kill -0 "$pid" 2>/dev/null; then - kill "$pid" 2>/dev/null || true - wait "$pid" 2>/dev/null || true - fi - done -} - -do_run() { - IB_DEVICE=$(detect_ib_device) - local build_dir="$CUDA_QUANTUM_DIR/realtime/build" - local utils_dir="$build_dir/unittests/utils" - - local bridge_bin="$utils_dir/hololink_bridge" - local emulator_bin="$utils_dir/hololink_fpga_emulator" - local playback_bin="$utils_dir/hololink_fpga_playback" - - # Verify binaries exist - for bin in "$bridge_bin"; do - if [[ ! -x "$bin" ]]; then - echo "ERROR: $bin not found. Run with --build first." >&2 - exit 1 - fi - done - - PIDS=() - trap cleanup_pids EXIT - - local FPGA_QP - local FPGA_TARGET_IP - - if $EMULATE; then - echo "=== Emulated mode ===" - - # Start emulator - echo "--- Starting emulator ---" - "$emulator_bin" \ - --device="$IB_DEVICE" \ - --port="$CONTROL_PORT" \ - --bridge-ip="$BRIDGE_IP" \ - --page-size="$PAGE_SIZE" \ - 2>&1 | tee /tmp/emulator.log & - PIDS+=($!) - - # Wait for emulator to print QP number - sleep 2 - FPGA_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/emulator.log | head -1) - if [[ -z "$FPGA_QP" ]]; then - echo "ERROR: Could not parse emulator QP from log" >&2 - exit 1 - fi - FPGA_QP="0x$FPGA_QP" - FPGA_TARGET_IP="$EMULATOR_IP" - - echo " Emulator QP: $FPGA_QP" - else - echo "=== FPGA mode ===" - FPGA_QP="0x2" - FPGA_TARGET_IP="$FPGA_IP" - fi - - # Start bridge - echo "--- Starting bridge ---" - "$bridge_bin" \ - --device="$IB_DEVICE" \ - --peer-ip="$FPGA_TARGET_IP" \ - --remote-qp="$FPGA_QP" \ - --gpu="$GPU_ID" \ - --timeout="$TIMEOUT" \ - --page-size="$PAGE_SIZE" \ - --num-pages="$NUM_PAGES" \ - 2>&1 | tee /tmp/bridge.log & - PIDS+=($!) - - # Wait for bridge to print QP info - sleep 3 - local BRIDGE_QP BRIDGE_RKEY BRIDGE_BUFFER - BRIDGE_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1) - BRIDGE_RKEY=$(grep -oP 'RKey: \K[0-9]+' /tmp/bridge.log | tail -1) - BRIDGE_BUFFER=$(grep -oP 'Buffer Addr: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1) - - if [[ -z "$BRIDGE_QP" || -z "$BRIDGE_RKEY" || -z "$BRIDGE_BUFFER" ]]; then - echo "ERROR: Could not parse bridge QP info from log" >&2 - echo " QP=$BRIDGE_QP RKEY=$BRIDGE_RKEY BUFFER=$BRIDGE_BUFFER" >&2 - exit 1 - fi - - echo " Bridge QP: 0x$BRIDGE_QP" - echo " Bridge RKey: $BRIDGE_RKEY" - echo " Bridge Buffer: 0x$BRIDGE_BUFFER" - - # Start playback - echo "--- Starting playback ---" - local verify_flag="" - if ! $VERIFY; then - verify_flag="--no-verify" - fi - - "$playback_bin" \ - --control-ip="$FPGA_TARGET_IP" \ - --control-port="$CONTROL_PORT" \ - --bridge-qp="0x$BRIDGE_QP" \ - --bridge-rkey="$BRIDGE_RKEY" \ - --bridge-buffer="0x$BRIDGE_BUFFER" \ - --page-size="$PAGE_SIZE" \ - --num-pages="$NUM_PAGES" \ - --num-shots="$NUM_SHOTS" \ - --payload-size="$PAYLOAD_SIZE" \ - --bridge-ip="$BRIDGE_IP" \ - $verify_flag - PLAYBACK_EXIT=$? - - # Wait for bridge to finish - sleep 2 - - # Cleanup - cleanup_pids - - echo "" - if [[ $PLAYBACK_EXIT -eq 0 ]]; then - echo "*** TEST PASSED ***" - else - echo "*** TEST FAILED ***" - fi - exit $PLAYBACK_EXIT -} - -# ============================================================================ -# Main -# ============================================================================ - -echo "=== Hololink Generic RPC Test ===" -echo "Mode: $(if $EMULATE; then echo "emulated"; else echo "FPGA"; fi)" - -if $DO_BUILD; then - do_build -fi - -if $DO_SETUP_NETWORK; then - do_setup_network -fi - -if $DO_RUN; then - do_run -fi - -echo "Done." diff --git a/realtime/unittests/utils/hololink_wrapper.cpp b/realtime/unittests/utils/hololink_wrapper.cpp deleted file mode 100644 index fb83aedb..00000000 --- a/realtime/unittests/utils/hololink_wrapper.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file hololink_wrapper.cpp -/// @brief C wrapper implementation for Hololink GpuRoceTransceiver. -/// -/// This file is compiled by g++ (not nvcc) to isolate Hololink's fmt -/// dependency from CUDA translation units. - -#include "hololink_wrapper.h" - -// Include Hololink headers here (with Holoscan's fmt) -#include - -#include - -using namespace hololink::operators; - -//============================================================================== -// Internal implementation struct -//============================================================================== - -struct HololinkTransceiverImpl { - std::unique_ptr transceiver; - size_t page_size; - unsigned num_pages; -}; - -//============================================================================== -// Lifecycle -//============================================================================== - -hololink_transceiver_t -hololink_create_transceiver(const char *device_name, int ib_port, - size_t frame_size, size_t page_size, - unsigned num_pages, const char *peer_ip, - int forward, int rx_only, int tx_only) { - try { - auto *impl = new HololinkTransceiverImpl(); - impl->page_size = page_size; - impl->num_pages = num_pages; - impl->transceiver = std::make_unique( - device_name, static_cast(ib_port), frame_size, page_size, - num_pages, peer_ip, forward != 0, rx_only != 0, tx_only != 0); - return reinterpret_cast(impl); - } catch (const std::exception &e) { - std::cerr << "ERROR: Failed to create GpuRoceTransceiver: " << e.what() - << std::endl; - return nullptr; - } catch (...) { - std::cerr << "ERROR: Failed to create GpuRoceTransceiver: unknown exception" - << std::endl; - return nullptr; - } -} - -void hololink_destroy_transceiver(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - delete impl; - } -} - -int hololink_start(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->start() ? 1 : 0; - } - return 0; -} - -void hololink_close(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - impl->transceiver->close(); - } -} - -void hololink_blocking_monitor(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - impl->transceiver->blocking_monitor(); - } -} - -//============================================================================== -// QP information -//============================================================================== - -uint32_t hololink_get_qp_number(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_qp_number(); - } - return 0; -} - -uint32_t hololink_get_rkey(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_rkey(); - } - return 0; -} - -uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->external_frame_memory(); - } - return 0; -} - -int hololink_get_gid(hololink_transceiver_t handle, uint8_t *gid_out) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_gid(gid_out); - } - return 0; -} - -//============================================================================== -// Deferred QP connection -//============================================================================== - -int hololink_reconnect_qp(hololink_transceiver_t handle, - const uint8_t *remote_gid, uint32_t remote_qpn) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->reconnect_qp(remote_gid, remote_qpn) ? 1 : 0; - } - return 0; -} - -//============================================================================== -// Ring buffer access -//============================================================================== - -void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_rx_ring_data_addr(); - } - return nullptr; -} - -uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_rx_ring_flag_addr(); - } - return nullptr; -} - -void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_tx_ring_data_addr(); - } - return nullptr; -} - -uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_tx_ring_flag_addr(); - } - return nullptr; -} - -uint64_t *hololink_get_tx_ring_flag_host_addr(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->transceiver->get_tx_ring_flag_host_addr(); - } - return nullptr; -} - -uint64_t *hololink_get_rx_ring_flag_host_addr(hololink_transceiver_t handle) { - // Note: GpuRoceTransceiver does not currently expose host RX flag addr. - (void)handle; - return nullptr; -} - -bool hololink_query_kernel_occupancy(void) { - int prep = 0, rx = 0, tx = 0; - cudaError_t err = GpuRoceTransceiverQueryOccupancy(&prep, &rx, &tx); - if (err != cudaSuccess) { - fprintf(stderr, "ERROR: Hololink kernel occupancy query failed: %s\n", - cudaGetErrorString(err)); - return false; - } - printf(" Hololink kernel occupancy: prepare=%d rx=%d tx=%d\n", prep, rx, tx); - return true; -} - -size_t hololink_get_page_size(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->page_size; - } - return 0; -} - -unsigned hololink_get_num_pages(hololink_transceiver_t handle) { - if (handle) { - auto *impl = reinterpret_cast(handle); - return impl->num_pages; - } - return 0; -} diff --git a/realtime/unittests/utils/hololink_wrapper.h b/realtime/unittests/utils/hololink_wrapper.h deleted file mode 100644 index ebc2ceef..00000000 --- a/realtime/unittests/utils/hololink_wrapper.h +++ /dev/null @@ -1,142 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file hololink_wrapper.h -/// @brief C interface to Hololink GpuRoceTransceiver. -/// -/// This wrapper avoids `fmt` library conflicts between Hololink (which uses -/// Holoscan's `fmt`) and CUDA files compiled by nvcc. - -#ifndef HOLOLINK_WRAPPER_H -#define HOLOLINK_WRAPPER_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Opaque handle for GpuRoceTransceiver -typedef void *hololink_transceiver_t; - -//============================================================================== -// Transceiver lifecycle -//============================================================================== - -/** - * Create a new Hololink transceiver. - * - * @param device_name IB device name (e.g., "rocep1s0f0") - * @param ib_port IB port number - * @param frame_size Size of each frame (cu_frame_size) - * @param page_size Size of each page/slot (cu_page_size) - * @param num_pages Number of pages (ring buffer slots) - * @param peer_ip Peer IP address (use "0.0.0.0" for deferred connection) - * @param forward 1 to run forward (echo) kernel - * @param rx_only 1 to run RX-only kernel - * @param tx_only 1 to run TX-only kernel - * @return Handle to transceiver, or NULL on failure - */ -hololink_transceiver_t -hololink_create_transceiver(const char *device_name, int ib_port, - size_t frame_size, size_t page_size, - unsigned num_pages, const char *peer_ip, - int forward, int rx_only, int tx_only); - -/** - * Destroy a transceiver and free resources. - */ -void hololink_destroy_transceiver(hololink_transceiver_t handle); - -/** - * Start the transceiver (initializes DOCA resources, creates QP/CQ). - * @return 1 on success, 0 on failure - */ -int hololink_start(hololink_transceiver_t handle); - -/** - * Close the transceiver (signals shutdown). - */ -void hololink_close(hololink_transceiver_t handle); - -/** - * Run the blocking monitor (launches GPU kernels and waits). - * This function blocks until close() is called. - */ -void hololink_blocking_monitor(hololink_transceiver_t handle); - -//============================================================================== -// QP information (for RDMA setup) -//============================================================================== - -uint32_t hololink_get_qp_number(hololink_transceiver_t handle); -uint32_t hololink_get_rkey(hololink_transceiver_t handle); -uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle); - -/** - * Get the local GID for this transceiver. - * @param handle Transceiver handle - * @param gid_out Buffer to receive 16-byte GID - * @return 1 on success, 0 on failure - */ -int hololink_get_gid(hololink_transceiver_t handle, uint8_t *gid_out); - -//============================================================================== -// Deferred QP connection -//============================================================================== - -/** - * Connect the QP to a remote peer (for deferred connection mode). - * Call this after start() when peer_ip was "0.0.0.0". - * @param handle Transceiver handle - * @param remote_gid 16-byte remote GID - * @param remote_qpn Remote QP number - * @return 1 on success, 0 on failure - */ -int hololink_reconnect_qp(hololink_transceiver_t handle, - const uint8_t *remote_gid, uint32_t remote_qpn); - -//============================================================================== -// Ring buffer access -//============================================================================== - -/** Get device pointer to RX ring data buffer. */ -void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle); - -/** Get device pointer to RX ring flag array. */ -uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle); - -/** Get device pointer to TX ring data buffer. */ -void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle); - -/** Get device pointer to TX ring flag array. */ -uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle); - -/** Get host-accessible pointer to TX ring flag array. */ -uint64_t *hololink_get_tx_ring_flag_host_addr(hololink_transceiver_t handle); - -/** Get host-accessible pointer to RX ring flag array. */ -uint64_t *hololink_get_rx_ring_flag_host_addr(hololink_transceiver_t handle); - -/** Force eager CUDA module loading by querying kernel occupancy. - * Call before launching any persistent kernels. - * Returns true on success (all kernels valid). */ -bool hololink_query_kernel_occupancy(void); - -/** Get the page (slot) size configured for this transceiver. */ -size_t hololink_get_page_size(hololink_transceiver_t handle); - -/** Get the number of pages (slots) configured for this transceiver. */ -unsigned hololink_get_num_pages(hololink_transceiver_t handle); - -#ifdef __cplusplus -} -#endif - -#endif // HOLOLINK_WRAPPER_H diff --git a/realtime/unittests/utils/init_rpc_increment_function_table.cu b/realtime/unittests/utils/init_rpc_increment_function_table.cu deleted file mode 100644 index dde181cf..00000000 --- a/realtime/unittests/utils/init_rpc_increment_function_table.cu +++ /dev/null @@ -1,92 +0,0 @@ -/****************************************************************-*- C++ -*-**** - * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * - * All rights reserved. * - * * - * This source code and the accompanying materials are made available under * - * the terms of the Apache License 2.0 which accompanies this distribution. * - ******************************************************************************/ - -/// @file init_rpc_increment_function_table.cu -/// @brief Device-side increment RPC handler and function table initialisation. -/// -/// This file is compiled by nvcc so that the __device__ function pointer -/// can be taken. The host-callable setup_rpc_increment_function_table() -/// wrapper is extern "C" so that the bridge .cpp (compiled by g++) can -/// call it without needing CUDA kernel launch syntax. - -#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" -#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" - -#include -#include - -namespace { - -//============================================================================== -// Increment RPC Handler -//============================================================================== - -/// @brief Simple RPC handler that increments each byte of the payload by 1. -/// -/// Matches the DeviceRPCFunction signature. Reads from input, writes to -/// output (no in-place overlap). -__device__ int rpc_increment_handler(const void *input, void *output, - std::uint32_t arg_len, - std::uint32_t max_result_len, - std::uint32_t *result_len) { - const std::uint8_t *in_data = static_cast(input); - std::uint8_t *out_data = static_cast(output); - std::uint32_t len = (arg_len < max_result_len) ? arg_len : max_result_len; - for (std::uint32_t i = 0; i < len; ++i) { - out_data[i] = static_cast(in_data[i] + 1); - } - *result_len = len; - return 0; -} - -constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = - cudaq::realtime::fnv1a_hash("rpc_increment"); - -/// @brief Kernel to populate a cudaq_function_entry_t with the increment -/// handler. -__global__ void init_function_table_kernel(cudaq_function_entry_t *entries) { - if (threadIdx.x == 0 && blockIdx.x == 0) { - entries[0].handler.device_fn_ptr = - reinterpret_cast(&rpc_increment_handler); - entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; - entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; - entries[0].reserved[0] = 0; - entries[0].reserved[1] = 0; - entries[0].reserved[2] = 0; - - // Schema: 1 array argument (uint8), 1 array result (uint8) - entries[0].schema.num_args = 1; - entries[0].schema.num_results = 1; - entries[0].schema.reserved = 0; - entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; - entries[0].schema.args[0].reserved[0] = 0; - entries[0].schema.args[0].reserved[1] = 0; - entries[0].schema.args[0].reserved[2] = 0; - entries[0].schema.args[0].size_bytes = 0; - entries[0].schema.args[0].num_elements = 0; - entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; - entries[0].schema.results[0].reserved[0] = 0; - entries[0].schema.results[0].reserved[1] = 0; - entries[0].schema.results[0].reserved[2] = 0; - entries[0].schema.results[0].size_bytes = 0; - entries[0].schema.results[0].num_elements = 0; - } -} - -} // anonymous namespace - -//============================================================================== -// Host-Callable Wrapper -//============================================================================== - -extern "C" void -setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries) { - init_function_table_kernel<<<1, 1>>>(d_entries); - cudaDeviceSynchronize(); -} From 1ae8ae38dacc3b5c61f434a364e43028860e3d03 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 6 Mar 2026 18:02:51 +0000 Subject: [PATCH 32/40] Fix predecoder test link: add host-dispatch lib and prioritize build RPATH Link cudaq-realtime-host-dispatch directly to the predecoder test to resolve a missing symbol at runtime (RUNPATH is not transitive). Reorder BUILD_RPATH so the local build directory is searched before the install prefix, ensuring the freshly built pipeline library is loaded. Signed-off-by: Scott Thornton --- libs/qec/unittests/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 4807a274..e9b7f660 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -348,13 +348,14 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) ${TENSORRT_ONNX_PARSER_LIBRARY} ${CUDAQ_REALTIME_LIBRARY} ${CUDAQ_REALTIME_DISPATCH_LIBRARY} + ${CUDAQ_REALTIME_HOST_DISPATCH_LIBRARY} cudaq-realtime-pipeline cudaq-qec cudaq::cudaq ) set_target_properties(test_realtime_predecoder_w_pymatching PROPERTIES - BUILD_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" - INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" + BUILD_RPATH "${CMAKE_BINARY_DIR}/lib;${CUDAQ_REALTIME_LIB_DIR}" + INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CUDAQ_REALTIME_LIB_DIR}" ) add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) From cbb8e1e64aaf6e3e17f12ce76fcc3dd0472ba209 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Fri, 6 Mar 2026 23:32:30 +0000 Subject: [PATCH 33/40] Adapt cudaqx to extern "C" host dispatcher API Update all cudaqx-side consumers to use the renamed C-compatible host dispatcher types (cudaq_host_dispatcher_config_t, cudaq_host_dispatch_worker_t, cudaq_host_dispatcher_loop) with opaque void* atomic fields and pointer+count worker arrays. Fix uninitialized post_launch_fn causing segfault in dispatcher tests. Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 8 +-- docs/realtime_pipeline_architecture.md | 11 ++-- libs/qec/lib/realtime/CMakeLists.txt | 12 +++- libs/qec/lib/realtime/realtime_pipeline.cu | 62 ++++++++++++-------- libs/qec/unittests/CMakeLists.txt | 12 +++- libs/qec/unittests/test_realtime_pipeline.cu | 47 ++++++++------- 6 files changed, 92 insertions(+), 60 deletions(-) diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md index e61ff957..b53376ed 100644 --- a/docs/host_side_dispatcher_design_gemini.md +++ b/docs/host_side_dispatcher_design_gemini.md @@ -65,14 +65,14 @@ All shared state must use **libcu++ system-scope atomics** allocated in mapped p ## 4. Host Dispatcher Thread (Producer) -The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. It is implemented in `realtime/lib/daemon/dispatcher/host_dispatcher.cu` as `host_dispatcher_loop()`. +The dispatcher loop is a tight spin-polling loop running on a dedicated CPU core. It is implemented in `realtime/lib/daemon/dispatcher/host_dispatcher.cu` as `cudaq_host_dispatcher_loop()`. -### 4.1 HostDispatchWorker Structure +### 4.1 cudaq_host_dispatch_worker_t Structure Each worker in the pool has the following fields: ```cpp -struct HostDispatchWorker { +typedef struct { cudaGraphExec_t graph_exec; cudaStream_t stream; uint32_t function_id; @@ -85,7 +85,7 @@ The `pre_launch_fn` callback enables the dispatcher to issue a `cudaMemcpyAsync` ### 4.2 Dispatcher Logic (Pseudocode) ```cpp -void host_dispatcher_loop(const HostDispatcherConfig& config) { +void cudaq_host_dispatcher_loop(const cudaq_host_dispatcher_config_t *config) { size_t current_slot = 0; while (config.shutdown_flag->load(acquire) == 0) { diff --git a/docs/realtime_pipeline_architecture.md b/docs/realtime_pipeline_architecture.md index 4ec03d5c..3c5073c7 100644 --- a/docs/realtime_pipeline_architecture.md +++ b/docs/realtime_pipeline_architecture.md @@ -32,13 +32,14 @@ classDiagram +clear_slot(slot) } - class HostDispatcherConfig { + class cudaq_host_dispatcher_config_t { +rx_flags : atomic_uint64~ptr~ +tx_flags : atomic_uint64~ptr~ +idle_mask : atomic_uint64~ptr~ +inflight_slot_tags : int~ptr~ +h_mailbox_bank : void~ptrptr~ - +workers : HostDispatchWorker~list~ + +workers : cudaq_host_dispatch_worker_t* + +num_workers : size_t +function_table : cudaq_function_entry_t~ptr~ +shutdown_flag : atomic_int~ptr~ } @@ -53,10 +54,10 @@ classDiagram } RealtimePipeline *-- RingBufferManager : owns - RealtimePipeline *-- HostDispatcherConfig : builds + RealtimePipeline *-- cudaq_host_dispatcher_config_t : builds RealtimePipeline --> RingBufferInjector : creates RingBufferInjector --> RingBufferManager : writes to - HostDispatcherConfig --> AIPreDecoderService : launches graph + cudaq_host_dispatcher_config_t --> AIPreDecoderService : launches graph ``` ## 2. Thread Model @@ -70,7 +71,7 @@ flowchart LR end subgraph "Dispatcher Thread (core 2)" - D["host_dispatcher_loop()"] + D["cudaq_host_dispatcher_loop()"] end subgraph "Worker Threads (cores 4..4+N)" diff --git a/libs/qec/lib/realtime/CMakeLists.txt b/libs/qec/lib/realtime/CMakeLists.txt index 1486b746..0a9449bf 100644 --- a/libs/qec/lib/realtime/CMakeLists.txt +++ b/libs/qec/lib/realtime/CMakeLists.txt @@ -25,13 +25,21 @@ if(CMAKE_CUDA_COMPILER) find_path(CUDAQ_REALTIME_INCLUDE_DIR NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h - PATHS ${_cudaq_realtime_prefixes} + HINTS ${_cudaq_realtime_prefixes} PATH_SUFFIXES include + NO_DEFAULT_PATH ) if(NOT CUDAQ_REALTIME_INCLUDE_DIR) find_path(CUDAQ_REALTIME_INCLUDE_DIR - NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + ) + endif() + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + HINTS ${_cudaq_realtime_prefixes} PATH_SUFFIXES include ../include ) endif() diff --git a/libs/qec/lib/realtime/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu index 13c20f26..942c78b8 100644 --- a/libs/qec/lib/realtime/realtime_pipeline.cu +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -28,6 +28,9 @@ namespace cudaq::realtime { +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + // --------------------------------------------------------------------------- // Internal helpers // --------------------------------------------------------------------------- @@ -311,10 +314,29 @@ struct RealtimePipeline::Impl { uint64_t initial_idle = (nw >= 64) ? ~0ULL : ((1ULL << nw) - 1); idle_mask.store(initial_idle, cuda::std::memory_order_release); - // Build HostDispatcherConfig - HostDispatcherConfig disp_cfg; - disp_cfg.rx_flags = ring->rx_flags(); - disp_cfg.tx_flags = ring->tx_flags(); + // Build cudaq_host_dispatcher_config_t + std::vector disp_workers(nw); + for (int i = 0; i < nw; ++i) { + disp_workers[i].graph_exec = worker_resources[i].graph_exec; + disp_workers[i].stream = worker_resources[i].stream; + disp_workers[i].function_id = worker_resources[i].function_id; + disp_workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; + disp_workers[i].pre_launch_data = worker_resources[i].pre_launch_data; + + if (gpu_only) { + disp_workers[i].post_launch_fn = gpu_only_post_launch; + disp_workers[i].post_launch_data = &gpu_only_ctxs[i]; + } else { + disp_workers[i].post_launch_fn = worker_resources[i].post_launch_fn; + disp_workers[i].post_launch_data = + worker_resources[i].post_launch_data; + } + } + + cudaq_host_dispatcher_config_t disp_cfg; + std::memset(&disp_cfg, 0, sizeof(disp_cfg)); + disp_cfg.rx_flags = static_cast(ring->rx_flags()); + disp_cfg.tx_flags = static_cast(ring->tx_flags()); disp_cfg.rx_data_host = ring->rx_data_host(); disp_cfg.rx_data_dev = ring->rx_data_dev(); disp_cfg.tx_data_host = nullptr; @@ -323,35 +345,23 @@ struct RealtimePipeline::Impl { disp_cfg.h_mailbox_bank = h_mailbox_bank; disp_cfg.num_slots = static_cast(config.num_slots); disp_cfg.slot_size = config.slot_size; + disp_cfg.workers = disp_workers.data(); + disp_cfg.num_workers = static_cast(nw); disp_cfg.function_table = function_table.data(); disp_cfg.function_table_count = static_cast(nw); - disp_cfg.shutdown_flag = &shutdown_flag; + disp_cfg.shutdown_flag = static_cast(&shutdown_flag); disp_cfg.stats_counter = &dispatcher_stats; - disp_cfg.live_dispatched = &live_dispatched; - disp_cfg.idle_mask = &idle_mask; + disp_cfg.live_dispatched = static_cast(&live_dispatched); + disp_cfg.idle_mask = static_cast(&idle_mask); disp_cfg.inflight_slot_tags = inflight_slot_tags.data(); - disp_cfg.workers.resize(nw); - for (int i = 0; i < nw; ++i) { - disp_cfg.workers[i].graph_exec = worker_resources[i].graph_exec; - disp_cfg.workers[i].stream = worker_resources[i].stream; - disp_cfg.workers[i].function_id = worker_resources[i].function_id; - disp_cfg.workers[i].pre_launch_fn = worker_resources[i].pre_launch_fn; - disp_cfg.workers[i].pre_launch_data = worker_resources[i].pre_launch_data; - - if (gpu_only) { - disp_cfg.workers[i].post_launch_fn = gpu_only_post_launch; - disp_cfg.workers[i].post_launch_data = &gpu_only_ctxs[i]; - } else { - disp_cfg.workers[i].post_launch_fn = worker_resources[i].post_launch_fn; - disp_cfg.workers[i].post_launch_data = - worker_resources[i].post_launch_data; - } - } - // --- Dispatcher thread --- + // Copy workers vector into the lambda so it outlives this scope. dispatcher_thread = std::thread( - [cfg = std::move(disp_cfg)]() { host_dispatcher_loop(cfg); }); + [cfg = disp_cfg, workers = std::move(disp_workers)]() mutable { + cfg.workers = workers.data(); + cudaq_host_dispatcher_loop(&cfg); + }); pin_thread(dispatcher_thread, config.cores.dispatcher); // --- Worker threads (skipped in GPU-only mode) --- diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index e9b7f660..68be7069 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -126,13 +126,21 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) # Header layout: include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h find_path(CUDAQ_REALTIME_INCLUDE_DIR NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h - PATHS ${_cudaq_realtime_prefixes} + HINTS ${_cudaq_realtime_prefixes} PATH_SUFFIXES include + NO_DEFAULT_PATH ) if(NOT CUDAQ_REALTIME_INCLUDE_DIR) find_path(CUDAQ_REALTIME_INCLUDE_DIR - NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + NAMES cudaq/realtime/daemon/dispatcher/cudaq_realtime.h PATHS ${_cudaq_realtime_prefixes} + PATH_SUFFIXES include + ) + endif() + if(NOT CUDAQ_REALTIME_INCLUDE_DIR) + find_path(CUDAQ_REALTIME_INCLUDE_DIR + NAMES cudaq/nvqlink/daemon/dispatcher/cudaq_realtime.h + HINTS ${_cudaq_realtime_prefixes} PATH_SUFFIXES include ../include ) endif() diff --git a/libs/qec/unittests/test_realtime_pipeline.cu b/libs/qec/unittests/test_realtime_pipeline.cu index 04f03be1..0d4660a0 100644 --- a/libs/qec/unittests/test_realtime_pipeline.cu +++ b/libs/qec/unittests/test_realtime_pipeline.cu @@ -37,6 +37,9 @@ namespace { using namespace cudaq::qec; namespace rt = cudaq::realtime; +using atomic_uint64_sys = cuda::std::atomic; +using atomic_int_sys = cuda::std::atomic; + static constexpr size_t kSkipTrtFloats = 1600; static constexpr size_t kSkipTrtBytes = kSkipTrtFloats * sizeof(float); static constexpr size_t kSlotSize = 8192; @@ -154,7 +157,7 @@ protected: const void *payload, size_t payload_len) { uint8_t *slot_host = rx_data_host_ + slot * kSlotSize; write_rpc_slot(slot_host, function_id, payload, payload_len); - auto *flags = reinterpret_cast(rx_flags_host_); + auto *flags = reinterpret_cast(rx_flags_host_); flags[slot].store(reinterpret_cast(slot_host), cuda::std::memory_order_release); } @@ -385,10 +388,10 @@ class HostDispatcherTest : public RealtimePipelineTest { protected: void SetUp() override { RealtimePipelineTest::SetUp(); - idle_mask_ = new rt::atomic_uint64_sys(0); - live_dispatched_ = new rt::atomic_uint64_sys(0); + idle_mask_ = new atomic_uint64_sys(0); + live_dispatched_ = new atomic_uint64_sys(0); inflight_slot_tags_ = new int[kMaxWorkers](); - shutdown_flag_ = new rt::atomic_int_sys(0); + shutdown_flag_ = new atomic_int_sys(0); stats_counter_ = 0; function_table_ = new cudaq_function_entry_t[kMaxWorkers]; std::memset(function_table_, 0, @@ -420,7 +423,7 @@ protected: ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); worker_streams_.push_back(s); - rt::HostDispatchWorker w; + cudaq_host_dispatch_worker_t w{}; w.graph_exec = exec; w.stream = s; w.function_id = function_id; @@ -439,10 +442,9 @@ protected: idle_mask_->store((1ULL << workers_.size()) - 1, cuda::std::memory_order_release); - config_.rx_flags = - reinterpret_cast(rx_flags_host_); - config_.tx_flags = - reinterpret_cast(tx_flags_host_); + std::memset(&config_, 0, sizeof(config_)); + config_.rx_flags = rx_flags_host_; + config_.tx_flags = tx_flags_host_; config_.rx_data_host = rx_data_host_; config_.rx_data_dev = rx_data_dev_; config_.tx_data_host = tx_data_host_; @@ -451,7 +453,8 @@ protected: config_.h_mailbox_bank = mailbox_bank_host_; config_.num_slots = kNumSlots; config_.slot_size = kSlotSize; - config_.workers = workers_; + config_.workers = workers_.data(); + config_.num_workers = workers_.size(); config_.function_table = function_table_; config_.function_table_count = ft_count_; config_.shutdown_flag = shutdown_flag_; @@ -460,7 +463,9 @@ protected: config_.idle_mask = idle_mask_; config_.inflight_slot_tags = inflight_slot_tags_; - loop_thread_ = std::thread(rt::host_dispatcher_loop, config_); + loop_thread_ = std::thread([this]() { + cudaq_host_dispatcher_loop(&config_); + }); } void stop_loop() { @@ -476,7 +481,7 @@ protected: } bool poll_tx_flag(size_t slot, int timeout_ms = 2000) { - auto *flags = reinterpret_cast(tx_flags_host_); + auto *flags = reinterpret_cast(tx_flags_host_); auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); while (std::chrono::steady_clock::now() < deadline) { @@ -489,22 +494,22 @@ protected: } void clear_tx_flag(size_t slot) { - auto *flags = reinterpret_cast(tx_flags_host_); + auto *flags = reinterpret_cast(tx_flags_host_); flags[slot].store(0, cuda::std::memory_order_release); } - rt::atomic_uint64_sys *idle_mask_ = nullptr; - rt::atomic_uint64_sys *live_dispatched_ = nullptr; + atomic_uint64_sys *idle_mask_ = nullptr; + atomic_uint64_sys *live_dispatched_ = nullptr; int *inflight_slot_tags_ = nullptr; - rt::atomic_int_sys *shutdown_flag_ = nullptr; + atomic_int_sys *shutdown_flag_ = nullptr; uint64_t stats_counter_ = 0; bool loop_stopped_ = false; cudaq_function_entry_t *function_table_ = nullptr; size_t ft_count_ = 0; - std::vector workers_; + std::vector workers_; std::vector worker_streams_; - rt::HostDispatcherConfig config_{}; + cudaq_host_dispatcher_config_t config_; std::thread loop_thread_; }; @@ -576,7 +581,7 @@ TEST_F(HostDispatcherTest, InvalidMagicDropped) { bad_hdr.arg_len = 4; std::memcpy(slot_host, &bad_hdr, sizeof(bad_hdr)); - auto *flags = reinterpret_cast(rx_flags_host_); + auto *flags = reinterpret_cast(rx_flags_host_); flags[0].store(reinterpret_cast(slot_host), cuda::std::memory_order_release); @@ -603,7 +608,7 @@ TEST_F(HostDispatcherTest, SlotWraparound) { for (int i = 0; i < kTotal; ++i) { size_t slot = static_cast(i % kNumSlots); - auto *rx = reinterpret_cast(rx_flags_host_); + auto *rx = reinterpret_cast(rx_flags_host_); while (rx[slot].load(cuda::std::memory_order_acquire) != 0) usleep(100); clear_tx_flag(slot); @@ -739,7 +744,7 @@ TEST_F(HostDispatcherTest, SustainedThroughput_200Requests) { int pd_idx = r % kNPd; size_t slot = static_cast(r % kNumSlots); - auto *rx = reinterpret_cast(rx_flags_host_); + auto *rx = reinterpret_cast(rx_flags_host_); auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(5); while (rx[slot].load(cuda::std::memory_order_acquire) != 0) { if (std::chrono::steady_clock::now() > deadline) From 61f13687b3765d7ca65a5f81e1be7d135a67d416 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Mon, 9 Mar 2026 18:33:02 +0000 Subject: [PATCH 34/40] Update pipeline for 24-byte RPC header and tune d13_r104 config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapt RingBufferManager and RingBufferInjector to pass request_id and ptp_timestamp through to the updated cudaq_host_ringbuffer_write_rpc_request API (now writes the full 24-byte RPCHeader). Zero-initialize RPCHeader in test_realtime_pipeline to avoid uninitialized fields. Tune d13_r104 config to 16 slots / 4 workers based on benchmarking (0 backpressure stalls, p50=169µs, p99=186µs). Signed-off-by: Scott Thornton --- libs/qec/lib/realtime/realtime_pipeline.cu | 10 +++++++--- .../realtime/test_realtime_predecoder_w_pymatching.cpp | 4 ++-- libs/qec/unittests/test_realtime_pipeline.cu | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/libs/qec/lib/realtime/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu index 942c78b8..c05c5b4f 100644 --- a/libs/qec/lib/realtime/realtime_pipeline.cu +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -153,9 +153,12 @@ public: } void write_and_signal(uint32_t slot, uint32_t function_id, - const void *payload, uint32_t payload_len) { + const void *payload, uint32_t payload_len, + uint32_t request_id = 0, + uint64_t ptp_timestamp = 0) { cudaq_host_ringbuffer_write_rpc_request(&rb_, slot, function_id, payload, - payload_len); + payload_len, request_id, + ptp_timestamp); cudaq_host_ringbuffer_signal_slot(&rb_, slot); } @@ -626,7 +629,8 @@ bool RingBufferInjector::try_submit(uint32_t function_id, const void *payload, return false; state_->ring->write_and_signal(slot, function_id, payload, - static_cast(payload_size)); + static_cast(payload_size), + static_cast(request_id)); (*state_->slot_request)[slot] = request_id; (*state_->slot_occupied)[slot] = 1; diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 72f1bd53..84a626f2 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -80,7 +80,7 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration (application-level, no atomics) // ============================================================================= -constexpr size_t NUM_SLOTS = 32; +constexpr size_t NUM_SLOTS = 16; struct PipelineConfig { std::string label; @@ -121,7 +121,7 @@ struct PipelineConfig { static PipelineConfig d13_r104() { return {"d13_r104_Z", 13, 104, 252, 2184, "predecoder_memory_d13_T104_X.onnx", - 131072, 16, 16}; + 131072, 4, 4}; } static PipelineConfig d21_r21() { diff --git a/libs/qec/unittests/test_realtime_pipeline.cu b/libs/qec/unittests/test_realtime_pipeline.cu index 0d4660a0..d4a106ba 100644 --- a/libs/qec/unittests/test_realtime_pipeline.cu +++ b/libs/qec/unittests/test_realtime_pipeline.cu @@ -96,7 +96,7 @@ static void free_mapped_buffer(uint8_t *host_ptr) { static void write_rpc_slot(uint8_t *slot_host, uint32_t function_id, const void *payload, size_t payload_len) { - rt::RPCHeader hdr; + rt::RPCHeader hdr{}; hdr.magic = rt::RPC_MAGIC_REQUEST; hdr.function_id = function_id; hdr.arg_len = static_cast(payload_len); From 8cd20a548eb4f9c7ffc073fd2653c6a67db305f1 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Tue, 24 Feb 2026 14:40:47 -0800 Subject: [PATCH 35/40] Update CMake for TensorRT decoder unit test (#448) --- libs/qec/unittests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 68be7069..49317716 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -56,6 +56,7 @@ if(CUDAQ_QEC_BUILD_TRT_DECODER AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD # Find TensorRT for the test find_path(TENSORRT_INCLUDE_DIR NvInfer.h PATHS + ${TENSORRT_ROOT}/include /usr/include/x86_64-linux-gnu /usr/local/cuda/include /usr/local/tensorrt/include @@ -65,6 +66,7 @@ if(CUDAQ_QEC_BUILD_TRT_DECODER AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD target_include_directories(test_trt_decoder PRIVATE ${CUDAToolkit_INCLUDE_DIRS} + ${TENSORRT_INCLUDE_DIR} ) target_link_libraries(test_trt_decoder PRIVATE GTest::gtest_main cudaq-qec cudaq-qec-trt-decoder cudaq::cudaq) From 96f5c3337e190cf8a83a3244de54fe23d71d33fc Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Wed, 11 Mar 2026 04:25:19 +0000 Subject: [PATCH 36/40] Fix uint8 model I/O and enable correctness verification with Stim data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TRT engine for the predecoder model uses uint8 I/O, but trt_dtype_size() was missing the kUINT8 case, falling through to the default of 4 bytes. This caused 4x oversized buffer allocations, garbled model input (int32 values read as uint8 by TRT), and misinterpreted output — producing coin-flip LER (~0.50). Three fixes bring the pipeline to verified-correct LER of 0.002: 1. Add nvinfer1::DataType::kUINT8 to trt_dtype_size() (returns 1). Corrects buffer sizes, input copy, and output interpretation. 2. Replace the CUDA-Q surface_code H_z parity matrix with the Stim-derived full spacetime check matrix (H) and observables matrix (O), loaded from binary files in --data-dir. This gives PyMatching the correct matching graph and lets it project edge corrections onto the logical observable. 3. Add --data-dir support for loading pre-generated Stim detector samples and ground-truth observables. The producer feeds real uint8 detector data through the pipeline, and a post-run correctness report compares decode results against ground truth. Additional changes: - Derive slot_size, residual_detectors, and spatial_slices from the TRT model bindings at runtime instead of hardcoding in PipelineConfig - Read request_id from RPCHeader before overwriting with RPCResponse - Track per-request decode_corrections and logical_pred via request_id - Pre-allocate syndrome tensors with thread_local to avoid per-decode heap allocation - Bump d13_r104 config to 8 workers / 32 slots for full-H decode latency headroom - Guard QEC_CPU_RELAX macro against redefinition from host_dispatcher.h - Print TRT binding dtype and element size in setup diagnostics Signed-off-by: Scott Thornton --- .../qec/realtime/ai_predecoder_service.h | 6 +- libs/qec/lib/realtime/ai_decoder_service.cu | 9 +- .../test_realtime_predecoder_w_pymatching.cpp | 509 +++++++++++++++--- 3 files changed, 435 insertions(+), 89 deletions(-) diff --git a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h index 10217a56..db5638dd 100644 --- a/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h +++ b/libs/qec/include/cudaq/qec/realtime/ai_predecoder_service.h @@ -12,15 +12,17 @@ #include #include -// Portable CPU Yield Macro for busy-polling +// Portable CPU Yield Macro for busy-polling (skip if already defined by realtime API) +#ifndef QEC_CPU_RELAX #if defined(__x86_64__) #include #define QEC_CPU_RELAX() _mm_pause() #elif defined(__aarch64__) -#define QEC_CPU_RELAX() asm volatile("yield" ::: "memory") +#define QEC_CPU_RELAX() __asm__ volatile("yield" ::: "memory") #else #define QEC_CPU_RELAX() std::atomic_thread_fence(std::memory_order_seq_cst) #endif +#endif namespace cudaq::qec { diff --git a/libs/qec/lib/realtime/ai_decoder_service.cu b/libs/qec/lib/realtime/ai_decoder_service.cu index 90f18c24..4694a477 100644 --- a/libs/qec/lib/realtime/ai_decoder_service.cu +++ b/libs/qec/lib/realtime/ai_decoder_service.cu @@ -87,6 +87,8 @@ static size_t trt_dtype_size(nvinfer1::DataType dtype) { return 2; case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: + return 1; case nvinfer1::DataType::kINT32: return 4; case nvinfer1::DataType::kINT64: @@ -271,8 +273,11 @@ void AIDecoderService::setup_bindings() { bool is_input = (mode == nvinfer1::TensorIOMode::kINPUT); - std::printf("[TensorRT] Binding %d: \"%s\" %s, %zu bytes\n", i, name, - is_input ? "INPUT" : "OUTPUT", size_bytes); + std::printf("[TensorRT] Binding %d: \"%s\" %s, dtype=%d, elem_size=%zu, " + "volume=%zu, %zu bytes\n", + i, name, is_input ? "INPUT" : "OUTPUT", + static_cast(dtype), trt_dtype_size(dtype), + tensor_volume(dims), size_bytes); TensorBinding binding{name, nullptr, size_bytes, is_input}; diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 84a626f2..c736aa10 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -80,22 +80,16 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration (application-level, no atomics) // ============================================================================= -constexpr size_t NUM_SLOTS = 16; +constexpr size_t NUM_SLOTS = 32; struct PipelineConfig { std::string label; int distance; int num_rounds; - int meas_qubits; - int residual_detectors; std::string onnx_filename; - size_t slot_size; int num_predecoders; int num_workers; - int input_elements() const { return meas_qubits * num_rounds; } - size_t input_bytes() const { return input_elements() * sizeof(int32_t); } - std::string onnx_path() const { return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; } @@ -109,34 +103,39 @@ struct PipelineConfig { } static PipelineConfig d7_r7() { - return {"d7_r7_Z", 7, 7, 72, 336, "model1_d7_r7_unified_Z_batch1.onnx", - 4096, 16, 16}; + return {"d7_r7_Z", 7, 7, "model1_d7_r7_unified_Z_batch1.onnx", 16, 16}; } static PipelineConfig d13_r13() { - return {"d13_r13_Z", 13, 13, 252, 2184, "predecoder_memory_d13_T13_X.onnx", - 16384, 16, 16}; + return {"d13_r13_Z", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16}; } static PipelineConfig d13_r104() { - return {"d13_r104_Z", 13, 104, - 252, 2184, "predecoder_memory_d13_T104_X.onnx", - 131072, 4, 4}; + return {"d13_r104_Z", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8}; } static PipelineConfig d21_r21() { - return {"d21_r21_Z", 21, 21, - 660, 9240, "model1_d21_r21_unified_X_batch1.onnx", - 65536, 16, 16}; + return {"d21_r21_Z", 21, 21, "model1_d21_r21_unified_X_batch1.onnx", 16, + 16}; } static PipelineConfig d31_r31() { - return {"d31_r31_Z", 31, 31, - 1440, 29760, "model1_d31_r31_unified_Z_batch1.onnx", - 262144, 16, 16}; + return {"d31_r31_Z", 31, 31, "model1_d31_r31_unified_Z_batch1.onnx", 16, + 16}; } }; +static size_t round_up_pow2(size_t v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + return v + 1; +} + // ============================================================================= // Decoder Context (application-level) // ============================================================================= @@ -146,6 +145,8 @@ struct DecoderContext { std::atomic next_decoder_idx{0}; int z_stabilizers = 0; int spatial_slices = 0; + int num_residual_detectors = 0; + bool use_full_H = false; cudaq::qec::decoder *acquire_decoder() { thread_local int my_idx = @@ -184,6 +185,11 @@ static void pre_launch_input_copy(void *user_data, void *slot_dev, struct WorkerCtx { AIPreDecoderService *predecoder; DecoderContext *decoder_ctx; + int32_t *decode_corrections = nullptr; + int32_t *decode_logical_pred = nullptr; + int max_requests = 0; + const uint8_t *obs_row = nullptr; + size_t obs_row_size = 0; }; struct __attribute__((packed)) DecodeResponse { @@ -192,15 +198,149 @@ struct __attribute__((packed)) DecodeResponse { }; // ============================================================================= -// Data generation +// Test data (pre-generated from Stim, or random) +// ============================================================================= + +struct TestData { + std::vector detectors; // (num_samples × num_detectors) row-major + std::vector observables; // (num_samples × num_observables) row-major + uint32_t num_samples = 0; + uint32_t num_detectors = 0; + uint32_t num_observables = 0; + + bool loaded() const { return num_samples > 0 && num_detectors > 0; } + + const int32_t *sample(int idx) const { + return detectors.data() + + (static_cast(idx % num_samples) * num_detectors); + } + + int32_t observable(int idx, int obs = 0) const { + return observables[static_cast(idx % num_samples) * + num_observables + + obs]; + } +}; + +static bool load_binary_file(const std::string &path, uint32_t &out_rows, + uint32_t &out_cols, std::vector &data) { + std::ifstream f(path, std::ios::binary); + if (!f.good()) + return false; + f.read(reinterpret_cast(&out_rows), sizeof(uint32_t)); + f.read(reinterpret_cast(&out_cols), sizeof(uint32_t)); + size_t count = static_cast(out_rows) * out_cols; + data.resize(count); + f.read(reinterpret_cast(data.data()), count * sizeof(int32_t)); + return f.good(); +} + +static TestData load_test_data(const std::string &data_dir) { + TestData td; + std::string det_path = data_dir + "/detectors.bin"; + std::string obs_path = data_dir + "/observables.bin"; + + if (!load_binary_file(det_path, td.num_samples, td.num_detectors, + td.detectors)) { + std::cerr << "ERROR: Failed to load " << det_path << "\n"; + return td; + } + uint32_t obs_samples = 0; + if (!load_binary_file(obs_path, obs_samples, td.num_observables, + td.observables)) { + std::cerr << "ERROR: Failed to load " << obs_path << "\n"; + td.num_samples = 0; + return td; + } + if (obs_samples != td.num_samples) { + std::cerr << "ERROR: sample count mismatch: detectors=" << td.num_samples + << " observables=" << obs_samples << "\n"; + td.num_samples = 0; + return td; + } + std::cout << "[Data] Loaded " << td.num_samples << " samples, " + << td.num_detectors << " detectors, " << td.num_observables + << " observables from " << data_dir << "\n"; + return td; +} + +// ============================================================================= +// Stim-derived parity check matrix loader (CSR sparse → dense tensor) // ============================================================================= -void fill_measurement_payload(int32_t *payload, int input_elements, - std::mt19937 &rng, double error_rate = 0.01) { - std::bernoulli_distribution err_dist(error_rate); - for (int i = 0; i < input_elements; ++i) { - payload[i] = err_dist(rng) ? 1 : 0; +struct SparseCSR { + uint32_t nrows = 0, ncols = 0, nnz = 0; + std::vector indptr; + std::vector indices; + + bool loaded() const { return nrows > 0 && ncols > 0; } + + cudaqx::tensor to_dense() const { + cudaqx::tensor T; + std::vector data(static_cast(nrows) * ncols, 0); + for (uint32_t r = 0; r < nrows; ++r) + for (int32_t j = indptr[r]; j < indptr[r + 1]; ++j) + data[static_cast(r) * ncols + indices[j]] = 1; + T.copy(data.data(), + {static_cast(nrows), static_cast(ncols)}); + return T; + } + + std::vector row_dense(uint32_t r) const { + std::vector row(ncols, 0); + for (int32_t j = indptr[r]; j < indptr[r + 1]; ++j) + row[indices[j]] = 1; + return row; + } +}; + +struct StimData { + SparseCSR H; + SparseCSR O; + std::vector priors; +}; + +static bool load_csr(const std::string &path, SparseCSR &out) { + std::ifstream f(path, std::ios::binary); + if (!f.good()) + return false; + f.read(reinterpret_cast(&out.nrows), sizeof(uint32_t)); + f.read(reinterpret_cast(&out.ncols), sizeof(uint32_t)); + f.read(reinterpret_cast(&out.nnz), sizeof(uint32_t)); + out.indptr.resize(out.nrows + 1); + out.indices.resize(out.nnz); + f.read(reinterpret_cast(out.indptr.data()), + (out.nrows + 1) * sizeof(int32_t)); + f.read(reinterpret_cast(out.indices.data()), + out.nnz * sizeof(int32_t)); + return f.good(); +} + +static StimData load_stim_data(const std::string &data_dir) { + StimData sd; + + if (!load_csr(data_dir + "/H_csr.bin", sd.H)) { + std::cerr << "[Data] No H_csr.bin found in " << data_dir << "\n"; + return sd; + } + std::cout << "[Data] Loaded H_csr " << sd.H.nrows << "x" << sd.H.ncols + << " (" << sd.H.nnz << " nnz)\n"; + + if (load_csr(data_dir + "/O_csr.bin", sd.O)) + std::cout << "[Data] Loaded O_csr " << sd.O.nrows << "x" << sd.O.ncols + << " (" << sd.O.nnz << " nnz)\n"; + + std::string priors_path = data_dir + "/priors.bin"; + std::ifstream pf(priors_path, std::ios::binary); + if (pf.good()) { + uint32_t nedges = 0; + pf.read(reinterpret_cast(&nedges), sizeof(uint32_t)); + sd.priors.resize(nedges); + pf.read(reinterpret_cast(sd.priors.data()), + nedges * sizeof(double)); + std::cout << "[Data] Loaded " << sd.priors.size() << " priors\n"; } + return sd; } // ============================================================================= @@ -211,6 +351,7 @@ struct StreamingConfig { int rate_us = 0; int duration_s = 5; int warmup_count = 20; + std::string data_dir; }; // ============================================================================= @@ -224,7 +365,15 @@ int main(int argc, char *argv[]) { std::string config_name = "d7"; StreamingConfig scfg; - if (argc > 1) + // Scan for --data-dir first (can appear anywhere) + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == "--data-dir" && i + 1 < argc) { + scfg.data_dir = argv[i + 1]; + break; + } + } + // Positional: config_name [rate_us] [duration_s] + if (argc > 1 && std::string(argv[1]).substr(0, 2) != "--") config_name = argv[1]; if (argc > 2 && std::isdigit(argv[2][0])) scfg.rate_us = std::stoi(argv[2]); @@ -257,12 +406,6 @@ int main(int argc, char *argv[]) { std::cout << "--- Initializing Hybrid AI Realtime Pipeline (" << config.label << ") ---\n"; - std::cout << "[Config] distance=" << config.distance - << " rounds=" << config.num_rounds - << " meas_qubits=" << config.meas_qubits - << " residual_detectors=" << config.residual_detectors - << " input_bytes=" << config.input_bytes() - << " slot_size=" << config.slot_size << "\n"; CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost)); @@ -282,31 +425,6 @@ int main(int argc, char *argv[]) { << "\n"; } - // --- Create PyMatching decoders --- - std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance - << " surface code, Z stabilizers)...\n"; - auto surface_code = - cudaq::qec::get_code("surface_code", {{"distance", config.distance}}); - auto H_z = surface_code->get_parity_z(); - - DecoderContext decoder_ctx; - decoder_ctx.z_stabilizers = static_cast(H_z.shape()[0]); - decoder_ctx.spatial_slices = - config.residual_detectors / decoder_ctx.z_stabilizers; - std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " - << H_z.shape()[1] << "]" - << " z_stabilizers=" << decoder_ctx.z_stabilizers - << " spatial_slices=" << decoder_ctx.spatial_slices << "\n"; - - cudaqx::heterogeneous_map pm_params; - pm_params.insert("merge_strategy", std::string("smallest_weight")); - std::cout << "[Setup] Pre-allocating " << config.num_workers - << " PyMatching decoders...\n"; - for (int i = 0; i < config.num_workers; ++i) - decoder_ctx.decoders.push_back( - cudaq::qec::decoder::get("pymatching", H_z, pm_params)); - std::cout << "[Setup] PyMatching decoder pool ready.\n"; - // --- Create GPU resources (predecoders, streams, mailbox) --- void **h_mailbox_bank = nullptr; void **d_mailbox_bank = nullptr; @@ -335,13 +453,120 @@ int main(int argc, char *argv[]) { std::string save_path = (need_save && i == 0) ? engine_file : ""; auto pd = std::make_unique( model_path, d_mailbox_bank + i, 1, save_path); - std::cout << "[Setup] Decoder " << i + std::cout << "[Setup] Predecoder " << i << ": input_size=" << pd->get_input_size() << " output_size=" << pd->get_output_size() << "\n"; pd->capture_graph(capture_stream, false); predecoders.push_back(std::move(pd)); } + // --- Derive dimensions from TRT model bindings --- + const size_t model_input_bytes = predecoders[0]->get_input_size(); + const size_t model_output_bytes = predecoders[0]->get_output_size(); + const size_t slot_size = + round_up_pow2(CUDAQ_RPC_HEADER_SIZE + model_input_bytes); + + // Model I/O element count: for uint8 models, 1 byte per element; + // for int32, 4 bytes per element. Detect by comparing against expected + // detector count from the ONNX model shape. + const size_t model_input_elements = model_input_bytes; + const size_t model_output_elements_total = model_output_bytes; + // If model_input_bytes equals num_detectors (uint8), elem_size is 1. + // If model_input_bytes equals num_detectors*4 (int32), elem_size is 4. + // We detect this by checking if model_output_bytes == model_input_bytes + 1 + // (uint8: one extra L element) vs model_input_bytes + 4 (int32). + const size_t model_elem_size = + (model_output_bytes == model_input_bytes + 1) ? 1 : sizeof(int32_t); + const size_t num_input_detectors = model_input_bytes / model_elem_size; + const size_t num_output_elements = model_output_bytes / model_elem_size; + + std::cout << "[Setup] Model I/O element size: " << model_elem_size + << " bytes (" << (model_elem_size == 1 ? "uint8" : "int32") << ")\n"; + std::cout << "[Setup] Input detectors: " << num_input_detectors + << ", Output elements: " << num_output_elements << "\n"; + + const int residual_detectors = static_cast(num_output_elements) - 1; + + std::cout << "[Config] distance=" << config.distance + << " rounds=" << config.num_rounds + << " residual_detectors=" << residual_detectors + << " model_input=" << model_input_bytes + << " model_output=" << model_output_bytes + << " slot_size=" << slot_size << "\n"; + + // --- Load test data (optional) --- + TestData test_data; + StimData stim; + if (!scfg.data_dir.empty()) { + test_data = load_test_data(scfg.data_dir); + if (!test_data.loaded()) { + std::cerr << "ERROR: Failed to load test data from " << scfg.data_dir + << "\n"; + return 1; + } + if (test_data.num_detectors != num_input_detectors) { + std::cerr << "ERROR: detector count mismatch: data has " + << test_data.num_detectors << " but model expects " + << num_input_detectors << "\n"; + return 1; + } + stim = load_stim_data(scfg.data_dir); + } + + // --- Build PyMatching decoder --- + DecoderContext decoder_ctx; + decoder_ctx.num_residual_detectors = residual_detectors; + cudaqx::heterogeneous_map pm_params; + pm_params.insert("merge_strategy", std::string("smallest_weight")); + + // Observable row from O matrix (for projecting edge corrections → logical) + std::vector obs_row; + + if (stim.H.loaded() && + static_cast(stim.H.nrows) == residual_detectors) { + decoder_ctx.use_full_H = true; + std::cout << "[Setup] Converting sparse H (" << stim.H.nrows << "x" + << stim.H.ncols << ") to dense tensor...\n"; + auto H_full = stim.H.to_dense(); + std::cout << "[Setup] H tensor: [" << H_full.shape()[0] << " x " + << H_full.shape()[1] << "]\n"; + + if (!stim.priors.empty() && stim.priors.size() == stim.H.ncols) + pm_params.insert("error_rate_vec", stim.priors); + + if (stim.O.loaded()) + obs_row = stim.O.row_dense(0); + + std::cout << "[Setup] Creating " << config.num_workers + << " PyMatching decoders (full H)...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_full, pm_params)); + } else { + // Fallback: per-slice decode with CUDA-Q surface code H_z + std::cout << "[Setup] Creating PyMatching decoder (d=" << config.distance + << " surface code, Z stabilizers)...\n"; + auto surface_code = + cudaq::qec::get_code("surface_code", {{"distance", config.distance}}); + auto H_z = surface_code->get_parity_z(); + + const int z_stabilizers = static_cast(H_z.shape()[0]); + if (residual_detectors > 0 && residual_detectors % z_stabilizers == 0) + decoder_ctx.spatial_slices = residual_detectors / z_stabilizers; + decoder_ctx.z_stabilizers = z_stabilizers; + + std::cout << "[Setup] H_z shape: [" << H_z.shape()[0] << " x " + << H_z.shape()[1] << "], spatial_slices=" + << decoder_ctx.spatial_slices << "\n"; + + std::cout << "[Setup] Creating " << config.num_workers + << " PyMatching decoders (per-slice)...\n"; + for (int i = 0; i < config.num_workers; ++i) + decoder_ctx.decoders.push_back( + cudaq::qec::decoder::get("pymatching", H_z, pm_params)); + } + std::cout << "[Setup] PyMatching decoder pool ready.\n"; + // Pre-launch DMA contexts std::vector pre_launch_ctxs(config.num_predecoders); for (int i = 0; i < config.num_predecoders; ++i) { @@ -378,7 +603,7 @@ int main(int argc, char *argv[]) { realtime_ns::PipelineStageConfig stage_cfg; stage_cfg.num_workers = config.num_workers; stage_cfg.num_slots = NUM_SLOTS; - stage_cfg.slot_size = config.slot_size; + stage_cfg.slot_size = slot_size; stage_cfg.cores = {.dispatcher = 2, .consumer = 4, .worker_base = 10}; realtime_ns::RealtimePipeline pipeline(stage_cfg); @@ -410,29 +635,56 @@ int main(int argc, char *argv[]) { int total_corrections = 0; bool all_converged = true; + const uint8_t *output_u8 = + static_cast(job.inference_data); + const int32_t logical_pred = output_u8[0]; auto decode_start = hrclock::now(); #if !defined(DISABLE_PYMATCHING) - const int32_t *residual = static_cast(job.inference_data); + const uint8_t *residual_u8 = output_u8 + 1; auto *my_decoder = dctx->acquire_decoder(); - cudaqx::tensor syndrome_tensor({(size_t)dctx->z_stabilizers}); - uint8_t *syn_data = syndrome_tensor.data(); - - for (int s = 0; s < dctx->spatial_slices; ++s) { - const int32_t *slice = residual + s * dctx->z_stabilizers; - for (int i = 0; i < dctx->z_stabilizers; ++i) - syn_data[i] = static_cast(slice[i]); - + if (dctx->use_full_H) { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)dctx->num_residual_detectors}); + std::memcpy(syndrome_tensor.data(), residual_u8, + dctx->num_residual_detectors); auto result = my_decoder->decode(syndrome_tensor); - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) - total_corrections++; + all_converged = result.converged; + if (wctx->obs_row && wctx->obs_row_size == result.result.size()) { + int obs_parity = 0; + for (size_t e = 0; e < result.result.size(); ++e) + if (result.result[e] > 0.5 && wctx->obs_row[e]) + obs_parity ^= 1; + total_corrections += obs_parity; + } else { + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } + } else { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)dctx->z_stabilizers}); + uint8_t *syn_data = syndrome_tensor.data(); + for (int s = 0; s < dctx->spatial_slices; ++s) { + const uint8_t *slice = residual_u8 + s * dctx->z_stabilizers; + std::memcpy(syn_data, slice, dctx->z_stabilizers); + auto result = my_decoder->decode(syndrome_tensor); + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } } + total_corrections += logical_pred; #endif auto decode_end = hrclock::now(); + // Capture request_id before we overwrite the slot with the response + auto *rpc_hdr = + static_cast(job.ring_buffer_ptr); + uint32_t rid = rpc_hdr->request_id; + // Write RPC response into ring buffer slot DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; char *response_payload = @@ -457,6 +709,11 @@ int main(int argc, char *argv[]) { dctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); dctx->decode_count.fetch_add(1, std::memory_order_relaxed); + if (wctx->decode_corrections && rid < (uint32_t)wctx->max_requests) { + wctx->decode_corrections[rid] = total_corrections; + wctx->decode_logical_pred[rid] = logical_pred; + } + return 1; }); @@ -465,6 +722,8 @@ int main(int argc, char *argv[]) { std::vector submit_ts(max_requests); std::vector complete_ts(max_requests); std::vector completed(max_requests, 0); + std::vector decode_corrections(max_requests, -1); + std::vector decode_logical_pred(max_requests, -1); pipeline.set_completion_handler([&](const realtime_ns::Completion &c) { if (c.request_id < static_cast(max_requests)) { @@ -477,6 +736,16 @@ int main(int argc, char *argv[]) { // Start pipeline and run producer // ========================================================================= + for (int i = 0; i < config.num_workers; ++i) { + worker_ctxs[i].decode_corrections = decode_corrections.data(); + worker_ctxs[i].decode_logical_pred = decode_logical_pred.data(); + worker_ctxs[i].max_requests = max_requests; + if (!obs_row.empty()) { + worker_ctxs[i].obs_row = obs_row.data(); + worker_ctxs[i].obs_row_size = obs_row.size(); + } + } + std::cout << "[Setup] Starting pipeline...\n"; auto injector = pipeline.create_injector(); pipeline.start(); @@ -499,19 +768,32 @@ int main(int argc, char *argv[]) { // --- Producer loop (runs on main thread) --- std::mt19937 rng(42); const size_t payload_bytes = - std::min(config.input_bytes(), - config.slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); + std::min(model_input_bytes, + slot_size - static_cast(CUDAQ_RPC_HEADER_SIZE)); std::vector payload_buf(CUDAQ_RPC_HEADER_SIZE + payload_bytes); int req_id = 0; int target = 0; + auto next_submit_time = hrclock::now(); + while (std::chrono::steady_clock::now() < run_deadline && req_id < max_requests) { - int32_t *payload = - reinterpret_cast(payload_buf.data() + CUDAQ_RPC_HEADER_SIZE); - int fill_elems = static_cast(payload_bytes / sizeof(int32_t)); - fill_measurement_payload(payload, fill_elems, rng, 0.01); + if (scfg.rate_us > 0) { + while (hrclock::now() < next_submit_time) + QEC_CPU_RELAX(); + } + + uint8_t *payload = payload_buf.data() + CUDAQ_RPC_HEADER_SIZE; + if (test_data.loaded()) { + const int32_t *src = test_data.sample(req_id); + for (size_t d = 0; d < num_input_detectors; ++d) + payload[d] = static_cast(src[d]); + } else { + std::bernoulli_distribution err_dist(0.01); + for (size_t d = 0; d < num_input_detectors; ++d) + payload[d] = err_dist(rng) ? 1 : 0; + } std::string func = "predecode_target_" + std::to_string(target); uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); @@ -523,12 +805,8 @@ int main(int argc, char *argv[]) { target = (target + 1) % config.num_predecoders; req_id++; - if (scfg.rate_us > 0) { - auto target_time = - submit_ts[req_id - 1] + std::chrono::microseconds(scfg.rate_us); - while (hrclock::now() < target_time) - QEC_CPU_RELAX(); - } + if (scfg.rate_us > 0) + next_submit_time += std::chrono::microseconds(scfg.rate_us); } // --- Shutdown --- @@ -650,6 +928,67 @@ int main(int argc, char *argv[]) { std::cout << "================================================================\n"; + // --- Correctness verification (when using real data) --- + if (test_data.loaded()) { + int verified = 0, mismatches = 0, missing = 0; + int pred_only_mismatches = 0; + int64_t sum_total_corr = 0, sum_logical_pred = 0; + int nonzero_logical = 0, nonzero_pymatch = 0; + for (int i = 0; i < nsub; ++i) { + if (decode_corrections[i] < 0) { + missing++; + continue; + } + int32_t total_corr = decode_corrections[i]; + int32_t lpred = decode_logical_pred[i]; + int32_t pymatch_corr = total_corr - lpred; + int32_t pipeline_parity = total_corr % 2; + int32_t ground_truth = test_data.observable(i, 0); + + if (pipeline_parity != ground_truth) + mismatches++; + if ((lpred % 2) != ground_truth) + pred_only_mismatches++; + + sum_total_corr += total_corr; + sum_logical_pred += lpred; + if (lpred != 0) + nonzero_logical++; + if (pymatch_corr != 0) + nonzero_pymatch++; + verified++; + } + double ler = + (verified > 0) ? static_cast(mismatches) / verified : 0; + double pred_ler = + (verified > 0) ? static_cast(pred_only_mismatches) / verified + : 0; + std::cout << "\n[Correctness] Verified " << verified << "/" << nsub + << " requests (" << missing << " missing)\n"; + std::cout << "[Correctness] Pipeline (pred+pymatch) mismatches: " + << mismatches << " LER: " << std::setprecision(4) << ler + << "\n"; + std::cout << "[Correctness] Predecoder-only mismatches: " + << pred_only_mismatches + << " LER: " << std::setprecision(4) << pred_ler << "\n"; + std::cout << "[Correctness] Avg logical_pred: " << std::setprecision(3) + << (verified > 0 ? (double)sum_logical_pred / verified : 0) + << " nonzero: " << nonzero_logical << "/" << verified << "\n"; + std::cout << "[Correctness] Avg pymatch_corr: " << std::setprecision(3) + << (verified > 0 + ? (double)(sum_total_corr - sum_logical_pred) / verified + : 0) + << " nonzero: " << nonzero_pymatch << "/" << verified << "\n"; + std::cout << "[Correctness] Ground truth ones: "; + int gt_ones = 0; + int gt_count = static_cast( + std::min(nsub, static_cast(test_data.num_samples))); + for (int i = 0; i < gt_count; ++i) + if (test_data.observable(i, 0)) + gt_ones++; + std::cout << gt_ones << "/" << gt_count << "\n"; + } + // --- Cleanup --- std::cout << "[Teardown] Shutting down...\n"; CUDA_CHECK(cudaStreamSynchronize(capture_stream)); From 9eab912fb062b4012bb14a787ba3089361e6e552 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Thu, 12 Mar 2026 05:54:41 +0000 Subject: [PATCH 37/40] Add syndrome density diagnostic and fix X-basis config labels Track input vs output syndrome density in the predecoder benchmark to verify the neural network is reducing detector noise (98.3% reduction observed at d13_r104). Also correct config labels from _Z to _X to match the actual measurement basis of the ONNX models. Signed-off-by: Scott Thornton --- .../test_realtime_predecoder_w_pymatching.cpp | 42 ++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index c736aa10..4162fdd6 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -107,11 +107,11 @@ struct PipelineConfig { } static PipelineConfig d13_r13() { - return {"d13_r13_Z", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16}; + return {"d13_r13_X", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16}; } static PipelineConfig d13_r104() { - return {"d13_r104_Z", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8}; + return {"d13_r104_X", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8}; } static PipelineConfig d21_r21() { @@ -157,6 +157,10 @@ struct DecoderContext { std::atomic total_decode_us{0}; std::atomic total_worker_us{0}; std::atomic decode_count{0}; + + int num_input_detectors = 0; + std::atomic total_input_nonzero{0}; + std::atomic total_output_nonzero{0}; }; // ============================================================================= @@ -516,6 +520,7 @@ int main(int argc, char *argv[]) { // --- Build PyMatching decoder --- DecoderContext decoder_ctx; decoder_ctx.num_residual_detectors = residual_detectors; + decoder_ctx.num_input_detectors = static_cast(num_input_detectors); cudaqx::heterogeneous_map pm_params; pm_params.insert("merge_strategy", std::string("smallest_weight")); @@ -639,6 +644,18 @@ int main(int argc, char *argv[]) { static_cast(job.inference_data); const int32_t logical_pred = output_u8[0]; + // Syndrome density: count nonzero in input and output residuals + const uint8_t *input_u8 = + static_cast(job.ring_buffer_ptr) + CUDAQ_RPC_HEADER_SIZE; + int input_nz = 0; + for (int k = 0; k < dctx->num_input_detectors; ++k) + input_nz += (input_u8[k] != 0); + int output_nz = 0; + for (int k = 0; k < dctx->num_residual_detectors; ++k) + output_nz += (output_u8[1 + k] != 0); + dctx->total_input_nonzero.fetch_add(input_nz, std::memory_order_relaxed); + dctx->total_output_nonzero.fetch_add(output_nz, std::memory_order_relaxed); + auto decode_start = hrclock::now(); #if !defined(DISABLE_PYMATCHING) const uint8_t *residual_u8 = output_u8 + 1; @@ -920,6 +937,27 @@ int main(int argc, char *argv[]) { std::cout << " Worker overhead: " << std::setw(9) << avg_overhead << " us\n"; } + if (n_decoded > 0) { + double avg_in_nz = + (double)decoder_ctx.total_input_nonzero.load() / n_decoded; + double avg_out_nz = + (double)decoder_ctx.total_output_nonzero.load() / n_decoded; + double in_density = avg_in_nz / decoder_ctx.num_input_detectors; + double out_density = avg_out_nz / decoder_ctx.num_residual_detectors; + double reduction = (in_density > 0) ? (1.0 - out_density / in_density) : 0; + std::cout + << " " + "---------------------------------------------------------------\n"; + std::cout << " Syndrome density (" << n_decoded << " samples):\n"; + std::cout << " Input: " << std::fixed << std::setprecision(1) + << avg_in_nz << " / " << decoder_ctx.num_input_detectors + << " (" << std::setprecision(4) << in_density << ")\n"; + std::cout << " Output: " << std::fixed << std::setprecision(1) + << avg_out_nz << " / " << decoder_ctx.num_residual_detectors + << " (" << std::setprecision(4) << out_density << ")\n"; + std::cout << " Reduction: " << std::setprecision(1) + << (reduction * 100.0) << "%\n"; + } std::cout << " ---------------------------------------------------------------\n"; From d91d6bbc36e24c3d0437267c72f3a09071d082ee Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Mon, 16 Mar 2026 19:00:51 +0000 Subject: [PATCH 38/40] Add NVTX profiling instrumentation to realtime pipeline Gate all annotations behind ENABLE_NVTX cmake option (header-only, zero overhead when disabled). Instruments 9 pipeline stages: Submit, PreLaunchCopy, GPUPostLaunch, PollJob, ReleaseJob, WorkerPoll, CpuStageTotal, PyMatchDecode, ProducerSubmit, ConsumerComplete. Also reduces NUM_SLOTS from 32 to 12 to match the optimal config identified in benchmarking. Signed-off-by: Scott Thornton --- libs/qec/lib/realtime/CMakeLists.txt | 6 ++++++ libs/qec/lib/realtime/ai_predecoder_service.cu | 5 +++++ libs/qec/lib/realtime/realtime_pipeline.cu | 9 +++++++++ .../test_realtime_predecoder_w_pymatching.cpp | 11 ++++++++++- libs/qec/unittests/CMakeLists.txt | 9 +++++++++ 5 files changed, 39 insertions(+), 1 deletion(-) diff --git a/libs/qec/lib/realtime/CMakeLists.txt b/libs/qec/lib/realtime/CMakeLists.txt index 1b9fbfb8..3d25e3dd 100644 --- a/libs/qec/lib/realtime/CMakeLists.txt +++ b/libs/qec/lib/realtime/CMakeLists.txt @@ -162,6 +162,12 @@ if(CMAKE_CUDA_COMPILER AND CUDAQ_REALTIME_INCLUDE_DIR) PRIVATE ${_CUDAQ_RT_LIB} ${_CUDAQ_RT_HD_LIB} ) + option(ENABLE_NVTX "Enable NVTX profiling ranges" OFF) + if(ENABLE_NVTX) + target_compile_definitions(cudaq-realtime-pipeline PRIVATE ENABLE_NVTX) + message(STATUS "NVTX profiling enabled for cudaq-realtime-pipeline") + endif() + get_filename_component(_CUDAQ_RT_LIB_DIR "${_CUDAQ_RT_LIB}" DIRECTORY) set_target_properties(cudaq-realtime-pipeline PROPERTIES CUDA_SEPARABLE_COMPILATION ON diff --git a/libs/qec/lib/realtime/ai_predecoder_service.cu b/libs/qec/lib/realtime/ai_predecoder_service.cu index b9564a3b..c1cf3b8b 100644 --- a/libs/qec/lib/realtime/ai_predecoder_service.cu +++ b/libs/qec/lib/realtime/ai_predecoder_service.cu @@ -7,6 +7,7 @@ ******************************************************************************/ #include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/realtime/nvtx_helpers.h" #include #include #include @@ -164,18 +165,22 @@ bool AIPreDecoderService::poll_next_job(PreDecoderJob &out_job) { if (sys_flags[0].compare_exchange_strong(expected, 2, cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed)) { + NVTX_PUSH("PollJob"); out_job.slot_idx = 0; out_job.ring_buffer_ptr = h_ring_ptrs_[0]; out_job.inference_data = h_predecoder_outputs_; + NVTX_POP(); return true; } return false; } void AIPreDecoderService::release_job(int /* slot_idx */) { + NVTX_PUSH("ReleaseJob"); auto *sys_flags = static_cast(h_ready_flags_); // PyMatching done: 2 (Processing) -> 0 (Idle) sys_flags[0].store(0, cuda::std::memory_order_release); + NVTX_POP(); } } // namespace cudaq::qec diff --git a/libs/qec/lib/realtime/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu index c05c5b4f..2f43ab93 100644 --- a/libs/qec/lib/realtime/realtime_pipeline.cu +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -9,6 +9,7 @@ #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/host_dispatcher.h" #include "cudaq/qec/realtime/pipeline.h" +#include "cudaq/qec/realtime/nvtx_helpers.h" #include #include @@ -82,6 +83,7 @@ static void gpu_only_host_callback(void *user_data) { static void gpu_only_post_launch(void *user_data, void *slot_dev, cudaStream_t stream) { + NVTX_PUSH("GPUPostLaunch"); auto *ctx = static_cast(user_data); if (ctx->user_post_launch_fn) @@ -93,6 +95,7 @@ static void gpu_only_post_launch(void *user_data, void *slot_dev, ctx->tx_value = reinterpret_cast(slot_host); cudaLaunchHostFunc(stream, gpu_only_host_callback, ctx); + NVTX_POP(); } // --------------------------------------------------------------------------- @@ -453,7 +456,9 @@ struct RealtimePipeline::Impl { ctx.max_response_size = 0; ctx.user_context = wr->user_context; + NVTX_PUSH("WorkerPoll"); size_t written = cpu_stage(ctx); + NVTX_POP(); if (written == 0) { QEC_CPU_RELAX(); continue; @@ -499,6 +504,7 @@ struct RealtimePipeline::Impl { cudaq_tx_status_t status = ring->poll_tx(s, &cuda_error); if (status == CUDAQ_TX_READY) { + NVTX_PUSH("ConsumerComplete"); if (completion_handler) { Completion c; c.request_id = slot_request[s]; @@ -515,6 +521,7 @@ struct RealtimePipeline::Impl { __sync_synchronize(); ring->clear_slot(s); found_any = true; + NVTX_POP(); } else if (status == CUDAQ_TX_ERROR) { if (completion_handler) { @@ -628,6 +635,7 @@ bool RingBufferInjector::try_submit(uint32_t function_id, const void *payload, cur, cur + 1, std::memory_order_acq_rel, std::memory_order_relaxed)) return false; + NVTX_PUSH("Submit"); state_->ring->write_and_signal(slot, function_id, payload, static_cast(payload_size), static_cast(request_id)); @@ -635,6 +643,7 @@ bool RingBufferInjector::try_submit(uint32_t function_id, const void *payload, (*state_->slot_request)[slot] = request_id; (*state_->slot_occupied)[slot] = 1; state_->total_submitted->fetch_add(1, std::memory_order_release); + NVTX_POP(); return true; } diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index 4162fdd6..d9800cd4 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -48,6 +48,7 @@ #include "cudaq/qec/decoder.h" #include "cudaq/qec/realtime/ai_decoder_service.h" #include "cudaq/qec/realtime/ai_predecoder_service.h" +#include "cudaq/qec/realtime/nvtx_helpers.h" using namespace cudaq::qec; namespace realtime_ns = cudaq::realtime; @@ -80,7 +81,7 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration (application-level, no atomics) // ============================================================================= -constexpr size_t NUM_SLOTS = 32; +constexpr size_t NUM_SLOTS = 12; struct PipelineConfig { std::string label; @@ -175,11 +176,13 @@ struct PreLaunchCopyCtx { static void pre_launch_input_copy(void *user_data, void *slot_dev, cudaStream_t stream) { + NVTX_PUSH("PreLaunchCopy"); auto *ctx = static_cast(user_data); ctx->h_ring_ptrs[0] = slot_dev; cudaMemcpyAsync(ctx->d_trt_input, static_cast(slot_dev) + CUDAQ_RPC_HEADER_SIZE, ctx->input_size, cudaMemcpyDeviceToDevice, stream); + NVTX_POP(); } // ============================================================================= @@ -635,6 +638,7 @@ int main(int argc, char *argv[]) { if (!pd->poll_next_job(job)) return 0; // GPU not done yet + NVTX_PUSH("CpuStageTotal"); using hrclock = std::chrono::high_resolution_clock; auto worker_start = hrclock::now(); @@ -657,6 +661,7 @@ int main(int argc, char *argv[]) { dctx->total_output_nonzero.fetch_add(output_nz, std::memory_order_relaxed); auto decode_start = hrclock::now(); + NVTX_PUSH("PyMatchDecode"); #if !defined(DISABLE_PYMATCHING) const uint8_t *residual_u8 = output_u8 + 1; auto *my_decoder = dctx->acquire_decoder(); @@ -695,6 +700,7 @@ int main(int argc, char *argv[]) { } total_corrections += logical_pred; #endif + NVTX_POP(); // PyMatchDecode auto decode_end = hrclock::now(); // Capture request_id before we overwrite the slot with the response @@ -731,6 +737,7 @@ int main(int argc, char *argv[]) { wctx->decode_logical_pred[rid] = logical_pred; } + NVTX_POP(); // CpuStageTotal return 1; }); @@ -816,8 +823,10 @@ int main(int argc, char *argv[]) { uint32_t fid = realtime_ns::fnv1a_hash(func.c_str()); submit_ts[req_id] = hrclock::now(); + NVTX_PUSH("ProducerSubmit"); injector.submit(fid, payload, static_cast(payload_bytes), static_cast(req_id)); + NVTX_POP(); target = (target + 1) % config.num_predecoders; req_id++; diff --git a/libs/qec/unittests/CMakeLists.txt b/libs/qec/unittests/CMakeLists.txt index 9c27a1ba..c6eed7a0 100644 --- a/libs/qec/unittests/CMakeLists.txt +++ b/libs/qec/unittests/CMakeLists.txt @@ -310,6 +310,10 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) INSTALL_RPATH "${CUDAQ_REALTIME_LIB_DIR};${CMAKE_BINARY_DIR}/lib" ) + if(ENABLE_NVTX) + target_compile_definitions(test_realtime_pipeline PRIVATE ENABLE_NVTX) + endif() + add_dependencies(CUDAQXQECUnitTests test_realtime_pipeline) gtest_discover_tests(test_realtime_pipeline TEST_PREFIX "test_realtime_pipeline." @@ -397,6 +401,11 @@ if(CUDAQ_REALTIME_ROOT AND CMAKE_CUDA_COMPILER) INSTALL_RPATH "${CMAKE_BINARY_DIR}/lib;${CUDAQ_REALTIME_LIB_DIR}" ) + if(ENABLE_NVTX) + target_compile_definitions(test_realtime_predecoder_w_pymatching PRIVATE ENABLE_NVTX) + message(STATUS "NVTX profiling enabled for test_realtime_predecoder_w_pymatching") + endif() + add_dependencies(CUDAQXQECUnitTests test_realtime_predecoder_w_pymatching) else() message(WARNING "TensorRT or ONNX parser not found. Skipping test_realtime_predecoder_w_pymatching.") From cfbc4be6a4ecbf343eeb5ef96b858fea98bfd9b8 Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Mon, 16 Mar 2026 20:23:09 +0000 Subject: [PATCH 39/40] Forgot to add this to the NVTX stuff Signed-off-by: Scott Thornton --- .../include/cudaq/qec/realtime/nvtx_helpers.h | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h diff --git a/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h b/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h new file mode 100644 index 00000000..d20568b6 --- /dev/null +++ b/libs/qec/include/cudaq/qec/realtime/nvtx_helpers.h @@ -0,0 +1,32 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. + * All rights reserved. + * + * This source code and the accompanying materials are made available under + * the terms of the Apache License 2.0 which accompanies this distribution. + ******************************************************************************/ + +#pragma once + +#ifdef ENABLE_NVTX + +#include + +struct NvtxRange { + explicit NvtxRange(const char *name) { nvtxRangePushA(name); } + ~NvtxRange() { nvtxRangePop(); } + NvtxRange(const NvtxRange &) = delete; + NvtxRange &operator=(const NvtxRange &) = delete; +}; + +#define NVTX_RANGE(name) NvtxRange _nvtx_range_##__LINE__(name) +#define NVTX_PUSH(name) nvtxRangePushA(name) +#define NVTX_POP() nvtxRangePop() + +#else + +#define NVTX_RANGE(name) (void)0 +#define NVTX_PUSH(name) (void)0 +#define NVTX_POP() (void)0 + +#endif From 293ad5b01d6817012b828cd1857f97c18c4687af Mon Sep 17 00:00:00 2001 From: Scott Thornton Date: Tue, 17 Mar 2026 18:31:20 +0000 Subject: [PATCH 40/40] Decouple PyMatching workers from predecoder workers and update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce DEFERRED_COMPLETION protocol to separate GPU polling (~10 µs) from CPU-intensive PyMatching decode (~224 µs). Predecoder workers now release their GPU stream immediately and enqueue jobs to a dedicated PyMatchQueue serviced by 16 decode threads. This reduces backpressure stalls by 85% (41M→6.2M) and collapses tail latencies (p90: 970→515 µs, p99: 1767→1249 µs) while preserving correctness (LER 0.0020). Also fixes CUDAQ_RPC_HEADER_SIZE (12→24) to match sizeof(RPCHeader). Rewrites all three design docs to reflect the current architecture: decoupled two-tier workers, DEFERRED_COMPLETION, PyMatchQueue, complete_deferred(), RingBufferInjector, GPU-only mode, and updated performance numbers from the latest d13_r104 benchmark (192K requests). Signed-off-by: Scott Thornton --- docs/host_side_dispatcher_design_gemini.md | 326 ++++--- docs/hybrid_ai_predecoder_pipeline.md | 833 +++++++++--------- docs/realtime_pipeline_architecture.md | 203 +++-- .../qec/include/cudaq/qec/realtime/pipeline.h | 15 + libs/qec/lib/realtime/realtime_pipeline.cu | 13 + .../test_realtime_predecoder_w_pymatching.cpp | 360 +++++--- 6 files changed, 1036 insertions(+), 714 deletions(-) diff --git a/docs/host_side_dispatcher_design_gemini.md b/docs/host_side_dispatcher_design_gemini.md index b53376ed..287abe8e 100644 --- a/docs/host_side_dispatcher_design_gemini.md +++ b/docs/host_side_dispatcher_design_gemini.md @@ -7,16 +7,16 @@ **Supersedes**: Device-side persistent kernel dispatcher (`dispatch_kernel_with_graph`) and Statically-mapped Host Dispatcher **Target Platforms**: NVIDIA Grace Hopper (GH200), Grace Blackwell (GB200) **Shared-Memory Model**: libcu++ `cuda::std::atomic` with `thread_scope_system` -**Last Updated**: 2026-03-03 +**Last Updated**: 2026-03-17 --- ## 1. System Context & Motivation ### 1.1 The Pipeline -The system performs real-time quantum error correction (QEC). An FPGA streams syndrome measurements into a host-device shared ring buffer continuously (~1 µs cadence). -1. **Predecoding (GPU)**: TensorRT neural network inference (~70 µs for d=13 with FP16). -2. **Global Decoding (CPU)**: PyMatching (MWPM) (~11 µs for d=13 with `predecoder_memory` model, up to ~70 µs with denser residual models). +The system performs real-time quantum error correction (QEC). An FPGA streams syndrome measurements into a host-device shared ring buffer continuously (~104 µs cadence for d=13, T=104). +1. **Predecoding (GPU)**: TensorRT neural network inference (~88 µs pure GPU compute for d=13/T=104 with FP16; ~146 µs p50 in pipeline with DMA and dispatch overhead). +2. **Global Decoding (CPU)**: PyMatching (MWPM) (~224 µs average for d=13/T=104 with full 17,472-detector parity check matrix). ### 1.2 The Problem The legacy architecture used a persistent GPU kernel to launch child CUDA graphs using `cudaStreamGraphFireAndForget`. This hit a hardcoded CUDA runtime limit of 128 cumulative launches, causing fatal crashes. A naive host-side port mapping FPGA slots 1:1 to GPU streams caused **Head-of-Line (HOL) blocking**: a single slow PyMatching decode would stall the sequential dispatcher, backing up the ring buffer and violating strict quantum coherence latency budgets. @@ -27,6 +27,7 @@ This document defines a **Host-Side Dispatcher with a Dynamic Worker Pool**. * Predecoder streams and CPU workers act as an interchangeable pool. * Inflight jobs are tagged with their origin slot, allowing out-of-order execution and completion. * Synchronization relies exclusively on Grace Blackwell's NVLink-C2C hardware using libcu++ system-scope atomics. +* **Decoupled architecture**: PyMatching decode runs in a separate thread pool from the predecoder workers, allowing GPU streams to be released immediately after inference completion rather than blocking on CPU decode. --- @@ -38,7 +39,7 @@ Instead of mapping predecoder streams statically to incoming data, the host disp 2. **Tag**: The dispatcher records the original `slot` in a tracking array (`inflight_slot_tags[worker_id]`) so the response can be routed correctly. 3. **Pre-launch DMA**: If a `pre_launch_fn` callback is registered on the worker, the dispatcher calls it to issue a `cudaMemcpyAsync` (DMA engine copy) of the input payload from the ring buffer to the TRT input buffer before graph launch. 4. **Dispatch**: The dispatcher launches the CUDA graph on the assigned worker's stream and clears its availability bit. -5. **Free**: When the CPU PyMatching worker finishes the job and writes the response to `tx_flags[origin_slot]`, it restores the worker's availability bit in the `idle_mask`. +5. **Free**: The predecoder worker thread (not the PyMatching thread) restores the worker's availability bit in the `idle_mask` after copying inference output and enqueuing the PyMatching job. Slot completion is deferred to the PyMatching thread pool. --- @@ -54,12 +55,16 @@ All shared state must use **libcu++ system-scope atomics** allocated in mapped p | :--- | :--- | :--- | :--- | | `rx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | FPGA writes data ptr; CPU polls (Acquire). | | `tx_flags[NUM_SLOTS]` | `atomic` | Mapped Pinned | CPU writes response; FPGA polls (Release). | -| `ready_flags[NUM_WORKERS]` | `atomic` | Mapped Pinned | GPU signals TRT done; CPU polls (Release/Acquire). | +| `ready_flags[1]` | `atomic` | Mapped Pinned | GPU signals TRT done; CPU polls (Release/Acquire). Queue depth = 1. | | `idle_mask` | `atomic` | Host CPU Mem | Bitmask of free workers. 1 = free, 0 = busy. | | `inflight_slot_tags[NUM_WORKERS]`| `int` (Plain array) | Host CPU Mem | Maps `worker_id` -> original FPGA `slot`. | | `mailbox_bank[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Dispatcher writes device ptr for pre-launch callback. | -| `h_ring_ptrs[NUM_WORKERS]` | `void*` (Plain array) | Mapped Pinned | Pre-launch callback writes slot device ptr for CPU worker readback. | -| `h_outputs[NUM_WORKERS]` | `void*` (Mapped Pinned) | Mapped Pinned | GPU output copied here via DMA; CPU worker reads inference results. | +| `h_ring_ptrs[1]` | `void*` (Mapped Pinned) | Mapped Pinned | Pre-launch callback writes slot device ptr for CPU worker readback. | +| `h_predecoder_outputs_[1]` | `void*` (Mapped Pinned) | Mapped Pinned | GPU output copied here via DMA; CPU worker reads inference results. | + +**NUM_SLOTS**: 16 (ring buffer capacity). +**NUM_WORKERS**: 8 (predecoder streams, each with a dedicated CPU poller thread). +**Queue depth**: 1 per predecoder (single in-flight inference per stream). --- @@ -76,12 +81,14 @@ typedef struct { cudaGraphExec_t graph_exec; cudaStream_t stream; uint32_t function_id; - void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream) = nullptr; - void* pre_launch_data = nullptr; -}; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* pre_launch_data; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* post_launch_data; +} cudaq_host_dispatch_worker_t; ``` -The `pre_launch_fn` callback enables the dispatcher to issue a `cudaMemcpyAsync` (using the DMA copy engine) for the input payload before each graph launch, without baking application-specific logic into the generic dispatcher. +The `pre_launch_fn` callback enables the dispatcher to issue a `cudaMemcpyAsync` (using the DMA copy engine) for the input payload before each graph launch, without baking application-specific logic into the generic dispatcher. The `post_launch_fn` callback is used in GPU-only mode to enqueue a `cudaLaunchHostFunc` that signals slot completion without CPU worker threads. ### 4.2 Dispatcher Logic (Pseudocode) ```cpp @@ -127,6 +134,10 @@ void cudaq_host_dispatcher_loop(const cudaq_host_dispatcher_config_t *config) { tx_flags[current_slot].store(0xEEEEEEEEEEEEEEEEULL, release); } + // Post-launch callback (GPU-only mode: enqueue cudaLaunchHostFunc) + if (worker.post_launch_fn) + worker.post_launch_fn(worker.post_launch_data, data_dev, worker.stream); + // Consume slot and advance rx_flags[current_slot].store(0, release); current_slot = (current_slot + 1) % num_slots; @@ -143,19 +154,19 @@ void cudaq_host_dispatcher_loop(const cudaq_host_dispatcher_config_t *config) { Data copies between the ring buffer and TRT inference buffers use the GPU's DMA copy engine rather than SM-based kernels, freeing compute resources for inference. -**Input copy (ring buffer -> TRT input)**: Issued by the host dispatcher via `pre_launch_fn` callback as a `cudaMemcpyAsync(DeviceToDevice)` on the worker's stream *before* `cudaGraphLaunch`. The source address is dynamic (determined at dispatch time from the ring buffer slot), so it cannot be baked into the captured graph. +**Input copy (ring buffer -> TRT input)**: Issued by the host dispatcher via `pre_launch_fn` callback as a `cudaMemcpyAsync(DeviceToDevice)` on the worker's stream *before* `cudaGraphLaunch`. The source address is dynamic (determined at dispatch time from the ring buffer slot at offset `CUDAQ_RPC_HEADER_SIZE` = 24 bytes), so it cannot be baked into the captured graph. -**Output copy (TRT output -> host-mapped outputs)**: Captured inside the CUDA graph as a `cudaMemcpyAsync(DeviceToDevice)`. Both source (`d_trt_output_`) and destination (`d_outputs_`) are fixed addresses, so this is captured at graph instantiation time. +**Output copy (TRT output -> host-mapped outputs)**: Captured inside the CUDA graph as a `cudaMemcpyAsync(DeviceToDevice)`. Both source (`d_trt_output_`) and destination (`d_predecoder_outputs_`) are fixed addresses, so this is captured at graph instantiation time. ### 5.2 Captured CUDA Graph Contents The CUDA graph for each predecoder contains (in order): 1. **TRT inference** (`context_->enqueueV3(stream)`) -- or `passthrough_copy_kernel` if `SKIP_TRT` is set. -2. **Output DMA copy** (`cudaMemcpyAsync` D2D) -- copies TRT output to host-mapped output buffer. +2. **Output DMA copy** (`cudaMemcpyAsync` D2D) -- copies TRT output to host-mapped predecoder output buffer (`h_predecoder_outputs_`). 3. **Signal kernel** (`predecoder_signal_ready_kernel<<<1,1>>>`) -- a single-thread kernel that performs `d_ready_flags[0].store(1, release)` to notify the CPU worker. -The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. Input data arrives exclusively via the pre-launch DMA copy callback; no input-copy kernel exists in the graph or codebase. +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. Input data arrives exclusively via the pre-launch DMA copy callback; no input-copy kernel exists in the graph. ### 5.3 Source Files @@ -164,69 +175,100 @@ The `ai_predecoder_service.cu` implementation contains only two device kernels: - `predecoder_signal_ready_kernel` -- single-thread kernel that atomically stores `1` to the ready flag with system-scope release semantics. - `passthrough_copy_kernel` -- vectorized identity copy (`uint4` 16-byte loads/stores, 256 threads) used when `SKIP_TRT` is set, substituting for TRT inference. -The legacy `predecoder_input_kernel` (which read from the mailbox and copied into `d_trt_input_`) has been removed. The `cudaq::nvqlink` header dependencies are no longer needed by this file. - ### 5.4 Passthrough Copy Kernel (SKIP_TRT mode) When `SKIP_TRT` is set, the `passthrough_copy_kernel` substitutes for TRT inference, providing a deterministic identity function for testing and benchmarking the infrastructure overhead. In SKIP_TRT mode, the `AIDecoderService` constructor sets `input_size_ = output_size_ = 1600 * sizeof(float)` (6400 bytes) without loading any model file. --- -## 6. Worker Subsystem (Consumer) +## 6. Decoupled Worker Architecture -### 6.1 Ready-Flag State Machine (Atomic Claiming) +The CPU-side processing uses a **two-tier decoupled architecture** that separates GPU polling from CPU-intensive decode: -With a single slot per predecoder (queue depth 1), the poller must **claim** each completion exactly once. +### 6.1 Tier 1: Predecoder Workers (GPU Polling + Copy) -**States** (per-worker ready flag): +Each predecoder has a dedicated worker thread in the `RealtimePipeline`. These threads: -| Value | State | Meaning | -| :--- | :--- | :--- | -| 0 | Idle | Waiting for GPU, or worker has called `release_job`. | -| 1 | Ready | GPU finished; signal kernel stored 1. | -| 2 | Processing | CPU poller claimed the job; PyMatching is running. | +1. **Poll** `ready_flags[0]` via `compare_exchange_strong(1, 2, acquire, relaxed)` (CAS claiming). +2. **Copy** inference output from `h_predecoder_outputs_` to a per-slot buffer (`deferred_outputs[origin_slot]`). +3. **Compute** syndrome density metrics (input vs. output nonzero detector counts). +4. **Release** the GPU predecoder slot via `release_job(slot_idx)` → `ready_flags[0].store(0, release)`. +5. **Enqueue** a `PyMatchJob` to the `PyMatchQueue`. +6. **Return** `DEFERRED_COMPLETION` to the pipeline, which releases `idle_mask` but does NOT set `tx_flags`. -**Poller**: Use `compare_exchange_strong(expected=1, desired=2, memory_order_acquire, memory_order_relaxed)`. Only the thread that wins the CAS enqueues the job. Use **relaxed on failure** so spin-polling does not add barriers that delay seeing the GPU's store(1). +### 6.2 Tier 2: PyMatching Workers (CPU Decode + Completion) -**Worker**: When PyMatching finishes, call `release_job(slot_idx)` which does `ready_flags[0].store(0, release)` so the slot is Idle for the next launch. +A separate thread pool (16 workers for d13_r104) dequeues from `PyMatchQueue` and: -### 6.2 Dedicated Polling/Worker Threads +1. **Decode** using a per-thread PyMatching decoder instance (lock-free `thread_local` acquisition). +2. **Write** the RPC response (`DecodeResponse`) directly into the ring buffer slot. +3. **Signal** slot completion via `pipeline.complete_deferred(origin_slot)`, which stores the slot host address into `tx_flags[origin_slot]`. -Each predecoder has a dedicated polling thread that spins on `poll_next_job()` (the CAS), then runs PyMatching inline on the same thread. This avoids thread pool overhead. +### 6.3 Why Decouple? -### 6.3 Worker Logic (Pseudocode) -```cpp -void pymatching_worker_task(PreDecoderJob job, int worker_id, - AIPreDecoderService* predecoder, - DecoderContext* ctx, - WorkerPoolContext* pool_ctx) { - // 1. Read GPU outputs from mapped pinned memory (h_outputs_) - const int32_t* residual = static_cast(job.inference_data); - - // 2. Run PyMatching MWPM decode over spatial slices - for (int s = 0; s < ctx->spatial_slices; ++s) { - // ... decode each spatial slice ... - } +With the coupled architecture, a single slow PyMatching decode (up to 139 ms tail latency) would hold the predecoder worker busy, preventing the GPU stream from being dispatched new work. This caused: +- Severe head-of-line blocking on `idle_mask` +- ~41M backpressure stalls +- Tail latencies: p90 = 970 µs, p99 = 1,767 µs + +The decoupled architecture reduces predecoder worker hold time from ~214 µs to ~10 µs, dropping: +- Backpressure stalls from 41M to 6.2M (85% reduction) +- p90 from 970 µs to 515 µs (47% reduction) +- p99 from 1,767 µs to 1,249 µs (29% reduction) - // 3. Write RPC response back to the ring buffer slot - auto* header = static_cast(job.ring_buffer_ptr); - header->magic = RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp_data); +### 6.4 DEFERRED_COMPLETION Protocol - // 4. Lookup origin slot and signal completion via tx_flags - int origin_slot = job.origin_slot; - pool_ctx->tx_flags[origin_slot].store( - reinterpret_cast(job.ring_buffer_ptr), release); +``` +Pipeline Worker Thread: PyMatching Thread: + poll_next_job() → CAS 1→2 (blocked on queue) + copy output to deferred_buf | + release_job() → store 0 | + enqueue PyMatchJob ──────────► pop PyMatchJob + return DEFERRED_COMPLETION decode with PyMatching + pipeline sets idle_mask ✓ write RPC response + pipeline skips tx_flags ✗ complete_deferred(slot) + └──► tx_flags[slot].store(addr) +``` - // 5. Release GPU predecoder slot (2 -> 0) - predecoder->release_job(job.slot_idx); +### 6.5 PyMatchQueue - // 6. Return worker to the dispatcher pool - pool_ctx->idle_mask->fetch_or(1ULL << worker_id, release); -} +Thread-safe MPSC queue using `std::mutex` + `std::condition_variable`: + +```cpp +struct PyMatchJob { + int origin_slot; + uint64_t request_id; + void *ring_buffer_ptr; +}; + +class PyMatchQueue { + std::mutex mtx_; + std::condition_variable cv_; + std::queue jobs_; + bool stop_ = false; +public: + void push(PyMatchJob &&j); + bool pop(PyMatchJob &out); // blocks until job available or shutdown + void shutdown(); +}; ``` +### 6.6 Ready-Flag State Machine (Atomic Claiming) + +With queue depth 1, the poller must **claim** each completion exactly once. + +**States** (per-worker ready flag): + +| Value | State | Meaning | +| :--- | :--- | :--- | +| 0 | Idle | Waiting for GPU, or worker has called `release_job`. | +| 1 | Ready | GPU finished; signal kernel stored 1. | +| 2 | Processing | CPU poller claimed the job; copying output. | + +**Poller**: Use `compare_exchange_strong(expected=1, desired=2, memory_order_acquire, memory_order_relaxed)`. Only the thread that wins the CAS proceeds. Use **relaxed on failure** so spin-polling does not add barriers that delay seeing the GPU's store(1). + +**Worker**: When output is copied and job is enqueued, call `release_job(slot_idx)` which does `ready_flags[0].store(0, release)` so the slot is Idle for the next launch. + --- ## 7. Out-of-Order Consumer @@ -235,11 +277,10 @@ The consumer thread harvests completions **out-of-order** by scanning all active ### 7.1 Consumer Logic (Pseudocode) ```cpp -// Consumer scans all slots each iteration while (!consumer_stop) { bool found_any = false; for (uint32_t s = 0; s < NUM_SLOTS; ++s) { - if (slot_request[s] < 0) continue; // no active request in this slot + if (!slot_occupied[s]) continue; cudaq_tx_status_t status = cudaq_host_ringbuffer_poll_tx_flag(&rb, s, &err); @@ -249,7 +290,7 @@ while (!consumer_stop) { completed[rid] = true; total_completed++; - slot_request[s] = -1; // Reset request ID FIRST + slot_occupied[s] = 0; // Reset occupancy FIRST __sync_synchronize(); // ARM memory fence cudaq_host_ringbuffer_clear_slot(&rb, s); // Then clear tx_flags found_any = true; @@ -261,10 +302,10 @@ while (!consumer_stop) { ### 7.2 Consumer-Producer Race Fix -On ARM's weakly ordered memory model, the consumer must reset `slot_request[s] = -1` **before** clearing `tx_flags[s]` (via `cudaq_host_ringbuffer_clear_slot`), with a `__sync_synchronize()` fence between them. Without this ordering: +On ARM's weakly ordered memory model, the consumer must reset `slot_occupied[s] = 0` **before** clearing `tx_flags[s]` (via `cudaq_host_ringbuffer_clear_slot`), with a `__sync_synchronize()` fence between them. Without this ordering: 1. Consumer clears `tx_flags[s]` (slot appears free to producer) -2. Producer writes new `slot_request[s] = new_rid` -3. Consumer's delayed `slot_request[s] = -1` clobbers the producer's write +2. Producer writes new `slot_occupied[s] = 1` +3. Consumer's delayed `slot_occupied[s] = 0` clobbers the producer's write This race caused exactly one request to get "stuck" indefinitely, eventually stalling the entire pipeline through backpressure. @@ -272,10 +313,10 @@ This race caused exactly one request to get "stuck" indefinitely, eventually sta ## 8. RealtimePipeline Scaffolding -The low-level dispatcher, consumer, and worker threads are wrapped by a higher-level `RealtimePipeline` class (`realtime/include/cudaq/realtime/pipeline.h`) that hides all ring buffer management, atomics, and thread lifecycle. Application code provides three callbacks: +The low-level dispatcher, consumer, and worker threads are wrapped by a higher-level `RealtimePipeline` class (`libs/qec/include/cudaq/qec/realtime/pipeline.h`) that hides all ring buffer management, atomics, and thread lifecycle. Application code provides three callbacks: -1. **GPU stage factory** (`GpuStageFactory`): Called once per worker during `start()`. Returns the `cudaGraphExec_t`, `cudaStream_t`, `pre_launch_fn`, `function_id`, and an opaque `user_context` for each worker. -2. **CPU stage callback** (`CpuStageCallback`): Called by each worker thread when GPU inference completes. Receives `CpuStageContext` containing `inference_output`, `output_size`, `response_buffer`, and the `user_context`. Returns the number of bytes written. +1. **GPU stage factory** (`GpuStageFactory`): Called once per worker during `start()`. Returns the `cudaGraphExec_t`, `cudaStream_t`, `pre_launch_fn`, `post_launch_fn`, `function_id`, and an opaque `user_context` for each worker. +2. **CPU stage callback** (`CpuStageCallback`): Called by each worker thread when GPU inference completes. Receives `CpuStageContext` containing `gpu_output`, `gpu_output_size`, `response_buffer`, and the `user_context`. Returns the number of bytes written, `0` if no result ready (poll again), or `DEFERRED_COMPLETION` to release the worker without signaling slot completion. 3. **Completion callback** (`CompletionCallback`): Called by the consumer thread for each completed (or errored) request with a `Completion` struct. ```cpp @@ -283,45 +324,92 @@ RealtimePipeline pipeline(config); pipeline.set_gpu_stage([&](int worker_id) -> GpuWorkerResources { ... }); pipeline.set_cpu_stage([&](const CpuStageContext& ctx) -> size_t { ... }); pipeline.set_completion_handler([&](const Completion& c) { ... }); +auto injector = pipeline.create_injector(); pipeline.start(); -pipeline.submit(function_id, payload, payload_size, request_id); +injector.submit(function_id, payload, payload_size, request_id); // ... pipeline.stop(); ``` +### 8.1 DEFERRED_COMPLETION + +When the CPU stage callback returns `DEFERRED_COMPLETION` (= `SIZE_MAX`), the pipeline: +- Sets the worker's bit in `idle_mask` (worker is free for next dispatch) +- Does NOT write to `tx_flags[origin_slot]` (slot stays IN_FLIGHT) + +The caller is responsible for eventually calling `pipeline.complete_deferred(slot)`, which stores the slot host address into `tx_flags[slot]` with release semantics, making the completion visible to the consumer. + +### 8.2 GPU-Only Mode + +If no `CpuStageCallback` is registered, the pipeline operates in **GPU-only mode**: no CPU worker threads are spawned. Instead, the dispatcher's `post_launch_fn` enqueues a `cudaLaunchHostFunc` on each worker stream. When the GPU finishes, the CUDA runtime calls the host function, which stores into `tx_flags` and restores the `idle_mask` bit — all from the CUDA callback thread. + +### 8.3 RingBufferInjector + +The `RingBufferInjector` class (created via `pipeline.create_injector()`) encapsulates the host-side submission logic for testing without FPGA hardware. It provides: + +- `try_submit()`: Non-blocking, returns false on backpressure. +- `submit()`: Blocking spin-wait until a slot becomes available. +- `backpressure_stalls()`: Counter of spin iterations during backpressure. + +The injector uses a round-robin slot selection with atomic CAS for thread safety. + The `PipelineStageConfig` allows configuring `num_workers`, `num_slots`, `slot_size`, and optional `CorePinning` for dispatcher, consumer, and worker threads. --- ## 9. Step-by-Step Data Flow Trace -1. **FPGA** writes INT32 measurements into `rx_data[5]`. -2. **FPGA** sets `rx_flags[5] = host_ptr`. -3. **Host Dispatcher** reads `rx_flags[5]`, sees data. -4. **Host Dispatcher** parses RPC header, looks up function in the function table. -5. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. -6. **Host Dispatcher** marks bit 2 busy in `idle_mask`. -7. **Host Dispatcher** saves `inflight_slot_tags[2] = 5`. -8. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. -9. **Host Dispatcher** calls `pre_launch_fn`: writes `h_ring_ptrs[0] = dev_ptr`, issues `cudaMemcpyAsync(d_trt_input, dev_ptr + 12, input_size, D2D, stream[2])`. -10. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. -11. **Host Dispatcher** sets `tx_flags[5] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[5] = 0` and advances to `current_slot = 6`. -12. **GPU DMA engine** copies input payload from ring buffer to TRT input buffer. -13. **GPU** executes TRT inference (or passthrough copy in SKIP_TRT mode). -14. **GPU DMA engine** copies TRT output to host-mapped `h_outputs_`. -15. **GPU signal kernel** sets `ready_flags[2] = 1` (system-scope atomic release). -16. **CPU Poller** CAS(1, 2) on `ready_flags[2]`, wins, reads `h_ring_ptrs[0]` to get ring buffer address and `h_outputs_` to get inference data. -17. **CPU Worker** runs PyMatching decode over spatial slices. -18. **CPU Worker** writes RPC response into ring buffer slot. -19. **CPU Worker** looks up `origin_slot = inflight_slot_tags[2]` (which is 5). -20. **CPU Worker** writes response address to `tx_flags[5]` (overwrites 0xEEEE). -21. **CPU Worker** calls `release_job` (`ready_flags[0].store(0, release)`), then restores bit 2 in `idle_mask`. -22. **Consumer** scans all slots, sees `tx_flags[5] != 0` and `!= 0xEEEE`, harvests. -23. **Consumer** sets `slot_request[5] = -1`, `__sync_synchronize()`, then clears `tx_flags[5] = 0`. Producer may now reuse slot 5. +1. **Producer** writes uint8 measurements into `payload_buf` from Stim test data. +2. **Producer** calls `injector.submit(fid, payload, size, request_id)`. +3. **RingBufferInjector** writes RPC header (`RPCHeader`: magic, function_id, arg_len, request_id, ptp_timestamp = 24 bytes) + payload into `rx_data[slot]`. +4. **RingBufferInjector** sets `rx_flags[slot] = host_ptr` (release). +5. **Host Dispatcher** reads `rx_flags[slot]`, sees data. +6. **Host Dispatcher** parses RPC header, looks up function in the function table. +7. **Host Dispatcher** scans `idle_mask`, finds `worker_id = 2` is free. +8. **Host Dispatcher** marks bit 2 busy in `idle_mask`. +9. **Host Dispatcher** saves `inflight_slot_tags[2] = slot`. +10. **Host Dispatcher** translates `host_ptr` to `dev_ptr`, writes to `mailbox_bank[2]`. +11. **Host Dispatcher** calls `pre_launch_fn`: writes `h_ring_ptrs[0] = dev_ptr`, issues `cudaMemcpyAsync(d_trt_input, dev_ptr + 24, input_size, D2D, stream[2])`. +12. **Host Dispatcher** calls `cudaGraphLaunch(..., stream[2])`. +13. **Host Dispatcher** sets `tx_flags[slot] = 0xEEEE...` (IN_FLIGHT), then clears `rx_flags[slot] = 0` and advances to next slot. +14. **GPU DMA engine** copies input payload from ring buffer to TRT input buffer. +15. **GPU** executes TRT inference (or passthrough copy in SKIP_TRT mode). +16. **GPU DMA engine** copies TRT output to host-mapped `h_predecoder_outputs_`. +17. **GPU signal kernel** sets `ready_flags[0] = 1` (system-scope atomic release). +18. **Predecoder Worker** CAS(1, 2) on `ready_flags[0]`, wins, reads inference output. +19. **Predecoder Worker** copies output to `deferred_outputs[origin_slot]`. +20. **Predecoder Worker** computes syndrome density metrics. +21. **Predecoder Worker** calls `release_job(0)` → `ready_flags[0].store(0, release)`. +22. **Predecoder Worker** extracts `request_id` from RPC header, enqueues `PyMatchJob`. +23. **Predecoder Worker** returns `DEFERRED_COMPLETION`. +24. **Pipeline** restores bit 2 in `idle_mask` (worker free for next dispatch). Does NOT touch `tx_flags`. +25. **PyMatching Worker** pops `PyMatchJob` from queue, acquires per-thread decoder. +26. **PyMatching Worker** runs PyMatching MWPM decode over full parity check matrix. +27. **PyMatching Worker** writes `RPCResponse + DecodeResponse` into ring buffer slot. +28. **PyMatching Worker** calls `pipeline.complete_deferred(slot)` → `tx_flags[slot].store(host_addr, release)`. +29. **Consumer** scans all slots, sees `tx_flags[slot] != 0` and `!= 0xEEEE`, harvests. +30. **Consumer** calls `completion_handler(request_id, slot, success)`. +31. **Consumer** sets `slot_occupied[slot] = 0`, `__sync_synchronize()`, then clears `tx_flags[slot] = 0`. Producer may now reuse slot. --- -## 10. Ring Buffer and IN_FLIGHT Sentinel +## 10. RPC Protocol & Ring Buffer + +### 10.1 RPC Header + +```cpp +struct RPCHeader { + uint32_t magic; // RPC_MAGIC_REQUEST + uint32_t function_id; // FNV-1a hash of function name + uint32_t arg_len; // payload length in bytes + uint32_t request_id; // unique request identifier + uint64_t ptp_timestamp; // PTP timestamp (optional) +}; +// sizeof(RPCHeader) == 24 +#define CUDAQ_RPC_HEADER_SIZE 24u +``` + +### 10.2 IN_FLIGHT Sentinel Because `cudaGraphLaunch` is asynchronous, the dispatcher clears `rx_flags[slot]` immediately after launch. Without a hold, the **producer** (FPGA sim or test) would see `rx_flags[slot]==0` and `tx_flags[slot]==0` (response not written yet) and reuse the slot, overwriting data while the GPU is still reading. @@ -360,32 +448,51 @@ Data-integrity tests that verify known payloads survive the full CUDA graph roun - **Multi-Predecoder Concurrency**: 4 predecoders on 4 streams, simultaneous dispatch, per-predecoder data verification. - **Sustained Throughput (200 requests)**: Regression test for the 128-launch-limit fix. Proves indefinite stability of the host-side dispatcher. +### 12.4 End-to-End Benchmark (test_realtime_predecoder_w_pymatching) +- Configurable surface code distance and round count: d7, d13, d13_r104, d21, d31. +- Loads Stim-generated test data (detectors, observables, parity check matrix, priors). +- Streams syndrome data at configurable rate with correctness verification (LER). +- Reports latency percentiles, throughput, backpressure stalls, syndrome density reduction. + --- ## 13. Shutdown and Grace Period -- **Grace period**: After the producer thread exits, the main thread waits up to 5 seconds for `total_completed >= total_submitted`. -- **Consumer exit**: The consumer thread normally exits when `producer_done && total_completed >= total_submitted`. To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. +- **Grace period**: After the producer stops submitting, the pipeline waits up to 5 seconds for `total_completed >= total_submitted`. +- **Consumer exit**: The consumer thread normally exits when `producer_stop && total_completed >= total_submitted`. To avoid hanging forever if some in-flight requests never complete, set a **consumer_stop** flag after the grace period; the consumer loop checks this and exits so `consumer.join()` returns and the process can print the final report and exit cleanly. - **Dispatcher shutdown**: Set `shutdown_flag = 1` after the consumer exits, then join the dispatcher thread. The dispatcher synchronizes all worker streams before returning. -- **Debug diagnostics**: If requests are stuck after the grace period, a debug dump prints per-slot rx/tx flags, slot_request state, and per-worker inflight_slot_tags and idle_mask bits. +- **PyMatching thread pool**: Call `pymatch_queue.shutdown()` to unblock all waiting threads, then join all PyMatching worker threads. --- -## 14. Performance Results (d=13, 30 µs rate, 10s) +## 14. Performance Results (d=13, T=104, 104 µs rate, 20s) + +### 14.1 Decoupled Architecture (current) -Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T13_X.onnx` (FP16), 16 workers, 32 slots: +Measured on Grace Blackwell (GB200) with `predecoder_memory_d13_T104_X.onnx` (FP16), 8 predecoders, 16 PyMatching workers, 16 slots: | Metric | Value | | :--- | :--- | -| Throughput | 25,331 req/s | -| Mean latency | 122.0 µs | -| p50 latency | 119.3 µs | -| p99 latency | 135.3 µs | -| Per-round (/13) | 9.4 µs/round | -| Stage A (dispatch + GPU) | 109.9 µs | -| Stage B (PyMatching) | 11.8 µs | -| Stage C (consumer lag) | 0.3 µs | -| Raw TRT inference (trtexec) | 69.5 µs | +| Throughput | 9,610 req/s | +| Submitted / Completed | 192,309 / 192,309 | +| Backpressure stalls | 6,193,418 | +| p50 latency | 352.3 µs | +| Mean latency | 393.4 µs | +| p90 latency | 514.7 µs | +| p95 latency | 606.3 µs | +| p99 latency | 1,248.8 µs | +| Max latency | 3,930.0 µs | +| PyMatching decode (avg) | 224.4 µs | +| Syndrome density reduction | 98.3% | +| Pipeline LER | 0.0020 | + +### 14.2 Raw TRT Baseline (trtexec) + +| Mode | GPU Compute | Total Host Latency | +| :--- | :--- | :--- | +| Default | 107 µs | 119 µs | +| CUDA Graph + SpinWait | 90 µs | 99 µs | +| CUDA Graph + SpinWait + No Transfers | 88 µs | 88 µs | --- @@ -400,7 +507,8 @@ When generating code from this specification, the LLM **MUST** strictly adhere t - [ ] **NO RACE CONDITIONS ON TAGS**: `inflight_slot_tags` does not need to be atomic because index `[worker_id]` is exclusively owned by the active flow once the dispatcher clears the bit in `idle_mask`, until the worker thread restores the bit. - [ ] **READY FLAG CLAIMING**: The CPU poller MUST claim each completion exactly once using compare_exchange_strong(1, 2) on the ready flag; use relaxed memory order on CAS failure. The worker MUST clear the flag (store 0) in `release_job`. - [ ] **IN_FLIGHT SENTINEL**: After a successful `cudaGraphLaunch`, the dispatcher MUST write `tx_flags[current_slot] = 0xEEEEEEEEEEEEEEEEULL` before clearing `rx_flags[current_slot]`. Set `tx_data_host = nullptr` and `tx_data_dev = nullptr` to force the 0xEEEE path. The producer MUST wait for both rx and tx to be 0 before reusing a slot. The consumer MUST ignore 0xEEEE and only harvest real responses (or 0xDEAD errors). -- [ ] **CONSUMER MEMORY ORDERING**: The consumer MUST set `slot_request[s] = -1` BEFORE calling `cudaq_host_ringbuffer_clear_slot`, with a `__sync_synchronize()` fence between them, to prevent the producer-consumer race on ARM. -- [ ] **DMA DATA MOVEMENT**: Use `cudaMemcpyAsync` (DMA engine) for data copies. Input copy is issued via `pre_launch_fn` callback before graph launch. Output copy is captured inside the graph. Do not use SM-based byte-copy kernels for fixed-address transfers. +- [ ] **CONSUMER MEMORY ORDERING**: The consumer MUST set `slot_occupied[s] = 0` BEFORE calling `cudaq_host_ringbuffer_clear_slot`, with a `__sync_synchronize()` fence between them, to prevent the producer-consumer race on ARM. +- [ ] **DMA DATA MOVEMENT**: Use `cudaMemcpyAsync` (DMA engine) for data copies. Input copy is issued via `pre_launch_fn` callback before graph launch at offset `CUDAQ_RPC_HEADER_SIZE` (24 bytes). Output copy is captured inside the graph. Do not use SM-based byte-copy kernels for fixed-address transfers. - [ ] **NO INPUT KERNEL IN GRAPH**: The captured CUDA graph must NOT contain an input-copy kernel. All input data movement is handled by the `pre_launch_fn` DMA callback issued on the worker stream before `cudaGraphLaunch`. -- [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. +- [ ] **DEFERRED COMPLETION**: When the CPU stage returns `DEFERRED_COMPLETION`, the pipeline MUST release `idle_mask` but MUST NOT write `tx_flags`. The external caller MUST call `complete_deferred(slot)` to signal completion. +- [ ] **SHUTDOWN**: Use a `consumer_stop` (or equivalent) flag so the consumer thread can exit after a grace period even when `total_completed < total_submitted`; join the consumer after setting the flag so the process exits cleanly. Shut down the PyMatching queue before stopping the pipeline. diff --git a/docs/hybrid_ai_predecoder_pipeline.md b/docs/hybrid_ai_predecoder_pipeline.md index 20a4013e..dbafa482 100644 --- a/docs/hybrid_ai_predecoder_pipeline.md +++ b/docs/hybrid_ai_predecoder_pipeline.md @@ -4,7 +4,7 @@ **Component**: `cudaq-qec` Realtime Decoding Subsystem **Status**: Implementation Complete (Test-Validated) -**Last Updated**: 2026-02-19 +**Last Updated**: 2026-03-17 --- @@ -15,15 +15,15 @@ 3. [Architecture](#3-architecture) 4. [Component Deep-Dive](#4-component-deep-dive) - 4.1 [Ring Buffer & RPC Protocol](#41-ring-buffer--rpc-protocol) - - 4.2 [GPU Persistent Dispatcher Kernel](#42-gpu-persistent-dispatcher-kernel) + - 4.2 [Host-Side Dispatcher](#42-host-side-dispatcher) - 4.3 [AIDecoderService (Base Class)](#43-aidecoderservice-base-class) - 4.4 [AIPreDecoderService (Predecoder + CPU Handoff)](#44-aipredeccoderservice-predecoder--cpu-handoff) - - 4.5 [CPU Worker Threads & PyMatching Decoder Pool](#45-cpu-worker-threads--pymatching-decoder-pool) + - 4.5 [Decoupled CPU Worker Architecture](#45-decoupled-cpu-worker-architecture) 5. [Data Flow](#5-data-flow) 6. [Memory Architecture](#6-memory-architecture) 7. [Backpressure Protocol](#7-backpressure-protocol) 8. [Memory Ordering & Synchronization](#8-memory-ordering--synchronization) -9. [CUDA Graph Hierarchy](#9-cuda-graph-hierarchy) +9. [CUDA Graph Structure](#9-cuda-graph-structure) 10. [Pipeline Configurations](#10-pipeline-configurations) 11. [File Inventory](#11-file-inventory) 12. [Configuration Parameters](#12-configuration-parameters) @@ -39,25 +39,25 @@ This system implements a **realtime hybrid GPU/CPU pipeline** for quantum error | Stage | Location | Algorithm | Data Type | |-------|----------|-----------|-----------| -| **Predecoding** | GPU | Neural network (TensorRT, from ONNX) | INT32 | +| **Predecoding** | GPU | Neural network (TensorRT, from ONNX) | uint8 | | **Global Decoding** | CPU | PyMatching (MWPM) | float64 | -A **persistent GPU kernel** (the Dispatcher) monitors a shared ring buffer for incoming syndrome data. When data arrives, the Dispatcher launches a CUDA Graph containing a TensorRT inference pass. The neural network accepts raw measurements as INT32 tensors and produces residual detectors and a logical frame. The residual detectors are handed off to the CPU via mapped pinned memory, where a thread pool runs PyMatching MWPM decoding. Results are written back to the ring buffer and acknowledged. +A **host-side spin-polling dispatcher** monitors a shared ring buffer for incoming syndrome data. When data arrives, the dispatcher finds a free GPU worker from a dynamic bitmask pool (`idle_mask`), copies the input via DMA, and launches a pre-captured CUDA graph containing a TensorRT inference pass. The neural network accepts raw detector values as uint8 tensors and produces residual detectors and a logical frame prediction. The residual detectors are handed off to a **decoupled PyMatching thread pool** via a lock-free queue, where per-thread MWPM decoders compute corrections. Results are written back to the ring buffer and acknowledged. -The system supports multiple surface code distances via a configurable `PipelineConfig` struct: d=7, d=13, d=21, and d=31. ONNX models are compiled to TensorRT engines on first use and cached to disk as `.engine` files for fast reloading on subsequent runs. +The system supports multiple surface code distances and round counts via a configurable `PipelineConfig` struct: d=7/T=7, d=13/T=13, d=13/T=104, d=21/T=21, and d=31/T=31. ONNX models are compiled to TensorRT engines on first use and cached to disk as `.engine` files for fast reloading on subsequent runs. --- ## 2. Problem Statement -Surface code QEC requires decoding syndrome measurements within the coherence time of the quantum system (typically ~1 microsecond for superconducting qubits). A pure CPU decoder cannot meet this budget at scale. A pure GPU decoder lacks the flexibility to run algorithms like Minimum Weight Perfect Matching (MWPM) efficiently on GPU. +Surface code QEC requires decoding syndrome measurements within the coherence time of the quantum system. A pure CPU decoder cannot meet this budget at scale. A pure GPU decoder lacks the flexibility to run algorithms like Minimum Weight Perfect Matching (MWPM) efficiently on GPU. The hybrid approach exploits the strengths of each: -- **GPU**: Massively parallel neural network inference provides fast soft-decision outputs (residual detectors) that reduce the problem complexity for the global decoder. +- **GPU**: Massively parallel neural network inference provides fast soft-decision outputs (residual detectors) that reduce the problem complexity for the global decoder. The predecoder achieves **98.3% syndrome density reduction** for d=13/T=104. - **CPU**: PyMatching solves the residual MWPM problem on the simplified output from the predecoder. -The critical constraint is **zero-copy, zero-allocation** on the hot path. Every buffer is pre-allocated, every kernel is pre-captured into a CUDA Graph, and every transfer uses mapped pinned memory. +The critical constraint is **zero-copy, zero-allocation** on the hot path. Every buffer is pre-allocated, every kernel is pre-captured into a CUDA Graph, and every transfer uses mapped pinned memory or DMA. --- @@ -66,75 +66,85 @@ The critical constraint is **zero-copy, zero-allocation** on the hot path. Every ### System Diagram ``` - FPGA / Quantum Control (or Test Harness) + Test Harness (or FPGA DMA) │ - │ syndrome data (INT32 measurements) + │ syndrome data (uint8 detectors) ▼ ┌─────────────────────────────────────────────────────┐ │ Ring Buffer (Mapped Pinned Memory) │ │ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ - │ │Slot 0│ │Slot 1│ │Slot 2│ ... │Slot63│ │ + │ │Slot 0│ │Slot 1│ │Slot 2│ ... │Slot15│ │ │ └──┬───┘ └──┬───┘ └──┬───┘ └──┬───┘ │ │ │ │ │ │ │ - │ rx_flags[0] rx_flags[1] ... rx_flags[63] │ + │ rx_flags[0] rx_flags[1] ... rx_flags[15] │ └─────┼────────┼────────┼───────────────┼────────────┘ │ │ │ │ ▼ ▼ ▼ ▼ ┌─────────────────────────────────────────────────────┐ - │ GPU Persistent Dispatcher Kernel │ + │ Host-Side Dispatcher Thread │ │ │ - │ Polls rx_flags[] ──► Looks up function_id │ - │ ──► Checks backpressure ──► Launches CUDA Graph │ + │ Polls rx_flags[] ──► Finds free worker (idle_mask)│ + │ ──► DMA copy (pre_launch_fn) ──► cudaGraphLaunch │ └──────────┬──────────┬──────────┬──────────┬─────────┘ │ │ │ │ ▼ ▼ ▼ ▼ ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ PreDecoder 0 │ │PreDec. 1 │ │PreDec. 2 │ │PreDec. 3 │ - │ (CUDA Graph) │ │(CUDAGraph│ │(CUDAGraph│ │(CUDAGraph│ + │ PreDecoder 0 │ │PreDec. 1 │ │ ... │ │PreDec. 7 │ + │ (CUDA Graph) │ │(CUDAGraph│ │ │ │(CUDAGraph│ │ │ │ │ │ │ │ │ - │ Input Kern │ │ │ │ │ │ │ - │ ──► TRT ──► │ │ ... │ │ ... │ │ ... │ - │ Output Kern │ │ │ │ │ │ │ + │ TRT Infer │ │ ... │ │ ... │ │ ... │ + │ DMA Output │ │ │ │ │ │ │ + │ Signal Kern │ │ │ │ │ │ │ └──────┬───────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ │ │ │ │ - │ (mapped pinned memory: ready_flags, outputs) + │ (mapped pinned memory: ready_flags, h_predecoder_outputs_) ▼ ▼ ▼ ▼ ┌─────────────────────────────────────────────────────┐ - │ Polling Thread (incoming_polling_loop) │ - │ Round-robins all predecoders, dispatches to pool │ + │ Predecoder Workers (1:1 with GPU streams) │ + │ CAS(1,2) on ready_flags → copy output → enqueue │ + │ Release predecoder → return DEFERRED_COMPLETION │ └──────────┬──────────────────────────────────────────┘ - │ + │ PyMatchQueue (mutex + condvar) ▼ - ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ Worker 0 │ │ Worker 1 │ │ Worker 2 │ │ Worker 3 │ - │ (thread pool)│ │(thd pool)│ │(thd pool)│ │(thd pool)│ - │ │ │ │ │ │ │ │ - │ PyMatching 0 │ │PyMatch 1 │ │PyMatch 2 │ │PyMatch 3 │ - │ (own decoder)│ │(own dec) │ │(own dec) │ │(own dec) │ - │ Write RPC │ │Write RPC │ │Write RPC │ │Write RPC │ - │ Set tx_flag │ │Set tx_flg│ │Set tx_flg│ │Set tx_flg│ - └──────┬───────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ - │ │ │ │ - └──────────────┼────────────┼────────────┘ + ┌──────────────┐ ┌──────────┐ ┌──────────────┐ + │ PyMatch 0 │ │PyMatch 1 │ ... │ PyMatch 15 │ + │ (thread pool)│ │(thd pool)│ │ (thread pool) │ + │ │ │ │ │ │ + │ PyMatching │ │PyMatch │ │ PyMatching │ + │ (own decoder)│ │(own dec) │ │ (own decoder) │ + │ Write RPC │ │Write RPC │ │ Write RPC │ + │ complete_ │ │complete_ │ │ complete_ │ + │ deferred() │ deferred() │ │ deferred() │ + └──────┬───────┘ └────┬─────┘ └────┬──────────┘ + │ │ │ + └──────────────┼──────────────────┘ ▼ - tx_flags[slot] ──► FPGA + ┌─────────────────────────────────────────────────────┐ + │ Consumer Thread │ + │ Scans tx_flags[] ──► completion_handler ──► clear │ + └─────────────────────────────────────────────────────┘ + tx_flags[slot] ──► Producer can reuse slot ``` ### Key Design Decisions -1. **CUDA Graphs everywhere** -- Both the dispatcher kernel and every predecoder instance are captured as CUDA Graphs. The dispatcher graph is instantiated with `cudaGraphInstantiateFlagDeviceLaunch`, enabling it to launch child predecoder graphs from device code via `cudaGraphLaunch(..., cudaStreamGraphFireAndForget)`. +1. **Host-side dispatcher with dynamic worker pool** -- The dispatcher runs as a dedicated CPU thread, polling `rx_flags` and dynamically allocating GPU workers via an atomic `idle_mask` bitmask. This replaced a device-side persistent kernel that hit a CUDA 128-launch limit. -2. **Mapped pinned memory for all CPU-GPU communication** -- `cudaHostAllocMapped` provides a single address space visible to both CPU and GPU without explicit copies. GPU writes are made visible via `__threadfence_system()`; CPU reads are ordered via `std::atomic_thread_fence(std::memory_order_acquire)`. +2. **CUDA Graphs for inference** -- Each predecoder instance has a pre-captured CUDA graph containing TRT inference, output DMA copy, and a signal kernel. Input data is injected via a `pre_launch_fn` DMA callback before graph launch (since the source address is dynamic). -3. **N-deep circular queue between GPU and CPU** -- Rather than a single handoff slot, each predecoder maintains a circular buffer of depth N (default 16), allowing the GPU to pipeline multiple inferences before the CPU consumes them. +3. **Mapped pinned memory for GPU→CPU handoff** -- `cudaHostAllocMapped` provides a single address space visible to both CPU and GPU without explicit copies. GPU writes are made visible via libcu++ system-scope atomics with release semantics; CPU reads use acquire semantics. -4. **Dispatcher-level backpressure** -- The dispatcher checks a predecoder's queue state *before* launching its graph. If the queue is full, the packet stays in the ring buffer and the dispatcher moves on to service other slots. +4. **Queue depth 1 per predecoder** -- Each `AIPreDecoderService` has a single in-flight inference slot. Deeper queues were found to add complexity without measurable throughput benefit, since 8 parallel streams already exceed the GPU's throughput capacity. -5. **ONNX model support with engine caching** -- The `AIDecoderService` accepts either a pre-built `.engine` file or an `.onnx` model. When given an ONNX file, it builds a TensorRT engine at runtime and optionally saves it to disk via the `engine_save_path` parameter. On subsequent runs, the cached `.engine` file is loaded directly, skipping the expensive autotuner phase (startup drops from ~15s to ~4s). +5. **Decoupled predecoder and PyMatching workers** -- GPU polling threads release the predecoder stream immediately after copying output (~10 µs), then hand off to a separate PyMatching thread pool via `PyMatchQueue`. This prevents slow CPU decodes (~224 µs) from blocking GPU dispatch. -6. **Per-worker PyMatching decoder pool** -- Each thread pool worker gets its own pre-allocated PyMatching decoder instance via `thread_local` assignment. This eliminates mutex contention on the decode path (previous single-decoder + mutex design was ~2.4x slower). +6. **ONNX model support with engine caching** -- The `AIDecoderService` accepts either a pre-built `.engine` file or an `.onnx` model. When given an ONNX file, it builds a TensorRT engine at runtime and optionally saves it to disk via the `engine_save_path` parameter. -7. **Type-agnostic I/O buffers** -- All TRT I/O buffers use `void*` rather than `float*`, supporting INT32 models natively without type casting on the GPU. +7. **Per-worker PyMatching decoder pool** -- Each PyMatching thread gets its own pre-allocated decoder instance via `thread_local` assignment. This eliminates mutex contention on the decode path. + +8. **Type-agnostic I/O buffers** -- All TRT I/O buffers use `void*` rather than `float*`, supporting uint8 and INT32 models natively without type casting. + +9. **Stim-derived parity check matrix** -- The PyMatching decoders are initialized from a full parity check matrix (`H`) and observable matrix (`O`) exported from Stim, rather than the `cudaq-qec` surface code's per-slice `H_z`. This enables full-H decoding with proper edge weighting via priors. --- @@ -142,21 +152,25 @@ The critical constraint is **zero-copy, zero-allocation** on the hot path. Every ### 4.1 Ring Buffer & RPC Protocol -**Files**: `dispatch_kernel_launch.h` (protocol), test harness (allocation) +**Files**: `dispatch_kernel_launch.h` (protocol), `cudaq_realtime.h` (C API), `realtime_pipeline.cu` (RingBufferManager) -The ring buffer is the communication channel between the FPGA (or test harness) and the GPU. It consists of: +The ring buffer is the communication channel between the producer (FPGA or test harness) and the GPU. It consists of: | Buffer | Type | Size | Purpose | |--------|------|------|---------| -| `rx_flags[N]` | `volatile uint64_t*` | N slots | Non-zero = data ready; value is pointer to slot data | -| `tx_flags[N]` | `volatile uint64_t*` | N slots | Non-zero = response ready; acknowledges to FPGA | -| `rx_data` | `uint8_t*` | N x SLOT_SIZE | Slot payload area | +| `rx_flags[N]` | `cuda::atomic` | N slots | Non-zero = data ready; value is pointer to slot data | +| `tx_flags[N]` | `cuda::atomic` | N slots | Non-zero = response ready; acknowledges to consumer | +| `rx_data` | `uint8_t*` | N x SLOT_SIZE | Slot payload area (mapped pinned) | Each slot carries an **RPC message** in a packed wire format: ``` -Request: [RPCHeader: magic(4) | function_id(4) | arg_len(4)] [payload: arg_len bytes] -Response: [RPCResponse: magic(4) | status(4) | result_len(4)] [payload: result_len bytes] +Request: [RPCHeader: magic(4) | function_id(4) | arg_len(4) | request_id(4) | ptp_timestamp(8)] + [payload: arg_len bytes] + Total header: 24 bytes (CUDAQ_RPC_HEADER_SIZE) + +Response: [RPCResponse: magic(4) | status(4) | result_len(4)] + [payload: result_len bytes] ``` The `function_id` is an FNV-1a hash of the target function name, enabling the dispatcher to route requests to different predecoder instances. @@ -170,104 +184,64 @@ struct __attribute__((packed)) DecodeResponse { }; ``` -### 4.2 GPU Persistent Dispatcher Kernel +### 4.2 Host-Side Dispatcher -**File**: `realtime/lib/daemon/dispatcher/dispatch_kernel.cu` +**File**: `realtime/lib/daemon/dispatcher/host_dispatcher.cu` -The dispatcher is a **persistent kernel** -- it runs for the lifetime of the system, spinning on the ring buffer. Two variants exist: +The dispatcher is a **spin-polling host thread** running on a dedicated CPU core. It monitors the ring buffer's `rx_flags` and dispatches work to GPU streams. -| Variant | Function | Graph Launch | Use Case | -|---------|----------|-------------|----------| -| `dispatch_kernel_device_call_only` | Direct device function calls | No | Legacy / simple RPC | -| `dispatch_kernel_with_graph` | Device function calls + CUDA Graph launch | Yes (sm_80+) | AI predecoder pipeline | +#### Worker Pool -#### Dispatch Loop (Graph Variant) +The dispatcher manages a pool of `num_workers` GPU streams. Each worker is described by a `cudaq_host_dispatch_worker_t`: -``` -while (!shutdown): - rx_value = rx_flags[current_slot] - if rx_value != 0: - header = parse_rpc_header(rx_value) - - if header.magic is invalid: - consume and clear slot ← garbage data - - else: - entry = lookup(header.function_id) - - if entry is DEVICE_CALL: - call device function inline - write RPC response - set tx_flags - consume slot - - elif entry is GRAPH_LAUNCH: - if backpressure_check(entry): - skip (do NOT consume) ← retry later - else: - write mailbox - cudaGraphLaunch(fire-and-forget) - consume slot - (tx_flags set later by CPU) - - else: - consume slot ← unknown function - - advance current_slot ← always advance - KernelType::sync() +```c +typedef struct { + cudaGraphExec_t graph_exec; + cudaStream_t stream; + uint32_t function_id; + void (*pre_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* pre_launch_data; + void (*post_launch_fn)(void* user_data, void* slot_dev, cudaStream_t stream); + void* post_launch_data; +} cudaq_host_dispatch_worker_t; ``` -The `packet_consumed` flag controls whether `rx_flags[slot]` is cleared. For backpressured graph launches, the slot is left intact so the dispatcher retries on the next pass. The slot pointer **always** advances to avoid head-of-line blocking. +#### Dispatch Loop -**Note on slot scanning**: The dispatcher only advances `current_slot` when a non-empty slot is found. When a slot is empty, it spins on that same slot. This means having many empty slots (e.g., 64 slots with only 4 in use) does not cause scanning overhead, but the dispatcher does park on a slot waiting for it to be filled. +``` +while (!shutdown): + rx_value = rx_flags[current_slot].load(acquire) + if rx_value == 0: QEC_CPU_RELAX(); continue -#### Function Table Entry + // Find free worker via idle_mask bitmask + worker_id = ffsll(idle_mask.load(acquire)) - 1 + if worker_id < 0: QEC_CPU_RELAX(); continue -Each registered function is described by a `cudaq_function_entry_t`: + // Claim worker, tag origin slot + idle_mask.fetch_and(~(1ULL << worker_id), release) + inflight_slot_tags[worker_id] = current_slot -```c -typedef struct { - union { - void *device_fn_ptr; // DEVICE_CALL handler - cudaGraphExec_t graph_exec; // GRAPH_LAUNCH handler - } handler; - uint32_t function_id; // FNV-1a hash - uint8_t dispatch_mode; // DEVICE_CALL or GRAPH_LAUNCH - uint8_t reserved[3]; - cudaq_handler_schema_t schema; // argument/result type descriptors - - // Graph-launch backpressure metadata: - uint32_t mailbox_idx; // index into global_mailbox_bank - int *d_queue_idx; // → predecoder's queue tail - volatile int *d_ready_flags; // → predecoder's ready flags - int *d_inflight_flag; // → predecoder's inflight flag -} cudaq_function_entry_t; -``` + // Pre-launch: DMA input to TRT buffer + if pre_launch_fn: pre_launch_fn(data, dev_ptr, stream) -#### Graph-Based Dispatch Context + // Launch CUDA graph + cudaGraphLaunch(graph_exec, stream) -The dispatcher kernel itself runs inside a CUDA Graph (`cudaq_dispatch_graph_context`), instantiated with `cudaGraphInstantiateFlagDeviceLaunch`. This is **required** for the kernel to call `cudaGraphLaunch()` from device code. The lifecycle is: + // Mark in-flight, consume slot + tx_flags[current_slot].store(0xEEEE..., release) + rx_flags[current_slot].store(0, release) -``` -cudaq_create_dispatch_graph_regular() - → cudaGraphCreate - → cudaGraphAddKernelNode (dispatch_kernel_with_graph) - → cudaGraphInstantiate (with DeviceLaunch flag) - → cudaGraphUpload - → cudaStreamSynchronize - -cudaq_launch_dispatch_graph() - → cudaGraphLaunch (from host) - -cudaq_destroy_dispatch_graph() - → cudaGraphExecDestroy + cudaGraphDestroy + // Post-launch callback (GPU-only mode) + if post_launch_fn: post_launch_fn(...) + + current_slot = (current_slot + 1) % num_slots ``` ### 4.3 AIDecoderService (Base Class) **Files**: `ai_decoder_service.h`, `ai_decoder_service.cu` -The base class manages the TensorRT lifecycle and provides a default "autonomous" CUDA Graph that reads from a mailbox, runs inference, and writes results back to the ring buffer -- all on the GPU. +The base class manages the TensorRT lifecycle. #### Constructor @@ -282,13 +256,9 @@ The constructor accepts either a `.engine` file (fast deserialization) or an `.o - **Engine loading**: Deserializes a TensorRT `.engine` file or builds from `.onnx` via `NvOnnxParser`. - **Engine caching**: Saves built engines to disk via `engine_save_path` for fast reload. -- **Dynamic tensor binding**: Enumerates all I/O tensors from the engine, storing metadata in `TensorBinding` structs. Supports models with multiple outputs (e.g., `residual_detectors` + `logical_frame`). -- **Buffer allocation**: Allocates persistent device buffers sized to the engine's static tensor shapes. Uses `void*` for type-agnostic I/O (INT32, FP32, etc.). -- **Graph capture**: The default `capture_graph()` creates a 3-node graph: - -``` -gateway_input_kernel ──► TRT enqueueV3 ──► gateway_output_kernel -``` +- **Dynamic tensor binding**: Enumerates all I/O tensors from the engine, storing metadata in `TensorBinding` structs. Supports models with multiple outputs. +- **Buffer allocation**: Allocates persistent device buffers sized to the engine's static tensor shapes. Uses `void*` for type-agnostic I/O. +- **Dynamic batch handling**: Automatically pins dynamic dimensions to 1 via optimization profiles. #### Dynamic Tensor Binding @@ -302,61 +272,42 @@ struct TensorBinding { std::vector all_bindings_; ``` -During `setup_bindings()`, all I/O tensors are enumerated from the engine. The first input becomes `d_trt_input_`, the first output becomes `d_trt_output_` (the primary output forwarded to the CPU), and any additional outputs are allocated as auxiliary buffers in `d_aux_buffers_`. - ### 4.4 AIPreDecoderService (Predecoder + CPU Handoff) **Files**: `ai_predecoder_service.h`, `ai_predecoder_service.cu` -This derived class replaces the base class's autonomous graph with one that hands inference results off to the CPU for further processing by PyMatching. +This derived class replaces the base class's autonomous graph with one that hands inference results off to the CPU. #### Constructor ```cpp AIPreDecoderService(const std::string& engine_path, void** device_mailbox_slot, - int queue_depth = 16, const std::string& engine_save_path = ""); + int queue_depth = 1, const std::string& engine_save_path = ""); ``` #### CUDA Graph Structure ``` -predecoder_input_kernel ──► TRT enqueueV3 ──► predecoder_output_kernel +[Pre-launch DMA: ring buffer → d_trt_input (host-side callback)] + ↓ +TRT enqueueV3 (AI predecoder inference) + ↓ +cudaMemcpyAsync D2D (d_trt_output_ → h_predecoder_outputs_) + ↓ +predecoder_signal_ready_kernel (ready_flags.store(1, release)) ``` -**`predecoder_input_kernel`**: -1. Reads the current queue tail index (`d_queue_idx`). -2. Performs a defense-in-depth bounded spin on `d_ready_flags[slot]` (primary backpressure is at the dispatcher level). -3. If the slot is free: saves the FPGA ring buffer pointer to `d_ring_ptrs[slot]` and copies syndrome data to the TRT input buffer. -4. If the spin times out: sets `ring_ptr = nullptr`, causing all threads to abort safely without corrupting the queue. - -**`predecoder_output_kernel`**: -1. Copies TRT output to `d_outputs[slot]` (mapped pinned memory, directly readable by CPU). Output data is `void*` (typically INT32 residual detectors). -2. Issues `__threadfence_system()` to ensure writes are visible over PCIe. -3. Sets `d_ready_flags[slot] = 1` (signals the CPU). -4. Advances `d_queue_idx` circularly. +The input DMA copy is NOT in the graph — it's issued by the `pre_launch_fn` callback on the worker stream before `cudaGraphLaunch`, because the source address (ring buffer slot) changes each invocation. -#### N-Deep Circular Queue - -Each `AIPreDecoderService` instance owns a private circular queue: - -``` - GPU writes → ← CPU reads - ┌───┬───┬───┬───┬───┬───┬───┬───┐ - │ 0 │ 1 │ 2 │ 3 │ 4 │...│14 │15 │ ready_flags[16] - └───┴───┴───┴───┴───┴───┴───┴───┘ - ▲ ▲ - │ │ - d_queue_idx cpu_poll_idx_ - (GPU tail) (CPU head) -``` +#### Per-Predecoder Buffers (queue_depth=1) | Buffer | Host Pointer | Device Pointer | Purpose | |--------|-------------|---------------|---------| -| `h_ready_flags_` | CPU reads | `d_ready_flags_` GPU writes | 1 = job ready, 0 = slot free | -| `h_ring_ptrs_` | CPU reads | `d_ring_ptrs_` GPU writes | Original FPGA buffer address per job | -| `h_outputs_` | CPU reads | `d_outputs_` GPU writes | TRT inference output (`void*`, typically INT32) | +| `h_ready_flags_` | CPU reads/writes | `d_ready_flags_` GPU writes | 1 = job ready, 0 = slot free | +| `h_ring_ptrs_` | CPU reads | `d_ring_ptrs_` GPU writes | Original ring buffer address per job | +| `h_predecoder_outputs_` | CPU reads | `d_predecoder_outputs_` GPU writes | TRT inference output (`void*`, uint8) | -All three buffers are allocated with `cudaHostAllocMapped` and mapped to device pointers via `cudaHostGetDevicePointer`. The GPU writes through the device pointers; the CPU reads through the host pointers. No explicit `cudaMemcpy` is ever issued on the hot path. +All buffers are allocated with `cudaHostAllocMapped` and mapped to device pointers via `cudaHostGetDevicePointer`. #### CPU Interface @@ -365,29 +316,47 @@ bool poll_next_job(PreDecoderJob& out_job); void release_job(int slot_idx); ``` -`poll_next_job` checks `h_ready_flags_[cpu_poll_idx_]`. If set, it issues an acquire fence (for ARM portability), populates the `PreDecoderJob` struct with the slot index, ring buffer pointer, and a pointer into the inference output buffer, then advances the poll index. +`poll_next_job` performs CAS(expected=1, desired=2) on `ready_flags[0]`. If successful, it populates the `PreDecoderJob` struct with the slot index, ring buffer pointer, and inference output pointer. -`release_job` uses `__atomic_store_n(..., __ATOMIC_RELEASE)` to clear the flag, ensuring that all prior CPU writes (RPC response data) are visible before the GPU is allowed to reuse the slot. +`release_job` stores 0 to the ready flag with release semantics, allowing the GPU to reuse the slot. -### 4.5 CPU Worker Threads & PyMatching Decoder Pool +### 4.5 Decoupled CPU Worker Architecture **File**: `test_realtime_predecoder_w_pymatching.cpp` -The CPU-side processing uses a **polling thread + thread pool** architecture: +The CPU-side processing uses a **two-tier decoupled architecture**: -1. **Polling thread** (`incoming_polling_loop`): A single dedicated thread round-robins all predecoder instances, calling `poll_next_job()` on each. When a job is found, it is dispatched to the thread pool. -2. **Thread pool** (`cudaq::qec::utils::ThreadPool`): A pool of `num_workers` threads (default 4) that execute `pymatching_worker_task` jobs concurrently. +#### Tier 1: Predecoder Workers (GPU Polling) -#### PyMatching Decoder Pool +Pipeline worker threads (1:1 with GPU streams) run in the `RealtimePipeline::worker_loop`. Each iteration: + +1. Polls `poll_next_job()` (CAS on ready_flags). +2. Copies inference output to `deferred_outputs[origin_slot]` (per-slot buffer). +3. Computes syndrome density metrics. +4. Releases predecoder via `release_job(0)`. +5. Enqueues `PyMatchJob{origin_slot, request_id, ring_buffer_ptr}` to `PyMatchQueue`. +6. Returns `DEFERRED_COMPLETION` → pipeline releases `idle_mask`, skips `tx_flags`. + +**Hold time**: ~10 µs (copy + release + enqueue). + +#### Tier 2: PyMatching Workers (CPU Decode) + +A separate thread pool (16 workers for d13_r104) processes `PyMatchJob`s: + +1. Pops job from `PyMatchQueue` (blocks if empty). +2. Acquires per-thread PyMatching decoder via `thread_local` lock-free assignment. +3. Runs PyMatching MWPM decode over the full parity check matrix. +4. Writes `RPCResponse + DecodeResponse` into the ring buffer slot. +5. Calls `pipeline.complete_deferred(origin_slot)` → stores host address into `tx_flags`. -Each worker thread gets its own pre-allocated PyMatching decoder via `thread_local` assignment: +**Decode time**: ~224 µs average. + +#### PyMatching Decoder Pool ```cpp struct DecoderContext { std::vector> decoders; std::atomic next_decoder_idx{0}; - int z_stabilizers = 0; - int spatial_slices = 0; cudaq::qec::decoder* acquire_decoder() { thread_local int my_idx = next_decoder_idx.fetch_add(1); @@ -396,32 +365,28 @@ struct DecoderContext { }; ``` -Decoders are constructed at startup from the surface code's Z parity check matrix (`H_z`) using the `cudaq-qec` plugin system: +Decoders are constructed at startup from the Stim-derived parity check matrix (`H`) with edge priors: ```cpp -auto surface_code = cudaq::qec::get_code("surface_code", {{"distance", d}}); -auto H_z = surface_code->get_parity_z(); -for (int i = 0; i < num_workers; ++i) - decoders.push_back(cudaq::qec::decoder::get("pymatching", H_z, pm_params)); +auto H_full = stim_data.H.to_dense(); +pm_params.insert("error_rate_vec", stim_data.priors); +for (int i = 0; i < num_decode_workers; ++i) + decoders.push_back(cudaq::qec::decoder::get("pymatching", H_full, pm_params)); ``` -The `merge_strategy` parameter is set to `"smallest_weight"` to handle parallel edges in the surface code's PCM. - -#### Worker Function (`pymatching_worker_task`) +#### Observable Projection -Each worker invocation: +When the observable matrix (`O`) is available, corrections are projected onto the logical observable: -1. **Acquires a decoder** from the pool via `ctx->acquire_decoder()` (lock-free, `thread_local`). -2. **Slices residual detectors** into `spatial_slices` groups of `z_stabilizers` each. For d=13, this is 26 slices of 84 stabilizers. -3. **Runs PyMatching** on each slice: converts INT32 residual detectors to `std::vector`, calls `decoder->decode(syndrome)`. -4. **Accumulates corrections** and convergence status across all slices. -5. **Writes RPC Response**: Formats `DecodeResponse{total_corrections, converged}` into the original ring buffer slot. -6. **Releases GPU Queue Slot**: Calls `predecoder->release_job(slot_idx)`. -7. **Acknowledges to FPGA**: Sets `tx_flags[slot]` to signal completion. - -#### Timing Instrumentation +```cpp +int obs_parity = 0; +for (size_t e = 0; e < result.result.size(); ++e) + if (result.result[e] > 0.5 && obs_row[e]) + obs_parity ^= 1; +total_corrections += obs_parity; +``` -The worker function includes `std::chrono::high_resolution_clock` instrumentation that accumulates PyMatching decode time and total worker time into atomic counters on the `DecoderContext`. These are reported at the end of the run as a latency breakdown. +The total corrections include both the predecoder's logical prediction (`output[0]`) and PyMatching's correction parity. --- @@ -430,49 +395,55 @@ The worker function includes `std::chrono::high_resolution_clock` instrumentatio The following traces a single syndrome packet through the entire pipeline: ``` -Step Location Action -──── ──────── ────────────────────────────────────────────────── - 1. Host/FPGA Writes RPCHeader + INT32 measurements into rx_data[slot] - 2. Host/FPGA Sets rx_flags[slot] = pointer_to_slot_data - ── memory fence ── - 3. GPU Dispatcher reads rx_flags[slot] (non-zero → work available) - 4. GPU Parses RPCHeader, extracts function_id - 5. GPU Looks up function_id in function_table → finds entry - 6. GPU Backpressure check: reads entry->d_queue_idx, entry->d_ready_flags - If queue full: skip, leave rx_flags intact, advance slot pointer - 7. GPU Writes data_buffer pointer to global_mailbox_bank[mailbox_idx] - ── __threadfence_system() ── - 8. GPU cudaGraphLaunch(entry->graph_exec, cudaStreamGraphFireAndForget) - 9. GPU Clears rx_flags[slot] = 0, increments packet count - ── slot consumed ── - - ── Inside Predecoder CUDA Graph ── -10. GPU predecoder_input_kernel: - Reads *mailbox → ring_buffer_ptr - Reads d_queue_idx → slot_idx - Copies INT32 measurements from ring buffer → d_trt_input_ -11. GPU TRT enqueueV3: Neural network inference (INT32 → INT32) -12. GPU predecoder_output_kernel: - Copies d_trt_output_ → h_outputs_[slot_idx] (mapped pinned) - Saves ring_buffer_ptr → h_ring_ptrs_[slot_idx] - ── __threadfence_system() ── - Sets h_ready_flags_[slot_idx] = 1 - Advances d_queue_idx - ── Graph complete ── - -13. CPU Polling thread: h_ready_flags_[poll_idx] == 1 → job found - Dispatches job to thread pool -14. CPU Worker thread: acquires per-thread PyMatching decoder - ── std::atomic_thread_fence(acquire) ── -15. CPU Worker reads h_ring_ptrs_[poll_idx], h_outputs_[poll_idx] -16. CPU Worker slices residual_detectors into spatial_slices - For each slice: PyMatching MWPM decode → corrections -17. CPU Worker writes RPCResponse + DecodeResponse into ring_buffer_ptr - ── std::atomic_thread_fence(release) ── -18. CPU Worker: release_job(slot_idx) → clears h_ready_flags_[slot_idx] - ── __atomic_store_n(..., RELEASE) ── -19. CPU Worker: Sets tx_flags[ring_slot] = rx_value - ── FPGA/Host sees response ── +Step Location Action +──── ──────── ────────────────────────────────────────────────── + 1. Producer Writes RPCHeader (24 bytes) + uint8 detectors into rx_data[slot] + 2. Injector Sets rx_flags[slot] = host_ptr (release) + ── release fence ── + 3. Dispatcher Reads rx_flags[slot] (acquire), sees data + 4. Dispatcher Parses RPCHeader, extracts function_id + 5. Dispatcher Scans idle_mask via ffsll → finds free worker W + 6. Dispatcher Marks bit W busy, saves inflight_slot_tags[W] = slot + 7. Dispatcher Writes dev_ptr to h_mailbox_bank[W], __sync_synchronize() + 8. Dispatcher pre_launch_fn: h_ring_ptrs[0] = dev_ptr, + cudaMemcpyAsync(d_trt_input, dev_ptr+24, input_size, D2D, stream[W]) + 9. Dispatcher cudaGraphLaunch(graph_exec[W], stream[W]) +10. Dispatcher tx_flags[slot].store(0xEEEE..., release) [IN_FLIGHT] +11. Dispatcher rx_flags[slot].store(0, release), advance slot + ── slot consumed ── + + ── Inside CUDA Graph ── +12. GPU TRT enqueueV3: AI predecoder inference (uint8 → uint8) +13. GPU cudaMemcpyAsync D2D: d_trt_output_ → h_predecoder_outputs_ +14. GPU predecoder_signal_ready_kernel: ready_flags.store(1, release) + ── Graph complete ── + +15. PreDec Worker CAS(1, 2) on ready_flags[0] (acquire), wins +16. PreDec Worker Copies h_predecoder_outputs_ → deferred_outputs[slot] +17. PreDec Worker Computes syndrome density (input vs output nonzero counts) +18. PreDec Worker release_job(0): ready_flags.store(0, release) +19. PreDec Worker Extracts request_id from RPCHeader +20. PreDec Worker Enqueues PyMatchJob{slot, request_id, ring_buffer_ptr} +21. PreDec Worker Returns DEFERRED_COMPLETION +22. Pipeline idle_mask.fetch_or(1<d_queue_idx; -volatile int* d_ready_flags = entry->d_ready_flags; - -int current_tail = *d_queue_idx; -if (d_ready_flags[current_tail] == 1) { - // Queue full: skip this packet, do NOT clear rx_flags - packet_consumed = false; -} -``` +Backpressure prevents the producer from overwriting ring buffer slots that are still in use. It operates through **slot availability**: -If the queue is full, the packet stays in the ring buffer. The dispatcher advances to the next slot, so **other decoders are not blocked** (no head-of-line blocking). On the next pass through the ring buffer, the dispatcher will retry the skipped slot. +### Ring Buffer Level (Primary) -### Level 2: Predecoder Input Kernel (Defense-in-Depth) +The `RingBufferInjector::try_submit()` checks if both `rx_flags[slot] == 0` AND `tx_flags[slot] == 0` before writing. If either is non-zero, the slot is busy: +- `rx_flags != 0`: Dispatcher hasn't consumed the slot yet. +- `tx_flags != 0`: Either IN_FLIGHT (`0xEEEE`) or completed (response addr) but not yet harvested by consumer. -If the dispatcher's backpressure check is bypassed (e.g., backpressure pointers not wired up, or a race condition), the predecoder input kernel has a **bounded spin** as a safety net: +The blocking `submit()` spins with `QEC_CPU_RELAX()` and increments a `backpressure_stalls` counter. -```c -int timeout_counter = 0; -while (d_ready_flags[slot_idx] == 1 && timeout_counter < 1000000) { - timeout_counter++; -} - -if (d_ready_flags[slot_idx] == 1) { - ring_ptr = nullptr; // Abort safely, don't corrupt the slot -} -``` +### Worker Level (Implicit) -On timeout, the kernel nullifies `ring_ptr`, which causes all threads to return without writing any data. This prevents silent corruption but means the syndrome is effectively dropped. In a correctly configured system, this path should never be reached. +If all `idle_mask` bits are 0 (all workers busy), the dispatcher spins on the current slot without advancing. This provides natural backpressure since `rx_flags[slot]` remains non-zero, preventing the producer from overwriting that slot. --- ## 8. Memory Ordering & Synchronization -The pipeline involves three independent agents (FPGA/Host, GPU, CPU) communicating through shared memory. Correctness depends on careful ordering: +The pipeline involves three independent agents (Producer, GPU, CPU workers/consumer) communicating through shared memory. All synchronization uses **libcu++ system-scope atomics** — no `volatile`, no `__threadfence_system()`. -### GPU → CPU (Predecoder Output → Poll) +### GPU → CPU (Signal Kernel → Worker Poll) | Agent | Operation | Ordering Primitive | |-------|-----------|-------------------| -| GPU | Write `h_outputs_[slot]` and `h_ring_ptrs_[slot]` | (normal device writes to mapped memory) | -| GPU | `__threadfence_system()` | Ensures all prior writes are visible over PCIe | -| GPU | Write `h_ready_flags_[slot] = 1` | (the "publish" signal) | -| CPU | Read `h_ready_flags_[slot] == 1` | (volatile read) | -| CPU | `std::atomic_thread_fence(acquire)` | Prevents CPU from speculatively reading data before the flag | -| CPU | Read `h_outputs_[slot]`, `h_ring_ptrs_[slot]` | (safe: ordered after acquire) | +| GPU | Write `h_predecoder_outputs_` (DMA copy in graph) | (ordered by graph node dependencies) | +| GPU | `ready_flags[0].store(1, release)` | system-scope atomic release | +| CPU Worker | `ready_flags[0].compare_exchange_strong(1, 2, acquire, relaxed)` | acquire on success, relaxed on failure | +| CPU Worker | Read `h_predecoder_outputs_` | (safe: ordered after acquire) | + +### CPU → GPU (Job Release → Stream Reuse) -On x86, the acquire fence is technically a no-op (loads are not reordered with loads), but it is necessary for correctness on ARM (e.g., Grace Hopper). +| Agent | Operation | Ordering Primitive | +|-------|-----------|-------------------| +| CPU Worker | Copy output to deferred buffer | (normal stores) | +| CPU Worker | `ready_flags[0].store(0, release)` | release ensures copy visible | +| GPU | `ready_flags[0].load(...)` sees 0 | GPU can write new results | -### CPU → GPU (Job Release → Queue Reuse) +### Worker → Consumer (tx_flags) | Agent | Operation | Ordering Primitive | |-------|-----------|-------------------| -| CPU | Write RPC response to ring buffer | (normal stores) | -| CPU | `__atomic_store_n(&h_ready_flags_[slot], 0, __ATOMIC_RELEASE)` | Ensures response writes are visible before flag is cleared | -| GPU | Read `d_ready_flags[slot] == 0` | (volatile read from mapped memory) | -| GPU | Overwrites `d_ring_ptrs[slot]`, `d_outputs[slot]` | (safe: flag was 0) | +| PyMatch Worker | Write RPC response to ring buffer | (normal stores) | +| PyMatch Worker | `tx_flags[slot].store(addr, release)` | release ensures response visible | +| Consumer | `tx_flags[slot].load(acquire)` | acquire sees response data | -### Host → GPU (Ring Buffer Signaling) +### Consumer → Producer (Slot Recycling) | Agent | Operation | Ordering Primitive | |-------|-----------|-------------------| -| Host/Test | Write RPC header + payload to `rx_data[slot]` | (normal stores) | -| Host/Test | `__sync_synchronize()` / memory barrier | Full fence before flag write | -| Host/Test | Write `rx_flags[slot] = pointer` | (the "publish" signal) | -| GPU | Read `rx_flags[slot] != 0` | (volatile read from mapped memory) | +| Consumer | `slot_occupied[slot] = 0` | (normal store) | +| Consumer | `__sync_synchronize()` | full barrier | +| Consumer | `tx_flags[slot].store(0)`, `rx_flags[slot].store(0)` | slot free | +| Producer | `slot_available()` checks both flags == 0 | can reuse | --- -## 9. CUDA Graph Hierarchy +## 9. CUDA Graph Structure -The system uses a **two-level graph hierarchy**: +Each predecoder has a pre-captured, host-launched CUDA graph: ``` -Level 0: Dispatcher Graph (cudaq_dispatch_graph_context) - │ - │ Instantiated with cudaGraphInstantiateFlagDeviceLaunch - │ Contains: dispatch_kernel_with_graph (persistent kernel node) - │ - │ Device-side cudaGraphLaunch() ──► - │ - ├──► Level 1: PreDecoder Graph [0] - │ predecoder_input_kernel → TRT enqueueV3 → predecoder_output_kernel - │ - ├──► Level 1: PreDecoder Graph [1] - │ ... - ├──► Level 1: PreDecoder Graph [2] - │ ... - └──► Level 1: PreDecoder Graph [3] - ... + ┌──────────────────────────────────────────────────────┐ + │ Pre-Launch (host-side callback) │ + │ pre_launch_fn: │ + │ h_ring_ptrs[0] = slot_dev_ptr │ + │ cudaMemcpyAsync(d_trt_input, │ + │ slot_dev + 24, ← CUDAQ_RPC_HEADER_SIZE + │ input_size, D2D, stream) │ + └──────────────────────┬───────────────────────────────┘ + │ + ┌──────────────────────▼───────────────────────────────┐ + │ CUDA Graph (captured once) │ + │ │ + │ Node 1: TRT enqueueV3 │ + │ (or passthrough_copy_kernel in SKIP_TRT) │ + │ │ │ + │ Node 2: cudaMemcpyAsync D2D │ + │ d_trt_output_ → h_predecoder_outputs_ (mapped) │ + │ │ │ + │ Node 3: predecoder_signal_ready_kernel<<<1,1>>> │ + │ ready_flags.store(1, release) │ + └──────────────────────────────────────────────────────┘ ``` -**Level 0** must be instantiated with `cudaGraphInstantiateFlagDeviceLaunch` so that the persistent kernel running inside it can call `cudaGraphLaunch()` on **Level 1** graphs. Level 1 graphs are also instantiated with this flag and uploaded to the device. The launch mode is `cudaStreamGraphFireAndForget`, meaning the predecoder graph executes asynchronously without blocking the dispatcher. - -**Requirement**: Compute capability >= sm_80 (Ampere and later). Device-side graph launch is gated by `#if __CUDA_ARCH__ >= 800`. - -**Limitation**: `cudaStreamGraphFireAndForget` has a CUDA runtime limit on concurrent pending child graph launches (~128). The test limits `total_requests` to 100 to stay under this ceiling. +The graph is instantiated with `cudaGraphInstantiate(&graph_exec_, graph, 0)` for host-launch mode. No device-side graph launch is used. --- ## 10. Pipeline Configurations -The test supports multiple surface code distances via the `PipelineConfig` struct: +The test supports multiple surface code distances via the `PipelineConfig` struct. Model dimensions are derived automatically from TRT engine bindings: -| Config | Distance | Rounds | ONNX Model | Input Shape | Input Bytes | Residual Detectors | Z Stabilizers | Spatial Slices | Slot Size | -|--------|----------|--------|------------|-------------|-------------|-------------------|---------------|---------------|-----------| -| `d7` | 7 | 7 | `model1_d7_r7_unified_Z_batch1.onnx` | [1, 72, 7] | 2,016 | 336 | 24 | 14 | 4,096 | -| `d13` | 13 | 13 | `model1_d13_r13_unified_Z_batch1.onnx` | [1, 252, 13] | 13,104 | 2,184 | 84 | 26 | 16,384 | -| `d21` | 21 | 21 | `model1_d21_r21_unified_X_batch1.onnx` | [1, 660, 21] | 55,440 | 9,240 | 220 | 42 | 65,536 | -| `d31` | 31 | 31 | `model1_d31_r31_unified_Z_batch1.onnx` | [1, 1440, 31] | 178,560 | 29,760 | 480 | 62 | 262,144 | +| Config | Distance | Rounds | ONNX Model | Input (uint8) | Output (uint8) | Predecoders | PyMatch Workers | Slot Size | +|--------|----------|--------|------------|--------------|----------------|-------------|-----------------|-----------| +| `d7_r7` | 7 | 7 | `model1_d7_r7_unified_Z_batch1.onnx` | 504 | 505 | 16 | 32 | 1,024 | +| `d13_r13` | 13 | 13 | `predecoder_memory_d13_T13_X.onnx` | 3,276 | 3,277 | 16 | 32 | 4,096 | +| `d13_r104` | 13 | 104 | `predecoder_memory_d13_T104_X.onnx` | 17,472 | 17,473 | 8 | 16 | 32,768 | +| `d21_r21` | 21 | 21 | `model1_d21_r21_unified_X_batch1.onnx` | 13,860 | 13,861 | 16 | 32 | 16,384 | +| `d31_r31` | 31 | 31 | `model1_d31_r31_unified_Z_batch1.onnx` | 44,640 | 44,641 | 16 | 32 | 65,536 | -All models use **INT32** tensors for both input (measurements) and output (residual detectors, logical frame). +All models use **uint8** tensors for both input (detectors) and output (logical prediction + residual detectors). -The number of **spatial slices** is `residual_detectors / z_stabilizers`. PyMatching is called once per slice, with each slice containing one group of Z-stabilizer detector values. +The `slot_size` is computed as `round_up_pow2(CUDAQ_RPC_HEADER_SIZE + model_input_bytes)`. Usage: ```bash -./test_realtime_predecoder_w_pymatching d7 # default +./test_realtime_predecoder_w_pymatching d7 # default ./test_realtime_predecoder_w_pymatching d13 +./test_realtime_predecoder_w_pymatching d13_r104 104 20 # 104 µs rate, 20 sec ./test_realtime_predecoder_w_pymatching d21 ./test_realtime_predecoder_w_pymatching d31 ``` +Optional flags: +- `--data-dir /path/to/stim/data`: Load real test data for correctness verification. + ### Engine Caching -On first run with a given configuration, the ONNX model is compiled to a TensorRT engine and saved alongside the ONNX file (e.g., `model1_d13_r13_unified_Z_batch1.engine`). Subsequent runs detect the cached engine and skip the build phase. +On first run with a given configuration, the ONNX model is compiled to a TensorRT engine and saved alongside the ONNX file (e.g., `predecoder_memory_d13_T104_X.engine`). Subsequent runs detect the cached engine and skip the build phase. --- @@ -666,16 +637,20 @@ On first run with a given configuration, the ONNX model is compiled to a TensorR | File | Layer | Purpose | |------|-------|---------| -| `realtime/include/.../cudaq_realtime.h` | API | C API header: structs, enums, function declarations | +| `realtime/include/.../cudaq_realtime.h` | API | C API header: structs, enums, ring buffer helpers, `CUDAQ_RPC_HEADER_SIZE` | | `realtime/include/.../dispatch_kernel_launch.h` | API | RPC protocol structs (RPCHeader, RPCResponse), FNV-1a hash | -| `realtime/lib/.../dispatch_kernel.cu` | Runtime | Persistent dispatcher kernels + graph-based dispatch context | +| `realtime/include/.../host_dispatcher.h` | API | Host dispatcher C API: `cudaq_host_dispatcher_config_t`, `cudaq_host_dispatch_worker_t` | +| `realtime/lib/.../host_dispatcher.cu` | Runtime | Host-side dispatcher loop implementation | +| `realtime/lib/.../cudaq_realtime_api.cpp` | Runtime | Ring buffer C API implementation | +| `libs/qec/include/.../pipeline.h` | Pipeline | `RealtimePipeline`, `RingBufferInjector`, callbacks, `DEFERRED_COMPLETION` | +| `libs/qec/lib/.../realtime_pipeline.cu` | Pipeline | Pipeline implementation: `RingBufferManager`, worker/consumer loops, injector | | `libs/qec/include/.../ai_decoder_service.h` | QEC | Base class header: TRT lifecycle, dynamic tensor bindings, engine caching | -| `libs/qec/lib/.../ai_decoder_service.cu` | QEC | Base class impl: ONNX build, engine save/load, gateway kernels, graph capture | -| `libs/qec/include/.../ai_predecoder_service.h` | QEC | Derived class header: CPU handoff queue, `QEC_CPU_RELAX` macro | -| `libs/qec/lib/.../ai_predecoder_service.cu` | QEC | Derived class impl: predecoder kernels, circular queue, poll/release | -| `libs/qec/include/.../utils/thread_pool.h` | Util | Thread pool with optional core pinning | -| `libs/qec/include/.../utils/pipeline_benchmarks.h` | Util | Reusable latency/throughput benchmarking utility | -| `libs/qec/lib/.../test_realtime_predecoder_w_pymatching.cpp` | Test | End-to-end integration test with real ONNX + PyMatching | +| `libs/qec/lib/.../ai_decoder_service.cu` | QEC | Base class impl: ONNX build, engine save/load, graph capture | +| `libs/qec/include/.../ai_predecoder_service.h` | QEC | Derived class header: CPU handoff, `poll_next_job`/`release_job` | +| `libs/qec/lib/.../ai_predecoder_service.cu` | QEC | Derived class impl: signal kernel, output DMA, graph capture | +| `libs/qec/include/.../nvtx_helpers.h` | Util | NVTX profiling macros (`NVTX_PUSH`, `NVTX_POP`) | +| `libs/qec/lib/.../test_realtime_predecoder_w_pymatching.cpp` | Test | End-to-end benchmark with real ONNX + PyMatching + correctness verification | +| `libs/qec/unittests/test_realtime_pipeline.cu` | Test | GTest unit/integration tests (21 tests, SKIP_TRT mode) | --- @@ -683,80 +658,88 @@ On first run with a given configuration, the ONNX model is compiled to a TensorR | Parameter | Default | Description | |-----------|---------|-------------| -| `NUM_SLOTS` | 64 | Ring buffer slot count (Host ↔ GPU) | -| `slot_size` | Per-config (4096 - 262144) | Max payload per slot (RPCHeader + measurements + result) | -| `num_predecoders` | 4 | Parallel predecoder instances (TRT engines) | -| `queue_depth` | 16 | N-deep circular queue per predecoder | -| `num_workers` | 4 | Thread pool size (each gets its own PyMatching decoder) | -| `total_requests` | 100 | Requests per test run (limited by CUDA graph launch ceiling) | -| Dispatcher grid | 1 block, 32 threads | Persistent kernel configuration | -| Predecoder grid | 1 block, 128 threads | Per-graph kernel configuration | -| Spin timeout | 1,000,000 iterations | Defense-in-depth backpressure in input kernel | +| `NUM_SLOTS` | 16 | Ring buffer slot count | +| `slot_size` | Per-config (1024 - 65536) | Max payload per slot (derived from model input size) | +| `num_predecoders` | 8 (d13_r104) | Parallel predecoder instances = pipeline worker threads | +| `queue_depth` | 1 | Single in-flight inference per predecoder | +| `num_decode_workers` | 16 (d13_r104) | PyMatching thread pool size | +| `rate_us` | 104 | Inter-arrival time in microseconds | +| `duration_s` | 20 | Test duration in seconds | +| `warmup_count` | 20 | Requests excluded from latency stats | +| `max_requests` | 500,000 | Maximum requests per run | ### Capacity Analysis -- **Total GPU→CPU queue capacity**: 4 predecoders x 16 depth = 64 slots -- **Ring buffer capacity**: 64 slots -- These are balanced: worst case, all 64 ring buffer requests could be in-flight across the predecoder queues simultaneously. -- If requests are unevenly distributed (e.g., 32 to one predecoder), that predecoder's queue fills at depth 16, and the dispatcher applies backpressure for the remaining 16. -- **Batched submission**: The test fires requests in batches of `num_predecoders` (4), waiting for each batch to complete before submitting the next. This avoids overwhelming the dispatcher and stays within CUDA graph launch limits. +- **Ring buffer**: 16 slots, each up to 32 KB for d13_r104. +- **GPU throughput**: 8 parallel streams × ~88 µs compute = ~90k req/s theoretical (far exceeds demand). +- **CPU throughput**: 16 PyMatching workers × ~224 µs decode = ~71k req/s theoretical. +- **Bottleneck**: PyMatching at 224 µs average, but 16 workers provide sufficient aggregate throughput for the 9.6k req/s demand at 104 µs inter-arrival. +- **Backpressure**: ~6.2M stalls over 20 s (noise floor of sub-microsecond spins when next round-robin slot is briefly busy). --- ## 13. Performance Benchmarking -### PipelineBenchmark Utility +### Pipeline Results (d=13, T=104, 104 µs rate, 20s) -The `PipelineBenchmark` class (`libs/qec/include/cudaq/qec/utils/pipeline_benchmarks.h`) provides reusable latency and throughput measurement for any pipeline test: +Configuration: 16 slots, 8 predecoders, 16 PyMatching workers, Stim test data. -```cpp -cudaq::qec::utils::PipelineBenchmark bench("d13_r13_Z", total_requests); -bench.start(); -// ... submit requests, mark_submit(i), mark_complete(i) ... -bench.stop(); -bench.report(); -``` +| Metric | Value | +|--------|-------| +| Throughput | 9,610 req/s | +| Submitted / Completed | 192,309 / 192,309 | +| Backpressure stalls | 6,193,418 | +| p50 latency | 352.3 µs | +| Mean latency | 393.4 µs | +| p90 latency | 514.7 µs | +| p95 latency | 606.3 µs | +| p99 latency | 1,248.8 µs | +| Max latency | 3,930.0 µs | +| stddev | 179.0 µs | -It tracks per-request submit and complete timestamps, computes statistics only on completed requests, and reports: +### Worker Timing Breakdown -- Min, max, mean, p50, p90, p95, p99 latencies (microseconds) -- Standard deviation -- Total wall time and throughput (req/s) -- Submitted / completed / timed-out counts +| Component | Avg Time | +|-----------|----------| +| PyMatching decode | 224.4 µs | +| Total worker (PyMatch thread) | 224.5 µs | +| Worker overhead | 0.1 µs | -### Worker Timing Breakdown +### Syndrome Density -The test also reports an average breakdown of where time is spent: +| | Avg nonzero / total | Density | +|-|---------------------|---------| +| Input detectors | 932.7 / 17,472 | 5.34% | +| Output residual detectors | 16.1 / 17,472 | 0.09% | +| **Reduction** | | **98.3%** | -``` - Worker Timing Breakdown (avg over 100 requests): - PyMatching decode: 164.3 us (23.6%) - Worker overhead: 0.4 us ( 0.1%) - GPU+dispatch+poll: 530.1 us (76.3%) - Total end-to-end: 694.8 us - Per-round (/13): 53.4 us/round -``` +### Logical Error Rate -### Measured Performance (representative, system-dependent) +| Mode | LER | Mismatches | +|------|-----|------------| +| Full pipeline (predecoder + PyMatching) | **0.0020** | 384 / 192,309 | +| Predecoder only (output[0]) | 0.3980 | 76,537 / 192,309 | -| Config | p50 Latency | Mean Latency | Throughput | PyMatching % | Per-round | -|--------|-------------|-------------|------------|-------------|-----------| -| d=7 | 262 us | 284 us | 10,803 req/s | 12.8% | 40.6 us | -| d=13 | 658 us | 678 us | 3,467 req/s | 23.0% | 52.1 us | +### Raw TRT Baseline (trtexec) -### Profiling with Nsight Systems +| Mode | GPU Compute | Total Host Latency | +|------|-------------|-------------------| +| Default (single stream) | 107 µs | 119 µs | +| CUDA Graph + SpinWait | 90 µs | 99 µs | +| CUDA Graph + SpinWait + No Transfers | 88 µs | 88 µs | -```bash -nsys profile --trace=cuda,nvtx,osrt --cuda-graph-trace=node \ - -o d13_profile ./test_realtime_predecoder_w_pymatching d13 -nsys stats d13_profile.nsys-rep -``` +### NVTX Profiling (per-stage timing) + +| Stage | Avg (µs) | Median (µs) | +|-------|----------|-------------| +| PyMatchDecode | 277 | 223 | +| PreLaunchCopy | 8.8 | 8.3 | +| ConsumerComplete | 3.3 | 3.2 | +| Submit | 2.8 | 2.7 | +| PollJob | 2.3 | 1.9 | +| ReleaseJob | 2.0 | 1.9 | -Key findings from profiling: -- GPU TRT inference is ~9 us/request (very fast) -- The dominant latency is in the dispatcher's slot-scanning loop and CPU polling gap -- PyMatching decode accounts for 13-23% of end-to-end latency depending on distance -- The `--cuda-graph-trace=node` flag is critical for seeing individual kernels inside CUDA graphs +Infrastructure overhead (ring buffer + dispatch + poll + consumer): **~18 µs per request**. --- @@ -764,39 +747,39 @@ Key findings from profiling: ### Architecture Support -| Feature | x86_64 | aarch64 (Grace Hopper) | +| Feature | x86_64 | aarch64 (Grace Blackwell) | |---------|--------|----------------------| | `QEC_CPU_RELAX()` | `_mm_pause()` | `asm volatile("yield")` | -| Acquire fence in `poll_next_job` | No-op (TSO) | Required (`std::atomic_thread_fence`) | -| Release store in `release_job` | `__atomic_store_n` | `__atomic_store_n` | -| `volatile` for mapped memory | Sufficient | Requires fences (provided) | +| Cross-device atomics | libcu++ system-scope | libcu++ system-scope | +| Memory model | TSO (strong) | Weakly ordered (requires fences) | +| Interconnect | PCIe | NVLink-C2C | -The `QEC_CPU_RELAX()` macro is defined in `ai_predecoder_service.h` and should be used by all polling code instead of platform-specific intrinsics. +The `QEC_CPU_RELAX()` macro is defined in both `ai_predecoder_service.h` and `host_dispatcher.h` and should be used by all polling code. ### CUDA Compute Capability | Feature | Minimum | |---------|---------| -| Device-side `cudaGraphLaunch` | sm_80 (Ampere) | -| `__threadfence_system()` | sm_20+ | -| Mapped pinned memory | All CUDA devices | +| `cudaHostAllocMapped` | All CUDA devices | +| CUDA Graphs (host launch) | sm_50+ | +| libcu++ system-scope atomics | sm_70+ | --- ## 15. Limitations & Future Work -1. **Linear function table lookup**: `dispatch_lookup_entry` performs a linear scan of the function table. With 4 entries this is negligible, but for larger tables a hash map or sorted binary search would be appropriate. +1. **PyMatching is the bottleneck**: At 224 µs average, PyMatching consumes 93% of CPU-stage time. A faster MWPM decoder (e.g., Fusion Blossom, GPU-accelerated matching) would directly reduce pipeline latency. -2. **No queue drain on shutdown**: Setting `system_stop = true` causes the worker threads to exit immediately. Jobs that the GPU has completed but the CPU hasn't polled are silently dropped. Production code should drain all queues before stopping. +2. **Round-robin slot injection**: The `RingBufferInjector` uses strict round-robin slot assignment. If slot N is busy, the producer stalls even if slot N+1 is free. Out-of-order slot allocation would reduce backpressure but sacrifice FIFO ordering. -3. **Dropped syndromes on timeout**: If the defense-in-depth spin timeout fires in `predecoder_input_kernel`, the syndrome is silently dropped. A production system should increment an error counter or signal the host. +3. **Single data type**: The current test assumes uint8 detectors matching the predecoder model. Support for INT32 models would require element-size-aware input packing. 4. **Static TRT shapes only**: The current implementation assumes static input/output tensor shapes. Dynamic shapes would require per-invocation shape metadata in the RPC payload and runtime TRT profile switching. -5. **Batched submission**: The test fires requests in batches of `num_predecoders` and waits for completion before the next batch. This serializes batches and underutilizes the pipeline. A pipelined submission strategy (overlapping batch N+1 submission with batch N completion) would improve throughput. +5. **No queue drain on shutdown**: The PyMatching queue is shut down immediately; jobs that were enqueued but not yet decoded are silently dropped. A production system should drain the queue before stopping. -6. **Single polling thread**: The `incoming_polling_loop` is a single thread that round-robins all predecoders. At higher predecoder counts, this could become a bottleneck. A per-predecoder polling thread or lock-free MPSC queue could help. +6. **Core pinning is advisory**: The pipeline pins threads to cores via `sched_setaffinity`, but does not isolate cores from the OS scheduler. A production deployment should use `isolcpus` or cgroups. -7. **CUDA graph launch ceiling**: `cudaStreamGraphFireAndForget` has a runtime limit of ~128 concurrent pending child graph launches. The test limits `total_requests` to 100 to stay under this. Production systems with sustained high throughput may need to throttle submissions or use a different dispatch strategy. +7. **INT8 quantization**: The predecoder model runs in FP16. INT8 quantization could reduce GPU compute from 88 µs to ~50 µs, though the GPU is not currently the bottleneck. -8. **Dispatcher scanning latency**: The persistent dispatcher kernel parks on the current slot and spins until it is populated. With batched submission, there is a round-trip delay between batch completion and next-batch submission that dominates the end-to-end latency (~550 us of the ~700 us total for d=13). +8. **Sparse PyMatching input**: The predecoder reduces syndrome density to 0.09%. Representing the sparse residual as a list of nonzero indices (rather than a dense vector) could speed up PyMatching's graph traversal. diff --git a/docs/realtime_pipeline_architecture.md b/docs/realtime_pipeline_architecture.md index 3c5073c7..b01055f1 100644 --- a/docs/realtime_pipeline_architecture.md +++ b/docs/realtime_pipeline_architecture.md @@ -12,6 +12,7 @@ classDiagram +start() +stop() +create_injector() RingBufferInjector + +complete_deferred(slot) +stats() Stats } @@ -53,20 +54,30 @@ classDiagram +release_job(slot) } + class PyMatchQueue { + -mtx_ : mutex + -cv_ : condition_variable + -jobs_ : queue~PyMatchJob~ + +push(PyMatchJob) + +pop(PyMatchJob) bool + +shutdown() + } + RealtimePipeline *-- RingBufferManager : owns RealtimePipeline *-- cudaq_host_dispatcher_config_t : builds RealtimePipeline --> RingBufferInjector : creates RingBufferInjector --> RingBufferManager : writes to cudaq_host_dispatcher_config_t --> AIPreDecoderService : launches graph + RealtimePipeline --> PyMatchQueue : deferred jobs flow through ``` ## 2. Thread Model -The pipeline spawns three categories of threads, each pinnable to a specific CPU core: +The pipeline spawns four categories of threads, each pinnable to a specific CPU core: ```mermaid flowchart LR - subgraph "Producer (main thread or FPGA DMA)" + subgraph "Producer (main thread)" P["RingBufferInjector::submit()"] end @@ -74,13 +85,19 @@ flowchart LR D["cudaq_host_dispatcher_loop()"] end - subgraph "Worker Threads (cores 4..4+N)" - W0["worker_loop(0)"] - W1["worker_loop(1)"] - Wn["worker_loop(N-1)"] + subgraph "Predecoder Workers (cores 10..10+N)" + W0["worker_loop(0)
polls GPU stream 0"] + W1["worker_loop(1)
polls GPU stream 1"] + Wn["worker_loop(N-1)
polls GPU stream N-1"] end - subgraph "Consumer Thread (core 3)" + subgraph "PyMatching Workers (no pinning)" + PM0["pymatch_thread(0)"] + PM1["pymatch_thread(1)"] + PMn["pymatch_thread(M-1)"] + end + + subgraph "Consumer Thread (core 4)" C["consumer_loop()"] end @@ -97,12 +114,25 @@ flowchart LR G0 -->|"ready_flags = 1"| W0 G1 -->|"ready_flags = 1"| W1 Gn -->|"ready_flags = 1"| Wn - W0 -->|"tx_flags signal"| C - W1 -->|"tx_flags signal"| C - Wn -->|"tx_flags signal"| C + W0 -->|"DEFERRED_COMPLETION
idle_mask restored"| D + W1 -->|"DEFERRED_COMPLETION
idle_mask restored"| D + Wn -->|"DEFERRED_COMPLETION
idle_mask restored"| D + W0 -->|"PyMatchJob"| PM0 + W1 -->|"PyMatchJob"| PM1 + Wn -->|"PyMatchJob"| PMn + PM0 -->|"complete_deferred
tx_flags signal"| C + PM1 -->|"complete_deferred
tx_flags signal"| C + PMn -->|"complete_deferred
tx_flags signal"| C C -->|"clear_slot"| P ``` +**Thread counts (d13_r104 configuration):** +- Dispatcher: 1 thread (core 2) +- Predecoder workers: 8 threads (cores 10-17) +- PyMatching workers: 16 threads (unpinned) +- Consumer: 1 thread (core 4) +- Total: 26 threads + ## 3. Sequence Diagram: Single Syndrome Through the Pipeline This traces one syndrome request from submission to completion, showing every @@ -114,15 +144,16 @@ sequenceDiagram participant RB as Ring Buffer
(shared memory) participant Disp as Dispatcher
(dedicated thread) participant GPU as GPU Stream w
(CUDA Graph) - participant Work as Worker Thread w
(CPU) + participant PDW as Predecoder Worker w
(CPU) + participant PMQ as PyMatchQueue + participant PMW as PyMatching Worker
(CPU) participant Cons as Consumer
(dedicated thread) participant App as Application
(completion handler) Note over Prod,App: === PHASE 1: Injection === Prod->>Prod: CAS next_slot acq_rel, claim slot S - Prod->>RB: memcpy payload to rx_data S - Prod->>RB: write RPCHeader magic+function_id + Prod->>RB: memcpy RPCHeader (24 bytes) + payload to rx_data S Prod->>RB: rx_flags S .store host_ptr, release Prod->>Prod: slot_occupied S = 1, slot_request S = request_id Prod->>Prod: total_submitted.fetch_add 1, release @@ -138,7 +169,7 @@ sequenceDiagram Disp->>Disp: __sync_synchronize opt pre_launch_fn configured - Disp->>GPU: pre_launch_fn cudaMemcpyAsync DMA syndrome to TRT input + Disp->>GPU: pre_launch_fn cudaMemcpyAsync DMA syndrome to TRT input (offset 24) end Disp->>GPU: cudaGraphLaunch graph_exec W, stream W @@ -147,21 +178,32 @@ sequenceDiagram Note over Prod,App: === PHASE 3: GPU Inference === - GPU->>GPU: gateway_input_kernel: copy ring buffer to TRT input - GPU->>GPU: TRT enqueueV3: AI predecoder inference - GPU->>GPU: cudaMemcpyAsync: TRT output to h_predecoder_outputs + GPU->>GPU: TRT enqueueV3: AI predecoder inference (uint8 → uint8) + GPU->>GPU: cudaMemcpyAsync D2D: TRT output to h_predecoder_outputs GPU->>GPU: predecoder_signal_ready_kernel: ready_flags.store 1, release - Note over Prod,App: === PHASE 4: CPU Post-Processing === + Note over Prod,App: === PHASE 4: Predecoder Worker (fast path, ~10 µs) === - Work->>Work: poll_next_job: ready_flags CAS 1 to 2, acquire - Work->>Work: Read h_predecoder_outputs, run PyMatching MWPM decoder - Work->>Work: Write RPC response to ring buffer slot - Work->>Work: release_job: ready_flags.store 0, release - Work->>RB: tx_flags S .store slot_host_addr, release, marks READY - Work->>Disp: idle_mask.fetch_or 1 shl W, release, worker W free + PDW->>PDW: poll_next_job: ready_flags CAS 1 to 2, acquire + PDW->>PDW: memcpy h_predecoder_outputs to deferred_outputs[S] + PDW->>PDW: compute syndrome density metrics + PDW->>PDW: release_job: ready_flags.store 0, release + PDW->>PDW: extract request_id from RPCHeader + PDW->>PMQ: push PyMatchJob(S, request_id, ring_buffer_ptr) + PDW->>PDW: return DEFERRED_COMPLETION + PDW->>Disp: idle_mask.fetch_or 1 shl W, release, worker W free - Note over Prod,App: === PHASE 5: Completion === + Note over Prod,App: === PHASE 5: PyMatching Decode (~224 µs) === + + PMW->>PMQ: pop PyMatchJob + PMW->>PMW: acquire per-thread decoder (thread_local) + PMW->>PMW: read deferred_outputs[S]: logical_pred + residual detectors + PMW->>PMW: PyMatching MWPM decode over full H matrix + PMW->>PMW: project corrections onto observable O + PMW->>RB: write RPCResponse + DecodeResponse to ring buffer slot + PMW->>RB: complete_deferred(S): tx_flags S .store slot_host_addr, release + + Note over Prod,App: === PHASE 6: Completion === Cons->>RB: poll_tx S: tx_flags S .load acquire, sees valid addr READY Cons->>App: completion_handler request_id, slot, success @@ -182,19 +224,19 @@ and the memory ordering used. | Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | |--------|------|-------|-----------|-----------|----------| | `rx_flags[slot]` | `cuda::atomic` | Producer ↔ Dispatcher | Producer (signal), Dispatcher (clear), Consumer (clear) | Dispatcher (poll) | store: `release`, load: `acquire` | -| `tx_flags[slot]` | `cuda::atomic` | Dispatcher ↔ Worker ↔ Consumer | Dispatcher (IN_FLIGHT), Worker (READY/addr) | Consumer (poll) | store: `release`, load: `acquire` | +| `tx_flags[slot]` | `cuda::atomic` | Dispatcher ↔ PyMatch Worker ↔ Consumer | Dispatcher (IN_FLIGHT), PyMatch Worker (READY/addr via `complete_deferred`) | Consumer (poll) | store: `release`, load: `acquire` | ### Worker Pool Scheduling | Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | |--------|------|-------|-----------|-----------|----------| -| `idle_mask` | `cuda::atomic` | Dispatcher ↔ Workers | Dispatcher (clear bit), Worker (set bit) | Dispatcher (find free worker) | fetch_and/fetch_or: `release`, load: `acquire` | +| `idle_mask` | `cuda::atomic` | Dispatcher ↔ Pipeline Workers | Dispatcher (clear bit), Pipeline (set bit after DEFERRED_COMPLETION) | Dispatcher (find free worker) | fetch_and/fetch_or: `release`, load: `acquire` | ### GPU ↔ CPU Handoff (per AIPreDecoderService) | Atomic | Type | Scope | Writer(s) | Reader(s) | Ordering | |--------|------|-------|-----------|-----------|----------| -| `ready_flags[0]` | `cuda::atomic` | GPU kernel ↔ Worker thread | GPU kernel (0→1), Worker (CAS 1→2), Worker (2→0) | Worker (CAS poll) | store: `release`, CAS success: `acquire`, CAS fail: `relaxed` | +| `ready_flags[0]` | `cuda::atomic` | GPU kernel ↔ Predecoder worker | GPU kernel (0→1), Worker (CAS 1→2), Worker (2→0 via release_job) | Worker (CAS poll) | store: `release`, CAS success: `acquire`, CAS fail: `relaxed` | ### Pipeline Lifecycle @@ -226,16 +268,16 @@ stateDiagram-v2 FREE --> RX_SIGNALED : Producer writes rx_flags[S] = host_ptr note right of RX_SIGNALED rx_flags != 0, tx_flags = 0 - Payload + RPCHeader in rx_data + RPCHeader (24B) + payload in rx_data end note RX_SIGNALED --> IN_FLIGHT : Dispatcher reads rx_flags, launches graph, sets tx_flags IN_FLIGHT, clears rx_flags note right of IN_FLIGHT rx_flags = 0, tx_flags = 0xEEEE - GPU processing in progress + GPU processing + predecoder worker + PyMatch queue end note - IN_FLIGHT --> TX_READY : Worker writes tx_flags = slot_host_addr after GPU + PyMatching done + IN_FLIGHT --> TX_READY : PyMatch worker calls complete_deferred → tx_flags = slot_host_addr note right of TX_READY rx_flags = 0, tx_flags = valid addr Result available for consumer @@ -264,26 +306,35 @@ The graph is instantiated once at startup and replayed for every syndrome. ```mermaid flowchart TD subgraph "CUDA Graph (AIPreDecoderService)" - A["TRT enqueueV3
(AI predecoder inference)"] --> B["cudaMemcpyAsync
TRT output to h_predecoder_outputs
(host-mapped)"] + A["TRT enqueueV3
(AI predecoder inference)"] --> B["cudaMemcpyAsync D2D
TRT output → h_predecoder_outputs
(host-mapped)"] B --> C["predecoder_signal_ready_kernel
ready_flags.store(1, release)"] end subgraph "Pre-Launch Callback (host-side, before graph)" - P["pre_launch_fn:
cudaMemcpyAsync
ring buffer slot to TRT input
(DMA copy engine)"] + P["pre_launch_fn:
cudaMemcpyAsync D2D
ring buffer slot+24 → TRT input
(DMA copy engine)"] end - subgraph "Post-Graph (Worker Thread)" - D["poll_next_job():
ready_flags CAS 1 to 2"] - E["PyMatching MWPM decode"] - F["Write RPC response"] + subgraph "Predecoder Worker (fast path, ~10 µs)" + D["poll_next_job():
ready_flags CAS 1 → 2"] + E["memcpy output → deferred_outputs[slot]"] + F["syndrome density metrics"] G["release_job():
ready_flags store 0"] - H["tx_flags.store(addr, release)"] - I["idle_mask.fetch_or(1 shl W, release)"] + H["enqueue PyMatchJob"] + I["return DEFERRED_COMPLETION
→ idle_mask restored"] D --> E --> F --> G --> H --> I end + subgraph "PyMatching Worker (~224 µs)" + J["pop PyMatchJob from queue"] + K["PyMatching MWPM decode"] + L["Write RPC response"] + M["complete_deferred(slot):
tx_flags.store(addr, release)"] + J --> K --> L --> M + end + P --> A C -.->|"GPU signals ready_flags = 1"| D + I -.->|"PyMatchQueue"| J ``` ## 7. Backpressure and Flow Control @@ -296,7 +347,7 @@ flowchart TD Submit["Injector::try_submit()"] Check{"slot_available(S)?
rx_flags=0 AND tx_flags=0"} CAS{"CAS next_slot
cur to cur+1"} - Write["Write payload + signal"] + Write["Write RPCHeader + payload + signal"] Stall["backpressure_stalls++
QEC_CPU_RELAX()"] Retry["Retry"] @@ -309,10 +360,15 @@ flowchart TD end ``` -**Capacity:** With `num_slots = 32` and `num_workers = 16`, up to 32 syndromes -can be in various stages of processing simultaneously. When all 32 slots are -occupied (either waiting for dispatch, in-flight on GPU, or awaiting consumer -pickup), the injector stalls until the consumer frees a slot. +**Capacity:** With `num_slots = 16` and `num_workers = 8` (predecoder) + `16` (PyMatching), +up to 16 syndromes can be in various stages of processing simultaneously. When all 16 +slots are occupied (either waiting for dispatch, in-flight on GPU, being decoded by +PyMatching, or awaiting consumer pickup), the injector stalls until the consumer frees a +slot. + +**Round-robin limitation:** The injector uses strict round-robin slot selection. If slot N +is busy but slot N+1 is free, the producer still stalls on slot N. This preserves FIFO +ordering but contributes to the ~6.2M backpressure stalls observed at 104 µs injection rate. ## 8. ARM Memory Ordering Considerations @@ -328,26 +384,69 @@ memory model. Key ordering guarantees: uses `cuda::thread_scope_system` + `memory_order_release`, paired with the worker's `compare_exchange_strong(acquire)`. -3. **Worker → Consumer:** `tx_flags[S].store(release)` pairs with - `tx_flags[S].load(acquire)` in `poll_tx_flag()`. Consumer sees PyMatching - results before the ready flag. +3. **Predecoder Worker → PyMatch Worker:** The `PyMatchQueue` uses `std::mutex` + + `std::condition_variable`, which provide implicit acquire/release semantics. + The `deferred_outputs[slot]` buffer is written by the predecoder worker before + `push()` and read by the PyMatch worker after `pop()`, so the mutex guarantees + visibility. -4. **Consumer → Producer (slot recycling):** `slot_occupied[S] = 0` followed +4. **PyMatch Worker → Consumer:** `tx_flags[S].store(release)` in + `complete_deferred()` pairs with `tx_flags[S].load(acquire)` in `poll_tx_flag()`. + Consumer sees the full RPC response before the ready flag. + +5. **Consumer → Producer (slot recycling):** `slot_occupied[S] = 0` followed by `__sync_synchronize()` (full barrier) before `clear_slot()` ensures the producer cannot see a free slot while the consumer is still accessing - slot_request metadata. + slot metadata. ```mermaid flowchart LR subgraph "Release/Acquire Pairs" A["rx_flags store
(release)"] -->|"paired with"| B["rx_flags load
(acquire)"] - C["tx_flags store
(release)"] -->|"paired with"| D["tx_flags load
(acquire)"] + C["tx_flags store
(release, complete_deferred)"] -->|"paired with"| D["tx_flags load
(acquire, poll_tx)"] E["ready_flags store(1)
(release, system scope)"] -->|"paired with"| F["ready_flags CAS
(acquire)"] G["idle_mask fetch_or
(release)"] -->|"paired with"| H["idle_mask load
(acquire)"] end + subgraph "Mutex-Based Ordering" + I["PyMatchQueue::push()
mutex lock/unlock"] -->|"happens-before"| J["PyMatchQueue::pop()
mutex lock/unlock"] + end + subgraph "Full Barriers" - I["__sync_synchronize()
between slot_occupied=0
and clear_slot()"] - J["__sync_synchronize()
between mailbox_bank write
and cudaGraphLaunch"] + K["__sync_synchronize()
between slot_occupied=0
and clear_slot()"] + L["__sync_synchronize()
between mailbox_bank write
and cudaGraphLaunch"] end ``` + +## 9. DEFERRED_COMPLETION Protocol + +The `DEFERRED_COMPLETION` mechanism allows predecoder workers to release their +GPU stream immediately while deferring ring buffer slot completion to a later +thread (the PyMatching worker pool). + +```mermaid +sequenceDiagram + participant PW as Predecoder Worker + participant Pipeline as RealtimePipeline + participant PMQ as PyMatchQueue + participant PMW as PyMatch Worker + + PW->>PW: poll_next_job() succeeds + PW->>PW: copy output, release GPU slot + PW->>PMQ: push(PyMatchJob) + PW->>Pipeline: return DEFERRED_COMPLETION + Pipeline->>Pipeline: idle_mask.fetch_or(1<tx_flags NOT touched + + PMW->>PMQ: pop(PyMatchJob) + PMW->>PMW: PyMatching MWPM decode + PMW->>PMW: Write RPC response to ring buffer + PMW->>Pipeline: complete_deferred(slot) + Pipeline->>Pipeline: tx_flags[slot].store(host_addr, release) + Note over Pipeline: Slot S now READY
Consumer can harvest +``` + +**Key invariant:** Between `DEFERRED_COMPLETION` and `complete_deferred()`, the ring +buffer slot remains in the IN_FLIGHT state (`tx_flags = 0xEEEE`). The slot's data area +is safe to read/write because the consumer only harvests when `tx_flags` transitions to +a valid address, and the producer cannot reuse the slot while `tx_flags != 0`. diff --git a/libs/qec/include/cudaq/qec/realtime/pipeline.h b/libs/qec/include/cudaq/qec/realtime/pipeline.h index 310bae61..57c96b37 100644 --- a/libs/qec/include/cudaq/qec/realtime/pipeline.h +++ b/libs/qec/include/cudaq/qec/realtime/pipeline.h @@ -73,8 +73,17 @@ struct CpuStageContext { }; /// Returns the number of bytes written into response_buffer. +/// Return 0 if no GPU result is ready yet (poll again). +/// Return DEFERRED_COMPLETION to release the worker immediately while +/// deferring slot completion to a later complete_deferred() call. using CpuStageCallback = std::function; +/// Sentinel return value from CpuStageCallback: release the worker +/// (idle_mask) but do NOT signal slot completion (tx_flags). The caller +/// is responsible for calling RealtimePipeline::complete_deferred(slot) +/// once the deferred work (e.g. a separate decode thread) finishes. +static constexpr size_t DEFERRED_COMPLETION = SIZE_MAX; + // --------------------------------------------------------------------------- // Completion Callback // --------------------------------------------------------------------------- @@ -164,6 +173,12 @@ class RealtimePipeline { /// Thread-safe, lock-free stats snapshot. Stats stats() const; + /// Signal that deferred processing for a slot is complete. + /// Call this from any thread after the cpu_stage callback returned + /// DEFERRED_COMPLETION and the deferred work has finished writing the + /// response into the slot's ring buffer area. + void complete_deferred(int slot); + private: struct Impl; std::unique_ptr impl_; diff --git a/libs/qec/lib/realtime/realtime_pipeline.cu b/libs/qec/lib/realtime/realtime_pipeline.cu index 2f43ab93..80339bc0 100644 --- a/libs/qec/lib/realtime/realtime_pipeline.cu +++ b/libs/qec/lib/realtime/realtime_pipeline.cu @@ -464,6 +464,11 @@ struct RealtimePipeline::Impl { continue; } + if (written == DEFERRED_COMPLETION) { + idle_mask.fetch_or(1ULL << worker_id, cuda::std::memory_order_release); + continue; + } + int origin_slot = inflight_slot_tags[worker_id]; uint8_t *slot_host = ring->rx_data_host() + @@ -588,6 +593,14 @@ RealtimePipeline::Stats RealtimePipeline::stats() const { impl_->backpressure_stalls.load(std::memory_order_relaxed)}; } +void RealtimePipeline::complete_deferred(int slot) { + uint8_t *slot_host = impl_->ring->rx_data_host() + + static_cast(slot) * impl_->config.slot_size; + uint64_t rx_value = reinterpret_cast(slot_host); + impl_->ring->tx_flags()[slot].store(rx_value, + cuda::std::memory_order_release); +} + // --------------------------------------------------------------------------- // RingBufferInjector // --------------------------------------------------------------------------- diff --git a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp index d9800cd4..63bfe668 100644 --- a/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp +++ b/libs/qec/lib/realtime/test_realtime_predecoder_w_pymatching.cpp @@ -23,11 +23,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -81,7 +84,7 @@ namespace realtime_ns = cudaq::realtime; // Pipeline Configuration (application-level, no atomics) // ============================================================================= -constexpr size_t NUM_SLOTS = 12; +constexpr size_t NUM_SLOTS = 16; struct PipelineConfig { std::string label; @@ -90,6 +93,7 @@ struct PipelineConfig { std::string onnx_filename; int num_predecoders; int num_workers; + int num_decode_workers; std::string onnx_path() const { return std::string(ONNX_MODEL_DIR) + "/" + onnx_filename; @@ -104,25 +108,25 @@ struct PipelineConfig { } static PipelineConfig d7_r7() { - return {"d7_r7_Z", 7, 7, "model1_d7_r7_unified_Z_batch1.onnx", 16, 16}; + return {"d7_r7_Z", 7, 7, "model1_d7_r7_unified_Z_batch1.onnx", 16, 16, 32}; } static PipelineConfig d13_r13() { - return {"d13_r13_X", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16}; + return {"d13_r13_X", 13, 13, "predecoder_memory_d13_T13_X.onnx", 16, 16, 32}; } static PipelineConfig d13_r104() { - return {"d13_r104_X", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8}; + return {"d13_r104_X", 13, 104, "predecoder_memory_d13_T104_X.onnx", 8, 8, 16}; } static PipelineConfig d21_r21() { return {"d21_r21_Z", 21, 21, "model1_d21_r21_unified_X_batch1.onnx", 16, - 16}; + 16, 32}; } static PipelineConfig d31_r31() { return {"d31_r31_Z", 31, 31, "model1_d31_r31_unified_Z_batch1.onnx", 16, - 16}; + 16, 32}; } }; @@ -204,6 +208,51 @@ struct __attribute__((packed)) DecodeResponse { int32_t converged; }; +// ============================================================================= +// PyMatching work queue (decoupled from predecoder workers) +// ============================================================================= + +struct PyMatchJob { + int origin_slot; + uint64_t request_id; + void *ring_buffer_ptr; +}; + +class PyMatchQueue { +public: + void push(PyMatchJob &&j) { + { + std::lock_guard lk(mtx_); + jobs_.push(std::move(j)); + } + cv_.notify_one(); + } + + bool pop(PyMatchJob &out) { + std::unique_lock lk(mtx_); + cv_.wait(lk, [&] { return !jobs_.empty() || stop_; }); + if (stop_ && jobs_.empty()) + return false; + out = std::move(jobs_.front()); + jobs_.pop(); + return true; + } + + void shutdown() { + { + std::lock_guard lk(mtx_); + stop_ = true; + } + cv_.notify_all(); + } + +private: + std::mutex mtx_; + std::condition_variable cv_; + std::queue jobs_; + bool stop_ = false; +}; + // ============================================================================= // Test data (pre-generated from Stim, or random) // ============================================================================= @@ -545,9 +594,9 @@ int main(int argc, char *argv[]) { if (stim.O.loaded()) obs_row = stim.O.row_dense(0); - std::cout << "[Setup] Creating " << config.num_workers + std::cout << "[Setup] Creating " << config.num_decode_workers << " PyMatching decoders (full H)...\n"; - for (int i = 0; i < config.num_workers; ++i) + for (int i = 0; i < config.num_decode_workers; ++i) decoder_ctx.decoders.push_back( cudaq::qec::decoder::get("pymatching", H_full, pm_params)); } else { @@ -567,9 +616,9 @@ int main(int argc, char *argv[]) { << H_z.shape()[1] << "], spatial_slices=" << decoder_ctx.spatial_slices << "\n"; - std::cout << "[Setup] Creating " << config.num_workers + std::cout << "[Setup] Creating " << config.num_decode_workers << " PyMatching decoders (per-slice)...\n"; - for (int i = 0; i < config.num_workers; ++i) + for (int i = 0; i < config.num_decode_workers; ++i) decoder_ctx.decoders.push_back( cudaq::qec::decoder::get("pymatching", H_z, pm_params)); } @@ -584,10 +633,9 @@ int main(int argc, char *argv[]) { } if (config.num_workers != config.num_predecoders) { - throw std::invalid_argument( - "num_workers (" + std::to_string(config.num_workers) + - ") must equal num_predecoders (" + - std::to_string(config.num_predecoders) + ") in the current benchmark"); + std::cerr << "[WARN] num_workers (" << config.num_workers + << ") != num_predecoders (" << config.num_predecoders + << "); pipeline workers should match predecoders for 1:1 poll\n"; } // Worker contexts (per-worker, application-specific) @@ -604,6 +652,15 @@ int main(int argc, char *argv[]) { function_ids[i] = realtime_ns::fnv1a_hash(func.c_str()); } + // ========================================================================= + // Per-slot output buffers (predecoder output copied here before release) + // ========================================================================= + + std::vector> deferred_outputs( + NUM_SLOTS, std::vector(model_output_bytes)); + + PyMatchQueue pymatch_queue; + // ========================================================================= // Create pipeline (all atomics hidden inside) // ========================================================================= @@ -626,120 +683,56 @@ int main(int argc, char *argv[]) { .user_context = &worker_ctxs[w]}; }); - // --- CPU stage callback (poll + PyMatching decode) --- - // Called repeatedly by the pipeline's worker thread. - // Returns 0 if GPU isn't ready, >0 when a job was processed. - pipeline.set_cpu_stage([](const realtime_ns::CpuStageContext &ctx) -> size_t { - auto *wctx = static_cast(ctx.user_context); - auto *pd = wctx->predecoder; - auto *dctx = wctx->decoder_ctx; - - PreDecoderJob job; - if (!pd->poll_next_job(job)) - return 0; // GPU not done yet - - NVTX_PUSH("CpuStageTotal"); - using hrclock = std::chrono::high_resolution_clock; - auto worker_start = hrclock::now(); - - int total_corrections = 0; - bool all_converged = true; - const uint8_t *output_u8 = - static_cast(job.inference_data); - const int32_t logical_pred = output_u8[0]; - - // Syndrome density: count nonzero in input and output residuals - const uint8_t *input_u8 = - static_cast(job.ring_buffer_ptr) + CUDAQ_RPC_HEADER_SIZE; - int input_nz = 0; - for (int k = 0; k < dctx->num_input_detectors; ++k) - input_nz += (input_u8[k] != 0); - int output_nz = 0; - for (int k = 0; k < dctx->num_residual_detectors; ++k) - output_nz += (output_u8[1 + k] != 0); - dctx->total_input_nonzero.fetch_add(input_nz, std::memory_order_relaxed); - dctx->total_output_nonzero.fetch_add(output_nz, std::memory_order_relaxed); - - auto decode_start = hrclock::now(); - NVTX_PUSH("PyMatchDecode"); -#if !defined(DISABLE_PYMATCHING) - const uint8_t *residual_u8 = output_u8 + 1; - auto *my_decoder = dctx->acquire_decoder(); - - if (dctx->use_full_H) { - thread_local cudaqx::tensor syndrome_tensor( - {(size_t)dctx->num_residual_detectors}); - std::memcpy(syndrome_tensor.data(), residual_u8, - dctx->num_residual_detectors); - auto result = my_decoder->decode(syndrome_tensor); - all_converged = result.converged; - if (wctx->obs_row && wctx->obs_row_size == result.result.size()) { - int obs_parity = 0; - for (size_t e = 0; e < result.result.size(); ++e) - if (result.result[e] > 0.5 && wctx->obs_row[e]) - obs_parity ^= 1; - total_corrections += obs_parity; - } else { - for (auto v : result.result) - if (v > 0.5) - total_corrections++; - } - } else { - thread_local cudaqx::tensor syndrome_tensor( - {(size_t)dctx->z_stabilizers}); - uint8_t *syn_data = syndrome_tensor.data(); - for (int s = 0; s < dctx->spatial_slices; ++s) { - const uint8_t *slice = residual_u8 + s * dctx->z_stabilizers; - std::memcpy(syn_data, slice, dctx->z_stabilizers); - auto result = my_decoder->decode(syndrome_tensor); - all_converged &= result.converged; - for (auto v : result.result) - if (v > 0.5) - total_corrections++; - } - } - total_corrections += logical_pred; -#endif - NVTX_POP(); // PyMatchDecode - auto decode_end = hrclock::now(); - - // Capture request_id before we overwrite the slot with the response - auto *rpc_hdr = - static_cast(job.ring_buffer_ptr); - uint32_t rid = rpc_hdr->request_id; - - // Write RPC response into ring buffer slot - DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; - char *response_payload = - (char *)job.ring_buffer_ptr + sizeof(realtime_ns::RPCResponse); - std::memcpy(response_payload, &resp, sizeof(resp)); - - auto *header = static_cast(job.ring_buffer_ptr); - header->magic = realtime_ns::RPC_MAGIC_RESPONSE; - header->status = 0; - header->result_len = sizeof(resp); - - pd->release_job(job.slot_idx); - - auto worker_end = hrclock::now(); - auto decode_us = std::chrono::duration_cast( - decode_end - decode_start) - .count(); - auto worker_us = std::chrono::duration_cast( - worker_end - worker_start) - .count(); - dctx->total_decode_us.fetch_add(decode_us, std::memory_order_relaxed); - dctx->total_worker_us.fetch_add(worker_us, std::memory_order_relaxed); - dctx->decode_count.fetch_add(1, std::memory_order_relaxed); - - if (wctx->decode_corrections && rid < (uint32_t)wctx->max_requests) { - wctx->decode_corrections[rid] = total_corrections; - wctx->decode_logical_pred[rid] = logical_pred; - } - - NVTX_POP(); // CpuStageTotal - return 1; - }); + // --- CPU stage callback (poll GPU + copy + enqueue to PyMatch queue) --- + // Predecoder workers only poll GPU completion, copy the output to a + // per-slot buffer, release the predecoder, and enqueue a PyMatchJob. + // Returns DEFERRED_COMPLETION so the pipeline releases the worker + // (idle_mask) without signaling slot completion (tx_flags). + pipeline.set_cpu_stage( + [&deferred_outputs, &pymatch_queue, + out_sz = model_output_bytes](const realtime_ns::CpuStageContext &ctx) -> size_t { + auto *wctx = static_cast(ctx.user_context); + auto *pd = wctx->predecoder; + auto *dctx = wctx->decoder_ctx; + + PreDecoderJob job; + if (!pd->poll_next_job(job)) + return 0; + + NVTX_PUSH("PredecoderPoll"); + + int origin_slot = ctx.origin_slot; + + std::memcpy(deferred_outputs[origin_slot].data(), job.inference_data, + out_sz); + + // Syndrome density: count nonzero in input and output residuals + const uint8_t *input_u8 = + static_cast(job.ring_buffer_ptr) + + CUDAQ_RPC_HEADER_SIZE; + int input_nz = 0; + for (int k = 0; k < dctx->num_input_detectors; ++k) + input_nz += (input_u8[k] != 0); + const uint8_t *out_buf = deferred_outputs[origin_slot].data(); + int output_nz = 0; + for (int k = 0; k < dctx->num_residual_detectors; ++k) + output_nz += (out_buf[1 + k] != 0); + dctx->total_input_nonzero.fetch_add(input_nz, + std::memory_order_relaxed); + dctx->total_output_nonzero.fetch_add(output_nz, + std::memory_order_relaxed); + + pd->release_job(job.slot_idx); + + auto *rpc_hdr = + static_cast(job.ring_buffer_ptr); + uint32_t rid = rpc_hdr->request_id; + + pymatch_queue.push({origin_slot, rid, job.ring_buffer_ptr}); + + NVTX_POP(); // PredecoderPoll + return realtime_ns::DEFERRED_COMPLETION; + }); // --- Completion callback (record timestamps) --- const int max_requests = 500000; @@ -770,6 +763,111 @@ int main(int argc, char *argv[]) { } } + // ========================================================================= + // PyMatching thread pool (decoupled from predecoder workers) + // ========================================================================= + + std::vector pymatch_threads(config.num_decode_workers); + for (int t = 0; t < config.num_decode_workers; ++t) { + pymatch_threads[t] = std::thread( + [&pipeline, &pymatch_queue, &deferred_outputs, &decoder_ctx, + &decode_corrections, &decode_logical_pred, &obs_row, + max_requests]() { + PyMatchJob job; + while (pymatch_queue.pop(job)) { + NVTX_PUSH("PyMatchDecode"); + using hrclock = std::chrono::high_resolution_clock; + auto decode_start = hrclock::now(); + + const uint8_t *output_u8 = + deferred_outputs[job.origin_slot].data(); + const int32_t logical_pred = output_u8[0]; + int total_corrections = 0; + bool all_converged = true; + +#if !defined(DISABLE_PYMATCHING) + const uint8_t *residual_u8 = output_u8 + 1; + auto *my_decoder = decoder_ctx.acquire_decoder(); + + if (decoder_ctx.use_full_H) { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)decoder_ctx.num_residual_detectors}); + std::memcpy(syndrome_tensor.data(), residual_u8, + decoder_ctx.num_residual_detectors); + auto result = my_decoder->decode(syndrome_tensor); + all_converged = result.converged; + if (!obs_row.empty() && obs_row.size() == result.result.size()) { + int obs_parity = 0; + for (size_t e = 0; e < result.result.size(); ++e) + if (result.result[e] > 0.5 && obs_row[e]) + obs_parity ^= 1; + total_corrections += obs_parity; + } else { + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } + } else { + thread_local cudaqx::tensor syndrome_tensor( + {(size_t)decoder_ctx.z_stabilizers}); + uint8_t *syn_data = syndrome_tensor.data(); + for (int s = 0; s < decoder_ctx.spatial_slices; ++s) { + const uint8_t *slice = + residual_u8 + s * decoder_ctx.z_stabilizers; + std::memcpy(syn_data, slice, decoder_ctx.z_stabilizers); + auto result = my_decoder->decode(syndrome_tensor); + all_converged &= result.converged; + for (auto v : result.result) + if (v > 0.5) + total_corrections++; + } + } + total_corrections += logical_pred; +#endif + + auto decode_end = hrclock::now(); + NVTX_POP(); // PyMatchDecode + + // Write RPC response into ring buffer slot + DecodeResponse resp{total_corrections, all_converged ? 1 : 0}; + char *response_payload = (char *)job.ring_buffer_ptr + + sizeof(realtime_ns::RPCResponse); + std::memcpy(response_payload, &resp, sizeof(resp)); + + auto *header = static_cast( + job.ring_buffer_ptr); + header->magic = realtime_ns::RPC_MAGIC_RESPONSE; + header->status = 0; + header->result_len = sizeof(resp); + + pipeline.complete_deferred(job.origin_slot); + + auto worker_end = hrclock::now(); + auto decode_us = + std::chrono::duration_cast( + decode_end - decode_start) + .count(); + auto worker_us = + std::chrono::duration_cast( + worker_end - decode_start) + .count(); + decoder_ctx.total_decode_us.fetch_add(decode_us, + std::memory_order_relaxed); + decoder_ctx.total_worker_us.fetch_add(worker_us, + std::memory_order_relaxed); + decoder_ctx.decode_count.fetch_add(1, std::memory_order_relaxed); + + uint32_t rid = static_cast(job.request_id); + if (rid < static_cast(max_requests)) { + decode_corrections[rid] = total_corrections; + decode_logical_pred[rid] = logical_pred; + } + } + }); + } + std::cout << "[Setup] Started " << config.num_decode_workers + << " PyMatching decode workers.\n"; + std::cout << "[Setup] Starting pipeline...\n"; auto injector = pipeline.create_injector(); pipeline.start(); @@ -786,6 +884,7 @@ int main(int argc, char *argv[]) { << " Warmup: " << scfg.warmup_count << " requests\n" << " Predecoders:" << config.num_predecoders << " (dedicated streams)\n" + << " Decode workers:" << config.num_decode_workers << "\n" << " Max reqs: " << max_requests << "\n\n" << std::flush; @@ -838,6 +937,11 @@ int main(int argc, char *argv[]) { // --- Shutdown --- pipeline.stop(); + pymatch_queue.shutdown(); + for (auto &t : pymatch_threads) + if (t.joinable()) + t.join(); + // ========================================================================= // Report // =========================================================================