From 499534b72948b375cfc424e36a7dfece58c4d258 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Mon, 6 Oct 2025 10:18:06 -0700
Subject: [PATCH 1/3] [runtime] Fix interface bugs.

Fixes kernel_builder to use the correct declaration of altLaunchKernel
and pass it the correct arguments.

Signed-off-by: Eric Schweitz <eschweitz@nvidia.com>
---
 runtime/cudaq/builder/kernel_builder.cpp  | 22 ++++---
 runtime/cudaq/platform/nvqpp_interface.h  | 74 +++++++++++++++++++++++
 runtime/cudaq/platform/quantum_platform.h |  2 +-
 3 files changed, 89 insertions(+), 9 deletions(-)
 create mode 100644 runtime/cudaq/platform/nvqpp_interface.h
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index ee7202db46f..efd28f10804 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -17,6 +17,7 @@
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "cudaq/platform/nvqpp_interface.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/ExecutionEngine/ExecutionEngine.h"
@@ -30,16 +31,10 @@
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/Passes.h"
-
 #include <numeric>
 
 using namespace mlir;
 
-extern "C" {
-void altLaunchKernel(const char *kernelName, void (*kernelFunc)(void *),
-                     void *kernelArgs, std::uint64_t argsSize);
-}
-
 namespace cudaq::details {
 
 /// @brief Track unique measurement register names.
@@ -1105,9 +1100,20 @@ void invokeCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit,
   }
 
   // Invoke and free the args memory.
-  auto thunk = reinterpret_cast<void (*)(void *)>(*thunkPtr);
+  auto thunk = reinterpret_cast<KernelThunkType>(*thunkPtr);
+
+  //  Extract the result offset, which we named.
+  auto roName = properName + ".resultOffset";
+  auto roPtr = jit->lookup(roName);
+  if (!roPtr)
+    throw std::runtime_error(
+        "cudaq::builder failed to get result offset function");
+
+  // Reinterpret the looked-up symbol value as the result offset for launch.
+  auto resultOffset = reinterpret_cast<std::uint64_t>(*roPtr);
 
-  altLaunchKernel(properName.data(), thunk, rawArgs, size);
+  [[maybe_unused]] auto uncheckedResult =
+      altLaunchKernel(properName.data(), thunk, rawArgs, size, resultOffset);
   std::free(rawArgs);
   // TODO: any return values are dropped on the floor here.
 }
diff --git a/runtime/cudaq/platform/nvqpp_interface.h b/runtime/cudaq/platform/nvqpp_interface.h
new file mode 100644
index 00000000000..84cdeb4f84c
--- /dev/null
+++ b/runtime/cudaq/platform/nvqpp_interface.h
@@ -0,0 +1,74 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2025 NVIDIA Corporation & Affiliates.                         *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "common/ThunkInterface.h"
+#include <cstdint>
+#include <vector>
+
+namespace cudaq {
+
+/// Entry point for the auto-generated kernel execution path. TODO: Needs to be
+/// tied to the quantum platform instance somehow. Note that the compiler cannot
+/// provide that information.
+extern "C" {
+// Client-server (legacy) interface.
+[[nodiscard]] KernelThunkResultType
+altLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args,
+                std::uint64_t argsSize, std::uint64_t resultOffset);
+
+// Streamlined interface for launching kernels. Argument synthesis and JIT
+// compilation *must* happen on the local machine.
+[[nodiscard]] KernelThunkResultType
+streamlinedLaunchKernel(const char *kernelName,
+                        const std::vector<void *> &rawArgs);
+
+// Hybrid of the client-server and streamlined approaches. Allows JIT
+// compilation to happen either early or late and can handle return values
+// from each kernel launch.
+[[nodiscard]] KernelThunkResultType
+hybridLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args,
+                   std::uint64_t argsSize, std::uint64_t resultOffset,
+                   const std::vector<void *> &rawArgs);
+
+//===----------------------------------------------------------------------===//
+// Launch module entry points.
+//
+// In some environments (e.g., Python), the ModuleOp of the source can be
+// provided immediately to be launched, unlike with statically compiled systems
+// (C++). These entry points allow the managed runtime to provide the ModuleOp
+// directly.
+//===----------------------------------------------------------------------===//
+
+// Client-server interface. The caller must provide an mlir::ModuleOp and the
+// exact name of the entry point kernel function to be called, which is
+// typically the .thunk unmarshalling function. Passing short names is
+// considered incorrect.
+[[nodiscard]] KernelThunkResultType
+altLaunchModule(const char *exactEntryPointName, void *moduleOp, void *args,
+                std::uint64_t argsSize, std::uint64_t resultOffset);
+
+// Streamlined interface for launching kernels. Argument synthesis and JIT
+// compilation *must* happen on the local machine. The caller must provide an
+// mlir::ModuleOp and the exact name of the entry point kernel function to be
+// called,
+[[nodiscard]] KernelThunkResultType
+streamlinedLaunchModule(const char *exactEntryPointName, void *moduleOp,
+                        const std::vector<void *> &rawArgs);
+
+// Hybrid of the client-server and streamlined approaches. Letting JIT
+// compilation happen either early or late and can handle return values from
+// each kernel launch. The caller must provide an mlir::ModuleOp and the exact
+// name of the entry point kernel function to be called,
+[[nodiscard]] KernelThunkResultType
+hybridLaunchModule(const char *exactEntryPointName, void *moduleOp, void *args,
+                   std::uint64_t argsSize, std::uint64_t resultOffset,
+                   const std::vector<void *> &rawArgs);
+} // extern "C"
+} // namespace cudaq
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index 3bfa43ac152..c52e7065ec7 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -15,6 +15,7 @@
 #include "common/ThunkInterface.h"
 #include "cudaq/remote_capabilities.h"
 #include "cudaq/utils/cudaq_utils.h"
+#include "nvqpp_interface.h"
 #include <cstring>
 #include <cxxabi.h>
 #include <functional>
@@ -22,7 +23,6 @@
 #include <memory>
 #include <optional>
 #include <string>
-#include <vector>
 
 namespace cudaq {
 

From ac6ca134177042c69d9f11fdc19da3dbd0ef2654 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Mon, 6 Oct 2025 10:22:47 -0700
Subject: [PATCH 2/3] Remove declarations needed for reworked runtime. (Will be
 added later.)

Signed-off-by: Eric Schweitz <eschweitz@nvidia.com>
---
 runtime/cudaq/platform/nvqpp_interface.h | 34 ------------------------
 1 file changed, 34 deletions(-)

diff --git a/runtime/cudaq/platform/nvqpp_interface.h b/runtime/cudaq/platform/nvqpp_interface.h
index 84cdeb4f84c..d7ed15dda9d 100644
--- a/runtime/cudaq/platform/nvqpp_interface.h
+++ b/runtime/cudaq/platform/nvqpp_interface.h
@@ -36,39 +36,5 @@ streamlinedLaunchKernel(const char *kernelName,
 hybridLaunchKernel(const char *kernelName, KernelThunkType kernel, void *args,
                    std::uint64_t argsSize, std::uint64_t resultOffset,
                    const std::vector<void *> &rawArgs);
-
-//===----------------------------------------------------------------------===//
-// Launch module entry points.
-//
-// In some environments (e.g., Python), the ModuleOp of the source can be
-// provided immediately to be launched, unlike with statically compiled systems
-// (C++). These entry points allow the managed runtime to provide the ModuleOp
-// directly.
-//===----------------------------------------------------------------------===//
-
-// Client-server interface. The caller must provide an mlir::ModuleOp and the
-// exact name of the entry point kernel function to be called, which is
-// typically the .thunk unmarshalling function. Passing short names is
-// considered incorrect.
-[[nodiscard]] KernelThunkResultType
-altLaunchModule(const char *exactEntryPointName, void *moduleOp, void *args,
-                std::uint64_t argsSize, std::uint64_t resultOffset);
-
-// Streamlined interface for launching kernels. Argument synthesis and JIT
-// compilation *must* happen on the local machine. The caller must provide an
-// mlir::ModuleOp and the exact name of the entry point kernel function to be
-// called,
-[[nodiscard]] KernelThunkResultType
-streamlinedLaunchModule(const char *exactEntryPointName, void *moduleOp,
-                        const std::vector<void *> &rawArgs);
-
-// Hybrid of the client-server and streamlined approaches. Letting JIT
-// compilation happen either early or late and can handle return values from
-// each kernel launch. The caller must provide an mlir::ModuleOp and the exact
-// name of the entry point kernel function to be called,
-[[nodiscard]] KernelThunkResultType
-hybridLaunchModule(const char *exactEntryPointName, void *moduleOp, void *args,
-                   std::uint64_t argsSize, std::uint64_t resultOffset,
-                   const std::vector<void *> &rawArgs);
 } // extern "C"
 } // namespace cudaq

From 24f0561e00d9858b530967994da314185d81be23 Mon Sep 17 00:00:00 2001
From: Eric Schweitz <eschweitz@nvidia.com>
Date: Mon, 6 Oct 2025 10:31:04 -0700
Subject: [PATCH 3/3] Fix spelling.

Signed-off-by: Eric Schweitz <eschweitz@nvidia.com>
---
 runtime/cudaq/builder/kernel_builder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index efd28f10804..4002e57397f 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -1103,7 +1103,7 @@ void invokeCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit,
   auto thunk = reinterpret_cast<KernelThunkType>(*thunkPtr);
 
   //  Extract the result offset, which we named.
-  auto roName = properName + ".resultOffset";
+  auto roName = properName + ".returnOffset";
   auto roPtr = jit->lookup(roName);
   if (!roPtr)
     throw std::runtime_error(