2 changes: 2 additions & 0 deletions .github/workflows/all_libs.yaml
@@ -95,6 +95,8 @@ jobs:
# ========================================================================
#
- name: Run tests
env:
OMPI_MCA_pml: ob1
run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_tests

# ========================================================================
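The only functional change to the workflow is pinning Open MPI's point-to-point messaging layer to the ob1 PML for the test step, which keeps multi-rank launches on plain shared-memory/TCP transports (helpful in CI containers without UCX or fabric support). As a hedged sketch, the same setting can be reproduced locally when driving the MPI test added later in this PR; this assumes a local Open MPI and cudaq install and that the command runs from the repository root:

import os
import subprocess

# Pin the PML exactly as the CI workflow does, then launch the new
# ADAPT-VQE MPI test on two ranks (mpiexec flags match test_adapt_mpi.py).
env = dict(os.environ, OMPI_MCA_pml="ob1")
subprocess.run(
    ["mpiexec", "--allow-run-as-root", "--oversubscribe", "-np", "2",
     "python3", "libs/solvers/python/tests/test_adapt_mpi.py"],
    env=env, check=True)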
111 changes: 62 additions & 49 deletions libs/solvers/lib/adapt/adapt_simulator.cpp
100644 → 100755
@@ -56,7 +56,8 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
// poolList is split into numRanks chunks, and each chunk can be
// further parallelized across numQpus.
// Compute the [H,Oi]
std::vector<spin_op> commutators;
std::vector<spin_op> localCommutators;
std::vector<std::size_t> localCommutatorIndices;
std::size_t total_elements = pool.size();
std::size_t elements_per_rank = total_elements / numRanks;
std::size_t remainder = total_elements % numRanks;
@@ -71,25 +72,25 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
auto coeff = (!isImaginary) ? std::complex<double>{0.0, 1.0}
: std::complex<double>{1.0, 0.0};

for (auto &op : pool) {
auto commutator = H * op - op * H;
// Each rank computes commutators only for its chunk [start, end)
for (std::size_t globalIdx = start; globalIdx < end; ++globalIdx) {
auto commutator = H * pool[globalIdx] - pool[globalIdx] * H;
commutator.canonicalize().trim();
if (commutator.num_terms() > 0)
commutators.push_back(coeff * commutator);
if (commutator.num_terms() > 0) {
localCommutators.push_back(coeff * commutator);
localCommutatorIndices.push_back(globalIdx);
}
}

nlohmann::json initInfo = {{"num-qpus", numQpus},
{"numRanks", numRanks},
{"num-pool-elements", pool.size()},
{"num-elements-per-rank", end - start}};
nlohmann::json initInfo = {
{"num-qpus", numQpus},
{"numRanks", numRanks},
{"num-pool-elements", pool.size()},
{"num-elements-per-rank", end - start},
{"num-local-commutators", localCommutators.size()}};
if (rank == 0)
cudaq::info("[adapt] init info: {}", initInfo.dump(4));

// We'll need to know the local to global index map
std::vector<std::size_t> localToGlobalMap(end - start);
for (int i = 0; i < end - start; i++)
localToGlobalMap[i] = start + i;

// Start off with the initial |psi_n>
cudaq::state state = get_state(adapt_kernel, numQubits, initialState, thetas,
coefficients, pauliWords, poolIndices);
@@ -106,25 +107,26 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
}
step++;

// Step 1 - compute <psi|[H,Oi]|psi> vector
// Step 1 - compute <psi|[H,Oi]|psi> vector for this rank's commutators
std::vector<double> gradients;
std::vector<observe_result> results;
double gradNorm = 0.0;
std::vector<async_observe_result> resultHandles;

if (numQpus == 1) {
for (std::size_t i = 0; i < commutators.size(); i++) {
for (std::size_t i = 0; i < localCommutators.size(); i++) {
cudaq::info("Compute commutator {}", i);
results.emplace_back(observe(prepare_state, commutators[i], state));
results.emplace_back(
observe(prepare_state, localCommutators[i], state));
}
} else {
for (std::size_t i = 0, qpuCounter = 0; i < commutators.size(); i++) {
for (std::size_t i = 0, qpuCounter = 0; i < localCommutators.size();
i++) {
if (rank == 0)
cudaq::info("Compute commutator {}", i);
if (qpuCounter % numQpus == 0)
qpuCounter = 0;
resultHandles.emplace_back(
observe_async(qpuCounter++, prepare_state, commutators[i], state));
resultHandles.emplace_back(observe_async(qpuCounter++, prepare_state,
localCommutators[i], state));
}
for (auto &handle : resultHandles)
results.emplace_back(handle.get());
@@ -135,42 +137,53 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
std::back_inserter(gradients),
[](auto &&el) { return std::fabs(el.expectation()); });

// Compute the local gradient norm
double norm = 0.0;
// Compute global L2 norm: sum local squares, reduce across ranks, sqrt
double localNormSq = 0.0;
for (auto &g : gradients)
norm += g * g;
norm = std::sqrt(norm);

// All ranks have a norm, need to reduce that across all
localNormSq += g * g;
double globalNormSq = localNormSq;
if (mpi::is_initialized())
norm = cudaq::mpi::all_reduce(norm, std::plus<double>());
globalNormSq = cudaq::mpi::all_reduce(localNormSq, std::plus<double>());
double norm = std::sqrt(globalNormSq);

// Find this rank's local max gradient and its global pool index.
// A rank with no local commutators contributes sentinel values.
double localMaxGrad = -std::numeric_limits<double>::infinity();
int localMaxOpIdx = -1;
if (!gradients.empty()) {
auto iter = std::max_element(gradients.begin(), gradients.end());
localMaxGrad = *iter;
localMaxOpIdx = static_cast<int>(
localCommutatorIndices[std::distance(gradients.begin(), iter)]);
}

auto iter = std::max_element(gradients.begin(), gradients.end());
double maxGrad = *iter;
auto maxOpIdx = std::distance(gradients.begin(), iter);
// Determine the global max across all ranks
double globalMaxGrad = localMaxGrad;
int globalMaxOpIdx = localMaxOpIdx;

if (mpi::is_initialized()) {
std::vector<int> allMaxOpIndices(numRanks);
std::vector<double> allMaxGrads(numRanks);
// Distribute the max gradient from this rank to others
cudaq::mpi::all_gather(allMaxGrads, {*iter});
// Distribute the corresponding idx from this rank to others,
// make sure we map back to global indices
cudaq::mpi::all_gather(allMaxOpIndices,
{static_cast<int>(localToGlobalMap[maxOpIdx])});

// Everyone has the indices, loop over and pick out the
// max from all calculations
std::size_t cachedIdx = 0;
double cachedGrad = 0.0;
for (std::size_t i = 0; i < allMaxGrads.size(); i++)
if (allMaxGrads[i] > cachedGrad) {
cachedGrad = allMaxGrads[i];
cachedIdx = allMaxOpIndices[i];
std::vector<double> allLocalMaxGrads(numRanks);
std::vector<int> allLocalMaxOpIndices(numRanks);
cudaq::mpi::all_gather(allLocalMaxGrads, {localMaxGrad});
cudaq::mpi::all_gather(allLocalMaxOpIndices, {localMaxOpIdx});

globalMaxOpIdx = -1;
globalMaxGrad = -std::numeric_limits<double>::infinity();
for (std::size_t i = 0; i < allLocalMaxGrads.size(); i++)
if (allLocalMaxOpIndices[i] >= 0 &&
allLocalMaxGrads[i] > globalMaxGrad) {
globalMaxGrad = allLocalMaxGrads[i];
globalMaxOpIdx = allLocalMaxOpIndices[i];
}
}

maxOpIdx = cachedIdx;
if (globalMaxOpIdx < 0) {
if (rank == 0)
cudaq::warn("[adapt] all commutators [H, O_i] are zero; the operator "
"pool may be incompatible with the Hamiltonian.");
break;
}
auto maxOpIdx = static_cast<std::size_t>(globalMaxOpIdx);

if (rank == 0) {
cudaq::info("[adapt] index of element with max gradient is {}", maxOpIdx);
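Two behaviors in the rewritten gradient step above deserve a note. First, the gradient norm is now assembled correctly across ranks: each rank contributes its local sum of squares, the sums are all-reduced, and only then is the square root taken (summing per-rank square roots, as the old code did, does not give the L2 norm). Second, a rank whose chunk yields no nonzero commutators reports the sentinel pair (-inf, -1); the global max selection skips those entries, and if every rank reports the sentinel the loop now warns and breaks instead of indexing out of range. A minimal pure-Python illustration of the pattern follows — MPI calls are replaced by plain loops over simulated ranks, and the chunking formula is the usual remainder-distribution scheme, an assumption here since the start/end computation is collapsed in the diff:

import math

def chunk(total, num_ranks, rank):
    """Return this rank's [start, end) slice of the operator pool."""
    per_rank, remainder = divmod(total, num_ranks)
    start = rank * per_rank + min(rank, remainder)
    end = start + per_rank + (1 if rank < remainder else 0)
    return start, end

# Pretend gradient magnitudes |<psi|[H, O_i]|psi>| for a 10-operator pool.
pool_gradients = [0.0, 0.3, 0.0, 0.7, 0.1, 0.0, 0.0, 0.2, 0.9, 0.0]
num_ranks = 3

local_max = []      # per-rank (max gradient, global pool index); sentinel if empty
local_norm_sq = []  # per-rank sum of squared gradients
for rank in range(num_ranks):
    start, end = chunk(len(pool_gradients), num_ranks, rank)
    # Keep only nonzero commutators, remembering each one's global pool index.
    grads = [(g, i) for i, g in enumerate(pool_gradients[start:end], start) if g > 0]
    local_norm_sq.append(sum(g * g for g, _ in grads))
    local_max.append(max(grads, default=(-math.inf, -1)))

# "all_reduce" the squared norms, then take a single square root: the true L2 norm.
norm = math.sqrt(sum(local_norm_sq))

# "all_gather" the (max, index) pairs; entries carrying the index sentinel -1 are skipped.
best = max((p for p in local_max if p[1] >= 0), default=(-math.inf, -1))
print(f"grad norm = {norm:.4f}, selected pool index = {best[1]}")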
8 changes: 2 additions & 6 deletions libs/solvers/python/bindings/solvers/py_solvers.cpp
100644 → 100755
@@ -1131,16 +1131,14 @@ RuntimeError
"Invalid functional optimizer provided (only "
"scipy.optimize.minimize supported).");
PythonOptimizer opt(func, options);
return cudaq::solvers::qaoa(problemHamiltonian, opt, numLayers,
initialParameters,
return cudaq::solvers::qaoa(problemHamiltonian, referenceHamiltonian,
opt, numLayers, initialParameters,
hetMapFromKwargs(options));
}

auto optimizerName =
cudaqx::getValueOr<std::string>(options, "optimizer", "cobyla");
auto optimizer = cudaq::optim::optimizer::get(optimizerName);
auto gradName =
cudaqx::getValueOr<std::string>(options, "gradient", "");

return cudaq::solvers::qaoa(problemHamiltonian, referenceHamiltonian,
*optimizer, numLayers, initialParameters,
@@ -1172,8 +1170,6 @@ RuntimeError
auto optimizerName =
cudaqx::getValueOr<std::string>(options, "optimizer", "cobyla");
auto optimizer = cudaq::optim::optimizer::get(optimizerName);
auto gradName =
cudaqx::getValueOr<std::string>(options, "gradient", "");
return cudaq::solvers::qaoa(problemHamiltonian, *optimizer, numLayers,
initialParameters,
hetMapFromKwargs(options));
17 changes: 17 additions & 0 deletions libs/solvers/python/tests/test_adapt.py
@@ -12,6 +12,7 @@
import numpy as np

import cudaq
from cudaq import spin
import cudaq_solvers as solvers


@@ -79,3 +80,19 @@ def initState(q: cudaq.qview):
options={'disp': True})
print(energy)
assert np.isclose(energy, -1.137, atol=1e-3)


def test_adapt_empty_gradients():
# adapt_vqe must not crash when all commutators [H, O_i] are zero.
# H and pool are all diagonal (Z-only), so [H, O_i] = 0 for each.
H = spin.z(0) * spin.z(1)
pool = [spin.z(0), spin.z(0) * spin.z(1)]

@cudaq.kernel
def initState(q: cudaq.qview):
x(q[0])
x(q[1])

energy, thetas, ops = solvers.adapt_vqe(initState, H, pool)
# Should return without crashing; no operators selected
assert len(ops) == 0
90 changes: 90 additions & 0 deletions libs/solvers/python/tests/test_adapt_mpi.py
@@ -0,0 +1,90 @@
# ============================================================================ #
# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. #
# All rights reserved. #
# #
# This source code and the accompanying materials are made available under #
# the terms of the Apache License 2.0 which accompanies this distribution. #
# ============================================================================ #

# ADAPT-VQE MPI work-splitting test.
# Verifies that ADAPT-VQE produces the correct H2 ground-state energy when
# commutator evaluation is distributed across multiple MPI ranks.
#
# When invoked by pytest, the test launches itself via mpiexec as a subprocess.
# When invoked by mpiexec, the __name__ == "__main__" block runs the actual
# MPI computation.

import os
import shutil
import subprocess

import pytest

_MPI_CMD = ["mpiexec", "--allow-run-as-root", "--oversubscribe"]


def _mpi_available():
if shutil.which("mpiexec") is None:
return False
# Probe with 2 ranks to verify multi-rank MPI actually works
# (catches missing PML transports, absent cudaq MPI plugin, etc.)
try:
rc = subprocess.run(_MPI_CMD + [
"-np", "2", "python3", "-c",
"import cudaq; cudaq.mpi.initialize(); "
"cudaq.mpi.finalize()"
],
capture_output=True,
timeout=30)
return rc.returncode == 0
except subprocess.TimeoutExpired:
return False


@pytest.mark.skipif(not _mpi_available(),
reason="mpiexec or cudaq MPI support not available")
@pytest.mark.parametrize("num_ranks", [2, 4])
def test_adapt_mpi(num_ranks):
result = subprocess.run(_MPI_CMD +
["-np", str(num_ranks), "python3", __file__],
capture_output=True,
text=True,
timeout=120)
assert result.returncode == 0, \
(f"MPI test failed (np={num_ranks}):\n"
f"--- stdout ---\n{result.stdout}\n"
f"--- stderr ---\n{result.stderr}")


if __name__ == "__main__":
import numpy as np
import cudaq
import cudaq_solvers as solvers

cudaq.mpi.initialize()

geometry = [('H', (0., 0., 0.)), ('H', (0., 0., .7474))]
molecule = solvers.create_molecule(geometry, 'sto-3g', 0, 0, casci=True)
operators = solvers.get_operator_pool("spin_complement_gsd",
num_orbitals=molecule.n_orbitals)
numElectrons = molecule.n_electrons

@cudaq.kernel
def initState(q: cudaq.qview):
for i in range(numElectrons):
x(q[i])

energy, thetas, ops = solvers.adapt_vqe(initState, molecule.hamiltonian,
operators)

rank = cudaq.mpi.rank()
num_ranks = cudaq.mpi.num_ranks()

if rank == 0:
print(f"[MPI ADAPT] ranks={num_ranks}, energy={energy:.6f}")
assert np.isclose(energy, -1.137, atol=1e-3), \
f"MPI ADAPT energy {energy} does not match expected -1.137"
assert len(ops) > 0, "Expected at least one operator to be selected"
print("PASS")

cudaq.mpi.finalize()
43 changes: 43 additions & 0 deletions libs/solvers/python/tests/test_qaoa.py
@@ -252,3 +252,46 @@ def test_clique_weighted_nodes():

expected_ham = spin.z(0) + 1.5 * spin.z(1) - 2.5
assert ham == expected_ham


def test_qaoa_custom_mixer_forwarded():
    # Verify that the referenceHamiltonian is forwarded to the C++ qaoa().
# Use a triangle maxcut (3 qubits) so p=1 cannot reach the ground state,
# making the result sensitive to the mixer choice.
problem_ham = (0.5 * spin.z(0) * spin.z(1) + 0.5 * spin.z(1) * spin.z(2) +
0.5 * spin.z(0) * spin.z(2))
# Asymmetric mixer: deliberately different from default X0+X1+X2
custom_mixer = spin.y(0) + spin.y(1) + spin.y(2)
default_mixer = spin.x(0) + spin.x(1) + spin.x(2)
init_params = [0.1, 0.1]

# Path 1: custom mixer + cobyla
result_custom = solvers.qaoa(problem_ham,
custom_mixer,
1,
init_params,
optimizer='cobyla')

# Path 2: explicit default mixer (X0+X1+X2) + cobyla
result_explicit_default = solvers.qaoa(problem_ham,
default_mixer,
1,
init_params,
optimizer='cobyla')

# Path 3: implicit default mixer + cobyla
result_implicit_default = solvers.qaoa(problem_ham,
1,
init_params,
optimizer='cobyla')

# Explicit default must match implicit default — sanity check.
assert np.isclose(result_explicit_default.optimal_value,
result_implicit_default.optimal_value,
atol=1e-3)

# Custom mixer (Y-rotations) must differ from default (X-rotations) —
# proves the mixer is not silently dropped.
assert not np.isclose(result_custom.optimal_value,
result_implicit_default.optimal_value,
atol=1e-3)
4 changes: 4 additions & 0 deletions libs/solvers/unittests/CMakeLists.txt
@@ -86,3 +86,7 @@ gtest_discover_tests(test_upccgsd_operator_pool)
add_executable(test_qaoa test_qaoa.cpp)
target_link_libraries(test_qaoa PRIVATE GTest::gtest_main cudaq-solvers test-kernels cudaq::cudaq)
gtest_discover_tests(test_qaoa)

add_executable(test_adapt_mpi test_adapt_mpi.cpp)
target_link_libraries(test_adapt_mpi PRIVATE GTest::gtest cudaq-solvers test-kernels cudaq::cudaq)
gtest_discover_tests(test_adapt_mpi)
15 changes: 15 additions & 0 deletions libs/solvers/unittests/test_adapt.cpp
@@ -499,3 +499,18 @@ TEST_F(SolversTester, checkSimpleAdaptGradientUCCSD_BeH2Sto3g) {
for (std::size_t i = 0; i < thetas.size(); i++)
printf("%lf -> %s\n", thetas[i], ops[i].to_string().c_str());
}

// adapt_vqe must not crash when all commutators [H, O_i] are zero.
TEST_F(SolversTester, checkAdaptEmptyGradients) {
// H and pool are all diagonal — they commute, so all [H, O_i] = 0.
// Use 2-qubit H so hartreeFock2Electrons (which sets q[0],q[1]) is valid.
auto H = cudaq::spin::z(0) * cudaq::spin::z(1);
std::vector<cudaq::spin_op> pool = {cudaq::spin::z(0),
cudaq::spin::z(0) * cudaq::spin::z(1)};

auto [energy, thetas, ops] = cudaq::solvers::adapt_vqe(
hartreeFock2Electrons, H, pool,
{{"grad_norm_tolerance", 1e-3}, {"max_iter", 5}});
// No operators should be selected since all commutators are zero
EXPECT_TRUE(ops.empty());
}