diff --git a/.github/workflows/all_libs.yaml b/.github/workflows/all_libs.yaml
index 5aec49fc..d9a3bfcb 100644
--- a/.github/workflows/all_libs.yaml
+++ b/.github/workflows/all_libs.yaml
@@ -95,6 +95,8 @@ jobs:
       # ========================================================================
       #
       - name: Run tests
+        env:
+          OMPI_MCA_pml: ob1
         run: cmake --build ${{ steps.build.outputs.build-dir }} --target run_tests

       # ========================================================================
diff --git a/libs/solvers/lib/adapt/adapt_simulator.cpp b/libs/solvers/lib/adapt/adapt_simulator.cpp
old mode 100644
new mode 100755
index 76f76e1a..621147e2
--- a/libs/solvers/lib/adapt/adapt_simulator.cpp
+++ b/libs/solvers/lib/adapt/adapt_simulator.cpp
@@ -56,7 +56,8 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
   // poolList is split into numRanks chunks, and each chunk can be
   // further parallelized across numQpus.
   // Compute the [H,Oi]
-  std::vector<cudaq::spin_op> commutators;
+  std::vector<cudaq::spin_op> localCommutators;
+  std::vector<std::size_t> localCommutatorIndices;
   std::size_t total_elements = pool.size();
   std::size_t elements_per_rank = total_elements / numRanks;
   std::size_t remainder = total_elements % numRanks;
@@ -71,25 +72,25 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,

   auto coeff = (!isImaginary) ? std::complex<double>{0.0, 1.0}
                               : std::complex<double>{1.0, 0.0};
-  for (auto &op : pool) {
-    auto commutator = H * op - op * H;
+  // Each rank computes commutators only for its chunk [start, end)
+  for (std::size_t globalIdx = start; globalIdx < end; ++globalIdx) {
+    auto commutator = H * pool[globalIdx] - pool[globalIdx] * H;
     commutator.canonicalize().trim();
-    if (commutator.num_terms() > 0)
-      commutators.push_back(coeff * commutator);
+    if (commutator.num_terms() > 0) {
+      localCommutators.push_back(coeff * commutator);
+      localCommutatorIndices.push_back(globalIdx);
+    }
   }

-  nlohmann::json initInfo = {{"num-qpus", numQpus},
-                             {"numRanks", numRanks},
-                             {"num-pool-elements", pool.size()},
-                             {"num-elements-per-rank", end - start}};
+  nlohmann::json initInfo = {
+      {"num-qpus", numQpus},
+      {"numRanks", numRanks},
+      {"num-pool-elements", pool.size()},
+      {"num-elements-per-rank", end - start},
+      {"num-local-commutators", localCommutators.size()}};
   if (rank == 0)
     cudaq::info("[adapt] init info: {}", initInfo.dump(4));

-  // We'll need to know the local to global index map
-  std::vector<std::size_t> localToGlobalMap(end - start);
-  for (int i = 0; i < end - start; i++)
-    localToGlobalMap[i] = start + i;
-
   // Start of with the initial |psi_n>
   cudaq::state state = get_state(adapt_kernel, numQubits, initialState, thetas,
                                  coefficients, pauliWords, poolIndices);
@@ -106,25 +107,26 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
     }
     step++;

-    // Step 1 - compute <[H,Oi]> vector
+    // Step 1 - compute <[H,Oi]> vector for this rank's commutators
     std::vector<double> gradients;
     std::vector<cudaq::observe_result> results;
-    double gradNorm = 0.0;
    std::vector<cudaq::async_observe_result> resultHandles;

     if (numQpus == 1) {
-      for (std::size_t i = 0; i < commutators.size(); i++) {
+      for (std::size_t i = 0; i < localCommutators.size(); i++) {
         cudaq::info("Compute commutator {}", i);
-        results.emplace_back(observe(prepare_state, commutators[i], state));
+        results.emplace_back(
+            observe(prepare_state, localCommutators[i], state));
       }
     } else {
-      for (std::size_t i = 0, qpuCounter = 0; i < commutators.size(); i++) {
+      for (std::size_t i = 0, qpuCounter = 0; i < localCommutators.size();
+           i++) {
         if (rank == 0)
           cudaq::info("Compute commutator {}", i);
         if (qpuCounter % numQpus == 0)
           qpuCounter = 0;
-        resultHandles.emplace_back(
-            observe_async(qpuCounter++, prepare_state, commutators[i], state));
+        resultHandles.emplace_back(observe_async(qpuCounter++, prepare_state,
+                                                 localCommutators[i], state));
       }
       for (auto &handle : resultHandles)
         results.emplace_back(handle.get());
@@ -135,42 +137,53 @@ simulator::run(const cudaq::qkernel<void(cudaq::qvector<> &)> &initialState,
                    std::back_inserter(gradients),
                    [](auto &&el) { return std::fabs(el.expectation()); });

-    // Compute the local gradient norm
-    double norm = 0.0;
+    // Compute global L2 norm: sum local squares, reduce across ranks, sqrt
+    double localNormSq = 0.0;
     for (auto &g : gradients)
-      norm += g * g;
-    norm = std::sqrt(norm);
-
-    // All ranks have a norm, need to reduce that across all
+      localNormSq += g * g;
+    double globalNormSq = localNormSq;
     if (mpi::is_initialized())
-      norm = cudaq::mpi::all_reduce(norm, std::plus<double>());
+      globalNormSq = cudaq::mpi::all_reduce(localNormSq, std::plus<double>());
+    double norm = std::sqrt(globalNormSq);
+
+    // Find this rank's local max gradient and its global pool index.
+    // A rank with no local commutators contributes sentinel values.
+    double localMaxGrad = -std::numeric_limits<double>::infinity();
+    int localMaxOpIdx = -1;
+    if (!gradients.empty()) {
+      auto iter = std::max_element(gradients.begin(), gradients.end());
+      localMaxGrad = *iter;
+      localMaxOpIdx = static_cast<int>(
+          localCommutatorIndices[std::distance(gradients.begin(), iter)]);
+    }

-    auto iter = std::max_element(gradients.begin(), gradients.end());
-    double maxGrad = *iter;
-    auto maxOpIdx = std::distance(gradients.begin(), iter);
+    // Determine the global max across all ranks
+    double globalMaxGrad = localMaxGrad;
+    int globalMaxOpIdx = localMaxOpIdx;
     if (mpi::is_initialized()) {
-      std::vector<int> allMaxOpIndices(numRanks);
-      std::vector<double> allMaxGrads(numRanks);
-      // Distribute the max gradient from this rank to others
-      cudaq::mpi::all_gather(allMaxGrads, {*iter});
-      // Distribute the corresponding idx from this rank to others,
-      // make sure we map back to global indices
-      cudaq::mpi::all_gather(allMaxOpIndices,
-                             {static_cast<int>(localToGlobalMap[maxOpIdx])});
-
-      // Everyone has the indices, loop over and pick out the
-      // max from all calculations
-      std::size_t cachedIdx = 0;
-      double cachedGrad = 0.0;
-      for (std::size_t i = 0; i < allMaxGrads.size(); i++)
-        if (allMaxGrads[i] > cachedGrad) {
-          cachedGrad = allMaxGrads[i];
-          cachedIdx = allMaxOpIndices[i];
+      std::vector<double> allLocalMaxGrads(numRanks);
+      std::vector<int> allLocalMaxOpIndices(numRanks);
+      cudaq::mpi::all_gather(allLocalMaxGrads, {localMaxGrad});
+      cudaq::mpi::all_gather(allLocalMaxOpIndices, {localMaxOpIdx});
+
+      globalMaxOpIdx = -1;
+      globalMaxGrad = -std::numeric_limits<double>::infinity();
+      for (std::size_t i = 0; i < allLocalMaxGrads.size(); i++)
+        if (allLocalMaxOpIndices[i] >= 0 &&
+            allLocalMaxGrads[i] > globalMaxGrad) {
+          globalMaxGrad = allLocalMaxGrads[i];
+          globalMaxOpIdx = allLocalMaxOpIndices[i];
         }
+    }

-      maxOpIdx = cachedIdx;
+    if (globalMaxOpIdx < 0) {
+      if (rank == 0)
+        cudaq::warn("[adapt] all commutators [H, O_i] are zero; the operator "
+                    "pool may be incompatible with the Hamiltonian.");
+      break;
     }
+    auto maxOpIdx = static_cast<std::size_t>(globalMaxOpIdx);

     if (rank == 0) {
       cudaq::info("[adapt] index of element with max gradient is {}",
                   maxOpIdx);
diff --git a/libs/solvers/python/bindings/solvers/py_solvers.cpp b/libs/solvers/python/bindings/solvers/py_solvers.cpp
old mode 100644
new mode 100755
index 1ddb4612..7cf84db5
--- a/libs/solvers/python/bindings/solvers/py_solvers.cpp
+++ b/libs/solvers/python/bindings/solvers/py_solvers.cpp
@@ -1131,16 +1131,14 @@ RuntimeError
                 "Invalid functional optimizer provided (only "
                 "scipy.optimize.minimize supported).");
           PythonOptimizer opt(func, options);
-          return cudaq::solvers::qaoa(problemHamiltonian, opt, numLayers,
-                                      initialParameters,
+          return cudaq::solvers::qaoa(problemHamiltonian, referenceHamiltonian,
+                                      opt, numLayers, initialParameters,
                                       hetMapFromKwargs(options));
         }

         auto optimizerName =
             cudaqx::getValueOr<std::string>(options, "optimizer", "cobyla");
         auto optimizer = cudaq::optim::optimizer::get(optimizerName);
-        auto gradName =
-            cudaqx::getValueOr<std::string>(options, "gradient", "");

         return cudaq::solvers::qaoa(problemHamiltonian, referenceHamiltonian,
                                     *optimizer, numLayers, initialParameters,
@@ -1172,8 +1170,6 @@ RuntimeError
         auto optimizerName =
             cudaqx::getValueOr<std::string>(options, "optimizer", "cobyla");
         auto optimizer = cudaq::optim::optimizer::get(optimizerName);
-        auto gradName =
-            cudaqx::getValueOr<std::string>(options, "gradient", "");

         return cudaq::solvers::qaoa(problemHamiltonian, *optimizer, numLayers,
                                     initialParameters, hetMapFromKwargs(options));
diff --git a/libs/solvers/python/tests/test_adapt.py b/libs/solvers/python/tests/test_adapt.py
index a6966a62..196ed8d9 100644
--- a/libs/solvers/python/tests/test_adapt.py
+++ b/libs/solvers/python/tests/test_adapt.py
@@ -12,6 +12,7 @@
 import numpy as np

 import cudaq
+from cudaq import spin
 import cudaq_solvers as solvers


@@ -79,3 +80,19 @@ def initState(q: cudaq.qview):
         options={'disp': True})
     print(energy)
     assert np.isclose(energy, -1.137, atol=1e-3)
+
+
+def test_adapt_empty_gradients():
+    # adapt_vqe must not crash when all commutators [H, O_i] are zero.
+    # H and pool are all diagonal (Z-only), so [H, O_i] = 0 for each.
+    H = spin.z(0) * spin.z(1)
+    pool = [spin.z(0), spin.z(0) * spin.z(1)]
+
+    @cudaq.kernel
+    def initState(q: cudaq.qview):
+        x(q[0])
+        x(q[1])
+
+    energy, thetas, ops = solvers.adapt_vqe(initState, H, pool)
+    # Should return without crashing; no operators selected
+    assert len(ops) == 0
diff --git a/libs/solvers/python/tests/test_adapt_mpi.py b/libs/solvers/python/tests/test_adapt_mpi.py
new file mode 100644
index 00000000..794ba8bb
--- /dev/null
+++ b/libs/solvers/python/tests/test_adapt_mpi.py
@@ -0,0 +1,90 @@
+# ============================================================================ #
+# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.    #
+# ============================================================================ #
+
+# ADAPT-VQE MPI work-splitting test.
+# Verifies that ADAPT-VQE produces the correct H2 ground-state energy when
+# commutator evaluation is distributed across multiple MPI ranks.
+#
+# When invoked by pytest, the test launches itself via mpiexec as a subprocess.
+# When invoked by mpiexec, the __name__ == "__main__" block runs the actual
+# MPI computation.
+
+import os
+import shutil
+import subprocess
+
+import pytest
+
+_MPI_CMD = ["mpiexec", "--allow-run-as-root", "--oversubscribe"]
+
+
+def _mpi_available():
+    if shutil.which("mpiexec") is None:
+        return False
+    # Probe with 2 ranks to verify multi-rank MPI actually works
+    # (catches missing PML transports, absent cudaq MPI plugin, etc.)
+    try:
+        rc = subprocess.run(_MPI_CMD + [
+            "-np", "2", "python3", "-c",
+            "import cudaq; cudaq.mpi.initialize(); "
+            "cudaq.mpi.finalize()"
+        ],
+                            capture_output=True,
+                            timeout=30)
+        return rc.returncode == 0
+    except subprocess.TimeoutExpired:
+        return False
+
+
+@pytest.mark.skipif(not _mpi_available(),
+                    reason="mpiexec or cudaq MPI support not available")
+@pytest.mark.parametrize("num_ranks", [2, 4])
+def test_adapt_mpi(num_ranks):
+    result = subprocess.run(_MPI_CMD +
+                            ["-np", str(num_ranks), "python3", __file__],
+                            capture_output=True,
+                            text=True,
+                            timeout=120)
+    assert result.returncode == 0, \
+        (f"MPI test failed (np={num_ranks}):\n"
+         f"--- stdout ---\n{result.stdout}\n"
+         f"--- stderr ---\n{result.stderr}")
+
+
+if __name__ == "__main__":
+    import numpy as np
+    import cudaq
+    import cudaq_solvers as solvers
+
+    cudaq.mpi.initialize()
+
+    geometry = [('H', (0., 0., 0.)), ('H', (0., 0., .7474))]
+    molecule = solvers.create_molecule(geometry, 'sto-3g', 0, 0, casci=True)
+    operators = solvers.get_operator_pool("spin_complement_gsd",
+                                          num_orbitals=molecule.n_orbitals)
+    numElectrons = molecule.n_electrons
+
+    @cudaq.kernel
+    def initState(q: cudaq.qview):
+        for i in range(numElectrons):
+            x(q[i])
+
+    energy, thetas, ops = solvers.adapt_vqe(initState, molecule.hamiltonian,
+                                            operators)
+
+    rank = cudaq.mpi.rank()
+    num_ranks = cudaq.mpi.num_ranks()
+
+    if rank == 0:
+        print(f"[MPI ADAPT] ranks={num_ranks}, energy={energy:.6f}")
+        assert np.isclose(energy, -1.137, atol=1e-3), \
+            f"MPI ADAPT energy {energy} does not match expected -1.137"
+        assert len(ops) > 0, "Expected at least one operator to be selected"
+        print("PASS")
+
+    cudaq.mpi.finalize()
diff --git a/libs/solvers/python/tests/test_qaoa.py b/libs/solvers/python/tests/test_qaoa.py
index c8276cc2..e83574b6 100644
--- a/libs/solvers/python/tests/test_qaoa.py
+++ b/libs/solvers/python/tests/test_qaoa.py
@@ -252,3 +252,46 @@ def test_clique_weighted_nodes():
     expected_ham = spin.z(0) + 1.5 * spin.z(1) - 2.5

     assert ham == expected_ham
+
+
+def test_qaoa_custom_mixer_forwarded():
+    # Verify that the referenceHamiltonian is forwarded to the C++ qaoa().
+    # Use a triangle maxcut (3 qubits) so p=1 cannot reach the ground state,
+    # making the result sensitive to the mixer choice.
+    problem_ham = (0.5 * spin.z(0) * spin.z(1) + 0.5 * spin.z(1) * spin.z(2) +
+                   0.5 * spin.z(0) * spin.z(2))
+    # Asymmetric mixer: deliberately different from default X0+X1+X2
+    custom_mixer = spin.y(0) + spin.y(1) + spin.y(2)
+    default_mixer = spin.x(0) + spin.x(1) + spin.x(2)
+    init_params = [0.1, 0.1]
+
+    # Path 1: custom mixer + cobyla
+    result_custom = solvers.qaoa(problem_ham,
+                                 custom_mixer,
+                                 1,
+                                 init_params,
+                                 optimizer='cobyla')
+
+    # Path 2: explicit default mixer (X0+X1+X2) + cobyla
+    result_explicit_default = solvers.qaoa(problem_ham,
+                                           default_mixer,
+                                           1,
+                                           init_params,
+                                           optimizer='cobyla')
+
+    # Path 3: implicit default mixer + cobyla
+    result_implicit_default = solvers.qaoa(problem_ham,
+                                           1,
+                                           init_params,
+                                           optimizer='cobyla')
+
+    # Explicit default must match implicit default; sanity check.
+    assert np.isclose(result_explicit_default.optimal_value,
+                      result_implicit_default.optimal_value,
+                      atol=1e-3)
+
+    # Custom mixer (Y-rotations) must differ from default (X-rotations),
+    # proving the mixer is not silently dropped.
+    assert not np.isclose(result_custom.optimal_value,
+                          result_implicit_default.optimal_value,
+                          atol=1e-3)
diff --git a/libs/solvers/unittests/CMakeLists.txt b/libs/solvers/unittests/CMakeLists.txt
index a5f909de..9ccab679 100644
--- a/libs/solvers/unittests/CMakeLists.txt
+++ b/libs/solvers/unittests/CMakeLists.txt
@@ -86,3 +86,7 @@ gtest_discover_tests(test_upccgsd_operator_pool)
 add_executable(test_qaoa test_qaoa.cpp)
 target_link_libraries(test_qaoa PRIVATE GTest::gtest_main cudaq-solvers test-kernels cudaq::cudaq)
 gtest_discover_tests(test_qaoa)
+
+add_executable(test_adapt_mpi test_adapt_mpi.cpp)
+target_link_libraries(test_adapt_mpi PRIVATE GTest::gtest cudaq-solvers test-kernels cudaq::cudaq)
+gtest_discover_tests(test_adapt_mpi)
diff --git a/libs/solvers/unittests/test_adapt.cpp b/libs/solvers/unittests/test_adapt.cpp
index 4600fa29..24b1b0f9 100644
--- a/libs/solvers/unittests/test_adapt.cpp
+++ b/libs/solvers/unittests/test_adapt.cpp
@@ -499,3 +499,18 @@ TEST_F(SolversTester, checkSimpleAdaptGradientUCCSD_BeH2Sto3g) {
   for (std::size_t i = 0; i < thetas.size(); i++)
     printf("%lf -> %s\n", thetas[i], ops[i].to_string().c_str());
 }
+
+// adapt_vqe must not crash when all commutators [H, O_i] are zero.
+TEST_F(SolversTester, checkAdaptEmptyGradients) {
+  // H and pool are all diagonal; they commute, so all [H, O_i] = 0.
+  // Use 2-qubit H so hartreeFock2Electrons (which sets q[0],q[1]) is valid.
+  auto H = cudaq::spin::z(0) * cudaq::spin::z(1);
+  std::vector<cudaq::spin_op> pool = {cudaq::spin::z(0),
+                                      cudaq::spin::z(0) * cudaq::spin::z(1)};
+
+  auto [energy, thetas, ops] = cudaq::solvers::adapt_vqe(
+      hartreeFock2Electrons, H, pool,
+      {{"grad_norm_tolerance", 1e-3}, {"max_iter", 5}});
+  // No operators should be selected since all commutators are zero
+  EXPECT_TRUE(ops.empty());
+}
diff --git a/libs/solvers/unittests/test_adapt_mpi.cpp b/libs/solvers/unittests/test_adapt_mpi.cpp
new file mode 100644
index 00000000..60d9e07c
--- /dev/null
+++ b/libs/solvers/unittests/test_adapt_mpi.cpp
@@ -0,0 +1,109 @@
+/*******************************************************************************
+ * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+// ADAPT-VQE MPI work-splitting test.
+// Verifies that ADAPT-VQE produces the correct H2 ground-state energy when
+// commutator evaluation is distributed across multiple MPI ranks.
+//
+// When run by gtest (ctest), the test launches itself via mpiexec as a
+// subprocess. When invoked with --mpi-worker, the actual MPI computation runs.
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "cudaq.h"
+#include "nvqpp/test_kernels.h"
+#include "cudaq/solvers/adapt.h"
+#include "cudaq/solvers/operators.h"
+
+// Lightweight MPI init/finalize used by the skip-check probe.
+static int runMpiProbe() {
+  cudaq::mpi::initialize();
+  cudaq::mpi::finalize();
+  return 0;
+}
+
+static int runMpiWorker() {
+  cudaq::mpi::initialize();
+
+  auto geometryHH = cudaq::solvers::molecular_geometry{{"H", {0., 0., 0.}},
+                                                       {"H", {0., 0., .7474}}};
+  auto hh = cudaq::solvers::create_molecule(
+      geometryHH, "sto-3g", 0, 0,
+      {.casci = true, .ccsd = false, .verbose = false});
+
+  auto pool = cudaq::solvers::operator_pool::get("spin_complement_gsd");
+  auto poolList =
+      pool->generate({{"num-orbitals", hh.hamiltonian.num_qubits() / 2}});
+
+  auto [energy, thetas, ops] =
+      cudaq::solvers::adapt_vqe(hartreeFock2Electrons, hh.hamiltonian, poolList,
+                                {{"grad_norm_tolerance", 1e-3},
+                                 {"max_iter", 15},
+                                 {"grad_norm_diff_tolerance", 1e-5},
+                                 {"threshold_energy", 5e-6}});
+
+  int rc = 0;
+  if (cudaq::mpi::rank() == 0) {
+    std::cout << "[MPI ADAPT] ranks=" << cudaq::mpi::num_ranks()
+              << ", energy=" << energy << std::endl;
+    if (std::fabs(energy - (-1.13)) >= 1e-2) {
+      std::cerr << "FAIL: energy " << energy << " != expected -1.13"
+                << std::endl;
+      rc = 1;
+    } else if (ops.empty()) {
+      std::cerr << "FAIL: no operators selected" << std::endl;
+      rc = 1;
+    } else {
+      std::cout << "PASS" << std::endl;
+    }
+  }
+
+  cudaq::mpi::finalize();
+  return rc;
+}
+
+class SolversTester : public ::testing::TestWithParam<int> {};
+
+TEST_P(SolversTester, checkSimpleAdaptMpi_H2Sto3g) {
+  if (std::system("which mpiexec > /dev/null 2>&1") != 0)
+    GTEST_SKIP() << "mpiexec not found, skipping MPI test";
+
+  // Probe with 2 ranks to verify MPI can actually launch multi-rank jobs
+  // (catches missing PML transports, absent cudaq MPI plugin, etc.)
+  std::string self = ::testing::internal::GetArgvs()[0];
+  std::string probeCmd = "mpiexec --allow-run-as-root --oversubscribe -np 2 " +
+                         self + " --mpi-probe > /dev/null 2>&1";
+  if (std::system(probeCmd.c_str()) != 0)
+    GTEST_SKIP() << "MPI multi-rank launch not functional, skipping MPI test";
+
+  int numRanks = GetParam();
+
+  std::string cmd = "mpiexec --allow-run-as-root --oversubscribe -np " +
+                    std::to_string(numRanks) + " " + self + " --mpi-worker";
+  int rc = std::system(cmd.c_str());
+  EXPECT_EQ(rc, 0) << "mpiexec failed with exit code " << rc;
+}
+
+INSTANTIATE_TEST_SUITE_P(MpiRanks, SolversTester, ::testing::Values(2, 4));
+
+int main(int argc, char **argv) {
+  for (int i = 1; i < argc; i++) {
+    if (std::string(argv[i]) == "--mpi-probe")
+      return runMpiProbe();
+    if (std::string(argv[i]) == "--mpi-worker")
+      return runMpiWorker();
+  }
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}