From 123a8dccacc2f6e83d0738091ef23a5ac86eef29 Mon Sep 17 00:00:00 2001 From: Gaurav Goklani Date: Wed, 11 Feb 2026 14:27:18 +0530 Subject: [PATCH] test: Added an RDMA validation testcase that checks the expected RDMA enablement and Azure RDMA persistent naming setup. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test automatically verifies that the role correctly enables RDMA in waagent, installs RDMA userland tools, and (on Azure/systemd) configures and maintains the Azure persistent RDMA naming services. How to run: Execute {{ __hpc_azure_tests_dir }}/test-rdma.sh after the role completes. Expected result: Exit 0 with “Test Passed …” lines; non-zero exit with a “Failed: …” line explaining the missing/failed prerequisite. Also moved the "Create Azure HPC resource directories" task earlier in tasks/main.yml so that tasks writing into those directories no longer fail with a path-not-found error. Signed-off-by: Gaurav Goklani --- tasks/main.yml | 36 ++++++----- templates/rdma/test-rdma.sh.j2 | 105 +++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 14 deletions(-) create mode 100644 templates/rdma/test-rdma.sh.j2 diff --git a/tasks/main.yml b/tasks/main.yml index a052bd5..b2c9b69 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -313,6 +313,20 @@ ternary('ansible.posix.rhel_rpm_ostree', omit) }}" register: __hpc_azure_packages_install until: __hpc_azure_packages_install is success + +- name: Create Azure HPC resource directories + file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: '0755' + loop: + - "{{ __hpc_azure_resource_dir }}" + - "{{ __hpc_azure_resource_dir }}/bin" + - "{{ __hpc_azure_tools_dir }}" + - "{{ __hpc_azure_tests_dir }}" + - "{{ __hpc_azure_runtime_dir }}" - name: Install NVidia driver # Note that currently the role supports only Microsoft Azure @@ -538,6 +552,14 @@ state: started daemon_reload: "{{ __hpc_azure_persistent_rdma_naming_monitor_unit.changed | d(false) }}" + - name: Install RDMA validation script + template: + src: 
rdma/test-rdma.sh.j2 + dest: "{{ __hpc_azure_tests_dir }}/test-rdma.sh" + owner: root + group: root + mode: "0755" + - name: Install common OpenMPI packages when: hpc_install_system_openmpi or hpc_build_openmpi_w_nvidia_gpu_support package: @@ -958,20 +980,6 @@ mode: '0644' notify: Reload udev -- name: Create Azure HPC resource directories - file: - path: "{{ item }}" - state: directory - owner: root - group: root - mode: '0755' - loop: - - "{{ __hpc_azure_resource_dir }}" - - "{{ __hpc_azure_resource_dir }}/bin" - - "{{ __hpc_azure_tools_dir }}" - - "{{ __hpc_azure_tests_dir }}" - - "{{ __hpc_azure_runtime_dir }}" - - name: Install SKU Customisation scripts and services when: hpc_sku_customisation block: diff --git a/templates/rdma/test-rdma.sh.j2 b/templates/rdma/test-rdma.sh.j2 new file mode 100644 index 0000000..5d8e2e7 --- /dev/null +++ b/templates/rdma/test-rdma.sh.j2 @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# These are templates, not actual shell scripts, so tell shellcheck to +# ignore the templated parts +# shellcheck disable=all +{{ ansible_managed | comment }} +{{ "system_role:hpc" | comment(prefix="", postfix="") }} +# shellcheck enable=all +# SPDX-License-Identifier: MIT +# +# RDMA Validation Script +# Usage: test-rdma.sh +# + +# This is test code, and some operations are expected to fail. Hence we can't +# use set -e to automatically exit the script if something fails. 
+set -u + +fail() +{ + echo Failed: "$1" + exit 1 +} + +require_file() { + local path="$1" + [[ -e "$path" ]] || fail "missing file: $path" +} + +require_executable() { + local path="$1" + [[ -x "$path" ]] || fail "not executable: $path" +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || fail "missing command in PATH: $cmd" +} + +sys_vendor() { + if [[ -r /sys/class/dmi/id/sys_vendor ]]; then + cat /sys/class/dmi/id/sys_vendor + else + echo "" + fi +} + +is_systemd() { + [[ "$(ps -p 1 -o comm= 2>/dev/null || true)" == "systemd" ]] +} + +main() { + echo + echo "Testing waagent RDMA flag" + require_file /etc/waagent.conf + grep -Fxq "OS.EnableRDMA=y" /etc/waagent.conf || fail "expected 'OS.EnableRDMA=y' in /etc/waagent.conf" + echo Test Passed: "waagent RDMA flag is set" + + echo + echo "Testing RDMA userland tools" + require_cmd ibv_devinfo + echo Test Passed: "RDMA tools are present (ibv_devinfo)" + + # Azure persistent RDMA naming artifacts/services (Azure only) + if [ "$(sys_vendor)" != "Microsoft Corporation" ]; then + echo + echo "Testing Azure persistent RDMA naming (skip: not Azure)" + echo Test Passed: "not running on Azure; Azure persistent RDMA naming checks skipped" + return 0 + fi + + if ! 
is_systemd; then + echo + echo "Testing Azure persistent RDMA naming (skip: not systemd)" + echo Test Passed: "not running systemd; systemd unit checks skipped" + return 0 + fi + + echo + echo "Testing Azure persistent RDMA naming artifacts" + require_executable /usr/sbin/azure_persistent_rdma_naming.sh + require_executable /usr/sbin/azure_persistent_rdma_naming_monitor.sh + require_file /etc/systemd/system/azure_persistent_rdma_naming.service + require_file /etc/systemd/system/azure_persistent_rdma_naming_monitor.service + require_file /etc/udev/rules.d/99-azure-persistent-rdma-naming.rules + echo Test Passed: "Azure persistent RDMA naming artifacts exist" + + echo + echo "Testing Azure persistent RDMA naming services" + require_cmd systemctl + systemctl is-enabled azure_persistent_rdma_naming.service >/dev/null 2>&1 || fail "azure_persistent_rdma_naming.service not enabled" + systemctl is-enabled azure_persistent_rdma_naming_monitor.service >/dev/null 2>&1 || fail "azure_persistent_rdma_naming_monitor.service not enabled" + + # azure_persistent_rdma_naming.service is Type=oneshot, so it may not remain + # "active" after it runs. Treat "failed" as an error; other states are OK. + if [ "$(systemctl is-failed azure_persistent_rdma_naming.service 2>/dev/null || true)" = "failed" ]; then + fail "azure_persistent_rdma_naming.service is in failed state" + fi + + # Monitor service should be continuously running. + systemctl is-active azure_persistent_rdma_naming_monitor.service >/dev/null 2>&1 || fail "azure_persistent_rdma_naming_monitor.service not active" + echo Test Passed: "Azure persistent RDMA naming services look healthy" +} + +main "$@" +