Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 22 additions & 14 deletions tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,20 @@
ternary('ansible.posix.rhel_rpm_ostree', omit) }}"
register: __hpc_azure_packages_install
until: __hpc_azure_packages_install is success

# Ensure every directory listed in the loop exists, owned by root:root with
# mode 0755. This task has to run before any task that writes into one of
# these directories (for example, installing the RDMA validation script into
# __hpc_azure_tests_dir).
- name: Create Azure HPC resource directories
file:
path: "{{ item }}"
state: directory
owner: root
group: root
mode: '0755'
loop:
- "{{ __hpc_azure_resource_dir }}"
- "{{ __hpc_azure_resource_dir }}/bin"
- "{{ __hpc_azure_tools_dir }}"
- "{{ __hpc_azure_tests_dir }}"
- "{{ __hpc_azure_runtime_dir }}"

- name: Install NVidia driver
# Note that currently the role supports only Microsoft Azure
Expand Down Expand Up @@ -538,6 +552,14 @@
state: started
daemon_reload: "{{ __hpc_azure_persistent_rdma_naming_monitor_unit.changed | d(false) }}"

# Render rdma/test-rdma.sh.j2 to test-rdma.sh inside __hpc_azure_tests_dir,
# owned by root:root and executable (mode 0755). The destination directory
# is expected to exist already when this task runs.
- name: Install RDMA validation script
template:
src: rdma/test-rdma.sh.j2
dest: "{{ __hpc_azure_tests_dir }}/test-rdma.sh"
owner: root
group: root
mode: "0755"

- name: Install common OpenMPI packages
when: hpc_install_system_openmpi or hpc_build_openmpi_w_nvidia_gpu_support
package:
Expand Down Expand Up @@ -958,20 +980,6 @@
mode: '0644'
notify: Reload udev

- name: Create Azure HPC resource directories
file:
path: "{{ item }}"
state: directory
owner: root
group: root
mode: '0755'
loop:
- "{{ __hpc_azure_resource_dir }}"
- "{{ __hpc_azure_resource_dir }}/bin"
- "{{ __hpc_azure_tools_dir }}"
- "{{ __hpc_azure_tests_dir }}"
- "{{ __hpc_azure_runtime_dir }}"

- name: Install SKU Customisation scripts and services
when: hpc_sku_customisation
block:
Expand Down
105 changes: 105 additions & 0 deletions templates/rdma/test-rdma.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing is templated in this file except for `ansible_managed` and the "system_role:hpc" fingerprint. This is fine, but is it intended?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@spetrosi This is done only to stay consistent with the existing convention: the other tests are also kept in template form.

# These are templates, not actual shell scripts, so tell shellcheck to
# ignore the templated parts
# shellcheck disable=all
{{ ansible_managed | comment }}
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
# shellcheck enable=all
# SPDX-License-Identifier: MIT
#
# RDMA Validation Script
# Usage: test-rdma.sh
#

# This is test code, and some operations are expected to fail. Hence we can't
# use set -e to automatically exit the script if something fails.
# Only -u is enabled, so that expanding an unset variable is treated as an
# error instead of silently yielding an empty string.
set -u

# Report a failed check and abort the script.
# Arguments: $1 - description of what failed
# Outputs:   "Failed: <description>" on stdout
# Exits:     always, with status 1 (never returns to the caller)
fail() {
  local reason="$1"
  printf 'Failed: %s\n' "$reason"
  exit 1
}

# Assert that a filesystem entry exists; abort through fail otherwise.
# Arguments: $1 - path to check (any file type satisfies the -e test)
# Returns:   0 when the path exists; does not return when it is missing,
#            because fail exits the script
require_file() {
  local target="$1"
  if [[ ! -e "$target" ]]; then
    fail "missing file: $target"
  fi
}

# Assert that a path exists and is executable; abort through fail otherwise.
# Arguments: $1 - path to check
# Returns:   0 when the -x test succeeds; does not return otherwise,
#            because fail exits the script
require_executable() {
  local target="$1"
  if [[ ! -x "$target" ]]; then
    fail "not executable: $target"
  fi
}

# Assert that a command can be resolved by the shell; abort through fail
# otherwise.
# Arguments: $1 - command name to look up with "command -v"
# Returns:   0 when the command is found; does not return otherwise,
#            because fail exits the script
require_cmd() {
  local tool="$1"
  # The lookup output is discarded; only the exit status matters.
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "missing command in PATH: $tool"
  fi
}

# Print the DMI system vendor string of this machine.
# Outputs: contents of /sys/class/dmi/id/sys_vendor when that file is
#          readable, otherwise an empty line
sys_vendor() {
  local dmi_vendor=/sys/class/dmi/id/sys_vendor
  # No readable DMI entry: report an empty vendor rather than an error.
  [[ -r "$dmi_vendor" ]] || { echo ""; return; }
  cat "$dmi_vendor"
}

# Tell whether PID 1 is systemd.
# Returns: 0 when ps reports the command name of PID 1 as exactly "systemd",
#          non-zero otherwise (including when ps fails or is unavailable)
is_systemd() {
  local init_comm
  # Errors from ps are swallowed on purpose: a failed lookup simply leaves
  # the name empty, which is reported as "not systemd".
  init_comm=$(ps -p 1 -o comm= 2>/dev/null || true)
  [[ "$init_comm" == "systemd" ]]
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check whether this is a systemd environment, even though we are running in a RHEL 9.6 systemd environment?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it doesn't hurt, and it "future proofs" this code.


# Run every RDMA validation check in order, stopping at the first failure.
#
# Checks performed:
#   1. /etc/waagent.conf contains the exact line OS.EnableRDMA=y
#   2. ibv_devinfo can be found in PATH
#   3. Only when the DMI vendor is "Microsoft Corporation" and PID 1 is
#      systemd: the persistent RDMA naming scripts, unit files and udev rule
#      exist, both units are enabled, the naming unit is not in the failed
#      state, and the monitor unit is active
#
# Arguments: none are used
# Outputs:   progress lines and "Test Passed: ..." lines on stdout
# Returns:   0 when all applicable checks pass, including when the Azure
#            checks are skipped; a failed check ends the script through fail
main() {
  local waagent_conf=/etc/waagent.conf
  local naming_unit=azure_persistent_rdma_naming.service
  local monitor_unit=azure_persistent_rdma_naming_monitor.service
  local entry naming_state

  printf '\n%s\n' "Testing waagent RDMA flag"
  require_file "$waagent_conf"
  # -F: fixed string, -x: the whole line must match, -q: status only.
  if ! grep -Fxq "OS.EnableRDMA=y" "$waagent_conf"; then
    fail "expected 'OS.EnableRDMA=y' in /etc/waagent.conf"
  fi
  echo "Test Passed: waagent RDMA flag is set"

  printf '\n%s\n' "Testing RDMA userland tools"
  require_cmd ibv_devinfo
  echo "Test Passed: RDMA tools are present (ibv_devinfo)"

  # Azure persistent RDMA naming artifacts/services (Azure only)
  if [[ "$(sys_vendor)" != "Microsoft Corporation" ]]; then
    printf '\n%s\n' "Testing Azure persistent RDMA naming (skip: not Azure)"
    echo "Test Passed: not running on Azure; Azure persistent RDMA naming checks skipped"
    return 0
  fi

  # The remaining checks inspect systemd units, so they only make sense when
  # systemd is the init process.
  if ! is_systemd; then
    printf '\n%s\n' "Testing Azure persistent RDMA naming (skip: not systemd)"
    echo "Test Passed: not running systemd; systemd unit checks skipped"
    return 0
  fi

  printf '\n%s\n' "Testing Azure persistent RDMA naming artifacts"
  for entry in \
    /usr/sbin/azure_persistent_rdma_naming.sh \
    /usr/sbin/azure_persistent_rdma_naming_monitor.sh; do
    require_executable "$entry"
  done
  for entry in \
    "/etc/systemd/system/$naming_unit" \
    "/etc/systemd/system/$monitor_unit" \
    /etc/udev/rules.d/99-azure-persistent-rdma-naming.rules; do
    require_file "$entry"
  done
  echo "Test Passed: Azure persistent RDMA naming artifacts exist"

  printf '\n%s\n' "Testing Azure persistent RDMA naming services"
  require_cmd systemctl
  for entry in "$naming_unit" "$monitor_unit"; do
    systemctl is-enabled "$entry" >/dev/null 2>&1 || fail "$entry not enabled"
  done

  # azure_persistent_rdma_naming.service is Type=oneshot, so it may not remain
  # "active" after it runs. Treat "failed" as an error; other states are OK.
  naming_state=$(systemctl is-failed "$naming_unit" 2>/dev/null || true)
  if [[ "$naming_state" == "failed" ]]; then
    fail "azure_persistent_rdma_naming.service is in failed state"
  fi

  # Monitor service should be continuously running.
  systemctl is-active "$monitor_unit" >/dev/null 2>&1 \
    || fail "azure_persistent_rdma_naming_monitor.service not active"
  echo "Test Passed: Azure persistent RDMA naming services look healthy"
}

# Script entry point: pass all command-line arguments through to main.
main "$@"

Loading