From 4b805c953825f7d69e8c0715a9b5c4052568acfe Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 Feb 2026 15:14:12 +1100 Subject: [PATCH 1/3] diagnostics: define the hpc diagnostic script source Define the Azure HPC diagnostics tarball download location, set up a SHA256 sum for it (as GitHub does not supply one), then add the config documentation and infrastructure to pull it down and unpack it, ready to extract the diagnostics script from it. Signed-off-by: Dave Chinner --- README.md | 24 ++++++++++++++++++++++++ defaults/main.yml | 1 + tasks/main.yml | 23 +++++++++++++++++++++++ vars/RedHat_9.yml | 7 ++++++- 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9593206..aad6315 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,30 @@ The role installs Moneo to /opt/hpc/azure/tools/Moneo and adds an alias moneo to For more information, see . +Default: `true` + +Type: `bool` + +### hpc_install_diagnostics + +Whether to install the Azure HPC Diagnostics tool. + +The Azure HPC Diagnostics tool gathers system information for triage and +debugging purposes. It collects information and state from the hardware, OS, +azure envinroment and installed applications and packages it into a tarball +to simplify the process of system support of bug triage. + +To gather diagnostics, run: + +```bash +/opt/hpc/azure/tools/gather_azhpc_vm_diagnostics.sh +``` + +The script will indicate where the tarball containing the diagnostic information +can be found. 
+ +For more information, see <https://github.com/Azure/azhpc-diagnostics> + Default: `true` Type: `bool` diff --git a/defaults/main.yml b/defaults/main.yml index 13adcfd..89f2d8a 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -30,6 +30,7 @@ hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" hpc_tuning: true hpc_sku_customisation: true +hpc_install_diagnostics: true hpc_update_kernel: true hpc_update_all_packages: false diff --git a/tasks/main.yml b/tasks/main.yml index a052bd5..8d9e0ec 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1373,6 +1373,29 @@ group: root mode: '0755' +- name: Install Azure HPC Diagnostics tool + when: hpc_install_diagnostics + block: + - name: Check if diagnostics are already installed + stat: + path: "{{ __hpc_azure_tools_dir }}/gather_azhpc_vm_diagnostics.sh" + register: __hpc_azure_diags_installed + + - name: Download and install Diagnostics + when: not __hpc_azure_diags_installed.stat.exists + block: + + - name: Download Diagnostics + include_tasks: download_extract_package.yml + vars: + __hpc_pkg_info: "{{ __hpc_azure_diags_info }}" + + - name: Remove extracted temp directory + file: + path: "{{ __hpc_pkg_extracted.path }}" + state: absent + changed_when: false + - name: Clean dnf cache command: dnf clean all changed_when: false \ No newline at end of file diff --git a/vars/RedHat_9.yml b/vars/RedHat_9.yml index 8703228..b041f82 100644 --- a/vars/RedHat_9.yml +++ b/vars/RedHat_9.yml @@ -56,13 +56,18 @@ __hpc_pmix_info: version: 4.2.9 sha256: 6b11f4fd5c9d7f8e55fc6ebdee9af04b839f44d06044e58cea38c87c168784b3 url: https://github.com/openpmix/openpmix/releases/download/v4.2.9/pmix-4.2.9.tar.bz2 - __hpc_moneo_info: name: moneo version: 0.3.4 sha256: bab588b37f9a7d03fff82ff22d8a24c18a64e18eb2dad31f447a67b6fb76bd4c url: https://github.com/Azure/Moneo/archive/refs/tags/v0.3.4.tar.gz +__hpc_azhpc_diags_info: + name: hpcdiag + version: "20220316"  # quoted: an all-digit plain scalar would load as an int, not a string + sha256: 
bcecba0ff8999131f45508718ac6eec8615550e046c77d69c148d3947647849f + url: https://github.com/Azure/azhpc-diagnostics/archive/refs/tags/hpcdiag-20220316.tar.gz + __hpc_aznhc_info: name: azurehpc-health-checks version: 0.4.5 From 24369fd83d6dacf97de2a91905d5213eb6775e30 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 Feb 2026 15:18:27 +1100 Subject: [PATCH 2/3] diagnostics: install package dependencies The script itself installs certain tools via repository deep links. Some of these tools are provided by OS packages, so pull them in via the system-role as dependent packages. Signed-off-by: Dave Chinner --- vars/main.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vars/main.yml b/vars/main.yml index 716f670..66fe93c 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -73,6 +73,11 @@ __hpc_kernel_versionlock_rpms: - kernel-modules-extra - kernel-devel - kernel-headers +__hpc_azure_diagnostics_packages: + # hyperv-tools provides lsvmbus + - hyperv-tools + - patch + __hpc_install_prefix: /opt # packages for azure specific including VM management infrastructure and storage From 686a52d4941c726f321eb10b7721a1b5ef2718d9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 Feb 2026 15:18:46 +1100 Subject: [PATCH 3/3] feat: install the Azure HPC Diagnostics script The Azure HPC diagnostics script captures information about the system hardware and software for diagnostic purposes. It is intended to supplement the RHEL sosreport diagnostics to cover the Azure specific hardware and software that the sosreport does not capture. This script will be used for information gathering in support contexts; it is not intended to be run on active HPC nodes. 
Before we install the downloaded diagnostic script, we need to change a few things in the script: - the output should be in {{ __hpc_azure_runtime_dir }}/diagnostics - permanently disable the auto update code - fix the version number instead of assuming the script is running from a local git repository - change from defaulting to online mode (requires internet access) to offline mode. --offline option goes away, replaced by --online option - indicate that the diagnostic log files should be passed on to Red Hat, not Microsoft. To make this easy, we will add a patch file to the system role that contains the code changes we need to make to the script. This is much simpler to apply than needing to do complex parser-based matches and replacements to make the changes we need. The resultant patch file will then need to be treated as a template to do path substitution for the runtime output directory. This will place the diagnostic output in a well-known place by default, rather than wherever the script was run from. The script will be installed to {{ __hpc_azure_tools_dir }}. If the script is already present in this location, then we will skip over the installation entirely. Signed-off-by: Dave Chinner --- README.md | 4 +- tasks/main.yml | 57 +++++++- templates/azhpc_vm_diagnostics.sh.patch.j2 | 153 +++++++++++++++++++++ tests/tests_default.yml | 1 + tests/tests_include_vars_from_parent.yml | 1 + tests/tests_skip_toolkit.yml | 1 + 6 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 templates/azhpc_vm_diagnostics.sh.patch.j2 diff --git a/README.md b/README.md index aad6315..b32c8c9 100644 --- a/README.md +++ b/README.md @@ -208,8 +208,8 @@ Whether to install the Azure HPC Diagnostics tool. The Azure HPC Diagnostics tool gathers system information for triage and debugging purposes. 
It collects information and state from the hardware, OS, -azure envinroment and installed applications and packages it into a tarball -to simplify the process of system support of bug triage. +Azure environment and installed applications, then packages it into a tarball +to simplify the process of system support and bug triage. To gather diagnostics, run: diff --git a/tasks/main.yml b/tasks/main.yml index 8d9e0ec..82e1f07 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1376,7 +1376,7 @@ - name: Install Azure HPC Diagnostics tool when: hpc_install_diagnostics block: - - name: Check if diagnostics are already installed + - name: Check if Diagnostics are already installed stat: path: "{{ __hpc_azure_tools_dir }}/gather_azhpc_vm_diagnostics.sh" register: __hpc_azure_diags_installed @@ -1384,11 +1384,62 @@ - name: Download and install Diagnostics when: not __hpc_azure_diags_installed.stat.exists block: + - name: Install dependencies + package: + name: "{{ __hpc_azure_diagnostics_packages }}" + state: present + use: "{{ (__hpc_server_is_ostree | d(false)) | + ternary('ansible.posix.rhel_rpm_ostree', omit) }}" + register: __hpc_azure_diagnostics_packages_install + until: __hpc_azure_diagnostics_packages_install is success - name: Download Diagnostics include_tasks: download_extract_package.yml vars: - __hpc_pkg_info: "{{ __hpc_azure_diags_info }}" + __hpc_pkg_info: "{{ __hpc_azhpc_diags_info }}" + + # The downloaded diagnostics script needs sufficient customisation that + # the simplest way to do this is to patch it. However, we also have to + # replace hard coded paths, so we need to be able to template it as + # well. + # + # Templating can only occur on the control node, so we must first run + # the patch through the template module and use the output as the patch + # source. Then we can apply the patch to the remote extracted file and + # copy it to the install location. 
+ - name: Create a temp file for the diagnostics patch + tempfile: + state: file + prefix: hpc_diags + suffix: .patch + register: __hpc_diags_patch_file + + - name: Configure the diagnostics patch + template: + src: azhpc_vm_diagnostics.sh.patch.j2 + dest: "{{ __hpc_diags_patch_file.path }}" + mode: '0644' + + - name: Patch Diagnostics script + patch: + src: "{{ __hpc_diags_patch_file.path }}" + dest: "{{ __hpc_pkg_extracted.path }}/Linux/src/gather_azhpc_vm_diagnostics.sh" + remote_src: true + strip: 1 + + - name: Install Diagnostics script + copy: + src: "{{ __hpc_pkg_extracted.path }}/Linux/src/gather_azhpc_vm_diagnostics.sh" + dest: "{{ __hpc_azure_tools_dir }}/gather_azhpc_vm_diagnostics.sh" + remote_src: true + owner: root + group: root + mode: '0755' + + - name: Clean up temporary patch file + file: + path: "{{ __hpc_diags_patch_file.path }}" + state: absent - name: Remove extracted temp directory file: @@ -1398,4 +1449,4 @@ - name: Clean dnf cache command: dnf clean all - changed_when: false \ No newline at end of file + changed_when: false diff --git a/templates/azhpc_vm_diagnostics.sh.patch.j2 b/templates/azhpc_vm_diagnostics.sh.patch.j2 new file mode 100644 index 0000000..da194c2 --- /dev/null +++ b/templates/azhpc_vm_diagnostics.sh.patch.j2 @@ -0,0 +1,153 @@ +--- gather_azhpc_vm_diagnostics.sh.orig 2026-02-05 15:20:05.410458535 +1100 ++++ gather_azhpc_vm_diagnostics.sh 2026-02-11 18:24:23.902965560 +1100 +@@ -1,4 +1,7 @@ + #!/bin/bash ++{{ ansible_managed | comment(prefix="", postfix="") | trim }} ++{{ "system_role:hpc" | comment(prefix="", postfix="") | trim }} ++ + # Azure HPC Diagnostics Tool + # Gathers Diagnostic info from guest VM + # +@@ -51,16 +54,11 @@ + # Copyright (c) Microsoft Corporation. + # Licensed under the MIT license. 
+ +- +- + #################################################################################################### + # Begin Constants + #################################################################################################### + + STREAM_URL='https://azhpcstor.blob.core.windows.net/diagtool-binaries/stream.tgz' +-LSVMBUS_URL='https://raw.githubusercontent.com/torvalds/linux/master/tools/hv/lsvmbus' +-HPC_DIAG_URL='https://raw.githubusercontent.com/Azure/azhpc-diagnostics/main/Linux/src/gather_azhpc_vm_diagnostics.sh' +-SCRIPT_DIR="$( cd "$( dirname "$0" )" >/dev/null 2>&1 && pwd )" + SYSFS_PATH=/sys # store as a variable so it is mockable + ETC_PATH=/etc + PROC_PATH=/proc +@@ -74,15 +72,7 @@ + CPU_LIST=(["Standard_HB120rs_v2"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105,109,113,117" + ["Standard_HB60rs"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57") + RELEASE_DATE=20220316 # update upon each release +-COMMIT_HASH=$( +- ( +- command -v git >/dev/null && +- cd "$SCRIPT_DIR" && +- git config --get remote.origin.url | grep -q 'Azure/azhpc-diagnostics.git$' && +- git rev-parse HEAD 2>/dev/null +- ) || +- echo 'Unknown') +-VERSION_INFO="$RELEASE_DATE-$COMMIT_HASH" ++VERSION_INFO="$RELEASE_DATE-unknown" + + HELP_MESSAGE=" + Usage: $0 [OPTION] +@@ -100,8 +90,8 @@ + Execution Mode: + --gpu-level=GPU_LEVEL dcgmi run level (default is 1) + --mem-level=MEM_LEVEL set to 1 to run stream test (default is 0) +- --no-update do not prompt for auto-update +- --offline skips steps that require Internet access ++ --no-update Does nothing, auto-update functionality has been elided. 
++ --online Run steps that require Internet access + + For more information on this script and the data it gathers, visit its Github: + +@@ -217,10 +207,9 @@ + echo "${CPU_LIST[$1]}" + } + ++COLUMNS=80 + if tput cols >/dev/null 2>/dev/null && (( $(tput cols) < 80 )); then + COLUMNS=$(tput cols) +-else +- COLUMNS=80 + fi + + print_enclosed() { +@@ -244,24 +233,6 @@ + echo '' + } + +-check_for_updates() { +- local message="You are not running the latest release of this tool. Switch to latest version?" +- +- local tmpfile +- tmpfile=$(mktemp) +- curl -s "$HPC_DIAG_URL" >"$tmpfile" || return 1 +- if ! cmp --silent "$0" "$tmpfile"; then +- if prompt "$message"; then +- mv "$tmpfile" "$0" +- bash "$0" "$RUNTIME_OPTIONS" +- exit $? +- else +- return 0 +- fi +- fi +- rm "$tmpfile" +-} +- + get_metadata() { + local path="$1" + curl -s -H Metadata:true "http://169.254.169.254/metadata/instance/$path?api-version=2021-03-01&format=text" +@@ -876,11 +847,11 @@ + print_divider + print_enclosed "NOTICES:" + print_divider +- print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Microsoft. ++ print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Red Hat. + print_divider +- print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Microsoft. ++ print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Red Hat. 
+ print_divider +- print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system Please review them and their EULAs at: ++ print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system. Please review them and their EULAs at: + print_enclosed "https://github.com/Azure/azhpc-diagnostics" + print_divider + print_enclosed WARNING: THINK BEFORE YOU RUN THIS +@@ -1018,9 +989,6 @@ + print_enclosed 'Placing diagnostic files in the following location:' + print_enclosed "$DIAG_DIR.tar.gz" + print_divider +- print_enclosed If you have already opened a support request, you can take the tarball and follow this link to upload it: +- print_enclosed 'https://portal.azure.com/#blade/Microsoft_Azure_Support/HelpAndSupportBlade/managesupportrequest' +- print_divider + tar czf "$DIAG_DIR.tar.gz" -C "$DIAG_DIR_LOC" "$VM_ID.$TIMESTAMP" 2>/dev/null && rm -r "$DIAG_DIR" + } + +@@ -1030,9 +998,11 @@ + + GPU_LEVEL=1 + MEM_LEVEL=0 ++OFFLINE=true + DISPLAY_HELP=false +-# should be /opt/azurehpc/diagnostics +-DIAG_DIR_LOC="$SCRIPT_DIR" ++DISPLAY_VERSION=false ++# should be /var/hpc/azure/diagnostics ++DIAG_DIR_LOC="{{ __hpc_azure_runtime_dir }}/diagnostics" + + # save options + RUNTIME_OPTIONS=$* +@@ -1063,8 +1033,8 @@ + validate_run_level "$1" + MEM_LEVEL="$1" + ;; +- --no-update) DISABLE_UPDATE=true;; +- --offline) OFFLINE=true;; ++ --no-update) ;; # does nothing ++ --online) OFFLINE=false;; + --tuning) TUNING=true;; + -V|--version) DISPLAY_VERSION=true;; + esac +@@ -1081,10 +1051,6 @@ + # End Option Parsing + #################################################################################################### + +-if [ "$OFFLINE" != true ] && [ "$DISABLE_UPDATE" != true ] && ! [[ $- =~ 's' ]]; then +- check_for_updates +-fi +- + if [ ! "${BASH_SOURCE[0]}" -ef "$0" ]; then + # This lets us load all functions for unit testing. + # We wouldn't want people sourcing this script anyway. 
diff --git a/tests/tests_default.yml b/tests/tests_default.yml index bf26b18..d5689da 100644 --- a/tests/tests_default.yml +++ b/tests/tests_default.yml @@ -19,6 +19,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false tasks: - name: Skip unsupported architectures include_tasks: tasks/skip_unsupported_archs.yml diff --git a/tests/tests_include_vars_from_parent.yml b/tests/tests_include_vars_from_parent.yml index d77c05b..1b54ad1 100644 --- a/tests/tests_include_vars_from_parent.yml +++ b/tests/tests_include_vars_from_parent.yml @@ -64,6 +64,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false - name: Cleanup file: diff --git a/tests/tests_skip_toolkit.yml b/tests/tests_skip_toolkit.yml index ccb0955..4e9c746 100644 --- a/tests/tests_skip_toolkit.yml +++ b/tests/tests_skip_toolkit.yml @@ -22,6 +22,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false tags: - tests::reboot tasks: