diff --git a/README.md b/README.md index 9593206..b32c8c9 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,26 @@ The role installs Moneo to /opt/hpc/azure/tools/Moneo and adds an alias moneo to For more information, see . +### hpc_install_diagnostics + +Whether to install the Azure HPC Diagnostics tool. + +The Azure HPC Diagnostics tool gathers system information for triage and +debugging purposes. It collects information and state from the hardware, OS, +Azure environment and installed applications, then packages it into a tarball +to simplify the process of system support and bug triage. + +To gather diagnostics, run: + +```bash +/opt/hpc/azure/tools/gather_azhpc_vm_diagnostics.sh +``` + +The script will indicate where the tarball containing the diagnostic information +can be found. + +For more information, see https://github.com/Azure/azhpc-diagnostics. + Default: `true` Type: `bool` diff --git a/defaults/main.yml b/defaults/main.yml index 13adcfd..89f2d8a 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -30,6 +30,7 @@ hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" hpc_tuning: true hpc_sku_customisation: true +hpc_install_diagnostics: true hpc_update_kernel: true hpc_update_all_packages: false diff --git a/tasks/main.yml b/tasks/main.yml index a052bd5..82e1f07 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1373,6 +1373,80 @@ group: root mode: '0755' +- name: Install Azure HPC Diagnostics tool + when: hpc_install_diagnostics + block: + - name: Check if Diagnostics are already installed + stat: + path: "{{ __hpc_azure_tools_dir }}/gather_azhpc_vm_diagnostics.sh" + register: __hpc_azure_diags_installed + + - name: Download and install Diagnostics + when: not __hpc_azure_diags_installed.stat.exists + block: + - name: Install dependencies + package: + name: "{{ __hpc_azure_diagnostics_packages }}" + state: present + use: "{{ (__hpc_server_is_ostree | d(false)) | +
ternary('ansible.posix.rhel_rpm_ostree', omit) }}" + register: __hpc_azure_diagnostics_packages_install + until: __hpc_azure_diagnostics_packages_install is success + + - name: Download Diagnostics + include_tasks: download_extract_package.yml + vars: + __hpc_pkg_info: "{{ __hpc_azhpc_diags_info }}" + + # The downloaded diagnostics script needs sufficient customisation that + # the simplest way to do this is to patch it. However, we also have to + # replace hard coded paths, so we need to be able to template it as + # well. + # + # Templating can only occur on the control node, so we must first run + # the patch through the template module and use the output as the patch + # source. Then we can apply the patch to the remote extracted file and + # copy it to the install location. + - name: Create a temp file for the diagnostics patch + tempfile: + state: file + prefix: hpc_diags + suffix: .patch + register: __hpc_diags_patch_file + + - name: Configure the diagnostics patch + template: + src: azhpc_vm_diagnostics.sh.patch.j2 + dest: "{{ __hpc_diags_patch_file.path }}" + mode: '0644' + + - name: Patch Diagnostics script + patch: + src: "{{ __hpc_diags_patch_file.path }}" + dest: "{{ __hpc_pkg_extracted.path }}/Linux/src/gather_azhpc_vm_diagnostics.sh" + remote_src: true + strip: 1 + + - name: Install Diagnostics script + copy: + src: "{{ __hpc_pkg_extracted.path }}/Linux/src/gather_azhpc_vm_diagnostics.sh" + dest: "{{ __hpc_azure_tools_dir }}/gather_azhpc_vm_diagnostics.sh" + remote_src: true + owner: root + group: root + mode: '0755' + + - name: Clean up temporary patch file + file: + path: "{{ __hpc_diags_patch_file.path }}" + state: absent + + - name: Remove extracted temp directory + file: + path: "{{ __hpc_pkg_extracted.path }}" + state: absent + changed_when: false + - name: Clean dnf cache command: dnf clean all - changed_when: false \ No newline at end of file + changed_when: false diff --git a/templates/azhpc_vm_diagnostics.sh.patch.j2 
b/templates/azhpc_vm_diagnostics.sh.patch.j2 new file mode 100644 index 0000000..da194c2 --- /dev/null +++ b/templates/azhpc_vm_diagnostics.sh.patch.j2 @@ -0,0 +1,153 @@ +--- gather_azhpc_vm_diagnostics.sh.orig 2026-02-05 15:20:05.410458535 +1100 ++++ gather_azhpc_vm_diagnostics.sh 2026-02-11 18:24:23.902965560 +1100 +@@ -1,4 +1,7 @@ + #!/bin/bash ++{{ ansible_managed | comment(prefix="", postfix="") | trim }} ++{{ "system_role:hpc" | comment(prefix="", postfix="") | trim }} ++ + # Azure HPC Diagnostics Tool + # Gathers Diagnostic info from guest VM + # +@@ -51,16 +54,11 @@ + # Copyright (c) Microsoft Corporation. + # Licensed under the MIT license. + +- +- + #################################################################################################### + # Begin Constants + #################################################################################################### + + STREAM_URL='https://azhpcstor.blob.core.windows.net/diagtool-binaries/stream.tgz' +-LSVMBUS_URL='https://raw.githubusercontent.com/torvalds/linux/master/tools/hv/lsvmbus' +-HPC_DIAG_URL='https://raw.githubusercontent.com/Azure/azhpc-diagnostics/main/Linux/src/gather_azhpc_vm_diagnostics.sh' +-SCRIPT_DIR="$( cd "$( dirname "$0" )" >/dev/null 2>&1 && pwd )" + SYSFS_PATH=/sys # store as a variable so it is mockable + ETC_PATH=/etc + PROC_PATH=/proc +@@ -74,15 +72,7 @@ + CPU_LIST=(["Standard_HB120rs_v2"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105,109,113,117" + ["Standard_HB60rs"]="0 1,5,9,13,17,21,25,29,33,37,41,45,49,53,57") + RELEASE_DATE=20220316 # update upon each release +-COMMIT_HASH=$( +- ( +- command -v git >/dev/null && +- cd "$SCRIPT_DIR" && +- git config --get remote.origin.url | grep -q 'Azure/azhpc-diagnostics.git$' && +- git rev-parse HEAD 2>/dev/null +- ) || +- echo 'Unknown') +-VERSION_INFO="$RELEASE_DATE-$COMMIT_HASH" ++VERSION_INFO="$RELEASE_DATE-unknown" + + HELP_MESSAGE=" + Usage: $0 [OPTION] +@@ -100,8 +90,8 @@ + Execution Mode: 
+ --gpu-level=GPU_LEVEL dcgmi run level (default is 1) + --mem-level=MEM_LEVEL set to 1 to run stream test (default is 0) +- --no-update do not prompt for auto-update +- --offline skips steps that require Internet access ++ --no-update Does nothing, auto-update functionality has been elided. ++ --online Run steps that require Internet access + + For more information on this script and the data it gathers, visit its Github: + +@@ -217,10 +207,9 @@ + echo "${CPU_LIST[$1]}" + } + ++COLUMNS=80 + if tput cols >/dev/null 2>/dev/null && (( $(tput cols) < 80 )); then + COLUMNS=$(tput cols) +-else +- COLUMNS=80 + fi + + print_enclosed() { +@@ -244,24 +233,6 @@ + echo '' + } + +-check_for_updates() { +- local message="You are not running the latest release of this tool. Switch to latest version?" +- +- local tmpfile +- tmpfile=$(mktemp) +- curl -s "$HPC_DIAG_URL" >"$tmpfile" || return 1 +- if ! cmp --silent "$0" "$tmpfile"; then +- if prompt "$message"; then +- mv "$tmpfile" "$0" +- bash "$0" "$RUNTIME_OPTIONS" +- exit $? +- else +- return 0 +- fi +- fi +- rm "$tmpfile" +-} +- + get_metadata() { + local path="$1" + curl -s -H Metadata:true "http://169.254.169.254/metadata/instance/$path?api-version=2021-03-01&format=text" +@@ -876,11 +847,11 @@ + print_divider + print_enclosed "NOTICES:" + print_divider +- print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Microsoft. ++ print_enclosed This tool generates and bundles together various logs and diagnostic information. It, however, DOES NOT TRANSMIT any of said data. It is left to the user to choose to transmit this data to Red Hat. + print_divider +- print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Microsoft. 
++ print_enclosed Some of this info, such as IP addresses, may be Personally Identifiable Information. It is up to the user to redact any sensitive info from the output 'if' necessary before sending it to Red Hat. + print_divider +- print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system Please review them and their EULAs at: ++ print_enclosed This tool invokes various 3rd party tools 'if' they are present on the system. Please review them and their EULAs at: + print_enclosed "https://github.com/Azure/azhpc-diagnostics" + print_divider + print_enclosed WARNING: THINK BEFORE YOU RUN THIS +@@ -1018,9 +989,6 @@ + print_enclosed 'Placing diagnostic files in the following location:' + print_enclosed "$DIAG_DIR.tar.gz" + print_divider +- print_enclosed If you have already opened a support request, you can take the tarball and follow this link to upload it: +- print_enclosed 'https://portal.azure.com/#blade/Microsoft_Azure_Support/HelpAndSupportBlade/managesupportrequest' +- print_divider + tar czf "$DIAG_DIR.tar.gz" -C "$DIAG_DIR_LOC" "$VM_ID.$TIMESTAMP" 2>/dev/null && rm -r "$DIAG_DIR" + } + +@@ -1030,9 +998,11 @@ + + GPU_LEVEL=1 + MEM_LEVEL=0 ++OFFLINE=true + DISPLAY_HELP=false +-# should be /opt/azurehpc/diagnostics +-DIAG_DIR_LOC="$SCRIPT_DIR" ++DISPLAY_VERSION=false ++# should be /var/hpc/azure/diagnostics ++DIAG_DIR_LOC="{{ __hpc_azure_runtime_dir }}/diagnostics" + + # save options + RUNTIME_OPTIONS=$* +@@ -1063,8 +1033,8 @@ + validate_run_level "$1" + MEM_LEVEL="$1" + ;; +- --no-update) DISABLE_UPDATE=true;; +- --offline) OFFLINE=true;; ++ --no-update) ;; # does nothing ++ --online) OFFLINE=false;; + --tuning) TUNING=true;; + -V|--version) DISPLAY_VERSION=true;; + esac +@@ -1081,10 +1051,6 @@ + # End Option Parsing + #################################################################################################### + +-if [ "$OFFLINE" != true ] && [ "$DISABLE_UPDATE" != true ] && ! 
[[ $- =~ 's' ]]; then +- check_for_updates +-fi +- + if [ ! "${BASH_SOURCE[0]}" -ef "$0" ]; then + # This lets us load all functions for unit testing. + # We wouldn't want people sourcing this script anyway. diff --git a/tests/tests_default.yml b/tests/tests_default.yml index bf26b18..d5689da 100644 --- a/tests/tests_default.yml +++ b/tests/tests_default.yml @@ -19,6 +19,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false tasks: - name: Skip unsupported architectures include_tasks: tasks/skip_unsupported_archs.yml diff --git a/tests/tests_include_vars_from_parent.yml b/tests/tests_include_vars_from_parent.yml index d77c05b..1b54ad1 100644 --- a/tests/tests_include_vars_from_parent.yml +++ b/tests/tests_include_vars_from_parent.yml @@ -64,6 +64,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false - name: Cleanup file: diff --git a/tests/tests_skip_toolkit.yml b/tests/tests_skip_toolkit.yml index ccb0955..4e9c746 100644 --- a/tests/tests_skip_toolkit.yml +++ b/tests/tests_skip_toolkit.yml @@ -22,6 +22,7 @@ hpc_install_nvidia_container_toolkit: false hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" + hpc_install_diagnostics: false tags: - tests::reboot tasks: diff --git a/vars/RedHat_9.yml b/vars/RedHat_9.yml index 8703228..b041f82 100644 --- a/vars/RedHat_9.yml +++ b/vars/RedHat_9.yml @@ -56,13 +56,18 @@ __hpc_pmix_info: version: 4.2.9 sha256: 6b11f4fd5c9d7f8e55fc6ebdee9af04b839f44d06044e58cea38c87c168784b3 url: https://github.com/openpmix/openpmix/releases/download/v4.2.9/pmix-4.2.9.tar.bz2 - __hpc_moneo_info: name: moneo 
version: 0.3.4 sha256: bab588b37f9a7d03fff82ff22d8a24c18a64e18eb2dad31f447a67b6fb76bd4c url: https://github.com/Azure/Moneo/archive/refs/tags/v0.3.4.tar.gz +__hpc_azhpc_diags_info: + name: hpcdiag + version: "20220316" + sha256: bcecba0ff8999131f45508718ac6eec8615550e046c77d69c148d3947647849f + url: https://github.com/Azure/azhpc-diagnostics/archive/refs/tags/hpcdiag-20220316.tar.gz + __hpc_aznhc_info: name: azurehpc-health-checks version: 0.4.5 diff --git a/vars/main.yml b/vars/main.yml index 716f670..66fe93c 100644 --- a/vars/main.yml +++ b/vars/main.yml @@ -73,6 +73,11 @@ __hpc_kernel_versionlock_rpms: - kernel-modules-extra - kernel-devel - kernel-headers +__hpc_azure_diagnostics_packages: + # hyperv-tools provides lsvmbus + - hyperv-tools + - patch + __hpc_install_prefix: /opt # packages for azure specific including VM management infrastructure and storage