From 2c45c6cffa9b5068196c22c360ff0071e197f074 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Tue, 28 Oct 2025 15:27:31 +0000 Subject: [PATCH 1/9] Adding tasks to essi/configure.yml to make eessi configure gpu node automatically --- .../roles/compute_init/files/compute-init.yml | 16 ++++++++++++++++ ansible/roles/eessi/tasks/configure.yml | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 81dedf8fb..0a21303a6 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -294,6 +294,22 @@ ansible.builtin.command: cmd: "cvmfs_config setup" + # configure gpus + - name: Check for NVIDIA driver + ansible.builtin.stat: + path: /dev/nvidia0 + register: nvidia_driver + + - name: Set fact if NVIDIA driver is present + ansible.builtin.set_fact: + has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" + + - name: Expose GPU drivers + ansible.builtin.shell: | + source /cvmfs/software.eessi.io/versions/2023.06/init/bash + /cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + when: has_nvidia_driver + - name: Configure VGPUs ansible.builtin.include_role: name: stackhpc.linux.vgpu diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml index 2c765d20c..23030c3e5 100644 --- a/ansible/roles/eessi/tasks/configure.yml +++ b/ansible/roles/eessi/tasks/configure.yml @@ -15,3 +15,20 @@ - name: Ensure CVMFS config is setup # noqa: no-changed-when ansible.builtin.command: cmd: "cvmfs_config setup" + +# configure gpus +- name: Check for NVIDIA driver + ansible.builtin.stat: + path: /dev/nvidia0 + register: nvidia_driver + +- name: Set fact if NVIDIA driver is present + ansible.builtin.set_fact: + has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" + +- name: Expose GPU drivers + ansible.builtin.shell: | + source /cvmfs/software.eessi.io/versions/2023.06/init/bash + /cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + when: has_nvidia_driver + changed_when: true From fa78672bf8267abe189a842d6791464d6dec1903 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Tue, 28 Oct 2025 15:27:31 +0000 Subject: [PATCH 2/9] Adding tasks to essi/configure.yml to make eessi configure gpu node automatically --- .../roles/compute_init/files/compute-init.yml | 16 ++++++++++++++++ ansible/roles/eessi/tasks/configure.yml | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 81dedf8fb..0a21303a6 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -294,6 +294,22 @@ ansible.builtin.command: cmd: "cvmfs_config setup" + # configure gpus + - name: Check for NVIDIA driver + ansible.builtin.stat: + path: /dev/nvidia0 + register: nvidia_driver + + - name: Set fact if NVIDIA driver is present + ansible.builtin.set_fact: + has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" + + - name: Expose GPU drivers + ansible.builtin.shell: | + source /cvmfs/software.eessi.io/versions/2023.06/init/bash + /cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + when: has_nvidia_driver + - name: Configure VGPUs ansible.builtin.include_role: name: stackhpc.linux.vgpu diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml index 2c765d20c..23030c3e5 100644 --- a/ansible/roles/eessi/tasks/configure.yml +++ b/ansible/roles/eessi/tasks/configure.yml @@ -15,3 +15,20 @@ - name: Ensure CVMFS config is setup # noqa: no-changed-when ansible.builtin.command: cmd: "cvmfs_config setup" + +# configure gpus +- name: Check for NVIDIA driver + ansible.builtin.stat: + path: /dev/nvidia0 + register: nvidia_driver + +- name: Set fact if NVIDIA driver is present + ansible.builtin.set_fact: + has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" + +- name: Expose GPU drivers + ansible.builtin.shell: | + source /cvmfs/software.eessi.io/versions/2023.06/init/bash + /cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + when: has_nvidia_driver + changed_when: true From d5ac3f76d0f856a6f0334a11f44ab93e47f023f7 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Tue, 4 Nov 2025 11:55:57 +0000 Subject: [PATCH 3/9] applying edits to task names --- ansible/roles/eessi/tasks/configure.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml index 23030c3e5..c5949ba76 100644 --- a/ansible/roles/eessi/tasks/configure.yml +++ b/ansible/roles/eessi/tasks/configure.yml @@ -17,12 +17,12 @@ cmd: "cvmfs_config setup" # configure gpus -- name: Check for NVIDIA driver +- name: Check for NVIDIA GPU ansible.builtin.stat: path: /dev/nvidia0 register: nvidia_driver -- name: Set fact if NVIDIA driver is present +- name: Set fact if NVIDIA GPU is present ansible.builtin.set_fact: has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" From 5f1dddeb797a7753d19074be6cecf289a399d657 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Fri, 7 Nov 2025 16:54:46 +0000 Subject: [PATCH 4/9] replacing EESSI block with running the configure.yml task directly --- .../roles/compute_init/files/compute-init.yml | 33 ++----------------- ansible/roles/compute_init/tasks/install.yml | 2 ++ 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 0a21303a6..4d8c26072 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -277,38 +277,11 @@ name: basic_users when: enable_basic_users - - name: EESSI - when: enable_eessi # NB: don't need conditional block on enable_compute as have already exited # if not the case - block: - - name: Copy cvmfs config - ansible.builtin.copy: - src: /var/tmp/cluster/cvmfs/default.local - dest: /etc/cvmfs/default.local - owner: root - group: root - mode: "0644" - - - name: Ensure CVMFS config is setup # noqa: no-changed-when - ansible.builtin.command: - cmd: "cvmfs_config setup" - - # configure gpus - - name: Check for NVIDIA driver - ansible.builtin.stat: - path: /dev/nvidia0 - register: nvidia_driver - - - name: Set fact if NVIDIA driver is present - ansible.builtin.set_fact: - has_nvidia_driver: "{{ nvidia_driver.stat.exists | default(false) }}" - - - name: Expose GPU drivers - ansible.builtin.shell: | - source /cvmfs/software.eessi.io/versions/2023.06/init/bash - /cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh - when: has_nvidia_driver + - name: Configure EESSI + ansible.builtin.include_tasks: tasks/eessi.yml + when: enable_eessi - name: Configure VGPUs ansible.builtin.include_role: diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index f7ee87645..b239877b1 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -54,6 +54,8 @@ dest: roles/ - src: ../../nhc dest: roles/ + - src: ../../eessi/tasks/configure.yml + dest: tasks/eessi.yml - name: Add filter_plugins to ansible.cfg ansible.builtin.lineinfile: From 31c3e0085123c37fe3ae7bdcff82524af4042fbc Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Wed, 12 Nov 2025 17:25:29 +0000 Subject: [PATCH 5/9] Bump CI imageX --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 0be7322ec..eafd506ea 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-251027-1123-d389c00b", - "RL9": "openhpc-RL9-251027-1123-d389c00b" + "RL8": "openhpc-RL8-251112-1307-e34d64c4", + "RL9": "openhpc-RL9-251112-1307-e34d64c4" } } From c2f0ad4948d7f501f04727d56d24710980363ba1 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Fri, 14 Nov 2025 11:46:59 +0000 Subject: [PATCH 6/9] removed v from trivyscan.yml --- .github/workflows/trivyscan.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index 1898d8558..df449451b 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -102,7 +102,7 @@ jobs: run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}' - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@v0.33.1 + uses: aquasecurity/trivy-action@0.33.1 with: scan-type: fs scan-ref: "${{ steps.manifest.outputs.image-name }}" @@ -122,7 +122,7 @@ jobs: category: "${{ matrix.build }}" - name: Fail if scan has CRITICAL vulnerabilities - uses: aquasecurity/trivy-action@v0.33.1 + uses: aquasecurity/trivy-action@0.33.1 with: scan-type: fs scan-ref: "${{ steps.manifest.outputs.image-name }}" From fda641e4e573f6fb672061c5332984c283c9201b Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Wed, 19 Nov 2025 11:52:15 +0000 Subject: [PATCH 7/9] fix eessi compute-init tasks to include role defaults --- ansible/roles/compute_init/files/compute-init.yml | 4 +++- ansible/roles/compute_init/tasks/install.yml | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 4d8c26072..91eaa86c1 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -280,7 +280,9 @@ # NB: don't need conditional block on enable_compute as have already exited # if not the case - name: Configure EESSI - ansible.builtin.include_tasks: tasks/eessi.yml + ansible.builtin.include_role: + name: eessi + tasks_from: configure.yml when: enable_eessi - name: Configure VGPUs diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index b239877b1..4d353dea2 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -54,8 +54,8 @@ dest: roles/ - src: ../../nhc dest: roles/ - - src: ../../eessi/tasks/configure.yml - dest: tasks/eessi.yml + - src: ../../eessi + dest: roles/ - name: Add filter_plugins to ansible.cfg ansible.builtin.lineinfile: From 332ac921b259a7edb9449fccef0de1a2fe9460d7 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Wed, 19 Nov 2025 11:57:46 +0000 Subject: [PATCH 8/9] removed eessi tasks from compute-init/tasks/export.yml --- ansible/roles/compute_init/tasks/export.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index 5b31bd685..caf0cdc67 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -62,17 +62,6 @@ run_once: true delegate_to: "{{ groups['control'] | first }}" -- name: Copy EESSI CVMFS config to /exports/cluster - ansible.builtin.copy: - src: /etc/cvmfs/default.local - dest: /exports/cluster/cvmfs/default.local - owner: slurm - group: root - mode: "0644" - remote_src: true - run_once: true - delegate_to: "{{ groups['control'] | first }}" - - name: Export cacerts ansible.builtin.include_role: name: cacerts From 286ec46a7a9cada3d2aa367a68195c565bb012ca Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Wed, 19 Nov 2025 12:56:38 +0000 Subject: [PATCH 9/9] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index eafd506ea..c3c57f108 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-251112-1307-e34d64c4", - "RL9": "openhpc-RL9-251112-1307-e34d64c4" + "RL8": "openhpc-RL8-251119-1202-332ac921", + "RL9": "openhpc-RL9-251119-1202-332ac921" } }