Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,76 @@ func Test_ACL_DisableSSH(t *testing.T) {
})
}

// Test_ACL_GPUNC runs the ACL GPU bootstrap scenario on the Standard_NC6s_v3
// (NCsv3 / V100) VM size.
func Test_ACL_GPUNC(t *testing.T) {
	runScenarioACLGPU(t, "Standard_NC6s_v3")
}

// Test_ACL_GPUA100 runs the ACL GPU bootstrap scenario on the
// Standard_NC24ads_A100_v4 (A100) VM size.
func Test_ACL_GPUA100(t *testing.T) {
	runScenarioACLGPU(t, "Standard_NC24ads_A100_v4")
}

// Test_ACL_GPUA10 runs the ACL GRID scenario on the Standard_NV6ads_A10_v5
// (A10) VM size, which additionally validates GRID licensing.
func Test_ACL_GPUA10(t *testing.T) {
	runScenarioACLGRID(t, "Standard_NV6ads_A10_v5")
}

// runScenarioACLGPU runs the 'gpu' E2E scenario for an ACL VHD on the given
// VM size: it bootstraps a GPU-enabled node with the NVIDIA driver enabled and
// validates the driver stack (nvidia-modprobe installed, nvidia-persistenced
// running).
func runScenarioACLGPU(t *testing.T, vmSize string) {
	RunScenario(t, &Scenario{
		Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an ACL VHD can be properly bootstrapped", vmSize),
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD: config.VHDACLGen2TL,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.AgentPoolProfile.VMSize = vmSize
				nbc.ConfigGPUDriverIfNeeded = true
				nbc.EnableGPUDevicePluginIfNeeded = false
				nbc.EnableNvidia = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr(vmSize)
				// The ACL VHD used here is a Trusted Launch image, so the VMSS
				// must be configured for trusted launch as well.
				vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
			},
			Validator: func(ctx context.Context, s *Scenario) {
				// Confirm the NVIDIA driver artifacts landed on the node and the
				// persistence daemon is up.
				ValidateNvidiaModProbeInstalled(ctx, s)
				ValidateNvidiaPersistencedRunning(ctx, s)
			},
		},
	})
}

func runScenarioACLGRID(t *testing.T, vmSize string) {
RunScenario(t, &Scenario{
Description: fmt.Sprintf("Tests that a GPU-enabled node with VM size %s using an ACL VHD can be properly bootstrapped, and that the GRID license is valid", vmSize),
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDACLGen2TL,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = vmSize
nbc.ConfigGPUDriverIfNeeded = true
nbc.EnableGPUDevicePluginIfNeeded = false
nbc.EnableNvidia = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr(vmSize)
vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
},
Validator: func(ctx context.Context, s *Scenario) {
// Ensure nvidia-modprobe install does not restart kubelet and temporarily cause node to be unschedulable
ValidateNvidiaModProbeInstalled(ctx, s)
ValidateNvidiaGRIDLicenseValid(ctx, s)
ValidateNvidiaPersistencedRunning(ctx, s)
},
},
})
}

func Test_AzureLinuxV3_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 Gen2 VHD can be properly bootstrapped even if secure TLS bootstrapping fails",
Expand Down Expand Up @@ -1838,6 +1908,35 @@ func Test_AzureLinuxV3_GPU(t *testing.T) {
})
}

// Test_AzureLinuxV3_GPUA10 bootstraps a GPU-enabled node with the A10
// (NVads_A10_v5) SKU on an AzureLinuxV3 (CgroupV2) VHD and validates the
// NVIDIA driver stack, GRID licensing, and that kubelet stays healthy.
func Test_AzureLinuxV3_GPUA10(t *testing.T) {
	// The A10 size is referenced in both the bootstrap config and the VMSS SKU.
	const vmSize = "Standard_NV6ads_A10_v5"
	RunScenario(t, &Scenario{
		Description: "Tests that a GPU-enabled node with A10 GPU SKU using a AzureLinuxV3 (CgroupV2) VHD can be properly bootstrapped",
		Tags: Tags{
			GPU: true,
		},
		Config: Config{
			Cluster: ClusterKubenet,
			VHD: config.VHDAzureLinuxV3Gen2,
			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
				nbc.AgentPoolProfile.VMSize = vmSize
				nbc.ConfigGPUDriverIfNeeded = true
				nbc.EnableGPUDevicePluginIfNeeded = false
				nbc.EnableNvidia = true
			},
			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
				vmss.SKU.Name = to.Ptr(vmSize)
			},
			Validator: func(ctx context.Context, s *Scenario) {
				ValidateNvidiaModProbeInstalled(ctx, s)
				ValidateNvidiaGRIDLicenseValid(ctx, s)
				ValidateKubeletHasNotStopped(ctx, s)
				ValidateServicesDoNotRestartKubelet(ctx, s)
				ValidateNvidiaPersistencedRunning(ctx, s)
			},
		},
	})
}

func Test_AzureLinuxV3_GPUAzureCNI(t *testing.T) {
RunScenario(t, &Scenario{
Description: "AzureLinux V3 (CgroupV2) gpu scenario on cluster configured with Azure CNI",
Expand Down
99 changes: 97 additions & 2 deletions parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,25 @@ downloadSysextFromVersion() {

matchLocalSysext() {
local seName=$1 desiredVer=$2 seArch=$3
printf "%s\n" "/opt/${seName}/downloads/${seName}-v${desiredVer}"[.~-]*"-${seArch}.raw" | sort -V | tail -n1
local downloadDir="/opt/${seName}/downloads"
# Try arch-specific versioned filename first (kubelet-style: name-vVER.X-arch.raw)
local match
match=$(find "${downloadDir}" -maxdepth 2 -name "${seName}-v${desiredVer}*-${seArch}.raw" -type f 2>/dev/null | sort -V | tail -n1)
if [ -f "${match}" ]; then
echo "${match}"
return
fi
# Fallback: GPU sysexts are downloaded as simple name.raw (e.g. nvidia-driver-vgpu.raw).
# MCR artifacts may place files in an arch subdirectory (e.g. amd64/name.raw),
# so search up to 2 levels deep.
match=$(find "${downloadDir}" -maxdepth 2 -name "${seName}.raw" -type f 2>/dev/null | head -n1)
Comment on lines +33 to +35
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

matchLocalSysext fallback (find ... -name "${seName}.raw" | head -n1) can return an arbitrary file and does not filter by the requested systemd arch (seArch is x86-64 on amd64). If both arch subdirs exist (e.g. amd64/ + arm64/), this can select the wrong sysext and break provisioning. Consider preferring an arch-specific path/pattern (e.g. ${downloadDir}/${seArch}/${seName}.raw or filtering find results), and make the selection deterministic (e.g. sort -V | tail -n1).

Suggested change
# MCR artifacts may place files in an arch subdirectory (e.g. amd64/name.raw),
# so search up to 2 levels deep.
match=$(find "${downloadDir}" -maxdepth 2 -name "${seName}.raw" -type f 2>/dev/null | head -n1)
# Prefer an arch-specific subdirectory (${downloadDir}/${seArch}) when present,
# then fall back to an arch-neutral file directly under ${downloadDir}. In both
# cases, pick the highest version deterministically.
match=$(find "${downloadDir}/${seArch}" -maxdepth 1 -name "${seName}.raw" -type f 2>/dev/null | sort -V | tail -n1)
if [ -f "${match}" ]; then
echo "${match}"
return
fi
match=$(find "${downloadDir}" -maxdepth 1 -name "${seName}.raw" -type f 2>/dev/null | sort -V | tail -n1)

Copilot uses AI. Check for mistakes.
echo "${match}"
}

matchRemoteSysext() {
local seURL=$1 desiredVer=$2 seArch=$3
retrycmd_silent 120 5 20 oras repo tags --registry-config "${ORAS_REGISTRY_CONFIG_FILE}" "${seURL}" | grep -Ex "v${desiredVer//./\\.}[.~-].*-azlinux3-${seArch}" | sort -V | tail -n1
# Match either arch-specific tags (v{ver}[.~-]*-azlinux3-{arch}) or exact version tags ({ver})
retrycmd_silent 120 5 20 oras repo tags --registry-config "${ORAS_REGISTRY_CONFIG_FILE}" "${seURL}" | grep -Ex "(v${desiredVer//./\\.}[.~-].*-azlinux3-${seArch}|${desiredVer//./\\.})" | sort -V | tail -n1
test ${PIPESTATUS[0]} -eq 0
Comment on lines +41 to 43
Copy link

Copilot AI Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The updated regex allows both arch-specific tags and “exact version” tags, but the single-pass sort -V | tail -n1 selection can become ambiguous if both forms exist (it may choose the wrong one depending on tag set/sort behavior). To make selection deterministic, consider doing a two-pass lookup: first attempt the arch-specific pattern; only if that yields no match, fallback to the exact-version tag.

Suggested change
# Match either arch-specific tags (v{ver}[.~-]*-azlinux3-{arch}) or exact version tags ({ver})
retrycmd_silent 120 5 20 oras repo tags --registry-config "${ORAS_REGISTRY_CONFIG_FILE}" "${seURL}" | grep -Ex "(v${desiredVer//./\\.}[.~-].*-azlinux3-${seArch}|${desiredVer//./\\.})" | sort -V | tail -n1
test ${PIPESTATUS[0]} -eq 0
local tags archPattern exactPattern match
# Fetch all tags once; retrycmd_silent handles retries and logging.
tags=$(retrycmd_silent 120 5 20 oras repo tags --registry-config "${ORAS_REGISTRY_CONFIG_FILE}" "${seURL}")
if [ $? -ne 0 ]; then
# Propagate failure from oras/registry access.
return 1
fi
# First pass: prefer arch-specific tags (v{ver}[.~-]*-azlinux3-{arch}).
archPattern="^v${desiredVer//./\\.}[.~-].*-azlinux3-${seArch}$"
match=$(printf '%s\n' "${tags}" | grep -E "${archPattern}" | sort -V | tail -n1)
if [ -n "${match}" ]; then
echo "${match}"
return 0
fi
# Second pass: fall back to exact-version tags ({ver}) if no arch-specific tag exists.
exactPattern="^${desiredVer//./\\.}$"
match=$(printf '%s\n' "${tags}" | grep -E "${exactPattern}" | sort -V | tail -n1)
echo "${match}"

Copilot uses AI. Check for mistakes.
}

Expand Down Expand Up @@ -100,6 +113,88 @@ installCredentialProviderPackageFromBootstrapProfileRegistry() {
installCredentialProviderFromPkg "$2" "$1"
}

# Resolves the sysext version tag from the OS image by reading VERSION_ID out
# of /etc/os-release. GPU sysexts are tagged by the OS image version, not the
# driver version. Prints the version on stdout; on failure, prints an error to
# stderr and returns ERR_SYSEXT_VERSION_ID_NOT_FOUND.
getACLVersionID() {
    local os_ver
    # Source os-release in a subshell so its variables don't leak into our scope.
    os_ver=$(. /etc/os-release && echo "${VERSION_ID}")
    if [ -n "${os_ver}" ]; then
        echo "${os_ver}"
        return 0
    fi
    echo "ERROR: VERSION_ID not found in /etc/os-release" >&2
    return "${ERR_SYSEXT_VERSION_ID_NOT_FOUND}"
}

# Pulls a GPU-related sysext by name from the ACL MCR registry and merges it.
# The registry path is keyed by major.minor (e.g. 3.0) while the tag is the
# full VERSION_ID (e.g. 3.0.20260304), for example:
#   mcr.microsoft.com/azurelinux/3.0/azure-container-linux/nvidia-driver-cuda:3.0.20260304
installACLGPUSysext() {
    local sysext_name="$1"
    local version_id
    if ! version_id=$(getACLVersionID); then
        exit $ERR_SYSEXT_VERSION_ID_NOT_FOUND
    fi
    # MCR_REPOSITORY_BASE may be overridden (e.g. for air-gapped clouds);
    # strip any trailing slash before composing the registry path.
    local mcr_base="${MCR_REPOSITORY_BASE:-mcr.microsoft.com}"
    local registry_base="${mcr_base%/}/azurelinux/${version_id%.*}/azure-container-linux"
    if ! mergeSysexts "${sysext_name}" "${registry_base}/${sysext_name}" "${version_id}"; then
        exit $ERR_ORAS_PULL_SYSEXT_FAIL
    fi
}

installGPUDriverSysext() {
# ACL NVIDIA GPU driver sysext registry paths:
# Registry path uses major.minor (e.g. 3.0), tag uses full VERSION_ID (e.g. 3.0.20260304).
#
# 1. NVIDIA proprietary driver:
# mcr.microsoft.com/azurelinux/3.0/azure-container-linux/nvidia-driver-cuda:${VERSION_ID}...
#
# 2. NVIDIA OpenRM driver:
# mcr.microsoft.com/azurelinux/3.0/azure-container-linux/nvidia-driver-cuda-open:${VERSION_ID}...
#
# 3. NVIDIA GRID (vGPU guest) driver for converged GPU sizes:
# mcr.microsoft.com/azurelinux/3.0/azure-container-linux/nvidia-driver-vgpu:${VERSION_ID}...
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on ConvergedGPUDriverSizes map
# in gpu_components.go. Converged sizes get "grid"; all others get "cuda".
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
local vm_sku
vm_sku=$(get_compute_sku)
local sysext_name

# Converged GPU sizes (NVads_A10_v5, NCads_A10_v4) use GRID drivers
if [ "$NVIDIA_GPU_DRIVER_TYPE" = "grid" ]; then
echo "VM SKU ${vm_sku} uses NVIDIA GRID driver (converged)"
sysext_name="nvidia-driver-vgpu"
else
local driver_ret
should_use_nvidia_open_drivers
driver_ret=$?
if [ "$driver_ret" -eq 2 ]; then
echo "Failed to determine GPU driver type"
exit $ERR_MISSING_CUDA_PACKAGE
Comment on lines +170 to +171
Copy link

Copilot AI Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should_use_nvidia_open_drivers returns 2 specifically for “unable to determine VM SKU”, but this path exits with ERR_MISSING_CUDA_PACKAGE, which is misleading and can cause incorrect failure categorization/telemetry. Prefer propagating the function’s error (or add a dedicated error code like ERR_GPU_DRIVER_SELECTION_FAIL) and emit an error message that matches the underlying cause (e.g., IMDS SKU lookup failure).

Suggested change
echo "Failed to determine GPU driver type"
exit $ERR_MISSING_CUDA_PACKAGE
echo "Failed to determine GPU driver type for this VM: unable to determine VM SKU (should_use_nvidia_open_drivers returned ${driver_ret})"
exit "${driver_ret}"

Copilot uses AI. Check for mistakes.
elif [ "$driver_ret" -eq 0 ]; then
echo "VM SKU ${vm_sku} uses NVIDIA OpenRM driver (cuda-open)"
sysext_name="nvidia-driver-cuda-open"
else
echo "VM SKU ${vm_sku} uses NVIDIA proprietary driver (cuda)"
sysext_name="nvidia-driver-cuda"
fi
fi

installACLGPUSysext "${sysext_name}"

# Process tmpfiles.d rules shipped inside the GPU sysexts (e.g. symlink
# /etc/nvidia/gridd.conf -> /usr/share/nvidia/gridd.conf). The sysext
# overlay only covers /usr; files under /etc must be created on the
# writable root via tmpfiles.d rules.
systemd-tmpfiles --create
}

# Installs the NVIDIA container toolkit as a systemd sysext from the ACL MCR registry.
installNvidiaContainerToolkitSysext() {
    installACLGPUSysext nvidia-container-toolkit
}

# Installs the NVIDIA fabric manager as a systemd sysext from the ACL MCR registry.
installNvidiaFabricManagerSysext() {
    installACLGPUSysext nvidia-fabric-manager
}

# No-op on ACL: runc ships in the base image, so there is nothing to install.
# NOTE(review): `stub` appears to be a shared no-op helper — confirm in cse_helpers.sh.
ensureRunc() {
    stub
}
Expand Down
16 changes: 10 additions & 6 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,10 @@ configGPUDrivers() {
downloadGPUDrivers
installNvidiaContainerToolkit
enableNvidiaPersistenceMode
elif isACL "$OS" "$OS_VARIANT"; then
installNvidiaContainerToolkitSysext
installGPUDriverSysext
enableNvidiaPersistenceMode
else
echo "os $OS $OS_VARIANT not supported at this time. skipping configGPUDrivers"
exit 1
Expand All @@ -952,16 +956,16 @@ configGPUDrivers() {
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL

# Fix the NVIDIA /dev/char link issue (Mariner/AzureLinux only)
if isMarinerOrAzureLinux "$OS"; then
# GRID vGPU licensing: restart nvidia-gridd after device nodes exist
if [ "$NVIDIA_GPU_DRIVER_TYPE" = "grid" ]; then
systemctlEnableAndStart nvidia-gridd 30
fi

# Fix the NVIDIA /dev/char link issue
createNvidiaSymlinkToAllDeviceNodes
fi

# GRID vGPU licensing: start nvidia-gridd service to ensure license configuration
if (isMarinerOrAzureLinux "$OS" || isACL "$OS" "$OS_VARIANT") && [ "$NVIDIA_GPU_DRIVER_TYPE" = "grid" ]; then
systemctlEnableAndStart nvidia-gridd 300 || exit $ERR_SYSTEMCTL_START_FAIL
fi

retrycmd_if_failure 120 5 25 pkill -SIGHUP containerd || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT

# NPD is installed as a VM extension, which might happen before/after/during CSE, so this
Expand Down
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ ERR_LOOKUP_ENABLE_MANAGED_GPU_EXPERIENCE_TAG=230 # Error checking nodepool tags

ERR_PULL_POD_INFRA_CONTAINER_IMAGE=225 # Error pulling pause image
ERR_ORAS_PULL_SYSEXT_FAIL=231 # Error pulling systemd system extension artifact via oras from registry
ERR_SYSEXT_VERSION_ID_NOT_FOUND=232 # VERSION_ID not found in /etc/os-release, required for sysext tag resolution

# ----------------------- AKS Node Controller----------------------------------
ERR_AKS_NODE_CONTROLLER_ERROR=240 # Generic error in AKS Node Controller
Expand Down
61 changes: 61 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -851,4 +851,65 @@ datasource:
EOF
}

# ==== GPU driver functions ====
# Shared between Azure Linux (Mariner) and ACL distro install scripts.
# These functions are only invoked on GPU-enabled VM SKUs during provisioning;
# they are safe to define on all distros (no execution at source time).

should_use_nvidia_open_drivers() {
    # Decides which NVIDIA driver flavor the current VM SKU needs.
    # Legacy GPUs (T4, V100) require the proprietary driver; A100 and newer
    # use the NVIDIA open-kernel drivers.
    # Returns: 0 -> open drivers, 1 -> proprietary drivers, 2 -> SKU lookup failed.
    local sku
    sku=$(get_compute_sku)
    if [ -z "$sku" ]; then
        echo "Error: Unable to determine VM SKU, cannot select GPU driver" >&2
        return 2
    fi

    # Proprietary-driver families:
    #   T4:   NC*_T4_v3
    #   V100: NDv2 (nd40rs_v2), NDv3 (nd40s_v3), NCsv3 (standard_nc*s_v3)
    case "${sku,,}" in
        *t4_v3*|*nd40rs_v2*|*nd40s_v3*|standard_nc*s_v3*)
            return 1
            ;;
        *)
            # Everything else (A100+) gets the open drivers.
            return 0
            ;;
    esac
}

enableNvidiaPersistenceMode() {
PERSISTENCED_SERVICE_FILE_PATH="/etc/systemd/system/nvidia-persistenced.service"
touch ${PERSISTENCED_SERVICE_FILE_PATH}
cat << EOF > ${PERSISTENCED_SERVICE_FILE_PATH}
Comment on lines +893 to +895
Copy link

Copilot AI Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PERSISTENCED_SERVICE_FILE_PATH is written as a global variable and is used unquoted in touch/redirection. Making it local and quoting expansions improves safety and avoids accidental global state leakage when cse_install.sh is sourced by multiple distro scripts.

Suggested change
PERSISTENCED_SERVICE_FILE_PATH="/etc/systemd/system/nvidia-persistenced.service"
touch ${PERSISTENCED_SERVICE_FILE_PATH}
cat << EOF > ${PERSISTENCED_SERVICE_FILE_PATH}
local PERSISTENCED_SERVICE_FILE_PATH="/etc/systemd/system/nvidia-persistenced.service"
touch "${PERSISTENCED_SERVICE_FILE_PATH}"
cat << EOF > "${PERSISTENCED_SERVICE_FILE_PATH}"

Copilot uses AI. Check for mistakes.
[Unit]
Description=NVIDIA Persistence Daemon
Wants=syslog.target

[Service]
Type=forking
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
Restart=always
TimeoutSec=300

[Install]
WantedBy=multi-user.target
EOF

systemctl enable nvidia-persistenced.service || exit 1
systemctl restart nvidia-persistenced.service || exit 1
Comment on lines +911 to +912
Copy link

Copilot AI Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enableNvidiaPersistenceMode calls systemctl enable/restart and exit 1 on failure. Since this function is now shared and used for ACL, exiting with a generic code loses the repo’s standardized error codes and skips the retry/timeout wrappers (systemctlEnableAndStart, systemctl_*). Consider using the helper wrappers and returning a specific error code (e.g. ERR_SYSTEMCTL_START_FAIL) so failures are actionable in CSE telemetry.

Suggested change
systemctl enable nvidia-persistenced.service || exit 1
systemctl restart nvidia-persistenced.service || exit 1
if ! systemctlEnableAndStart nvidia-persistenced.service; then
return $ERR_SYSTEMCTL_START_FAIL
fi

Copilot uses AI. Check for mistakes.
}

#EOF
2 changes: 2 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ function nodePrep {
# while it fails to install on NC24.
if isMarinerOrAzureLinux "$OS"; then
logs_to_events "AKS.CSE.installNvidiaFabricManager" installNvidiaFabricManager
elif isACL "$OS" "$OS_VARIANT"; then
logs_to_events "AKS.CSE.installNvidiaFabricManagerSysext" installNvidiaFabricManagerSysext
fi
# Start fabric manager service
logs_to_events "AKS.CSE.nvidia-fabricmanager" "systemctlEnableAndStart nvidia-fabricmanager 30" || exit $ERR_GPU_DRIVERS_START_FAIL
Expand Down
Loading
Loading