diff --git a/aks-node-controller/README.md b/aks-node-controller/README.md
index 49d0fa4ef9f..54c03523a6c 100644
--- a/aks-node-controller/README.md
+++ b/aks-node-controller/README.md
@@ -122,8 +122,8 @@ sequenceDiagram
ARM->>VM: Deploy config.json
(CustomData)
note over VM: cloud-init handles
config.json deployment
- note over VM: cloud-init completes processing
- note over VM: Start aks-node-controller.service (systemd service)
after cloud-init
+ note over VM: cloud-boothook writes config.json early
+ note over VM: cloud-boothook starts aks-node-controller.service
once config is on disk
VM->>VM: Run aks-node-controller
(Go binary) in provision mode
using config.json
ARM->>VM: Initiate aks-node-controller (Go binary)
in provision-wait mode via CSE
@@ -137,7 +137,7 @@ sequenceDiagram
Key components:
-1. `aks-node-controller.service`: systemd unit that is triggered once cloud-init is complete (guaranteeing that config is present on disk) and then kickstarts bootstrapping.
+1. `aks-node-controller.service`: systemd unit that the cloud-boothook starts directly as soon as the config file has been written to disk. The unit also remains enabled on the VHD, so systemd starts it at boot as a fallback.
2. `aks-node-controller` go binary with two modes:
- **provision**: Parses the node configuration and starts the bootstrap sequence.
diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go
index 2f2745c9992..79817e93462 100644
--- a/aks-node-controller/parser/parser.go
+++ b/aks-node-controller/parser/parser.go
@@ -181,6 +181,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
"SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_TENANT_ID": config.GetServiceAccountImagePullProfile().GetDefaultTenantId(),
"IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI": config.GetServiceAccountImagePullProfile().GetLocalAuthoritySni(),
"CSE_TIMEOUT": getCSETimeout(config),
+ "SKIP_WAAGENT_HOLD": "true",
}
for i, cert := range config.CustomCaCerts {
diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl
index 1f2b311a794..b1359b071d9 100644
--- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl
+++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl
@@ -1,21 +1,4 @@
echo $(date),$(hostname) > ${PROVISION_OUTPUT};
-{{if not .GetDisableCustomData}}
-CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh";
-cloudInitExitCode=0;
-if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then
- /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1;
-else
- cloud-init status --wait > /dev/null 2>&1;
-fi;
-cloudInitExitCode=$?;
-if [ "$cloudInitExitCode" -eq 0 ]; then
- echo "cloud-init succeeded" >> ${PROVISION_OUTPUT};
-else
- echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT};
- cat ${PROVISION_OUTPUT}
- exit ${cloudInitExitCode};
-fi;
-{{end}}
{{if getIsAksCustomCloud .CustomCloudConfig}}
REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}"
{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1;
diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go
index 0eaf5f6d341..678a83a8c56 100644
--- a/aks-node-controller/pkg/nodeconfigutils/utils.go
+++ b/aks-node-controller/pkg/nodeconfigutils/utils.go
@@ -1,34 +1,121 @@
package nodeconfigutils
import (
+ "bytes"
"encoding/base64"
"fmt"
+ "mime/multipart"
+ "net/textproto"
aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1"
"google.golang.org/protobuf/encoding/protojson"
)
const (
- cloudConfigTemplate = `#cloud-config
-write_files:
-- path: /opt/azure/containers/aks-node-controller-config.json
- permissions: "0755"
- owner: root
- content: !!binary |
- %s`
CSE = "/opt/azure/containers/aks-node-controller provision-wait"
+
+ boothookTemplate = `#cloud-boothook
+#!/bin/bash
+set -euo pipefail
+
+logger -t aks-boothook "boothook start $(date -Ins)"
+
+mkdir -p /opt/azure/containers
+
+cat <<'EOF' | base64 -d >/opt/azure/containers/aks-node-controller-config.json
+%s
+EOF
+chmod 0644 /opt/azure/containers/aks-node-controller-config.json
+
+logger -t aks-boothook "launching aks-node-controller service $(date -Ins)"
+systemctl start --no-block aks-node-controller.service
+`
+
+ cloudConfigTemplate = `#cloud-config
+runcmd:
+- echo "AKS Node Controller cloud-init completed at $(date)"
+`
+
+ flatcarTemplate = `{
+ "ignition": { "version": "3.4.0" },
+ "storage": {
+ "files": [{
+ "path": "/opt/azure/containers/aks-node-controller-config.json",
+ "mode": 420,
+ "contents": { "source": "data:;base64,%s" }
+ }]
+ }
+ }`
)
+// CustomData builds the base64-encoded MIME multipart document used as VM custom data for cloud-init.
+// It marshals the node configuration to JSON, base64-encodes it, and embeds it in a cloud-boothook script
+// that decodes the config to disk and starts the aks-node-controller service; a cloud-config part follows.
+// Cloud-init handles each MIME part by its Content-Type; note that boothook parts run early on every boot.
func CustomData(cfg *aksnodeconfigv1.Configuration) (string, error) {
aksNodeConfigJSON, err := MarshalConfigurationV1(cfg)
if err != nil {
return "", fmt.Errorf("failed to marshal nbc, error: %w", err)
}
+
encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON)
- customDataYAML := fmt.Sprintf(cloudConfigTemplate, encodedAksNodeConfigJSON)
+ boothook := fmt.Sprintf(boothookTemplate, encodedAksNodeConfigJSON)
+
+ var customData bytes.Buffer
+ writer := multipart.NewWriter(&customData)
+
+ fmt.Fprintf(&customData, "MIME-Version: 1.0\r\n")
+ fmt.Fprintf(&customData, "Content-Type: multipart/mixed; boundary=%q\r\n\r\n", writer.Boundary())
+
+ if err := writeMIMEPart(writer, "text/cloud-boothook", boothook); err != nil {
+ return "", fmt.Errorf("failed to write boothook part: %w", err)
+ }
+ if err := writeMIMEPart(writer, "text/cloud-config", cloudConfigTemplate); err != nil {
+ return "", fmt.Errorf("failed to write cloud-config part: %w", err)
+ }
+ if err := writer.Close(); err != nil {
+ return "", fmt.Errorf("failed to finalize multipart custom data: %w", err)
+ }
+
+ return base64.StdEncoding.EncodeToString(customData.Bytes()), nil
+}
+
+// CustomDataFlatcar builds base64-encoded custom data for Flatcar Container Linux nodes.
+// Unlike Ubuntu/Azure Linux, which use cloud-init and expect MIME multipart custom data,
+// Flatcar uses Ignition (configs are often authored in Butane; here the JSON is emitted directly).
+// Ignition consumes a JSON document that declaratively specifies files to write to disk, so we
+// embed the node config as a base64 data URI in an Ignition storage entry instead of wrapping it
+// in a MIME multipart boothook script. This config only writes the file; it starts no service.
+func CustomDataFlatcar(cfg *aksnodeconfigv1.Configuration) (string, error) {
+ aksNodeConfigJSON, err := MarshalConfigurationV1(cfg)
+ if err != nil {
+ return "", fmt.Errorf("failed to marshal nbc, error: %w", err)
+ }
+
+ encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON)
+ customDataYAML := fmt.Sprintf(flatcarTemplate, encodedAksNodeConfigJSON)
return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil
}
+// writeMIMEPart appends a single part to a MIME multipart message. Cloud-init expects custom data
+// as a MIME multipart document in which each part carries a Content-Type that tells cloud-init how to
+// process it (e.g. "text/cloud-boothook" for early-boot scripts, "text/cloud-config" for declarative
+// cloud-config YAML). The part gets that Content-Type plus MIME-Version and 7bit transfer-encoding headers.
+func writeMIMEPart(writer *multipart.Writer, contentType, content string) error {
+ header := textproto.MIMEHeader{}
+ header.Set("Content-Type", contentType)
+ header.Set("MIME-Version", "1.0")
+ header.Set("Content-Transfer-Encoding", "7bit")
+
+ part, err := writer.CreatePart(header)
+ if err != nil {
+ return err
+ }
+
+ _, err = part.Write([]byte(content))
+ return err
+}
+
func MarshalConfigurationV1(cfg *aksnodeconfigv1.Configuration) ([]byte, error) {
options := protojson.MarshalOptions{
UseEnumNumbers: false,
diff --git a/aks-node-controller/pkg/nodeconfigutils/utils_test.go b/aks-node-controller/pkg/nodeconfigutils/utils_test.go
index d7407069d64..598ccc3c439 100644
--- a/aks-node-controller/pkg/nodeconfigutils/utils_test.go
+++ b/aks-node-controller/pkg/nodeconfigutils/utils_test.go
@@ -1,7 +1,13 @@
package nodeconfigutils
import (
+ "encoding/base64"
+ "io"
+ "mime"
+ "mime/multipart"
+ "net/textproto"
"os"
+ "strings"
"testing"
aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1"
@@ -204,6 +210,67 @@ func TestMarsalConfiguratioV1(t *testing.T) {
require.JSONEq(t, `{"version":"v1","auth_config":{"subscription_id":"test-subscription"}, "workload_runtime":"WORKLOAD_RUNTIME_OCI_CONTAINER"}`, string(data))
}
+func TestCustomDataUsesMultipartBoothookAndCloudConfig(t *testing.T) {
+ cfg := &aksnodeconfigv1.Configuration{
+ Version: "v1",
+ AuthConfig: &aksnodeconfigv1.AuthConfig{
+ SubscriptionId: "test-subscription",
+ },
+ ClusterConfig: &aksnodeconfigv1.ClusterConfig{
+ ResourceGroup: "test-rg",
+ Location: "eastus",
+ },
+ ApiServerConfig: &aksnodeconfigv1.ApiServerConfig{
+ ApiServerName: "test-api-server",
+ },
+ }
+
+ customData, err := CustomData(cfg)
+ require.NoError(t, err)
+
+ decoded, err := base64.StdEncoding.DecodeString(customData)
+ require.NoError(t, err)
+
+ sections := strings.SplitN(string(decoded), "\r\n\r\n", 2)
+ require.Len(t, sections, 2)
+
+ message := textproto.MIMEHeader{}
+ for _, line := range strings.Split(sections[0], "\r\n") {
+ if line == "" {
+ continue
+ }
+ parts := strings.SplitN(line, ": ", 2)
+ require.Len(t, parts, 2)
+ message.Add(parts[0], parts[1])
+ }
+ mediaType, params, err := mime.ParseMediaType(message.Get("Content-Type"))
+ require.NoError(t, err)
+ require.Equal(t, "multipart/mixed", mediaType)
+
+ reader := multipart.NewReader(strings.NewReader(sections[1]), params["boundary"])
+
+ part, err := reader.NextPart()
+ require.NoError(t, err)
+ require.Equal(t, "text/cloud-boothook", part.Header.Get("Content-Type"))
+ boothook, err := io.ReadAll(part)
+ require.NoError(t, err)
+ require.True(t, strings.HasPrefix(string(boothook), "#cloud-boothook\n"))
+ require.Contains(t, string(boothook), "/opt/azure/containers/aks-node-controller-config.json")
+ require.Contains(t, string(boothook), "launching aks-node-controller service")
+ require.Contains(t, string(boothook), "systemctl start --no-block aks-node-controller.service")
+
+ part, err = reader.NextPart()
+ require.NoError(t, err)
+ require.Equal(t, "text/cloud-config", part.Header.Get("Content-Type"))
+ cloudConfig, err := io.ReadAll(part)
+ require.NoError(t, err)
+ require.True(t, strings.HasPrefix(string(cloudConfig), "#cloud-config\n"))
+ require.Contains(t, string(cloudConfig), "runcmd:")
+
+ _, err = reader.NextPart()
+ require.ErrorIs(t, err, io.EOF)
+}
+
func TestMarshalUnmarshalWithPopulatedConfig(t *testing.T) {
t.Run("fully populated config marshals to >100 bytes", func(t *testing.T) {
cfg := &aksnodeconfigv1.Configuration{}
diff --git a/e2e/node_config.go b/e2e/node_config.go
index 2d8fe33cbf5..716e1313588 100644
--- a/e2e/node_config.go
+++ b/e2e/node_config.go
@@ -162,7 +162,7 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod
return &aksnodeconfigv1.Configuration{
Version: "v1",
BootstrappingConfig: bootstrappingConfig,
- DisableCustomData: nbc.AgentPoolProfile.IsFlatcar() || nbc.AgentPoolProfile.IsACL(),
+ DisableCustomData: true,
LinuxAdminUsername: "azureuser",
VmSize: config.Config.DefaultVMSKU,
ClusterConfig: &aksnodeconfigv1.ClusterConfig{
diff --git a/e2e/vmss.go b/e2e/vmss.go
index e09a4544f78..3fbc00c1c86 100644
--- a/e2e/vmss.go
+++ b/e2e/vmss.go
@@ -81,25 +81,58 @@ func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, erro
return vm, err
}
-// CustomDataWithHack is similar to nodeconfigutils.CustomData, but it uses a hack to run new aks-node-controller binary
+// CustomDataWithHack is similar to nodeconfigutils.CustomData, but it uses a hack to run new aks-node-controller binary.
// Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists
-// check aks-node-controller.service for details
-// a new binary is downloaded from the given URL and run with provision command
+// (check aks-node-controller.service for details).
+//
+// Uses a cloud-boothook to write the config file and create a systemd service unit early in boot (during cloud-init init).
+// The systemd service waits for network-online.target before downloading the binary and running provisioning,
+// avoiding the race condition where runcmd or boothook scripts execute before networking is available.
+// Flatcar cannot use boothooks (its coreos-cloudinit handles neither boothooks nor MIME multipart), so it uses
+// a plain cloud-config with a coreos.units block to define and start the service instead.
func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) {
- cloudConfigTemplate := `#cloud-config
-write_files:
-- path: /opt/azure/containers/aks-node-controller-config-hack.json
- permissions: "0755"
- owner: root
- content: !!binary |
- %s
-runcmd:
- - mkdir -p /opt/azure/bin
- - curl -fSL "%s" -o /opt/azure/bin/aks-node-controller-hack
- - chmod +x /opt/azure/bin/aks-node-controller-hack
- - /opt/azure/bin/aks-node-controller-hack provision --provision-config=/opt/azure/containers/aks-node-controller-config-hack.json &
+ cloudConfigTemplate := `#cloud-boothook
+#!/bin/bash
+set -euo pipefail
+
+mkdir -p /opt/azure/containers /opt/azure/bin
+
+cat <<'EOF' | base64 -d > /opt/azure/containers/aks-node-controller-config-hack.json
+%s
+EOF
+chmod 0755 /opt/azure/containers/aks-node-controller-config-hack.json
+
+cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh
+#!/bin/bash
+set -euo pipefail
+mkdir -p /opt/azure/bin
+curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack
+chmod +x /opt/azure/bin/aks-node-controller-hack
+/opt/azure/bin/aks-node-controller-hack provision --provision-config=/opt/azure/containers/aks-node-controller-config-hack.json
+SCRIPT
+chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh
+
+cat <<'UNIT' > /etc/systemd/system/aks-node-controller-hack.service
+[Unit]
+Description=Downloads and runs the AKS node controller hack
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh
+
+[Install]
+WantedBy=basic.target
+UNIT
+
+systemctl daemon-reload
+systemctl start --no-block aks-node-controller-hack.service
`
if s.VHD.Flatcar {
+ // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features
+ // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead.
+ // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters
cloudConfigTemplate = `#cloud-config
write_files:
- path: /opt/azure/containers/aks-node-controller-config-hack.json
@@ -114,7 +147,7 @@ write_files:
#!/bin/bash
set -euo pipefail
mkdir -p /opt/azure/bin
- curl -fSL "%s" -o /opt/azure/bin/aks-node-controller-hack
+ curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack
chmod +x /opt/azure/bin/aks-node-controller-hack
/opt/azure/bin/aks-node-controller-hack provision --provision-config=/opt/azure/containers/aks-node-controller-config-hack.json
# Flatcar specific configuration. It supports only a subset of cloud-init features https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters
@@ -154,7 +187,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine
cse = nodeconfigutils.CSE
customData = func() string {
if config.Config.DisableScriptLessCompilation {
- data, err := nodeconfigutils.CustomData(s.Runtime.AKSNodeConfig)
+ var data string
+ var err error
+ if s.VHD.Flatcar {
+ data, err = nodeconfigutils.CustomDataFlatcar(s.Runtime.AKSNodeConfig)
+ } else {
+ data, err = nodeconfigutils.CustomData(s.Runtime.AKSNodeConfig)
+ }
require.NoError(s.T, err, "failed to generate custom data from AKSNodeConfig")
return data
}
diff --git a/parts/linux/cloud-init/artifacts/aks-node-controller.service b/parts/linux/cloud-init/artifacts/aks-node-controller.service
index b1571a9ba0b..0f1ab3a68bb 100644
--- a/parts/linux/cloud-init/artifacts/aks-node-controller.service
+++ b/parts/linux/cloud-init/artifacts/aks-node-controller.service
@@ -1,15 +1,13 @@
[Unit]
Description=Parse contract and run csecmd
ConditionPathExists=/opt/azure/containers/aks-node-controller-config.json
-After=cloud-init.target
-After=oem-cloudinit.service enable-oem-cloudinit.service
-Wants=cloud-init.target
+After=network-online.target
+Wants=network-online.target
[Service]
Type=oneshot
ExecStart=/opt/azure/containers/aks-node-controller-wrapper.sh
-RemainAfterExit=No
+RemainAfterExit=yes
[Install]
-WantedBy=cloud-init.target
-WantedBy=oem-cloudinit.service
+WantedBy=basic.target
diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh
index 31c8367abf2..7bd1ff5a88a 100755
--- a/parts/linux/cloud-init/artifacts/cse_main.sh
+++ b/parts/linux/cloud-init/artifacts/cse_main.sh
@@ -56,7 +56,11 @@ get_ubuntu_release() {
# After completion, this VHD can be used as a base image for creating new node pools.
# Users may add custom configurations or pull additional container images after this stage.
function basePrep {
- logs_to_events "AKS.CSE.aptmarkWALinuxAgent" aptmarkWALinuxAgent hold &
+ if [ "${SKIP_WAAGENT_HOLD}" = "true" ]; then
+ echo "Skipping holding walinuxagent"
+ else
+ logs_to_events "AKS.CSE.aptmarkWALinuxAgent" aptmarkWALinuxAgent hold &
+ fi
logs_to_events "AKS.CSE.configureAdminUser" configureAdminUser
@@ -294,7 +298,6 @@ EOF
logs_to_events "AKS.CSE.ensureContainerd.ensureArtifactStreaming" ensureArtifactStreaming || exit $ERR_ARTIFACT_STREAMING_INSTALL
fi
- # This is to enable localdns using scriptless.
if [ "${SHOULD_ENABLE_LOCALDNS}" = "true" ]; then
logs_to_events "AKS.CSE.enableLocalDNS" enableLocalDNS || exit $ERR_LOCALDNS_FAIL
fi
@@ -493,8 +496,12 @@ function nodePrep {
echo 'reboot required, rebooting node in 1 minute'
/bin/bash -c "shutdown -r 1 &"
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
- # logs_to_events should not be run on & commands
- aptmarkWALinuxAgent unhold &
+ if [ "${SKIP_WAAGENT_HOLD}" = "true" ]; then
+ echo "Skipping unholding walinuxagent"
+ else
+ # logs_to_events should not be run on & commands
+ aptmarkWALinuxAgent unhold &
+ fi
fi
else
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
@@ -516,7 +523,11 @@ function nodePrep {
systemctl restart --no-block apt-daily.service
fi
- aptmarkWALinuxAgent unhold &
+ if [ "${SKIP_WAAGENT_HOLD}" = "true" ]; then
+ echo "Skipping unholding walinuxagent"
+ else
+ aptmarkWALinuxAgent unhold &
+ fi
elif isMarinerOrAzureLinux "$OS"; then
if [ "${ENABLE_UNATTENDED_UPGRADES}" = "true" ]; then
if [ "${IS_KATA}" = "true" ]; then