-
Notifications
You must be signed in to change notification settings - Fork 251
feat: run aks node controller at boot time faster by 15s #8082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,21 +1,4 @@ | ||
| echo $(date),$(hostname) > ${PROVISION_OUTPUT}; | ||
| {{if not .GetDisableCustomData}} | ||
| CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh"; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that this PR aims to remove the hard dependency on cloud-init reaching a ready status. However, cloud-init-status-check.sh was added as a repair item for an intermittent Sev2. That doesn't mean we can't remove it; we just need to be aware that removing it could cause intermittent Sev2s.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We run the service before cloud-init has even finished, so waiting for it doesn't make sense.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add this as part of provision-wait?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sync'd offline. Nishchay checked with the original owner of |
||
| cloudInitExitCode=0; | ||
| if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then | ||
| /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1; | ||
| else | ||
| cloud-init status --wait > /dev/null 2>&1; | ||
| fi; | ||
| cloudInitExitCode=$?; | ||
| if [ "$cloudInitExitCode" -eq 0 ]; then | ||
| echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; | ||
| else | ||
| echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT}; | ||
| cat ${PROVISION_OUTPUT} | ||
| exit ${cloudInitExitCode}; | ||
| fi; | ||
| {{end}} | ||
| {{if getIsAksCustomCloud .CustomCloudConfig}} | ||
| REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" | ||
| {{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; | ||
|
Comment on lines
1
to
4
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,34 +1,121 @@ | ||
| package nodeconfigutils | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "encoding/base64" | ||
| "fmt" | ||
| "mime/multipart" | ||
| "net/textproto" | ||
|
|
||
| aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1" | ||
| "google.golang.org/protobuf/encoding/protojson" | ||
| ) | ||
|
|
||
| const ( | ||
| cloudConfigTemplate = `#cloud-config | ||
| write_files: | ||
| - path: /opt/azure/containers/aks-node-controller-config.json | ||
| permissions: "0755" | ||
| owner: root | ||
| content: !!binary | | ||
| %s` | ||
| CSE = "/opt/azure/containers/aks-node-controller provision-wait" | ||
|
|
||
| boothookTemplate = `#cloud-boothook | ||
| #!/bin/bash | ||
| set -euo pipefail | ||
|
|
||
| logger -t aks-boothook "boothook start $(date -Ins)" | ||
|
|
||
| mkdir -p /opt/azure/containers | ||
|
|
||
| cat <<'EOF' | base64 -d >/opt/azure/containers/aks-node-controller-config.json | ||
| %s | ||
| EOF | ||
| chmod 0644 /opt/azure/containers/aks-node-controller-config.json | ||
|
|
||
|
|
||
| logger -t aks-boothook "launching aks-node-controller service $(date -Ins)" | ||
| systemctl start --no-block aks-node-controller.service | ||
| ` | ||
|
|
||
| cloudConfigTemplate = `#cloud-config | ||
| runcmd: | ||
| - echo "AKS Node Controller cloud-init completed at $(date)" | ||
| ` | ||
|
|
||
| flatcarTemplate = `{ | ||
| "ignition": { "version": "3.4.0" }, | ||
| "storage": { | ||
| "files": [{ | ||
| "path": "/opt/azure/containers/aks-node-controller-config.json", | ||
| "mode": 420, | ||
| "contents": { "source": "data:;base64,%s" } | ||
| }] | ||
| } | ||
| }` | ||
| ) | ||
|
|
||
| // CustomData builds a base64-encoded MIME multipart document to be used as VM custom data for cloud-init. | ||
| // It encodes the node configuration as JSON, embeds it in a cloud-boothook script that writes the config | ||
| // to disk and starts the aks-node-controller service, then pairs it with a cloud-config part. Cloud-init | ||
| // processes each MIME part according to its Content-Type during the VM's first boot. | ||
| func CustomData(cfg *aksnodeconfigv1.Configuration) (string, error) { | ||
| aksNodeConfigJSON, err := MarshalConfigurationV1(cfg) | ||
| if err != nil { | ||
| return "", fmt.Errorf("failed to marshal nbc, error: %w", err) | ||
| } | ||
|
|
||
| encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) | ||
| customDataYAML := fmt.Sprintf(cloudConfigTemplate, encodedAksNodeConfigJSON) | ||
| boothook := fmt.Sprintf(boothookTemplate, encodedAksNodeConfigJSON) | ||
|
|
||
| var customData bytes.Buffer | ||
| writer := multipart.NewWriter(&customData) | ||
|
|
||
| fmt.Fprintf(&customData, "MIME-Version: 1.0\r\n") | ||
| fmt.Fprintf(&customData, "Content-Type: multipart/mixed; boundary=%q\r\n\r\n", writer.Boundary()) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question: what is the reason we need it to be multipart MIME?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| if err := writeMIMEPart(writer, "text/cloud-boothook", boothook); err != nil { | ||
| return "", fmt.Errorf("failed to write boothook part: %w", err) | ||
| } | ||
| if err := writeMIMEPart(writer, "text/cloud-config", cloudConfigTemplate); err != nil { | ||
| return "", fmt.Errorf("failed to write cloud-config part: %w", err) | ||
| } | ||
| if err := writer.Close(); err != nil { | ||
| return "", fmt.Errorf("failed to finalize multipart custom data: %w", err) | ||
| } | ||
|
|
||
| return base64.StdEncoding.EncodeToString(customData.Bytes()), nil | ||
| } | ||
|
|
||
| // CustomDataFlatcar builds base64-encoded custom data for Flatcar Container Linux nodes. | ||
| // Unlike Ubuntu/Azure Linux which use cloud-init and expect MIME multipart custom data, | ||
| // Flatcar uses Ignition (configured via Butane) to process machine configuration. Ignition | ||
| // consumes a JSON document that declaratively specifies files to write to disk, so we embed | ||
| // the node config directly as a base64 data URI in an Ignition storage entry instead of | ||
| // wrapping it in a MIME multipart boothook script. | ||
| func CustomDataFlatcar(cfg *aksnodeconfigv1.Configuration) (string, error) { | ||
| aksNodeConfigJSON, err := MarshalConfigurationV1(cfg) | ||
| if err != nil { | ||
| return "", fmt.Errorf("failed to marshal nbc, error: %w", err) | ||
| } | ||
|
|
||
| encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) | ||
| customDataYAML := fmt.Sprintf(flatcarTemplate, encodedAksNodeConfigJSON) | ||
| return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil | ||
| } | ||
|
|
||
| // writeMIMEPart writes a single part to a MIME multipart message. Cloud-init expects custom data | ||
| // as a MIME multipart document where each part carries a Content-Type that tells cloud-init how to | ||
| // process it (e.g. "text/cloud-boothook" for early-boot scripts, "text/cloud-config" for declarative | ||
| // cloud-config YAML). This helper creates one such part with the appropriate headers. | ||
| func writeMIMEPart(writer *multipart.Writer, contentType, content string) error { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does MIME stand for? Can we add a comment explaining |
||
| header := textproto.MIMEHeader{} | ||
| header.Set("Content-Type", contentType) | ||
| header.Set("MIME-Version", "1.0") | ||
| header.Set("Content-Transfer-Encoding", "7bit") | ||
|
|
||
| part, err := writer.CreatePart(header) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| _, err = part.Write([]byte(content)) | ||
| return err | ||
| } | ||
|
|
||
| func MarshalConfigurationV1(cfg *aksnodeconfigv1.Configuration) ([]byte, error) { | ||
| options := protojson.MarshalOptions{ | ||
| UseEnumNumbers: false, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -81,25 +81,58 @@ func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, erro | |||||||||||||
| return vm, err | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| // CustomDataWithHack is similar to nodeconfigutils.CustomData, but it uses a hack to run new aks-node-controller binary | ||||||||||||||
| // CustomDataWithHack is similar to nodeconfigutils.CustomData, but it uses a hack to run new aks-node-controller binary. | ||||||||||||||
| // Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists | ||||||||||||||
| // check aks-node-controller.service for details | ||||||||||||||
| // a new binary is downloaded from the given URL and run with provision command | ||||||||||||||
| // (check aks-node-controller.service for details). | ||||||||||||||
| // | ||||||||||||||
| // Uses a cloud-boothook to write the config file and create a systemd service unit early in boot (during cloud-init init). | ||||||||||||||
| // The systemd service waits for network-online.target before downloading the binary and running provisioning, | ||||||||||||||
| // avoiding the race condition where runcmd or boothook scripts execute before networking is available. | ||||||||||||||
| // Flatcar cannot use boothooks (coreos-cloudinit doesn't support MIME multipart), so it uses cloud-config | ||||||||||||||
| // with a coreos.units block to define and start the service instead. | ||||||||||||||
| func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) { | ||||||||||||||
| cloudConfigTemplate := `#cloud-config | ||||||||||||||
| write_files: | ||||||||||||||
| - path: /opt/azure/containers/aks-node-controller-config-hack.json | ||||||||||||||
| permissions: "0755" | ||||||||||||||
| owner: root | ||||||||||||||
| content: !!binary | | ||||||||||||||
| %s | ||||||||||||||
| runcmd: | ||||||||||||||
| - mkdir -p /opt/azure/bin | ||||||||||||||
| - curl -fSL "%s" -o /opt/azure/bin/aks-node-controller-hack | ||||||||||||||
| - chmod +x /opt/azure/bin/aks-node-controller-hack | ||||||||||||||
| - /opt/azure/bin/aks-node-controller-hack provision --provision-config=/opt/azure/containers/aks-node-controller-config-hack.json & | ||||||||||||||
| cloudConfigTemplate := `#cloud-boothook | ||||||||||||||
| #!/bin/bash | ||||||||||||||
| set -euo pipefail | ||||||||||||||
|
|
||||||||||||||
| mkdir -p /opt/azure/containers /opt/azure/bin | ||||||||||||||
|
|
||||||||||||||
| cat <<'EOF' | base64 -d > /opt/azure/containers/aks-node-controller-config-hack.json | ||||||||||||||
| %s | ||||||||||||||
| EOF | ||||||||||||||
| chmod 0755 /opt/azure/containers/aks-node-controller-config-hack.json | ||||||||||||||
|
|
||||||||||||||
| cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh | ||||||||||||||
| #!/bin/bash | ||||||||||||||
| set -euo pipefail | ||||||||||||||
| mkdir -p /opt/azure/bin | ||||||||||||||
| curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack | ||||||||||||||
| chmod +x /opt/azure/bin/aks-node-controller-hack | ||||||||||||||
| /opt/azure/bin/aks-node-controller-hack provision --provision-config=/opt/azure/containers/aks-node-controller-config-hack.json | ||||||||||||||
| SCRIPT | ||||||||||||||
| chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh | ||||||||||||||
|
|
||||||||||||||
| cat <<'UNIT' > /etc/systemd/system/aks-node-controller-hack.service | ||||||||||||||
| [Unit] | ||||||||||||||
| Description=Downloads and runs the AKS node controller hack | ||||||||||||||
| After=network-online.target | ||||||||||||||
| Wants=network-online.target | ||||||||||||||
|
|
||||||||||||||
| [Service] | ||||||||||||||
| Type=oneshot | ||||||||||||||
| ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh | ||||||||||||||
|
|
||||||||||||||
| [Install] | ||||||||||||||
| WantedBy=basic.target | ||||||||||||||
| UNIT | ||||||||||||||
|
|
||||||||||||||
| systemctl daemon-reload | ||||||||||||||
| systemctl start --no-block aks-node-controller-hack.service | ||||||||||||||
| ` | ||||||||||||||
|
Comment on lines
+129
to
131
|
||||||||||||||
| systemctl daemon-reload | |
| systemctl enable aks-node-controller-hack.service | |
| ` | |
| systemctl daemon-reload | |
| systemctl enable --now aks-node-controller-hack.service | |
| ` |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: add a comment explaining why this is different for Flatcar?
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,15 +1,13 @@ | ||
| [Unit] | ||
| Description=Parse contract and run csecmd | ||
| ConditionPathExists=/opt/azure/containers/aks-node-controller-config.json | ||
| After=cloud-init.target | ||
| After=oem-cloudinit.service enable-oem-cloudinit.service | ||
| Wants=cloud-init.target | ||
| After=network-online.target | ||
| Wants=network-online.target | ||
|
|
||
| [Service] | ||
| Type=oneshot | ||
| ExecStart=/opt/azure/containers/aks-node-controller-wrapper.sh | ||
| RemainAfterExit=No | ||
| RemainAfterExit=yes | ||
awesomenix marked this conversation as resolved.
Show resolved
Hide resolved
awesomenix marked this conversation as resolved.
Show resolved
Hide resolved
Devinwong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| [Install] | ||
| WantedBy=cloud-init.target | ||
| WantedBy=oem-cloudinit.service | ||
| WantedBy=basic.target | ||
awesomenix marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
3
to
+13
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The README states
`aks-node-controller.service` remains enabled on the VHD as a fallback boot hook, but this PR’s unit file change removes the [Install] section (so `systemctl enable aks-node-controller.service` fails during VHD build). Either update this doc to match the new enable/start model, or restore an enable-able unit definition so the fallback claim is accurate.