diff --git a/.pipelines/scripts/verify_shell.sh b/.pipelines/scripts/verify_shell.sh index 8d8241131e7..f55d5529d06 100755 --- a/.pipelines/scripts/verify_shell.sh +++ b/.pipelines/scripts/verify_shell.sh @@ -30,6 +30,7 @@ filesToCheck=$(find . -type f -name "*.sh" -not -path './pkg/agent/testdata/*' - # Known bash-only scripts that intentionally use bash specific syntax. BASH_ONLY_LIST=$(cat <<'EOF' ./vhdbuilder/packer/install-ig.sh +./parts/linux/cloud-init/artifacts/aks-hosts-setup.sh EOF ) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index f5644dcda02..8f302972822 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -719,11 +719,19 @@ func getFuncMapForLocalDnsCorefileTemplate() template.FuncMap { } } -// getLocalDnsCorefileBase64 returns the base64 encoded LocalDns corefile. -// base64 encoded corefile returned from this function will decoded and written -// to /opt/azure/containers/localdns/localdns.corefile in cse_config.sh -// and then used by localdns systemd unit to start localdns systemd unit. -func getLocalDnsCorefileBase64(aksnodeconfig *aksnodeconfigv1.Configuration) string { +// getLocalDnsCorefileBase64WithHostsPlugin generates and returns the base64-encoded LocalDns corefile +// with or without the hosts plugin, depending on the includeHostsPlugin parameter. +// +// The generated content is returned as a base64-encoded string and stored in environment variables: +// - LOCALDNS_GENERATED_COREFILE (kept for backward compat with old VHDs) +// - LOCALDNS_COREFILE_BASE (standard, without experimental plugins) +// - LOCALDNS_COREFILE_EXPERIMENTAL (with experimental plugins e.g. hosts plugin) +// +// The actual file writing happens in shell scripts (cse_config.sh), which decode and write +// a selected variant to /opt/azure/containers/localdns/localdns.corefile after populating the env file. 
+// Runtime selection between LOCALDNS_COREFILE_BASE and LOCALDNS_COREFILE_EXPERIMENTAL happens in localdns.sh +// (via select_localdns_corefile(), invoked on localdns service start/restart) based on the availability of /etc/localdns/hosts. +func getLocalDnsCorefileBase64WithHostsPlugin(aksnodeconfig *aksnodeconfigv1.Configuration, includeHostsPlugin bool) string { if aksnodeconfig == nil { return "" } @@ -737,17 +745,33 @@ func getLocalDnsCorefileBase64(aksnodeconfig *aksnodeconfigv1.Configuration) str return "" } - localDnsConfig, err := generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig) + variant := "with hosts plugin" + if !includeHostsPlugin { + variant = "without hosts plugin" + } + + localDnsConfig, err := generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig, includeHostsPlugin) if err != nil { - return fmt.Sprintf("error getting localdns corfile from aks node config: %v", err) + return fmt.Sprintf("error getting localdns corefile (%s) from aks node config: %v", variant, err) } return base64.StdEncoding.EncodeToString([]byte(localDnsConfig)) } +// localDnsCorefileTemplateData wraps the AKS node config with additional template control flags. +type localDnsCorefileTemplateData struct { + Config *aksnodeconfigv1.Configuration + IncludeHostsPlugin bool +} + // Corefile is created using localdns.toml.gtpl template and aksnodeconfig values. -func generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig *aksnodeconfigv1.Configuration) (string, error) { +// includeHostsPlugin controls whether the hosts plugin block is included in the generated Corefile. 
+func generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig *aksnodeconfigv1.Configuration, includeHostsPlugin bool) (string, error) { var corefileBuffer bytes.Buffer - if err := localDnsCorefileTemplate.Execute(&corefileBuffer, aksnodeconfig); err != nil { + templateData := localDnsCorefileTemplateData{ + Config: aksnodeconfig, + IncludeHostsPlugin: includeHostsPlugin, + } + if err := localDnsCorefileTemplate.Execute(&corefileBuffer, templateData); err != nil { return "", fmt.Errorf("failed to execute localdns corefile template: %w", err) } return corefileBuffer.String(), nil @@ -785,6 +809,13 @@ func shouldEnableLocalDns(aksnodeconfig *aksnodeconfigv1.Configuration) string { return fmt.Sprintf("%v", aksnodeconfig != nil && aksnodeconfig.GetLocalDnsProfile() != nil && aksnodeconfig.GetLocalDnsProfile().GetEnableLocalDns()) } +// shouldEnableHostsPlugin returns true if LocalDNS is enabled and the hosts plugin +// is explicitly enabled. When true, the localdns Corefile will include a hosts plugin +// block that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. +func shouldEnableHostsPlugin(aksnodeconfig *aksnodeconfigv1.Configuration) string { + return fmt.Sprintf("%v", shouldEnableLocalDns(aksnodeconfig) == "true" && aksnodeconfig.GetLocalDnsProfile().GetEnableHostsPlugin()) +} + // getLocalDnsCpuLimitInPercentage returns CPU limit in percentage unit that will be used in localdns systemd unit. 
func getLocalDnsCpuLimitInPercentage(aksnodeconfig *aksnodeconfigv1.Configuration) string { if shouldEnableLocalDns(aksnodeconfig) == "true" && aksnodeconfig.GetLocalDnsProfile().GetCpuLimitInMilliCores() != 0 { diff --git a/aks-node-controller/parser/helper_test.go b/aks-node-controller/parser/helper_test.go index 46b05bc6550..263f45b400d 100644 --- a/aks-node-controller/parser/helper_test.go +++ b/aks-node-controller/parser/helper_test.go @@ -1446,6 +1446,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 168.63.129.16 { policy sequential max_concurrent 1000 @@ -1509,6 +1513,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 2000 @@ -1627,7 +1635,7 @@ func Test_getLocalDNSCorefileBase64(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := getLocalDnsCorefileBase64(tt.args.aksnodeconfig) + got := getLocalDnsCorefileBase64WithHostsPlugin(tt.args.aksnodeconfig, true) if tt.wantContains == "" && got != "" { t.Errorf("expected empty string, got %q", got) @@ -1711,6 +1719,71 @@ func Test_shouldEnableLocalDns(t *testing.T) { } } +func Test_shouldEnableHostsPlugin(t *testing.T) { + type args struct { + aksnodeconfig *aksnodeconfigv1.Configuration + } + tests := []struct { + name string + args args + want string + }{ + { + name: "nil config", + args: args{aksnodeconfig: nil}, + want: "false", + }, + { + name: "nil LocalDnsProfile", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{}}, + want: "false", + }, + { + name: "LocalDns disabled, HostsPlugin enabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + 
LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: false, + EnableHostsPlugin: true}, + }}, + want: "false", + }, + { + name: "LocalDns enabled, HostsPlugin disabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: false}, + }}, + want: "false", + }, + { + name: "both LocalDns and HostsPlugin enabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: true}, + }}, + want: "true", + }, + { + name: "both disabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: false, + EnableHostsPlugin: false}, + }}, + want: "false", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := shouldEnableHostsPlugin(tt.args.aksnodeconfig); got != tt.want { + t.Errorf("shouldEnableHostsPlugin() = %v, want %v", got, tt.want) + } + }) + } +} + func Test_getLocalDnsCpuLimitInPercentage(t *testing.T) { type args struct { aksnodeconfig *aksnodeconfigv1.Configuration diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index d8541c45c65..da42f08b58e 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -170,9 +170,12 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "INSERT_IMDS_RESTRICTION_RULE_TO_MANGLE_TABLE": fmt.Sprintf("%v", config.GetImdsRestrictionConfig().GetInsertImdsRestrictionRuleToMangleTable()), "PRE_PROVISION_ONLY": fmt.Sprintf("%v", config.GetPreProvisionOnly()), "SHOULD_ENABLE_LOCALDNS": shouldEnableLocalDns(config), + "SHOULD_ENABLE_HOSTS_PLUGIN": shouldEnableHostsPlugin(config), "LOCALDNS_CPU_LIMIT": getLocalDnsCpuLimitInPercentage(config), "LOCALDNS_MEMORY_LIMIT": getLocalDnsMemoryLimitInMb(config), - 
"LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64(config), + "LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), + "LOCALDNS_COREFILE_BASE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), + "LOCALDNS_COREFILE_EXPERIMENTAL": getLocalDnsCorefileBase64WithHostsPlugin(config, true), "DISABLE_PUBKEY_AUTH": fmt.Sprintf("%v", config.GetDisablePubkeyAuth()), "SERVICE_ACCOUNT_IMAGE_PULL_ENABLED": fmt.Sprintf("%v", config.GetServiceAccountImagePullProfile().GetEnabled()), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_CLIENT_ID": config.GetServiceAccountImagePullProfile().GetDefaultClientId(), diff --git a/aks-node-controller/parser/parser_test.go b/aks-node-controller/parser/parser_test.go index 18a8d66e196..4c3fd343396 100644 --- a/aks-node-controller/parser/parser_test.go +++ b/aks-node-controller/parser/parser_test.go @@ -229,6 +229,38 @@ oom_score = -999 assert.Equal(t, "true", vars["NEEDS_CGROUPV2"]) }, }, + { + name: "AKSUbuntu2204 with LocalDNS and hosts plugin enabled", + folder: "AKSUbuntu2204+LocalDNS+HostsPlugin", + k8sVersion: "1.24.2", + aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + aksNodeConfig.LocalDnsProfile = &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: true, + } + }, + validator: func(cmd *exec.Cmd) { + vars := environToMap(cmd.Env) + assert.Equal(t, "true", vars["SHOULD_ENABLE_LOCALDNS"]) + assert.Equal(t, "true", vars["SHOULD_ENABLE_HOSTS_PLUGIN"]) + }, + }, + { + name: "AKSUbuntu2204 with LocalDNS enabled but hosts plugin disabled", + folder: "AKSUbuntu2204+LocalDNS", + k8sVersion: "1.24.2", + aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + aksNodeConfig.LocalDnsProfile = &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: false, + } + }, + validator: func(cmd *exec.Cmd) { + vars := environToMap(cmd.Env) + assert.Equal(t, "true", vars["SHOULD_ENABLE_LOCALDNS"]) + assert.Equal(t, "false", 
vars["SHOULD_ENABLE_HOSTS_PLUGIN"]) + }, + }, } for _, tt := range tests { diff --git a/aks-node-controller/parser/templates/localdns.toml.gtpl b/aks-node-controller/parser/templates/localdns.toml.gtpl index a636c357362..d503057486c 100644 --- a/aks-node-controller/parser/templates/localdns.toml.gtpl +++ b/aks-node-controller/parser/templates/localdns.toml.gtpl @@ -7,7 +7,7 @@ health-check.localdns.local:53 { whoami } # VnetDNS overrides apply to DNS traffic from pods with dnsPolicy:default or kubelet (referred to as VnetDNS traffic). -{{- range $domain, $override := $.LocalDnsProfile.VnetDnsOverrides -}} +{{- range $domain, $override := $.Config.LocalDnsProfile.VnetDnsOverrides -}} {{- $isRootDomain := eq $domain "." -}} {{- $fwdToClusterCoreDNS := or (hasSuffix $domain "cluster.local") (eq $override.ForwardDestination "ClusterCoreDNS")}} {{- $forwardPolicy := "sequential" -}} @@ -23,11 +23,17 @@ health-check.localdns.local:53 { log {{- end }} bind {{getLocalDnsNodeListenerIp}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $isRootDomain}} forward . {{getAzureDnsIp}} { {{- else}} {{- if $fwdToClusterCoreDNS}} - forward . {{getCoreDnsServiceIp $}} { + forward . {{getCoreDnsServiceIp $.Config}} { {{- else}} forward . {{getAzureDnsIp}} { {{- end}} @@ -67,7 +73,7 @@ health-check.localdns.local:53 { } {{- end}} # KubeDNS overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDNS traffic). -{{- range $domain, $override := $.LocalDnsProfile.KubeDnsOverrides}} +{{- range $domain, $override := $.Config.LocalDnsProfile.KubeDnsOverrides}} {{- $isRootDomain := eq $domain "." 
-}} {{- $fwdToClusterCoreDNS := or (hasSuffix $domain "cluster.local") (eq $override.ForwardDestination "ClusterCoreDNS")}} {{- $forwardPolicy := "" }} @@ -84,8 +90,14 @@ health-check.localdns.local:53 { log {{- end }} bind {{getLocalDnsClusterListenerIp}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $fwdToClusterCoreDNS}} - forward . {{getCoreDnsServiceIp $}} { + forward . {{getCoreDnsServiceIp $.Config}} { {{- else}} forward . {{getAzureDnsIp}} { {{- end}} diff --git a/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand new file mode 100644 index 00000000000..1cd02c61bec --- /dev/null +++ b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand @@ -0,0 +1 @@ +/bin/bash -c echo $(date),$(hostname) > ${PROVISION_OUTPUT}; CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh"; cloudInitExitCode=0; if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1; else cloud-init status --wait > /dev/null 2>&1; fi; cloudInitExitCode=$?; if [ "$cloudInitExitCode" -eq 0 ]; then echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; else echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT}; cat ${PROVISION_OUTPUT} exit ${cloudInitExitCode}; fi; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" \ No newline at end of file diff --git a/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand 
b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand new file mode 100644 index 00000000000..1cd02c61bec --- /dev/null +++ b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand @@ -0,0 +1 @@ +/bin/bash -c echo $(date),$(hostname) > ${PROVISION_OUTPUT}; CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh"; cloudInitExitCode=0; if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1; else cloud-init status --wait > /dev/null 2>&1; fi; cloudInitExitCode=$?; if [ "$cloudInitExitCode" -eq 0 ]; then echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; else echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT}; cat ${PROVISION_OUTPUT} exit ${cloudInitExitCode}; fi; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" \ No newline at end of file diff --git a/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go b/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go index 9f1a7d7af64..2b3560c8566 100644 --- a/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go +++ b/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go @@ -36,6 +36,10 @@ type LocalDnsProfile struct { VnetDnsOverrides map[string]*LocalDnsOverrides `protobuf:"bytes,4,rep,name=vnet_dns_overrides,json=vnetDnsOverrides,proto3" json:"vnet_dns_overrides,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDns traffic). 
KubeDnsOverrides map[string]*LocalDnsOverrides `protobuf:"bytes,5,rep,name=kube_dns_overrides,json=kubeDnsOverrides,proto3" json:"kube_dns_overrides,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Specifies whether the hosts plugin should be enabled in the localdns Corefile. + // When true and LocalDNS is enabled, the Corefile will include a hosts plugin block + // that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. + EnableHostsPlugin bool `protobuf:"varint,6,opt,name=enable_hosts_plugin,json=enableHostsPlugin,proto3" json:"enable_hosts_plugin,omitempty"` } func (x *LocalDnsProfile) Reset() { @@ -103,6 +107,13 @@ func (x *LocalDnsProfile) GetKubeDnsOverrides() map[string]*LocalDnsOverrides { return nil } +func (x *LocalDnsProfile) GetEnableHostsPlugin() bool { + if x != nil { + return x.EnableHostsPlugin + } + return false +} + // Represents DNS override settings for both VnetDNS and KubeDNS traffic. // VnetDns overrides apply to DNS traffic from pods with dnsPolicy:default or kubelet. // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst. 
@@ -221,7 +232,7 @@ var file_aksnodeconfig_v1_localdns_config_proto_rawDesc = []byte{ 0x0a, 0x26, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x76, 0x31, 0x2f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x64, 0x6e, 0x73, 0x5f, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x10, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, - 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0x80, 0x05, 0x0a, 0x0f, 0x4c, + 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0xb0, 0x05, 0x0a, 0x0f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x44, 0x6e, 0x73, 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x28, 0x0a, 0x10, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x64, 0x6e, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0e, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, @@ -245,7 +256,10 @@ var file_aksnodeconfig_v1_localdns_config_proto_rawDesc = []byte{ 0x63, 0x61, 0x6c, 0x44, 0x6e, 0x73, 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x2e, 0x4b, 0x75, 0x62, 0x65, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x10, 0x6b, 0x75, 0x62, 0x65, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, - 0x72, 0x69, 0x64, 0x65, 0x73, 0x1a, 0x68, 0x0a, 0x15, 0x56, 0x6e, 0x65, 0x74, 0x44, 0x6e, 0x73, + 0x72, 0x69, 0x64, 0x65, 0x73, 0x12, 0x2e, 0x0a, 0x13, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, + 0x68, 0x6f, 0x73, 0x74, 0x73, 0x5f, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x08, 0x52, 0x11, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x48, 0x6f, 0x73, 0x74, 0x73, 0x50, + 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x1a, 0x68, 0x0a, 0x15, 0x56, 0x6e, 0x65, 0x74, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x39, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 
0x0b, 0x32, diff --git a/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto b/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto index ddc62b93e01..f4135ac697a 100644 --- a/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto +++ b/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto @@ -19,6 +19,11 @@ message LocalDnsProfile { // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDns traffic). map kube_dns_overrides = 5; + + // Specifies whether the hosts plugin should be enabled in the localdns Corefile. + // When true and LocalDNS is enabled, the Corefile will include a hosts plugin block + // that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. + bool enable_hosts_plugin = 6; } // Represents DNS override settings for both VnetDNS and KubeDNS traffic. diff --git a/e2e/aks_model.go b/e2e/aks_model.go index 7498d92c0d1..42899102d2b 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -5,9 +5,11 @@ import ( "errors" "fmt" "net" + "net/http" "os" "path/filepath" "strings" + "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/e2e/toolkit" @@ -856,6 +858,12 @@ func createPrivateEndpoint(ctx context.Context, nodeResourceGroup, privateEndpoi } func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { + return createPrivateZoneWithTags(ctx, nodeResourceGroup, privateZoneName, map[string]*string{ + "e2e-test": to.Ptr("true"), + }) +} + +func createPrivateZoneWithTags(ctx context.Context, nodeResourceGroup, privateZoneName string, tags map[string]*string) (*armprivatedns.PrivateZone, error) { pzResp, err := config.Azure.PrivateZonesClient.Get( ctx, nodeResourceGroup, @@ -867,6 +875,7 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s } dnsZoneParams := armprivatedns.PrivateZone{ Location: to.Ptr("global"), + Tags: tags, } 
poller, err := config.Azure.PrivateZonesClient.BeginCreateOrUpdate( ctx, @@ -888,7 +897,10 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s } func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName string) error { - networkLinkName := "link-ABE2ETests" + return createPrivateDNSLinkWithName(ctx, vnet, nodeResourceGroup, privateZoneName, "link-ABE2ETests") +} + +func createPrivateDNSLinkWithName(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName, networkLinkName string) error { _, err := config.Azure.VirutalNetworkLinksClient.Get( ctx, nodeResourceGroup, @@ -975,6 +987,89 @@ func addRecordSetToPrivateDNSZone(ctx context.Context, privateEndpoint *armnetwo return nil } +// cleanupPrivateDNSZone deletes a Private DNS zone (best effort cleanup for tests) +func cleanupPrivateDNSZone(ctx context.Context, resourceGroup, zoneName string) { + // Create a new context with timeout for cleanup + cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) + defer cancel() + + toolkit.Logf(cleanupCtx, "Deleting Private DNS zone %s in resource group %s", zoneName, resourceGroup) + + // First, delete all VNET links (required before zone deletion) + linkPager := config.Azure.VirutalNetworkLinksClient.NewListPager(resourceGroup, zoneName, nil) + for linkPager.More() { + linkPage, err := linkPager.NextPage(cleanupCtx) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to list VNET links for zone %s: %v", zoneName, err) + break + } + + for _, link := range linkPage.Value { + if link == nil || link.Name == nil { + continue + } + + toolkit.Logf(cleanupCtx, "Deleting VNET link %s from zone %s...", *link.Name, zoneName) + linkPoller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, *link.Name, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to start deletion of VNET link %s: %v", *link.Name, err) + continue + } + + _, err 
= linkPoller.PollUntilDone(cleanupCtx, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to delete VNET link %s: %v", *link.Name, err) + continue + } + toolkit.Logf(cleanupCtx, "Deleted VNET link %s", *link.Name) + } + } + + // Now delete the Private DNS zone itself + poller, err := config.Azure.PrivateZonesClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to start deletion of Private DNS zone %s: %v", zoneName, err) + return + } + + _, err = poller.PollUntilDone(cleanupCtx, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to complete deletion of Private DNS zone %s: %v", zoneName, err) + return + } + + toolkit.Logf(cleanupCtx, "Successfully deleted Private DNS zone %s", zoneName) +} + +// deletePrivateDNSVNETLink deletes a specific VNET link from a Private DNS zone. +// This is used to clean up individual test resources without affecting other parallel tests. +func deletePrivateDNSVNETLink(ctx context.Context, resourceGroup, zoneName, linkName string) error { + // Create a new context with timeout for cleanup + cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute) + defer cancel() + + toolkit.Logf(cleanupCtx, "Deleting VNET link %s from Private DNS zone %s in resource group %s", linkName, zoneName, resourceGroup) + + linkPoller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, linkName, nil) + if err != nil { + // If the link doesn't exist, that's fine (already cleaned up or never created) + var respErr *azcore.ResponseError + if errors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound { + toolkit.Logf(cleanupCtx, "VNET link %s not found (already deleted or never existed)", linkName) + return nil + } + return fmt.Errorf("failed to start deletion of VNET link %s: %w", linkName, err) + } + + _, err = linkPoller.PollUntilDone(cleanupCtx, nil) + if err != nil { + return fmt.Errorf("failed to complete 
deletion of VNET link %s: %w", linkName, err) + } + + toolkit.Logf(cleanupCtx, "Successfully deleted VNET link %s from zone %s", linkName, zoneName) + return nil +} + func addDNSZoneGroup(ctx context.Context, privateZone *armprivatedns.PrivateZone, nodeResourceGroup, privateZoneName, endpointName string) error { groupName := strings.Replace(privateZoneName, ".", "-", -1) // replace . with - _, err := config.Azure.PrivateDNSZoneGroup.Get(ctx, nodeResourceGroup, endpointName, groupName, nil) diff --git a/e2e/cluster.go b/e2e/cluster.go index 589371e2d2b..4c09a4535d5 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -126,6 +126,12 @@ func prepareCluster(ctx context.Context, cluster *armcontainerservice.ManagedClu return nil, fmt.Errorf("collect garbage vmss: %w", err) } + // Clean up orphaned Private DNS zones from failed tests + // These can interfere with DNS resolution during VM provisioning + if err := collectGarbagePrivateDNSZones(ctx, cluster); err != nil { + return nil, fmt.Errorf("collect garbage private dns zones: %w", err) + } + clusterParams, err := extractClusterParameters(ctx, kube, cluster) if err != nil { return nil, fmt.Errorf("extracting cluster parameters: %w", err) @@ -732,6 +738,106 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage return nil } +func collectGarbagePrivateDNSZones(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { + defer toolkit.LogStepCtx(ctx, "collecting garbage Private DNS zones")() + rg := *cluster.Properties.NodeResourceGroup + + // Clean up Private DNS zones created by e2e tests (identified by tags). + // Only delete zones that: + // 1. Have the "e2e-test=true" tag (created by LocalDNS hosts plugin tests) + // 2. 
Are in zones commonly used by e2e tests (additional safety check) + testManagedZonePatterns := []string{ + "mcr.microsoft.com", + "mcr.azure.cn", + } + + // List all Private DNS zones in the node resource group + pager := config.Azure.PrivateZonesClient.NewListByResourceGroupPager(rg, nil) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("failed to get next page of Private DNS zones: %w", err) + } + + for _, zone := range page.Value { + if zone == nil || zone.Name == nil { + continue + } + + zoneName := *zone.Name + + // Safety check 1: Only process zones that match our test patterns + isTestZone := false + for _, pattern := range testManagedZonePatterns { + if zoneName == pattern { + isTestZone = true + break + } + } + + if !isTestZone { + continue + } + + // Safety check 2: Only delete zones with e2e-test tag + if zone.Tags == nil || zone.Tags["e2e-test"] == nil || *zone.Tags["e2e-test"] != "true" { + toolkit.Logf(ctx, "skipping Private DNS zone %q (not tagged as e2e test)", zoneName) + continue + } + + toolkit.Logf(ctx, "found e2e test Private DNS zone %q (tagged), cleaning up...", zoneName) + + // Delete all VNET links first (required before zone deletion) + linkPager := config.Azure.VirutalNetworkLinksClient.NewListPager(rg, zoneName, nil) + for linkPager.More() { + linkPage, err := linkPager.NextPage(ctx) + if err != nil { + toolkit.Logf(ctx, "failed to list VNET links for zone %q: %s", zoneName, err) + break + } + + for _, link := range linkPage.Value { + if link == nil || link.Name == nil { + continue + } + + linkName := *link.Name + toolkit.Logf(ctx, "deleting VNET link %q from e2e test zone %q...", linkName, zoneName) + poller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(ctx, rg, zoneName, linkName, nil) + if err != nil { + toolkit.Logf(ctx, "failed to start deletion of VNET link %q: %s", linkName, err) + continue + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + 
toolkit.Logf(ctx, "failed to delete VNET link %q: %s", linkName, err) + continue + } + toolkit.Logf(ctx, "deleted VNET link %q", linkName) + } + } + + // Now delete the e2e test Private DNS zone itself + toolkit.Logf(ctx, "deleting e2e test Private DNS zone %q...", zoneName) + poller, err := config.Azure.PrivateZonesClient.BeginDelete(ctx, rg, zoneName, nil) + if err != nil { + toolkit.Logf(ctx, "failed to start deletion of Private DNS zone %q: %s", zoneName, err) + continue + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + toolkit.Logf(ctx, "failed to delete Private DNS zone %q: %s", zoneName, err) + continue + } + toolkit.Logf(ctx, "deleted e2e test Private DNS zone %q", zoneName) + } + } + + return nil +} + func ensureResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { resourceGroupName := config.ResourceGroupName(location) rg, err := config.Azure.ResourceGroup.CreateOrUpdate( diff --git a/e2e/scenario_localdns_hosts_test.go b/e2e/scenario_localdns_hosts_test.go new file mode 100644 index 00000000000..dfdfc180c09 --- /dev/null +++ b/e2e/scenario_localdns_hosts_test.go @@ -0,0 +1,79 @@ +package e2e + +import ( + "testing" + + aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1" + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" +) + +// Test_LocalDNSHostsPlugin tests the localdns hosts plugin across all supported distros +// on the legacy (bash CSE) bootstrap path. +// Hosts plugin validators (AA flag, IP match, Corefile, hosts file) run automatically +// via ValidateCommonLinux when EnableHostsPlugin is set. 
+// +// Run a single distro with: go test -run "Test_LocalDNSHostsPlugin/AzureLinuxV3" -v +func Test_LocalDNSHostsPlugin(t *testing.T) { + tests := []struct { + name string + vhd *config.Image + }{ + {name: "Ubuntu2204", vhd: config.VHDUbuntu2204Gen2Containerd}, + {name: "Ubuntu2404", vhd: config.VHDUbuntu2404Gen2Containerd}, + {name: "AzureLinuxV2", vhd: config.VHDAzureLinuxV2Gen2}, + {name: "AzureLinuxV3", vhd: config.VHDAzureLinuxV3Gen2}, + {name: "CBLMarinerV2", vhd: config.VHDCBLMarinerV2Gen2}, + {name: "Flatcar", vhd: config.VHDFlatcarGen2}, + {name: "ACL", vhd: config.VHDACLGen2TL}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on " + tt.name, + Config: Config{ + Cluster: ClusterKubenet, + VHD: tt.vhd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.LocalDNSProfile.EnableHostsPlugin = true + }, + }, + }) + }) + } +} + +// Test_LocalDNSHostsPlugin_Scriptless tests the localdns hosts plugin across all supported distros +// on the scriptless (aks-node-controller) bootstrap path. +// The base AKSNodeConfig from nbcToAKSNodeConfigV1 already includes a full LocalDnsProfile with +// DNS overrides, so the mutator only needs to enable the hosts plugin. 
+// +// Run a single distro with: go test -run "Test_LocalDNSHostsPlugin_Scriptless/Ubuntu2204" -v +func Test_LocalDNSHostsPlugin_Scriptless(t *testing.T) { + tests := []struct { + name string + vhd *config.Image + }{ + {name: "Ubuntu2204", vhd: config.VHDUbuntu2204Gen2Containerd}, + {name: "Ubuntu2404", vhd: config.VHDUbuntu2404Gen2Containerd}, + {name: "AzureLinuxV3", vhd: config.VHDAzureLinuxV3Gen2}, + {name: "Flatcar", vhd: config.VHDFlatcarGen2}, + {name: "ACL", vhd: config.VHDACLGen2TL}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on " + tt.name + " (scriptless)", + Config: Config{ + Cluster: ClusterKubenet, + VHD: tt.vhd, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.LocalDnsProfile.EnableHostsPlugin = true + }, + }, + }) + }) + } +} diff --git a/e2e/types.go b/e2e/types.go index 3766b19d858..9c2260318d2 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -35,7 +35,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool - VMSeriesCoverageTest bool + VMSeriesCoverageTest bool } // MatchesFilters checks if the Tags struct matches all given filters. @@ -396,3 +396,57 @@ func (s *Scenario) IsWindows() bool { func (s *Scenario) IsLinux() bool { return !s.IsWindows() } + +// IsHostsPluginEnabled returns true if the hosts plugin is explicitly enabled +// via either NBC (traditional) or AKSNodeConfig (scriptless) paths. 
+func (s *Scenario) IsHostsPluginEnabled() bool {
+	// Runtime may not be populated yet; report "not enabled" instead of panicking,
+	// using the same nil guard as GetDefaultFQDNsForValidation and GetContainerRegistryFQDN.
+	if s.Runtime == nil {
+		return false
+	}
+	// NOTE(review): when an NBC with an AgentPoolProfile is present it decides the result
+	// and AKSNodeConfig is not consulted — confirm both are never set with different values.
+	if nbc := s.Runtime.NBC; nbc != nil && nbc.AgentPoolProfile != nil {
+		return nbc.AgentPoolProfile.ShouldEnableHostsPlugin()
+	}
+	// Scriptless path: the plugin only counts as enabled when localdns itself is enabled.
+	if cfg := s.Runtime.AKSNodeConfig; cfg != nil && cfg.LocalDnsProfile != nil {
+		return cfg.LocalDnsProfile.EnableLocalDns && cfg.LocalDnsProfile.EnableHostsPlugin
+	}
+	return false
+}
+
+// GetDefaultFQDNsForValidation returns a minimal set of FQDNs to validate in the default validation.
+// This mirrors the logic in GetCloudTargetEnv (pkg/agent/utils.go) and aks-hosts-setup.sh.
+// The result is always ordered: container registry, login endpoint, binaries mirror.
+func (s *Scenario) GetDefaultFQDNsForValidation() []string {
+	// Public-cloud endpoints are used unless the NBC location selects another cloud.
+	registry, login := "mcr.microsoft.com", "login.microsoftonline.com"
+	if s.Runtime != nil && s.Runtime.NBC != nil && s.Runtime.NBC.ContainerService != nil {
+		location := strings.ToLower(s.Runtime.NBC.ContainerService.Location)
+		switch {
+		case strings.HasPrefix(location, "china"):
+			registry, login = "mcr.azure.cn", "login.partner.microsoftonline.cn"
+		case strings.HasPrefix(location, "usgov"), strings.HasPrefix(location, "usdod"):
+			// US Gov keeps the public container registry; only the login endpoint differs.
+			login = "login.microsoftonline.us"
+		}
+	}
+	// The binaries mirror is identical for every cloud handled here.
+	return []string{registry, login, "acs-mirror.azureedge.net"}
+}
+
+// GetContainerRegistryFQDN returns the container registry FQDN for the cloud environment
+// determined by the NBC's ContainerService.Location field. This mirrors the logic in
+// GetCloudTargetEnv (pkg/agent/utils.go) and aks-hosts-setup.sh.
+func (s *Scenario) GetContainerRegistryFQDN() string { + if s.Runtime != nil && s.Runtime.NBC != nil && s.Runtime.NBC.ContainerService != nil { + location := strings.ToLower(s.Runtime.NBC.ContainerService.Location) + if strings.HasPrefix(location, "china") { + return "mcr.azure.cn" + } + } + // Default to public cloud container registry (also used by Fairfax/US Gov) + return "mcr.microsoft.com" +} diff --git a/e2e/validation.go b/e2e/validation.go index f9b7885487f..4e60316b337 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -71,10 +71,28 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { ValidateKubeletNodeIP(ctx, s) } + // localdns is not supported on FIPS VHDs, older VHDs (privatekube, airgapped, scriptless), network isolated VHDs, and AzureLinux OSGuard. // localdns is not supported on scriptless, privatekube and VHDUbuntu2204Gen2ContainerdNetworkIsolatedK8sNotCached. if !s.VHD.UnsupportedLocalDns { ValidateLocalDNSService(ctx, s, "enabled") ValidateLocalDNSResolution(ctx, s, "169.254.10.10") + + // Validate hosts plugin validators only if hosts plugin is explicitly enabled + if s.IsHostsPluginEnabled() { + // Validate hosts file contains resolved IPs for critical FQDNs (IPs resolved dynamically). + // This validator triggers aks-hosts-setup.service to run, so it must come before + // ValidateAKSHostsSetupService which checks the service result. + ValidateLocalDNSHostsFile(ctx, s, s.GetDefaultFQDNsForValidation()) + // Validate aks-hosts-setup service ran successfully and timer is active + ValidateAKSHostsSetupService(ctx, s) + // Restart localdns so it regenerates its corefile with the hosts plugin variant. + // On first boot, localdns and aks-hosts-setup start concurrently — localdns often + // starts before the hosts file is populated, so it uses the base corefile (no hosts plugin). + // Restarting after the hosts file is confirmed populated lets localdns pick the right corefile. 
+ execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo systemctl restart localdns", 0, "failed to restart localdns") + // Validate hosts plugin serves responses authoritatively (AA flag + IP match) + ValidateLocalDNSHostsPluginBypass(ctx, s) + } } ValidateInspektorGadget(ctx, s) diff --git a/e2e/validators.go b/e2e/validators.go index d0fae6f3ca0..58c9fefc80f 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1455,6 +1455,266 @@ func ValidateLocalDNSResolution(ctx context.Context, s *Scenario, server string) assert.Contains(s.T, execResult.stdout, fmt.Sprintf("SERVER: %s", server)) } +// ValidateLocalDNSHostsFile checks that /etc/localdns/hosts contains at least one IPv4 entry for each critical FQDN. +// This validation approach avoids flakiness with CDN/frontdoor-backed FQDNs (like mcr.microsoft.com) whose A records +// can rotate between queries. We verify presence, not exact IP matching. +func ValidateLocalDNSHostsFile(ctx context.Context, s *Scenario, fqdns []string) { + s.T.Helper() + + // Force a fresh refresh of the hosts file before validating so the snapshot + // is consistent with the DNS answers we are about to resolve. Without this, + // the 15-minute timer gap can cause flaky mismatches due to DNS load-balancing + // or record rotation. 
+ execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo systemctl start aks-hosts-setup.service", + 0, "failed to refresh hosts file via aks-hosts-setup.service") + + // Build script that resolves each FQDN and checks it exists in hosts file + script := fmt.Sprintf(`set -euo pipefail +hosts_file="/etc/localdns/hosts" +fqdns=(%s) + +echo "=== Validating /etc/localdns/hosts contains resolved IPs for critical FQDNs ===" +echo "" +echo "Current hosts file contents:" +cat "$hosts_file" +echo "" + +errors=0 +for fqdn in "${fqdns[@]}"; do + echo "Checking FQDN: $fqdn" + + # Validate that there is at least one IPv4 entry for this FQDN in the hosts file, + # rather than requiring every currently resolved IP to be present. This avoids + # flakiness for CDN/frontdoor-backed FQDNs whose A records can rotate. + if grep -Eq '^[0-9]{1,3}(\.[0-9]{1,3}){3}[[:space:]]+'"$fqdn"'([[:space:]]|$)' "$hosts_file"; then + echo " OK: Found at least one IPv4 entry for $fqdn in hosts file" + else + echo " ERROR: No IPv4 entry found for $fqdn in hosts file" + errors=$((errors + 1)) + fi +done + +echo "" +if [ $errors -gt 0 ]; then + echo "FAILED: $errors FQDNs missing from hosts file" + exit 1 +else + echo "SUCCESS: All critical FQDNs have at least one IPv4 entry in hosts file" + exit 0 +fi +`, quoteFQDNsForBash(fqdns)) + + execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, + "hosts file should contain resolved IPs for critical FQDNs") +} + +// quoteFQDNsForBash converts a slice of FQDNs to a bash array string +func quoteFQDNsForBash(fqdns []string) string { + return strings.Join(lo.Map(fqdns, func(fqdn string, _ int) string { + return fmt.Sprintf("%q", fqdn) + }), " ") +} + +// ValidateAKSHostsSetupService checks that aks-hosts-setup.service ran successfully +// and the aks-hosts-setup.timer is active to ensure periodic refresh of /etc/localdns/hosts. 
+func ValidateAKSHostsSetupService(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	// Check that aks-hosts-setup.service (oneshot) completed without failure
+	ValidateSystemdUnitIsNotFailed(ctx, s, "aks-hosts-setup.service")
+
+	// Check that aks-hosts-setup.timer is active for periodic refresh
+	ValidateSystemdUnitIsRunning(ctx, s, "aks-hosts-setup.timer")
+}
+
+// ValidateLocalDNSHostsPluginBypass verifies that localdns serves FQDNs from /etc/localdns/hosts
+// authoritatively via the CoreDNS hosts plugin. It checks:
+// 1. The node has the kubernetes.azure.com/localdns-hosts-plugin=enabled annotation
+// 2. The Corefile at /opt/azure/containers/localdns/updated.localdns.corefile contains the hosts plugin directive
+// 3. dig against localdns returns the AA (Authoritative Answer) flag, proving the response
+//    came from the hosts plugin rather than being forwarded upstream
+// 4. The IPs returned by dig match the entries in /etc/localdns/hosts for the same FQDN
+func ValidateLocalDNSHostsPluginBypass(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	// Step 1: Verify the node has the hosts plugin annotation
+	// The annotation is set asynchronously by localdns.sh (background job waiting for kubeconfig + node registration)
+	// Poll for up to 5 minutes with exponential backoff to avoid flaky failures
+	s.T.Log("Polling for node annotation kubernetes.azure.com/localdns-hosts-plugin=enabled...")
+	annotationKey := "kubernetes.azure.com/localdns-hosts-plugin"
+
+	var node *corev1.Node
+	var err error
+	var annotationValue string
+	var exists bool
+	maxAttempts := 33 // ~5 minutes: sleeps of 1+2+4+8=15s after the first 4 attempts, then 28 sleeps capped at 10s = ~295s
+
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		node, err = s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, s.Runtime.VM.KubeName, metav1.GetOptions{})
+		require.NoError(s.T, err, "failed to get node %q", s.Runtime.VM.KubeName)
+
+		annotationValue, exists = node.Annotations[annotationKey]
+		if exists && annotationValue == "enabled" {
+			s.T.Logf("✓ Node annotation %s=%s found after %d attempts", annotationKey, annotationValue, attempt)
+			break
+		}
+
+		if attempt == maxAttempts {
+			s.T.Fatalf("Timeout: node %q annotation %q not found or not 'enabled' after %d attempts (~5 minutes). Current value: exists=%v, value=%q",
+				s.Runtime.VM.KubeName, annotationKey, maxAttempts, exists, annotationValue)
+		}
+
+		// Exponential backoff: 1s, 2s, 4s, 8s, max 10s
+		sleepDuration := time.Duration(1<<uint(attempt-1)) * time.Second
+		if sleepDuration > 10*time.Second {
+			sleepDuration = 10 * time.Second
+		}
+		s.T.Logf("Attempt %d/%d: annotation not ready (exists=%v, value=%q), retrying in %v...", attempt, maxAttempts, exists, annotationValue, sleepDuration)
+		time.Sleep(sleepDuration)
+	}
+
+	// Step 2: Verify the Corefile has the hosts plugin configured
+	s.T.Log("Verifying Corefile contains hosts plugin configuration...")
+	corefileCheckScript := `set -euo pipefail
+corefile="/opt/azure/containers/localdns/updated.localdns.corefile"
+
+echo "=== Verifying Corefile configuration ==="
+echo "Checking if $corefile exists..."
+if [ ! -f "$corefile" ]; then
+    echo "ERROR: Corefile $corefile does not exist"
+    exit 1
+fi
+echo "✓ Corefile exists"
+echo ""
+
+echo "Checking if Corefile contains hosts plugin directive..."
+if ! grep -q "hosts /etc/localdns/hosts" "$corefile"; then
+    echo "ERROR: Corefile does not contain 'hosts /etc/localdns/hosts' directive"
+    echo ""
+    echo "Corefile contents:"
+    cat "$corefile"
+    exit 1
+fi
+echo "✓ Found 'hosts /etc/localdns/hosts' directive in Corefile"
+echo ""
+
+echo "Checking hosts plugin has fallthrough directive..."
+if ! grep -A1 "hosts /etc/localdns/hosts" "$corefile" | grep -q "fallthrough"; then
+    echo "WARNING: hosts plugin may be missing 'fallthrough' directive"
+fi
+echo "✓ hosts plugin configuration looks correct"
+echo ""
+
+echo "Corefile contents:"
+cat "$corefile"
+echo ""
+echo "=== Corefile validation successful ==="
+`
+
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, corefileCheckScript, 0,
+		"Corefile should contain hosts plugin configuration")
+
+	// Step 3: Test that localdns resolves real FQDNs from /etc/localdns/hosts
+	// This validates the hosts plugin is working by checking:
+	// 1. dig output contains the AA (Authoritative Answer) flag — proving the response came
+	//    from the hosts plugin, not forwarded upstream. This is stronger than "recursion not
+	//    available" because AA definitively means CoreDNS served the answer from a local source.
+	// 2. The IPs returned by dig match the entries in /etc/localdns/hosts for the same FQDN.
+	//
+	// We use packages.microsoft.com because it's a real FQDN that aks-hosts-setup.service populates.
+	// This avoids race conditions with the aks-hosts-setup.timer overwriting fake test entries.
+	testFQDN := "packages.microsoft.com"
+	s.T.Logf("Testing hosts plugin resolves %s from /etc/localdns/hosts with AA flag", testFQDN)
+
+	script := fmt.Sprintf(`set -euo pipefail
+test_fqdn=%q
+hosts_file="/etc/localdns/hosts"
+
+echo "=== Testing localdns hosts plugin functionality ==="
+echo "Testing FQDN: $test_fqdn"
+echo ""
+
+# Step 1: Get the expected IPs from /etc/localdns/hosts
+echo "Reading expected IPs from $hosts_file..."
+if [ ! -f "$hosts_file" ]; then
+    echo "ERROR: Hosts file $hosts_file does not exist"
+    exit 1
+fi
+
+# Extract IPv4 addresses for the test FQDN from hosts file
+expected_ips=$(grep -E "^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[[:space:]]+$test_fqdn" "$hosts_file" | awk '{print $1}' | sort)
+if [ -z "$expected_ips" ]; then
+    echo "ERROR: No IPv4 entries found for $test_fqdn in $hosts_file"
+    echo "Hosts file contents:"
+    sudo cat "$hosts_file"
+    exit 1
+fi
+
+echo "Expected IPs from hosts file:"
+echo "$expected_ips"
+echo ""
+
+# Step 2: Query localdns with dig and capture full output for flag inspection
+echo "Querying localdns for $test_fqdn at 169.254.10.10..."
+dig_output=$(dig "$test_fqdn" @169.254.10.10 -t A +timeout=5 +tries=2 2>&1)
+echo "Full dig output:"
+echo "$dig_output"
+echo ""
+
+# Step 3: Check for AA (Authoritative Answer) flag in dig output
+# The flags line looks like: ";; flags: qr aa rd; QUERY: 1, ANSWER: N, ..."
+# The AA flag proves the response was served authoritatively by the hosts plugin,
+# not forwarded to an upstream resolver.
+echo "Checking for AA (Authoritative Answer) flag in dig response..."
+flags_line=$(echo "$dig_output" | grep -E "^;; flags:")
+if [ -z "$flags_line" ]; then
+    echo "ERROR: No flags line found in dig output"
+    exit 1
+fi
+echo "Flags line: $flags_line"
+
+if ! echo "$flags_line" | grep -qw "aa"; then
+    echo "ERROR: AA (Authoritative Answer) flag not present in dig response"
+    echo "This indicates localdns forwarded the query upstream instead of serving it from the hosts plugin"
+    exit 1
+fi
+echo "✓ AA flag present — response served authoritatively by hosts plugin"
+echo ""
+
+# Step 4: Extract resolved IPs from dig ANSWER section and compare with hosts file
+resolved_ips=$(echo "$dig_output" | grep -E "^${test_fqdn}\..*IN[[:space:]]+A[[:space:]]" | awk '{print $NF}' | sort)
+if [ -z "$resolved_ips" ]; then
+    echo "ERROR: No A records returned from dig query"
+    exit 1
+fi
+
+echo "Resolved IPs from dig:"
+echo "$resolved_ips"
+echo ""
+
+echo "Comparing resolved IPs with hosts file entries..."
+if [ "$expected_ips" != "$resolved_ips" ]; then
+    echo "ERROR: Resolved IPs do not match hosts file entries"
+    echo "Expected (from hosts file):"
+    echo "$expected_ips"
+    echo "Got (from dig):"
+    echo "$resolved_ips"
+    exit 1
+fi
+echo "✓ Resolved IPs match hosts file entries"
+echo ""
+
+echo "=== SUCCESS ==="
+echo "The localdns hosts plugin is working correctly:"
+echo " 1. dig response contains AA flag (served authoritatively by hosts plugin)"
+echo " 2. Resolved IPs match /etc/localdns/hosts entries"
+`, testFQDN)
+
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0,
+		"localdns should resolve FQDN from hosts file with AA flag and matching IPs")
+}
+
 // ValidateJournalctlOutput checks if specific content exists in the systemd service logs
 func ValidateJournalctlOutput(ctx context.Context, s *Scenario, serviceName string, expectedContent string) {
 	s.T.Helper()
diff --git a/e2e/vmss.go b/e2e/vmss.go
index 50cb0a1141d..64c6ca295d7 100644
--- a/e2e/vmss.go
+++ b/e2e/vmss.go
@@ -209,6 +209,7 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine
 	require.NoError(s.T, err)
 	cse = nodeBootstrapping.CSE
 	customData = nodeBootstrapping.CustomData
+
 	if len(s.Config.CustomDataWriteFiles) > 0 {
 		customData, err = injectWriteFilesEntriesToCustomData(customData, s.Config.CustomDataWriteFiles)
 		require.NoError(s.T, err, "failed to inject customData write_files entries")
diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.service b/parts/linux/cloud-init/artifacts/aks-hosts-setup.service
new file mode 100644
index 00000000000..705cf3a2f41
--- /dev/null
+++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Populate /etc/localdns/hosts with critical AKS FQDN addresses
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+TimeoutStartSec=60
+EnvironmentFile=-/etc/localdns/cloud-env
+ExecStart=/opt/azure/containers/aks-hosts-setup.sh
+
+[Install]
+WantedBy=multi-user.target
diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh b/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh
new file mode 100644
index 00000000000..c2349b8f6f2
--- /dev/null
+++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+set -euo pipefail
+
+# aks-hosts-setup.sh
+# Resolves A and AAAA records for critical AKS FQDNs and populates /etc/localdns/hosts.
+# TARGET_CLOUD is set by CSE (cse_cmd.sh) and persisted via /etc/localdns/cloud-env +# as a systemd EnvironmentFile so it's available on both initial and timer-triggered runs. + +HOSTS_FILE="/etc/localdns/hosts" + +# Ensure the directory exists +mkdir -p "$(dirname "$HOSTS_FILE")" + +# Use TARGET_CLOUD directly. It's available from: +# 1. CSE environment (initial run from enableAKSHostsSetup) +# 2. Systemd EnvironmentFile (timer-triggered runs via aks-hosts-setup.service) +# If TARGET_CLOUD is not set, exit immediately - we must not guess the cloud environment +# as this could cache incorrect DNS entries in the hosts file. +if [ -z "${TARGET_CLOUD:-}" ]; then + echo "ERROR: TARGET_CLOUD is not set. Cannot determine which FQDNs to resolve." + echo "This likely means the cloud environment file is missing or CSE did not set TARGET_CLOUD." + echo "Exiting without modifying hosts file to avoid caching incorrect DNS entries." + exit 1 +fi +local_cloud="${TARGET_CLOUD}" + +# Select critical FQDNs based on the cloud environment. +# Each cloud has its own service endpoints for container registry, identity, ARM, and packages. +# This mirrors the cloud detection in GetCloudTargetEnv (pkg/agent/datamodel/sig_config.go). + +# FQDNs common to all clouds. +COMMON_FQDNS=( + "packages.microsoft.com" # Microsoft packages +) + +# Cloud-specific FQDNs. 
+case "${local_cloud}" in + AzureChinaCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.azure.cn" # Container registry (China)(New) + "mcr.azk8s.cn" # Container registry (China)(Old, migrating from this to mcr.azure.cn) + "login.partner.microsoftonline.cn" # Azure AD (China) + "management.chinacloudapi.cn" # ARM (China) + ) + ;; + AzureUSGovernmentCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.microsoft.com" # Container registry + "login.microsoftonline.us" # Azure AD (US Gov) + "management.usgovcloudapi.net" # ARM (US Gov) + "packages.aks.azure.com" # AKS packages + ) + ;; + AzurePublicCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.microsoft.com" # Container registry + "login.microsoftonline.com" # Azure AD / Entra ID + "management.azure.com" # ARM + "packages.aks.azure.com" # AKS packages + ) + ;; + *) + # Unsupported cloud environment - exit with error + echo "ERROR: The following cloud is not supported: ${local_cloud}" + echo "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + exit 1 + ;; +esac + +# Combine common + cloud-specific FQDNs. +CRITICAL_FQDNS=("${COMMON_FQDNS[@]}" "${CLOUD_FQDNS[@]}") + +echo "Detected cloud environment: ${local_cloud}" + +# Function to resolve IPv4 addresses for a domain +# Filters output to only include valid IPv4 addresses (rejects NXDOMAIN, SERVFAIL, hostnames, etc.) +resolve_ipv4() { + local domain="$1" + # dig +short returns one IP per line, no parsing needed + local output + output=$(timeout 3 dig +short -t A "${domain}" 2>/dev/null) || return 0 + # Validate IPv4 format with octet range 0-255 + echo "${output}" | grep -E '^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$' | while IFS='.' 
read -r a b c d; do + if [ "$a" -le 255 ] && [ "$b" -le 255 ] && [ "$c" -le 255 ] && [ "$d" -le 255 ]; then + echo "${a}.${b}.${c}.${d}" + fi + done || return 0 +} + +# Function to resolve IPv6 addresses for a domain +# Filters output to only include valid IPv6 addresses (rejects NXDOMAIN, SERVFAIL, hostnames, etc.) +resolve_ipv6() { + local domain="$1" + # dig +short returns one IP per line, no parsing needed + local output + output=$(timeout 3 dig +short -t AAAA "${domain}" 2>/dev/null) || return 0 + # Three checks: only hex+colon chars (min 3), at least two colons, at least one hex digit + # This rejects malformed strings like ":::::::" (no hex), "1:2" (one colon), ":ff" (one colon) + echo "${output}" | grep -E '^[0-9a-fA-F:]{3,}$' | grep ':.*:' | grep '[0-9a-fA-F]' || return 0 +} + +echo "Starting AKS critical FQDN hosts resolution at $(date)" + +# Track if we resolved at least one address +RESOLVED_ANY=false + +# Start building the hosts file content +HOSTS_CONTENT="# AKS critical FQDN addresses resolved at $(date) +# This file is automatically generated by aks-hosts-setup.service +" + +# Resolve each FQDN +for DOMAIN in "${CRITICAL_FQDNS[@]}"; do + echo "Resolving addresses for ${DOMAIN}..." 
+ + # Get IPv4 and IPv6 addresses using helper functions + IPV4_ADDRS=$(resolve_ipv4 "${DOMAIN}") + IPV6_ADDRS=$(resolve_ipv6 "${DOMAIN}") + + # Check if we got any results for this domain + if [ -z "${IPV4_ADDRS}" ] && [ -z "${IPV6_ADDRS}" ]; then + echo " WARNING: No IP addresses resolved for ${DOMAIN}" + continue + fi + + RESOLVED_ANY=true + HOSTS_CONTENT+=" +# ${DOMAIN}" + + if [ -n "${IPV4_ADDRS}" ]; then + for addr in ${IPV4_ADDRS}; do + HOSTS_CONTENT+=" +${addr} ${DOMAIN}" + done + fi + + if [ -n "${IPV6_ADDRS}" ]; then + for addr in ${IPV6_ADDRS}; do + HOSTS_CONTENT+=" +${addr} ${DOMAIN}" + done + fi +done + +# Check if we resolved at least one domain +if [ "${RESOLVED_ANY}" != "true" ]; then + echo "WARNING: No IP addresses resolved for any domain at $(date)" + echo "This is likely a temporary DNS issue. Timer will retry later." + # Keep existing hosts file intact and exit successfully so systemd doesn't mark unit as failed + exit 0 +fi + +# Write the hosts file atomically: write to a temp file in the same directory, +# validate it, then rename it over the target. rename(2) on the same filesystem +# is atomic, so CoreDNS (or any other reader) never sees invalid or truncated data. +echo "Writing addresses to ${HOSTS_FILE}..." +HOSTS_TMP="${HOSTS_FILE}.tmp.$$" + +# Write content to temp file with explicit error checking +if ! printf '%s\n' "${HOSTS_CONTENT}" > "${HOSTS_TMP}"; then + echo "ERROR: Failed to write to temporary file ${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +# Set permissions with explicit error checking +if ! chmod 0644 "${HOSTS_TMP}"; then + echo "ERROR: Failed to chmod temporary file ${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +# Validate temp file BEFORE moving into place to ensure we never publish invalid data +# Verify the file was written and has content +if [ ! 
-s "${HOSTS_TMP}" ]; then + echo "ERROR: Temporary hosts file ${HOSTS_TMP} is empty or does not exist after write" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +# Verify that every non-comment, non-empty line has the format: +# This ensures we don't have any lines with FQDN but missing IP address +echo "Validating hosts file entries format..." +INVALID_LINES=() +VALID_ENTRIES=0 +while IFS= read -r line; do + # Skip comments and empty lines + [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "$line" ]] && continue + + # Check if line has at least two fields (IP and FQDN) + ip=$(echo "$line" | awk '{print $1}') + fqdn=$(echo "$line" | awk '{print $2}') + + # Critical check: ensure we have both IP and FQDN (no empty IP mappings) + if [ -z "$ip" ] || [ -z "$fqdn" ]; then + echo "ERROR: Invalid entry found - missing IP or FQDN: '$line'" + INVALID_LINES+=("$line") + continue + fi + + # Validate IP format (IPv4 or IPv6) + if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # Valid IPv4 + VALID_ENTRIES=$((VALID_ENTRIES + 1)) + elif [[ "$ip" =~ : ]]; then + # Valid IPv6 (contains colon) + VALID_ENTRIES=$((VALID_ENTRIES + 1)) + else + echo "ERROR: Invalid IP format: '$ip' in line: '$line'" + INVALID_LINES+=("$line") + fi +done < "${HOSTS_TMP}" + +if [ ${#INVALID_LINES[@]} -gt 0 ]; then + echo "ERROR: Found ${#INVALID_LINES[@]} invalid entries in temporary hosts file" + echo "Invalid entries:" + printf '%s\n' "${INVALID_LINES[@]}" + echo "This indicates FQDN to empty IP mappings or malformed entries" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +if [ $VALID_ENTRIES -eq 0 ]; then + echo "ERROR: No valid IP address mappings found in temporary hosts file" + echo "File content:" + cat "${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +echo "✓ All entries in temporary hosts file are valid (IP FQDN format)" +echo "Found ${VALID_ENTRIES} valid IP address mappings" + +# Atomic rename with explicit error checking - only done after validation passes +if ! 
mv "${HOSTS_TMP}" "${HOSTS_FILE}"; then + echo "ERROR: Failed to move temporary file to ${HOSTS_FILE}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +echo "AKS critical FQDN hosts resolution completed at $(date)" diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer b/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer new file mode 100644 index 00000000000..281880160f9 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer @@ -0,0 +1,13 @@ +[Unit] +Description=Run AKS hosts setup periodically + +[Timer] +# Run immediately on boot +OnBootSec=0 +# Run 15 minutes after the last activation (AKS critical FQDN IPs don't change frequently) +OnUnitActiveSec=15min +# Timer accuracy (how much systemd can delay) +AccuracySec=1min + +[Install] +WantedBy=timers.target diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index bc48088a3b8..c42a8f4cc70 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -181,9 +181,12 @@ MCR_REPOSITORY_BASE="{{GetMCRRepositoryBase}}" ENABLE_IMDS_RESTRICTION="{{EnableIMDSRestriction}}" INSERT_IMDS_RESTRICTION_RULE_TO_MANGLE_TABLE="{{InsertIMDSRestrictionRuleToMangleTable}}" SHOULD_ENABLE_LOCALDNS="{{ShouldEnableLocalDNS}}" +SHOULD_ENABLE_HOSTS_PLUGIN="{{ShouldEnableHostsPlugin}}" LOCALDNS_CPU_LIMIT="{{GetLocalDNSCPULimitInPercentage}}" LOCALDNS_MEMORY_LIMIT="{{GetLocalDNSMemoryLimitInMB}}" LOCALDNS_GENERATED_COREFILE="{{GetGeneratedLocalDNSCoreFile}}" +LOCALDNS_COREFILE_BASE="{{GetGeneratedLocalDNSCoreFileBase}}" +LOCALDNS_COREFILE_EXPERIMENTAL="{{GetGeneratedLocalDNSCoreFileExperimental}}" PRE_PROVISION_ONLY="{{GetPreProvisionOnly}}" CSE_TIMEOUT="{{GetCSETimeout}}" /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index ca6629b5b40..5c0309b9bbb 100755 
--- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1243,20 +1243,49 @@ LOCALDNS_CORE_FILE="/opt/azure/containers/localdns/localdns.corefile" LOCALDNS_SLICE_FILE="/etc/systemd/system/localdns.slice" # This function is called from cse_main.sh. # It creates the localdns corefile and slicefile, then enables and starts localdns. -# In this function, generated base64 encoded localdns corefile is decoded and written to the corefile path. -# This function also creates the localdns slice file with memory and cpu limits, that will be used by localdns systemd unit. +# Both corefile variants are read from globals set in cse_cmd.sh: +# LOCALDNS_COREFILE_BASE — standard corefile without experimental plugins +# LOCALDNS_COREFILE_EXPERIMENTAL — corefile with experimental plugins (e.g. hosts plugin) +# The base variant is written as the initial active corefile. +# Both variants are saved to /etc/localdns/environment so localdns.sh +# can dynamically switch between them on restart. generateLocalDNSFiles() { mkdir -p "$(dirname "${LOCALDNS_CORE_FILE}")" touch "${LOCALDNS_CORE_FILE}" chmod 0644 "${LOCALDNS_CORE_FILE}" - echo "${LOCALDNS_GENERATED_COREFILE}" | base64 -d > "${LOCALDNS_CORE_FILE}" || exit $ERR_LOCALDNS_FAIL + + # Determine the base corefile to use as the initial active corefile. + # LOCALDNS_COREFILE_BASE is set by new CSE; fall back to LOCALDNS_GENERATED_COREFILE + # for backward compatibility when this VHD runs with an older CSE that only sets + # LOCALDNS_GENERATED_COREFILE. + local corefile_base="${LOCALDNS_COREFILE_BASE:-${LOCALDNS_GENERATED_COREFILE:-}}" + if [ -z "${corefile_base}" ]; then + echo "Error: neither LOCALDNS_COREFILE_BASE nor LOCALDNS_GENERATED_COREFILE is set" + exit $ERR_LOCALDNS_FAIL + fi + + # Start with the base corefile as the initial active corefile. 
+ # The experimental variant will be selected dynamically by localdns.sh + # once /etc/localdns/hosts has been populated by aks-hosts-setup. + base64 -d <<< "${corefile_base}" > "${LOCALDNS_CORE_FILE}" || exit $ERR_LOCALDNS_FAIL + + # Log whether the generated corefile includes hosts plugin + if grep -q "hosts /etc/localdns/hosts" "${LOCALDNS_CORE_FILE}"; then + echo "Generated corefile at ${LOCALDNS_CORE_FILE} INCLUDES hosts plugin" + else + echo "Generated corefile at ${LOCALDNS_CORE_FILE} DOES NOT include hosts plugin" + fi # Create environment file for corefile regeneration. # This file will be referenced by localdns.service using EnvironmentFile directive. + # Save BOTH corefile variants so localdns can dynamically choose on each restart. LOCALDNS_ENV_FILE="/etc/localdns/environment" mkdir -p "$(dirname "${LOCALDNS_ENV_FILE}")" cat > "${LOCALDNS_ENV_FILE}" </dev/null && echo 'WITH hosts plugin' || echo 'WITHOUT hosts plugin')" echo "localdns should be enabled." systemctlEnableAndStart localdns 30 || exit $ERR_LOCALDNS_FAIL echo "Enable localdns succeeded." } +# This function enables and starts the aks-hosts-setup timer. +# The timer periodically resolves critical AKS FQDN DNS records and populates /etc/localdns/hosts. +# Called from enableLocalDNS() when SHOULD_ENABLE_HOSTS_PLUGIN is true. +enableAKSHostsSetup() { + # Best-effort setup: log errors but never fail. + # The corefile will fall back to the no-hosts variant if hosts file is empty. 
+ # Allow overriding paths for testing (via environment variables) + local hosts_file="${AKS_HOSTS_FILE:-/etc/localdns/hosts}" + local hosts_setup_script="${AKS_HOSTS_SETUP_SCRIPT:-/opt/azure/containers/aks-hosts-setup.sh}" + local hosts_setup_service="${AKS_HOSTS_SETUP_SERVICE:-/etc/systemd/system/aks-hosts-setup.service}" + local hosts_setup_timer="${AKS_HOSTS_SETUP_TIMER:-/etc/systemd/system/aks-hosts-setup.timer}" + local cloud_env_file="${AKS_CLOUD_ENV_FILE:-/etc/localdns/cloud-env}" + + # Guard: verify required artifacts exist on this VHD. + # Older VHDs (or certain build modes) may not include them. + if [ ! -f "${hosts_setup_script}" ]; then + echo "Warning: ${hosts_setup_script} not found on this VHD, skipping aks-hosts-setup" + return + fi + if [ ! -x "${hosts_setup_script}" ]; then + echo "Warning: ${hosts_setup_script} is not executable, skipping aks-hosts-setup" + return + fi + if [ ! -f "${hosts_setup_service}" ]; then + echo "Warning: ${hosts_setup_service} not found on this VHD, skipping aks-hosts-setup" + return + fi + if [ ! -f "${hosts_setup_timer}" ]; then + echo "Warning: ${hosts_setup_timer} not found on this VHD, skipping aks-hosts-setup" + return + fi + + # Write the cloud environment as a systemd EnvironmentFile so aks-hosts-setup.sh + # can use $TARGET_CLOUD directly — both when called from CSE (already in env) and + # when triggered by the systemd timer (injected via EnvironmentFile= in the .service unit). + if [ -z "${TARGET_CLOUD:-}" ]; then + echo "WARNING: TARGET_CLOUD is not set. Cannot run aks-hosts-setup without knowing cloud environment." + echo "aks-hosts-setup requires TARGET_CLOUD to determine which FQDNs to resolve." + echo "Skipping aks-hosts-setup. Corefile will fall back to version without hosts plugin." 
+ return + fi + + # Validate that TARGET_CLOUD is one of the supported clouds + # This must match the case statement in aks-hosts-setup.sh + case "${TARGET_CLOUD}" in + AzurePublicCloud|AzureChinaCloud|AzureUSGovernmentCloud) + # Supported cloud, continue + ;; + *) + echo "WARNING: The following cloud is not supported by aks-hosts-setup: ${TARGET_CLOUD}" + echo "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + echo "Skipping aks-hosts-setup. Corefile will fall back to version without hosts plugin." + return + ;; + esac + + echo "Setting TARGET_CLOUD=${TARGET_CLOUD} for aks-hosts-setup" + mkdir -p "$(dirname "${cloud_env_file}")" + echo "TARGET_CLOUD=${TARGET_CLOUD}" > "${cloud_env_file}" + chmod 0644 "${cloud_env_file}" + + # Create an empty hosts file so the localdns hosts plugin can start watching it + # immediately. The file will be populated by aks-hosts-setup timer asynchronously. + mkdir -p "$(dirname "${hosts_file}")" + touch "${hosts_file}" + chmod 0644 "${hosts_file}" + + # Enable the timer for periodic refresh (every 15 minutes) + # This will update the hosts file with fresh IPs from live DNS + echo "Enabling aks-hosts-setup timer..." + if systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30; then + echo "aks-hosts-setup timer enabled successfully." 
+ else + echo "Warning: Failed to enable aks-hosts-setup timer" + fi +} + configureManagedGPUExperience() { if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then return diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 0225bfd0944..67e9852eeff 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -294,6 +294,11 @@ EOF logs_to_events "AKS.CSE.ensureContainerd.ensureArtifactStreaming" ensureArtifactStreaming || exit $ERR_ARTIFACT_STREAMING_INSTALL fi + # Enable localdns to handle node and pod DNS traffic via a local CoreDNS instance. + # Startup ordering: localdns starts immediately with the base (no-hosts) corefile. + # If aks-hosts-setup timer is also enabled, it runs async and populates /etc/localdns/hosts. + # On the next localdns restart, select_localdns_corefile() upgrades to the hosts-plugin + # corefile variant if the hosts file has valid IP mappings. if [ "${SHOULD_ENABLE_LOCALDNS}" = "true" ]; then logs_to_events "AKS.CSE.enableLocalDNS" enableLocalDNS || exit $ERR_LOCALDNS_FAIL fi diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index f05e8c3837c..38b5fe47431 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -127,15 +127,18 @@ verify_localdns_binary() { # Regenerate the localdns corefile from base64 encoded content. # This is used when the corefile goes missing. regenerate_localdns_corefile() { - if [ -z "${LOCALDNS_BASE64_ENCODED_COREFILE:-}" ]; then - echo "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + local corefile_to_use + corefile_to_use=$(select_localdns_corefile) || true + if [ -z "${corefile_to_use}" ]; then + echo "No corefile selected. Cannot regenerate corefile." 
return 1 fi + echo "Regenerating localdns corefile at ${LOCALDNS_CORE_FILE}" mkdir -p "$(dirname "${LOCALDNS_CORE_FILE}")" # Decode base64 corefile content and write to corefile. - if ! echo "${LOCALDNS_BASE64_ENCODED_COREFILE}" | base64 -d > "${LOCALDNS_CORE_FILE}"; then + if ! echo "${corefile_to_use}" | base64 -d > "${LOCALDNS_CORE_FILE}"; then echo "Failed to decode and write corefile." return 1 fi @@ -368,6 +371,104 @@ wait_for_localdns_ready() { return 0 } +# Set node annotation to indicate hosts plugin is in use if the hosts file has contents. +annotate_node_with_hosts_plugin_status() { + # Check if the running localdns corefile actually contains the hosts plugin block. + # This is the ground truth - we check the actual corefile being used by the service, + # not just what was selected during CSE, in case the file was modified or regenerated. + local corefile_path="${UPDATED_LOCALDNS_CORE_FILE:-/opt/azure/containers/localdns/updated.localdns.corefile}" + + if [ ! -f "${corefile_path}" ]; then + echo "Localdns corefile not found at ${corefile_path}, skipping annotation." + return 0 + fi + + # Check if the corefile contains the hosts plugin block + if ! grep -q "hosts /etc/localdns/hosts" "${corefile_path}"; then + echo "Localdns corefile does not contain hosts plugin block, skipping annotation." + return 0 + fi + + # Additionally verify that the hosts file exists and has content + # Allow overriding for testing via LOCALDNS_HOSTS_FILE environment variable + local hosts_file="${LOCALDNS_HOSTS_FILE:-/etc/localdns/hosts}" + if [ ! -f "${hosts_file}" ]; then + echo "Hosts file does not exist at ${hosts_file}, skipping annotation despite corefile having hosts plugin." + return 0 + fi + + if ! grep -qE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file}"; then + echo "Hosts file exists but has no IP mappings, skipping annotation." 
+ return 0 + fi + + echo "Localdns is using hosts plugin and hosts file has $(grep -cE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file}" 2>/dev/null || echo 0) entries." + + # Only proceed if we have the necessary kubectl binary and configuration + if [ ! -x /opt/bin/kubectl ]; then + echo "kubectl binary not found at /opt/bin/kubectl, skipping annotation." + return 0 + fi + + local kubeconfig="${KUBECONFIG:-/var/lib/kubelet/kubeconfig}" + # Wait for kubelet to finish TLS bootstrapping and create the kubeconfig file + # This is necessary because localdns starts in basePrep(), before kubelet starts in nodePrep() + local wait_count=0 + local max_wait="${KUBECONFIG_WAIT_ATTEMPTS:-60}" # Default: wait up to 3 minutes (60 * 3 seconds), but configurable for testing + while [ ! -f "${kubeconfig}" ]; do + if [ $wait_count -ge $max_wait ]; then + echo "Timeout waiting for kubeconfig at ${kubeconfig} after ${max_wait} attempts, skipping annotation." + return 0 + fi + echo "Waiting for TLS bootstrapping to complete (attempt $((wait_count + 1))/${max_wait})..." + sleep 3 + wait_count=$((wait_count + 1)) + done + echo "Kubeconfig found at ${kubeconfig}" + + # Get node name + local node_name + node_name=$(hostname) + if [ -z "${node_name}" ]; then + echo "Cannot get node name, skipping annotation." + return 0 + fi + + # Azure cloud provider assigns node name as the lower case of the hostname + node_name=$(echo "$node_name" | tr '[:upper:]' '[:lower:]') + + # Wait for node to be registered in the cluster + # The kubeconfig exists but the node might not be registered yet + echo "Waiting for node ${node_name} to be registered in the cluster..." 
+ local node_wait_count=0 + local max_node_wait="${NODE_REGISTRATION_WAIT_ATTEMPTS:-30}" # Default: wait up to 90 seconds (30 * 3 seconds) + while [ $node_wait_count -lt $max_node_wait ]; do + if /opt/bin/kubectl --kubeconfig "${kubeconfig}" get node "${node_name}" >/dev/null 2>&1; then + echo "Node ${node_name} is registered in the cluster." + break + fi + echo "Waiting for node registration (attempt $((node_wait_count + 1))/${max_node_wait})..." + sleep 3 + node_wait_count=$((node_wait_count + 1)) + done + + # Check if we timed out waiting for node registration + if [ $node_wait_count -ge $max_node_wait ]; then + echo "Timeout waiting for node ${node_name} to be registered after ${max_node_wait} attempts, skipping annotation." + return 0 + fi + + # Set annotation to indicate hosts plugin is in use + echo "Setting annotation to indicate hosts plugin is in use for node ${node_name}." + if /opt/bin/kubectl --kubeconfig "${kubeconfig}" annotate --overwrite node "${node_name}" kubernetes.azure.com/localdns-hosts-plugin=enabled; then + echo "Successfully set hosts plugin annotation." + else + echo "Warning: Failed to set hosts plugin annotation (this is non-fatal)." + fi + + return 0 +} + # Add iptables rules to skip conntrack for DNS traffic to localdns. add_iptable_rules_to_skip_conntrack_from_pods(){ # Check if the localdns interface already exists and delete it. @@ -510,6 +611,12 @@ cleanup_localdns_configs() { # Disable error handling so that we don't get into a recursive loop. set +e + # Kill orphaned background annotation process if still running. 
+ if [ -n "${ANNOTATION_PID:-}" ] && kill -0 "${ANNOTATION_PID}" 2>/dev/null; then + echo "Killing background annotation process (PID: ${ANNOTATION_PID})" + kill "${ANNOTATION_PID}" 2>/dev/null || true + fi + # Remove iptables rules and revert DNS configuration cleanup_iptables_and_dns || return 1 @@ -626,10 +733,74 @@ start_localdns_watchdog() { fi } +# Selects the appropriate corefile variant based on environment state. +# Reads globals from the localdns environment file: +# LOCALDNS_COREFILE_BASE — base corefile (no experimental plugins) +# LOCALDNS_COREFILE_EXPERIMENTAL — corefile with experimental plugins (e.g. hosts) +# SHOULD_ENABLE_HOSTS_PLUGIN — whether hosts plugin is enabled +# +# Selection logic: +# 1. If both BASE and EXPERIMENTAL are available, dynamically choose based on +# whether the hosts file has been populated by aks-hosts-setup. +# 2. If only BASE is available, use it (no dynamic selection). +# 3. If nothing is available, return failure (caller handles error). +# +# Echoes the selected base64-encoded corefile to stdout. +# All diagnostic messages go to stderr. +select_localdns_corefile() { + local hosts_file_path="${LOCALDNS_HOSTS_FILE:-/etc/localdns/hosts}" + + # Case 1: Both corefile variants available — dynamic selection + if [ -n "${LOCALDNS_COREFILE_EXPERIMENTAL:-}" ] && \ + [ -n "${LOCALDNS_COREFILE_BASE:-}" ]; then + echo "Both corefile variants available, selecting based on current state..." >&2 + echo "LocalDNS corefile selection: SHOULD_ENABLE_HOSTS_PLUGIN=${SHOULD_ENABLE_HOSTS_PLUGIN:-}" >&2 + + if [ "${SHOULD_ENABLE_HOSTS_PLUGIN:-}" = "true" ]; then + echo "Hosts plugin is enabled, checking ${hosts_file_path} for content..." 
>&2 + if [ -f "${hosts_file_path}" ] && \ + grep -qE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file_path}"; then + echo "Hosts file has IP mappings, using corefile with hosts plugin" >&2 + echo "${LOCALDNS_COREFILE_EXPERIMENTAL}" + return 0 + fi + echo "Info: ${hosts_file_path} not ready yet, falling back to corefile without hosts plugin" >&2 + echo "${LOCALDNS_COREFILE_BASE}" + return 0 + else + echo "Hosts plugin is not enabled, using corefile without hosts plugin" >&2 + echo "${LOCALDNS_COREFILE_BASE}" + return 0 + fi + fi + + # Case 2: Only BASE available — no dynamic selection + if [ -n "${LOCALDNS_COREFILE_BASE:-}" ]; then + echo "Using LOCALDNS_COREFILE_BASE (no dynamic selection)" >&2 + echo "${LOCALDNS_COREFILE_BASE}" + return 0 + fi + + # Case 3: Nothing available — signal failure so callers don't proceed with empty corefile + echo "No corefile variants available in environment." >&2 + return 1 +} + ${__SOURCED__:+return} # --------------------------------------- Main Execution starts here -------------------------------------------------- +# Regenerate corefile on every startup to enable dynamic variant selection. +# --------------------------------------------------------------------------------------------------------------------- +# This allows switching between EXPERIMENTAL and BASE corefile variants based on current state. +# On restarts, if /etc/localdns/hosts has been populated by aks-hosts-setup timer, +# localdns will automatically switch to the hosts-plugin variant. +# select_localdns_corefile checks the hosts file once and falls back to the +# no-hosts variant immediately if missing/empty. This is intentional — we don't +# block localdns startup waiting for DNS resolution. The aks-hosts-setup timer +# will populate the hosts file, and the next restart will pick it up. +regenerate_localdns_corefile || exit $ERR_LOCALDNS_COREFILE_NOTFOUND + # Verify localdns required files exists. 
# --------------------------------------------------------------------------------------------------------------------- # Verify that generated corefile exists and is not empty. @@ -708,6 +879,14 @@ echo "Updating network DNS configuration to point to localdns via ${NETWORK_DROP disable_dhcp_use_clusterlistener || exit $ERR_LOCALDNS_FAIL echo "Startup complete - serving node and pod DNS traffic." +# Set node annotation to indicate hosts plugin is in use (if applicable). +# -------------------------------------------------------------------------------------------------------------------- +# Run annotation in background to avoid blocking CSE completion +# The annotation is a best-effort operation that should not delay node provisioning +annotate_node_with_hosts_plugin_status & +ANNOTATION_PID=$! +echo "Started hosts plugin annotation in background (PID: ${ANNOTATION_PID})" + # Systemd notify: send ready if service is Type=notify. # -------------------------------------------------------------------------------------------------------------------- if [ -n "${NOTIFY_SOCKET:-}" ]; then diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 4f3d0e6364c..cdf8f8fd081 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1223,13 +1223,33 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration "ShouldEnableLocalDNS": func() bool { return profile.ShouldEnableLocalDNS() }, + "ShouldEnableHostsPlugin": func() bool { + return profile.ShouldEnableHostsPlugin() + }, "GetGeneratedLocalDNSCoreFile": func() (string, error) { - output, err := GenerateLocalDNSCoreFile(config, profile, localDNSCoreFileTemplateString) + // Legacy variable: kept for backward compat with old VHDs that only know + // LOCALDNS_GENERATED_COREFILE. Must use includeHostsPlugin=false because + // old VHDs don't provision /etc/localdns/hosts. 
+ output, err := GenerateLocalDNSCoreFile(config, profile, false) if err != nil { return "", fmt.Errorf("failed generate corefile for localdns using template: %w", err) } return base64.StdEncoding.EncodeToString([]byte(output)), nil }, + "GetGeneratedLocalDNSCoreFileBase": func() (string, error) { + output, err := GenerateLocalDNSCoreFile(config, profile, false) + if err != nil { + return "", fmt.Errorf("failed generate base corefile for localdns using template: %w", err) + } + return base64.StdEncoding.EncodeToString([]byte(output)), nil + }, + "GetGeneratedLocalDNSCoreFileExperimental": func() (string, error) { + output, err := GenerateLocalDNSCoreFile(config, profile, true) + if err != nil { + return "", fmt.Errorf("failed generate experimental corefile for localdns using template: %w", err) + } + return base64.StdEncoding.EncodeToString([]byte(output)), nil + }, "GetLocalDNSCPULimitInPercentage": func() string { return profile.GetLocalDNSCPULimitInPercentage() }, @@ -1804,16 +1824,19 @@ func containerdConfigFromTemplate( // ----------------------- Start of changes related to localdns ------------------------------------------. // Parse and generate localdns Corefile from template and LocalDNSProfile. +// includeHostsPlugin controls whether the hosts plugin blocks for caching critical AKS FQDNs +// are included in the generated Corefile. When false, the same template is rendered without +// the hosts blocks, used as a fallback when enableAKSHostsSetup fails at provisioning time. 
func GenerateLocalDNSCoreFile( config *datamodel.NodeBootstrappingConfiguration, profile *datamodel.AgentPoolProfile, - tmpl string, + includeHostsPlugin bool, ) (string, error) { parameters := getParameters(config) variables := getCustomDataVariables(config) bakerFuncMap := getBakerFuncMap(config, parameters, variables) - if profile.LocalDNSProfile == nil || !profile.ShouldEnableLocalDNS() { + if profile == nil || profile.LocalDNSProfile == nil || !profile.ShouldEnableLocalDNS() { return "", nil } @@ -1821,7 +1844,11 @@ func GenerateLocalDNSCoreFile( "hasSuffix": strings.HasSuffix, } localDNSCoreFileData := profile.GetLocalDNSCoreFileData() - localDNSCorefileTemplate := template.Must(template.New("localdnscorefile").Funcs(bakerFuncMap).Funcs(funcMapForHasSuffix).Parse(tmpl)) + localDNSCoreFileData.IncludeHostsPlugin = includeHostsPlugin + localDNSCorefileTemplate, err := template.New("localdnscorefile").Funcs(bakerFuncMap).Funcs(funcMapForHasSuffix).Parse(localDNSCoreFileTemplateString) + if err != nil { + return "", fmt.Errorf("failed to parse localdns corefile template: %w", err) + } // Generate the Corefile content. var corefileBuffer bytes.Buffer @@ -1834,6 +1861,10 @@ func GenerateLocalDNSCoreFile( } // Template to create corefile that will be used by localdns service. +// When IncludeHostsPlugin is true, the hosts plugin blocks for caching critical AKS FQDNs +// (mcr.microsoft.com, packages.aks.azure.com, etc.) are included in root domain server blocks. +// When false, hosts blocks are omitted — used as a fallback when enableAKSHostsSetup fails at +// provisioning time, following the same dual-config pattern used for containerd GPU/no-GPU configs. const localDNSCoreFileTemplateString = ` # *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. 
@@ -1860,6 +1891,12 @@ health-check.localdns.local:53 { log {{- end }} bind {{$.NodeListenerIP}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $isRootDomain}} forward . {{$.AzureDNSIP}} { {{- else}} @@ -1921,6 +1958,12 @@ health-check.localdns.local:53 { log {{- end }} bind {{$.ClusterListenerIP}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $fwdToClusterCoreDNS}} forward . {{$.CoreDNSServiceIP}} { {{- else}} diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index a83405d7b70..cd3ea477871 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -274,21 +274,6 @@ var _ = Describe("Assert generated customData and cseCmd", func() { }) Describe(".GetGeneratedLocalDNSCoreFile()", func() { - // Expect an error from GenerateLocalDNSCoreFile if template is invalid. - It("returns an error when template parsing fails", func() { - config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ - EnableLocalDNS: true, - CPULimitInMilliCores: to.Int32Ptr(2008), - MemoryLimitInMB: to.Int32Ptr(128), - VnetDNSOverrides: nil, - KubeDNSOverrides: nil, - } - invalidTemplate := "{{.InvalidField}}" - _, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, invalidTemplate) - Expect(err).ToNot(BeNil()) - Expect(err.Error()).To(ContainSubstring("failed to execute localdns corefile template")) - }) - // Expect no error and a non-empty corefile when LocalDNSOverrides are nil. 
It("handles nil LocalDNSOverrides", func() { config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ @@ -298,7 +283,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { VnetDNSOverrides: nil, KubeDNSOverrides: nil, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefileWithoutOverrides)) @@ -313,7 +298,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{}, KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{}, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefileWithoutOverrides)) @@ -370,7 +355,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { }, }, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) @@ -387,6 +372,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 
168.63.129.16 { policy sequential max_concurrent 1000 @@ -450,6 +439,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 2000 @@ -548,7 +541,7 @@ testdomain456.com:53 { }, }, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) @@ -565,6 +558,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 168.63.129.16 { policy sequential max_concurrent 1000 @@ -628,6 +625,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 1000 @@ -690,10 +691,134 @@ testdomain567.com:53 { ` Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefile)) }) + + // Expect a valid corefile WITHOUT hosts plugin blocks when includeHostsPlugin=false. + // This is the fallback corefile used when enableAKSHostsSetup fails at provisioning time. 
+ It("generates a valid localdnsCorefile without hosts plugin when includeHostsPlugin is false", func() { + config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + CPULimitInMilliCores: to.Int32Ptr(2008), + MemoryLimitInMB: to.Int32Ptr(128), + VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Immediate", + }, + }, + KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Error", + Protocol: "PreferUDP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(2000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(72000), + ServeStale: "Verify", + }, + }, + } + // Generate with includeHostsPlugin=false (the no-hosts fallback) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, false) + Expect(err).To(BeNil()) + Expect(localDNSCoreFile).ToNot(BeEmpty()) + + // The no-hosts corefile must NOT contain hosts plugin blocks + Expect(localDNSCoreFile).ToNot(ContainSubstring("hosts /etc/localdns/hosts")) + Expect(localDNSCoreFile).ToNot(ContainSubstring("# Check /etc/localdns/hosts")) + + // But it should still contain the standard corefile structure + Expect(localDNSCoreFile).To(ContainSubstring("health-check.localdns.local:53")) + Expect(localDNSCoreFile).To(ContainSubstring("bind 169.254.10.10")) + Expect(localDNSCoreFile).To(ContainSubstring("bind 169.254.10.11")) + Expect(localDNSCoreFile).To(ContainSubstring("forward . 
168.63.129.16")) + Expect(localDNSCoreFile).To(ContainSubstring("nsid localdns")) + Expect(localDNSCoreFile).To(ContainSubstring("nsid localdns-pod")) + }) + + // Verify that includeHostsPlugin=true produces hosts blocks and includeHostsPlugin=false does not, + // when using the same LocalDNSProfile configuration. + It("produces different output for includeHostsPlugin true vs false", func() { + config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + CPULimitInMilliCores: to.Int32Ptr(2008), + MemoryLimitInMB: to.Int32Ptr(128), + VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Immediate", + }, + }, + KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Error", + Protocol: "PreferUDP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Verify", + }, + }, + } + withHosts, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) + Expect(err).To(BeNil()) + withoutHosts, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, false) + Expect(err).To(BeNil()) + + // With hosts should have the hosts plugin block + Expect(withHosts).To(ContainSubstring("hosts /etc/localdns/hosts")) + // Without hosts should NOT have it + Expect(withoutHosts).ToNot(ContainSubstring("hosts /etc/localdns/hosts")) + // Both should still be valid corefiles + Expect(withHosts).To(ContainSubstring("health-check.localdns.local:53")) + Expect(withoutHosts).To(ContainSubstring("health-check.localdns.local:53")) + }) }) }) }) +func 
getDecodedVarsFromCseCmd(data []byte) (map[string]string, error) { + cseRegex := regexp.MustCompile(cseRegexString) + cseVariableList := cseRegex.FindAllStringSubmatch(string(data), -1) + vars := make(map[string]string) + + for _, cseVar := range cseVariableList { + if len(cseVar) < 3 { + return nil, fmt.Errorf("expected 3 results (match, key, value) from regex, found %d, result %q", len(cseVar), cseVar) + } + + key := cseVar[1] + val := getValueWithoutQuotes(cseVar[2]) + + vars[key] = val + } + + return vars, nil +} + +func getValueWithoutQuotes(value string) string { + if len(value) > 1 && value[0] == '"' && value[len(value)-1] == '"' { + return value[1 : len(value)-1] + } + return value +} + type tarEntry struct { path string *decodedValue @@ -729,32 +854,6 @@ func decodeTarFiles(data []byte) ([]tarEntry, error) { return files, nil } -func getDecodedVarsFromCseCmd(data []byte) (map[string]string, error) { - cseRegex := regexp.MustCompile(cseRegexString) - cseVariableList := cseRegex.FindAllStringSubmatch(string(data), -1) - vars := make(map[string]string) - - for _, cseVar := range cseVariableList { - if len(cseVar) < 3 { - return nil, fmt.Errorf("expected 3 results (match, key, value) from regex, found %d, result %q", len(cseVar), cseVar) - } - - key := cseVar[1] - val := getValueWithoutQuotes(cseVar[2]) - - vars[key] = val - } - - return vars, nil -} - -func getValueWithoutQuotes(value string) string { - if len(value) > 1 && value[0] == '"' && value[len(value)-1] == '"' { - return value[1 : len(value)-1] - } - return value -} - var _ = Describe("Test normalizeResourceGroupNameForLabel", func() { It("should return the correct normalized resource group name", func() { Expect(normalizeResourceGroupNameForLabel("hello")).To(Equal("hello")) diff --git a/pkg/agent/datamodel/types.go b/pkg/agent/datamodel/types.go index 4cb6812cfb6..0860c1f54ef 100644 --- a/pkg/agent/datamodel/types.go +++ b/pkg/agent/datamodel/types.go @@ -2460,6 +2460,7 @@ const ( // 
LocalDNSProfile represents localdns configuration for agentpool nodes. type LocalDNSProfile struct { EnableLocalDNS bool `json:"enableLocalDNS,omitempty"` + EnableHostsPlugin bool `json:"enableHostsPlugin,omitempty"` CPULimitInMilliCores *int32 `json:"cpuLimitInMilliCores,omitempty"` MemoryLimitInMB *int32 `json:"memoryLimitInMB,omitempty"` VnetDNSOverrides map[string]*LocalDNSOverrides `json:"vnetDNSOverrides,omitempty"` @@ -2468,10 +2469,11 @@ type LocalDNSProfile struct { type LocalDNSCoreFileData struct { LocalDNSProfile - NodeListenerIP string - ClusterListenerIP string - CoreDNSServiceIP string - AzureDNSIP string + NodeListenerIP string + ClusterListenerIP string + CoreDNSServiceIP string + AzureDNSIP string + IncludeHostsPlugin bool } // LocalDNSOverrides represents DNS override settings for both VnetDNS and KubeDNS traffic. @@ -2496,6 +2498,13 @@ func (a *AgentPoolProfile) ShouldEnableLocalDNS() bool { return a != nil && a.LocalDNSProfile != nil && a.LocalDNSProfile.EnableLocalDNS } +// ShouldEnableHostsPlugin returns true if LocalDNS is enabled and the hosts plugin +// is explicitly enabled. When true, the localdns Corefile will include a hosts plugin +// block that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. +func (a *AgentPoolProfile) ShouldEnableHostsPlugin() bool { + return a.ShouldEnableLocalDNS() && a.LocalDNSProfile.EnableHostsPlugin +} + // GetLocalDNSNodeListenerIP returns APIPA-IP address that will be used in localdns systemd unit. 
func (a *AgentPoolProfile) GetLocalDNSNodeListenerIP() string { return LocalDNSNodeListenerIP diff --git a/pkg/agent/datamodel/types_test.go b/pkg/agent/datamodel/types_test.go index 1cfb888056b..a0605aabd47 100644 --- a/pkg/agent/datamodel/types_test.go +++ b/pkg/agent/datamodel/types_test.go @@ -3090,10 +3090,8 @@ func TestShouldEnableLocalDNS(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - actualData := false - if tt.agentPoolProfile != nil { - actualData = tt.agentPoolProfile.ShouldEnableLocalDNS() - } + actualData := tt.agentPoolProfile.ShouldEnableLocalDNS() + assert.Equal(t, tt.expectedData, actualData) }) } @@ -3391,4 +3389,73 @@ func TestGetLocalDNSCoreFileData(t *testing.T) { } } +func TestShouldEnableHostsPlugin(t *testing.T) { + tests := []struct { + name string + agentPoolProfile *AgentPoolProfile + expectedData bool + }{ + { + name: "ShouldEnableHostsPlugin - AgentPoolProfile nil", + agentPoolProfile: nil, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNSProfile nil", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: nil, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNS disabled, HostsPlugin enabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: false, + EnableHostsPlugin: true, + }, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNS enabled, HostsPlugin disabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: false, + }, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - both enabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + }, + }, + expectedData: true, + }, + { + name: "ShouldEnableHostsPlugin - both disabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + 
EnableLocalDNS: false, + EnableHostsPlugin: false, + }, + }, + expectedData: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actualData := tt.agentPoolProfile.ShouldEnableHostsPlugin() + + assert.Equal(t, tt.expectedData, actualData) + }) + } +} + // ----------------------- End of changes related to localdns ------------------------------------------. diff --git a/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh b/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh new file mode 100644 index 00000000000..825a74c64f3 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh @@ -0,0 +1,500 @@ +#shellcheck shell=bash +#shellcheck disable=SC2148 + +Describe 'aks-hosts-setup.sh' + SCRIPT_PATH="parts/linux/cloud-init/artifacts/aks-hosts-setup.sh" + + # Helper to build a test script that uses the real system dig. + # Overrides only HOSTS_FILE and TARGET_CLOUD, preserving everything else + # (cloud selection, resolution loop, atomic write) from the real script. + # Uses sed to strip the shebang, set -euo pipefail, and HOSTS_FILE= lines + # so the test is not brittle to comment changes at the top of the script. + build_test_script() { + local test_dir="$1" + local hosts_file="$2" + local target_cloud="${3:-AzurePublicCloud}" + local test_script="${test_dir}/aks-hosts-setup-test.sh" + + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${hosts_file}" +export TARGET_CLOUD="${target_cloud}" +EOF + sed -e '/^#!\/bin\/bash/d' -e '/^set -euo pipefail/d' -e '/^HOSTS_FILE=/d' "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + echo "${test_script}" + } + + # Helper to build a test script with a mock dig prepended to PATH. + # Used only for edge-case tests that need controlled DNS output + # (failure handling, invalid response filtering). 
+ build_mock_test_script() { + local test_dir="$1" + local hosts_file="$2" + local mock_bin_dir="$3" + local target_cloud="${4:-AzurePublicCloud}" + local test_script="${test_dir}/aks-hosts-setup-test.sh" + + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +export PATH="${mock_bin_dir}:\$PATH" +HOSTS_FILE="${hosts_file}" +export TARGET_CLOUD="${target_cloud}" +EOF + sed -e '/^#!\/bin\/bash/d' -e '/^set -euo pipefail/d' -e '/^HOSTS_FILE=/d' "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + echo "${test_script}" + } + + # Creates a mock dig executable that simulates DNS failure (empty output). + create_failure_mock() { + local mock_bin_dir="$1" + mkdir -p "${mock_bin_dir}" + cat > "${mock_bin_dir}/dig" << 'MOCK_EOF' +#!/usr/bin/env bash +# Simulate DNS failure: dig +short returns empty output +exit 0 +MOCK_EOF + chmod +x "${mock_bin_dir}/dig" + } + + # ----------------------------------------------------------------------- + # Tests using real dig (no mocks) + # ----------------------------------------------------------------------- + + Describe 'DNS resolution and hosts file creation (AzurePublicCloud)' + setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'creates hosts file with resolved addresses for all critical FQDNs' + When run command bash "${TEST_SCRIPT}" + The status should be success + The file "$HOSTS_FILE" should be exist + The output should include "Starting AKS critical FQDN hosts resolution" + The output should include "AKS critical FQDN hosts resolution completed" + End + + It 'detects AzurePublicCloud environment' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzurePublicCloud" + End + + It 'resolves all public cloud FQDNs' 
+ When run command bash "${TEST_SCRIPT}" + The status should be success + # Verify the script attempts to resolve all expected public cloud FQDNs + The output should include "Resolving addresses for mcr.microsoft.com" + The output should include "Resolving addresses for packages.microsoft.com" + The output should include "Resolving addresses for management.azure.com" + The output should include "Resolving addresses for login.microsoftonline.com" + The output should include "Resolving addresses for acs-mirror.azureedge.net" + The output should include "Resolving addresses for packages.aks.azure.com" + # Verify hosts file contains real resolved entries + The contents of file "$HOSTS_FILE" should include "mcr.microsoft.com" + The contents of file "$HOSTS_FILE" should include "packages.microsoft.com" + End + + It 'writes valid hosts file format' + When run command bash "${TEST_SCRIPT}" + The status should be success + The file "$HOSTS_FILE" should be exist + The output should include "Writing addresses" + End + + It 'includes header comments in hosts file' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "AKS critical FQDN hosts resolution" + The contents of file "$HOSTS_FILE" should include "# AKS critical FQDN addresses resolved at" + The contents of file "$HOSTS_FILE" should include "# This file is automatically generated by aks-hosts-setup.service" + End + End + + Describe 'Cloud-specific FQDN selection' + # These tests use real dig. Sovereign cloud domains may not resolve + # from CI, so we assert on which FQDNs the script *attempts* to resolve + # (visible in stdout) rather than checking hosts file contents. 
+ setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'selects AzureChinaCloud FQDNs' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureChinaCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzureChinaCloud" + # Should resolve China-specific endpoints + The output should include "Resolving addresses for mcr.azure.cn" + The output should include "Resolving addresses for mcr.azk8s.cn" + The output should include "Resolving addresses for login.partner.microsoftonline.cn" + The output should include "Resolving addresses for management.chinacloudapi.cn" + The output should include "Resolving addresses for packages.microsoft.com" + # Should NOT attempt public cloud endpoints + The output should not include "Resolving addresses for login.microsoftonline.com" + The output should not include "Resolving addresses for management.azure.com" + End + + It 'selects AzureUSGovernmentCloud FQDNs' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureUSGovernmentCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzureUSGovernmentCloud" + The output should include "Resolving addresses for mcr.microsoft.com" + The output should include "Resolving addresses for login.microsoftonline.us" + The output should include "Resolving addresses for management.usgovcloudapi.net" + The output should include "Resolving addresses for packages.aks.azure.com" + The output should not include "Resolving addresses for login.microsoftonline.com" + The output should not include "Resolving addresses for management.azure.com" + End + + It 'exits with error for unknown cloud values' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "SomeUnknownCloud") + When run 
command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: SomeUnknownCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should not include "Cannot determine which FQDNs to resolve for hosts file" + The output should not include "Exiting without modifying hosts file" + End + + It 'exits with error for USNatCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "USNatCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: USNatCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for USSecCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "USSecCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: USSecCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureStackCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureStackCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureStackCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureGermanCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureGermanCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureGermanCloud" + The 
output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureGermanyCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureGermanyCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureGermanyCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureBleuCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureBleuCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureBleuCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'fails when TARGET_CLOUD is unset' + local test_script="${TEST_DIR}/aks-hosts-setup-test-nocloud.sh" + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${HOSTS_FILE}" +unset TARGET_CLOUD +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + + When run command bash "${test_script}" + The status should be failure + The output should include "ERROR: TARGET_CLOUD is not set" + The output should include "Cannot determine which FQDNs to resolve" + The output should include "Exiting without modifying hosts file" + End + + It 'fails when TARGET_CLOUD is empty string' + local test_script="${TEST_DIR}/aks-hosts-setup-test-empty.sh" + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${HOSTS_FILE}" +export TARGET_CLOUD="" +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + + When run command bash "${test_script}" + The status should be failure + The output should include "ERROR: TARGET_CLOUD is not 
set" + The output should include "Cannot determine which FQDNs to resolve" + End + + It 'includes packages.microsoft.com for all clouds (common FQDN)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Resolving addresses for packages.microsoft.com" + End + End + + Describe 'Atomic file write' + setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'does not leave a temp file behind after successful write' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "AKS critical FQDN hosts resolution" + The file "$HOSTS_FILE" should be exist + End + + It 'verifies no leftover temp files exist' + bash "${TEST_SCRIPT}" >/dev/null 2>&1 + # The temp file (hosts.testing.tmp.) should have been renamed away + When run command find "${TEST_DIR}" -name 'hosts.testing.tmp.*' + The output should equal "" + End + + It 'sets correct permissions on the hosts file' + bash "${TEST_SCRIPT}" >/dev/null 2>&1 + When run command stat -c '%a' "${HOSTS_FILE}" + The output should equal "644" + End + End + + # ----------------------------------------------------------------------- + # Mock-based tests below + # These require controlled dig output to verify error handling + # and response filtering logic that cannot be triggered with real DNS. 
+ # ----------------------------------------------------------------------- + + Describe 'DNS resolution failure handling (mock)' + setup() { + TEST_DIR=$(mktemp -d) + MOCK_BIN="${TEST_DIR}/mock_bin" + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + create_failure_mock "${MOCK_BIN}" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'exits gracefully when no DNS records are resolved' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The output should include "This is likely a temporary DNS issue" + End + + It 'does not create hosts file when no DNS records are resolved' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'preserves existing hosts file when no DNS records are resolved' + echo "# old hosts content" > "${HOSTS_FILE}" + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + # Original hosts file should still be intact + The contents of file "$HOSTS_FILE" should include "# old hosts content" + End + End + + Describe 'Invalid DNS response filtering (mock)' + setup() { + TEST_DIR=$(mktemp -d) + MOCK_BIN="${TEST_DIR}/mock_bin" + mkdir -p "${MOCK_BIN}" + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'filters out NXDOMAIN responses from hosts file' + create_failure_mock "${MOCK_BIN}" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The 
output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'filters out SERVFAIL responses from hosts file' + cat > "${MOCK_BIN}/dig" << 'MOCK_EOF' +#!/usr/bin/env bash +# Simulate SERVFAIL: dig +short returns empty output +exit 0 +MOCK_EOF + chmod +x "${MOCK_BIN}/dig" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'does not write non-IP strings to hosts file' + cat > "${MOCK_BIN}/dig" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "A" ]]; then + record_type="A" + elif [[ "$arg" == "AAAA" ]]; then + record_type="AAAA" + fi +done + +# dig +short outputs one result per line, no prefix +if [[ "$record_type" == "A" ]]; then + echo "1.2.3.4" + echo "not-an-ip" + echo "NXDOMAIN" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/dig" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "1.2.3.4" + The contents of file "$HOSTS_FILE" should not include "not-an-ip" + The contents of file "$HOSTS_FILE" should not include "NXDOMAIN" + End + + It 'does not write invalid IPv6 strings to hosts file' + cat > "${MOCK_BIN}/dig" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "A" ]]; then + record_type="A" + elif [[ "$arg" == "AAAA" ]]; then + record_type="AAAA" + fi +done + +# dig +short outputs one result per line, no prefix +if [[ "$record_type" == "AAAA" ]]; then + echo "2001:db8::1" + echo "not-an-ipv6" + echo 
"SERVFAIL" + echo "fe80::1" + echo "1:2" + echo ":ff" + echo ":::::::" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/dig" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "2001:db8::1" + The contents of file "$HOSTS_FILE" should include "fe80::1" + The contents of file "$HOSTS_FILE" should not include "not-an-ipv6" + The contents of file "$HOSTS_FILE" should not include "SERVFAIL" + # Tightened IPv6 validation rejects too-short strings with fewer than 2 colons + The contents of file "$HOSTS_FILE" should not include "1:2" + The contents of file "$HOSTS_FILE" should not include ":ff" + # Rejects all-colon strings with no hex digits + The contents of file "$HOSTS_FILE" should not include ":::::::" + End + + It 'rejects IPv4 addresses with out-of-range octets' + cat > "${MOCK_BIN}/dig" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "A" ]]; then + record_type="A" + elif [[ "$arg" == "AAAA" ]]; then + record_type="AAAA" + fi +done + +# dig +short outputs one result per line, no prefix +if [[ "$record_type" == "A" ]]; then + echo "10.0.0.1" + echo "999.999.999.999" + echo "256.1.1.1" + echo "1.2.3.400" + echo "255.255.255.255" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/dig" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "10.0.0.1" + The contents of file "$HOSTS_FILE" should include "255.255.255.255" + The contents of file "$HOSTS_FILE" should not include "999.999.999.999" + The contents of file "$HOSTS_FILE" 
should not include "256.1.1.1" + The contents of file "$HOSTS_FILE" should not include "1.2.3.400" + End + End +End diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 3f935f17ba3..a3b62043666 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -1,5 +1,15 @@ #!/bin/bash +# Helper functions for tests +check_file_permissions() { + # Use printf to ensure leading zero (0644 format) + printf "0%s" "$(stat -c "%a" "$LOCALDNS_ENV_FILE")" +} + +check_cloud_env_permissions() { + printf "0%s" "$(stat -c "%a" "$AKS_CLOUD_ENV_FILE")" +} + Describe 'cse_config.sh' Include "./parts/linux/cloud-init/artifacts/cse_config.sh" Include "./parts/linux/cloud-init/artifacts/cse_helpers.sh" @@ -787,6 +797,16 @@ providers: setup() { TMP_DIR=$(mktemp -d) LOCALDNS_CORE_FILE="$TMP_DIR/localdns.corefile" + LOCALDNS_SLICE_FILE="$TMP_DIR/localdns.slice" + LOCALDNS_COREFILE_EXPERIMENTAL=$(echo -n "localdns corefile with hosts" | base64) + LOCALDNS_COREFILE_BASE=$(echo -n "localdns corefile" | base64) + LOCALDNS_MEMORY_LIMIT="128M" + LOCALDNS_CPU_LIMIT="200.0%" + # Create mock localdns assets that would be present on VHD + mkdir -p /etc/systemd/system + mkdir -p /opt/azure/containers/localdns + touch /etc/systemd/system/localdns.service + touch /opt/azure/containers/localdns/localdns.sh systemctlEnableAndStart() { echo "systemctlEnableAndStart $@" @@ -795,20 +815,37 @@ providers: } cleanup() { rm -rf "$TMP_DIR" + # Clean up mock VHD assets + rm -f /etc/systemd/system/localdns.service + rm -f /opt/azure/containers/localdns/localdns.sh } BeforeEach 'setup' AfterEach 'cleanup' - It 'should enable localdns successfully' - echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" + It 'should enable localdns successfully when VHD has required assets' When run enableLocalDNS The status should be success The output should include "localdns should 
be enabled." The output should include "Enable localdns succeeded." End + It 'should skip localdns when localdns.service is missing on old VHD' + rm -f /etc/systemd/system/localdns.service + When run enableLocalDNS + The status should be success + The output should include "Warning: localdns.service not found on this VHD, skipping localdns setup" + The output should not include "localdns should be enabled." + End + + It 'should skip localdns when localdns.sh is missing on old VHD' + rm -f /opt/azure/containers/localdns/localdns.sh + When run enableLocalDNS + The status should be success + The output should include "Warning: localdns.sh not found on this VHD, skipping localdns setup" + The output should not include "localdns should be enabled." + End + It 'should return error when systemctl fails to start localdns' - echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" systemctlEnableAndStart() { echo "systemctlEnableAndStart $@" return 1 @@ -819,14 +856,20 @@ providers: End End - Describe 'shouldEnableLocalDns' + Describe 'enableLocalDNSForScriptless' setup() { TMP_DIR=$(mktemp -d) LOCALDNS_CORE_FILE="$TMP_DIR/localdns.corefile" LOCALDNS_SLICE_FILE="$TMP_DIR/localdns.slice" - LOCALDNS_GENERATED_COREFILE=$(echo "bG9jYWxkbnMgY29yZWZpbGU=") # "localdns corefile" base64 + LOCALDNS_COREFILE_BASE=$(echo "bG9jYWxkbnMgY29yZWZpbGU=") # "localdns corefile" base64 + LOCALDNS_COREFILE_EXPERIMENTAL=$(echo "bG9jYWxkbnMgY29yZWZpbGU=") # "localdns corefile" base64 LOCALDNS_MEMORY_LIMIT="512M" LOCALDNS_CPU_LIMIT="250%" + # Create mock localdns assets that would be present on VHD + mkdir -p /etc/systemd/system + mkdir -p /opt/azure/containers/localdns + touch /etc/systemd/system/localdns.service + touch /opt/azure/containers/localdns/localdns.sh systemctlEnableAndStart() { echo "systemctlEnableAndStart $@" @@ -835,6 +878,9 @@ providers: } cleanup() { rm -rf "$TMP_DIR" + # Clean up mock VHD assets + rm -f /etc/systemd/system/localdns.service + rm -f 
/opt/azure/containers/localdns/localdns.sh } BeforeEach 'setup' AfterEach 'cleanup' @@ -880,6 +926,261 @@ providers: The output should include "localdns should be enabled." The output should include "Enable localdns succeeded." End + + # Environment file creation with both corefile variants. + It 'should create environment file with all corefile variants for dynamic selection' + # Set up both corefile variants + LOCALDNS_COREFILE_EXPERIMENTAL=$(echo -n "corefile with hosts plugin" | base64) + LOCALDNS_COREFILE_BASE=$(echo -n "corefile without hosts plugin" | base64) + SHOULD_ENABLE_HOSTS_PLUGIN="true" + LOCALDNS_ENV_FILE="$TMP_DIR/environment" + + When call enableLocalDNS + The status should be success + The stdout should include "enableLocalDNS called, generating corefile..." + The stdout should include "localdns should be enabled." + The stdout should include "Enable localdns succeeded." + The path "$LOCALDNS_ENV_FILE" should be file + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_BASE64_ENCODED_COREFILE=" + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_COREFILE_BASE=" + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_COREFILE_EXPERIMENTAL=${LOCALDNS_COREFILE_EXPERIMENTAL}" + The contents of file "$LOCALDNS_ENV_FILE" should include "SHOULD_ENABLE_HOSTS_PLUGIN=true" + End + + # Old CSE + new VHD backward compatibility. + # An old AgentBaker service only sets LOCALDNS_GENERATED_COREFILE (not LOCALDNS_COREFILE_BASE). + # The new VHD's generateLocalDNSFiles must fall back to the legacy variable. + It 'should fall back to LOCALDNS_GENERATED_COREFILE when LOCALDNS_COREFILE_BASE is unset (old CSE + new VHD)' + unset LOCALDNS_COREFILE_BASE + LOCALDNS_GENERATED_COREFILE=$(echo -n "legacy corefile from old CSE" | base64) + LOCALDNS_ENV_FILE="$TMP_DIR/environment" + + When call enableLocalDNS + The status should be success + The stdout should include "localdns should be enabled." 
+ The stdout should include "Enable localdns succeeded." + The path "$LOCALDNS_CORE_FILE" should be file + The contents of file "$LOCALDNS_CORE_FILE" should include "legacy corefile from old CSE" + The path "$LOCALDNS_ENV_FILE" should be file + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_BASE64_ENCODED_COREFILE=" + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_COREFILE_BASE=" + End + + # Environment file permissions. + It 'should set correct permissions on environment file' + LOCALDNS_ENV_FILE="$TMP_DIR/environment" + When call enableLocalDNS + The status should be success + The path "$LOCALDNS_ENV_FILE" should be file + # Check permissions are 0644 (owner read/write, group read, others read) + The result of function check_file_permissions should equal "0644" + End + End + + Describe 'enableAKSHostsSetup' + setup() { + # Create temporary test directories and files + TEST_TEMP_DIR=$(mktemp -d) + AKS_HOSTS_FILE="${TEST_TEMP_DIR}/hosts" + AKS_HOSTS_SETUP_SCRIPT="${TEST_TEMP_DIR}/aks-hosts-setup.sh" + AKS_HOSTS_SETUP_SERVICE="${TEST_TEMP_DIR}/aks-hosts-setup.service" + AKS_HOSTS_SETUP_TIMER="${TEST_TEMP_DIR}/aks-hosts-setup.timer" + AKS_CLOUD_ENV_FILE="${TEST_TEMP_DIR}/cloud-env" + + # Create fake script that simulates successful hosts file creation + cat > "$AKS_HOSTS_SETUP_SCRIPT" << 'SETUP_EOF' +#!/bin/bash +echo "# test hosts file" > "${AKS_HOSTS_FILE}" +SETUP_EOF + chmod +x "$AKS_HOSTS_SETUP_SCRIPT" + + # Create dummy service and timer files + touch "$AKS_HOSTS_SETUP_SERVICE" + touch "$AKS_HOSTS_SETUP_TIMER" + + # Set up test environment + TARGET_CLOUD="AzurePublicCloud" + + # Mock systemctl function + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 0 + } + + # Export variables so the real function can use them + export AKS_HOSTS_FILE AKS_HOSTS_SETUP_SCRIPT AKS_HOSTS_SETUP_SERVICE + export AKS_HOSTS_SETUP_TIMER AKS_CLOUD_ENV_FILE TARGET_CLOUD + } + + cleanup() { + rm -rf 
"$TEST_TEMP_DIR" + unset AKS_HOSTS_FILE AKS_HOSTS_SETUP_SCRIPT AKS_HOSTS_SETUP_SERVICE + unset AKS_HOSTS_SETUP_TIMER AKS_CLOUD_ENV_FILE TARGET_CLOUD + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'should enable aks-hosts-setup timer successfully' + When call enableAKSHostsSetup + The status should be success + The output should include "Enabling aks-hosts-setup timer..." + The output should include "systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30" + The output should include "aks-hosts-setup timer enabled successfully." + End + + It 'should call systemctlEnableAndStartNoBlock with correct parameters' + When call enableAKSHostsSetup + The status should be success + The output should include "systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30" + End + + It 'should skip when setup script is missing' + rm -f "$AKS_HOSTS_SETUP_SCRIPT" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should skip when timer unit is missing' + rm -f "$AKS_HOSTS_SETUP_TIMER" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should print warning when systemctlEnableAndStartNoBlock fails' + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 1 + } + When call enableAKSHostsSetup + The status should be success + The output should include "Enabling aks-hosts-setup timer..." + The output should include "Warning: Failed to enable aks-hosts-setup timer" + The output should not include "aks-hosts-setup timer enabled successfully." 
+ End + + It 'should skip when service unit is missing' + rm -f "$AKS_HOSTS_SETUP_SERVICE" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should skip when setup script is not executable' + chmod -x "$AKS_HOSTS_SETUP_SCRIPT" + When call enableAKSHostsSetup + The status should be success + The output should include "is not executable, skipping aks-hosts-setup" + End + + It 'should create cloud-env file with TARGET_CLOUD value' + TARGET_CLOUD="AzurePublicCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The file "$AKS_CLOUD_ENV_FILE" should be exist + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzurePublicCloud" + End + + It 'should write correct cloud-env for AzureChinaCloud' + TARGET_CLOUD="AzureChinaCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzureChinaCloud" + End + + It 'should write correct cloud-env for AzureUSGovernmentCloud' + TARGET_CLOUD="AzureUSGovernmentCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzureUSGovernmentCloud" + End + + It 'should set correct permissions on cloud-env file' + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." 
+ The file "$AKS_CLOUD_ENV_FILE" should be exist + The result of function check_cloud_env_permissions should equal "0644" + End + + It 'should skip when TARGET_CLOUD is unset' + unset TARGET_CLOUD + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: TARGET_CLOUD is not set" + The output should include "Cannot run aks-hosts-setup without knowing cloud environment" + The output should include "Skipping aks-hosts-setup" + End + + It 'should skip when TARGET_CLOUD is empty string' + TARGET_CLOUD="" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: TARGET_CLOUD is not set" + The output should include "Skipping aks-hosts-setup" + End + + It 'should skip when TARGET_CLOUD is unsupported (USNatCloud)' + TARGET_CLOUD="USNatCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: USNatCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (USSecCloud)' + TARGET_CLOUD="USSecCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: USSecCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (AzureStackCloud)' + TARGET_CLOUD="AzureStackCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: AzureStackCloud" + The output 
should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (AzureGermanCloud)' + TARGET_CLOUD="AzureGermanCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: AzureGermanCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (unknown cloud)' + TARGET_CLOUD="SomeRandomCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: SomeRandomCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should log TARGET_CLOUD value when set' + TARGET_CLOUD="AzurePublicCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "Setting TARGET_CLOUD=AzurePublicCloud for aks-hosts-setup" + End End Describe 'configureAndStartSecureTLSBootstrapping' diff --git a/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh new file mode 100644 index 00000000000..7c4a4b332a5 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env shellspec + +# Unit tests for select_localdns_corefile() function +# select_localdns_corefile() reads globals from the environment: +# LOCALDNS_COREFILE_BASE — base corefile (no experimental plugins) +# 
LOCALDNS_COREFILE_EXPERIMENTAL — corefile with experimental plugins (e.g. hosts) +# SHOULD_ENABLE_HOSTS_PLUGIN — whether hosts plugin is enabled +# It checks LOCALDNS_HOSTS_FILE (default /etc/localdns/hosts) for valid IP mappings to decide which variant to use. + +Describe 'select_localdns_corefile()' + LOCALDNS_PATH="parts/linux/cloud-init/artifacts/localdns.sh" + + # Mock base64-encoded corefiles for testing + COREFILE_WITH_HOSTS="aG9zdHMgL2V0Yy9sb2NhbGRucy9ob3N0cw==" # "hosts /etc/localdns/hosts" + COREFILE_NO_HOSTS="bm8gaG9zdHMgcGx1Z2lu" # "no hosts plugin" + + setup() { + # Source localdns.sh to get select_localdns_corefile function + # We set __SOURCED__=1 to only source the functions, not run main execution + # shellcheck disable=SC1090 + __SOURCED__=1 . "${LOCALDNS_PATH}" + + # Create temp directory for test hosts file — avoids writing to /etc + TEST_DIR=$(mktemp -d) + LOCALDNS_HOSTS_FILE="${TEST_DIR}/hosts" + } + + cleanup() { + rm -rf "${TEST_DIR}" + unset LOCALDNS_COREFILE_BASE + unset LOCALDNS_COREFILE_EXPERIMENTAL + unset SHOULD_ENABLE_HOSTS_PLUGIN + unset LOCALDNS_HOSTS_FILE + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + Context 'when both corefile variants are available and hosts plugin is enabled' + It 'returns EXPERIMENTAL when hosts file has valid IP mappings' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="true" + echo "10.0.0.1 mcr.microsoft.com" > "${LOCALDNS_HOSTS_FILE}" + + When call select_localdns_corefile + The output should equal "${COREFILE_WITH_HOSTS}" + The status should be success + The stderr should include "Hosts file has IP mappings" + The stderr should include "using corefile with hosts plugin" + End + + It 'returns BASE when hosts file exists but has no IP mappings' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="true" + echo "# comment only" > 
"${LOCALDNS_HOSTS_FILE}" + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "not ready yet, falling back to corefile without hosts plugin" + End + + It 'returns BASE when hosts file does not exist' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="true" + rm -f "${LOCALDNS_HOSTS_FILE}" + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "not ready yet, falling back to corefile without hosts plugin" + End + + It 'handles IPv6 addresses in hosts file' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="true" + echo "2001:db8::1 mcr.microsoft.com" > "${LOCALDNS_HOSTS_FILE}" + + When call select_localdns_corefile + The output should equal "${COREFILE_WITH_HOSTS}" + The status should be success + The stderr should include "using corefile with hosts plugin" + End + End + + Context 'when both corefile variants are available and hosts plugin is disabled' + It 'returns BASE when SHOULD_ENABLE_HOSTS_PLUGIN=false' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="false" + echo "10.0.0.1 mcr.microsoft.com" > "${LOCALDNS_HOSTS_FILE}" + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is not enabled" + End + + It 'returns BASE when SHOULD_ENABLE_HOSTS_PLUGIN is empty' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="" + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr 
should include "Hosts plugin is not enabled" + End + + It 'returns BASE when SHOULD_ENABLE_HOSTS_PLUGIN is any value other than "true"' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + LOCALDNS_COREFILE_EXPERIMENTAL="${COREFILE_WITH_HOSTS}" + SHOULD_ENABLE_HOSTS_PLUGIN="yes" + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is not enabled" + End + End + + Context 'when only BASE is available (no dynamic selection)' + It 'returns BASE when EXPERIMENTAL is not set' + LOCALDNS_COREFILE_BASE="${COREFILE_NO_HOSTS}" + unset LOCALDNS_COREFILE_EXPERIMENTAL + + When call select_localdns_corefile + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Using LOCALDNS_COREFILE_BASE (no dynamic selection)" + End + End + + Context 'when no corefile variants are available' + It 'returns failure when neither variant is set' + unset LOCALDNS_COREFILE_BASE + unset LOCALDNS_COREFILE_EXPERIMENTAL + + When call select_localdns_corefile + The output should equal "" + The status should be failure + The stderr should include "No corefile variants available in environment" + End + End +End diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index 95a5c555364..9027a89deaf 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -66,34 +66,38 @@ EOF BeforeEach 'setup' AfterEach 'cleanup' #------------------------ regenerate_localdns_corefile --------------------------------------------- - It 'should regenerate corefile successfully when LOCALDNS_BASE64_ENCODED_COREFILE is set' + It 'should regenerate corefile successfully when LOCALDNS_COREFILE_BASE is set' rm -f "$LOCALDNS_CORE_FILE" - LOCALDNS_BASE64_ENCODED_COREFILE=$(echo ".:5353 { + LOCALDNS_COREFILE_BASE=$(echo ".:5353 { forward 
. 168.63.129.16 }" | base64) When run regenerate_localdns_corefile The status should be success The stdout should include "Regenerating localdns corefile at $LOCALDNS_CORE_FILE" The stdout should include "Successfully regenerated localdns corefile." + The stderr should include "Using LOCALDNS_COREFILE_BASE" The path "$LOCALDNS_CORE_FILE" should be file End - It 'should fail to regenerate when LOCALDNS_BASE64_ENCODED_COREFILE is not set' + It 'should fail to regenerate when no corefile variants are available' rm -f "$LOCALDNS_CORE_FILE" - unset LOCALDNS_BASE64_ENCODED_COREFILE + unset LOCALDNS_COREFILE_BASE + unset LOCALDNS_COREFILE_EXPERIMENTAL When run regenerate_localdns_corefile The status should be failure - The stdout should include "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + The stdout should include "No corefile selected. Cannot regenerate corefile." + The stderr should include "No corefile variants available in environment." End It 'should set correct permissions on regenerated corefile' rm -f "$LOCALDNS_CORE_FILE" - LOCALDNS_BASE64_ENCODED_COREFILE=$(echo ".:5353 { + LOCALDNS_COREFILE_BASE=$(echo ".:5353 { forward . 168.63.129.16 }" | base64) When run regenerate_localdns_corefile The status should be success The stdout should include "Successfully regenerated localdns corefile." + The stderr should include "Using LOCALDNS_COREFILE_BASE" The path "$LOCALDNS_CORE_FILE" should be file End @@ -111,24 +115,28 @@ EOF The status should be success End - It 'should regenerate and succeed if corefile is missing and LOCALDNS_BASE64_ENCODED_COREFILE is set' + It 'should regenerate and succeed if corefile is missing and LOCALDNS_COREFILE_BASE is set' rm -f "$LOCALDNS_CORE_FILE" - LOCALDNS_BASE64_ENCODED_COREFILE=$(echo ".:5353 { + LOCALDNS_COREFILE_BASE=$(echo ".:5353 { forward . 
168.63.129.16 }" | base64) When run verify_localdns_corefile The status should be success The stdout should include "Attempting to regenerate localdns corefile..." The stdout should include "Localdns corefile regenerated successfully." + The stderr should include "Using LOCALDNS_COREFILE_BASE" End It 'should return failure if localdns corefile does not exist and regeneration fails' - rm -r "$LOCALDNS_CORE_FILE" + rm -f "$LOCALDNS_CORE_FILE" + unset LOCALDNS_COREFILE_BASE + unset LOCALDNS_COREFILE_EXPERIMENTAL When run verify_localdns_corefile The status should be failure The stdout should include "Localdns corefile either does not exist or is empty at $LOCALDNS_CORE_FILE." The stdout should include "Attempting to regenerate localdns corefile..." - The stdout should include "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + The stdout should include "No corefile selected. Cannot regenerate corefile." + The stderr should include "No corefile variants available in environment." End It 'should return failure if localdns corefile is empty and regeneration fails' @@ -137,6 +145,7 @@ EOF The status should be failure The stdout should include "Localdns corefile either does not exist or is empty at $LOCALDNS_CORE_FILE." The stdout should include "Attempting to regenerate localdns corefile..." + The stderr should include "No corefile variants available in environment." End It 'should return failure if LOCALDNS_CORE_FILE is unset' @@ -1261,4 +1270,361 @@ EOF The stdout should include "DNS configuration refreshed successfully" End End + + +# This section tests - annotate_node_with_hosts_plugin_status +# This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file. 
+#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'annotate_node_with_hosts_plugin_status' + setup() { + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + TEST_DIR="/tmp/localdnstest-$$" + KUBECONFIG="${TEST_DIR}/var/lib/kubelet/kubeconfig" + UPDATED_LOCALDNS_CORE_FILE="${TEST_DIR}/opt/azure/containers/localdns/updated.localdns.corefile" + LOCALDNS_HOSTS_FILE="${TEST_DIR}/etc/localdns/hosts" + + # Create test directories + mkdir -p "$(dirname "$KUBECONFIG")" + mkdir -p "$(dirname "$UPDATED_LOCALDNS_CORE_FILE")" + mkdir -p "$(dirname "$LOCALDNS_HOSTS_FILE")" + + # Mock hostname command + hostname() { + echo "TestNode123" + } + } + cleanup() { + rm -rf "$TEST_DIR" + # Clean up mock kubectl symlink to prevent state leaking across specs + rm -f /opt/bin/kubectl + # Remove /opt/bin if it's empty and we created it + if [ -d /opt/bin ] && [ -z "$(ls -A /opt/bin 2>/dev/null)" ]; then + rmdir /opt/bin 2>/dev/null || true + fi + } + BeforeEach 'setup' + AfterEach 'cleanup' + + #------------------------- annotate_node_with_hosts_plugin_status ---------------------------------------------- + It 'should skip annotation if corefile does not exist' + rm -f "$UPDATED_LOCALDNS_CORE_FILE" + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns corefile not found" + The stdout should include "skipping annotation." + End + + It 'should skip annotation if corefile does not contain hosts plugin block' + # Create corefile without hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + forward . 168.63.129.16 +} +EOF + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns corefile does not contain hosts plugin block, skipping annotation." 
+ End + + It 'should skip annotation if hosts file does not exist' + # Create corefile with hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + rm -f "$LOCALDNS_HOSTS_FILE" + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Hosts file does not exist" + The stdout should include "skipping annotation despite corefile having hosts plugin." + End + + It 'should skip annotation if hosts file has no IP mappings' + # Create corefile with hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + # Create empty hosts file + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +# Empty hosts file +EOF + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Hosts file exists but has no IP mappings, skipping annotation." + End + + It 'should skip annotation if kubectl binary is not found' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +10.0.0.2 packages.aks.azure.com +EOF + + command() { + if [[ "$1" == "-v" && "$2" == "/opt/bin/kubectl" ]]; then + return 1 + fi + } + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "kubectl binary not found at /opt/bin/kubectl, skipping annotation." + End + + It 'should timeout and skip annotation if kubeconfig does not exist after waiting' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + + # Create mock kubectl binary that is executable + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +echo "mock kubectl" +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + rm -f "$KUBECONFIG" + # Use short timeout for testing (2 attempts = 6 seconds) + KUBECONFIG_WAIT_ATTEMPTS=2 + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for TLS bootstrapping to complete" + The stdout should include "Timeout waiting for kubeconfig" + End + + It 'should set annotation successfully when using corefile with hosts plugin' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +# AKS critical FQDN addresses +10.0.0.1 mcr.microsoft.com +10.0.0.2 packages.aks.azure.com +10.0.0.3 management.azure.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl in /opt/bin (must exist in container filesystem) + # First verify we can write to /opt + if [ ! 
-d /opt ]; then + Skip "Cannot create /opt/bin/kubectl - /opt directory does not exist or is not writable" + fi + + mkdir -p /opt/bin || Skip "Cannot create /opt/bin directory" + + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" && "$4" == "--overwrite" && "$5" == "node" ]]; then + echo "node/testnode123 annotated" + exit 0 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl || Skip "Cannot make /opt/bin/kubectl executable" + + # Verify the mock was created + [ -x /opt/bin/kubectl ] || Skip "Mock kubectl was not created successfully" + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns is using hosts plugin and hosts file has 3 entries." + The stdout should include "Setting annotation to indicate hosts plugin is in use for node testnode123." + The stdout should include "Successfully set hosts plugin annotation." + End + + It 'should handle kubectl annotation failure gracefully (non-fatal)' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that fails annotation + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" ]]; then + echo "Error: failed to annotate node" >&2 + exit 1 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Setting annotation to indicate hosts plugin is in use for node testnode123." 
+ The stdout should include "Warning: Failed to set hosts plugin annotation (this is non-fatal)." + The stderr should include "Error: failed to annotate node" + End + + It 'should convert hostname to lowercase for node name' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that verifies lowercase node name + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" && "$4" == "--overwrite" && "$5" == "node" && "$6" == "testnode123" ]]; then + echo "node/testnode123 annotated (lowercase verified)" + exit 0 +else + echo "Error: Expected lowercase node name 'testnode123' but got '$6'" >&2 + exit 1 +fi +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Successfully set hosts plugin annotation." + End + + It 'should wait for node to be registered before annotating' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that simulates node not registered initially + # Create a counter file to track attempts + ATTEMPT_FILE="${TEST_DIR}/attempt_count" + echo "0" > "$ATTEMPT_FILE" + + mkdir -p /opt/bin + cat > /opt/bin/kubectl < "\$ATTEMPT_FILE" + +# Simulate node not ready for first 2 attempts +if [[ "\$1" == "--kubeconfig" && "\$3" == "get" && "\$4" == "node" && \$count -le 2 ]]; then + echo "Error from server (NotFound): nodes \"testnode123\" not found" >&2 + exit 1 +elif [[ "\$1" == "--kubeconfig" && "\$3" == "get" && "\$4" == "node" ]]; then + # Node is now registered + exit 0 +elif [[ "\$1" == "--kubeconfig" && "\$3" == "annotate" ]]; then + echo "node/testnode123 annotated" + exit 0 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + # Use short timeout for testing + NODE_REGISTRATION_WAIT_ATTEMPTS=5 + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for node testnode123 to be registered in the cluster" + The stdout should include "Node testnode123 is registered in the cluster" + The stdout should include "Successfully set hosts plugin annotation" + End + + It 'should timeout and skip annotation if node never registers' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl that always fails to find node + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + echo "Error from server (NotFound): nodes \"testnode123\" not found" >&2 + exit 1 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + # Use very short timeout for testing + NODE_REGISTRATION_WAIT_ATTEMPTS=2 + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for node registration" + The stdout should include "Timeout waiting for node testnode123 to be registered" + End + End End diff --git a/spec/shellspec.Dockerfile b/spec/shellspec.Dockerfile index db8a68f7ebe..a8c98177361 100644 --- a/spec/shellspec.Dockerfile +++ b/spec/shellspec.Dockerfile @@ -4,7 +4,7 @@ FROM aksdataplanedev.azurecr.io/shellspec/shellspec-debian:0.28.1 RUN sed -i -e 's/\(deb\|security\).debian.org/archive.debian.org/g' /etc/apt/sources.list && \ apt-get update && \ - apt-get install -y --no-install-recommends gawk jq curl && \ + apt-get install -y --no-install-recommends gawk jq curl dnsutils && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY ./ /src diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index c960d797a5c..7fe6075adb1 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -301,6 +301,18 @@ copyPackerFiles() { LOCALDNS_SERVICE_DELEGATE_SRC=/home/packer/localdns-delegate.conf LOCALDNS_SERVICE_DELEGATE_DEST=/etc/systemd/system/localdns.service.d/delegate.conf cpAndMode $LOCALDNS_SERVICE_DELEGATE_SRC $LOCALDNS_SERVICE_DELEGATE_DEST 0644 + + AKS_HOSTS_SETUP_SH_SRC=/home/packer/aks-hosts-setup.sh + AKS_HOSTS_SETUP_SH_DEST=/opt/azure/containers/aks-hosts-setup.sh + cpAndMode $AKS_HOSTS_SETUP_SH_SRC 
$AKS_HOSTS_SETUP_SH_DEST 0755 + + AKS_HOSTS_SETUP_SVC_SRC=/home/packer/aks-hosts-setup.service + AKS_HOSTS_SETUP_SVC_DEST=/etc/systemd/system/aks-hosts-setup.service + cpAndMode $AKS_HOSTS_SETUP_SVC_SRC $AKS_HOSTS_SETUP_SVC_DEST 0644 + + AKS_HOSTS_SETUP_TIMER_SRC=/home/packer/aks-hosts-setup.timer + AKS_HOSTS_SETUP_TIMER_DEST=/etc/systemd/system/aks-hosts-setup.timer + cpAndMode $AKS_HOSTS_SETUP_TIMER_SRC $AKS_HOSTS_SETUP_TIMER_DEST 0644 # --------------------------------------------------------------------------------------- # ------------------------- Files related to azure-network ------------------------------ diff --git a/vhdbuilder/packer/vhd-image-builder-acl-arm64.json b/vhdbuilder/packer/vhd-image-builder-acl-arm64.json index 6cebe0ec0f2..0087444602f 100644 --- a/vhdbuilder/packer/vhd-image-builder-acl-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-acl-arm64.json @@ -631,6 +631,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-acl.json b/vhdbuilder/packer/vhd-image-builder-acl.json index 03adb0f11f0..7768bb9316c 100644 --- a/vhdbuilder/packer/vhd-image-builder-acl.json +++ b/vhdbuilder/packer/vhd-image-builder-acl.json @@ -631,6 +631,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + 
"source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 615da5e9ee3..ada5349a4a5 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -702,6 +702,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-base.json b/vhdbuilder/packer/vhd-image-builder-base.json index bfe60f33041..839b7a5a9fc 100644 --- a/vhdbuilder/packer/vhd-image-builder-base.json +++ b/vhdbuilder/packer/vhd-image-builder-base.json @@ -710,6 +710,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": 
"/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-cvm.json b/vhdbuilder/packer/vhd-image-builder-cvm.json index 0e444781783..21f0fd7b52c 100644 --- a/vhdbuilder/packer/vhd-image-builder-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-cvm.json @@ -714,6 +714,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json index 203a22dc035..664a2d0880b 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json @@ -683,6 +683,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar.json b/vhdbuilder/packer/vhd-image-builder-flatcar.json index 959d78535d9..11f907a0ead 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar.json @@ -688,6 +688,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json index 6ed96281c5c..8f7dd5480fa 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json @@ -676,6 +676,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": 
"/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json index e4d58283d56..6e44f0ace68 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json @@ -677,6 +677,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner.json b/vhdbuilder/packer/vhd-image-builder-mariner.json index 3fd5e90a8b3..714f32584c1 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner.json @@ -678,6 +678,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh",