diff --git a/.pipelines/scripts/verify_shell.sh b/.pipelines/scripts/verify_shell.sh index 8d8241131e7..f55d5529d06 100755 --- a/.pipelines/scripts/verify_shell.sh +++ b/.pipelines/scripts/verify_shell.sh @@ -30,6 +30,7 @@ filesToCheck=$(find . -type f -name "*.sh" -not -path './pkg/agent/testdata/*' - # Known bash-only scripts that intentionally use bash specific syntax. BASH_ONLY_LIST=$(cat <<'EOF' ./vhdbuilder/packer/install-ig.sh +./parts/linux/cloud-init/artifacts/aks-hosts-setup.sh EOF ) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index f5644dcda02..042f8477e40 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -719,11 +719,17 @@ func getFuncMapForLocalDnsCorefileTemplate() template.FuncMap { } } -// getLocalDnsCorefileBase64 returns the base64 encoded LocalDns corefile. -// base64 encoded corefile returned from this function will decoded and written -// to /opt/azure/containers/localdns/localdns.corefile in cse_config.sh -// and then used by localdns systemd unit to start localdns systemd unit. -func getLocalDnsCorefileBase64(aksnodeconfig *aksnodeconfigv1.Configuration) string { +// getLocalDnsCorefileBase64WithHostsPlugin generates and returns the base64-encoded LocalDns corefile +// with or without the hosts plugin, depending on the includeHostsPlugin parameter. +// +// The generated content is returned as a base64-encoded string and stored in environment variables: +// - LOCALDNS_GENERATED_COREFILE (with hosts plugin) +// - LOCALDNS_GENERATED_COREFILE_NO_HOSTS (without hosts plugin) +// +// The actual file writing happens in shell scripts (cse_config.sh) which decode and write +// the selected variant to /opt/azure/containers/localdns/localdns.corefile. +// Runtime selection between variants happens in cse_main.sh based on the availability of /etc/localdns/hosts. 
+func getLocalDnsCorefileBase64WithHostsPlugin(aksnodeconfig *aksnodeconfigv1.Configuration, includeHostsPlugin bool) string { if aksnodeconfig == nil { return "" } @@ -737,17 +743,33 @@ func getLocalDnsCorefileBase64(aksnodeconfig *aksnodeconfigv1.Configuration) str return "" } - localDnsConfig, err := generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig) + variant := "with hosts plugin" + if !includeHostsPlugin { + variant = "without hosts plugin" + } + + localDnsConfig, err := generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig, includeHostsPlugin) if err != nil { - return fmt.Sprintf("error getting localdns corfile from aks node config: %v", err) + return fmt.Sprintf("error getting localdns corefile (%s) from aks node config: %v", variant, err) } return base64.StdEncoding.EncodeToString([]byte(localDnsConfig)) } +// localDnsCorefileTemplateData wraps the AKS node config with additional template control flags. +type localDnsCorefileTemplateData struct { + Config *aksnodeconfigv1.Configuration + IncludeHostsPlugin bool +} + // Corefile is created using localdns.toml.gtpl template and aksnodeconfig values. -func generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig *aksnodeconfigv1.Configuration) (string, error) { +// includeHostsPlugin controls whether the hosts plugin block is included in the generated Corefile. 
+func generateLocalDnsCorefileFromAKSNodeConfig(aksnodeconfig *aksnodeconfigv1.Configuration, includeHostsPlugin bool) (string, error) { var corefileBuffer bytes.Buffer - if err := localDnsCorefileTemplate.Execute(&corefileBuffer, aksnodeconfig); err != nil { + templateData := localDnsCorefileTemplateData{ + Config: aksnodeconfig, + IncludeHostsPlugin: includeHostsPlugin, + } + if err := localDnsCorefileTemplate.Execute(&corefileBuffer, templateData); err != nil { return "", fmt.Errorf("failed to execute localdns corefile template: %w", err) } return corefileBuffer.String(), nil @@ -785,6 +807,13 @@ func shouldEnableLocalDns(aksnodeconfig *aksnodeconfigv1.Configuration) string { return fmt.Sprintf("%v", aksnodeconfig != nil && aksnodeconfig.GetLocalDnsProfile() != nil && aksnodeconfig.GetLocalDnsProfile().GetEnableLocalDns()) } +// shouldEnableHostsPlugin returns the string "true" if LocalDNS is enabled and the hosts plugin +// is explicitly enabled, and "false" otherwise. When "true", the localdns Corefile will include a hosts plugin +// block that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. +func shouldEnableHostsPlugin(aksnodeconfig *aksnodeconfigv1.Configuration) string { + return fmt.Sprintf("%v", shouldEnableLocalDns(aksnodeconfig) == "true" && aksnodeconfig.GetLocalDnsProfile().GetEnableHostsPlugin()) +} + // getLocalDnsCpuLimitInPercentage returns CPU limit in percentage unit that will be used in localdns systemd unit. 
func getLocalDnsCpuLimitInPercentage(aksnodeconfig *aksnodeconfigv1.Configuration) string { if shouldEnableLocalDns(aksnodeconfig) == "true" && aksnodeconfig.GetLocalDnsProfile().GetCpuLimitInMilliCores() != 0 { diff --git a/aks-node-controller/parser/helper_test.go b/aks-node-controller/parser/helper_test.go index 46b05bc6550..263f45b400d 100644 --- a/aks-node-controller/parser/helper_test.go +++ b/aks-node-controller/parser/helper_test.go @@ -1446,6 +1446,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 168.63.129.16 { policy sequential max_concurrent 1000 @@ -1509,6 +1513,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 2000 @@ -1627,7 +1635,7 @@ func Test_getLocalDNSCorefileBase64(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := getLocalDnsCorefileBase64(tt.args.aksnodeconfig) + got := getLocalDnsCorefileBase64WithHostsPlugin(tt.args.aksnodeconfig, true) if tt.wantContains == "" && got != "" { t.Errorf("expected empty string, got %q", got) @@ -1711,6 +1719,71 @@ func Test_shouldEnableLocalDns(t *testing.T) { } } +func Test_shouldEnableHostsPlugin(t *testing.T) { + type args struct { + aksnodeconfig *aksnodeconfigv1.Configuration + } + tests := []struct { + name string + args args + want string + }{ + { + name: "nil config", + args: args{aksnodeconfig: nil}, + want: "false", + }, + { + name: "nil LocalDnsProfile", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{}}, + want: "false", + }, + { + name: "LocalDns disabled, HostsPlugin enabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + 
LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: false, + EnableHostsPlugin: true}, + }}, + want: "false", + }, + { + name: "LocalDns enabled, HostsPlugin disabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: false}, + }}, + want: "false", + }, + { + name: "both LocalDns and HostsPlugin enabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: true}, + }}, + want: "true", + }, + { + name: "both disabled", + args: args{aksnodeconfig: &aksnodeconfigv1.Configuration{ + LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: false, + EnableHostsPlugin: false}, + }}, + want: "false", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := shouldEnableHostsPlugin(tt.args.aksnodeconfig); got != tt.want { + t.Errorf("shouldEnableHostsPlugin() = %v, want %v", got, tt.want) + } + }) + } +} + func Test_getLocalDnsCpuLimitInPercentage(t *testing.T) { type args struct { aksnodeconfig *aksnodeconfigv1.Configuration diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index d8541c45c65..f79d98fde15 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -170,9 +170,11 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "INSERT_IMDS_RESTRICTION_RULE_TO_MANGLE_TABLE": fmt.Sprintf("%v", config.GetImdsRestrictionConfig().GetInsertImdsRestrictionRuleToMangleTable()), "PRE_PROVISION_ONLY": fmt.Sprintf("%v", config.GetPreProvisionOnly()), "SHOULD_ENABLE_LOCALDNS": shouldEnableLocalDns(config), + "SHOULD_ENABLE_HOSTS_PLUGIN": shouldEnableHostsPlugin(config), "LOCALDNS_CPU_LIMIT": getLocalDnsCpuLimitInPercentage(config), "LOCALDNS_MEMORY_LIMIT": getLocalDnsMemoryLimitInMb(config), - 
"LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64(config), + "LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64WithHostsPlugin(config, true), + "LOCALDNS_GENERATED_COREFILE_NO_HOSTS": getLocalDnsCorefileBase64WithHostsPlugin(config, false), "DISABLE_PUBKEY_AUTH": fmt.Sprintf("%v", config.GetDisablePubkeyAuth()), "SERVICE_ACCOUNT_IMAGE_PULL_ENABLED": fmt.Sprintf("%v", config.GetServiceAccountImagePullProfile().GetEnabled()), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_CLIENT_ID": config.GetServiceAccountImagePullProfile().GetDefaultClientId(), diff --git a/aks-node-controller/parser/parser_test.go b/aks-node-controller/parser/parser_test.go index 18a8d66e196..4c3fd343396 100644 --- a/aks-node-controller/parser/parser_test.go +++ b/aks-node-controller/parser/parser_test.go @@ -229,6 +229,38 @@ oom_score = -999 assert.Equal(t, "true", vars["NEEDS_CGROUPV2"]) }, }, + { + name: "AKSUbuntu2204 with LocalDNS and hosts plugin enabled", + folder: "AKSUbuntu2204+LocalDNS+HostsPlugin", + k8sVersion: "1.24.2", + aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + aksNodeConfig.LocalDnsProfile = &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: true, + } + }, + validator: func(cmd *exec.Cmd) { + vars := environToMap(cmd.Env) + assert.Equal(t, "true", vars["SHOULD_ENABLE_LOCALDNS"]) + assert.Equal(t, "true", vars["SHOULD_ENABLE_HOSTS_PLUGIN"]) + }, + }, + { + name: "AKSUbuntu2204 with LocalDNS enabled but hosts plugin disabled", + folder: "AKSUbuntu2204+LocalDNS", + k8sVersion: "1.24.2", + aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + aksNodeConfig.LocalDnsProfile = &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: false, + } + }, + validator: func(cmd *exec.Cmd) { + vars := environToMap(cmd.Env) + assert.Equal(t, "true", vars["SHOULD_ENABLE_LOCALDNS"]) + assert.Equal(t, "false", vars["SHOULD_ENABLE_HOSTS_PLUGIN"]) + }, + }, } for _, tt := range tests { diff 
--git a/aks-node-controller/parser/templates/localdns.toml.gtpl b/aks-node-controller/parser/templates/localdns.toml.gtpl index a636c357362..d503057486c 100644 --- a/aks-node-controller/parser/templates/localdns.toml.gtpl +++ b/aks-node-controller/parser/templates/localdns.toml.gtpl @@ -7,7 +7,7 @@ health-check.localdns.local:53 { whoami } # VnetDNS overrides apply to DNS traffic from pods with dnsPolicy:default or kubelet (referred to as VnetDNS traffic). -{{- range $domain, $override := $.LocalDnsProfile.VnetDnsOverrides -}} +{{- range $domain, $override := $.Config.LocalDnsProfile.VnetDnsOverrides -}} {{- $isRootDomain := eq $domain "." -}} {{- $fwdToClusterCoreDNS := or (hasSuffix $domain "cluster.local") (eq $override.ForwardDestination "ClusterCoreDNS")}} {{- $forwardPolicy := "sequential" -}} @@ -23,11 +23,17 @@ health-check.localdns.local:53 { log {{- end }} bind {{getLocalDnsNodeListenerIp}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $isRootDomain}} forward . {{getAzureDnsIp}} { {{- else}} {{- if $fwdToClusterCoreDNS}} - forward . {{getCoreDnsServiceIp $}} { + forward . {{getCoreDnsServiceIp $.Config}} { {{- else}} forward . {{getAzureDnsIp}} { {{- end}} @@ -67,7 +73,7 @@ health-check.localdns.local:53 { } {{- end}} # KubeDNS overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDNS traffic). -{{- range $domain, $override := $.LocalDnsProfile.KubeDnsOverrides}} +{{- range $domain, $override := $.Config.LocalDnsProfile.KubeDnsOverrides}} {{- $isRootDomain := eq $domain "." 
-}} {{- $fwdToClusterCoreDNS := or (hasSuffix $domain "cluster.local") (eq $override.ForwardDestination "ClusterCoreDNS")}} {{- $forwardPolicy := "" }} @@ -84,8 +90,14 @@ health-check.localdns.local:53 { log {{- end }} bind {{getLocalDnsClusterListenerIp}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $fwdToClusterCoreDNS}} - forward . {{getCoreDnsServiceIp $}} { + forward . {{getCoreDnsServiceIp $.Config}} { {{- else}} forward . {{getAzureDnsIp}} { {{- end}} diff --git a/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand new file mode 100644 index 00000000000..1cd02c61bec --- /dev/null +++ b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS+HostsPlugin/generatedCSECommand @@ -0,0 +1 @@ +/bin/bash -c echo $(date),$(hostname) > ${PROVISION_OUTPUT}; CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh"; cloudInitExitCode=0; if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1; else cloud-init status --wait > /dev/null 2>&1; fi; cloudInitExitCode=$?; if [ "$cloudInitExitCode" -eq 0 ]; then echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; else echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT}; cat ${PROVISION_OUTPUT} exit ${cloudInitExitCode}; fi; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" \ No newline at end of file diff --git a/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand 
b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand new file mode 100644 index 00000000000..1cd02c61bec --- /dev/null +++ b/aks-node-controller/parser/testdata/AKSUbuntu2204+LocalDNS/generatedCSECommand @@ -0,0 +1 @@ +/bin/bash -c echo $(date),$(hostname) > ${PROVISION_OUTPUT}; CLOUD_INIT_STATUS_SCRIPT="/opt/azure/containers/cloud-init-status-check.sh"; cloudInitExitCode=0; if [ -f "${CLOUD_INIT_STATUS_SCRIPT}" ]; then /bin/bash -c "source ${CLOUD_INIT_STATUS_SCRIPT}; handleCloudInitStatus \"${PROVISION_OUTPUT}\"; returnStatus=\$?; echo \"Cloud init status check exit code: \$returnStatus\" >> ${PROVISION_OUTPUT}; exit \$returnStatus" >> ${PROVISION_OUTPUT} 2>&1; else cloud-init status --wait > /dev/null 2>&1; fi; cloudInitExitCode=$?; if [ "$cloudInitExitCode" -eq 0 ]; then echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; else echo "cloud-init failed with exit code ${cloudInitExitCode}" >> ${PROVISION_OUTPUT}; cat ${PROVISION_OUTPUT} exit ${cloudInitExitCode}; fi; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" \ No newline at end of file diff --git a/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go b/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go index 9f1a7d7af64..2b3560c8566 100644 --- a/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go +++ b/aks-node-controller/pkg/gen/aksnodeconfig/v1/localdns_config.pb.go @@ -36,6 +36,10 @@ type LocalDnsProfile struct { VnetDnsOverrides map[string]*LocalDnsOverrides `protobuf:"bytes,4,rep,name=vnet_dns_overrides,json=vnetDnsOverrides,proto3" json:"vnet_dns_overrides,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDns traffic). 
KubeDnsOverrides map[string]*LocalDnsOverrides `protobuf:"bytes,5,rep,name=kube_dns_overrides,json=kubeDnsOverrides,proto3" json:"kube_dns_overrides,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + // Specifies whether the hosts plugin should be enabled in the localdns Corefile. + // When true and LocalDNS is enabled, the Corefile will include a hosts plugin block + // that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. + EnableHostsPlugin bool `protobuf:"varint,6,opt,name=enable_hosts_plugin,json=enableHostsPlugin,proto3" json:"enable_hosts_plugin,omitempty"` } func (x *LocalDnsProfile) Reset() { @@ -103,6 +107,13 @@ func (x *LocalDnsProfile) GetKubeDnsOverrides() map[string]*LocalDnsOverrides { return nil } +func (x *LocalDnsProfile) GetEnableHostsPlugin() bool { + if x != nil { + return x.EnableHostsPlugin + } + return false +} + // Represents DNS override settings for both VnetDNS and KubeDNS traffic. // VnetDns overrides apply to DNS traffic from pods with dnsPolicy:default or kubelet. // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst. 
@@ -221,7 +232,7 @@ var file_aksnodeconfig_v1_localdns_config_proto_rawDesc = []byte{ 0x0a, 0x26, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x76, 0x31, 0x2f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x64, 0x6e, 0x73, 0x5f, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x10, 0x61, 0x6b, 0x73, 0x6e, 0x6f, 0x64, - 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0x80, 0x05, 0x0a, 0x0f, 0x4c, + 0x65, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x76, 0x31, 0x22, 0xb0, 0x05, 0x0a, 0x0f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x44, 0x6e, 0x73, 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x28, 0x0a, 0x10, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x64, 0x6e, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0e, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, @@ -245,7 +256,10 @@ var file_aksnodeconfig_v1_localdns_config_proto_rawDesc = []byte{ 0x63, 0x61, 0x6c, 0x44, 0x6e, 0x73, 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x2e, 0x4b, 0x75, 0x62, 0x65, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x10, 0x6b, 0x75, 0x62, 0x65, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, - 0x72, 0x69, 0x64, 0x65, 0x73, 0x1a, 0x68, 0x0a, 0x15, 0x56, 0x6e, 0x65, 0x74, 0x44, 0x6e, 0x73, + 0x72, 0x69, 0x64, 0x65, 0x73, 0x12, 0x2e, 0x0a, 0x13, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, + 0x68, 0x6f, 0x73, 0x74, 0x73, 0x5f, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x08, 0x52, 0x11, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x48, 0x6f, 0x73, 0x74, 0x73, 0x50, + 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x1a, 0x68, 0x0a, 0x15, 0x56, 0x6e, 0x65, 0x74, 0x44, 0x6e, 0x73, 0x4f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x39, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 
0x0b, 0x32, diff --git a/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto b/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto index ddc62b93e01..f4135ac697a 100644 --- a/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto +++ b/aks-node-controller/proto/aksnodeconfig/v1/localdns_config.proto @@ -19,6 +19,11 @@ message LocalDnsProfile { // KubeDns overrides apply to DNS traffic from pods with dnsPolicy:ClusterFirst (referred to as KubeDns traffic). map kube_dns_overrides = 5; + + // Specifies whether the hosts plugin should be enabled in the localdns Corefile. + // When true and LocalDNS is enabled, the Corefile will include a hosts plugin block + // that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. + bool enable_hosts_plugin = 6; } // Represents DNS override settings for both VnetDNS and KubeDNS traffic. diff --git a/e2e/aks_model.go b/e2e/aks_model.go index 7498d92c0d1..42899102d2b 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -5,9 +5,11 @@ import ( "errors" "fmt" "net" + "net/http" "os" "path/filepath" "strings" + "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/e2e/toolkit" @@ -856,6 +858,12 @@ func createPrivateEndpoint(ctx context.Context, nodeResourceGroup, privateEndpoi } func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { + return createPrivateZoneWithTags(ctx, nodeResourceGroup, privateZoneName, map[string]*string{ + "e2e-test": to.Ptr("true"), + }) +} + +func createPrivateZoneWithTags(ctx context.Context, nodeResourceGroup, privateZoneName string, tags map[string]*string) (*armprivatedns.PrivateZone, error) { pzResp, err := config.Azure.PrivateZonesClient.Get( ctx, nodeResourceGroup, @@ -867,6 +875,7 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s } dnsZoneParams := armprivatedns.PrivateZone{ Location: to.Ptr("global"), + Tags: tags, } 
poller, err := config.Azure.PrivateZonesClient.BeginCreateOrUpdate( ctx, @@ -888,7 +897,10 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s } func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName string) error { - networkLinkName := "link-ABE2ETests" + return createPrivateDNSLinkWithName(ctx, vnet, nodeResourceGroup, privateZoneName, "link-ABE2ETests") +} + +func createPrivateDNSLinkWithName(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName, networkLinkName string) error { _, err := config.Azure.VirutalNetworkLinksClient.Get( ctx, nodeResourceGroup, @@ -975,6 +987,89 @@ func addRecordSetToPrivateDNSZone(ctx context.Context, privateEndpoint *armnetwo return nil } +// cleanupPrivateDNSZone deletes a Private DNS zone (best effort cleanup for tests) +func cleanupPrivateDNSZone(ctx context.Context, resourceGroup, zoneName string) { + // Create a new context with timeout for cleanup + cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) + defer cancel() + + toolkit.Logf(cleanupCtx, "Deleting Private DNS zone %s in resource group %s", zoneName, resourceGroup) + + // First, delete all VNET links (required before zone deletion) + linkPager := config.Azure.VirutalNetworkLinksClient.NewListPager(resourceGroup, zoneName, nil) + for linkPager.More() { + linkPage, err := linkPager.NextPage(cleanupCtx) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to list VNET links for zone %s: %v", zoneName, err) + break + } + + for _, link := range linkPage.Value { + if link == nil || link.Name == nil { + continue + } + + toolkit.Logf(cleanupCtx, "Deleting VNET link %s from zone %s...", *link.Name, zoneName) + linkPoller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, *link.Name, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to start deletion of VNET link %s: %v", *link.Name, err) + continue + } + + _, err 
= linkPoller.PollUntilDone(cleanupCtx, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to delete VNET link %s: %v", *link.Name, err) + continue + } + toolkit.Logf(cleanupCtx, "Deleted VNET link %s", *link.Name) + } + } + + // Now delete the Private DNS zone itself + poller, err := config.Azure.PrivateZonesClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to start deletion of Private DNS zone %s: %v", zoneName, err) + return + } + + _, err = poller.PollUntilDone(cleanupCtx, nil) + if err != nil { + toolkit.Logf(cleanupCtx, "Failed to complete deletion of Private DNS zone %s: %v", zoneName, err) + return + } + + toolkit.Logf(cleanupCtx, "Successfully deleted Private DNS zone %s", zoneName) +} + +// deletePrivateDNSVNETLink deletes a specific VNET link from a Private DNS zone. +// This is used to clean up individual test resources without affecting other parallel tests. +func deletePrivateDNSVNETLink(ctx context.Context, resourceGroup, zoneName, linkName string) error { + // Create a new context with timeout for cleanup + cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 2*time.Minute) + defer cancel() + + toolkit.Logf(cleanupCtx, "Deleting VNET link %s from Private DNS zone %s in resource group %s", linkName, zoneName, resourceGroup) + + linkPoller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(cleanupCtx, resourceGroup, zoneName, linkName, nil) + if err != nil { + // If the link doesn't exist, that's fine (already cleaned up or never created) + var respErr *azcore.ResponseError + if errors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound { + toolkit.Logf(cleanupCtx, "VNET link %s not found (already deleted or never existed)", linkName) + return nil + } + return fmt.Errorf("failed to start deletion of VNET link %s: %w", linkName, err) + } + + _, err = linkPoller.PollUntilDone(cleanupCtx, nil) + if err != nil { + return fmt.Errorf("failed to complete 
deletion of VNET link %s: %w", linkName, err) + } + + toolkit.Logf(cleanupCtx, "Successfully deleted VNET link %s from zone %s", linkName, zoneName) + return nil +} + func addDNSZoneGroup(ctx context.Context, privateZone *armprivatedns.PrivateZone, nodeResourceGroup, privateZoneName, endpointName string) error { groupName := strings.Replace(privateZoneName, ".", "-", -1) // replace . with - _, err := config.Azure.PrivateDNSZoneGroup.Get(ctx, nodeResourceGroup, endpointName, groupName, nil) diff --git a/e2e/cluster.go b/e2e/cluster.go index 589371e2d2b..4c09a4535d5 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -126,6 +126,12 @@ func prepareCluster(ctx context.Context, cluster *armcontainerservice.ManagedClu return nil, fmt.Errorf("collect garbage vmss: %w", err) } + // Clean up orphaned Private DNS zones from failed tests + // These can interfere with DNS resolution during VM provisioning + if err := collectGarbagePrivateDNSZones(ctx, cluster); err != nil { + return nil, fmt.Errorf("collect garbage private dns zones: %w", err) + } + clusterParams, err := extractClusterParameters(ctx, kube, cluster) if err != nil { return nil, fmt.Errorf("extracting cluster parameters: %w", err) @@ -732,6 +738,106 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage return nil } +func collectGarbagePrivateDNSZones(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { + defer toolkit.LogStepCtx(ctx, "collecting garbage Private DNS zones")() + rg := *cluster.Properties.NodeResourceGroup + + // Clean up Private DNS zones created by e2e tests (identified by tags). + // Only delete zones that: + // 1. Have the "e2e-test=true" tag (created by LocalDNS hosts plugin tests) + // 2. 
Are in zones commonly used by e2e tests (additional safety check) + testManagedZonePatterns := []string{ + "mcr.microsoft.com", + "mcr.azure.cn", + } + + // List all Private DNS zones in the node resource group + pager := config.Azure.PrivateZonesClient.NewListByResourceGroupPager(rg, nil) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("failed to get next page of Private DNS zones: %w", err) + } + + for _, zone := range page.Value { + if zone == nil || zone.Name == nil { + continue + } + + zoneName := *zone.Name + + // Safety check 1: Only process zones that match our test patterns + isTestZone := false + for _, pattern := range testManagedZonePatterns { + if zoneName == pattern { + isTestZone = true + break + } + } + + if !isTestZone { + continue + } + + // Safety check 2: Only delete zones with e2e-test tag + if zone.Tags == nil || zone.Tags["e2e-test"] == nil || *zone.Tags["e2e-test"] != "true" { + toolkit.Logf(ctx, "skipping Private DNS zone %q (not tagged as e2e test)", zoneName) + continue + } + + toolkit.Logf(ctx, "found e2e test Private DNS zone %q (tagged), cleaning up...", zoneName) + + // Delete all VNET links first (required before zone deletion) + linkPager := config.Azure.VirutalNetworkLinksClient.NewListPager(rg, zoneName, nil) + for linkPager.More() { + linkPage, err := linkPager.NextPage(ctx) + if err != nil { + toolkit.Logf(ctx, "failed to list VNET links for zone %q: %s", zoneName, err) + break + } + + for _, link := range linkPage.Value { + if link == nil || link.Name == nil { + continue + } + + linkName := *link.Name + toolkit.Logf(ctx, "deleting VNET link %q from e2e test zone %q...", linkName, zoneName) + poller, err := config.Azure.VirutalNetworkLinksClient.BeginDelete(ctx, rg, zoneName, linkName, nil) + if err != nil { + toolkit.Logf(ctx, "failed to start deletion of VNET link %q: %s", linkName, err) + continue + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + 
toolkit.Logf(ctx, "failed to delete VNET link %q: %s", linkName, err) + continue + } + toolkit.Logf(ctx, "deleted VNET link %q", linkName) + } + } + + // Now delete the e2e test Private DNS zone itself + toolkit.Logf(ctx, "deleting e2e test Private DNS zone %q...", zoneName) + poller, err := config.Azure.PrivateZonesClient.BeginDelete(ctx, rg, zoneName, nil) + if err != nil { + toolkit.Logf(ctx, "failed to start deletion of Private DNS zone %q: %s", zoneName, err) + continue + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + toolkit.Logf(ctx, "failed to delete Private DNS zone %q: %s", zoneName, err) + continue + } + toolkit.Logf(ctx, "deleted e2e test Private DNS zone %q", zoneName) + } + } + + return nil +} + func ensureResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { resourceGroupName := config.ResourceGroupName(location) rg, err := config.Azure.ResourceGroup.CreateOrUpdate( diff --git a/e2e/scenario_localdns_hosts_test.go b/e2e/scenario_localdns_hosts_test.go new file mode 100644 index 00000000000..f40c86518f6 --- /dev/null +++ b/e2e/scenario_localdns_hosts_test.go @@ -0,0 +1,215 @@ +package e2e + +import ( + "context" + "testing" + + aksnodeconfigv1 "github.com/Azure/agentbaker/aks-node-controller/pkg/gen/aksnodeconfig/v1" + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" +) + +// Test_Ubuntu2204_LocalDNSHostsPlugin tests the localdns hosts plugin feature on Ubuntu 22.04 +func Test_Ubuntu2204_LocalDNSHostsPlugin(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on Ubuntu 22.04 with dynamic IP resolution", + K8sSystemPoolSKU: "Standard_D4s_v3", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + // Enable localdns and hosts plugin 
explicitly + if nbc.AgentPoolProfile.LocalDNSProfile == nil { + nbc.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{} + } + nbc.AgentPoolProfile.LocalDNSProfile.EnableLocalDNS = true + nbc.AgentPoolProfile.LocalDNSProfile.EnableHostsPlugin = true + }, + Validator: func(ctx context.Context, s *Scenario) { + // Validate aks-hosts-setup service ran successfully and timer is active + ValidateAKSHostsSetupService(ctx, s) + + // Validate hosts file contains resolved IPs for public cloud FQDNs + ValidateLocalDNSHostsFile(ctx, s, []string{ + "mcr.microsoft.com", + "login.microsoftonline.com", + "acs-mirror.azureedge.net", + "management.azure.com", + "packages.aks.azure.com", + "packages.microsoft.com", + }) + + // Validate localdns resolves fake FQDN from hosts file (proves hosts plugin bypass) + ValidateLocalDNSHostsPluginBypass(ctx, s) + }, + }, + }) +} + +// Test_Ubuntu2404_LocalDNSHostsPlugin tests the localdns hosts plugin feature on Ubuntu 24.04 +func Test_Ubuntu2404_LocalDNSHostsPlugin(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on Ubuntu 24.04", + K8sSystemPoolSKU: "Standard_D4s_v3", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + // Enable localdns and hosts plugin explicitly + if nbc.AgentPoolProfile.LocalDNSProfile == nil { + nbc.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{} + } + nbc.AgentPoolProfile.LocalDNSProfile.EnableLocalDNS = true + nbc.AgentPoolProfile.LocalDNSProfile.EnableHostsPlugin = true + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAKSHostsSetupService(ctx, s) + ValidateLocalDNSHostsFile(ctx, s, []string{ + "mcr.microsoft.com", + "login.microsoftonline.com", + "acs-mirror.azureedge.net", + }) + ValidateLocalDNSHostsPluginBypass(ctx, s) + }, + }, + }) +} + +// Test_AzureLinuxV3_LocalDNSHostsPlugin tests the 
localdns hosts plugin feature on Azure Linux V3 +func Test_AzureLinuxV3_LocalDNSHostsPlugin(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on Azure Linux V3", + K8sSystemPoolSKU: "Standard_D4s_v3", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + // Enable localdns and hosts plugin explicitly + if nbc.AgentPoolProfile.LocalDNSProfile == nil { + nbc.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{} + } + nbc.AgentPoolProfile.LocalDNSProfile.EnableLocalDNS = true + nbc.AgentPoolProfile.LocalDNSProfile.EnableHostsPlugin = true + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateAKSHostsSetupService(ctx, s) + ValidateLocalDNSHostsFile(ctx, s, []string{ + "mcr.microsoft.com", + "login.microsoftonline.com", + "acs-mirror.azureedge.net", + }) + ValidateLocalDNSHostsPluginBypass(ctx, s) + }, + }, + }) +} + +// NOTE: UnknownCloud E2E tests have been removed because they fail during API server connectivity +// checks (exit code 52) before aks-hosts-setup runs. UnknownCloud scenarios are now covered by +// unit tests in spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh which test the +// script behavior directly without requiring full VM provisioning. 
+ +// Test_Ubuntu2204_LocalDNSHostsPlugin_Scriptless tests the localdns hosts plugin on scriptless path +func Test_Ubuntu2204_LocalDNSHostsPlugin_Scriptless(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that localdns hosts plugin works correctly on Ubuntu 22.04 scriptless path (aks-node-controller)", + K8sSystemPoolSKU: "Standard_D4s_v3", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + AKSNodeConfigMutator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + // Enable localdns and hosts plugin via AKSNodeConfig (scriptless path) + // Include DNS overrides to ensure corefile has health endpoint on port 8181 + aksNodeConfig.LocalDnsProfile = &aksnodeconfigv1.LocalDnsProfile{ + EnableLocalDns: true, + EnableHostsPlugin: true, + CpuLimitInMilliCores: to.Ptr(int32(2008)), + MemoryLimitInMb: to.Ptr(int32(128)), + VnetDnsOverrides: map[string]*aksnodeconfigv1.LocalDnsOverrides{ + ".": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Verify", + }, + "cluster.local": { + QueryLogging: "Error", + Protocol: "ForceTCP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Disable", + }, + "testdomain456.com": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Verify", + }, + }, + KubeDnsOverrides: map[string]*aksnodeconfigv1.LocalDnsOverrides{ + ".": { + QueryLogging: "Error", + Protocol: "PreferUDP", + 
ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Verify", + }, + "cluster.local": { + QueryLogging: "Log", + Protocol: "ForceTCP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "RoundRobin", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Disable", + }, + "testdomain567.com": { + QueryLogging: "Error", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Random", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), + ServeStale: "Immediate", + }, + }, + } + }, + Validator: func(ctx context.Context, s *Scenario) { + // Validate aks-hosts-setup service ran successfully and timer is active + ValidateAKSHostsSetupService(ctx, s) + + // Validate hosts file contains resolved IPs for public cloud FQDNs + ValidateLocalDNSHostsFile(ctx, s, []string{ + "mcr.microsoft.com", + "login.microsoftonline.com", + "acs-mirror.azureedge.net", + "management.azure.com", + "packages.aks.azure.com", + "packages.microsoft.com", + }) + + // Validate localdns resolves fake FQDN from hosts file (proves hosts plugin bypass) + ValidateLocalDNSHostsPluginBypass(ctx, s) + }, + }, + }) +} + diff --git a/e2e/types.go b/e2e/types.go index 3766b19d858..333f8b78a78 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -35,6 +35,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool + MockUnknownCloud bool VMSeriesCoverageTest bool } @@ -396,3 +397,56 @@ func (s *Scenario) IsWindows() bool { func (s *Scenario) IsLinux() bool { return !s.IsWindows() } + +// IsHostsPluginEnabled returns true if the hosts plugin is explicitly enabled +// via either NBC (traditional) or AKSNodeConfig (scriptless) 
paths. +func (s *Scenario) IsHostsPluginEnabled() bool { + if s.Runtime.NBC != nil && s.Runtime.NBC.AgentPoolProfile != nil { + return s.Runtime.NBC.AgentPoolProfile.ShouldEnableHostsPlugin() + } + if s.Runtime.AKSNodeConfig != nil && s.Runtime.AKSNodeConfig.LocalDnsProfile != nil { + return s.Runtime.AKSNodeConfig.LocalDnsProfile.EnableHostsPlugin + } + return false +} + +// GetDefaultFQDNsForValidation returns a minimal set of FQDNs to validate in the default validation. +// This mirrors the logic in GetCloudTargetEnv (pkg/agent/utils.go) and aks-hosts-setup.sh. +func (s *Scenario) GetDefaultFQDNsForValidation() []string { + if s.Runtime != nil && s.Runtime.NBC != nil && s.Runtime.NBC.ContainerService != nil { + location := strings.ToLower(s.Runtime.NBC.ContainerService.Location) + if strings.HasPrefix(location, "china") { + return []string{ + "mcr.azure.cn", + "login.partner.microsoftonline.cn", + "acs-mirror.azureedge.net", + } + } + if strings.HasPrefix(location, "usgov") || strings.HasPrefix(location, "usdod") { + return []string{ + "mcr.microsoft.com", + "login.microsoftonline.us", + "acs-mirror.azureedge.net", + } + } + } + return []string{ + "mcr.microsoft.com", + "login.microsoftonline.com", + "acs-mirror.azureedge.net", + } +} + +// GetContainerRegistryFQDN returns the container registry FQDN for the cloud environment +// determined by the NBC's ContainerService.Location field. This mirrors the logic in +// GetCloudTargetEnv (pkg/agent/utils.go) and aks-hosts-setup.sh. 
+func (s *Scenario) GetContainerRegistryFQDN() string { + if s.Runtime != nil && s.Runtime.NBC != nil && s.Runtime.NBC.ContainerService != nil { + location := strings.ToLower(s.Runtime.NBC.ContainerService.Location) + if strings.HasPrefix(location, "china") { + return "mcr.azure.cn" + } + } + // Default to public cloud container registry (also used by Fairfax/US Gov) + return "mcr.microsoft.com" +} diff --git a/e2e/validation.go b/e2e/validation.go index f9b7885487f..adad3f6afbd 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -71,10 +71,21 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { ValidateKubeletNodeIP(ctx, s) } + // localdns is not supported on FIPS VHDs, older VHDs (privatekube, airgapped, scriptless), network isolated VHDs, and AzureLinux OSGuard. // localdns is not supported on scriptless, privatekube and VHDUbuntu2204Gen2ContainerdNetworkIsolatedK8sNotCached. if !s.VHD.UnsupportedLocalDns { ValidateLocalDNSService(ctx, s, "enabled") ValidateLocalDNSResolution(ctx, s, "169.254.10.10") + + // Validate hosts plugin validators only if hosts plugin is explicitly enabled + if s.IsHostsPluginEnabled() { + // Validate aks-hosts-setup service ran successfully and timer is active + ValidateAKSHostsSetupService(ctx, s) + // Validate hosts file contains resolved IPs for critical FQDNs (IPs resolved dynamically) + ValidateLocalDNSHostsFile(ctx, s, s.GetDefaultFQDNsForValidation()) + // Validate localdns resolves fake FQDN from hosts file (proves hosts plugin bypass) + ValidateLocalDNSHostsPluginBypass(ctx, s) + } } ValidateInspektorGadget(ctx, s) diff --git a/e2e/validators.go b/e2e/validators.go index d0fae6f3ca0..48e105bc456 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1455,6 +1455,303 @@ func ValidateLocalDNSResolution(ctx context.Context, s *Scenario, server string) assert.Contains(s.T, execResult.stdout, fmt.Sprintf("SERVER: %s", server)) } +// ValidateLocalDNSHostsFile checks that /etc/localdns/hosts contains at least one 
IPv4 entry for each critical FQDN. +// This validation approach avoids flakiness with CDN/frontdoor-backed FQDNs (like mcr.microsoft.com) whose A records +// can rotate between queries. We verify presence, not exact IP matching. +func ValidateLocalDNSHostsFile(ctx context.Context, s *Scenario, fqdns []string) { + s.T.Helper() + + // Force a fresh refresh of the hosts file before validating so the snapshot + // is consistent with the DNS answers we are about to resolve. Without this, + // the 15-minute timer gap can cause flaky mismatches due to DNS load-balancing + // or record rotation. + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo systemctl start aks-hosts-setup.service", + 0, "failed to refresh hosts file via aks-hosts-setup.service") + + // Build script that checks each FQDN has at least one IPv4 entry in the hosts file (no DNS lookup is performed) + script := fmt.Sprintf(`set -euo pipefail +hosts_file="/etc/localdns/hosts" +fqdns=(%s) + +echo "=== Validating /etc/localdns/hosts contains resolved IPs for critical FQDNs ===" +echo "" +echo "Current hosts file contents:" +cat "$hosts_file" +echo "" + +errors=0 +for fqdn in "${fqdns[@]}"; do + echo "Checking FQDN: $fqdn" + + # Validate that there is at least one IPv4 entry for this FQDN in the hosts file, + # rather than requiring every currently resolved IP to be present. This avoids + # flakiness for CDN/frontdoor-backed FQDNs whose A records can rotate. 
+ if grep -Eq '^[0-9]{1,3}(\.[0-9]{1,3}){3}[[:space:]]+'"$fqdn"'([[:space:]]|$)' "$hosts_file"; then + echo " OK: Found at least one IPv4 entry for $fqdn in hosts file" + else + echo " ERROR: No IPv4 entry found for $fqdn in hosts file" + errors=$((errors + 1)) + fi +done + +echo "" +if [ $errors -gt 0 ]; then + echo "FAILED: $errors FQDNs missing from hosts file" + exit 1 +else + echo "SUCCESS: All critical FQDNs have at least one IPv4 entry in hosts file" + exit 0 +fi +`, quoteFQDNsForBash(fqdns)) + + execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, + "hosts file should contain resolved IPs for critical FQDNs") +} + +// quoteFQDNsForBash converts a slice of FQDNs to a bash array string +func quoteFQDNsForBash(fqdns []string) string { + return strings.Join(lo.Map(fqdns, func(fqdn string, _ int) string { + return fmt.Sprintf("%q", fqdn) + }), " ") +} + +// ValidateAKSHostsSetupService checks that aks-hosts-setup.service ran successfully +// and the aks-hosts-setup.timer is active to ensure periodic refresh of /etc/localdns/hosts. 
+func ValidateAKSHostsSetupService(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Check that aks-hosts-setup.service completed successfully (oneshot service) + serviceScript := `set -euo pipefail +svc="aks-hosts-setup.service" +# For oneshot services, check if it ran successfully (exit code 0) +result=$(systemctl show -p Result "$svc" --value 2>/dev/null || echo "unknown") +echo "aks-hosts-setup.service result: $result" +if [ "$result" != "success" ]; then + echo "ERROR: aks-hosts-setup.service did not complete successfully" + systemctl status "$svc" --no-pager || true + journalctl -u "$svc" --no-pager -n 50 || true + exit 1 +fi +` + execScriptOnVMForScenarioValidateExitCode(ctx, s, serviceScript, 0, + "aks-hosts-setup.service should have completed successfully") + + // Check that aks-hosts-setup.timer is active for periodic refresh + ValidateSystemdUnitIsRunning(ctx, s, "aks-hosts-setup.timer") +} + +// ValidateLocalDNSHostsPluginBypass verifies that localdns resolves FQDNs from /etc/localdns/hosts +// without querying the upstream DNS server. This confirms the hosts plugin is working correctly. +// It checks the node annotation and the Corefile, then resolves packages.microsoft.com (a real FQDN that +// aks-hosts-setup.service populates) through localdns and compares the answer with the hosts file entries. 
+func ValidateLocalDNSHostsPluginBypass(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Step 1: Verify the node has the hosts plugin annotation + // The annotation is set asynchronously by localdns.sh (background job waiting for kubeconfig + node registration) + // Poll for up to 5 minutes with exponential backoff to avoid flaky failures + s.T.Log("Polling for node annotation kubernetes.azure.com/localdns-hosts-plugin=enabled...") + annotationKey := "kubernetes.azure.com/localdns-hosts-plugin" + + var node *corev1.Node + var err error + var annotationValue string + var exists bool + maxAttempts := 60 // 5 minutes with exponential backoff + + for attempt := 1; attempt <= maxAttempts; attempt++ { + node, err = s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, s.Runtime.VM.KubeName, metav1.GetOptions{}) + require.NoError(s.T, err, "failed to get node %q", s.Runtime.VM.KubeName) + + annotationValue, exists = node.Annotations[annotationKey] + if exists && annotationValue == "enabled" { + s.T.Logf("✓ Node annotation %s=%s found after %d attempts", annotationKey, annotationValue, attempt) + break + } + + if attempt == maxAttempts { + s.T.Fatalf("Timeout: node %q annotation %q not found or not 'enabled' after %d attempts (5 minutes). 
Current value: exists=%v, value=%q", + s.Runtime.VM.KubeName, annotationKey, maxAttempts, exists, annotationValue) + } + + // Exponential backoff: 1s, 2s, 4s, 8s, max 10s + sleepDuration := time.Duration(1< 10*time.Second { + sleepDuration = 10 * time.Second + } + s.T.Logf("Attempt %d/%d: annotation not ready (exists=%v, value=%q), retrying in %v...", attempt, maxAttempts, exists, annotationValue, sleepDuration) + time.Sleep(sleepDuration) + } + + // Step 2: Verify the Corefile has the hosts plugin configured + s.T.Log("Verifying Corefile contains hosts plugin configuration...") + corefileCheckScript := `set -euo pipefail +corefile="/opt/azure/containers/localdns/updated.localdns.corefile" + +echo "=== Verifying Corefile configuration ===" +echo "Checking if $corefile exists..." +if [ ! -f "$corefile" ]; then + echo "ERROR: Corefile $corefile does not exist" + exit 1 +fi +echo "✓ Corefile exists" +echo "" + +echo "Checking if Corefile contains hosts plugin directive..." +if ! grep -q "hosts /etc/localdns/hosts" "$corefile"; then + echo "ERROR: Corefile does not contain 'hosts /etc/localdns/hosts' directive" + echo "" + echo "Corefile contents:" + cat "$corefile" + exit 1 +fi +echo "✓ Found 'hosts /etc/localdns/hosts' directive in Corefile" +echo "" + +echo "Verifying hosts plugin in VnetDNS listener (169.254.10.10)..." +# Extract the VnetDNS section (.:53 block with bind 169.254.10.10) +vnetdns_section=$(awk '/.:53 \{/,/^}/' "$corefile" | sed -n '/bind 169.254.10.10/,/^}/p') +if ! 
echo "$vnetdns_section" | grep -q "hosts /etc/localdns/hosts"; then + echo "ERROR: hosts plugin not found in VnetDNS listener (169.254.10.10)" + echo "VnetDNS section:" + echo "$vnetdns_section" + exit 1 +fi +echo "✓ hosts plugin found in VnetDNS listener (169.254.10.10)" + +# Verify hosts comes before forward in VnetDNS (order matters - hosts should be checked first) +hosts_line=$(echo "$vnetdns_section" | grep -n "hosts /etc/localdns/hosts" | cut -d: -f1 | head -1) +forward_line=$(echo "$vnetdns_section" | grep -n "forward \\." | cut -d: -f1 | head -1) +if [ -n "$hosts_line" ] && [ -n "$forward_line" ] && [ "$hosts_line" -gt "$forward_line" ]; then + echo "WARNING: hosts plugin appears after forward directive in VnetDNS listener" + echo "This may prevent hosts plugin from being consulted first" +fi +echo "✓ hosts plugin is properly ordered in VnetDNS listener" +echo "" + +echo "Verifying hosts plugin in KubeDNS overrides listener (169.254.10.11)..." +# Extract the KubeDNS section (.:53 block with bind 169.254.10.11) +kubedns_section=$(awk '/.:53 \{/,/^}/' "$corefile" | sed -n '/bind 169.254.10.11/,/^}/p') +if ! echo "$kubedns_section" | grep -q "hosts /etc/localdns/hosts"; then + echo "ERROR: hosts plugin not found in KubeDNS overrides listener (169.254.10.11)" + echo "KubeDNS section:" + echo "$kubedns_section" + exit 1 +fi +echo "✓ hosts plugin found in KubeDNS overrides listener (169.254.10.11)" + +# Verify hosts comes before forward in KubeDNS (order matters) +hosts_line=$(echo "$kubedns_section" | grep -n "hosts /etc/localdns/hosts" | cut -d: -f1 | head -1) +forward_line=$(echo "$kubedns_section" | grep -n "forward \\." 
| cut -d: -f1 | head -1) +if [ -n "$hosts_line" ] && [ -n "$forward_line" ] && [ "$hosts_line" -gt "$forward_line" ]; then + echo "WARNING: hosts plugin appears after forward directive in KubeDNS listener" + echo "This may prevent hosts plugin from being consulted first" +fi +echo "✓ hosts plugin is properly ordered in KubeDNS overrides listener" +echo "" + +echo "=== Corefile validation successful ===" +echo "Summary: hosts plugin is configured in both VnetDNS (169.254.10.10) and KubeDNS (169.254.10.11) listeners" +` + + execScriptOnVMForScenarioValidateExitCode(ctx, s, corefileCheckScript, 0, + "Corefile should contain hosts plugin configuration in both VnetDNS and KubeDNS listeners") + + // Step 3: Test that localdns resolves real FQDNs from /etc/localdns/hosts + // This validates the hosts plugin is working by checking: + // 1. DNS resolution returns IPs that match entries in /etc/localdns/hosts + // 2. DNS response includes "recursion not available" flag (proves it's from hosts plugin, not forwarded upstream) + // + // We use packages.microsoft.com because it's a real FQDN that aks-hosts-setup.service populates. + // This avoids race conditions with the aks-hosts-setup.timer overwriting fake test entries. + testFQDN := "packages.microsoft.com" + s.T.Logf("Testing hosts plugin resolves %s from /etc/localdns/hosts", testFQDN) + + script := fmt.Sprintf(`set -euo pipefail +test_fqdn=%q +hosts_file="/etc/localdns/hosts" + +echo "=== Testing localdns hosts plugin functionality ===" +echo "Testing FQDN: $test_fqdn" +echo "" + +# Step 1: Get the expected IPs from /etc/localdns/hosts +echo "Reading expected IPs from $hosts_file..." +if [ ! 
-f "$hosts_file" ]; then + echo "ERROR: Hosts file $hosts_file does not exist" + exit 1 +fi + +# Extract IPv4 addresses for the test FQDN from hosts file (ignore IPv6 for simplicity) +expected_ips=$(grep -E "^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+[[:space:]]+$test_fqdn" "$hosts_file" | awk '{print $1}' | sort) +if [ -z "$expected_ips" ]; then + echo "ERROR: No IPv4 entries found for $test_fqdn in $hosts_file" + echo "Hosts file contents:" + sudo cat "$hosts_file" + exit 1 +fi + +echo "Expected IPs from hosts file:" +echo "$expected_ips" +echo "" + +# Step 2: Query localdns and get the resolved IPs +echo "Querying localdns for $test_fqdn at 169.254.10.10..." +resolved_ips=$(dig "$test_fqdn" @169.254.10.10 +short -t A +timeout=5 +tries=2 2>/dev/null | sort) +if [ -z "$resolved_ips" ]; then + echo "ERROR: No IPs returned from localdns query" + echo "Full dig output:" + dig "$test_fqdn" @169.254.10.10 +timeout=5 +tries=2 || true + exit 1 +fi + +echo "Resolved IPs from localdns:" +echo "$resolved_ips" +echo "" + +# Step 3: Verify the resolved IPs match the hosts file entries +echo "Comparing resolved IPs with hosts file entries..." +if [ "$expected_ips" != "$resolved_ips" ]; then + echo "ERROR: Resolved IPs do not match hosts file entries" + echo "Expected (from hosts file):" + echo "$expected_ips" + echo "Got (from localdns):" + echo "$resolved_ips" + exit 1 +fi +echo "✓ Resolved IPs match hosts file entries" +echo "" + +# Step 4: Verify "recursion not available" flag in DNS response +# This proves the response came from the hosts plugin, not from forwarding to upstream DNS +# Note: We use nslookup without explicit server IP to preserve the recursion flag message +echo "Checking for 'recursion not available' flag in DNS response..." +nslookup_output=$(nslookup "$test_fqdn" 2>&1) +if ! 
echo "$nslookup_output" | grep -q "recursion not available"; then + echo "ERROR: Expected 'recursion not available' flag in DNS response" + echo "This indicates localdns forwarded the query upstream instead of using the hosts plugin" + echo "" + echo "Full nslookup output:" + echo "$nslookup_output" + exit 1 +fi +echo "✓ Found 'recursion not available' flag in DNS response" +echo "" + +echo "=== SUCCESS ===" +echo "The localdns hosts plugin is working correctly:" +echo " 1. DNS resolution returned IPs from /etc/localdns/hosts" +echo " 2. Response included 'recursion not available' (not forwarded upstream)" +echo "" +echo "Full nslookup output:" +echo "$nslookup_output" +`, testFQDN) + + execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, + "localdns should resolve FQDN from hosts file with recursion not available") +} + // ValidateJournalctlOutput checks if specific content exists in the systemd service logs func ValidateJournalctlOutput(ctx context.Context, s *Scenario, serviceName string, expectedContent string) { s.T.Helper() diff --git a/e2e/vmss.go b/e2e/vmss.go index 50cb0a1141d..02b0d994ac4 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -209,6 +209,17 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine require.NoError(s.T, err) cse = nodeBootstrapping.CSE customData = nodeBootstrapping.CustomData + + // For MockUnknownCloud, inject an unsupported cloud name into the CSE script + // to test that aks-hosts-setup.sh gracefully handles unrecognized clouds + if s.Tags.MockUnknownCloud { + s.T.Log("E2E: Injecting TARGET_CLOUD=UnsupportedCloudE2ETest override into CSE script") + cse = strings.Replace(cse, + `TARGET_ENVIRONMENT="`, + `TARGET_CLOUD="UnsupportedCloudE2ETest" # E2E override for testing unsupported cloud`+"\n"+`TARGET_ENVIRONMENT="`, + 1) + } + if len(s.Config.CustomDataWriteFiles) > 0 { customData, err = injectWriteFilesEntriesToCustomData(customData, s.Config.CustomDataWriteFiles) require.NoError(s.T, err, 
"failed to inject customData write_files entries") diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.service b/parts/linux/cloud-init/artifacts/aks-hosts-setup.service new file mode 100644 index 00000000000..b207d9edb14 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.service @@ -0,0 +1,14 @@ +[Unit] +Description=Populate /etc/localdns/hosts with critical AKS FQDN addresses +After=network-online.target +Wants=network-online.target +Before=kubelet.service localdns.service + +[Service] +Type=oneshot +TimeoutStartSec=60 +EnvironmentFile=-/etc/localdns/cloud-env +ExecStart=/opt/azure/containers/aks-hosts-setup.sh + +[Install] +WantedBy=multi-user.target diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh b/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh new file mode 100644 index 00000000000..cee5a82dde4 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.sh @@ -0,0 +1,243 @@ +#!/bin/bash +set -euo pipefail + +# aks-hosts-setup.sh +# Resolves A and AAAA records for critical AKS FQDNs and populates /etc/localdns/hosts. +# TARGET_CLOUD is set by CSE (cse_cmd.sh) and persisted via /etc/localdns/cloud-env +# as a systemd EnvironmentFile so it's available on both initial and timer-triggered runs. + +HOSTS_FILE="/etc/localdns/hosts" + +# Ensure the directory exists +mkdir -p "$(dirname "$HOSTS_FILE")" + +# Use TARGET_CLOUD directly. It's available from: +# 1. CSE environment (initial run from enableAKSHostsSetup) +# 2. Systemd EnvironmentFile (timer-triggered runs via aks-hosts-setup.service) +# If TARGET_CLOUD is not set, exit immediately - we must not guess the cloud environment +# as this could cache incorrect DNS entries in the hosts file. +if [ -z "${TARGET_CLOUD:-}" ]; then + echo "ERROR: TARGET_CLOUD is not set. Cannot determine which FQDNs to resolve." + echo "This likely means the cloud environment file is missing or CSE did not set TARGET_CLOUD." 
+ echo "Exiting without modifying hosts file to avoid caching incorrect DNS entries." + exit 1 +fi +local_cloud="${TARGET_CLOUD}" + +# Select critical FQDNs based on the cloud environment. +# Each cloud has its own service endpoints for container registry, identity, ARM, and packages. +# This mirrors the cloud detection in GetCloudTargetEnv (pkg/agent/datamodel/sig_config.go). + +# FQDNs common to all clouds. +COMMON_FQDNS=( + "packages.microsoft.com" # Microsoft packages +) + +# Cloud-specific FQDNs. +case "${local_cloud}" in + AzureChinaCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.azure.cn" # Container registry (China)(New) + "mcr.azk8s.cn" # Container registry (China)(Old, migrating from this to mcr.azure.cn) + "login.partner.microsoftonline.cn" # Azure AD (China) + "management.chinacloudapi.cn" # ARM (China) + ) + ;; + AzureUSGovernmentCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.microsoft.com" # Container registry + "login.microsoftonline.us" # Azure AD (US Gov) + "management.usgovcloudapi.net" # ARM (US Gov) + "packages.aks.azure.com" # AKS packages + ) + ;; + AzurePublicCloud) + CLOUD_FQDNS=( + "acs-mirror.azureedge.net" # K8s binaries mirror + "mcr.microsoft.com" # Container registry + "login.microsoftonline.com" # Azure AD / Entra ID + "management.azure.com" # ARM + "packages.aks.azure.com" # AKS packages + ) + ;; + *) + # Unsupported cloud environment - exit with error + echo "ERROR: The following cloud is not supported: ${local_cloud}" + echo "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + exit 1 + ;; +esac + +# Combine common + cloud-specific FQDNs. +CRITICAL_FQDNS=("${COMMON_FQDNS[@]}" "${CLOUD_FQDNS[@]}") + +echo "Detected cloud environment: ${local_cloud}" + +# Function to resolve IPv4 addresses for a domain +# Filters output to only include valid IPv4 addresses (rejects NXDOMAIN, SERVFAIL, hostnames, etc.) 
+resolve_ipv4() { + local domain="$1" + local output + output=$(timeout 3 nslookup -type=A "${domain}" 2>/dev/null) || return 0 + # Parse Address lines (skip server address with #), validate IPv4 format with octet range 0-255 + echo "${output}" | awk '/^Address: / && !/^Address: .*#/ {print $2}' | grep -E '^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$' | while IFS='.' read -r a b c d; do + if [ "$a" -le 255 ] && [ "$b" -le 255 ] && [ "$c" -le 255 ] && [ "$d" -le 255 ]; then + echo "${a}.${b}.${c}.${d}" + fi + done + return 0 +} + +# Function to resolve IPv6 addresses for a domain +# Filters output to only include valid IPv6 addresses (rejects NXDOMAIN, SERVFAIL, hostnames, etc.) +resolve_ipv6() { + local domain="$1" + local output + output=$(timeout 3 nslookup -type=AAAA "${domain}" 2>/dev/null) || return 0 + # Parse Address lines (skip server address with #), validate IPv6 format + # Require at least two colons and min 7 chars to reject strings like "1:2" or ":ff" + echo "${output}" | awk '/^Address: / && !/^Address: .*#/ {print $2}' | grep -E '^[0-9a-fA-F:]{7,}$' | grep ':.*:' || return 0 +} + +echo "Starting AKS critical FQDN hosts resolution at $(date)" + +# Track if we resolved at least one address +RESOLVED_ANY=false + +# Start building the hosts file content +HOSTS_CONTENT="# AKS critical FQDN addresses resolved at $(date) +# This file is automatically generated by aks-hosts-setup.service +" + +# Resolve each FQDN +for DOMAIN in "${CRITICAL_FQDNS[@]}"; do + echo "Resolving addresses for ${DOMAIN}..." 
+ + # Get IPv4 and IPv6 addresses using helper functions + IPV4_ADDRS=$(resolve_ipv4 "${DOMAIN}") + IPV6_ADDRS=$(resolve_ipv6 "${DOMAIN}") + + # Check if we got any results for this domain + if [ -z "${IPV4_ADDRS}" ] && [ -z "${IPV6_ADDRS}" ]; then + echo " WARNING: No IP addresses resolved for ${DOMAIN}" + continue + fi + + RESOLVED_ANY=true + HOSTS_CONTENT+=" +# ${DOMAIN}" + + if [ -n "${IPV4_ADDRS}" ]; then + for addr in ${IPV4_ADDRS}; do + HOSTS_CONTENT+=" +${addr} ${DOMAIN}" + done + fi + + if [ -n "${IPV6_ADDRS}" ]; then + for addr in ${IPV6_ADDRS}; do + HOSTS_CONTENT+=" +${addr} ${DOMAIN}" + done + fi +done + +# Check if we resolved at least one domain +if [ "${RESOLVED_ANY}" != "true" ]; then + echo "WARNING: No IP addresses resolved for any domain at $(date)" + echo "This is likely a temporary DNS issue. Timer will retry later." + # Keep existing hosts file intact and exit successfully so systemd doesn't mark unit as failed + exit 0 +fi + +# Write the hosts file atomically: write to a temp file in the same directory, +# validate it, then rename it over the target. rename(2) on the same filesystem +# is atomic, so CoreDNS (or any other reader) never sees invalid or truncated data. +echo "Writing addresses to ${HOSTS_FILE}..." +HOSTS_TMP="${HOSTS_FILE}.tmp.$$" + +# Write content to temp file with explicit error checking +if ! echo "${HOSTS_CONTENT}" > "${HOSTS_TMP}"; then + echo "ERROR: Failed to write to temporary file ${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +# Set permissions with explicit error checking +if ! chmod 0644 "${HOSTS_TMP}"; then + echo "ERROR: Failed to chmod temporary file ${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +# Validate temp file BEFORE moving into place to ensure we never publish invalid data +# Verify the file was written and has content +if [ ! 
-s "${HOSTS_TMP}" ]; then + echo "ERROR: Temporary hosts file ${HOSTS_TMP} is empty or does not exist after write" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +# Verify that every non-comment, non-empty line has the format: IP FQDN +# This ensures we don't have any lines with FQDN but missing IP address +echo "Validating hosts file entries format..." +INVALID_LINES=() +VALID_ENTRIES=0 +while IFS= read -r line; do + # Skip comments and empty lines + [[ "$line" =~ ^[[:space:]]*# ]] || [[ -z "$line" ]] && continue + + # Check if line has at least two fields (IP and FQDN) + ip=$(echo "$line" | awk '{print $1}') + fqdn=$(echo "$line" | awk '{print $2}') + + # Critical check: ensure we have both IP and FQDN (no empty IP mappings) + if [ -z "$ip" ] || [ -z "$fqdn" ]; then + echo "ERROR: Invalid entry found - missing IP or FQDN: '$line'" + INVALID_LINES+=("$line") + continue + fi + + # Validate IP format (IPv4 or IPv6) + if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # Valid IPv4 + VALID_ENTRIES=$((VALID_ENTRIES + 1)) + elif [[ "$ip" =~ : ]]; then + # Valid IPv6 (contains colon) + VALID_ENTRIES=$((VALID_ENTRIES + 1)) + else + echo "ERROR: Invalid IP format: '$ip' in line: '$line'" + INVALID_LINES+=("$line") + fi +done < "${HOSTS_TMP}" + +if [ ${#INVALID_LINES[@]} -gt 0 ]; then + echo "ERROR: Found ${#INVALID_LINES[@]} invalid entries in temporary hosts file" + echo "Invalid entries:" + printf '%s\n' "${INVALID_LINES[@]}" + echo "This indicates FQDN to empty IP mappings or malformed entries" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +if [ $VALID_ENTRIES -eq 0 ]; then + echo "ERROR: No valid IP address mappings found in temporary hosts file" + echo "File content:" + cat "${HOSTS_TMP}" + rm -f "${HOSTS_TMP}" + exit 1 +fi + +echo "✓ All entries in temporary hosts file are valid (IP FQDN format)" +echo "Found ${VALID_ENTRIES} valid IP address mappings" + +# Atomic rename with explicit error checking - only done after validation passes +if ! 
mv "${HOSTS_TMP}" "${HOSTS_FILE}"; then + echo "ERROR: Failed to move temporary file to ${HOSTS_FILE}" + rm -f "${HOSTS_TMP}" # Clean up temp file + exit 1 +fi + +echo "AKS critical FQDN hosts resolution completed at $(date)" diff --git a/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer b/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer new file mode 100644 index 00000000000..281880160f9 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/aks-hosts-setup.timer @@ -0,0 +1,13 @@ +[Unit] +Description=Run AKS hosts setup periodically + +[Timer] +# Run immediately on boot +OnBootSec=0 +# Run 15 minutes after the last activation (AKS critical FQDN IPs don't change frequently) +OnUnitActiveSec=15min +# Timer accuracy (how much systemd can delay) +AccuracySec=1min + +[Install] +WantedBy=timers.target diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index bc48088a3b8..a7452e7cf76 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -181,9 +181,11 @@ MCR_REPOSITORY_BASE="{{GetMCRRepositoryBase}}" ENABLE_IMDS_RESTRICTION="{{EnableIMDSRestriction}}" INSERT_IMDS_RESTRICTION_RULE_TO_MANGLE_TABLE="{{InsertIMDSRestrictionRuleToMangleTable}}" SHOULD_ENABLE_LOCALDNS="{{ShouldEnableLocalDNS}}" +SHOULD_ENABLE_HOSTS_PLUGIN="{{ShouldEnableHostsPlugin}}" LOCALDNS_CPU_LIMIT="{{GetLocalDNSCPULimitInPercentage}}" LOCALDNS_MEMORY_LIMIT="{{GetLocalDNSMemoryLimitInMB}}" LOCALDNS_GENERATED_COREFILE="{{GetGeneratedLocalDNSCoreFile}}" +LOCALDNS_GENERATED_COREFILE_NO_HOSTS="{{GetGeneratedLocalDNSCoreFileNoHosts}}" PRE_PROVISION_ONLY="{{GetPreProvisionOnly}}" CSE_TIMEOUT="{{GetCSETimeout}}" /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index ca6629b5b40..09b59de55ed 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ 
b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1245,18 +1245,41 @@ LOCALDNS_SLICE_FILE="/etc/systemd/system/localdns.slice" # It creates the localdns corefile and slicefile, then enables and starts localdns. # In this function, generated base64 encoded localdns corefile is decoded and written to the corefile path. # This function also creates the localdns slice file with memory and cpu limits, that will be used by localdns systemd unit. +# generateLocalDNSFiles creates the localdns corefile and slice file. +# Usage: generateLocalDNSFiles [corefile_base64] +# corefile_base64: optional base64-encoded corefile content to use. +# If not provided, falls back to LOCALDNS_GENERATED_COREFILE. generateLocalDNSFiles() { + local corefile_content="${1:-${LOCALDNS_GENERATED_COREFILE}}" + mkdir -p "$(dirname "${LOCALDNS_CORE_FILE}")" touch "${LOCALDNS_CORE_FILE}" chmod 0644 "${LOCALDNS_CORE_FILE}" - echo "${LOCALDNS_GENERATED_COREFILE}" | base64 -d > "${LOCALDNS_CORE_FILE}" || exit $ERR_LOCALDNS_FAIL + base64 -d <<< "${corefile_content}" > "${LOCALDNS_CORE_FILE}" || exit $ERR_LOCALDNS_FAIL + + # Log whether the generated corefile includes hosts plugin + if grep -q "hosts /etc/localdns/hosts" "${LOCALDNS_CORE_FILE}"; then + echo "Generated corefile at ${LOCALDNS_CORE_FILE} INCLUDES hosts plugin" + else + echo "Generated corefile at ${LOCALDNS_CORE_FILE} DOES NOT include hosts plugin" + fi # Create environment file for corefile regeneration. # This file will be referenced by localdns.service using EnvironmentFile directive. + # Save BOTH corefile variants so localdns can dynamically choose on each restart. + # + # Naming note: + # - LOCALDNS_BASE64_ENCODED_COREFILE (legacy key): stores whichever variant was selected + # as the initial default (currently the no-hosts variant from CSE). + # - LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS: explicit with-hosts variant for dynamic selection. 
+ # - LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS: explicit no-hosts variant for dynamic selection. LOCALDNS_ENV_FILE="/etc/localdns/environment" mkdir -p "$(dirname "${LOCALDNS_ENV_FILE}")" cat > "${LOCALDNS_ENV_FILE}" </dev/null && echo 'WITH hosts plugin' || echo 'WITHOUT hosts plugin')" echo "localdns should be enabled." systemctlEnableAndStart localdns 30 || exit $ERR_LOCALDNS_FAIL echo "Enable localdns succeeded." } +# This function enables and starts the aks-hosts-setup timer. +# The timer periodically resolves critical AKS FQDN DNS records and populates /etc/localdns/hosts. +# The caller in cse_main.sh checks /etc/localdns/hosts content directly to decide +# which corefile to use, so this function does not need to signal success/failure. +enableAKSHostsSetup() { + # Best-effort setup: log errors but never fail. + # The corefile will fall back to the no-hosts variant if hosts file is empty. + # Allow overriding paths for testing (via environment variables) + local hosts_file="${AKS_HOSTS_FILE:-/etc/localdns/hosts}" + local hosts_setup_script="${AKS_HOSTS_SETUP_SCRIPT:-/opt/azure/containers/aks-hosts-setup.sh}" + local hosts_setup_service="${AKS_HOSTS_SETUP_SERVICE:-/etc/systemd/system/aks-hosts-setup.service}" + local hosts_setup_timer="${AKS_HOSTS_SETUP_TIMER:-/etc/systemd/system/aks-hosts-setup.timer}" + local cloud_env_file="${AKS_CLOUD_ENV_FILE:-/etc/localdns/cloud-env}" + + # Guard: verify required artifacts exist on this VHD. + # Older VHDs (or certain build modes) may not include them. + if [ ! -f "${hosts_setup_script}" ]; then + echo "Warning: ${hosts_setup_script} not found on this VHD, skipping aks-hosts-setup" + return + fi + if [ ! -x "${hosts_setup_script}" ]; then + echo "Warning: ${hosts_setup_script} is not executable, skipping aks-hosts-setup" + return + fi + if [ ! -f "${hosts_setup_service}" ]; then + echo "Warning: ${hosts_setup_service} not found on this VHD, skipping aks-hosts-setup" + return + fi + if [ ! 
-f "${hosts_setup_timer}" ]; then + echo "Warning: ${hosts_setup_timer} not found on this VHD, skipping aks-hosts-setup" + return + fi + + # Write the cloud environment as a systemd EnvironmentFile so aks-hosts-setup.sh + # can use $TARGET_CLOUD directly — both when called from CSE (already in env) and + # when triggered by the systemd timer (injected via EnvironmentFile= in the .service unit). + if [ -z "${TARGET_CLOUD:-}" ]; then + echo "WARNING: TARGET_CLOUD is not set. Cannot run aks-hosts-setup without knowing cloud environment." + echo "aks-hosts-setup requires TARGET_CLOUD to determine which FQDNs to resolve." + echo "Skipping aks-hosts-setup. Corefile will fall back to version without hosts plugin." + return + fi + + # Validate that TARGET_CLOUD is one of the supported clouds + # This must match the case statement in aks-hosts-setup.sh + case "${TARGET_CLOUD}" in + AzurePublicCloud|AzureChinaCloud|AzureUSGovernmentCloud) + # Supported cloud, continue + ;; + *) + echo "WARNING: The following cloud is not supported by aks-hosts-setup: ${TARGET_CLOUD}" + echo "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + echo "Skipping aks-hosts-setup. Corefile will fall back to version without hosts plugin." + return + ;; + esac + + echo "Setting TARGET_CLOUD=${TARGET_CLOUD} for aks-hosts-setup" + mkdir -p "$(dirname "${cloud_env_file}")" + echo "TARGET_CLOUD=${TARGET_CLOUD}" > "${cloud_env_file}" + chmod 0644 "${cloud_env_file}" + + # Create an empty hosts file so the localdns hosts plugin can start watching it + # immediately. The file will be populated by aks-hosts-setup timer asynchronously. + mkdir -p "$(dirname "${hosts_file}")" + touch "${hosts_file}" + chmod 0644 "${hosts_file}" + + # Enable the timer for periodic refresh (every 15 minutes) + # This will update the hosts file with fresh IPs from live DNS + echo "Enabling aks-hosts-setup timer..." 
+ if systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30; then + echo "aks-hosts-setup timer enabled successfully." + else + echo "Warning: Failed to enable aks-hosts-setup timer" + fi +} + configureManagedGPUExperience() { if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then return diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 0225bfd0944..882cd952fba 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -294,8 +294,18 @@ EOF logs_to_events "AKS.CSE.ensureContainerd.ensureArtifactStreaming" ensureArtifactStreaming || exit $ERR_ARTIFACT_STREAMING_INSTALL fi + # Enable aks-hosts-setup to populate /etc/localdns/hosts with resolved AKS FQDN IPs. + # Startup ordering: aks-hosts-setup runs async via timer; localdns starts immediately + # with the no-hosts corefile. On subsequent restarts, localdns.sh dynamically selects + # the hosts-plugin variant if /etc/localdns/hosts has been populated by the timer. + if [ "${SHOULD_ENABLE_LOCALDNS}" = "true" ] && [ "${SHOULD_ENABLE_HOSTS_PLUGIN}" = "true" ]; then + logs_to_events "AKS.CSE.enableAKSHostsSetup" enableAKSHostsSetup + fi + if [ "${SHOULD_ENABLE_LOCALDNS}" = "true" ]; then - logs_to_events "AKS.CSE.enableLocalDNS" enableLocalDNS || exit $ERR_LOCALDNS_FAIL + # Pass the no-hosts corefile as initial default. + # Both corefile variants are saved in /etc/localdns/environment for dynamic selection. 
+ logs_to_events "AKS.CSE.enableLocalDNS" enableLocalDNS "${LOCALDNS_GENERATED_COREFILE_NO_HOSTS}" || exit $ERR_LOCALDNS_FAIL fi if [ "${ID}" != "mariner" ] && [ "${ID}" != "azurelinux" ]; then diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index f05e8c3837c..586dccbd121 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -127,15 +127,37 @@ verify_localdns_binary() { # Regenerate the localdns corefile from base64 encoded content. # This is used when the corefile goes missing. regenerate_localdns_corefile() { - if [ -z "${LOCALDNS_BASE64_ENCODED_COREFILE:-}" ]; then - echo "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + # Dynamically select which corefile variant to use based on current state. + # This allows localdns to switch from no-hosts to hosts-plugin variant if: + # 1. SHOULD_ENABLE_HOSTS_PLUGIN is true, AND + # 2. /etc/localdns/hosts now exists and has valid content + # This provides recovery from initial CSE timeout scenarios. + + local corefile_to_use + + if [ -n "${LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS:-}" ] && \ + [ -n "${LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS:-}" ]; then + # Both corefile variants are available - do dynamic selection + echo "Both corefile variants available, selecting based on current state..." + corefile_to_use=$(select_localdns_corefile \ + "${SHOULD_ENABLE_HOSTS_PLUGIN}" \ + "${LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS}" \ + "${LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS}" \ + "/etc/localdns/hosts") + elif [ -n "${LOCALDNS_BASE64_ENCODED_COREFILE:-}" ]; then + # Fallback to legacy single corefile for backward compatibility + echo "Using legacy LOCALDNS_BASE64_ENCODED_COREFILE (no dynamic selection)" + corefile_to_use="${LOCALDNS_BASE64_ENCODED_COREFILE}" + else + echo "No corefile variants available in environment. Cannot regenerate corefile." 
return 1 fi + echo "Regenerating localdns corefile at ${LOCALDNS_CORE_FILE}" mkdir -p "$(dirname "${LOCALDNS_CORE_FILE}")" # Decode base64 corefile content and write to corefile. - if ! echo "${LOCALDNS_BASE64_ENCODED_COREFILE}" | base64 -d > "${LOCALDNS_CORE_FILE}"; then + if ! echo "${corefile_to_use}" | base64 -d > "${LOCALDNS_CORE_FILE}"; then echo "Failed to decode and write corefile." return 1 fi @@ -368,6 +390,104 @@ wait_for_localdns_ready() { return 0 } +# Set node annotation to indicate hosts plugin is in use if the hosts file has contents. +annotate_node_with_hosts_plugin_status() { + # Check if the running localdns corefile actually contains the hosts plugin block. + # This is the ground truth - we check the actual corefile being used by the service, + # not just what was selected during CSE, in case the file was modified or regenerated. + local corefile_path="${UPDATED_LOCALDNS_CORE_FILE:-/opt/azure/containers/localdns/updated.localdns.corefile}" + + if [ ! -f "${corefile_path}" ]; then + echo "Localdns corefile not found at ${corefile_path}, skipping annotation." + return 0 + fi + + # Check if the corefile contains the hosts plugin block + if ! grep -q "hosts /etc/localdns/hosts" "${corefile_path}"; then + echo "Localdns corefile does not contain hosts plugin block, skipping annotation." + return 0 + fi + + # Additionally verify that the hosts file exists and has content + # Allow overriding for testing via LOCALDNS_HOSTS_FILE environment variable + local hosts_file="${LOCALDNS_HOSTS_FILE:-/etc/localdns/hosts}" + if [ ! -f "${hosts_file}" ]; then + echo "Hosts file does not exist at ${hosts_file}, skipping annotation despite corefile having hosts plugin." + return 0 + fi + + if ! grep -qE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file}"; then + echo "Hosts file exists but has no IP mappings, skipping annotation." 
+ return 0 + fi + + echo "Localdns is using hosts plugin and hosts file has $(grep -cE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file}" 2>/dev/null || echo 0) entries." + + # Only proceed if we have the necessary kubectl binary and configuration + if [ ! -x /opt/bin/kubectl ]; then + echo "kubectl binary not found at /opt/bin/kubectl, skipping annotation." + return 0 + fi + + local kubeconfig="${KUBECONFIG:-/var/lib/kubelet/kubeconfig}" + # Wait for kubelet to finish TLS bootstrapping and create the kubeconfig file + # This is necessary because localdns starts in basePrep(), before kubelet starts in nodePrep() + local wait_count=0 + local max_wait="${KUBECONFIG_WAIT_ATTEMPTS:-60}" # Default: wait up to 3 minutes (60 * 3 seconds), but configurable for testing + while [ ! -f "${kubeconfig}" ]; do + if [ $wait_count -ge $max_wait ]; then + echo "Timeout waiting for kubeconfig at ${kubeconfig} after ${max_wait} attempts, skipping annotation." + return 0 + fi + echo "Waiting for TLS bootstrapping to complete (attempt $((wait_count + 1))/${max_wait})..." + sleep 3 + wait_count=$((wait_count + 1)) + done + echo "Kubeconfig found at ${kubeconfig}" + + # Get node name + local node_name + node_name=$(hostname) + if [ -z "${node_name}" ]; then + echo "Cannot get node name, skipping annotation." + return 0 + fi + + # Azure cloud provider assigns node name as the lower case of the hostname + node_name=$(echo "$node_name" | tr '[:upper:]' '[:lower:]') + + # Wait for node to be registered in the cluster + # The kubeconfig exists but the node might not be registered yet + echo "Waiting for node ${node_name} to be registered in the cluster..." 
+ local node_wait_count=0 + local max_node_wait="${NODE_REGISTRATION_WAIT_ATTEMPTS:-30}" # Default: wait up to 90 seconds (30 * 3 seconds) + while [ $node_wait_count -lt $max_node_wait ]; do + if /opt/bin/kubectl --kubeconfig "${kubeconfig}" get node "${node_name}" >/dev/null 2>&1; then + echo "Node ${node_name} is registered in the cluster." + break + fi + echo "Waiting for node registration (attempt $((node_wait_count + 1))/${max_node_wait})..." + sleep 3 + node_wait_count=$((node_wait_count + 1)) + done + + # Check if we timed out waiting for node registration + if [ $node_wait_count -ge $max_node_wait ]; then + echo "Timeout waiting for node ${node_name} to be registered after ${max_node_wait} attempts, skipping annotation." + return 0 + fi + + # Set annotation to indicate hosts plugin is in use + echo "Setting annotation to indicate hosts plugin is in use for node ${node_name}." + if /opt/bin/kubectl --kubeconfig "${kubeconfig}" annotate --overwrite node "${node_name}" kubernetes.azure.com/localdns-hosts-plugin=enabled; then + echo "Successfully set hosts plugin annotation." + else + echo "Warning: Failed to set hosts plugin annotation (this is non-fatal)." + fi + + return 0 +} + # Add iptables rules to skip conntrack for DNS traffic to localdns. add_iptable_rules_to_skip_conntrack_from_pods(){ # Check if the localdns interface already exists and delete it. @@ -626,10 +746,87 @@ start_localdns_watchdog() { fi } +select_localdns_corefile() { + local should_enable_hosts_plugin="${1}" + local corefile_with_hosts="${2}" + local corefile_no_hosts="${3}" + local hosts_file_path="${4}" + local timeout="${5:-0}" # Default to 0 (no wait) for restarts; can be overridden for initial CSE + + echo "LocalDNS corefile selection: SHOULD_ENABLE_HOSTS_PLUGIN=${should_enable_hosts_plugin:-}" >&2 + + if [ "${should_enable_hosts_plugin}" = "true" ]; then + echo "Hosts plugin is enabled, checking ${hosts_file_path} for content..." 
>&2 + + # During initial CSE, caller may set timeout > 0 to wait for aks-hosts-setup + # During restarts, timeout defaults to 0 (check immediately) + local wait_interval=5 + local elapsed=0 + + while [ $elapsed -le $timeout ]; do + if [ -f "${hosts_file_path}" ]; then + if grep -qE '^[0-9a-fA-F.:]+[[:space:]]+[a-zA-Z]' "${hosts_file_path}"; then + if [ $elapsed -eq 0 ]; then + echo "Hosts file has IP mappings, using corefile with hosts plugin" >&2 + else + echo "aks-hosts-setup produced hosts file with IP mappings after ${elapsed}s, using corefile with hosts plugin" >&2 + fi + echo "${corefile_with_hosts}" + return 0 + fi + fi + + # If timeout is 0, don't wait - check once and fall through + if [ $timeout -eq 0 ]; then + break + fi + + if [ $elapsed -eq 0 ]; then + echo "Waiting for aks-hosts-setup to populate ${hosts_file_path} (timeout: ${timeout}s)..." >&2 + fi + + sleep $wait_interval + elapsed=$((elapsed + wait_interval)) + done + + # Timeout reached or hosts file not ready - check final state and fall back + if [ -f "${hosts_file_path}" ]; then + if [ $timeout -gt 0 ]; then + echo "Warning: ${hosts_file_path} exists but has no IP mappings after ${timeout}s timeout, falling back to corefile without hosts plugin" >&2 + else + echo "Info: ${hosts_file_path} exists but has no IP mappings yet, falling back to corefile without hosts plugin" >&2 + fi + else + if [ $timeout -gt 0 ]; then + echo "Warning: ${hosts_file_path} does not exist after ${timeout}s timeout, falling back to corefile without hosts plugin" >&2 + else + echo "Info: ${hosts_file_path} does not exist yet, falling back to corefile without hosts plugin" >&2 + fi + fi + echo "${corefile_no_hosts}" + return 0 + else + echo "Hosts plugin is not enabled (SHOULD_ENABLE_HOSTS_PLUGIN != 'true'), using corefile without hosts plugin" >&2 + echo "${corefile_no_hosts}" + return 0 + fi +} + ${__SOURCED__:+return} # --------------------------------------- Main Execution starts here 
-------------------------------------------------- +# Regenerate corefile on every startup to enable dynamic variant selection. +# --------------------------------------------------------------------------------------------------------------------- +# This allows switching between WITH_HOSTS and NO_HOSTS variants based on current state. +# On restarts, if /etc/localdns/hosts has been populated by aks-hosts-setup timer, +# localdns will automatically switch to the hosts-plugin variant. +# Note: select_localdns_corefile is called with timeout=0 (default), meaning it checks +# the hosts file once and falls back to the no-hosts variant immediately if missing/empty. +# This is intentional — we don't block localdns startup waiting for DNS resolution. +# The aks-hosts-setup timer will populate the hosts file, and the next restart will pick it up. +regenerate_localdns_corefile || exit $ERR_LOCALDNS_COREFILE_NOTFOUND + # Verify localdns required files exists. # --------------------------------------------------------------------------------------------------------------------- # Verify that generated corefile exists and is not empty. @@ -708,6 +905,13 @@ echo "Updating network DNS configuration to point to localdns via ${NETWORK_DROP disable_dhcp_use_clusterlistener || exit $ERR_LOCALDNS_FAIL echo "Startup complete - serving node and pod DNS traffic." +# Set node annotation to indicate hosts plugin is in use (if applicable). +# -------------------------------------------------------------------------------------------------------------------- +# Run annotation in background to avoid blocking CSE completion +# The annotation is a best-effort operation that should not delay node provisioning +annotate_node_with_hosts_plugin_status & +echo "Started hosts plugin annotation in background (PID: $!)" + # Systemd notify: send ready if service is Type=notify. 
# -------------------------------------------------------------------------------------------------------------------- if [ -n "${NOTIFY_SOCKET:-}" ]; then diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 4f3d0e6364c..fc977ac07cb 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1223,13 +1223,23 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration "ShouldEnableLocalDNS": func() bool { return profile.ShouldEnableLocalDNS() }, + "ShouldEnableHostsPlugin": func() bool { + return profile.ShouldEnableHostsPlugin() + }, "GetGeneratedLocalDNSCoreFile": func() (string, error) { - output, err := GenerateLocalDNSCoreFile(config, profile, localDNSCoreFileTemplateString) + output, err := GenerateLocalDNSCoreFile(config, profile, true) if err != nil { return "", fmt.Errorf("failed generate corefile for localdns using template: %w", err) } return base64.StdEncoding.EncodeToString([]byte(output)), nil }, + "GetGeneratedLocalDNSCoreFileNoHosts": func() (string, error) { + output, err := GenerateLocalDNSCoreFile(config, profile, false) + if err != nil { + return "", fmt.Errorf("failed generate corefile (no hosts) for localdns using template: %w", err) + } + return base64.StdEncoding.EncodeToString([]byte(output)), nil + }, "GetLocalDNSCPULimitInPercentage": func() string { return profile.GetLocalDNSCPULimitInPercentage() }, @@ -1804,16 +1814,19 @@ func containerdConfigFromTemplate( // ----------------------- Start of changes related to localdns ------------------------------------------. // Parse and generate localdns Corefile from template and LocalDNSProfile. +// includeHostsPlugin controls whether the hosts plugin blocks for caching critical AKS FQDNs +// are included in the generated Corefile. When false, the same template is rendered without +// the hosts blocks, used as a fallback when enableAKSHostsSetup fails at provisioning time. 
func GenerateLocalDNSCoreFile( config *datamodel.NodeBootstrappingConfiguration, profile *datamodel.AgentPoolProfile, - tmpl string, + includeHostsPlugin bool, ) (string, error) { parameters := getParameters(config) variables := getCustomDataVariables(config) bakerFuncMap := getBakerFuncMap(config, parameters, variables) - if profile.LocalDNSProfile == nil || !profile.ShouldEnableLocalDNS() { + if profile == nil || profile.LocalDNSProfile == nil || !profile.ShouldEnableLocalDNS() { return "", nil } @@ -1821,7 +1834,11 @@ func GenerateLocalDNSCoreFile( "hasSuffix": strings.HasSuffix, } localDNSCoreFileData := profile.GetLocalDNSCoreFileData() - localDNSCorefileTemplate := template.Must(template.New("localdnscorefile").Funcs(bakerFuncMap).Funcs(funcMapForHasSuffix).Parse(tmpl)) + localDNSCoreFileData.IncludeHostsPlugin = includeHostsPlugin + localDNSCorefileTemplate, err := template.New("localdnscorefile").Funcs(bakerFuncMap).Funcs(funcMapForHasSuffix).Parse(localDNSCoreFileTemplateString) + if err != nil { + return "", fmt.Errorf("failed to parse localdns corefile template: %w", err) + } // Generate the Corefile content. var corefileBuffer bytes.Buffer @@ -1834,6 +1851,10 @@ func GenerateLocalDNSCoreFile( } // Template to create corefile that will be used by localdns service. +// When IncludeHostsPlugin is true, the hosts plugin blocks for caching critical AKS FQDNs +// (mcr.microsoft.com, packages.aks.azure.com, etc.) are included in root domain server blocks. +// When false, hosts blocks are omitted — used as a fallback when enableAKSHostsSetup fails at +// provisioning time, following the same dual-config pattern used for containerd GPU/no-GPU configs. const localDNSCoreFileTemplateString = ` # *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. 
@@ -1860,6 +1881,12 @@ health-check.localdns.local:53 { log {{- end }} bind {{$.NodeListenerIP}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $isRootDomain}} forward . {{$.AzureDNSIP}} { {{- else}} @@ -1921,6 +1948,12 @@ health-check.localdns.local:53 { log {{- end }} bind {{$.ClusterListenerIP}} + {{- if and $isRootDomain $.IncludeHostsPlugin}} + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } + {{- end}} {{- if $fwdToClusterCoreDNS}} forward . {{$.CoreDNSServiceIP}} { {{- else}} diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index a83405d7b70..cd3ea477871 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -274,21 +274,6 @@ var _ = Describe("Assert generated customData and cseCmd", func() { }) Describe(".GetGeneratedLocalDNSCoreFile()", func() { - // Expect an error from GenerateLocalDNSCoreFile if template is invalid. - It("returns an error when template parsing fails", func() { - config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ - EnableLocalDNS: true, - CPULimitInMilliCores: to.Int32Ptr(2008), - MemoryLimitInMB: to.Int32Ptr(128), - VnetDNSOverrides: nil, - KubeDNSOverrides: nil, - } - invalidTemplate := "{{.InvalidField}}" - _, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, invalidTemplate) - Expect(err).ToNot(BeNil()) - Expect(err.Error()).To(ContainSubstring("failed to execute localdns corefile template")) - }) - // Expect no error and a non-empty corefile when LocalDNSOverrides are nil. 
It("handles nil LocalDNSOverrides", func() { config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ @@ -298,7 +283,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { VnetDNSOverrides: nil, KubeDNSOverrides: nil, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefileWithoutOverrides)) @@ -313,7 +298,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{}, KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{}, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefileWithoutOverrides)) @@ -370,7 +355,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { }, }, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) @@ -387,6 +372,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 
168.63.129.16 { policy sequential max_concurrent 1000 @@ -450,6 +439,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 2000 @@ -548,7 +541,7 @@ testdomain456.com:53 { }, }, } - localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, localDNSCoreFileTemplateString) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) @@ -565,6 +558,10 @@ health-check.localdns.local:53 { .:53 { log bind 169.254.10.10 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 168.63.129.16 { policy sequential max_concurrent 1000 @@ -628,6 +625,10 @@ testdomain456.com:53 { .:53 { errors bind 169.254.10.11 + # Check /etc/localdns/hosts first for critical AKS FQDNs (mcr.microsoft.com, packages.aks.azure.com, etc.) + hosts /etc/localdns/hosts { + fallthrough + } forward . 10.0.0.10 { policy sequential max_concurrent 1000 @@ -690,10 +691,134 @@ testdomain567.com:53 { ` Expect(localDNSCoreFile).To(ContainSubstring(expectedlocalDNSCorefile)) }) + + // Expect a valid corefile WITHOUT hosts plugin blocks when includeHostsPlugin=false. + // This is the fallback corefile used when enableAKSHostsSetup fails at provisioning time. 
+ It("generates a valid localdnsCorefile without hosts plugin when includeHostsPlugin is false", func() { + config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + CPULimitInMilliCores: to.Int32Ptr(2008), + MemoryLimitInMB: to.Int32Ptr(128), + VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Immediate", + }, + }, + KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Error", + Protocol: "PreferUDP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(2000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(72000), + ServeStale: "Verify", + }, + }, + } + // Generate with includeHostsPlugin=false (the no-hosts fallback) + localDNSCoreFile, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, false) + Expect(err).To(BeNil()) + Expect(localDNSCoreFile).ToNot(BeEmpty()) + + // The no-hosts corefile must NOT contain hosts plugin blocks + Expect(localDNSCoreFile).ToNot(ContainSubstring("hosts /etc/localdns/hosts")) + Expect(localDNSCoreFile).ToNot(ContainSubstring("# Check /etc/localdns/hosts")) + + // But it should still contain the standard corefile structure + Expect(localDNSCoreFile).To(ContainSubstring("health-check.localdns.local:53")) + Expect(localDNSCoreFile).To(ContainSubstring("bind 169.254.10.10")) + Expect(localDNSCoreFile).To(ContainSubstring("bind 169.254.10.11")) + Expect(localDNSCoreFile).To(ContainSubstring("forward . 
168.63.129.16")) + Expect(localDNSCoreFile).To(ContainSubstring("nsid localdns")) + Expect(localDNSCoreFile).To(ContainSubstring("nsid localdns-pod")) + }) + + // Verify that includeHostsPlugin=true produces hosts blocks and includeHostsPlugin=false does not, + // when using the same LocalDNSProfile configuration. + It("produces different output for includeHostsPlugin true vs false", func() { + config.AgentPoolProfile.LocalDNSProfile = &datamodel.LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + CPULimitInMilliCores: to.Int32Ptr(2008), + MemoryLimitInMB: to.Int32Ptr(128), + VnetDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Log", + Protocol: "PreferUDP", + ForwardDestination: "VnetDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Immediate", + }, + }, + KubeDNSOverrides: map[string]*datamodel.LocalDNSOverrides{ + ".": { + QueryLogging: "Error", + Protocol: "PreferUDP", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", + MaxConcurrent: to.Int32Ptr(1000), + CacheDurationInSeconds: to.Int32Ptr(3600), + ServeStaleDurationInSeconds: to.Int32Ptr(3600), + ServeStale: "Verify", + }, + }, + } + withHosts, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, true) + Expect(err).To(BeNil()) + withoutHosts, err := GenerateLocalDNSCoreFile(config, config.AgentPoolProfile, false) + Expect(err).To(BeNil()) + + // With hosts should have the hosts plugin block + Expect(withHosts).To(ContainSubstring("hosts /etc/localdns/hosts")) + // Without hosts should NOT have it + Expect(withoutHosts).ToNot(ContainSubstring("hosts /etc/localdns/hosts")) + // Both should still be valid corefiles + Expect(withHosts).To(ContainSubstring("health-check.localdns.local:53")) + Expect(withoutHosts).To(ContainSubstring("health-check.localdns.local:53")) + }) }) }) }) +func 
getDecodedVarsFromCseCmd(data []byte) (map[string]string, error) { + cseRegex := regexp.MustCompile(cseRegexString) + cseVariableList := cseRegex.FindAllStringSubmatch(string(data), -1) + vars := make(map[string]string) + + for _, cseVar := range cseVariableList { + if len(cseVar) < 3 { + return nil, fmt.Errorf("expected 3 results (match, key, value) from regex, found %d, result %q", len(cseVar), cseVar) + } + + key := cseVar[1] + val := getValueWithoutQuotes(cseVar[2]) + + vars[key] = val + } + + return vars, nil +} + +func getValueWithoutQuotes(value string) string { + if len(value) > 1 && value[0] == '"' && value[len(value)-1] == '"' { + return value[1 : len(value)-1] + } + return value +} + type tarEntry struct { path string *decodedValue @@ -729,32 +854,6 @@ func decodeTarFiles(data []byte) ([]tarEntry, error) { return files, nil } -func getDecodedVarsFromCseCmd(data []byte) (map[string]string, error) { - cseRegex := regexp.MustCompile(cseRegexString) - cseVariableList := cseRegex.FindAllStringSubmatch(string(data), -1) - vars := make(map[string]string) - - for _, cseVar := range cseVariableList { - if len(cseVar) < 3 { - return nil, fmt.Errorf("expected 3 results (match, key, value) from regex, found %d, result %q", len(cseVar), cseVar) - } - - key := cseVar[1] - val := getValueWithoutQuotes(cseVar[2]) - - vars[key] = val - } - - return vars, nil -} - -func getValueWithoutQuotes(value string) string { - if len(value) > 1 && value[0] == '"' && value[len(value)-1] == '"' { - return value[1 : len(value)-1] - } - return value -} - var _ = Describe("Test normalizeResourceGroupNameForLabel", func() { It("should return the correct normalized resource group name", func() { Expect(normalizeResourceGroupNameForLabel("hello")).To(Equal("hello")) diff --git a/pkg/agent/datamodel/types.go b/pkg/agent/datamodel/types.go index 4cb6812cfb6..0860c1f54ef 100644 --- a/pkg/agent/datamodel/types.go +++ b/pkg/agent/datamodel/types.go @@ -2460,6 +2460,7 @@ const ( // 
LocalDNSProfile represents localdns configuration for agentpool nodes. type LocalDNSProfile struct { EnableLocalDNS bool `json:"enableLocalDNS,omitempty"` + EnableHostsPlugin bool `json:"enableHostsPlugin,omitempty"` CPULimitInMilliCores *int32 `json:"cpuLimitInMilliCores,omitempty"` MemoryLimitInMB *int32 `json:"memoryLimitInMB,omitempty"` VnetDNSOverrides map[string]*LocalDNSOverrides `json:"vnetDNSOverrides,omitempty"` @@ -2468,10 +2469,11 @@ type LocalDNSProfile struct { type LocalDNSCoreFileData struct { LocalDNSProfile - NodeListenerIP string - ClusterListenerIP string - CoreDNSServiceIP string - AzureDNSIP string + NodeListenerIP string + ClusterListenerIP string + CoreDNSServiceIP string + AzureDNSIP string + IncludeHostsPlugin bool } // LocalDNSOverrides represents DNS override settings for both VnetDNS and KubeDNS traffic. @@ -2496,6 +2498,13 @@ func (a *AgentPoolProfile) ShouldEnableLocalDNS() bool { return a != nil && a.LocalDNSProfile != nil && a.LocalDNSProfile.EnableLocalDNS } +// ShouldEnableHostsPlugin returns true if LocalDNS is enabled and the hosts plugin +// is explicitly enabled. When true, the localdns Corefile will include a hosts plugin +// block that serves cached DNS entries from /etc/localdns/hosts for critical AKS FQDNs. +func (a *AgentPoolProfile) ShouldEnableHostsPlugin() bool { + return a.ShouldEnableLocalDNS() && a.LocalDNSProfile.EnableHostsPlugin +} + // GetLocalDNSNodeListenerIP returns APIPA-IP address that will be used in localdns systemd unit. 
func (a *AgentPoolProfile) GetLocalDNSNodeListenerIP() string { return LocalDNSNodeListenerIP diff --git a/pkg/agent/datamodel/types_test.go b/pkg/agent/datamodel/types_test.go index 1cfb888056b..a0605aabd47 100644 --- a/pkg/agent/datamodel/types_test.go +++ b/pkg/agent/datamodel/types_test.go @@ -3090,10 +3090,8 @@ func TestShouldEnableLocalDNS(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - actualData := false - if tt.agentPoolProfile != nil { - actualData = tt.agentPoolProfile.ShouldEnableLocalDNS() - } + actualData := tt.agentPoolProfile.ShouldEnableLocalDNS() + assert.Equal(t, tt.expectedData, actualData) }) } @@ -3391,4 +3389,73 @@ func TestGetLocalDNSCoreFileData(t *testing.T) { } } +func TestShouldEnableHostsPlugin(t *testing.T) { + tests := []struct { + name string + agentPoolProfile *AgentPoolProfile + expectedData bool + }{ + { + name: "ShouldEnableHostsPlugin - AgentPoolProfile nil", + agentPoolProfile: nil, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNSProfile nil", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: nil, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNS disabled, HostsPlugin enabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: false, + EnableHostsPlugin: true, + }, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - LocalDNS enabled, HostsPlugin disabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: false, + }, + }, + expectedData: false, + }, + { + name: "ShouldEnableHostsPlugin - both enabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + EnableLocalDNS: true, + EnableHostsPlugin: true, + }, + }, + expectedData: true, + }, + { + name: "ShouldEnableHostsPlugin - both disabled", + agentPoolProfile: &AgentPoolProfile{ + LocalDNSProfile: &LocalDNSProfile{ + 
EnableLocalDNS: false, + EnableHostsPlugin: false, + }, + }, + expectedData: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actualData := tt.agentPoolProfile.ShouldEnableHostsPlugin() + + assert.Equal(t, tt.expectedData, actualData) + }) + } +} + // ----------------------- End of changes related to localdns ------------------------------------------. diff --git a/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh b/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh new file mode 100644 index 00000000000..0115fde18d0 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/aks_hosts_setup_spec.sh @@ -0,0 +1,506 @@ +#shellcheck shell=bash +#shellcheck disable=SC2148 + +Describe 'aks-hosts-setup.sh' + SCRIPT_PATH="parts/linux/cloud-init/artifacts/aks-hosts-setup.sh" + + # Helper to build a test script that uses the real system nslookup. + # Overrides only HOSTS_FILE and TARGET_CLOUD, preserving everything else + # (cloud selection, resolution loop, atomic write) from the real script. + # Lines 1-9 of the real script are: shebang, set, blank, comments, and HOSTS_FILE=. + build_test_script() { + local test_dir="$1" + local hosts_file="$2" + local target_cloud="${3:-AzurePublicCloud}" + local test_script="${test_dir}/aks-hosts-setup-test.sh" + + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${hosts_file}" +export TARGET_CLOUD="${target_cloud}" +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + echo "${test_script}" + } + + # Helper to build a test script with a mock nslookup prepended to PATH. + # Used only for edge-case tests that need controlled DNS output + # (failure handling, invalid response filtering). 
+ build_mock_test_script() { + local test_dir="$1" + local hosts_file="$2" + local mock_bin_dir="$3" + local target_cloud="${4:-AzurePublicCloud}" + local test_script="${test_dir}/aks-hosts-setup-test.sh" + + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +export PATH="${mock_bin_dir}:\$PATH" +HOSTS_FILE="${hosts_file}" +export TARGET_CLOUD="${target_cloud}" +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + echo "${test_script}" + } + + # Creates a mock nslookup executable that simulates DNS failure (NXDOMAIN). + create_failure_mock() { + local mock_bin_dir="$1" + mkdir -p "${mock_bin_dir}" + cat > "${mock_bin_dir}/nslookup" << 'MOCK_EOF' +#!/usr/bin/env bash +echo "Server: 127.0.0.53" +echo "Address: 127.0.0.53#53" +echo "" +echo "** server can't find domain: NXDOMAIN" +MOCK_EOF + chmod +x "${mock_bin_dir}/nslookup" + } + + # ----------------------------------------------------------------------- + # Tests using real nslookup (no mocks) + # ----------------------------------------------------------------------- + + Describe 'DNS resolution and hosts file creation (AzurePublicCloud)' + setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'creates hosts file with resolved addresses for all critical FQDNs' + When run command bash "${TEST_SCRIPT}" + The status should be success + The file "$HOSTS_FILE" should be exist + The output should include "Starting AKS critical FQDN hosts resolution" + The output should include "AKS critical FQDN hosts resolution completed" + End + + It 'detects AzurePublicCloud environment' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzurePublicCloud" + End + + It 'resolves all public cloud FQDNs' 
+ When run command bash "${TEST_SCRIPT}" + The status should be success + # Verify the script attempts to resolve all expected public cloud FQDNs + The output should include "Resolving addresses for mcr.microsoft.com" + The output should include "Resolving addresses for packages.microsoft.com" + The output should include "Resolving addresses for management.azure.com" + The output should include "Resolving addresses for login.microsoftonline.com" + The output should include "Resolving addresses for acs-mirror.azureedge.net" + The output should include "Resolving addresses for packages.aks.azure.com" + # Verify hosts file contains real resolved entries + The contents of file "$HOSTS_FILE" should include "mcr.microsoft.com" + The contents of file "$HOSTS_FILE" should include "packages.microsoft.com" + End + + It 'writes valid hosts file format' + When run command bash "${TEST_SCRIPT}" + The status should be success + The file "$HOSTS_FILE" should be exist + The output should include "Writing addresses" + End + + It 'includes header comments in hosts file' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "AKS critical FQDN hosts resolution" + The contents of file "$HOSTS_FILE" should include "# AKS critical FQDN addresses resolved at" + The contents of file "$HOSTS_FILE" should include "# This file is automatically generated by aks-hosts-setup.service" + End + End + + Describe 'Cloud-specific FQDN selection' + # These tests use real nslookup. Sovereign cloud domains may not resolve + # from CI, so we assert on which FQDNs the script *attempts* to resolve + # (visible in stdout) rather than checking hosts file contents. 
+ setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'selects AzureChinaCloud FQDNs' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureChinaCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzureChinaCloud" + # Should resolve China-specific endpoints + The output should include "Resolving addresses for mcr.azure.cn" + The output should include "Resolving addresses for mcr.azk8s.cn" + The output should include "Resolving addresses for login.partner.microsoftonline.cn" + The output should include "Resolving addresses for management.chinacloudapi.cn" + The output should include "Resolving addresses for packages.microsoft.com" + # Should NOT attempt public cloud endpoints + The output should not include "Resolving addresses for login.microsoftonline.com" + The output should not include "Resolving addresses for management.azure.com" + End + + It 'selects AzureUSGovernmentCloud FQDNs' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureUSGovernmentCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Detected cloud environment: AzureUSGovernmentCloud" + The output should include "Resolving addresses for mcr.microsoft.com" + The output should include "Resolving addresses for login.microsoftonline.us" + The output should include "Resolving addresses for management.usgovcloudapi.net" + The output should include "Resolving addresses for packages.aks.azure.com" + The output should not include "Resolving addresses for login.microsoftonline.com" + The output should not include "Resolving addresses for management.azure.com" + End + + It 'exits with error for unknown cloud values' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "SomeUnknownCloud") + When run 
command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: SomeUnknownCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should not include "Cannot determine which FQDNs to resolve for hosts file" + The output should not include "Exiting without modifying hosts file" + End + + It 'exits with error for USNatCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "USNatCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: USNatCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for USSecCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "USSecCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: USSecCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureStackCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureStackCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureStackCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureGermanCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureGermanCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureGermanCloud" + The 
output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureGermanyCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureGermanyCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureGermanyCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'exits with error for AzureBleuCloud (no longer supported)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzureBleuCloud") + When run command bash "${TEST_SCRIPT}" + The status should be failure + The output should include "ERROR: The following cloud is not supported: AzureBleuCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + End + + It 'fails when TARGET_CLOUD is unset' + local test_script="${TEST_DIR}/aks-hosts-setup-test-nocloud.sh" + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${HOSTS_FILE}" +unset TARGET_CLOUD +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + + When run command bash "${test_script}" + The status should be failure + The output should include "ERROR: TARGET_CLOUD is not set" + The output should include "Cannot determine which FQDNs to resolve" + The output should include "Exiting without modifying hosts file" + End + + It 'fails when TARGET_CLOUD is empty string' + local test_script="${TEST_DIR}/aks-hosts-setup-test-empty.sh" + cat > "${test_script}" << EOF +#!/usr/bin/env bash +set -uo pipefail +HOSTS_FILE="${HOSTS_FILE}" +export TARGET_CLOUD="" +EOF + tail -n +10 "${SCRIPT_PATH}" >> "${test_script}" + chmod +x "${test_script}" + + When run command bash "${test_script}" + The status should be failure + The output should include "ERROR: TARGET_CLOUD is not 
set" + The output should include "Cannot determine which FQDNs to resolve" + End + + It 'includes packages.microsoft.com for all clouds (common FQDN)' + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Resolving addresses for packages.microsoft.com" + End + End + + Describe 'Atomic file write' + setup() { + TEST_DIR=$(mktemp -d) + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + TEST_SCRIPT=$(build_test_script "${TEST_DIR}" "${HOSTS_FILE}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'does not leave a temp file behind after successful write' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "AKS critical FQDN hosts resolution" + The file "$HOSTS_FILE" should be exist + End + + It 'verifies no leftover temp files exist' + bash "${TEST_SCRIPT}" >/dev/null 2>&1 + # The temp file (hosts.testing.tmp.) should have been renamed away + When run command find "${TEST_DIR}" -name 'hosts.testing.tmp.*' + The output should equal "" + End + + It 'sets correct permissions on the hosts file' + bash "${TEST_SCRIPT}" >/dev/null 2>&1 + When run command stat -c '%a' "${HOSTS_FILE}" + The output should equal "644" + End + End + + # ----------------------------------------------------------------------- + # Mock-based tests below + # These require controlled nslookup output to verify error handling + # and response filtering logic that cannot be triggered with real DNS. 
+ # ----------------------------------------------------------------------- + + Describe 'DNS resolution failure handling (mock)' + setup() { + TEST_DIR=$(mktemp -d) + MOCK_BIN="${TEST_DIR}/mock_bin" + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + create_failure_mock "${MOCK_BIN}" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'exits gracefully when no DNS records are resolved' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The output should include "This is likely a temporary DNS issue" + End + + It 'does not create hosts file when no DNS records are resolved' + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'preserves existing hosts file when no DNS records are resolved' + echo "# old hosts content" > "${HOSTS_FILE}" + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + # Original hosts file should still be intact + The contents of file "$HOSTS_FILE" should include "# old hosts content" + End + End + + Describe 'Invalid DNS response filtering (mock)' + setup() { + TEST_DIR=$(mktemp -d) + MOCK_BIN="${TEST_DIR}/mock_bin" + mkdir -p "${MOCK_BIN}" + export HOSTS_FILE="${TEST_DIR}/hosts.testing" + } + + cleanup() { + rm -rf "$TEST_DIR" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'filters out NXDOMAIN responses from hosts file' + create_failure_mock "${MOCK_BIN}" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The 
output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'filters out SERVFAIL responses from hosts file' + cat > "${MOCK_BIN}/nslookup" << 'MOCK_EOF' +#!/usr/bin/env bash +echo "Server: 127.0.0.53" +echo "Address: 127.0.0.53#53" +echo "" +echo "** server can't find domain: SERVFAIL" +MOCK_EOF + chmod +x "${MOCK_BIN}/nslookup" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "WARNING: No IP addresses resolved for any domain" + The file "$HOSTS_FILE" should not be exist + End + + It 'does not write non-IP strings to hosts file' + cat > "${MOCK_BIN}/nslookup" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "-type=A" ]]; then + record_type="A" + elif [[ "$arg" == "-type=AAAA" ]]; then + record_type="AAAA" + fi +done + +echo "Server: 127.0.0.53" +echo "Address: 127.0.0.53#53" +echo "" +if [[ "$record_type" == "A" ]]; then + echo "Address: 1.2.3.4" + echo "Address: not-an-ip" + echo "Address: NXDOMAIN" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/nslookup" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "1.2.3.4" + The contents of file "$HOSTS_FILE" should not include "not-an-ip" + The contents of file "$HOSTS_FILE" should not include "NXDOMAIN" + End + + It 'does not write invalid IPv6 strings to hosts file' + cat > "${MOCK_BIN}/nslookup" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "-type=A" ]]; then + record_type="A" + elif [[ "$arg" == "-type=AAAA" ]]; then + record_type="AAAA" + fi +done + +echo 
"Server: 127.0.0.53" +echo "Address: 127.0.0.53#53" +echo "" +if [[ "$record_type" == "AAAA" ]]; then + echo "Address: 2001:db8::1" + echo "Address: not-an-ipv6" + echo "Address: SERVFAIL" + echo "Address: fe80::1" + echo "Address: 1:2" + echo "Address: :ff" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/nslookup" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "2001:db8::1" + The contents of file "$HOSTS_FILE" should include "fe80::1" + The contents of file "$HOSTS_FILE" should not include "not-an-ipv6" + The contents of file "$HOSTS_FILE" should not include "SERVFAIL" + # Tightened IPv6 validation rejects too-short strings with fewer than 2 colons + The contents of file "$HOSTS_FILE" should not include "1:2" + The contents of file "$HOSTS_FILE" should not include ":ff" + End + + It 'rejects IPv4 addresses with out-of-range octets' + cat > "${MOCK_BIN}/nslookup" << 'MOCK_EOF' +#!/usr/bin/env bash +record_type="" +for arg in "$@"; do + if [[ "$arg" == "-type=A" ]]; then + record_type="A" + elif [[ "$arg" == "-type=AAAA" ]]; then + record_type="AAAA" + fi +done + +echo "Server: 127.0.0.53" +echo "Address: 127.0.0.53#53" +echo "" +if [[ "$record_type" == "A" ]]; then + echo "Address: 10.0.0.1" + echo "Address: 999.999.999.999" + echo "Address: 256.1.1.1" + echo "Address: 1.2.3.400" + echo "Address: 255.255.255.255" +fi +MOCK_EOF + chmod +x "${MOCK_BIN}/nslookup" + TEST_SCRIPT=$(build_mock_test_script "${TEST_DIR}" "${HOSTS_FILE}" "${MOCK_BIN}" "AzurePublicCloud") + + When run command bash "${TEST_SCRIPT}" + The status should be success + The output should include "Writing addresses" + The file "$HOSTS_FILE" should be exist + The contents of file "$HOSTS_FILE" should include "10.0.0.1" + The contents of file 
"$HOSTS_FILE" should include "255.255.255.255" + The contents of file "$HOSTS_FILE" should not include "999.999.999.999" + The contents of file "$HOSTS_FILE" should not include "256.1.1.1" + The contents of file "$HOSTS_FILE" should not include "1.2.3.400" + End + End +End diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 3f935f17ba3..b6f8159e916 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Helper functions for tests +check_file_permissions() { + # Use printf to ensure leading zero (0644 format) + printf "0%s" "$(stat -c "%a" "$LOCALDNS_ENV_FILE")" +} + Describe 'cse_config.sh' Include "./parts/linux/cloud-init/artifacts/cse_config.sh" Include "./parts/linux/cloud-init/artifacts/cse_helpers.sh" @@ -787,6 +793,11 @@ providers: setup() { TMP_DIR=$(mktemp -d) LOCALDNS_CORE_FILE="$TMP_DIR/localdns.corefile" + # Create mock localdns assets that would be present on VHD + mkdir -p /etc/systemd/system + mkdir -p /opt/azure/containers/localdns + touch /etc/systemd/system/localdns.service + touch /opt/azure/containers/localdns/localdns.sh systemctlEnableAndStart() { echo "systemctlEnableAndStart $@" @@ -795,11 +806,14 @@ providers: } cleanup() { rm -rf "$TMP_DIR" + # Clean up mock VHD assets + rm -f /etc/systemd/system/localdns.service + rm -f /opt/azure/containers/localdns/localdns.sh } BeforeEach 'setup' AfterEach 'cleanup' - It 'should enable localdns successfully' + It 'should enable localdns successfully when VHD has required assets' echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" When run enableLocalDNS The status should be success @@ -807,6 +821,24 @@ providers: The output should include "Enable localdns succeeded." 
End + It 'should skip localdns when localdns.service is missing on old VHD' + rm -f /etc/systemd/system/localdns.service + echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" + When run enableLocalDNS + The status should be success + The output should include "Warning: localdns.service not found on this VHD, skipping localdns setup" + The output should not include "localdns should be enabled." + End + + It 'should skip localdns when localdns.sh is missing on old VHD' + rm -f /opt/azure/containers/localdns/localdns.sh + echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" + When run enableLocalDNS + The status should be success + The output should include "Warning: localdns.sh not found on this VHD, skipping localdns setup" + The output should not include "localdns should be enabled." + End + It 'should return error when systemctl fails to start localdns' echo 'localdns corefile' > "$LOCALDNS_CORE_FILE" systemctlEnableAndStart() { @@ -819,7 +851,7 @@ providers: End End - Describe 'shouldEnableLocalDns' + Describe 'enableLocalDNSForScriptless' setup() { TMP_DIR=$(mktemp -d) LOCALDNS_CORE_FILE="$TMP_DIR/localdns.corefile" @@ -827,6 +859,11 @@ providers: LOCALDNS_GENERATED_COREFILE=$(echo "bG9jYWxkbnMgY29yZWZpbGU=") # "localdns corefile" base64 LOCALDNS_MEMORY_LIMIT="512M" LOCALDNS_CPU_LIMIT="250%" + # Create mock localdns assets that would be present on VHD + mkdir -p /etc/systemd/system + mkdir -p /opt/azure/containers/localdns + touch /etc/systemd/system/localdns.service + touch /opt/azure/containers/localdns/localdns.sh systemctlEnableAndStart() { echo "systemctlEnableAndStart $@" @@ -835,6 +872,9 @@ providers: } cleanup() { rm -rf "$TMP_DIR" + # Clean up mock VHD assets + rm -f /etc/systemd/system/localdns.service + rm -f /opt/azure/containers/localdns/localdns.sh } BeforeEach 'setup' AfterEach 'cleanup' @@ -880,6 +920,241 @@ providers: The output should include "localdns should be enabled." The output should include "Enable localdns succeeded." 
End + + # Environment file creation with both corefile variants. + It 'should create environment file with all corefile variants for dynamic selection' + # Set up both corefile variants + LOCALDNS_GENERATED_COREFILE=$(echo -n "corefile with hosts plugin" | base64) + LOCALDNS_GENERATED_COREFILE_NO_HOSTS=$(echo -n "corefile without hosts plugin" | base64) + SHOULD_ENABLE_HOSTS_PLUGIN="true" + LOCALDNS_ENV_FILE="$TMP_DIR/environment" + + When call enableLocalDNS + The status should be success + The stdout should include "enableLocalDNS called, generating corefile..." + The stdout should include "localdns should be enabled." + The stdout should include "Enable localdns succeeded." + The path "$LOCALDNS_ENV_FILE" should be file + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_BASE64_ENCODED_COREFILE=" + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS=${LOCALDNS_GENERATED_COREFILE}" + The contents of file "$LOCALDNS_ENV_FILE" should include "LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS=${LOCALDNS_GENERATED_COREFILE_NO_HOSTS}" + The contents of file "$LOCALDNS_ENV_FILE" should include "SHOULD_ENABLE_HOSTS_PLUGIN=true" + End + + # Environment file permissions. 
+ It 'should set correct permissions on environment file' + LOCALDNS_ENV_FILE="$TMP_DIR/environment" + When call enableLocalDNS + The status should be success + The path "$LOCALDNS_ENV_FILE" should be file + # Check permissions are 0644 (owner read/write, group read, others read) + The result of function check_file_permissions should equal "0644" + End + End + + Describe 'enableAKSHostsSetup' + setup() { + # Create temporary test directories and files + TEST_TEMP_DIR=$(mktemp -d) + AKS_HOSTS_FILE="${TEST_TEMP_DIR}/hosts" + AKS_HOSTS_SETUP_SCRIPT="${TEST_TEMP_DIR}/aks-hosts-setup.sh" + AKS_HOSTS_SETUP_SERVICE="${TEST_TEMP_DIR}/aks-hosts-setup.service" + AKS_HOSTS_SETUP_TIMER="${TEST_TEMP_DIR}/aks-hosts-setup.timer" + AKS_CLOUD_ENV_FILE="${TEST_TEMP_DIR}/cloud-env" + + # Create fake script that simulates successful hosts file creation + cat > "$AKS_HOSTS_SETUP_SCRIPT" << 'SETUP_EOF' +#!/bin/bash +echo "# test hosts file" > "${AKS_HOSTS_FILE}" +SETUP_EOF + chmod +x "$AKS_HOSTS_SETUP_SCRIPT" + + # Create dummy service and timer files + touch "$AKS_HOSTS_SETUP_SERVICE" + touch "$AKS_HOSTS_SETUP_TIMER" + + # Set up test environment + TARGET_CLOUD="AzurePublicCloud" + + # Mock systemctl function + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 0 + } + + # Export variables so the real function can use them + export AKS_HOSTS_FILE AKS_HOSTS_SETUP_SCRIPT AKS_HOSTS_SETUP_SERVICE + export AKS_HOSTS_SETUP_TIMER AKS_CLOUD_ENV_FILE TARGET_CLOUD + } + + cleanup() { + rm -rf "$TEST_TEMP_DIR" + unset AKS_HOSTS_FILE AKS_HOSTS_SETUP_SCRIPT AKS_HOSTS_SETUP_SERVICE + unset AKS_HOSTS_SETUP_TIMER AKS_CLOUD_ENV_FILE TARGET_CLOUD + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + It 'should enable aks-hosts-setup timer successfully' + When call enableAKSHostsSetup + The status should be success + The output should include "Enabling aks-hosts-setup timer..." 
+ The output should include "systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30" + The output should include "aks-hosts-setup timer enabled successfully." + End + + It 'should call systemctlEnableAndStartNoBlock with correct parameters' + When call enableAKSHostsSetup + The status should be success + The output should include "systemctlEnableAndStartNoBlock aks-hosts-setup.timer 30" + End + + It 'should skip when setup script is missing' + rm -f "$AKS_HOSTS_SETUP_SCRIPT" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should skip when timer unit is missing' + rm -f "$AKS_HOSTS_SETUP_TIMER" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should print warning when systemctlEnableAndStartNoBlock fails' + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 1 + } + When call enableAKSHostsSetup + The status should be success + The output should include "Enabling aks-hosts-setup timer..." + The output should include "Warning: Failed to enable aks-hosts-setup timer" + The output should not include "aks-hosts-setup timer enabled successfully." 
+ End + + It 'should skip when service unit is missing' + rm -f "$AKS_HOSTS_SETUP_SERVICE" + When call enableAKSHostsSetup + The status should be success + The output should include "not found on this VHD, skipping aks-hosts-setup" + End + + It 'should skip when setup script is not executable' + chmod -x "$AKS_HOSTS_SETUP_SCRIPT" + When call enableAKSHostsSetup + The status should be success + The output should include "is not executable, skipping aks-hosts-setup" + End + + It 'should create cloud-env file with TARGET_CLOUD value' + TARGET_CLOUD="AzurePublicCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The file "$AKS_CLOUD_ENV_FILE" should be exist + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzurePublicCloud" + End + + It 'should write correct cloud-env for AzureChinaCloud' + TARGET_CLOUD="AzureChinaCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzureChinaCloud" + End + + It 'should write correct cloud-env for AzureUSGovernmentCloud' + TARGET_CLOUD="AzureUSGovernmentCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." + The contents of file "$AKS_CLOUD_ENV_FILE" should equal "TARGET_CLOUD=AzureUSGovernmentCloud" + End + + It 'should set 0644 permissions on cloud-env file' + When call enableAKSHostsSetup + The status should be success + The output should include "aks-hosts-setup timer enabled successfully." 
+ The file "$AKS_CLOUD_ENV_FILE" should be exist + End + + It 'should skip when TARGET_CLOUD is unset' + unset TARGET_CLOUD + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: TARGET_CLOUD is not set" + The output should include "Cannot run aks-hosts-setup without knowing cloud environment" + The output should include "Skipping aks-hosts-setup" + End + + It 'should skip when TARGET_CLOUD is empty string' + TARGET_CLOUD="" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: TARGET_CLOUD is not set" + The output should include "Skipping aks-hosts-setup" + End + + It 'should skip when TARGET_CLOUD is unsupported (USNatCloud)' + TARGET_CLOUD="USNatCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: USNatCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (USSecCloud)' + TARGET_CLOUD="USSecCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: USSecCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (AzureStackCloud)' + TARGET_CLOUD="AzureStackCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: AzureStackCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, 
AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (AzureGermanCloud)' + TARGET_CLOUD="AzureGermanCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: AzureGermanCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should skip when TARGET_CLOUD is unsupported (unknown cloud)' + TARGET_CLOUD="SomeRandomCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "WARNING: The following cloud is not supported by aks-hosts-setup: SomeRandomCloud" + The output should include "Supported clouds: AzurePublicCloud, AzureChinaCloud, AzureUSGovernmentCloud" + The output should include "Skipping aks-hosts-setup" + The file "$AKS_CLOUD_ENV_FILE" should not be exist + End + + It 'should log TARGET_CLOUD value when set' + TARGET_CLOUD="AzurePublicCloud" + When call enableAKSHostsSetup + The status should be success + The output should include "Setting TARGET_CLOUD=AzurePublicCloud for aks-hosts-setup" + End End Describe 'configureAndStartSecureTLSBootstrapping' diff --git a/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh new file mode 100644 index 00000000000..051541ce5ac --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/cse_main_spec.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env shellspec + +# Unit tests for cse_main.sh helper functions +# Tests the select_localdns_corefile() function for localdns corefile selection logic +# Note: select_localdns_corefile() is now defined in localdns.sh for dynamic selection on restart + +Describe 'cse_main.sh corefile 
selection' + LOCALDNS_PATH="parts/linux/cloud-init/artifacts/localdns.sh" + + # Mock base64-encoded corefiles for testing + COREFILE_WITH_HOSTS="aG9zdHMgL2V0Yy9sb2NhbGRucy9ob3N0cw==" # "hosts /etc/localdns/hosts" + COREFILE_NO_HOSTS="bm8gaG9zdHMgcGx1Z2lu" # "no hosts plugin" + + setup() { + # Source localdns.sh to get select_localdns_corefile function + # We set __SOURCED__=1 to only source the functions, not run main execution + # shellcheck disable=SC1090 + __SOURCED__=1 . "${LOCALDNS_PATH}" + + # Create temp directory for test files + TEST_DIR=$(mktemp -d) + HOSTS_FILE="${TEST_DIR}/hosts" + } + + cleanup() { + rm -rf "${TEST_DIR}" + } + + BeforeEach 'setup' + AfterEach 'cleanup' + + Describe 'select_localdns_corefile()' + Context 'when hosts plugin is enabled (SHOULD_ENABLE_HOSTS_PLUGIN=true)' + It 'returns corefile WITH hosts plugin when hosts file exists with valid IP mappings' + # Create hosts file with valid IP mappings + echo "10.0.0.1 mcr.microsoft.com" > "${HOSTS_FILE}" + echo "192.168.1.1 login.microsoftonline.com" >> "${HOSTS_FILE}" + + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_WITH_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is enabled" + The stderr should include "checking ${HOSTS_FILE} for content" + The stderr should include "using corefile with hosts plugin" + End + + It 'returns corefile WITHOUT hosts plugin when hosts file exists but has no IP mappings' + # Create empty hosts file + touch "${HOSTS_FILE}" + + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "exists but has no IP mappings" + The stderr should include "falling back to corefile without hosts plugin" + End + + It 'returns corefile WITHOUT hosts plugin when hosts file exists with 
only comments' + # Create hosts file with only comments (no valid IP mappings) + echo "# This is a comment" > "${HOSTS_FILE}" + echo "# Another comment line" >> "${HOSTS_FILE}" + + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "exists but has no IP mappings" + End + + It 'returns corefile WITHOUT hosts plugin when hosts file does not exist' + # Don't create hosts file + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "does not exist" + The stderr should include "falling back to corefile without hosts plugin" + End + + It 'handles IPv6 addresses in hosts file' + # Create hosts file with IPv6 addresses + echo "2001:db8::1 mcr.microsoft.com" > "${HOSTS_FILE}" + echo "fe80::1 login.microsoftonline.com" >> "${HOSTS_FILE}" + + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_WITH_HOSTS}" + The status should be success + The stderr should include "using corefile with hosts plugin" + End + End + + Context 'when hosts plugin is disabled' + It 'returns corefile WITHOUT hosts plugin when SHOULD_ENABLE_HOSTS_PLUGIN=false' + # Create hosts file with valid IP mappings (should be ignored) + echo "10.0.0.1 mcr.microsoft.com" > "${HOSTS_FILE}" + + When call select_localdns_corefile "false" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is not enabled" + The stderr should include "using corefile without hosts plugin" + End + + It 'returns corefile WITHOUT hosts plugin when SHOULD_ENABLE_HOSTS_PLUGIN is empty' + # 
Create hosts file with valid IP mappings (should be ignored) + echo "10.0.0.1 mcr.microsoft.com" > "${HOSTS_FILE}" + + When call select_localdns_corefile "" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is not enabled" + End + + It 'returns corefile WITHOUT hosts plugin when SHOULD_ENABLE_HOSTS_PLUGIN is any value other than "true"' + # Create hosts file with valid IP mappings (should be ignored) + echo "10.0.0.1 mcr.microsoft.com" > "${HOSTS_FILE}" + + When call select_localdns_corefile "yes" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "Hosts plugin is not enabled" + End + End + + Context 'unknown cloud scenario (no hosts file created by aks-hosts-setup.sh)' + It 'returns corefile WITHOUT hosts plugin when hosts plugin enabled but file does not exist (unknown cloud)' + # Simulate unknown cloud: SHOULD_ENABLE_HOSTS_PLUGIN=true but aks-hosts-setup.sh + # exited before creating the file + + When call select_localdns_corefile "true" "${COREFILE_WITH_HOSTS}" "${COREFILE_NO_HOSTS}" "${HOSTS_FILE}" 0 + The output should equal "${COREFILE_NO_HOSTS}" + The status should be success + The stderr should include "does not exist" + The stderr should include "falling back to corefile without hosts plugin" + End + End + End +End diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index 95a5c555364..c6a060455e4 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -78,12 +78,14 @@ EOF The path "$LOCALDNS_CORE_FILE" should be file End - It 'should fail to regenerate when LOCALDNS_BASE64_ENCODED_COREFILE is not set' + It 'should fail to regenerate when no corefile 
variants are available' rm -f "$LOCALDNS_CORE_FILE" unset LOCALDNS_BASE64_ENCODED_COREFILE + unset LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS + unset LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS When run regenerate_localdns_corefile The status should be failure - The stdout should include "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + The stdout should include "No corefile variants available in environment. Cannot regenerate corefile." End It 'should set correct permissions on regenerated corefile' @@ -123,12 +125,15 @@ EOF End It 'should return failure if localdns corefile does not exist and regeneration fails' - rm -r "$LOCALDNS_CORE_FILE" + rm -f "$LOCALDNS_CORE_FILE" + unset LOCALDNS_BASE64_ENCODED_COREFILE + unset LOCALDNS_BASE64_ENCODED_COREFILE_WITH_HOSTS + unset LOCALDNS_BASE64_ENCODED_COREFILE_NO_HOSTS When run verify_localdns_corefile The status should be failure The stdout should include "Localdns corefile either does not exist or is empty at $LOCALDNS_CORE_FILE." The stdout should include "Attempting to regenerate localdns corefile..." - The stdout should include "LOCALDNS_BASE64_ENCODED_COREFILE is not set. Cannot regenerate corefile." + The stdout should include "No corefile variants available in environment. Cannot regenerate corefile." End It 'should return failure if localdns corefile is empty and regeneration fails' @@ -1261,4 +1266,361 @@ EOF The stdout should include "DNS configuration refreshed successfully" End End + + +# This section tests - annotate_node_with_hosts_plugin_status +# This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file. 
+#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'annotate_node_with_hosts_plugin_status' + setup() { + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + TEST_DIR="/tmp/localdnstest-$$" + KUBECONFIG="${TEST_DIR}/var/lib/kubelet/kubeconfig" + UPDATED_LOCALDNS_CORE_FILE="${TEST_DIR}/opt/azure/containers/localdns/updated.localdns.corefile" + LOCALDNS_HOSTS_FILE="${TEST_DIR}/etc/localdns/hosts" + + # Create test directories + mkdir -p "$(dirname "$KUBECONFIG")" + mkdir -p "$(dirname "$UPDATED_LOCALDNS_CORE_FILE")" + mkdir -p "$(dirname "$LOCALDNS_HOSTS_FILE")" + + # Mock hostname command + hostname() { + echo "TestNode123" + } + } + cleanup() { + rm -rf "$TEST_DIR" + # Clean up mock kubectl script to prevent state leaking across specs + rm -f /opt/bin/kubectl + # Remove /opt/bin if it is left empty (best-effort; does not check who created it) + if [ -d /opt/bin ] && [ -z "$(ls -A /opt/bin 2>/dev/null)" ]; then + rmdir /opt/bin 2>/dev/null || true + fi + } + BeforeEach 'setup' + AfterEach 'cleanup' + + #------------------------- annotate_node_with_hosts_plugin_status ---------------------------------------------- + It 'should skip annotation if corefile does not exist' + rm -f "$UPDATED_LOCALDNS_CORE_FILE" + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns corefile not found" + The stdout should include "skipping annotation." + End + + It 'should skip annotation if corefile does not contain hosts plugin block' + # Create corefile without hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + forward . 168.63.129.16 +} +EOF + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns corefile does not contain hosts plugin block, skipping annotation." 
+ End + + It 'should skip annotation if hosts file does not exist' + # Create corefile with hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + rm -f "$LOCALDNS_HOSTS_FILE" + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Hosts file does not exist" + The stdout should include "skipping annotation despite corefile having hosts plugin." + End + + It 'should skip annotation if hosts file has no IP mappings' + # Create corefile with hosts plugin + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + # Create empty hosts file + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +# Empty hosts file +EOF + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Hosts file exists but has no IP mappings, skipping annotation." + End + + It 'should skip annotation if kubectl binary is not found' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +10.0.0.2 packages.aks.azure.com +EOF + + command() { + if [[ "$1" == "-v" && "$2" == "/opt/bin/kubectl" ]]; then + return 1 + fi + } + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "kubectl binary not found at /opt/bin/kubectl, skipping annotation." + End + + It 'should timeout and skip annotation if kubeconfig does not exist after waiting' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + + # Create mock kubectl binary that is executable + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +echo "mock kubectl" +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + rm -f "$KUBECONFIG" + # Use short timeout for testing (2 attempts = 6 seconds) + KUBECONFIG_WAIT_ATTEMPTS=2 + When run annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for TLS bootstrapping to complete" + The stdout should include "Timeout waiting for kubeconfig" + End + + It 'should set annotation successfully when using corefile with hosts plugin' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +# AKS critical FQDN addresses +10.0.0.1 mcr.microsoft.com +10.0.0.2 packages.aks.azure.com +10.0.0.3 management.azure.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl in /opt/bin (must exist in container filesystem) + # First verify we can write to /opt + if [ ! 
-d /opt ]; then + Skip "Cannot create /opt/bin/kubectl - /opt directory does not exist or is not writable" + fi + + mkdir -p /opt/bin || Skip "Cannot create /opt/bin directory" + + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" && "$4" == "--overwrite" && "$5" == "node" ]]; then + echo "node/testnode123 annotated" + exit 0 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl || Skip "Cannot make /opt/bin/kubectl executable" + + # Verify the mock was created + [ -x /opt/bin/kubectl ] || Skip "Mock kubectl was not created successfully" + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Localdns is using hosts plugin and hosts file has 3 entries." + The stdout should include "Setting annotation to indicate hosts plugin is in use for node testnode123." + The stdout should include "Successfully set hosts plugin annotation." + End + + It 'should handle kubectl annotation failure gracefully (non-fatal)' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that fails annotation + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" ]]; then + echo "Error: failed to annotate node" >&2 + exit 1 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Setting annotation to indicate hosts plugin is in use for node testnode123." 
+ The stdout should include "Warning: Failed to set hosts plugin annotation (this is non-fatal)." + The stderr should include "Error: failed to annotate node" + End + + It 'should convert hostname to lowercase for node name' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that verifies lowercase node name + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + exit 0 +elif [[ "$1" == "--kubeconfig" && "$3" == "annotate" && "$4" == "--overwrite" && "$5" == "node" && "$6" == "testnode123" ]]; then + echo "node/testnode123 annotated (lowercase verified)" + exit 0 +else + echo "Error: Expected lowercase node name 'testnode123' but got '$6'" >&2 + exit 1 +fi +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Successfully set hosts plugin annotation." + End + + It 'should wait for node to be registered before annotating' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl binary that simulates node not registered initially + # Create a counter file to track attempts + ATTEMPT_FILE="${TEST_DIR}/attempt_count" + echo "0" > "$ATTEMPT_FILE" + + mkdir -p /opt/bin + cat > /opt/bin/kubectl < "\$ATTEMPT_FILE" + +# Simulate node not ready for first 2 attempts +if [[ "\$1" == "--kubeconfig" && "\$3" == "get" && "\$4" == "node" && \$count -le 2 ]]; then + echo "Error from server (NotFound): nodes \"testnode123\" not found" >&2 + exit 1 +elif [[ "\$1" == "--kubeconfig" && "\$3" == "get" && "\$4" == "node" ]]; then + # Node is now registered + exit 0 +elif [[ "\$1" == "--kubeconfig" && "\$3" == "annotate" ]]; then + echo "node/testnode123 annotated" + exit 0 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + # Use short timeout for testing + NODE_REGISTRATION_WAIT_ATTEMPTS=5 + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for node testnode123 to be registered in the cluster" + The stdout should include "Node testnode123 is registered in the cluster" + The stdout should include "Successfully set hosts plugin annotation" + End + + It 'should timeout and skip annotation if node never registers' + # Create valid corefile and hosts file + cat > "$UPDATED_LOCALDNS_CORE_FILE" <<'EOF' +.:53 { + hosts /etc/localdns/hosts { + fallthrough + } + forward . 
168.63.129.16 +} +EOF + cat > "$LOCALDNS_HOSTS_FILE" <<'EOF' +10.0.0.1 mcr.microsoft.com +EOF + touch "$KUBECONFIG" + + # Create mock kubectl that always fails to find node + mkdir -p /opt/bin + cat > /opt/bin/kubectl <<'KUBECTL_EOF' +#!/bin/bash +if [[ "$1" == "--kubeconfig" && "$3" == "get" && "$4" == "node" ]]; then + echo "Error from server (NotFound): nodes \"testnode123\" not found" >&2 + exit 1 +fi +exit 1 +KUBECTL_EOF + chmod +x /opt/bin/kubectl + + # Use very short timeout for testing + NODE_REGISTRATION_WAIT_ATTEMPTS=2 + + When call annotate_node_with_hosts_plugin_status + The status should be success + The stdout should include "Waiting for node registration" + The stdout should include "Timeout waiting for node testnode123 to be registered" + End + End End diff --git a/spec/shellspec.Dockerfile b/spec/shellspec.Dockerfile index db8a68f7ebe..a8c98177361 100644 --- a/spec/shellspec.Dockerfile +++ b/spec/shellspec.Dockerfile @@ -4,7 +4,7 @@ FROM aksdataplanedev.azurecr.io/shellspec/shellspec-debian:0.28.1 RUN sed -i -e 's/\(deb\|security\).debian.org/archive.debian.org/g' /etc/apt/sources.list && \ apt-get update && \ - apt-get install -y --no-install-recommends gawk jq curl && \ + apt-get install -y --no-install-recommends gawk jq curl dnsutils && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY ./ /src diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index c960d797a5c..7fe6075adb1 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -301,6 +301,18 @@ copyPackerFiles() { LOCALDNS_SERVICE_DELEGATE_SRC=/home/packer/localdns-delegate.conf LOCALDNS_SERVICE_DELEGATE_DEST=/etc/systemd/system/localdns.service.d/delegate.conf cpAndMode $LOCALDNS_SERVICE_DELEGATE_SRC $LOCALDNS_SERVICE_DELEGATE_DEST 0644 + + AKS_HOSTS_SETUP_SH_SRC=/home/packer/aks-hosts-setup.sh + AKS_HOSTS_SETUP_SH_DEST=/opt/azure/containers/aks-hosts-setup.sh + cpAndMode $AKS_HOSTS_SETUP_SH_SRC 
$AKS_HOSTS_SETUP_SH_DEST 0755 + + AKS_HOSTS_SETUP_SVC_SRC=/home/packer/aks-hosts-setup.service + AKS_HOSTS_SETUP_SVC_DEST=/etc/systemd/system/aks-hosts-setup.service + cpAndMode $AKS_HOSTS_SETUP_SVC_SRC $AKS_HOSTS_SETUP_SVC_DEST 0644 + + AKS_HOSTS_SETUP_TIMER_SRC=/home/packer/aks-hosts-setup.timer + AKS_HOSTS_SETUP_TIMER_DEST=/etc/systemd/system/aks-hosts-setup.timer + cpAndMode $AKS_HOSTS_SETUP_TIMER_SRC $AKS_HOSTS_SETUP_TIMER_DEST 0644 # --------------------------------------------------------------------------------------- # ------------------------- Files related to azure-network ------------------------------ diff --git a/vhdbuilder/packer/vhd-image-builder-acl-arm64.json b/vhdbuilder/packer/vhd-image-builder-acl-arm64.json index 6cebe0ec0f2..0087444602f 100644 --- a/vhdbuilder/packer/vhd-image-builder-acl-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-acl-arm64.json @@ -631,6 +631,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-acl.json b/vhdbuilder/packer/vhd-image-builder-acl.json index 03adb0f11f0..7768bb9316c 100644 --- a/vhdbuilder/packer/vhd-image-builder-acl.json +++ b/vhdbuilder/packer/vhd-image-builder-acl.json @@ -631,6 +631,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + 
"source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 615da5e9ee3..ada5349a4a5 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -702,6 +702,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-base.json b/vhdbuilder/packer/vhd-image-builder-base.json index bfe60f33041..839b7a5a9fc 100644 --- a/vhdbuilder/packer/vhd-image-builder-base.json +++ b/vhdbuilder/packer/vhd-image-builder-base.json @@ -710,6 +710,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": 
"/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-cvm.json b/vhdbuilder/packer/vhd-image-builder-cvm.json index 0e444781783..21f0fd7b52c 100644 --- a/vhdbuilder/packer/vhd-image-builder-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-cvm.json @@ -714,6 +714,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json index 203a22dc035..664a2d0880b 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar-arm64.json @@ -683,6 +683,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-flatcar.json b/vhdbuilder/packer/vhd-image-builder-flatcar.json index 959d78535d9..11f907a0ead 100644 --- a/vhdbuilder/packer/vhd-image-builder-flatcar.json +++ b/vhdbuilder/packer/vhd-image-builder-flatcar.json @@ -688,6 +688,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json index 6ed96281c5c..8f7dd5480fa 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-arm64.json @@ -676,6 +676,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": 
"/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json index e4d58283d56..6e44f0ace68 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner-cvm.json @@ -677,6 +677,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", diff --git a/vhdbuilder/packer/vhd-image-builder-mariner.json b/vhdbuilder/packer/vhd-image-builder-mariner.json index 3fd5e90a8b3..714f32584c1 100644 --- a/vhdbuilder/packer/vhd-image-builder-mariner.json +++ b/vhdbuilder/packer/vhd-image-builder-mariner.json @@ -678,6 +678,21 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.sh", + "destination": "/home/packer/aks-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-hosts-setup.service", + "destination": "/home/packer/aks-hosts-setup.service" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-hosts-setup.timer", + "destination": "/home/packer/aks-hosts-setup.timer" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh",