From 3d9d27652998e12de5494e20ff3ee863071a03bd Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Tue, 17 Mar 2026 15:34:43 -0700 Subject: [PATCH 1/7] add cordon and drain for upgrading --- go.mod | 37 ++- go.sum | 87 +++++-- pkg/drift/node_maintenance.go | 368 +++++++++++++++++++++++++++++ pkg/drift/node_maintenance_test.go | 240 +++++++++++++++++++ pkg/drift/remediation.go | 7 + pkg/kube/client.go | 104 ++++++++ pkg/status/collector.go | 40 ++-- 7 files changed, 839 insertions(+), 44 deletions(-) create mode 100644 pkg/drift/node_maintenance.go create mode 100644 pkg/drift/node_maintenance_test.go create mode 100644 pkg/kube/client.go diff --git a/go.mod b/go.mod index f0acdaf3..a880a5f8 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( k8s.io/api v0.35.0 k8s.io/apimachinery v0.35.0 k8s.io/client-go v0.35.0 + k8s.io/kubectl v0.35.0 k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 sigs.k8s.io/cluster-api v1.12.2 sigs.k8s.io/yaml v1.6.0 @@ -31,6 +32,7 @@ require ( require ( github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/Azure/go-autorest/autorest v0.11.29 // indirect github.com/Azure/go-autorest/autorest/adal v0.9.23 // indirect @@ -39,11 +41,16 @@ require ( github.com/Azure/go-autorest/tracing v0.6.0 // indirect github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/MakeNowJust/heredoc v1.0.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/chai2010/gettext-go v1.0.2 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.13.0 // indirect + 
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-errors/errors v1.4.2 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect @@ -53,25 +60,33 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/keybase/go-keychain v0.0.1 // indirect github.com/kylelemons/godebug v1.1.0 // indirect + github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/mitchellh/go-wordwrap v1.0.1 // indirect + github.com/moby/term v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - 
github.com/prometheus/common v0.62.0 // indirect - github.com/prometheus/procfs v0.15.1 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect github.com/spf13/afero v1.15.0 // indirect @@ -79,25 +94,33 @@ require ( github.com/spf13/pflag v1.0.10 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/x448/float16 v0.8.4 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect + github.com/xlab/treeprint v1.2.0 // indirect + go.opentelemetry.io/otel v1.36.0 // indirect + go.opentelemetry.io/otel/trace v1.36.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.47.0 // indirect golang.org/x/net v0.49.0 // indirect + golang.org/x/oauth2 v0.33.0 // indirect + golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.40.0 // indirect golang.org/x/term v0.39.0 // indirect golang.org/x/text v0.33.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect + golang.org/x/time v0.9.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.34.3 // indirect + k8s.io/cli-runtime v0.35.0 // indirect k8s.io/cluster-bootstrap v0.34.2 // indirect - k8s.io/component-base v0.34.3 // indirect + k8s.io/component-base v0.35.0 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect sigs.k8s.io/controller-runtime v0.22.5 // indirect sigs.k8s.io/json 
v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/kustomize/api v0.20.1 // indirect + sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect ) diff --git a/go.sum b/go.sum index 9a807769..8f968584 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,8 @@ github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFG github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/Azure/go-autorest/autorest v0.11.29 h1:I4+HL/JDvErx2LjyzaVxllw2lRDB5/BT2Bm4g20iqYw= @@ -40,6 +42,8 @@ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJ github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ= +github.com/MakeNowJust/heredoc v1.0.0/go.mod 
h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -48,24 +52,32 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= +github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= +github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= +github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -94,6 +106,8 @@ github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9v github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= 
github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -105,6 +119,8 @@ github.com/google/renameio/v2 v2.0.2 h1:qKZs+tfn+arruZZhQ7TKC/ergJunuJicWS6gLDt/ github.com/google/renameio/v2 v2.0.2/go.mod h1:OX+G6WHHpHq3NVj7cAOleLOwJfcQ1s3uUJQCrr78SWo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -126,18 +142,26 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= +github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= 
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= +github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= @@ -146,29 +170,34 @@ github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod 
h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= +github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= -github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/common v0.66.1 
h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc= github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= +github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= +github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= @@ -187,7 +216,10 @@ github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjb github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod 
h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= @@ -199,21 +231,25 @@ github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8 github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= +github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= 
-go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= +go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= +go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= +go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= +go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= +go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFULbs= +go.opentelemetry.io/otel/sdk v1.36.0/go.mod h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY= go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= +go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= @@ -243,6 +279,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= 
+golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo= +golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -255,6 +293,7 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -274,6 +313,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod 
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= @@ -285,8 +326,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a h1:v2PbRU4K3llS09c7zodFpNePeamkAwG3mPrAery9VeE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.72.3 h1:6sysal2a4j9trATt+J/TSSEA/Q45ZrXzNh5zy4NMWuA= google.golang.org/grpc v1.72.3/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= @@ -296,6 +337,8 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/dnaeon/go-vcr.v4 v4.0.2 h1:7T5VYf2ifyK01ETHbJPl5A6XTpUljD4Trw3GEDcdedk= gopkg.in/dnaeon/go-vcr.v4 v4.0.2/go.mod h1:65yxh9goQVrudqofKtHA4JNFWd6XZRkWfKN4YpMx7KI= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= 
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -307,16 +350,20 @@ k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW7 k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0= k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/cli-runtime v0.35.0 h1:PEJtYS/Zr4p20PfZSLCbY6YvaoLrfByd6THQzPworUE= +k8s.io/cli-runtime v0.35.0/go.mod h1:VBRvHzosVAoVdP3XwUQn1Oqkvaa8facnokNkD7jOTMY= k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/cluster-bootstrap v0.34.2 h1:oKckPeunVCns37BntcsxaOesDul32yzGd3DFLjW2fc8= k8s.io/cluster-bootstrap v0.34.2/go.mod h1:f21byPR7X5nt12ivZi+J3pb4sG4SH6VySX8KAAJA8BY= -k8s.io/component-base v0.34.3 h1:zsEgw6ELqK0XncCQomgO9DpUIzlrYuZYA0Cgo+JWpVk= -k8s.io/component-base v0.34.3/go.mod h1:5iIlD8wPfWE/xSHTRfbjuvUul2WZbI2nOUK65XL0E/c= +k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= +k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/kubectl v0.35.0 h1:cL/wJKHDe8E8+rP3G7avnymcMg6bH6JEcR5w5uo06wc= +k8s.io/kubectl v0.35.0/go.mod h1:VR5/TSkYyxZwrRwY5I5dDq6l5KXmiCb+9w8IKplk3Qo= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils 
v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/cluster-api v1.12.2 h1:+b+M2IygfvFZJq7bsaloNakimMEVNf81zkGR1IiuxXs= @@ -325,6 +372,10 @@ sigs.k8s.io/controller-runtime v0.22.5 h1:v3nfSUMowX/2WMp27J9slwGFyAt7IV0YwBxAkr sigs.k8s.io/controller-runtime v0.22.5/go.mod h1:pc5SoYWnWI6I+cBHYYdZ7B6YHZVY5xNfll88JB+vniI= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/kustomize/api v0.20.1 h1:iWP1Ydh3/lmldBnH/S5RXgT98vWYMaTUL1ADcr+Sv7I= +sigs.k8s.io/kustomize/api v0.20.1/go.mod h1:t6hUFxO+Ph0VxIk1sKp1WS0dOjbPCtLJ4p8aADLwqjM= +sigs.k8s.io/kustomize/kyaml v0.20.1 h1:PCMnA2mrVbRP3NIB6v9kYCAc38uvFLVs8j/CD567A78= +sigs.k8s.io/kustomize/kyaml v0.20.1/go.mod h1:0EmkQHRUsJxY8Ug9Niig1pUMSCGHxQ5RklbpV/Ri6po= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= diff --git a/pkg/drift/node_maintenance.go b/pkg/drift/node_maintenance.go new file mode 100644 index 00000000..18370e53 --- /dev/null +++ b/pkg/drift/node_maintenance.go @@ -0,0 +1,368 @@ +package drift + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "os" + "strings" + "sync" + "time" + + "github.com/sirupsen/logrus" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/kubectl/pkg/drain" + + "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/AKSFlexNode/pkg/kube" + "github.com/Azure/AKSFlexNode/pkg/utils" +) + +const defaultDrainTimeout = 10 * time.Minute + +type nodeMaintenance interface { + IsCordoned(ctx context.Context, nodeName string) (bool, error) + Cordon(ctx context.Context, nodeName string) 
error + Drain(ctx context.Context, nodeName string) error + Uncordon(ctx context.Context, nodeName string) error +} + +type kubeNodeMaintenance struct { + cfg *config.Config + logger *logrus.Logger + + mu sync.Mutex + client *kubernetes.Clientset + initFrom string +} + +func newKubeNodeMaintenance(cfg *config.Config, logger *logrus.Logger) *kubeNodeMaintenance { + if logger == nil { + logger = logrus.New() + } + return &kubeNodeMaintenance{cfg: cfg, logger: logger} +} + +func (m *kubeNodeMaintenance) IsCordoned(ctx context.Context, nodeName string) (bool, error) { + cs, err := m.clientset(ctx) + if err != nil { + return false, err + } + n, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return false, err + } + return n.Spec.Unschedulable, nil +} + +func (m *kubeNodeMaintenance) Cordon(ctx context.Context, nodeName string) error { + return m.cordonOrUncordon(ctx, nodeName, true) +} + +func (m *kubeNodeMaintenance) Uncordon(ctx context.Context, nodeName string) error { + return m.cordonOrUncordon(ctx, nodeName, false) +} + +func (m *kubeNodeMaintenance) Drain(ctx context.Context, nodeName string) error { + cs, err := m.clientset(ctx) + if err != nil { + return err + } + + h := m.drainHelper(ctx, cs) + if err := drain.RunNodeDrain(h, nodeName); err != nil { + if shouldRetryWithAdmin(err) { + cs2, adminErr := m.forceAdminClientset(ctx) + if adminErr == nil { + h2 := m.drainHelper(ctx, cs2) + return drain.RunNodeDrain(h2, nodeName) + } + } + return err + } + return nil +} + +func (m *kubeNodeMaintenance) cordonOrUncordon(ctx context.Context, nodeName string, cordon bool) error { + cs, err := m.clientset(ctx) + if err != nil { + return err + } + + n, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + + h := m.drainHelper(ctx, cs) + if err := drain.RunCordonOrUncordon(h, n, cordon); err != nil { + if shouldRetryWithAdmin(err) { + cs2, adminErr := m.forceAdminClientset(ctx) + if 
adminErr == nil { + n2, err2 := cs2.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err2 != nil { + return err2 + } + h2 := m.drainHelper(ctx, cs2) + err2 = drain.RunCordonOrUncordon(h2, n2, cordon) + return err2 + } + } + return err + } + return nil +} + +func (m *kubeNodeMaintenance) drainHelper(ctx context.Context, cs *kubernetes.Clientset) *drain.Helper { + out := io.Discard + errOut := io.Discard + if m.logger != nil && m.logger.IsLevelEnabled(logrus.DebugLevel) { + w := &logrusLineWriter{logger: m.logger, level: logrus.DebugLevel} + out = w + errOut = w + } + + return &drain.Helper{ + Ctx: ctx, + Client: cs, + Force: false, + GracePeriodSeconds: -1, + IgnoreAllDaemonSets: true, + DeleteEmptyDirData: true, + Timeout: defaultDrainTimeout, + Out: out, + ErrOut: errOut, + } +} + +func (m *kubeNodeMaintenance) clientset(ctx context.Context) (*kubernetes.Clientset, error) { + m.mu.Lock() + cs := m.client + m.mu.Unlock() + if cs != nil { + return cs, nil + } + + // Prefer an admin client for maintenance operations (cordon/drain) because + // the kubelet/node identity is subject to NodeRestriction and may be unable + // to evict or even read pods once they are being deleted. + // if m.cfg != nil { + // cs, err := m.forceAdminClientset(ctx) + // if err == nil { + // return cs, nil + // } + // if m.logger != nil { + // m.logger.WithError(err).Debug("Failed to create admin clientset for node maintenance; falling back to kubelet kubeconfig") + // } + // } + + // Fall back to the local kubelet kubeconfig if present. + if utils.FileExists(config.KubeletKubeconfigPath) { + cs, err := kube.KubeletClientset() + if err == nil { + m.mu.Lock() + m.client = cs + m.initFrom = "kubelet-kubeconfig" + m.mu.Unlock() + return cs, nil + } + if m.logger != nil { + m.logger.WithError(err).Debug("Failed to create kubelet clientset for node maintenance") + } + } + + // Last resort: admin kubeconfig via the AKS management plane (may still fail if cfg is nil). 
+ return m.forceAdminClientset(ctx) +} + +func (m *kubeNodeMaintenance) forceAdminClientset(ctx context.Context) (*kubernetes.Clientset, error) { + if m.cfg == nil { + return nil, errors.New("config is required to fetch cluster admin kubeconfig") + } + + cs, err := kube.AdminClientset(ctx, m.cfg) + if err != nil { + return nil, err + } + + m.mu.Lock() + m.client = cs + m.initFrom = "aks-admin-kubeconfig" + m.mu.Unlock() + + return cs, nil +} + +func shouldRetryWithAdmin(err error) bool { + if err == nil { + return false + } + // Prefer structured detection when possible... + if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) { + return true + } + // ...but kubectl drain frequently wraps StatusErrors into plain strings. + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "forbidden") || strings.Contains(msg, "unauthorized") +} + +type cordonDrainState struct { + mu sync.Mutex + uncordon bool + nodeName string + initialized bool +} + +func (s *cordonDrainState) set(nodeName string, uncordon bool) { + s.mu.Lock() + s.nodeName = nodeName + s.uncordon = uncordon + s.initialized = true + s.mu.Unlock() +} + +func (s *cordonDrainState) shouldUncordon(nodeName string) bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.initialized && s.nodeName == nodeName && s.uncordon +} + +type cordonAndDrainExecutor struct { + name string + logger *logrus.Logger + ops nodeMaintenance + state *cordonDrainState +} + +func newCordonAndDrainExecutor(name string, logger *logrus.Logger, ops nodeMaintenance, state *cordonDrainState) *cordonAndDrainExecutor { + return &cordonAndDrainExecutor{name: name, logger: logger, ops: ops, state: state} +} + +func (e *cordonAndDrainExecutor) GetName() string { return e.name } + +func (e *cordonAndDrainExecutor) IsCompleted(context.Context) bool { return false } + +func (e *cordonAndDrainExecutor) Execute(ctx context.Context) error { + if e.ops == nil { + return errors.New("node maintenance is nil") + } + + nodeName, err := 
os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname for node maintenance: %w", err) + } + + alreadyCordoned, err := e.ops.IsCordoned(ctx, nodeName) + if err != nil { + return fmt.Errorf("failed to check if node is cordoned: %w", err) + } + + // Only uncordon if we changed the scheduling state. + uncordonAfter := !alreadyCordoned + cordonedByUs := false + + if !alreadyCordoned { + if e.logger != nil { + e.logger.Infof("Cordoning node %s before kubelet upgrade", nodeName) + } + if err := e.ops.Cordon(ctx, nodeName); err != nil { + return fmt.Errorf("failed to cordon node %s: %w", nodeName, err) + } + cordonedByUs = true + } + + if e.logger != nil { + e.logger.Infof("Draining node %s before kubelet upgrade", nodeName) + } + if err := e.ops.Drain(ctx, nodeName); err != nil { + if cordonedByUs { + // We cordoned the node as part of this remediation run; if draining fails we should + // revert the cordon so the node can continue to accept workloads. + if e.logger != nil { + e.logger.WithError(err).Warnf("Drain failed for node %s; uncordoning to restore scheduling", nodeName) + } + if uncordonErr := e.ops.Uncordon(ctx, nodeName); uncordonErr != nil { + return fmt.Errorf("failed to drain node %s: %w (uncordon after drain failure also failed: %v)", nodeName, err, uncordonErr) + } + } + return fmt.Errorf("failed to drain node %s: %w", nodeName, err) + } + + if e.state != nil { + e.state.set(nodeName, uncordonAfter) + } + return nil +} + +type uncordonExecutor struct { + name string + logger *logrus.Logger + ops nodeMaintenance + state *cordonDrainState +} + +func newUncordonExecutor(name string, logger *logrus.Logger, ops nodeMaintenance, state *cordonDrainState) *uncordonExecutor { + return &uncordonExecutor{name: name, logger: logger, ops: ops, state: state} +} + +func (e *uncordonExecutor) GetName() string { return e.name } + +func (e *uncordonExecutor) IsCompleted(context.Context) bool { return false } + +func (e *uncordonExecutor) Execute(ctx 
context.Context) error { + if e.ops == nil { + return errors.New("node maintenance is nil") + } + + nodeName, err := os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname for node maintenance: %w", err) + } + + if e.state == nil || !e.state.shouldUncordon(nodeName) { + if e.logger != nil { + e.logger.Infof("Skipping uncordon for node %s (node was already cordoned)", nodeName) + } + return nil + } + + if e.logger != nil { + e.logger.Infof("Uncordoning node %s after kubelet upgrade", nodeName) + } + if err := e.ops.Uncordon(ctx, nodeName); err != nil { + return fmt.Errorf("failed to uncordon node %s: %w", nodeName, err) + } + return nil +} + +// logrusLineWriter writes each line to logrus at a fixed level. +// It intentionally buffers until a newline so kubectl drain helper output is readable. +type logrusLineWriter struct { + logger *logrus.Logger + level logrus.Level + + mu sync.Mutex + buf bytes.Buffer +} + +func (w *logrusLineWriter) Write(p []byte) (int, error) { + w.mu.Lock() + defer w.mu.Unlock() + + for _, b := range p { + if b == '\n' { + line := w.buf.String() + w.buf.Reset() + if w.logger != nil { + w.logger.Log(w.level, line) + } + continue + } + _ = w.buf.WriteByte(b) + } + return len(p), nil +} diff --git a/pkg/drift/node_maintenance_test.go b/pkg/drift/node_maintenance_test.go new file mode 100644 index 00000000..83d357e0 --- /dev/null +++ b/pkg/drift/node_maintenance_test.go @@ -0,0 +1,240 @@ +package drift + +import ( + "context" + "errors" + "os" + "reflect" + "testing" + + "github.com/sirupsen/logrus" +) + +type fakeNodeMaintenance struct { + cordoned bool + + cordonErr error + drainErr error + uncordonErr error + getErr error + + calls []string +} + +func (f *fakeNodeMaintenance) IsCordoned(ctx context.Context, nodeName string) (bool, error) { + _ = ctx + _ = nodeName + f.calls = append(f.calls, "IsCordoned") + return f.cordoned, f.getErr +} + +func (f *fakeNodeMaintenance) Cordon(ctx context.Context, nodeName string) error { 
+ _ = ctx + _ = nodeName + f.calls = append(f.calls, "Cordon") + if f.cordonErr != nil { + return f.cordonErr + } + f.cordoned = true + return nil +} + +func (f *fakeNodeMaintenance) Drain(ctx context.Context, nodeName string) error { + _ = ctx + _ = nodeName + f.calls = append(f.calls, "Drain") + return f.drainErr +} + +func (f *fakeNodeMaintenance) Uncordon(ctx context.Context, nodeName string) error { + _ = ctx + _ = nodeName + f.calls = append(f.calls, "Uncordon") + if f.uncordonErr != nil { + return f.uncordonErr + } + f.cordoned = false + return nil +} + +func TestCordonAndDrainExecutor_CordonsThenDrains_UncordonAfter(t *testing.T) { + t.Parallel() + + ops := &fakeNodeMaintenance{cordoned: false} + state := &cordonDrainState{} + logger := logrus.New() + + cd := newCordonAndDrainExecutor("cordon-and-drain", logger, ops, state) + if err := cd.Execute(context.Background()); err != nil { + t.Fatalf("Execute err=%v, want nil", err) + } + + wantCalls := []string{"IsCordoned", "Cordon", "Drain"} + if !reflect.DeepEqual(ops.calls, wantCalls) { + t.Fatalf("calls=%v, want %v", ops.calls, wantCalls) + } + + nodeName, _ := os.Hostname() + if !state.shouldUncordon(nodeName) { + t.Fatalf("shouldUncordon=false, want true") + } + + u := newUncordonExecutor("uncordon", logger, ops, state) + if err := u.Execute(context.Background()); err != nil { + t.Fatalf("uncordon Execute err=%v, want nil", err) + } + + if got := ops.calls[len(ops.calls)-1]; got != "Uncordon" { + t.Fatalf("last call=%q, want Uncordon", got) + } +} + +func TestCordonAndDrainExecutor_AlreadyCordoned_DoesNotUncordon(t *testing.T) { + t.Parallel() + + ops := &fakeNodeMaintenance{cordoned: true} + state := &cordonDrainState{} + logger := logrus.New() + + cd := newCordonAndDrainExecutor("cordon-and-drain", logger, ops, state) + if err := cd.Execute(context.Background()); err != nil { + t.Fatalf("Execute err=%v, want nil", err) + } + + wantCalls := []string{"IsCordoned", "Drain"} + if !reflect.DeepEqual(ops.calls, 
wantCalls) { + t.Fatalf("calls=%v, want %v", ops.calls, wantCalls) + } + + u := newUncordonExecutor("uncordon", logger, ops, state) + if err := u.Execute(context.Background()); err != nil { + t.Fatalf("uncordon Execute err=%v, want nil", err) + } + + for _, c := range ops.calls { + if c == "Uncordon" { + t.Fatalf("unexpected Uncordon call; calls=%v", ops.calls) + } + } +} + +func TestCordonAndDrainExecutor_CordonFails_DoesNotDrain(t *testing.T) { + t.Parallel() + + boom := errors.New("boom") + ops := &fakeNodeMaintenance{cordoned: false, cordonErr: boom} + state := &cordonDrainState{} + logger := logrus.New() + + cd := newCordonAndDrainExecutor("cordon-and-drain", logger, ops, state) + err := cd.Execute(context.Background()) + if err == nil { + t.Fatalf("err=nil, want %v", boom) + } + if !errors.Is(err, boom) { + t.Fatalf("err=%v, want to contain %v", err, boom) + } + + wantCalls := []string{"IsCordoned", "Cordon"} + if !reflect.DeepEqual(ops.calls, wantCalls) { + t.Fatalf("calls=%v, want %v", ops.calls, wantCalls) + } +} + +func TestCordonAndDrainExecutor_DrainFails_UncordonsIfCordonedByUs(t *testing.T) { + t.Parallel() + + boom := errors.New("boom") + ops := &fakeNodeMaintenance{cordoned: false, drainErr: boom} + state := &cordonDrainState{} + logger := logrus.New() + + cd := newCordonAndDrainExecutor("cordon-and-drain", logger, ops, state) + err := cd.Execute(context.Background()) + if err == nil { + t.Fatalf("err=nil, want %v", boom) + } + if !errors.Is(err, boom) { + t.Fatalf("err=%v, want to contain %v", err, boom) + } + + wantCalls := []string{"IsCordoned", "Cordon", "Drain", "Uncordon"} + if !reflect.DeepEqual(ops.calls, wantCalls) { + t.Fatalf("calls=%v, want %v", ops.calls, wantCalls) + } + if ops.cordoned { + t.Fatalf("cordoned=true, want false") + } + + nodeName, _ := os.Hostname() + if state.shouldUncordon(nodeName) { + t.Fatalf("shouldUncordon=true, want false") + } +} + +func TestCordonAndDrainExecutor_DrainFails_DoesNotUncordonIfAlreadyCordoned(t 
*testing.T) { + t.Parallel() + + boom := errors.New("boom") + ops := &fakeNodeMaintenance{cordoned: true, drainErr: boom} + state := &cordonDrainState{} + logger := logrus.New() + + cd := newCordonAndDrainExecutor("cordon-and-drain", logger, ops, state) + err := cd.Execute(context.Background()) + if err == nil { + t.Fatalf("err=nil, want %v", boom) + } + if !errors.Is(err, boom) { + t.Fatalf("err=%v, want to contain %v", err, boom) + } + + wantCalls := []string{"IsCordoned", "Drain"} + if !reflect.DeepEqual(ops.calls, wantCalls) { + t.Fatalf("calls=%v, want %v", ops.calls, wantCalls) + } + if !ops.cordoned { + t.Fatalf("cordoned=false, want true") + } +} + +func TestShouldRetryWithAdmin(t *testing.T) { + t.Parallel() + + tcs := []struct { + name string + err error + want bool + }{ + { + name: "nil", + err: nil, + want: false, + }, + { + name: "forbidden wrapped as string", + err: errors.New("error when waiting for pod \"x\" to terminate: pods \"x\" is forbidden: User \"system:node:free-node\" cannot get resource \"pods\""), + want: true, + }, + { + name: "unauthorized wrapped as string", + err: errors.New("Unauthorized"), + want: true, + }, + { + name: "other error", + err: errors.New("context deadline exceeded"), + want: false, + }, + } + + for _, tc := range tcs { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if got := shouldRetryWithAdmin(tc.err); got != tc.want { + t.Fatalf("shouldRetryWithAdmin(%v)=%v, want %v", tc.err, got, tc.want) + } + }) + } +} diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 79eae188..84edec87 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -212,13 +212,20 @@ func runKubernetesUpgradeRemediation( return nil, errors.New("components API connection is required") } + // For kubelet upgrades we cordon+drain the node first to minimize disruption. + // We only uncordon if we cordoned the node in this remediation run. 
+ nodeOps := newKubeNodeMaintenance(cfg, logger) + cordonState := &cordonDrainState{} + steps := []bootstrapper.Executor{ + newCordonAndDrainExecutor("cordon-and-drain", logger, nodeOps, cordonState), // Stop/disable kubelet around the upgrade so we don't run kubelet against partially-updated bits. bootstrapper.StopKubeletExecutor("stop-kubelet", conn, cfg), // Install the desired kube binaries version. bootstrapper.DownloadKubeBinariesExecutor("download-kube-binaries", conn, cfg), // Reconfigure + start kubelet to match the upgraded bits. bootstrapper.StartKubeletExecutor("start-kubelet", conn, cfg), + newUncordonExecutor("uncordon", logger, nodeOps, cordonState), } be := bootstrapper.NewBaseExecutor(cfg, logger) diff --git a/pkg/kube/client.go b/pkg/kube/client.go new file mode 100644 index 00000000..77bf8d5a --- /dev/null +++ b/pkg/kube/client.go @@ -0,0 +1,104 @@ +package kube + +import ( + "context" + "errors" + "fmt" + "sync" + + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v5" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/Azure/AKSFlexNode/pkg/auth" + "github.com/Azure/AKSFlexNode/pkg/config" +) + +var ( + kubeletMu sync.Mutex + kubeletClient *kubernetes.Clientset + kubeletErr error +) + +// KubeletClientset returns a cached client-go clientset constructed from the +// local kubelet kubeconfig (config.KubeletKubeconfigPath). +// +// This is safe to share across status collection and drift remediation within +// the same agent process. 
+func KubeletClientset() (*kubernetes.Clientset, error) { + kubeletMu.Lock() + defer kubeletMu.Unlock() + + if kubeletClient != nil { + return kubeletClient, nil + } + + restCfg, err := clientcmd.BuildConfigFromFlags("", config.KubeletKubeconfigPath) + if err != nil { + kubeletErr = fmt.Errorf("build rest config from kubelet kubeconfig: %w", err) + return nil, kubeletErr + } + cs, err := kubernetes.NewForConfig(restCfg) + if err != nil { + kubeletErr = fmt.Errorf("create clientset from kubelet kubeconfig: %w", err) + return nil, kubeletErr + } + + kubeletClient = cs + kubeletErr = nil + return kubeletClient, nil +} + +// AdminClientset returns a client-go clientset constructed from the AKS cluster +// admin kubeconfig fetched via the Azure management plane. +func AdminClientset(ctx context.Context, cfg *config.Config) (*kubernetes.Clientset, error) { + if cfg == nil { + return nil, errors.New("cfg is nil") + } + + adminCfgBytes, err := fetchClusterAdminKubeconfig(ctx, cfg) + if err != nil { + return nil, err + } + restCfg, err := clientcmd.RESTConfigFromKubeConfig(adminCfgBytes) + if err != nil { + return nil, fmt.Errorf("build rest config from admin kubeconfig: %w", err) + } + cs, err := kubernetes.NewForConfig(restCfg) + if err != nil { + return nil, fmt.Errorf("create clientset from admin kubeconfig: %w", err) + } + return cs, nil +} + +func fetchClusterAdminKubeconfig(ctx context.Context, cfg *config.Config) ([]byte, error) { + cred, err := auth.NewAuthProvider().UserCredential(cfg) + if err != nil { + return nil, fmt.Errorf("get credential: %w", err) + } + + subID := cfg.GetTargetClusterSubscriptionID() + if subID == "" { + return nil, errors.New("target cluster subscription ID is empty") + } + + mcClient, err := armcontainerservice.NewManagedClustersClient(subID, cred, nil) + if err != nil { + return nil, fmt.Errorf("create managed clusters client: %w", err) + } + + clusterRG := cfg.GetTargetClusterResourceGroup() + clusterName := cfg.GetTargetClusterName() + 
if clusterRG == "" || clusterName == "" { + return nil, errors.New("target cluster resource group/name is empty") + } + + resp, err := mcClient.ListClusterAdminCredentials(ctx, clusterRG, clusterName, nil) + if err != nil { + return nil, fmt.Errorf("list cluster admin credentials for %s/%s: %w", clusterRG, clusterName, err) + } + if len(resp.Kubeconfigs) == 0 || resp.Kubeconfigs[0] == nil || len(resp.Kubeconfigs[0].Value) == 0 { + return nil, errors.New("cluster admin kubeconfig was empty") + } + return resp.Kubeconfigs[0].Value, nil +} diff --git a/pkg/status/collector.go b/pkg/status/collector.go index faa1f5f0..7630e29b 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -10,8 +10,10 @@ import ( "time" "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/AKSFlexNode/pkg/kube" "github.com/Azure/AKSFlexNode/pkg/utils" "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) // Collector collects system and node status information @@ -198,32 +200,32 @@ func (c *Collector) isKubeletReady(ctx context.Context) string { return "Unknown" } - // Readiness condition status is one of: True, False, Unknown - args := []string{ - "--kubeconfig", - config.KubeletKubeconfigPath, - "get", - "node", - hostName, - "-o", - "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}", + cs, err := kube.KubeletClientset() + if err != nil { + c.logger.Warnf("Failed to create kubelet clientset for readiness: %v", err) + return "Unknown" } - output, err := utils.RunCommandWithOutput("kubectl", args...) + n, err := cs.CoreV1().Nodes().Get(ctx, hostName, metav1.GetOptions{}) if err != nil { - // Log the kubectl error for debugging. 
- c.logger.Errorf("kubectl command failed: %v with output: %s", err, output) + c.logger.Warnf("Failed to get node %s for readiness: %v", hostName, err) return "Unknown" } - switch strings.TrimSpace(output) { - case "True": - return "Ready" - case "False": - return "NotReady" - default: - return "Unknown" + for _, cond := range n.Status.Conditions { + if cond.Type != "Ready" { + continue + } + switch string(cond.Status) { + case "True": + return "Ready" + case "False": + return "NotReady" + default: + return "Unknown" + } } + return "Unknown" } // NeedsBootstrap checks if the node needs to be (re)bootstrapped based on status file From 41bbc665755650915db098a1e5e6c592d7a2c981 Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Tue, 17 Mar 2026 16:47:36 -0700 Subject: [PATCH 2/7] add status file lock --- pkg/drift/remediation.go | 78 ++++++++++++++++++++++++++++---- pkg/drift/remediation_test.go | 43 ++++++++++++++++++ pkg/status/health.go | 83 ++++++++++++++++++++++++++++++----- pkg/status/health_test.go | 67 ++++++++++++++++++++++++++++ pkg/status/loader.go | 4 ++ pkg/status/lock.go | 19 ++++++++ pkg/status/writer.go | 6 +++ 7 files changed, 280 insertions(+), 20 deletions(-) create mode 100644 pkg/status/lock.go diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index 84edec87..ee0eecc3 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "os" "sync/atomic" "time" @@ -18,6 +19,14 @@ import ( const driftKubernetesUpgradeOperation = "drift-kubernetes-upgrade" +const ( + upgradeStepCordonAndDrain = "cordon-and-drain" + upgradeStepStopKubelet = "stop-kubelet" + upgradeStepDownloadKubeBinaries = "download-kube-binaries" + upgradeStepStartKubelet = "start-kubelet" + upgradeStepUncordon = "uncordon" +) + // maxManagedClusterSpecAge is a safety guard to avoid acting on very stale spec snapshots. 
// In normal operation we run drift immediately after a successful spec collection, so this // should rarely block remediation. @@ -121,13 +130,24 @@ func detectAndRemediate( case RemediationActionKubernetesUpgrade: result, upgradeErr := runKubernetesUpgradeRemediation(ctx, cfg, logger, conn) if upgradeErr != nil { - status.MarkKubeletUnhealthyBestEffort(logger) + if shouldMarkKubeletUnhealthyAfterUpgradeFailure(result, upgradeErr) { + status.MarkKubeletUnhealthyBestEffort(logger) + } return fmt.Errorf("kubernetes upgrade remediation failed: %w", upgradeErr) } if err := handleExecutionResult(result, driftKubernetesUpgradeOperation, logger); err != nil { - status.MarkKubeletUnhealthyBestEffort(logger) + if shouldMarkKubeletUnhealthyAfterUpgradeFailure(result, err) { + status.MarkKubeletUnhealthyBestEffort(logger) + } return fmt.Errorf("kubernetes upgrade remediation execution failed: %w", err) } + // Best-effort: reflect the successful upgrade immediately in the status snapshot so + // subsequent health checks don't rely solely on the periodic status collector. + kubeletVersion := plan.DesiredKubernetesVersion + if kubeletVersion == "" && cfg != nil { + kubeletVersion = cfg.Kubernetes.Version + } + status.MarkKubeletHealthyAfterUpgradeBestEffort(logger, kubeletVersion) logger.Info("Kubernetes upgrade remediation completed successfully") return detectErr @@ -218,18 +238,30 @@ func runKubernetesUpgradeRemediation( cordonState := &cordonDrainState{} steps := []bootstrapper.Executor{ - newCordonAndDrainExecutor("cordon-and-drain", logger, nodeOps, cordonState), + newCordonAndDrainExecutor(upgradeStepCordonAndDrain, logger, nodeOps, cordonState), // Stop/disable kubelet around the upgrade so we don't run kubelet against partially-updated bits. - bootstrapper.StopKubeletExecutor("stop-kubelet", conn, cfg), + bootstrapper.StopKubeletExecutor(upgradeStepStopKubelet, conn, cfg), // Install the desired kube binaries version. 
- bootstrapper.DownloadKubeBinariesExecutor("download-kube-binaries", conn, cfg), + bootstrapper.DownloadKubeBinariesExecutor(upgradeStepDownloadKubeBinaries, conn, cfg), // Reconfigure + start kubelet to match the upgraded bits. - bootstrapper.StartKubeletExecutor("start-kubelet", conn, cfg), - newUncordonExecutor("uncordon", logger, nodeOps, cordonState), + bootstrapper.StartKubeletExecutor(upgradeStepStartKubelet, conn, cfg), + newUncordonExecutor(upgradeStepUncordon, logger, nodeOps, cordonState), } be := bootstrapper.NewBaseExecutor(cfg, logger) - return be.ExecuteSteps(ctx, steps, driftKubernetesUpgradeOperation) + result, err := be.ExecuteSteps(ctx, steps, driftKubernetesUpgradeOperation) + if err != nil && logger != nil { + // Special-case: if the only thing that failed was uncordon, best-effort retry so the + // node doesn't remain stuck unschedulable after a successful upgrade. + if failedStepName(result) == upgradeStepUncordon { + nodeName, hnErr := os.Hostname() + if hnErr == nil && cordonState.shouldUncordon(nodeName) { + logger.WithError(err).Warnf("Upgrade remediation failed at uncordon; retrying uncordon best-effort for node %s", nodeName) + _ = nodeOps.Uncordon(ctx, nodeName) + } + } + } + return result, err } // handleExecutionResult mirrors main's handleExecutionResult but lives in drift so remediation @@ -247,3 +279,33 @@ func handleExecutionResult(result *bootstrapper.ExecutionResult, operation strin return fmt.Errorf("%s failed: %s", operation, result.Error) } + +func failedStepName(result *bootstrapper.ExecutionResult) string { + if result == nil { + return "" + } + for _, sr := range result.StepResults { + if !sr.Success { + return sr.StepName + } + } + return "" +} + +func shouldMarkKubeletUnhealthyAfterUpgradeFailure(result *bootstrapper.ExecutionResult, upgradeErr error) bool { + if upgradeErr == nil { + return false + } + // Only mark kubelet unhealthy when the failure indicates kubelet/binaries are likely in a bad state. 
+ // Cordon/drain failures are generally control-plane/RBAC/timeouts, and uncordon failures do not + // imply kubelet is unhealthy. + switch failedStepName(result) { + case upgradeStepCordonAndDrain, upgradeStepUncordon: + return false + case upgradeStepStopKubelet, upgradeStepDownloadKubeBinaries, upgradeStepStartKubelet: + return true + default: + // Unknown step; be conservative and trigger auto-bootstrap. + return true + } +} diff --git a/pkg/drift/remediation_test.go b/pkg/drift/remediation_test.go index b3b9e58c..b43b4319 100644 --- a/pkg/drift/remediation_test.go +++ b/pkg/drift/remediation_test.go @@ -3,12 +3,14 @@ package drift import ( "context" "errors" + "fmt" "sync/atomic" "testing" "time" "github.com/sirupsen/logrus" + "github.com/Azure/AKSFlexNode/pkg/bootstrapper" "github.com/Azure/AKSFlexNode/pkg/config" "github.com/Azure/AKSFlexNode/pkg/spec" "github.com/Azure/AKSFlexNode/pkg/status" @@ -175,3 +177,44 @@ func TestDetectAndRemediate_ReturnsDetectErrorIfNoFindings(t *testing.T) { t.Fatalf("err=%v, want to contain %v", err, wantErr) } } + +func TestShouldMarkKubeletUnhealthyAfterUpgradeFailure(t *testing.T) { + t.Parallel() + + makeResultFailingAt := func(step string) *bootstrapper.ExecutionResult { + return &bootstrapper.ExecutionResult{ + StepResults: []bootstrapper.StepResult{ + {StepName: step, Success: false, Error: "boom"}, + }, + Error: fmt.Sprintf("failed at %s", step), + } + } + + err := errors.New("boom") + + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepCordonAndDrain), err); got { + t.Fatalf("cordon-and-drain failure marked unhealthy=true, want false") + } + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepUncordon), err); got { + t.Fatalf("uncordon failure marked unhealthy=true, want false") + } + + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepStopKubelet), err); !got { + t.Fatalf("stop-kubelet failure marked 
unhealthy=false, want true") + } + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepDownloadKubeBinaries), err); !got { + t.Fatalf("download-kube-binaries failure marked unhealthy=false, want true") + } + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepStartKubelet), err); !got { + t.Fatalf("start-kubelet failure marked unhealthy=false, want true") + } + + // Unknown step -> conservative true. + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt("something-else"), err); !got { + t.Fatalf("unknown step marked unhealthy=false, want true") + } + // No error -> never mark. + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepStopKubelet), nil); got { + t.Fatalf("nil error marked unhealthy=true, want false") + } +} diff --git a/pkg/status/health.go b/pkg/status/health.go index e2577561..1d701208 100644 --- a/pkg/status/health.go +++ b/pkg/status/health.go @@ -34,20 +34,79 @@ func MarkKubeletUnhealthyBestEffortAtPath(logger *logrus.Logger, statusFilePath now = time.Now() } - snap, err := LoadStatusFromFile(statusFilePath) - if err != nil || snap == nil { - snap = &NodeStatus{} + _ = withStatusFileLock(func() error { + snap, err := loadStatusFromFileUnlocked(statusFilePath) + if err != nil || snap == nil { + snap = &NodeStatus{} + } + + // Make the status clearly unhealthy so NeedsBootstrap() will trigger. 
+ snap.KubeletRunning = false + snap.KubeletReady = "Unknown" + snap.KubeletVersion = "unknown" + snap.LastUpdatedBy = LastUpdatedByDriftDetectionAndRemediation + snap.LastUpdatedReason = LastUpdatedReasonKubernetesVersionDrift + snap.LastUpdated = now + + if err := writeStatusToFileUnlocked(statusFilePath, snap); err != nil { + logger.Debugf("Failed to mark status unhealthy at %s: %v", statusFilePath, err) + } + return nil + }) +} + +// MarkKubeletHealthyAfterUpgradeBestEffort updates the existing status snapshot to reflect +// that kubelet should now be running with the desired version after a successful upgrade. +// +// This is intended to reduce reliance on the periodic status collection loop and to avoid +// triggering unnecessary auto-bootstrap due to stale/unknown kubelet status. +// +// It preserves other status fields (e.g., runc/containerd versions, Arc status). +func MarkKubeletHealthyAfterUpgradeBestEffort(logger *logrus.Logger, kubeletVersion string) { + if logger == nil { + logger = logrus.New() } - // Make the status clearly unhealthy so NeedsBootstrap() will trigger. - snap.KubeletRunning = false - snap.KubeletReady = "Unknown" - snap.KubeletVersion = "unknown" - snap.LastUpdatedBy = LastUpdatedByDriftDetectionAndRemediation - snap.LastUpdatedReason = LastUpdatedReasonKubernetesVersionDrift - snap.LastUpdated = now + statusFilePath := GetStatusFilePath() + MarkKubeletHealthyAfterUpgradeBestEffortAtPath(logger, statusFilePath, kubeletVersion, time.Time{}) +} - if err := WriteStatusToFile(statusFilePath, snap); err != nil { - logger.Debugf("Failed to mark status unhealthy at %s: %v", statusFilePath, err) +// MarkKubeletHealthyAfterUpgradeBestEffortAtPath is the path-based variant used by tests. +// +// If now is zero, time.Now() is used. 
+func MarkKubeletHealthyAfterUpgradeBestEffortAtPath(logger *logrus.Logger, statusFilePath string, kubeletVersion string, now time.Time) { + if logger == nil { + logger = logrus.New() + } + if statusFilePath == "" { + return + } + if now.IsZero() { + now = time.Now() } + + _ = withStatusFileLock(func() error { + snap, err := loadStatusFromFileUnlocked(statusFilePath) + if err != nil || snap == nil { + // Status file should generally exist, but avoid failing hard; create a minimal snapshot. + snap = &NodeStatus{} + } + + snap.KubeletRunning = true + if kubeletVersion != "" { + snap.KubeletVersion = kubeletVersion + } + if snap.KubeletReady == "" { + snap.KubeletReady = "Unknown" + } + + snap.LastUpdatedBy = LastUpdatedByDriftDetectionAndRemediation + snap.LastUpdatedReason = LastUpdatedReasonKubernetesVersionDrift + snap.LastUpdated = now + + if err := writeStatusToFileUnlocked(statusFilePath, snap); err != nil { + logger.Debugf("Failed to mark status healthy after upgrade at %s: %v", statusFilePath, err) + } + return nil + }) } diff --git a/pkg/status/health_test.go b/pkg/status/health_test.go index 6ff39ffe..f34b8541 100644 --- a/pkg/status/health_test.go +++ b/pkg/status/health_test.go @@ -41,3 +41,70 @@ func TestMarkKubeletUnhealthyBestEffortAtPath_CreatesOrUpdatesSnapshot(t *testin t.Fatalf("LastUpdated=%s, want %s", snap.LastUpdated.Format(time.RFC3339Nano), now.Format(time.RFC3339Nano)) } } + +func TestMarkKubeletHealthyAfterUpgradeBestEffortAtPath_UpdatesKubeletOnly(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + path := filepath.Join(dir, "status.json") + logger := logrus.New() + + // Seed a status snapshot similar to what the periodic collector would write. 
+ seed := &NodeStatus{ + KubeletVersion: "1.30.0", + RuncVersion: "1.2.3", + ContainerdVersion: "2.0.0", + KubeletRunning: false, + KubeletReady: "NotReady", + ContainerdRunning: true, + LastUpdated: time.Date(2026, 2, 13, 11, 0, 0, 0, time.UTC), + LastUpdatedBy: LastUpdatedByStatusCollectionLoop, + LastUpdatedReason: LastUpdatedReasonPeriodicStatusLoop, + AgentVersion: "v0", + ArcStatus: ArcStatus{Connected: true, MachineName: "m"}, + } + if err := WriteStatusToFile(path, seed); err != nil { + t.Fatalf("WriteStatusToFile() err=%v", err) + } + + now := time.Date(2026, 2, 13, 12, 0, 0, 0, time.UTC) + MarkKubeletHealthyAfterUpgradeBestEffortAtPath(logger, path, "1.31.0", now) + + snap, err := LoadStatusFromFile(path) + if err != nil { + t.Fatalf("LoadStatusFromFile() err=%v", err) + } + + if snap.KubeletRunning != true { + t.Fatalf("KubeletRunning=%v, want true", snap.KubeletRunning) + } + if snap.KubeletVersion != "1.31.0" { + t.Fatalf("KubeletVersion=%q, want %q", snap.KubeletVersion, "1.31.0") + } + // Preserve KubeletReady if it was already set. + if snap.KubeletReady != "NotReady" { + t.Fatalf("KubeletReady=%q, want %q", snap.KubeletReady, "NotReady") + } + // Preserve other fields. 
+ if snap.RuncVersion != "1.2.3" { + t.Fatalf("RuncVersion=%q, want %q", snap.RuncVersion, "1.2.3") + } + if snap.ContainerdVersion != "2.0.0" { + t.Fatalf("ContainerdVersion=%q, want %q", snap.ContainerdVersion, "2.0.0") + } + if snap.ContainerdRunning != true { + t.Fatalf("ContainerdRunning=%v, want true", snap.ContainerdRunning) + } + if snap.ArcStatus.MachineName != "m" || snap.ArcStatus.Connected != true { + t.Fatalf("ArcStatus=%+v, want machineName=%q connected=true", snap.ArcStatus, "m") + } + if snap.LastUpdatedBy != LastUpdatedByDriftDetectionAndRemediation { + t.Fatalf("LastUpdatedBy=%q, want %q", snap.LastUpdatedBy, LastUpdatedByDriftDetectionAndRemediation) + } + if snap.LastUpdatedReason != LastUpdatedReasonKubernetesVersionDrift { + t.Fatalf("LastUpdatedReason=%q, want %q", snap.LastUpdatedReason, LastUpdatedReasonKubernetesVersionDrift) + } + if !snap.LastUpdated.Equal(now) { + t.Fatalf("LastUpdated=%s, want %s", snap.LastUpdated.Format(time.RFC3339Nano), now.Format(time.RFC3339Nano)) + } +} diff --git a/pkg/status/loader.go b/pkg/status/loader.go index fb4a00aa..76c06145 100644 --- a/pkg/status/loader.go +++ b/pkg/status/loader.go @@ -13,6 +13,10 @@ func LoadStatus() (*NodeStatus, error) { // LoadStatusFromFile loads the node status snapshot from a JSON file. func LoadStatusFromFile(path string) (*NodeStatus, error) { + return loadStatusFromFileUnlocked(path) +} + +func loadStatusFromFileUnlocked(path string) (*NodeStatus, error) { if path == "" { return nil, fmt.Errorf("status path is empty") } diff --git a/pkg/status/lock.go b/pkg/status/lock.go new file mode 100644 index 00000000..bb5dd509 --- /dev/null +++ b/pkg/status/lock.go @@ -0,0 +1,19 @@ +package status + +import "sync" + +// statusFileMu serializes status snapshot updates within a single agent process. +// +// The agent runs multiple goroutines (periodic status collector and drift remediation) +// that can update the same JSON file. 
Writes are atomic on disk, but without a mutex +// we can still get last-writer-wins clobbering and read-modify-write lost updates. +// +// NOTE: This is intentionally an in-process lock only; we don't expect multiple +// agent processes on the same node. +var statusFileMu sync.Mutex + +func withStatusFileLock(fn func() error) error { + statusFileMu.Lock() + defer statusFileMu.Unlock() + return fn() +} diff --git a/pkg/status/writer.go b/pkg/status/writer.go index 8aaddf8b..3c7575cb 100644 --- a/pkg/status/writer.go +++ b/pkg/status/writer.go @@ -10,6 +10,12 @@ import ( // WriteStatusToFile persists the node status snapshot to a JSON file. // It writes atomically to avoid partial writes. func WriteStatusToFile(path string, nodeStatus *NodeStatus) error { + return withStatusFileLock(func() error { + return writeStatusToFileUnlocked(path, nodeStatus) + }) +} + +func writeStatusToFileUnlocked(path string, nodeStatus *NodeStatus) error { if path == "" { return fmt.Errorf("status path is empty") } From 34fdf7624da2459b0ff9b25b0910ae95055cf4df Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Tue, 17 Mar 2026 16:58:23 -0700 Subject: [PATCH 3/7] address copilot comments --- pkg/drift/node_maintenance.go | 28 ++++++++++++++++++---------- pkg/status/collector.go | 9 +++++---- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pkg/drift/node_maintenance.go b/pkg/drift/node_maintenance.go index 18370e53..fec16d84 100644 --- a/pkg/drift/node_maintenance.go +++ b/pkg/drift/node_maintenance.go @@ -81,6 +81,10 @@ func (m *kubeNodeMaintenance) Drain(ctx context.Context, nodeName string) error h2 := m.drainHelper(ctx, cs2) return drain.RunNodeDrain(h2, nodeName) } + // Log failure to obtain admin clientset before returning the original error. 
+ m.logger.WithError(adminErr).WithField("node", nodeName).Warn( + "failed to get admin clientset for drain retry; returning original error", + ) } return err } @@ -111,6 +115,10 @@ func (m *kubeNodeMaintenance) cordonOrUncordon(ctx context.Context, nodeName str err2 = drain.RunCordonOrUncordon(h2, n2, cordon) return err2 } + // Log failure to obtain admin clientset before returning the original error. + m.logger.WithError(adminErr).WithField("node", nodeName).Warn( + "failed to get admin clientset for cordon/uncordon retry; returning original error", + ) } return err } @@ -132,7 +140,7 @@ func (m *kubeNodeMaintenance) drainHelper(ctx context.Context, cs *kubernetes.Cl Force: false, GracePeriodSeconds: -1, IgnoreAllDaemonSets: true, - DeleteEmptyDirData: true, + DeleteEmptyDirData: false, Timeout: defaultDrainTimeout, Out: out, ErrOut: errOut, @@ -150,15 +158,15 @@ func (m *kubeNodeMaintenance) clientset(ctx context.Context) (*kubernetes.Client // Prefer an admin client for maintenance operations (cordon/drain) because // the kubelet/node identity is subject to NodeRestriction and may be unable // to evict or even read pods once they are being deleted. - // if m.cfg != nil { - // cs, err := m.forceAdminClientset(ctx) - // if err == nil { - // return cs, nil - // } - // if m.logger != nil { - // m.logger.WithError(err).Debug("Failed to create admin clientset for node maintenance; falling back to kubelet kubeconfig") - // } - // } + if m.cfg != nil { + cs, err := m.forceAdminClientset(ctx) + if err == nil { + return cs, nil + } + if m.logger != nil { + m.logger.WithError(err).Debug("Failed to create admin clientset for node maintenance; falling back to kubelet kubeconfig") + } + } // Fall back to the local kubelet kubeconfig if present. 
if utils.FileExists(config.KubeletKubeconfigPath) { diff --git a/pkg/status/collector.go b/pkg/status/collector.go index 7630e29b..d781ff8f 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -13,6 +13,7 @@ import ( "github.com/Azure/AKSFlexNode/pkg/kube" "github.com/Azure/AKSFlexNode/pkg/utils" "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -213,13 +214,13 @@ func (c *Collector) isKubeletReady(ctx context.Context) string { } for _, cond := range n.Status.Conditions { - if cond.Type != "Ready" { + if cond.Type != corev1.NodeReady { continue } - switch string(cond.Status) { - case "True": + switch cond.Status { + case corev1.ConditionTrue: return "Ready" - case "False": + case corev1.ConditionFalse: return "NotReady" default: return "Unknown" From 4b6f7898db077a64d03a42f26d13280589cdedad Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Tue, 17 Mar 2026 17:15:27 -0700 Subject: [PATCH 4/7] address copilot comments again --- pkg/drift/remediation.go | 4 ++-- pkg/status/collector.go | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index ee0eecc3..f407acba 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -305,7 +305,7 @@ func shouldMarkKubeletUnhealthyAfterUpgradeFailure(result *bootstrapper.Executio case upgradeStepStopKubelet, upgradeStepDownloadKubeBinaries, upgradeStepStartKubelet: return true default: - // Unknown step; be conservative and trigger auto-bootstrap. - return true + // Unknown step; avoid unnecessary auto-bootstrap unless we can positively identify a kubelet/binary issue. 
+ return false } } diff --git a/pkg/status/collector.go b/pkg/status/collector.go index d781ff8f..dc4d72b4 100644 --- a/pkg/status/collector.go +++ b/pkg/status/collector.go @@ -207,7 +207,10 @@ func (c *Collector) isKubeletReady(ctx context.Context) string { return "Unknown" } - n, err := cs.CoreV1().Nodes().Get(ctx, hostName, metav1.GetOptions{}) + timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + n, err := cs.CoreV1().Nodes().Get(timeoutCtx, hostName, metav1.GetOptions{}) if err != nil { c.logger.Warnf("Failed to get node %s for readiness: %v", hostName, err) return "Unknown" From 8f441ba0c4e089306914557ac2de967486ddd139 Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Tue, 17 Mar 2026 17:24:43 -0700 Subject: [PATCH 5/7] fix ut --- pkg/drift/remediation_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/drift/remediation_test.go b/pkg/drift/remediation_test.go index b43b4319..48d1b708 100644 --- a/pkg/drift/remediation_test.go +++ b/pkg/drift/remediation_test.go @@ -210,8 +210,8 @@ func TestShouldMarkKubeletUnhealthyAfterUpgradeFailure(t *testing.T) { } // Unknown step -> conservative true. - if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt("something-else"), err); !got { - t.Fatalf("unknown step marked unhealthy=false, want true") + if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt("something-else"), err); got { + t.Fatalf("unknown step marked unhealthy=true, want false") } // No error -> never mark. 
if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt(upgradeStepStopKubelet), nil); got { From cceaacfb1bc10d0b7e3806f781ac9f7951b1e5b1 Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Wed, 18 Mar 2026 12:45:37 -0700 Subject: [PATCH 6/7] address comments --- pkg/drift/node_maintenance.go | 10 +++++++++- pkg/drift/node_maintenance_test.go | 10 ++++++++++ pkg/drift/remediation_test.go | 2 +- pkg/kube/client.go | 21 +++++++++++++++------ 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pkg/drift/node_maintenance.go b/pkg/drift/node_maintenance.go index fec16d84..77f6ba5e 100644 --- a/pkg/drift/node_maintenance.go +++ b/pkg/drift/node_maintenance.go @@ -215,7 +215,15 @@ func shouldRetryWithAdmin(err error) bool { } // ...but kubectl drain frequently wraps StatusErrors into plain strings. msg := strings.ToLower(err.Error()) - return strings.Contains(msg, "forbidden") || strings.Contains(msg, "unauthorized") + if strings.Contains(msg, "forbidden") || strings.Contains(msg, "unauthorized") { + return true + } + // Admin kubeconfigs/certs can expire; transport-layer TLS failures won't be classified + // as apierrors.*. Treat these as signals to refresh credentials and retry. 
+ if strings.Contains(msg, "x509:") || strings.Contains(msg, "tls:") { + return true + } + return false } type cordonDrainState struct { diff --git a/pkg/drift/node_maintenance_test.go b/pkg/drift/node_maintenance_test.go index 83d357e0..dea93556 100644 --- a/pkg/drift/node_maintenance_test.go +++ b/pkg/drift/node_maintenance_test.go @@ -221,6 +221,16 @@ func TestShouldRetryWithAdmin(t *testing.T) { err: errors.New("Unauthorized"), want: true, }, + { + name: "x509 cert expired", + err: errors.New("Get \"https://10.0.0.1:443\": x509: certificate has expired or is not yet valid"), + want: true, + }, + { + name: "tls bad certificate", + err: errors.New("remote error: tls: bad certificate"), + want: true, + }, { name: "other error", err: errors.New("context deadline exceeded"), diff --git a/pkg/drift/remediation_test.go b/pkg/drift/remediation_test.go index 48d1b708..9037a947 100644 --- a/pkg/drift/remediation_test.go +++ b/pkg/drift/remediation_test.go @@ -209,7 +209,7 @@ func TestShouldMarkKubeletUnhealthyAfterUpgradeFailure(t *testing.T) { t.Fatalf("start-kubelet failure marked unhealthy=false, want true") } - // Unknown step -> conservative true. 
+ // Unknown step -> don't mark kubelet unhealthy.
 if got := shouldMarkKubeletUnhealthyAfterUpgradeFailure(makeResultFailingAt("something-else"), err); got {
 t.Fatalf("unknown step marked unhealthy=true, want false")
 }
diff --git a/pkg/kube/client.go b/pkg/kube/client.go
index 77bf8d5a..bf76f7fb 100644
--- a/pkg/kube/client.go
+++ b/pkg/kube/client.go
@@ -17,7 +17,6 @@ import (
 var (
 kubeletMu sync.Mutex
 kubeletClient *kubernetes.Clientset
- kubeletErr error
 )
 
 // KubeletClientset returns a cached client-go clientset constructed from the
@@ -35,20 +34,30 @@ func KubeletClientset() (*kubernetes.Clientset, error) {
 
 restCfg, err := clientcmd.BuildConfigFromFlags("", config.KubeletKubeconfigPath)
 if err != nil {
- kubeletErr = fmt.Errorf("build rest config from kubelet kubeconfig: %w", err)
- return nil, kubeletErr
+ return nil, fmt.Errorf("build rest config from kubelet kubeconfig: %w", err)
 }
 
 cs, err := kubernetes.NewForConfig(restCfg)
 if err != nil {
- kubeletErr = fmt.Errorf("create clientset from kubelet kubeconfig: %w", err)
- return nil, kubeletErr
+ return nil, fmt.Errorf("create clientset from kubelet kubeconfig: %w", err)
 }
 
 kubeletClient = cs
- kubeletErr = nil
 return kubeletClient, nil
 }
 
+// InvalidateKubeletClientset clears the cached kubelet clientset.
+//
+// This is useful if the kubelet kubeconfig on disk has rotated (cert renewal,
+// bootstrap regeneration, etc.) and callers want subsequent operations to pick
+// up the new credentials.
+//
+// It is safe to call concurrently.
+func InvalidateKubeletClientset() {
+ kubeletMu.Lock()
+ defer kubeletMu.Unlock()
+ kubeletClient = nil
+}
+
 // AdminClientset returns a client-go clientset constructed from the AKS cluster
 // admin kubeconfig fetched via the Azure management plane.
func AdminClientset(ctx context.Context, cfg *config.Config) (*kubernetes.Clientset, error) { From b3b94c62606925b0980b446f61997560e38328d6 Mon Sep 17 00:00:00 2001 From: Qingqing Zheng Date: Wed, 18 Mar 2026 12:49:24 -0700 Subject: [PATCH 7/7] missed one file --- pkg/drift/remediation.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/drift/remediation.go b/pkg/drift/remediation.go index f407acba..bcf1bd90 100644 --- a/pkg/drift/remediation.go +++ b/pkg/drift/remediation.go @@ -13,6 +13,7 @@ import ( "github.com/Azure/AKSFlexNode/pkg/bootstrapper" "github.com/Azure/AKSFlexNode/pkg/config" + "github.com/Azure/AKSFlexNode/pkg/kube" "github.com/Azure/AKSFlexNode/pkg/spec" "github.com/Azure/AKSFlexNode/pkg/status" ) @@ -143,6 +144,8 @@ func detectAndRemediate( } // Best-effort: reflect the successful upgrade immediately in the status snapshot so // subsequent health checks don't rely solely on the periodic status collector. + // Also invalidate any cached kubelet clientset so readiness checks pick up rotated kubeconfig/certs. + kube.InvalidateKubeletClientset() kubeletVersion := plan.DesiredKubernetesVersion if kubeletVersion == "" && cfg != nil { kubeletVersion = cfg.Kubernetes.Version