From 7476c07b97f07d6ed218ab192786c46d29299b86 Mon Sep 17 00:00:00 2001 From: tanyunqiang Date: Wed, 25 Feb 2026 11:03:25 +0800 Subject: [PATCH] fix(healthcheck): use immediate clear() to ensure k8s endpoint changes take effect Replace delayed_clear with immediate clear() in healthcheck_manager to fix an issue where k8s endpoint changes would not take effect immediately. The delayed clear could cause healthcheck to continue using stale IP addresses after endpoint updates. Even after an old K8s endpoint IP had been destroyed, the healthcheck manager kept checking the old IP address. --- apisix/healthcheck_manager.lua | 19 ++++++++++++++++--- t/node/healthcheck-leak-bugfix.t | 7 ++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/apisix/healthcheck_manager.lua b/apisix/healthcheck_manager.lua index 8133364ee292..d647e97f4e51 100644 --- a/apisix/healthcheck_manager.lua +++ b/apisix/healthcheck_manager.lua @@ -34,7 +34,6 @@ local _M = {} local working_pool = {} -- resource_path -> {version = ver, checker = checker} local waiting_pool = {} -- resource_path -> resource_ver -local DELAYED_CLEAR_TIMEOUT = 10 local healthcheck_shdict_name = "upstream-healthcheck" @@ -192,7 +191,14 @@ local function timer_create_checker() -- if a checker exists then delete it before creating a new one local existing_checker = working_pool[resource_path] if existing_checker then - existing_checker.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) + -- use immediate clear() instead of delayed_clear() to ensure + -- k8s endpoint changes take effect immediately, avoiding stale IP addresses + local clear_ok, clear_err = pcall(function() + existing_checker.checker:clear() + end) + if not clear_ok then + core.log.warn("failed to clear checker: ", clear_err) + end existing_checker.checker:stop() core.log.info("releasing existing checker: ", tostring(existing_checker.checker), " for resource: ", resource_path, " and version: ", @@ -251,7 +257,14 @@ local function timer_working_pool_check() if need_destroy then 
working_pool[resource_path] = nil item.checker.dead = true - item.checker:delayed_clear(DELAYED_CLEAR_TIMEOUT) + -- use immediate clear() instead of delayed_clear() to ensure + -- k8s endpoint changes take effect immediately, avoiding stale IP addresses + local clear_ok, clear_err = pcall(function() + item.checker:clear() + end) + if not clear_ok then + core.log.warn("failed to clear checker: ", clear_err) + end item.checker:stop() core.log.info("try to release checker: ", tostring(item.checker), " for resource: ", resource_path, " and version : ", item.version) diff --git a/t/node/healthcheck-leak-bugfix.t b/t/node/healthcheck-leak-bugfix.t index bcab5689d152..86d3edd1e829 100644 --- a/t/node/healthcheck-leak-bugfix.t +++ b/t/node/healthcheck-leak-bugfix.t @@ -31,8 +31,13 @@ __DATA__ local new = healthcheck.new healthcheck.new = function(...) local obj = new(...) - local clear = obj.delayed_clear + local delayed_clear = obj.delayed_clear obj.delayed_clear = function(...) + ngx.log(ngx.WARN, "clear checker") + return delayed_clear(...) + end + local clear = obj.clear + obj.clear = function(...) ngx.log(ngx.WARN, "clear checker") return clear(...) end