- Environment:
  - Tarantool version: Tarantool 3.5.0-entrypoint-128-g71766c8bf9
  - Vshard version: 0.1.38
  - OS: linux, x86-64
- Bug description: internal rebalancer errors (e.g. "Replica %s has sending / receiving / garbage buckets" or "Rebalancer is not active ...") are hidden by the log rate limiter (`vshard.log_ratelimit`). Reason: all these errors (returned from `rebalancer_request_state`) have the same `err_type` and `err_code`, so the rate limiter does not log the last one. It is also unclear why the "Rebalancer is not active ..." error appears in our logs at all: I checked the result of `rebalancer_request_state`, and this function never returned this kind of error. A toy sketch of the suspected deduplication is shown below.
- Reproducer:
```lua
local t = require('luatest')
local vtest = require('test.luatest_helpers.vtest')
local vutil = require('vshard.util')
local consts = require('vshard.consts')

local test_group = t.group('rebalancer')

local cfg_template = {
    sharding = {
        {
            replicas = {
                replica_1_a = {
                    master = true,
                },
            },
        },
        {
            replicas = {
                replica_2_a = {
                    master = true,
                },
            },
        },
    },
    bucket_count = 10,
    replication_timeout = 0.1,
}
local global_cfg

test_group.before_all(function(g)
    global_cfg = vtest.config_new(cfg_template)
    vtest.cluster_new(g, global_cfg)
    vtest.cluster_bootstrap(g, global_cfg)
    vtest.cluster_rebalancer_disable(g)
end)

test_group.after_all(function(g)
    g.cluster:drop()
end)

test_group.test_rebalancer_errors = function(g)
    g.replica_1_a:exec(function()
        vshard.storage.rebalancer_enable()
        vshard.storage.rebalancer_wakeup()
    end)
    t.helpers.retrying({}, function()
        t.assert(g.replica_1_a:grep_log('Rebalancer is not active ' ..
                                        'or is in progress'))
        -- <--- Why does this error appear?
    end)
    g.replica_1_a:exec(function()
        box.space._bucket:update(1, {{'=', 2, 'sending'}})
        vshard.storage.rebalancer_wakeup()
    end)
    t.helpers.retrying({}, function()
        local log = string.format('Replica %s has sending buckets',
                                  g.replica_1_a:replicaset_uuid())
        t.assert(g.replica_1_a:grep_log(log))
        -- <--- Hangs, as there is no "Replica %s has sending buckets" in the logs
    end)
end
```
- Actual result:
2025-11-14 13:30:29.081 [49832] main/126/vshard.rebalancer/vshard.log_ratelimit log_ratelimit.lua:138 E> Error during downloading rebalancer states: {"replicaset_id":"00000000-0000-0000-0000-000000000004","trace":[{"file":"/home/mrforza/Desktop/vshard/vshard/error.lua","line":284}],"code":32,"base_type":"ClientError","type":"ClientError","details":"Rebalancer is not active or is in progress","name":"PROC_LUA","message":"Rebalancer is not active or is in progress"}
2025-11-14 13:30:29.081 [49832] main/131/vshard.ratelimit_flush/vshard.util util.lua:114 I> ratelimit_flush_f has been started
2025-11-14 13:30:33.658 [49832] main/125/vshard.recovery/vshard.storage init.lua:951 I> Starting sending buckets recovery step
2025-11-14 13:30:33.658 [49832] main/125/vshard.recovery/vshard.storage init.lua:952 W> Can not find for bucket 1 its peer nil
- Expected behavior:
2025-11-14 13:30:29.081 [49832] main/126/vshard.rebalancer/vshard.log_ratelimit log_ratelimit.lua:138 E> Error during downloading rebalancer states: {"replicaset_id":"00000000-0000-0000-0000-000000000004","trace":[{"file":"/home/mrforza/Desktop/vshard/vshard/error.lua","line":284}],"code":32,"base_type":"ClientError","type":"ClientError","details":"Rebalancer is not active or is in progress","name":"PROC_LUA","message":"Rebalancer is not active or is in progress"}
2025-11-14 13:30:29.081 [49832] main/131/vshard.ratelimit_flush/vshard.util util.lua:114 I> ratelimit_flush_f has been started
2025-11-14 13:30:33.658 [49832] main/125/vshard.recovery/vshard.storage init.lua:951 I> Starting sending buckets recovery step
2025-11-14 13:30:33.658 [49832] main/125/vshard.recovery/vshard.storage init.lua:952 W> Can not find for bucket 1 its peer nil
.
.
.
2025-11-14 13:30:29.081 [49832] main/126/vshard.rebalancer/vshard.log_ratelimit log_ratelimit.lua:138 E> Error during downloading rebalancer states: {"replicaset_id":"00000000-0000-0000-0000-000000000004","trace":[{"file":"/home/mrforza/Desktop/vshard/vshard/error.lua","line":284}],"code":32,"base_type":"ClientError","type":"ClientError","details":"Replica 00000000-0000-0000-0000-000000000004 has sending buckets","name":"PROC_LUA","message":"Replica 00000000-0000-0000-0000-000000000004 has sending buckets"}
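One possible direction for a fix, sketched under the assumption that the rate limiter dedups on a type/code key (the names below are hypothetical, not vshard's actual API): include the error message in the key, so errors that share `err_type` and `err_code` but carry different texts are still logged, as in the expected output above.

```lua
-- Hypothetical sketch, not the real vshard log_ratelimit implementation:
-- widen the dedup key with the message so distinct rebalancer errors
-- ("Rebalancer is not active ..." vs "Replica ... has sending buckets")
-- are not collapsed into one ratelimit slot.
local seen = {}

local function ratelimited_log(err)
    local key = table.concat({err.type, tostring(err.code), err.message}, '/')
    if seen[key] then
        return false -- only true repeats of the same message are suppressed
    end
    seen[key] = true
    print(('E> Error during downloading rebalancer states: %s'):format(err.message))
    return true
end

assert(ratelimited_log({type = 'ClientError', code = 32,
    message = 'Rebalancer is not active or is in progress'}))
-- Same type/code but a different message is now logged as expected:
assert(ratelimited_log({type = 'ClientError', code = 32,
    message = 'Replica 00000000-0000-0000-0000-000000000004 has sending buckets'}))
```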