diff --git a/config.go b/config.go index 28c08efd5..36c1eb4db 100644 --- a/config.go +++ b/config.go @@ -241,8 +241,8 @@ type Config struct { UDPBufferSize int // DeadNodeReclaimTime controls the time before a dead node's name can be - // reclaimed by one with a different address or port. By default, this is 0, - // meaning nodes cannot be reclaimed this way. + // reclaimed by one with a different address or port. Setting DeadNodeReclaimTime + // to 0 means that dead nodes will never be reclaimed. DeadNodeReclaimTime time.Duration // RequireNodeNames controls if the name of a node is required when sending @@ -319,6 +319,8 @@ func DefaultLANConfig() *Config { DisableTcpPings: false, // TCP pings are safe, even with mixed versions AwarenessMaxMultiplier: 8, // Probe interval backs off to 8 seconds + DeadNodeReclaimTime: 30 * time.Second, // Reclaim dead nodes after 30 seconds + GossipNodes: 3, // Gossip to 3 nodes GossipInterval: 200 * time.Millisecond, // Gossip more rapidly GossipToTheDeadTime: 30 * time.Second, // Same as push/pull diff --git a/state.go b/state.go index 01ed59e22..48ea8eecf 100644 --- a/state.go +++ b/state.go @@ -570,7 +570,7 @@ func (m *Memberlist) resetNodes() { defer m.nodeLock.Unlock() // Move dead nodes, but respect gossip to the dead interval - deadIdx := moveDeadNodes(m.nodes, m.config.GossipToTheDeadTime) + deadIdx := moveDeadNodes(m.nodes, m.config.DeadNodeReclaimTime, m.config.GossipToTheDeadTime) // Deregister the dead nodes for i := deadIdx; i < len(m.nodes); i++ { diff --git a/util.go b/util.go index 5d6f3eac7..e4e1030fa 100644 --- a/util.go +++ b/util.go @@ -105,7 +105,7 @@ func pushPullScale(interval time.Duration, n int) time.Duration { // moveDeadNodes moves dead and left nodes that that have not changed during the gossipToTheDeadTime interval // to the end of the slice and returns the index of the first moved node. -func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { +func moveDeadNodes(nodes []*nodeState, deadNodeReclaimTime, gossipToTheDeadTime time.Duration) int { numDead := 0 n := len(nodes) for i := 0; i < n-numDead; i++ { @@ -113,8 +113,18 @@ func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { continue } - // Respect the gossip to the dead interval - if time.Since(nodes[i].StateChange) <= gossipToTheDeadTime { + if nodes[i].State == StateDead && deadNodeReclaimTime == 0 { + // If deadNodeReclaimTime is 0, we don't reclaim dead nodes + continue + } + + reclaimTime := deadNodeReclaimTime + if gossipToTheDeadTime > reclaimTime { + reclaimTime = gossipToTheDeadTime + } + + // Respect the gossip to the dead interval and the dead node reclaim time + if time.Since(nodes[i].StateChange) <= reclaimTime { continue } diff --git a/util_test.go b/util_test.go index f15b4d216..2f8dc10d0 100644 --- a/util_test.go +++ b/util_test.go @@ -9,6 +9,7 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -166,76 +167,111 @@ func TestPushPullScale(t *testing.T) { } func TestMoveDeadNodes(t *testing.T) { - nodes := []*nodeState{ - &nodeState{ + now := time.Now() + nodeStates := []*nodeState{ + { State: StateDead, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - &nodeState{ + { State: StateAlive, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - // This dead node should not be moved, as its state changed - // less than the specified GossipToTheDead time ago - &nodeState{ + { State: StateDead, - StateChange: time.Now().Add(-10 * time.Second), + StateChange: now.Add(-10 * time.Second), }, - // This left node should not be moved, as its state changed - // less than the specified GossipToTheDead time ago - &nodeState{ + { State: StateLeft, - StateChange: time.Now().Add(-10 * time.Second), + StateChange: now.Add(-10 * time.Second), }, - &nodeState{ + { State: StateLeft, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - &nodeState{ + { State: StateAlive, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - &nodeState{ + { State: StateDead, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - &nodeState{ + { State: StateAlive, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-20 * time.Second), }, - &nodeState{ + { + State: StateLeft, + StateChange: now.Add(-20 * time.Second), + }, + { State: StateLeft, - StateChange: time.Now().Add(-20 * time.Second), + StateChange: now.Add(-30 * time.Second), + }, + { + State: StateDead, + StateChange: now.Add(-30 * time.Second), }, } - idx := moveDeadNodes(nodes, (15 * time.Second)) - if idx != 5 { - t.Fatalf("bad index") + tests := []struct { + name string + deadNodeReclaimTime time.Duration + gossipToTheDeadTime time.Duration + expectedIndex int + }{ + { + name: "Do not reclaim dead nodes when deadNodeReclaimTime is 0", + deadNodeReclaimTime: 0, + gossipToTheDeadTime: 15 * time.Second, + expectedIndex: 8, + }, + { + name: "Reclaim dead nodes when deadNodeReclaimTime is greater than 0", + deadNodeReclaimTime: 10 * time.Second, + gossipToTheDeadTime: 15 * time.Second, + expectedIndex: 5, + }, + { + name: "deadNodeReclaimTime is greater than gossipToTheDeadTime", + deadNodeReclaimTime: 25 * time.Second, + gossipToTheDeadTime: 15 * time.Second, + expectedIndex: 9, + }, } - for i := 0; i < idx; i++ { - switch i { - case 2: - // Recently dead node remains at index 2, - // since nodes are swapped out to move to end. - if nodes[i].State != StateDead { - t.Fatalf("Bad state %d", i) - } - case 3: - //Recently left node should remain at 3 - if nodes[i].State != StateLeft { - t.Fatalf("Bad State %d", i) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + states := make([]*nodeState, len(nodeStates)) + copy(states, nodeStates) + index := moveDeadNodes(states, tt.deadNodeReclaimTime, tt.gossipToTheDeadTime) + assert.Equal(t, tt.expectedIndex, index) + + reclaimTime := now.Add(tt.deadNodeReclaimTime) + if tt.gossipToTheDeadTime > tt.deadNodeReclaimTime { + reclaimTime = now.Add(tt.gossipToTheDeadTime) } - default: - if nodes[i].State != StateAlive { - t.Fatalf("Bad state %d", i) + + if tt.deadNodeReclaimTime == 0 { + for i := 0; i < index; i++ { + if states[i].State == StateLeft { + assert.True(t, states[i].StateChange.Before(reclaimTime), "node %d should have been moved", i) + } + } + for i := index; i < len(states); i++ { + assert.Equal(t, StateLeft, states[i].State) + } + } else { + for i := 0; i < index; i++ { + if states[i].DeadOrLeft() { + assert.True(t, states[i].StateChange.Before(reclaimTime), "node %d should have been moved", i) + } + } + for i := index; i < len(states); i++ { + assert.True(t, states[i].DeadOrLeft()) + } } - } - } - for i := idx; i < len(nodes); i++ { - if !nodes[i].DeadOrLeft() { - t.Fatalf("Bad state %d", i) - } + }) } }