@@ -657,6 +657,158 @@ def test_moving_average_updates_persist_across_workers
657657 assert_equal 5500.0 , chunks . first . estimated_duration
658658 end
659659
660+ # --- Election recovery tests ---
661+
# After the master-status key disappears, a freshly started worker must win the
# election and be able to drain the queue.
def test_master_death_triggers_re_election
  build = 'election-death'
  w1 = worker(1, build_id: build, master_lock_ttl: 1, max_election_attempts: 3)
  assert_predicate w1, :master?

  # Simulate master death: delete the master-status key (as if the lock expired).
  @redis.del("build:#{build}:master-status")

  # Worker 2 should detect the dead master and become the new master.
  w2 = worker(2, build_id: build, master_lock_ttl: 1, max_election_attempts: 3)
  assert_predicate w2, :master?

  # Only the newly elected master is polled here; it must reach the exhausted state.
  poll(w2)
  assert_predicate w2, :exhausted?
end
678+
# A worker that still believes it is master must not clobber master-status once
# another generation's value has been written to that key.
def test_fenced_push_rejects_stale_master
  build = 'fenced-push'
  w1 = worker(1, build_id: build, master_lock_ttl: 30)
  assert_predicate w1, :master?

  # Overwrite master-status to simulate another master winning the election.
  other_gen = 'other-generation-uuid'
  @redis.set("build:#{build}:master-status", "setup:#{other_gen}", ex: 30)

  # w1 still thinks it is master; the push is expected to be rejected because
  # the lock value no longer belongs to it.
  w1.send(:push, ['ATest#test_foo'])

  # master-status should still hold the other master's value.
  status = @redis.get("build:#{build}:master-status")
  assert_equal "setup:#{other_gen}", status, "Fenced push should not overwrite another master's status"
end
695+
# A non-master worker whose known generation no longer matches the stored
# current-generation must leave its poll loop without yielding any test.
def test_generation_stale_exits_poll
  build = 'gen-stale'
  w1 = worker(1, build_id: build)
  assert_predicate w1, :master?

  w2 = worker(2, build_id: build)
  refute_predicate w2, :master?

  # Drain the queue with w1 so w2 sees no tests (idle), but not exhausted.
  poll(w1)

  # Change the generation before w2 polls: w2 will be idle and detect staleness.
  @redis.set("build:#{build}:current-generation", "new-generation-uuid")

  tests_seen = []
  w2.poll do |test|
    tests_seen << test
    w2.acknowledge(test)
  end

  assert_equal 0, tests_seen.size, "Worker should exit poll immediately when generation is stale"
end
718+
# With the current-generation key removed, learning the generation from a
# second worker must raise MasterDied.
def test_learn_generation_raises_when_key_missing
  build = 'learn-gen-missing'
  w1 = worker(1, build_id: build)
  assert_predicate w1, :master?

  # Delete the current-generation key.
  @redis.del("build:#{build}:current-generation")

  # A non-master worker trying to learn the generation should raise MasterDied.
  # NOTE(review): the worker is built inside the block, so a MasterDied raised
  # during construction would also satisfy this assertion. Confirm that is intended.
  assert_raises(CI::Queue::Redis::MasterDied) do
    w2 = worker(2, build_id: build, populate: false)
    w2.send(:learn_generation)
  end
end
733+
# When every observed master dies during setup and only one election attempt
# is allowed, building the worker must raise LostMaster.
def test_max_election_attempts_raises_lost_master
  build = 'max-attempts'

  # A background thread plays the role of a master that keeps dying mid-setup:
  # it repeatedly writes a "setup" status that expires after 50 ms and rewrites
  # it every 60 ms, so the worker under test keeps seeing "setup" followed by
  # nil and is not expected to win the election itself.
  t = Thread.new do
    loop do
      @redis.set("build:#{build}:master-status", "setup:dead-gen", px: 50)
      sleep 0.06
    end
  end

  assert_raises(CI::Queue::Redis::LostMaster) do
    worker(1, build_id: build, max_election_attempts: 1, queue_init_timeout: 0.5)
  end
ensure
  # Stop the writer thread even when the assertion fails; t is nil if the
  # thread was never created.
  t&.kill
end
755+
# Reports a failure for every polled test and tries to requeue it, then checks
# that the build record lists at least one requeued test.
def test_build_record_reads_generation_scoped_requeue_key
  build = 'requeue-gen'
  queue = worker(1, build_id: build, max_requeues: 1, requeue_tolerance: 1.0)

  queue.poll do |example|
    queue.report_failure!
    # Acknowledge only when the requeue attempt was refused.
    queue.acknowledge(example) unless queue.requeue(example)
  end

  refute_empty queue.build.requeued_tests, "Should have requeued at least one test"
end
770+
# A supervisor pointed at a build for which this test starts no worker must
# give a falsy answer from wait_for_workers instead of raising.
def test_supervisor_handles_master_died
  build = 'supervisor-died'
  config = CI::Queue::Configuration.new(
    build_id: build,
    worker_id: 'sup',
    timeout: 0.2,
    queue_init_timeout: 0.3,
    timing_redis_url: @redis_url,
  )
  supervisor = CI::Queue::Redis::Supervisor.new(@redis_url, config)

  # No master ever appears for this build, so waiting must not succeed.
  outcome = supervisor.wait_for_workers
  refute outcome, "Supervisor should return false when master never appeared"
end
788+
# If master-status was written and has already expired by the time a worker
# starts waiting, wait_for_master must raise MasterDied.
def test_wait_for_master_detects_immediate_nil_status
  build = 'nil-status'

  w = CI::Queue::Redis.new(
    @redis_url,
    CI::Queue::Configuration.new(
      build_id: build,
      worker_id: '2',
      timeout: 0.2,
      queue_init_timeout: 0.5,
      timing_redis_url: @redis_url,
    )
  )

  # Simulate a status that was "setup" and then expired (nil): write it with a
  # 1 ms TTL and wait past that TTL.
  @redis.set("build:#{build}:master-status", "setup:some-gen", px: 1)
  sleep 0.01 # Let it expire

  assert_raises(CI::Queue::Redis::MasterDied) do
    w.send(:wait_for_master, timeout: 0.5)
  end
end
811+
660812 private
661813
662814 class MockTest
0 commit comments