
Commit 2bf57af

Flaky test fixes (#580)
# Summary

This PR aims to reduce the flakiness of the following tests:

### e2e_multi_cluster_sharded_snippets

Increased the timeout of `test_running`: in failing [runs](https://spruce.mongodb.com/task/mongodb_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_snippets_7272a430f4150a5fa67753ae6dc4bcce3ba293e8_25_11_10_08_47_17/files?execution=0&sorts=STATUS%3AASC), the resources do become ready by the time the diagnostics are collected, so the assertion simply times out too early.

### e2e_multi_cluster_appdb_upgrade_downgrade_v1_27_to_mck

Increased the timeout of `test_scale_appdb`. Similarly, the assertion on the AppDB status fails, but the resource becomes ready by the time diagnostics are collected.

### e2e_appdb_tls_operator_upgrade_v1_32_to_mck

This test has a race condition:

```
om-appdb-upgrade-tls   1   7.0.18   Running   Pending   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Running   Running   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   18m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   18m
om-appdb-upgrade-tls   1   7.0.18   Running   Running   Disabled   19m
```

There is a moment during the operator upgrade when the resource reports both the AppDB and OM statuses as Running. This happens very briefly, before the operator starts reconciling OM and sets the OM status to Pending. In that window the test passes both assertions almost instantly and moves on to assert healthiness by connecting to OM, which fails because OM was not actually ready:

```
Reaching phase Running for resource AppDbStatus took 216.2561867237091s
Reaching phase Running for resource OmStatus took 0.0025169849395751953s
```

To fix this, I added a `persist_for` flag to our assertion methods. It ensures that the phase being asserted is not only reached but also persists for a given number of consecutive checks (a minimal sketch of the pattern follows the checklist below).

## Proof of Work

Retried the above tests a few times, and all pass:
https://spruce.mongodb.com/version/6911c25146ed0e00077796e3/tasks?sorts=STATUS%3AASC%3BBASE_STATUS%3ADESC

## Checklist

- [ ] Have you linked a jira ticket and/or is the ticket in the title?
- [ ] Have you checked whether your jira ticket required DOCSP changes?
- [ ] Have you added a changelog file?
  - use the `skip-changelog` label if not needed
  - refer to the [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details
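The mechanism is small enough to show in isolation. Below is a minimal, self-contained sketch of the "persist for N consecutive checks" pattern; the `wait_for_condition` helper and `probe` callable are illustrative names, not the repository's actual `wait_for`, which is shown in the diff further down:

```python
import time


def wait_for_condition(probe, timeout=600, wait=3, persist_for=1):
    """Illustrative sketch: poll `probe` until it returns True on
    `persist_for` consecutive checks, or until `timeout` seconds elapse."""
    consecutive = 0
    while timeout > 0:
        if probe():
            consecutive += 1
            if consecutive == persist_for:
                return True
        else:
            # A single failed check resets the streak, so a transient
            # "Running" blip is not enough to satisfy the assertion.
            consecutive = 0
        timeout -= wait
        time.sleep(wait)
    raise Exception(f"condition did not hold for {persist_for} consecutive checks within the timeout")
```

With `persist_for=1` this behaves like the previous wait (the first success returns immediately); with `persist_for=3` the resource must report the desired phase on three consecutive polls, which is what the flaky TLS upgrade test now relies on.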
1 parent 2c19908 commit 2bf57af

File tree

5 files changed, +70 −13 lines changed

docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py

Lines changed: 21 additions & 2 deletions

```diff
@@ -11,20 +11,39 @@
 
 class MongoDBCommon:
     @TRACER.start_as_current_span("wait_for")
-    def wait_for(self, fn, timeout=None, should_raise=True):
+    def wait_for(self, fn, timeout=None, should_raise=True, persist_for=1):
+        """
+        Waits for the given function `fn` to return True, retrying until the timeout is reached.
+        If persist_for > 1, the function must return True for that many consecutive checks.
+        Optionally raises an exception if the condition is not met within the timeout.
+
+        Args:
+            fn: A callable that returns a boolean.
+            timeout: Maximum time to wait in seconds (default: 600).
+            should_raise: If True, raises an Exception on timeout (default: True).
+            persist_for: Number of consecutive successful checks required (default: 1).
+        Returns:
+            True if the condition is met within the timeout, otherwise raises Exception if `should_raise` is True.
+        """
         if timeout is None:
             timeout = 600
         initial_timeout = timeout
 
         wait = 3
+        retries = 0
         while timeout > 0:
             try:
                 self.reload()
             except Exception as e:
                 print(f"Caught error: {e} while waiting for {fn.__name__}")
                 pass
             if fn(self):
-                return True
+                retries += 1
+                if retries == persist_for:
+                    return True
+            else:
+                retries = 0
+
             timeout -= wait
             time.sleep(wait)
 
```
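Because the polling interval is fixed at 3 seconds, `persist_for=N` effectively requires the condition to hold across roughly `3 * N` seconds of consecutive polls. A hypothetical call site (the `resource` object and `is_ready` predicate are illustrative, not taken from the repository) would look like:

```python
# Old behaviour: returns on the first successful check.
resource.wait_for(lambda r: r.is_ready(), timeout=600)

# New behaviour: the predicate must succeed on 3 consecutive ~3s polls.
resource.wait_for(lambda r: r.is_ready(), timeout=600, persist_for=3)
```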

docker/mongodb-kubernetes-tests/kubetester/opsmanager.py

Lines changed: 42 additions & 7 deletions

```diff
@@ -1017,13 +1017,7 @@ def is_om_multi_cluster(self):
         return self["spec"].get("topology", "") == "MultiCluster"
 
 class StatusCommon:
-    def assert_reaches_phase(
-        self,
-        phase: Phase,
-        msg_regexp=None,
-        timeout=None,
-        ignore_errors=False,
-    ):
+    def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False):
         intermediate_events = (
             # This can be an intermediate error, right before we check for this secret we create it.
             # The cluster might just be slow
@@ -1057,6 +1051,41 @@ def assert_reaches_phase(
             f"Reaching phase {phase.name} for resource {self.__class__.__name__} took {end_time - start_time}s"
         )
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False, persist_for=3):
+        intermediate_events = (
+            # This can be an intermediate error, right before we check for this secret we create it.
+            # The cluster might just be slow
+            "failed to locate the api key secret",
+            # etcd might be slow
+            "etcdserver: request timed out",
+        )
+
+        start_time = time.time()
+        self.ops_manager.wait_for(
+            lambda s: in_desired_state(
+                current_state=self.get_phase(),
+                desired_state=phase,
+                current_generation=self.ops_manager.get_generation(),
+                observed_generation=self.get_observed_generation(),
+                current_message=self.get_message(),
+                msg_regexp=msg_regexp,
+                ignore_errors=ignore_errors,
+                intermediate_events=intermediate_events,
+            ),
+            timeout,
+            should_raise=True,
+            persist_for=persist_for,
+        )
+        end_time = time.time()
+        span = trace.get_current_span()
+        span.set_attribute("mck.resource", self.__class__.__name__)
+        span.set_attribute("mck.action", "assert_phase")
+        span.set_attribute("mck.desired_phase", phase.name)
+        span.set_attribute("mck.time_needed", end_time - start_time)
+        logger.debug(
+            f"Persist phase {phase.name} ({persist_for} retries) for resource {self.__class__.__name__} took {end_time - start_time}s"
+        )
+
     def assert_abandons_phase(self, phase: Phase, timeout=None):
         return self.ops_manager.wait_for(lambda s: self.get_phase() != phase, timeout, should_raise=True)
 
@@ -1113,6 +1142,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["applicationDatabase"]["phase"]]
@@ -1159,6 +1191,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["opsManager"]["phase"]]
```
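Note that the status-class overrides default to `persist_for=1`, so existing callers keep the original single-check semantics and individual tests opt in to the stricter check explicitly. A sketch of the intended call pattern, assuming a loaded `ops_manager` fixture, might look like this:

```python
# Opt-in stricter check where a transient Running state is possible:
ops_manager.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
ops_manager.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)

# Elsewhere the existing assertion keeps its single-check behaviour:
ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=900)
```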

docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -256,7 +256,7 @@ def test_scale_appdb(self, ops_manager: MongoDBOpsManager):
         # Reordering the clusters triggers a change in the state
         ops_manager["spec"]["applicationDatabase"]["clusterSpecList"] = scale_on_upgrade.cluster_spec
         ops_manager.update()
-        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=500)
+        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)
         ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=250)
 
     def test_migrated_state_correctness(
```

docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -104,7 +104,7 @@ def test_running(namespace: str):
         try:
             logger.debug(f"Waiting for {sc.name} to reach Running phase")
             # Once the first resource reached Running, it shouldn't take more than ~300s for the others to do so
-            sc.assert_reaches_phase(Phase.Running, timeout=900 if first_iter else 300)
+            sc.assert_reaches_phase(Phase.Running, timeout=1200 if first_iter else 300)
             succeeded_resources.append(sc.name)
             first_iter = False
             logger.info(f"{sc.name} reached Running phase")
```

docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -161,8 +161,11 @@ def test_upgrade_operator(
 @mark.e2e_appdb_tls_operator_upgrade_v1_32_to_mck
 def test_om_tls_ok(ops_manager_tls: MongoDBOpsManager):
     ops_manager_tls.load()
-    ops_manager_tls.appdb_status().assert_reaches_phase(Phase.Running, timeout=900)
-    ops_manager_tls.om_status().assert_reaches_phase(Phase.Running, timeout=900)
+    # We use assert_persist_phase here to ensure that the status stays in Running phase for some time,
+    # to avoid false positives due to a transient Running state before starting reconciliation of OM.
+    # TODO: Revert after fixing root issue (https://jira.mongodb.org/browse/CLOUDP-364841)
+    ops_manager_tls.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
+    ops_manager_tls.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
     ops_manager_tls.get_om_tester().assert_healthiness()
 
 
```
