
Commit 2bf57af

Flaky test fixes (#580)
# Summary

This PR aims to reduce the flakiness of the following tests:

### e2e_multi_cluster_sharded_snippets

Increased the timeout of `test_running`: in failing [runs](https://spruce.mongodb.com/task/mongodb_kubernetes_e2e_multi_cluster_kind_e2e_multi_cluster_sharded_snippets_7272a430f4150a5fa67753ae6dc4bcce3ba293e8_25_11_10_08_47_17/files?execution=0&sorts=STATUS%3AASC), the resources do become ready by the time the diagnostics are collected, so the assertion simply times out too early.

### e2e_multi_cluster_appdb_upgrade_downgrade_v1_27_to_mck

Increased the timeout of `test_scale_appdb`. Similarly, the assertion on the AppDB status fails, but the resource becomes ready by the time diagnostics are collected.

### e2e_appdb_tls_operator_upgrade_v1_32_to_mck

This test has a race condition:

```
om-appdb-upgrade-tls   1   7.0.18   Running   Pending   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Running   Running   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   17m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   18m
om-appdb-upgrade-tls   1   7.0.18   Pending   Running   Disabled   18m
om-appdb-upgrade-tls   1   7.0.18   Running   Running   Disabled   19m
```

There is a moment during the operator upgrade when the resource reports both the AppDB and OM statuses as Running. This happens very briefly, before the operator starts reconciling OM and sets the OM status to Pending. In that window the test passes both assertions almost instantly and moves on to assert healthiness by connecting to OM, which fails because OM was not actually ready:

```
Reaching phase Running for resource AppDbStatus took 216.2561867237091s
Reaching phase Running for resource OmStatus took 0.0025169849395751953s
```

To fix this, I added a `persist_for` flag to our assertion methods. It ensures that the phase being asserted is not only reached but also persists for a given number of consecutive checks (a minimal sketch of the pattern follows the checklist below).

## Proof of Work

Retried the above tests a few times, and all pass:
https://spruce.mongodb.com/version/6911c25146ed0e00077796e3/tasks?sorts=STATUS%3AASC%3BBASE_STATUS%3ADESC

## Checklist

- [ ] Have you linked a jira ticket and/or is the ticket in the title?
- [ ] Have you checked whether your jira ticket required DOCSP changes?
- [ ] Have you added a changelog file?
  - use the `skip-changelog` label if not needed
  - refer to the [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details
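The mechanism is small enough to show in isolation. Below is a minimal, self-contained sketch of the "persist for N consecutive checks" pattern; the `wait_for_condition` helper and `probe` callable are illustrative names, not the repository's actual `wait_for`, which is shown in the diff further down:

```python
import time


def wait_for_condition(probe, timeout=600, wait=3, persist_for=1):
    """Illustrative sketch: poll `probe` until it returns True on
    `persist_for` consecutive checks, or until `timeout` seconds elapse."""
    consecutive = 0
    while timeout > 0:
        if probe():
            consecutive += 1
            if consecutive == persist_for:
                return True
        else:
            # A single failed check resets the streak, so a transient
            # "Running" blip is not enough to satisfy the assertion.
            consecutive = 0
        timeout -= wait
        time.sleep(wait)
    raise Exception(f"condition did not hold for {persist_for} consecutive checks within the timeout")
```

With `persist_for=1` this behaves like the previous wait (the first success returns immediately); with `persist_for=3` the resource must report the desired phase on three consecutive polls, which is what the flaky TLS upgrade test now relies on.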
1 parent 2c19908 commit 2bf57af

File tree

5 files changed, +70 −13 lines changed

docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py

Lines changed: 21 additions & 2 deletions

```diff
@@ -11,20 +11,39 @@
 
 class MongoDBCommon:
     @TRACER.start_as_current_span("wait_for")
-    def wait_for(self, fn, timeout=None, should_raise=True):
+    def wait_for(self, fn, timeout=None, should_raise=True, persist_for=1):
+        """
+        Waits for the given function `fn` to return True, retrying until the timeout is reached.
+        If persist_for > 1, the function must return True for that many consecutive checks.
+        Optionally raises an exception if the condition is not met within the timeout.
+
+        Args:
+            fn: A callable that returns a boolean.
+            timeout: Maximum time to wait in seconds (default: 600).
+            should_raise: If True, raises an Exception on timeout (default: True).
+            persist_for: Number of consecutive successful checks required (default: 1).
+        Returns:
+            True if the condition is met within the timeout, otherwise raises Exception if `should_raise` is True.
+        """
         if timeout is None:
             timeout = 600
         initial_timeout = timeout
 
         wait = 3
+        retries = 0
         while timeout > 0:
             try:
                 self.reload()
             except Exception as e:
                 print(f"Caught error: {e} while waiting for {fn.__name__}")
                 pass
             if fn(self):
-                return True
+                retries += 1
+                if retries == persist_for:
+                    return True
+            else:
+                retries = 0
+
             timeout -= wait
             time.sleep(wait)
 
```
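Because the polling interval is fixed at 3 seconds, `persist_for=N` effectively requires the condition to hold across roughly `3 * N` seconds of consecutive polls. A hypothetical call site (the `resource` object and `is_ready` predicate are illustrative, not taken from the repository) would look like:

```python
# Old behaviour: returns on the first successful check.
resource.wait_for(lambda r: r.is_ready(), timeout=600)

# New behaviour: the predicate must succeed on 3 consecutive ~3s polls.
resource.wait_for(lambda r: r.is_ready(), timeout=600, persist_for=3)
```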

docker/mongodb-kubernetes-tests/kubetester/opsmanager.py

Lines changed: 42 additions & 7 deletions

```diff
@@ -1017,13 +1017,7 @@ def is_om_multi_cluster(self):
         return self["spec"].get("topology", "") == "MultiCluster"
 
 class StatusCommon:
-    def assert_reaches_phase(
-        self,
-        phase: Phase,
-        msg_regexp=None,
-        timeout=None,
-        ignore_errors=False,
-    ):
+    def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False):
         intermediate_events = (
             # This can be an intermediate error, right before we check for this secret we create it.
             # The cluster might just be slow
@@ -1057,6 +1051,41 @@ def assert_reaches_phase(
             f"Reaching phase {phase.name} for resource {self.__class__.__name__} took {end_time - start_time}s"
         )
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False, persist_for=3):
+        intermediate_events = (
+            # This can be an intermediate error, right before we check for this secret we create it.
+            # The cluster might just be slow
+            "failed to locate the api key secret",
+            # etcd might be slow
+            "etcdserver: request timed out",
+        )
+
+        start_time = time.time()
+        self.ops_manager.wait_for(
+            lambda s: in_desired_state(
+                current_state=self.get_phase(),
+                desired_state=phase,
+                current_generation=self.ops_manager.get_generation(),
+                observed_generation=self.get_observed_generation(),
+                current_message=self.get_message(),
+                msg_regexp=msg_regexp,
+                ignore_errors=ignore_errors,
+                intermediate_events=intermediate_events,
+            ),
+            timeout,
+            should_raise=True,
+            persist_for=persist_for,
+        )
+        end_time = time.time()
+        span = trace.get_current_span()
+        span.set_attribute("mck.resource", self.__class__.__name__)
+        span.set_attribute("mck.action", "assert_phase")
+        span.set_attribute("mck.desired_phase", phase.name)
+        span.set_attribute("mck.time_needed", end_time - start_time)
+        logger.debug(
+            f"Persist phase {phase.name} ({persist_for} retries) for resource {self.__class__.__name__} took {end_time - start_time}s"
+        )
+
     def assert_abandons_phase(self, phase: Phase, timeout=None):
         return self.ops_manager.wait_for(lambda s: self.get_phase() != phase, timeout, should_raise=True)
 
@@ -1113,6 +1142,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["applicationDatabase"]["phase"]]
@@ -1159,6 +1191,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
 
+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["opsManager"]["phase"]]
```
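Note that the status-class overrides default to `persist_for=1`, so existing callers keep the original single-check semantics and individual tests opt in to the stricter check explicitly. A sketch of the intended call pattern, assuming a loaded `ops_manager` fixture, might look like this:

```python
# Opt-in stricter check where a transient Running state is possible:
ops_manager.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
ops_manager.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)

# Elsewhere the existing assertion keeps its single-check behaviour:
ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=900)
```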

docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -256,7 +256,7 @@ def test_scale_appdb(self, ops_manager: MongoDBOpsManager):
         # Reordering the clusters triggers a change in the state
         ops_manager["spec"]["applicationDatabase"]["clusterSpecList"] = scale_on_upgrade.cluster_spec
         ops_manager.update()
-        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=500)
+        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)
         ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=250)
 
     def test_migrated_state_correctness(
```

docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -104,7 +104,7 @@ def test_running(namespace: str):
         try:
             logger.debug(f"Waiting for {sc.name} to reach Running phase")
             # Once the first resource reached Running, it shouldn't take more than ~300s for the others to do so
-            sc.assert_reaches_phase(Phase.Running, timeout=900 if first_iter else 300)
+            sc.assert_reaches_phase(Phase.Running, timeout=1200 if first_iter else 300)
             succeeded_resources.append(sc.name)
             first_iter = False
             logger.info(f"{sc.name} reached Running phase")
```

docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -161,8 +161,11 @@ def test_upgrade_operator(
 @mark.e2e_appdb_tls_operator_upgrade_v1_32_to_mck
 def test_om_tls_ok(ops_manager_tls: MongoDBOpsManager):
     ops_manager_tls.load()
-    ops_manager_tls.appdb_status().assert_reaches_phase(Phase.Running, timeout=900)
-    ops_manager_tls.om_status().assert_reaches_phase(Phase.Running, timeout=900)
+    # We use assert_persist_phase here to ensure that the status stays in Running phase for some time,
+    # to avoid false positives due to a transient Running state before starting reconciliation of OM.
+    # TODO: Revert after fixing root issue (https://jira.mongodb.org/browse/CLOUDP-364841)
+    ops_manager_tls.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
+    ops_manager_tls.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
     ops_manager_tls.get_om_tester().assert_healthiness()
 
 
```
