Skip to content

Commit b6a925a

Browse files
committed
iDRAC: Handle HTTP 500 internal errors with service reset
1 parent 1417880 commit b6a925a

File tree

1 file changed

+98
-17
lines changed
  • lisa/sut_orchestrator/baremetal/cluster

1 file changed

+98
-17
lines changed

lisa/sut_orchestrator/baremetal/cluster/idrac.py

Lines changed: 98 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,27 @@ def reset(self, operation: str, force_run: bool = False) -> None:
207207
return
208208

209209
body = {"ResetType": operation}
210-
response = self.redfish_instance.post(
211-
"/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset",
212-
body=body,
213-
)
214-
self._wait_for_completion(response)
210+
211+
# Try reset operation with iDRAC recovery on HTTP 500 errors
212+
try:
213+
response = self.redfish_instance.post(
214+
"/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset",
215+
body=body,
216+
)
217+
self._wait_for_completion(response)
218+
except LisaException as e:
219+
if self._reset_if_idrac_error(str(e)):
220+
# iDRAC was reset, retry the operation once
221+
url = (
222+
"/redfish/v1/Systems/System.Embedded.1/Actions/"
223+
"ComputerSystem.Reset"
224+
)
225+
response = self.redfish_instance.post(url, body=body)
226+
self._wait_for_completion(response)
227+
else:
228+
# Not a retriable iDRAC error - re-raise original exception
229+
raise
230+
215231
if operation in self.state_dict.keys():
216232
check_till_timeout(
217233
lambda: self.get_power_state() == expected_state,
@@ -314,34 +330,93 @@ def _check_vm_cleared() -> bool:
314330
"VirtualMedia still appears inserted after ejects; continuing."
315331
)
316332

333+
def _reset_if_idrac_error(self, error_str: str) -> bool:
334+
"""
335+
Check if error indicates iDRAC internal issues and reset if needed.
336+
337+
Args:
338+
error_str: The error message string to check
339+
340+
Returns:
341+
True if this was an iDRAC error that triggered a reset, False otherwise
342+
343+
This method checks for specific iDRAC internal error message IDs.
344+
These message IDs come from Redfish message registries (Dell OEM and DMTF Base):
345+
- IDRAC.2.8.SYS446: Dell iDRAC-specific message (Dell registry version 2.8)
346+
- Base.1.12.InternalError: DMTF standard message (Base registry version 1.12)
347+
348+
Both indicate transient iDRAC service errors that resolve after reset.
349+
Reference: DMTF DSP2065 (Redfish Message Registry Guide)
350+
"""
351+
is_idrac_internal_error = (
352+
"IDRAC.2.8.SYS446" in error_str or "Base.1.12.InternalError" in error_str
353+
)
354+
355+
if is_idrac_internal_error:
356+
# Per error message: "If the problem persists, consider resetting
357+
# the service."
358+
self._log.debug(
359+
"iDRAC internal server error detected. "
360+
"Resetting iDRAC service per error message guidance..."
361+
)
362+
self._reset_idrac()
363+
return True
364+
365+
return False
366+
317367
def _reset_idrac(self) -> None:
318-
"""Reset iDRAC to clear stale virtual media state."""
319-
self._log.debug("Resetting iDRAC (GracefulRestart) to clear stale VM state...")
368+
"""
369+
Reset iDRAC to recover from internal errors and clear stale state.
370+
371+
Handles session invalidation by logging out after the reset request is sent
371+
and logging back in once iDRAC restarts.
373+
"""
374+
self._log.info("Resetting iDRAC to recover from internal error...")
375+
376+
# Send reset request without waiting for completion
377+
# (to avoid recursion through _wait_for_completion)
320378
response = self.redfish_instance.post(
321379
"/redfish/v1/Managers/iDRAC.Embedded.1/Actions/Manager.Reset",
322380
body={"ResetType": "GracefulRestart"},
323381
)
324-
self._wait_for_completion(response)
325382

326-
# Poll manager until Enabled (up to 2 minutes)
327-
def _check_idrac_ready() -> bool:
383+
# Just check the immediate response status
384+
if response.status not in [200, 202, 204]:
385+
self._log.debug(
386+
f"iDRAC reset request returned status {response.status}, "
387+
f"continuing anyway"
388+
)
389+
390+
# Logout old session (will be invalidated by iDRAC reset anyway)
391+
self._log.debug("Logging out before iDRAC restart...")
392+
self.logout()
393+
394+
# Poll for iDRAC readiness (typically takes 3-4 minutes)
395+
self._log.debug("Waiting for iDRAC to restart and become ready...")
396+
397+
def _try_login() -> bool:
328398
try:
399+
self.login()
400+
# Verify we can actually query the manager
329401
mgr_state = self.redfish_instance.get(
330402
"/redfish/v1/Managers/iDRAC.Embedded.1"
331403
).dict
332404
if mgr_state.get("Status", {}).get("State") == "Enabled":
333405
self._log.info("iDRAC reset completed successfully")
334406
return True
335-
except Exception:
336-
# iDRAC may be restarting, ignore connection errors
337-
pass
338-
return False
407+
# Not enabled yet
408+
self.logout()
409+
return False
410+
except Exception as e:
411+
# iDRAC may still be restarting, ignore connection errors
412+
self._log.debug(f"iDRAC not ready yet: {e}")
413+
return False
339414

340415
check_till_timeout(
341-
_check_idrac_ready,
342-
timeout_message="iDRAC did not come back after Manager.Reset",
416+
_try_login,
417+
timeout_message="iDRAC did not recover after reset",
343418
timeout=IDRAC_RESET_TIMEOUT,
344-
interval=2,
419+
interval=5,
345420
)
346421

347422
def _insert_virtual_media(self, iso_http_url: str) -> None:
@@ -388,6 +463,12 @@ def _check_media_inserted() -> bool:
388463

389464
except LisaException as e:
390465
error_msg = str(e)
466+
467+
# Check for HTTP 500 internal server errors and reset if needed
468+
if self._reset_if_idrac_error(error_msg):
469+
# Re-raise to trigger retry
470+
raise
471+
391472
# Check for RAC0904 or reachability errors that need iDRAC reset
392473
is_reachability_error = (
393474
"RAC0904" in error_msg or "not accessible or reachable" in error_msg

0 commit comments

Comments
 (0)