@@ -207,11 +207,27 @@ def reset(self, operation: str, force_run: bool = False) -> None:
207207 return
208208
209209 body = {"ResetType" : operation }
210- response = self .redfish_instance .post (
211- "/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset" ,
212- body = body ,
213- )
214- self ._wait_for_completion (response )
210+
211+ # Try reset operation with iDRAC recovery on HTTP 500 errors
212+ try :
213+ response = self .redfish_instance .post (
214+ "/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset" ,
215+ body = body ,
216+ )
217+ self ._wait_for_completion (response )
218+ except LisaException as e :
219+ if self ._reset_if_idrac_error (str (e )):
220+ # iDRAC was reset, retry the operation once
221+ url = (
222+ "/redfish/v1/Systems/System.Embedded.1/Actions/"
223+ "ComputerSystem.Reset"
224+ )
225+ response = self .redfish_instance .post (url , body = body )
226+ self ._wait_for_completion (response )
227+ else :
228+ # Not a retriable iDRAC error - re-raise original exception
229+ raise
230+
215231 if operation in self .state_dict .keys ():
216232 check_till_timeout (
217233 lambda : self .get_power_state () == expected_state ,
@@ -314,34 +330,93 @@ def _check_vm_cleared() -> bool:
314330 "VirtualMedia still appears inserted after ejects; continuing."
315331 )
316332
def _reset_if_idrac_error(self, error_str: str) -> bool:
    """
    Reset iDRAC when an error message reports an iDRAC internal failure.

    The message is searched for either of two Redfish message IDs:
    - IDRAC.<version>.SYS446       (Dell iDRAC message registry)
    - Base.<version>.InternalError (DMTF Base message registry)

    Redfish message IDs have the form
    ``<RegistryPrefix>.<Major>.<Minor>.<MessageKey>``. The version part
    belongs to the registry and can differ between firmware releases, so it
    is matched generically rather than pinned to 2.8 / 1.12. The original
    IDs "IDRAC.2.8.SYS446" and "Base.1.12.InternalError" still match.

    Args:
        error_str: The error message string to check

    Returns:
        True if an internal-error message ID was found and
        ``self._reset_idrac()`` was called, False otherwise (in which case
        no reset is attempted).
    """
    # Function-scope import so the method does not depend on the module's
    # import block.
    import re

    # "(?:\d+\.)+" accepts any dotted numeric registry version.
    internal_error_id = re.compile(
        r"IDRAC\.(?:\d+\.)+SYS446|Base\.(?:\d+\.)+InternalError"
    )
    if not internal_error_id.search(error_str):
        return False

    # Per error message: "If the problem persists, consider resetting
    # the service."
    self._log.debug(
        "iDRAC internal server error detected. "
        "Resetting iDRAC service per error message guidance..."
    )
    self._reset_idrac()
    return True
366+
def _reset_idrac(self) -> None:
    """
    Reset iDRAC to recover from internal errors and clear stale state.

    Steps:
    1. POST a GracefulRestart to the Manager.Reset action. The response is
       not passed to ``_wait_for_completion`` (to avoid recursion through
       it); a non-2xx status is only logged.
    2. Log out the current session on a best-effort basis, since the
       restart invalidates it anyway.
    3. Poll every 5 seconds: log in, read the manager resource, and stop
       once its ``Status.State`` is "Enabled". Any session opened by an
       unsuccessful attempt is closed again so repeated polling does not
       pile up sessions.

    On success the method returns with a fresh logged-in session.
    Polling is bounded by IDRAC_RESET_TIMEOUT via ``check_till_timeout``.
    """
    self._log.info("Resetting iDRAC to recover from internal error...")

    # Send reset request without waiting for completion
    # (to avoid recursion through _wait_for_completion)
    response = self.redfish_instance.post(
        "/redfish/v1/Managers/iDRAC.Embedded.1/Actions/Manager.Reset",
        body={"ResetType": "GracefulRestart"},
    )

    # Just check the immediate response status
    if response.status not in [200, 202, 204]:
        self._log.debug(
            f"iDRAC reset request returned status {response.status}, "
            f"continuing anyway"
        )

    # Logout old session (will be invalidated by iDRAC reset anyway).
    # Best effort: a failure here must not abort the recovery.
    self._log.debug("Logging out before iDRAC restart...")
    try:
        self.logout()
    except Exception as e:
        self._log.debug(f"Logout before iDRAC restart failed, ignoring: {e}")

    # Poll for iDRAC readiness (typically takes 3-4 minutes)
    self._log.debug("Waiting for iDRAC to restart and become ready...")

    def _try_login() -> bool:
        session_open = False
        try:
            self.login()
            session_open = True
            # Verify we can actually query the manager
            mgr_state = self.redfish_instance.get(
                "/redfish/v1/Managers/iDRAC.Embedded.1"
            ).dict
            if mgr_state.get("Status", {}).get("State") == "Enabled":
                self._log.info("iDRAC reset completed successfully")
                return True
        except Exception as e:
            # iDRAC may still be restarting, ignore connection errors
            self._log.debug(f"iDRAC not ready yet: {e}")

        # Not ready: close the session opened by this attempt (if any) so
        # the next poll starts clean.
        if session_open:
            try:
                self.logout()
            except Exception as e:
                self._log.debug(f"Logout after failed readiness check: {e}")
        return False

    check_till_timeout(
        _try_login,
        timeout_message="iDRAC did not recover after reset",
        timeout=IDRAC_RESET_TIMEOUT,
        interval=5,
    )
346421
347422 def _insert_virtual_media (self , iso_http_url : str ) -> None :
@@ -388,6 +463,12 @@ def _check_media_inserted() -> bool:
388463
389464 except LisaException as e :
390465 error_msg = str (e )
466+
467+ # Check for HTTP 500 internal server errors and reset if needed
468+ if self ._reset_if_idrac_error (error_msg ):
469+ # Re-raise to trigger retry
470+ raise
471+
391472 # Check for RAC0904 or reachability errors that need iDRAC reset
392473 is_reachability_error = (
393474 "RAC0904" in error_msg or "not accessible or reachable" in error_msg
0 commit comments