Implement enhanced stress-ng jobfile support with robust handling (#3922)

vyadavmsft · web-flow · commit fa955c8bd97f · 2025-07-18T14:30:44.000+08:00
* Enhance stress-ng to use a jobfile.

* Remove debug logs

* Fix Black formatting error

* Addressing comments

* nit fix
diff --git a/lisa/tools/stress_ng.py b/lisa/tools/stress_ng.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+from pathlib import Path
 from typing import cast
 
 from lisa.executable import Tool
@@ -58,7 +59,15 @@ def launch_cpu(self, num_cores: int = 0, timeout_in_seconds: int = 3600) -> None
         self.run(cmd, force_run=True, timeout=timeout_in_seconds)
 
     def launch_job_async(self, job_file: str, sudo: bool = False) -> Process:
-        return self.run_async(f"--job {job_file}", force_run=True, sudo=sudo)
+        job_cmd = f"--job {job_file}"
+        # filename without extension
+        job_filename = Path(job_file).stem
+        yaml_output_name = f"{job_filename}.yaml"
+        # Create full path to YAML file in working directory
+        yaml_output_path = self.node.working_path / yaml_output_name
+        job_cmd += f" --yaml {yaml_output_path}"
+
+        return self.run_async(job_cmd, force_run=True, sudo=sudo)
 
     def launch_class_async(
         self,
diff --git a/microsoft/testsuites/stress/stress_ng_suite.py b/microsoft/testsuites/stress/stress_ng_suite.py
@@ -1,14 +1,19 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
+import logging
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Tuple, cast
+
+import yaml
 
 from lisa import Environment, RemoteNode, TestCaseMetadata, TestSuite, TestSuiteMetadata
+from lisa.base_tools import Cat
 from lisa.features import SerialConsole
 from lisa.messages import TestStatus, send_sub_test_result_message
 from lisa.testsuite import TestResult
 from lisa.tools import StressNg
 from lisa.util import SkippedException
+from lisa.util.logger import Logger
 from lisa.util.process import Process
 
 
@@ -34,14 +39,24 @@ class StressNgTestSuite(TestSuite):
     )
     def stress_ng_jobfile(
         self,
+        log: Logger,
         variables: Dict[str, Any],
         environment: Environment,
         result: TestResult,
     ) -> None:
         if self.CONFIG_VARIABLE in variables:
             jobs = variables[self.CONFIG_VARIABLE]
+
+            # Convert job file configuration to a list if needed
+            if not isinstance(jobs, list):
+                jobs = [job.strip() for job in str(jobs).split(",")]
+
             for job_file in jobs:
-                self._run_stress_ng_job(job_file, environment, result)
+                try:
+                    self._run_stress_ng_job(job_file, environment, result, log)
+                except Exception as e:
+                    log.error(f"Failed to run job file '{job_file}': {e}")
+                    raise
         else:
             raise SkippedException("No jobfile provided for stress-ng")
 
@@ -112,34 +127,219 @@ def _run_stress_ng_job(
         job_file: str,
         environment: Environment,
         test_result: TestResult,
+        log: Logger,
     ) -> None:
+        """
+        Execute a stress-ng job file on all nodes in the environment.
+
+        Args:
+            job_file: Path to the stress-ng job file
+            environment: Test environment containing target nodes
+            test_result: Test result object for reporting
+            log: Logger instance for detailed logging
+        """
+
         nodes = [cast(RemoteNode, node) for node in environment.nodes.list()]
-        procs: List[Process] = []
+        stress_processes: List[Process] = []
         job_file_name = Path(job_file).name
-        test_status = TestStatus.QUEUED
-        test_msg = ""
+
+        execution_status = TestStatus.QUEUED
+        execution_summary = ""
+
         try:
-            for node in nodes:
-                remote_working_dir = node.working_path / "stress_ng_jobs"
-                node.shell.mkdir(remote_working_dir, exist_ok=True)
-                job_file_dest = remote_working_dir / job_file_name
-                node.shell.copy(PurePath(job_file), job_file_dest)
-                procs.append(node.tools[StressNg].launch_job_async(str(job_file_dest)))
-            for proc in procs:
-                proc.wait_result(expected_exit_code=0)
-            test_status = TestStatus.PASSED
-        except Exception as e:
-            test_status = TestStatus.FAILED
-            test_msg = repr(e)
-        finally:
-            send_sub_test_result_message(
-                test_result=test_result,
-                test_case_name=job_file_name,
-                test_status=test_status,
-                test_message=test_msg,
+            self._deploy_and_launch_stress_jobs(
+                nodes, job_file, job_file_name, stress_processes, log
+            )
+
+            execution_status, execution_summary = self._monitor_stress_execution(
+                stress_processes, nodes, log, job_file_name
+            )
+
+        except Exception as execution_error:
+            execution_status = TestStatus.FAILED
+            execution_summary = (
+                f"Error: {type(execution_error).__name__}: {str(execution_error)}"
             )
             self._check_panic(nodes)
+            raise execution_error
+
+        finally:
+            self._report_test_results(
+                test_result, job_file_name, execution_status, execution_summary
+            )
+
+    def _deploy_and_launch_stress_jobs(
+        self,
+        nodes: List[RemoteNode],
+        job_file: str,
+        job_file_name: str,
+        stress_processes: List[Process],
+        log: Logger,
+    ) -> None:
+        """
+        Deploy job files to nodes and launch stress-ng processes.
+
+        Args:
+            nodes: List of target nodes
+            job_file: Local path to job file
+            job_file_name: Name of the job file
+            stress_processes: List to store launched processes
+            log: Logger instance for detailed logging
+        """
+        for node_index, node in enumerate(nodes):
+            try:
+                log.debug(f"Processing node {node_index + 1}/{len(nodes)}: {node.name}")
+
+                # Create dedicated workspace for stress-ng jobs
+                remote_workspace = node.working_path / "stress_ng_jobs"
+                node.shell.mkdir(remote_workspace, exist_ok=True)
+
+                # Deploy job file to remote node
+                remote_job_file = remote_workspace / job_file_name
+                node.shell.copy(PurePath(job_file), remote_job_file)
+
+                # Launch stress-ng with the job file
+                stress_process = node.tools[StressNg].launch_job_async(
+                    str(remote_job_file),
+                )
+                stress_processes.append(stress_process)
+
+            except Exception as deployment_error:
+                log.error(
+                    f"Failed to start stress job on node {node_index + 1}: "
+                    f"{deployment_error}"
+                )
+                if getattr(node, "log", None):
+                    node.log.error(f"Failed to start stress job: {deployment_error}")
+                raise deployment_error
+
+    def _monitor_stress_execution(
+        self,
+        stress_processes: List[Process],
+        nodes: List[RemoteNode],
+        log: Logger,
+        job_file_name: str,
+    ) -> Tuple[TestStatus, str]:
+        """
+        Monitor stress-ng execution and capture stress-ng info output.
+
+        Returns:
+            Tuple of (TestStatus, stress_ng_info_output)
+        """
+
+        failed_nodes = 0
+        node_outputs = []
+        exceptions_to_raise = []
+
+        # Wait for all processes and capture their output
+        for i, process in enumerate(stress_processes):
+            node_name = nodes[i].name
+            try:
+                process.wait_result(timeout=self.TIME_OUT, expected_exit_code=0)
+                log.info(f"{node_name} completed successfully")
+
+                # Process YAML output if applicable
+                node_output = self._process_yaml_output(nodes[i], job_file_name, log)
+
+                node_outputs.append(node_output)
+
+            except Exception as e:
+                failed_nodes += 1
+                error_output = f"=== {node_name} ===\nERROR: {str(e)}"
+                node_outputs.append(error_output)
+                log.error(f"{node_name} failed: {e}")
+                # Store the exception to re-raise after collecting all outputs
+                exceptions_to_raise.append(e)
+
+        # Combine all node outputs, including node names for clarity
+        execution_summary = f"Job: {job_file_name}\n\n"
+        for i, node_output in enumerate(node_outputs):
+            node_name = nodes[i].name
+            execution_summary += f"=== {node_name} ===\n{node_output}\n\n"
+
+        # If any processes failed, re-raise the first exception to fail the test
+        if exceptions_to_raise:
+            log.error(
+                f"Stress-ng job failed on {failed_nodes} node(s). "
+                f"Re-raising first exception to fail the test case."
+            )
+            raise exceptions_to_raise[0]
+
+        # Return status and stress-ng info output
+        overall_status = TestStatus.PASSED if failed_nodes == 0 else TestStatus.FAILED
+        return overall_status, execution_summary
+
+    def _report_test_results(
+        self,
+        test_result: TestResult,
+        job_file_name: str,
+        execution_status: TestStatus,
+        execution_summary: str,
+    ) -> None:
+        """
+        Report the stress test results through LISA's messaging system.
+
+        Args:
+            test_result: Test result object for reporting
+            job_file_name: Name of the executed job file
+            execution_status: Final test status (PASSED/FAILED)
+            execution_summary: Comprehensive execution summary
+        """
+        send_sub_test_result_message(
+            test_result=test_result,
+            test_case_name=job_file_name,
+            test_status=execution_status,
+            test_message=execution_summary,
+        )
 
     def _check_panic(self, nodes: List[RemoteNode]) -> None:
         for node in nodes:
             node.features[SerialConsole].check_panic(saved_path=None, force_run=True)
+
+    def _process_yaml_output(
+        self,
+        node: RemoteNode,
+        job_file_name: str,
+        log: Logger,
+    ) -> str:
+        """
+        Process YAML output file if it exists and return a concise summary string.
+        Only extracts 'system-info' and 'times' sections if present.
+        """
+        logging.getLogger("YamlManager").setLevel(logging.WARNING)
+
+        job_stem = Path(job_file_name).stem
+        yaml_filename = f"{job_stem}.yaml"
+        yaml_file_path = node.working_path / yaml_filename
+
+        if not node.shell.exists(yaml_file_path):
+            return "No YAML output file found"
+
+        cat = node.tools[Cat]
+        yaml_content = cat.read(str(yaml_file_path)).strip()
+        if not yaml_content:
+            return "YAML file is empty"
+
+        try:
+            parsed_yaml = yaml.safe_load(yaml_content)
+        except Exception as e:
+            log.warning(f"Failed to parse YAML content: {e}")
+            return "YAML parse error"
+
+        if not isinstance(parsed_yaml, dict):
+            return str(parsed_yaml) if parsed_yaml else "YAML file is empty or invalid"
+
+        # Only extract 'system-info' and 'times' if present
+        output_lines = []
+        for key in ("system-info", "times"):
+            if key in parsed_yaml:
+                output_lines.append(f"{key}:")
+                value = parsed_yaml[key]
+                if isinstance(value, dict):
+                    for sub_k, sub_v in value.items():
+                        output_lines.append(f"  {sub_k}: {sub_v}")
+                else:
+                    output_lines.append(f"  {value}")
+        if not output_lines:
+            return "No system-info or times in YAML"
+        return "\n".join(output_lines)