diff --git a/.gitignore b/.gitignore
index 517031ee..f8bd130f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,3 +183,11 @@ _version.py
 
 # NeMo Run
 .nemo_run/
+
+# Local dev artifacts (anchored to the repository root)
+/configs/
+/local/
+/scripts/
+/log.txt
+/torchrun_job.sh
+/test/run/core/
diff --git a/nemo_run/core/execution/slurm.py b/nemo_run/core/execution/slurm.py
index 2b52d292..3f171255 100644
--- a/nemo_run/core/execution/slurm.py
+++ b/nemo_run/core/execution/slurm.py
@@ -347,6 +347,9 @@ class ResourceRequest:
     #: Template name to use for Ray jobs (e.g., "ray.sub.j2" or "ray_enroot.sub.j2")
     ray_template: str = "ray.sub.j2"
 
+    #: If True, print the generated sbatch script to stdout before submission.
+    print_script: bool = False
+
     #: Set by the executor; cannot be initialized
     job_name: str = field(init=False, default="nemo-job")
     stderr_to_stdout: bool = field(init=False, default=True)
diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py
index 66fabd5d..4fbdaab1 100644
--- a/nemo_run/run/torchx_backend/schedulers/slurm.py
+++ b/nemo_run/run/torchx_backend/schedulers/slurm.py
@@ -185,6 +185,9 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[Any]:  # t
         path = os.path.join(sbatch_dir, f"{executor.job_name}_sbatch.sh")
         script = req.materialize()
 
+        if executor.print_script:
+            print(script)
+
         with open(path, "w") as f:
             f.write(script)
 
diff --git a/test/run/torchx_backend/schedulers/test_slurm.py b/test/run/torchx_backend/schedulers/test_slurm.py
index 9fc08ba0..1d8a9f0d 100644
--- a/test/run/torchx_backend/schedulers/test_slurm.py
+++ b/test/run/torchx_backend/schedulers/test_slurm.py
@@ -726,3 +726,38 @@ def test_non_heterogeneous_ray_cluster(slurm_scheduler, temp_dir):
     # Verify run_as_group was NOT set
     assert not hasattr(executor, "run_as_group") or not executor.run_as_group
     assert isinstance(dryrun_info.request, SlurmRayRequest)
+
+
+@mock.patch("nemo_run.core.execution.slurm.fill_template")
+def test_submit_dryrun_print_script(
+    mock_fill_template, slurm_scheduler, mock_app_def, slurm_executor, capsys
+):
+    """When print_script=True, sbatch script is printed to stdout."""
+    mock_fill_template.return_value = "#!/bin/bash\n# Mock script content"
+    slurm_executor.print_script = True
+
+    with mock.patch.object(SlurmTunnelScheduler, "_initialize_tunnel"):
+        slurm_scheduler.tunnel = mock.MagicMock()
+        with mock.patch.object(SlurmExecutor, "package"):
+            with mock.patch("builtins.open", mock.mock_open()):
+                slurm_scheduler._submit_dryrun(mock_app_def, slurm_executor)
+
+    captured = capsys.readouterr()
+    assert "#!/bin/bash" in captured.out
+
+
+@mock.patch("nemo_run.core.execution.slurm.fill_template")
+def test_submit_dryrun_no_print_by_default(
+    mock_fill_template, slurm_scheduler, mock_app_def, slurm_executor, capsys
+):
+    """By default (print_script=False), nothing is printed to stdout."""
+    mock_fill_template.return_value = "#!/bin/bash\n# Mock script content"
+
+    with mock.patch.object(SlurmTunnelScheduler, "_initialize_tunnel"):
+        slurm_scheduler.tunnel = mock.MagicMock()
+        with mock.patch.object(SlurmExecutor, "package"):
+            with mock.patch("builtins.open", mock.mock_open()):
+                slurm_scheduler._submit_dryrun(mock_app_def, slurm_executor)
+
+    captured = capsys.readouterr()
+    assert captured.out == ""