From 79100b2e3bad613d327cb6776ecf54268fdca8c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 10:28:07 +0000
Subject: [PATCH 1/7] feat: add Experiment.export() to write runnable scripts
 without submitting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `Experiment.export(output_dir)` which writes one script per job into
a self-contained directory plus a `submit_all.sh` launcher, enabling users
to inspect, version, and manually submit jobs without going through the
NeMo Run execution pipeline.

Each scheduler's `_submit_dryrun()` now writes its script to
`executor.experiment_dir` when set:
- LocalExecutor    → <task>.sh        (executable bash)
- DockerExecutor   → <task>.yaml
- SkypilotExecutor / SkypilotJobsExecutor → <task>.yaml
- LeptonExecutor   → <task>.sh        (executable bash)
- DGXCloudExecutor → <task>_torchrun_job.sh (was a fixed `torchrun_job.sh`; now prefixed with job_name)

`Experiment.export()` redirects all executor experiment_dirs to output_dir,
runs dryrun to trigger script writing, then generates submit_all.sh with
the correct submit command per executor type (sbatch, sky launch, etc.).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/run/experiment.py                    | 78 +++++++++++++++++++
 .../run/torchx_backend/schedulers/dgxcloud.py |  4 +-
 .../run/torchx_backend/schedulers/docker.py   |  6 ++
 .../run/torchx_backend/schedulers/lepton.py   | 14 ++++
 .../run/torchx_backend/schedulers/local.py    | 16 ++++
 .../run/torchx_backend/schedulers/skypilot.py |  6 ++
 .../schedulers/skypilot_jobs.py               |  6 ++
 test/run/test_experiment.py                   | 69 ++++++++++++++++
 8 files changed, 197 insertions(+), 2 deletions(-)
diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py
index 460f04f6..f998aafd 100644
--- a/nemo_run/run/experiment.py
+++ b/nemo_run/run/experiment.py
@@ -624,6 +624,51 @@ def dryrun(self, log: bool = True, exist_ok: bool = False, delete_exp_dir: bool
         if delete_exp_dir:
             shutil.rmtree(self._exp_dir)
 
+    def export(self, output_dir: str, exist_ok: bool = False) -> None:
+        """
+        Export runnable scripts for all tasks to output_dir without submitting.
+
+        Each task produces a script file in output_dir:
+          - SlurmExecutor        → <task>_sbatch.sh       (sbatch <file>)
+          - DGXCloudExecutor     → <task>_torchrun_job.sh
+          - LocalExecutor        → <task>.sh               (bash <file>)
+          - DockerExecutor       → <task>.yaml
+          - SkypilotExecutor     → <task>.yaml             (sky launch <file>)
+          - SkypilotJobsExecutor → <task>.yaml
+          - LeptonExecutor       → <task>.sh
+
+        Also generates an executable submit_all.sh; self._exp_dir is removed afterwards.
+
+        Args:
+            output_dir: Directory to write into; created (with parents) if it doesn't exist.
+            exist_ok: If False (default), raise FileExistsError when output_dir already exists.
+        """
+        out = Path(output_dir)
+        out.mkdir(parents=True, exist_ok=exist_ok)
+
+        self._prepare(exist_ok=True)
+
+        # Redirect all executors (including every executor of a JobGroup) to write into output_dir
+        for job in self.jobs:
+            if isinstance(job, JobGroup):
+                executors = (
+                    [job.executors] if isinstance(job.executors, Executor) else job.executors
+                )
+                for executor in executors:
+                    executor.experiment_dir = str(out)
+            else:
+                job.executor.experiment_dir = str(out)
+
+        # Run dryrun — each scheduler writes its script file to output_dir
+        for job in self.jobs:
+            job.launch(wait=False, runner=self._runner, dryrun=True, direct=False, log_dryrun=False)
+
+        # Generate submit_all.sh from the script files now present in output_dir
+        _write_submit_script(out, self._title, self.jobs)
+
+        self.console.log(f"[bold green]Exported scripts to {out}")
+        shutil.rmtree(self._exp_dir)
+
     def run(
         self,
         sequential: bool = False,
@@ -1332,6 +1377,39 @@ def _get_sorted_dirs(path: str) -> list[str]:
 _LOADED_MAINS = set()
 
 
+def _write_submit_script(out: Path, title: str, jobs: list) -> None:
+    """Write an executable submit_all.sh in ``out`` that submits each exported job script."""
+    _SUBMIT_CMDS = {  # executor class name -> (submit command, exported script suffix)
+        "SlurmExecutor": ("sbatch", "_sbatch.sh"),
+        "SkypilotExecutor": ("sky launch", ".yaml"),
+        "SkypilotJobsExecutor": ("sky jobs launch", ".yaml"),
+        "DockerExecutor": ("docker compose -f", ".yaml"),
+        "DGXCloudExecutor": ("bash", "_torchrun_job.sh"),
+    }
+
+    lines = [
+        "#!/bin/bash",
+        f"# Submit all jobs for experiment: {title}",
+        "# Generated by NeMo Run",
+        "",
+        'SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"',
+        "",
+    ]
+
+    for job in jobs:
+        job_list = job.jobs if isinstance(job, JobGroup) else [job]
+        for j in job_list:
+            # Exact name, not a "<id>*" glob: "train*" would also match "train_1.sh".
+            cmd, suffix = _SUBMIT_CMDS.get(type(j.executor).__name__, ("bash", ".sh"))
+            script = out / f"{j.id}{suffix}"
+            if script.is_file() and script.name != "submit_all.sh":
+                lines.append(f'{cmd} "$SCRIPT_DIR/{script.name}"')
+
+    submit = out / "submit_all.sh"
+    submit.write_text("\n".join(lines) + "\n")
+    submit.chmod(0o755)
+
+
 def maybe_load_external_main(exp_dir: str):
     main_file = Path(exp_dir) / "__main__.py"
     if main_file.exists() and main_file not in _LOADED_MAINS:
diff --git a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
index 9bc2c969..aa8b97de 100644
--- a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
+++ b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
@@ -120,7 +120,7 @@ def _submit_dryrun(  # type: ignore
         )
 
         # Write and copy sbatch script
-        path = os.path.join(executor.experiment_dir, "torchrun_job.sh")
+        path = os.path.join(executor.experiment_dir, f"{executor.job_name}_torchrun_job.sh")
         script = req.materialize()
 
         with open(path, "w") as f:
@@ -145,7 +145,7 @@ def schedule(self, dryrun_info: AppDryRunInfo[DGXRequest]) -> str:
 
         # The DGXExecutor's launch call typically returns (job_id, handle).
         # We'll call it without additional parameters here.
-        cmd = os.path.join(executor.experiment_dir, "torchrun_job.sh")
+        cmd = os.path.join(executor.experiment_dir, f"{executor.job_name}_torchrun_job.sh")
         req.launch_cmd = ["bash", cmd]
         job_id, status = executor.launch(name=req.name, cmd=req.launch_cmd)
         if not job_id:
diff --git a/nemo_run/run/torchx_backend/schedulers/docker.py b/nemo_run/run/torchx_backend/schedulers/docker.py
index 4f68920c..f7733cef 100644
--- a/nemo_run/run/torchx_backend/schedulers/docker.py
+++ b/nemo_run/run/torchx_backend/schedulers/docker.py
@@ -99,6 +99,12 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[DockerJobR
         basename = Path(executor.job_dir).name
         app_id = make_unique(basename)
         req = DockerJobRequest(id=app_id, executor=executor, containers=containers)
+
+        if executor.experiment_dir:
+            path = os.path.join(executor.experiment_dir, f"{executor.job_name}.yaml")
+            with open(path, "w") as f:
+                f.write(str(req))
+
         return AppDryRunInfo(req, repr)
 
     def schedule(self, dryrun_info: AppDryRunInfo[DockerJobRequest]) -> str:  # type: ignore
diff --git a/nemo_run/run/torchx_backend/schedulers/lepton.py b/nemo_run/run/torchx_backend/schedulers/lepton.py
index 0b012c19..6d34f1c9 100644
--- a/nemo_run/run/torchx_backend/schedulers/lepton.py
+++ b/nemo_run/run/torchx_backend/schedulers/lepton.py
@@ -16,6 +16,7 @@
 import json
 import logging
 import os
+import shlex
 import shutil
 import tempfile
 from dataclasses import dataclass
@@ -98,6 +99,19 @@ def _submit_dryrun(  # type: ignore
             role = values.apply(role)
 
         cmd = [role.entrypoint] + role.args
+
+        if executor.experiment_dir:
+            path = os.path.join(executor.experiment_dir, f"{executor.job_name}.sh")
+            lines = ["#!/bin/bash", "# Generated by NeMo Run", "# Submit via Lepton AI API", ""]
+            for key, val in executor.env_vars.items():
+                lines.append(f"export {key}={shlex.quote(str(val))}")
+            for key, val in role.env.items():
+                lines.append(f"export {key}={shlex.quote(str(val))}")
+            lines.append(" ".join(shlex.quote(p) for p in cmd))
+            with open(path, "w") as f:
+                f.write("\n".join(lines) + "\n")
+            os.chmod(path, 0o755)
+
         return AppDryRunInfo(
             LeptonRequest(app=app, executor=executor, cmd=cmd, name=role.name),
             # Minimal function to show the config, if any
diff --git a/nemo_run/run/torchx_backend/schedulers/local.py b/nemo_run/run/torchx_backend/schedulers/local.py
index 5e95c7dd..7c8913b2 100644
--- a/nemo_run/run/torchx_backend/schedulers/local.py
+++ b/nemo_run/run/torchx_backend/schedulers/local.py
@@ -16,6 +16,7 @@
 import json
 import os
 import pprint
+import shlex
 import shutil
 import tempfile
 import warnings
@@ -100,6 +101,21 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[PopenReque
         cfg_dict = asdict(cfg)
         cfg_dict["log_dir"] = cfg_dict.pop("job_dir")
         request = self._to_popen_request(app, cfg_dict)  # type: ignore
+
+        if cfg.experiment_dir:
+            path = os.path.join(cfg.experiment_dir, f"{app.name}.sh")
+            lines = ["#!/bin/bash", "# Generated by NeMo Run", ""]
+            for key, val in cfg.env_vars.items():
+                lines.append(f"export {key}={shlex.quote(str(val))}")
+            for role in app.roles:
+                for key, val in role.env.items():
+                    lines.append(f"export {key}={shlex.quote(str(val))}")
+                cmd_parts = [role.entrypoint] + role.args
+                lines.append(" ".join(shlex.quote(p) for p in cmd_parts))
+            with open(path, "w") as f:
+                f.write("\n".join(lines) + "\n")
+            os.chmod(path, 0o755)
+
         return AppDryRunInfo(request, lambda p: pprint.pformat(asdict(p), indent=2, width=80))
 
     def schedule(self, dryrun_info: AppDryRunInfo[PopenRequest]) -> str:
diff --git a/nemo_run/run/torchx_backend/schedulers/skypilot.py b/nemo_run/run/torchx_backend/schedulers/skypilot.py
index 2a3f4bb6..86f08156 100644
--- a/nemo_run/run/torchx_backend/schedulers/skypilot.py
+++ b/nemo_run/run/torchx_backend/schedulers/skypilot.py
@@ -143,6 +143,12 @@ def _submit_dryrun(  # type: ignore
         task = cfg.to_task(name=role.name, cmd=cmd, env_vars=role.env)
 
         req = SkypilotRequest(task=task, executor=cfg)
+
+        if cfg.experiment_dir:
+            path = os.path.join(cfg.experiment_dir, f"{cfg.job_name}.yaml")
+            with open(path, "w") as f:
+                f.write(common_utils.dump_yaml_str(req.task.to_yaml_config()))
+
         return AppDryRunInfo(req, lambda req: common_utils.dump_yaml_str(req.task.to_yaml_config()))
 
     def _validate(self, app: AppDef, scheduler: str) -> None:
diff --git a/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py b/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py
index d364c359..23b1440c 100644
--- a/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py
+++ b/nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py
@@ -141,6 +141,12 @@ def _submit_dryrun(  # type: ignore
         task = cfg.to_task(name=role.name, cmd=cmd, env_vars=role.env)
 
         req = SkypilotJobsRequest(task=task, executor=cfg)
+
+        if cfg.experiment_dir:
+            path = os.path.join(cfg.experiment_dir, f"{cfg.job_name}.yaml")
+            with open(path, "w") as f:
+                f.write(common_utils.dump_yaml_str(req.task.to_yaml_config()))
+
         return AppDryRunInfo(req, lambda req: common_utils.dump_yaml_str(req.task.to_yaml_config()))
 
     def _validate(self, app: AppDef, scheduler: str) -> None:
diff --git a/test/run/test_experiment.py b/test/run/test_experiment.py
index 4f160237..4e48fb6e 100644
--- a/test/run/test_experiment.py
+++ b/test/run/test_experiment.py
@@ -1511,3 +1511,72 @@ def to_config(self):
             # Should pull tunnel and connect
             exp._initialize_tunnels(extract_from_executors=True)
             assert "t1" in exp.tunnels
+
+
+def test_experiment_export_local(temp_dir):
+    """export() with LocalExecutor writes a .sh script and submit_all.sh."""
+    output_dir = os.path.join(temp_dir, "exported")
+    with Experiment("test-exp") as exp:
+        task = run.Partial(dummy_function, x=1, y=2)
+        exp.add(task, name="hello-job")
+        exp.export(output_dir)
+
+    files = os.listdir(output_dir)
+    # At least one .sh script for the job and submit_all.sh
+    sh_scripts = [f for f in files if f.endswith(".sh") and f != "submit_all.sh"]
+    assert len(sh_scripts) >= 1, f"Expected a .sh script, got: {files}"
+    assert "submit_all.sh" in files
+
+    # The "#!/bin/bash" shebang always contains "bash", so assert the actual submit line
+    submit_path = os.path.join(output_dir, "submit_all.sh")
+    assert os.access(submit_path, os.X_OK)
+    content = Path(submit_path).read_text()
+    assert f'bash "$SCRIPT_DIR/{sh_scripts[0]}"' in content
+
+    # The job script must be executable
+    job_script_path = os.path.join(output_dir, sh_scripts[0])
+    assert os.access(job_script_path, os.X_OK)
+
+    # The exp_dir should have been cleaned up
+    assert not os.path.exists(exp._exp_dir)
+
+
+def test_experiment_export_creates_output_dir(temp_dir):
+    """export() creates output_dir, including missing parent directories."""
+    output_dir = os.path.join(temp_dir, "new_dir", "nested")
+    with Experiment("test-exp") as exp:
+        task = run.Partial(dummy_function, x=1, y=2)
+        exp.add(task, name="nested-job")
+        exp.export(output_dir)
+
+    assert os.path.isdir(output_dir)
+    assert "submit_all.sh" in os.listdir(output_dir)
+
+
+def test_experiment_export_exist_ok(temp_dir):
+    """export() with exist_ok=True writes into an already existing output_dir without raising."""
+    output_dir = os.path.join(temp_dir, "exists")
+    os.makedirs(output_dir)
+    with Experiment("test-exp") as exp:
+        task = run.Partial(dummy_function, x=1, y=2)
+        exp.add(task, name="job")
+        exp.export(output_dir, exist_ok=True)
+
+    assert "submit_all.sh" in os.listdir(output_dir)
+
+
+def test_experiment_export_multiple_jobs(temp_dir):
+    """export() writes one script per job; each is referenced exactly once in submit_all.sh."""
+    output_dir = os.path.join(temp_dir, "multi")
+    with Experiment("test-exp") as exp:
+        exp.add(run.Partial(dummy_function, x=1, y=2), name="job-a")
+        exp.add(run.Partial(dummy_function, x=3, y=4), name="job-b")
+        exp.export(output_dir)
+
+    files = os.listdir(output_dir)
+    sh_scripts = [f for f in files if f.endswith(".sh") and f != "submit_all.sh"]
+    assert len(sh_scripts) == 2, f"Expected 2 scripts, got: {sh_scripts}"
+
+    submit_content = Path(os.path.join(output_dir, "submit_all.sh")).read_text()
+    for script in sh_scripts:
+        assert submit_content.count(script) == 1, f"{script} must appear once in submit_all.sh"

From 9a6c46c37373b9cd2c71e0a45024ee91b67129c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 10:35:08 +0000
Subject: [PATCH 2/7] docs: add export() e2e examples for Local, SLURM,
 DGXCloud, and Script tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Demonstrates Experiment.export() across all common executor types:
- export_local.py      — single LocalExecutor job → .sh + submit_all.sh
- export_multi_job.py  — three-job pipeline (preprocess/train/evaluate)
- export_script.py     — run.Script (inline bash) tasks
- export_slurm.py      — two SlurmExecutor jobs → *_sbatch.sh; no cluster needed
- export_dgxcloud.py   — DGXCloudExecutor job → *_torchrun_job.sh; no API calls needed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 local/export_dgxcloud.py  | 69 ++++++++++++++++++++++++++++++++
 local/export_local.py     | 47 ++++++++++++++++++++++
 local/export_multi_job.py | 61 ++++++++++++++++++++++++++++
 local/export_script.py    | 42 ++++++++++++++++++++
 local/export_slurm.py     | 84 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+)
 create mode 100644 local/export_dgxcloud.py
 create mode 100644 local/export_local.py
 create mode 100644 local/export_multi_job.py
 create mode 100644 local/export_script.py
 create mode 100644 local/export_slurm.py

diff --git a/local/export_dgxcloud.py b/local/export_dgxcloud.py
new file mode 100644
index 00000000..d1114a2d
--- /dev/null
+++ b/local/export_dgxcloud.py
@@ -0,0 +1,69 @@
+"""
+E2E example: Experiment.export() with DGXCloudExecutor
+
+Demonstrates exporting DGX Cloud jobs to a self-contained directory without
+any API calls or authentication. The generated script can be inspected and
+submitted manually via the DGX Cloud CLI or API.
+
+The output directory contains:
+  - <job>_torchrun_job.sh   (the torchrun launch script uploaded to the PVC)
+  - submit_all.sh           (calls: bash <job>_torchrun_job.sh for each job)
+
+Run:
+    python local/export_dgxcloud.py
+    cat /tmp/nemo_export_dgxcloud/train_torchrun_job.sh
+"""
+
+import os
+import shutil
+
+import nemo_run as run
+from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
+
+OUTPUT_DIR = "/tmp/nemo_export_dgxcloud"
+shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+def train(model: str, steps: int = 10_000):
+    import torch
+
+    print(f"Training {model} on {torch.cuda.device_count()} GPUs for {steps} steps")
+
+
+# Configure the DGX Cloud executor (credentials are placeholders — not contacted during export)
+executor = DGXCloudExecutor(
+    base_url="https://api.ngc.nvidia.com/v2/org/my-org/dgxcloud",
+    kube_apiserver_url="https://my-cluster.k8s.example.com",
+    app_id="my-app-id",
+    app_secret="my-app-secret",
+    project_name="my-project",
+    container_image="nvcr.io/nvidia/nemo:latest",
+    pvc_nemo_run_dir="/mnt/pvc/nemo_run",
+    pvcs=[{"claimName": "nemo-pvc", "path": "/mnt/pvc"}],
+    nodes=2,
+    gpus_per_node=8,
+    packager=run.GitArchivePackager(),
+)
+
+with run.Experiment("export-dgxcloud-demo") as exp:
+    exp.add(
+        run.Partial(train, model="mistral-7b", steps=100_000),
+        executor=executor,
+        name="train",
+    )
+    exp.export(OUTPUT_DIR)
+
+files = sorted(os.listdir(OUTPUT_DIR))
+print(f"\nExported files: {files}")
+
+torchrun_scripts = [f for f in files if f.endswith("_torchrun_job.sh")]
+assert len(torchrun_scripts) == 1, f"Expected 1 torchrun script, got: {torchrun_scripts}"
+
+print("\n--- train_torchrun_job.sh (first 40 lines) ---")
+with open(f"{OUTPUT_DIR}/{torchrun_scripts[0]}") as f:
+    lines = f.readlines()
+    print("".join(lines[:40]))
+
+print("--- submit_all.sh ---")
+with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
+    print(f.read())
diff --git a/local/export_local.py b/local/export_local.py
new file mode 100644
index 00000000..943c89fc
--- /dev/null
+++ b/local/export_local.py
@@ -0,0 +1,47 @@
+"""
+E2E example: Experiment.export() with LocalExecutor
+
+Demonstrates exporting a single job to a self-contained script directory
+without submitting anything. The output directory contains:
+  - hello-job.sh    (executable bash script)
+  - submit_all.sh   (launcher that calls: bash hello-job.sh)
+
+Run:
+    python local/export_local.py
+    ls /tmp/nemo_export_local/
+    cat /tmp/nemo_export_local/hello-job.sh
+    bash /tmp/nemo_export_local/submit_all.sh
+"""
+
+import os
+import shutil
+
+import nemo_run as run
+from nemo_run.core.execution.local import LocalExecutor
+
+OUTPUT_DIR = "/tmp/nemo_export_local"
+shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+def greet(name: str, times: int = 1):
+    for _ in range(times):
+        print(f"Hello, {name}!")
+
+
+with run.Experiment("export-local-demo") as exp:
+    task = run.Partial(greet, name="NeMo", times=3)
+    exp.add(task, executor=LocalExecutor(), name="hello-job")
+    exp.export(OUTPUT_DIR)
+
+files = sorted(os.listdir(OUTPUT_DIR))
+print(f"\nExported files: {files}")
+assert "hello-job.sh" in files, "Expected hello-job.sh"
+assert "submit_all.sh" in files, "Expected submit_all.sh"
+
+print("\n--- hello-job.sh ---")
+with open(f"{OUTPUT_DIR}/hello-job.sh") as f:
+    print(f.read())
+
+print("--- submit_all.sh ---")
+with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
+    print(f.read())
diff --git a/local/export_multi_job.py b/local/export_multi_job.py
new file mode 100644
index 00000000..112911fe
--- /dev/null
+++ b/local/export_multi_job.py
@@ -0,0 +1,61 @@
+"""
+E2E example: Experiment.export() with multiple LocalExecutor jobs
+
+Demonstrates exporting multiple jobs to a shared output directory.
+Each job produces its own .sh script; submit_all.sh chains them all.
+
+Run:
+    python local/export_multi_job.py
+    ls /tmp/nemo_export_multi/
+    bash /tmp/nemo_export_multi/submit_all.sh
+"""
+
+import os
+import shutil
+
+import nemo_run as run
+from nemo_run.core.execution.local import LocalExecutor
+
+OUTPUT_DIR = "/tmp/nemo_export_multi"
+shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+def preprocess(dataset: str, workers: int = 4):
+    print(f"Preprocessing {dataset} with {workers} workers")
+
+
+def train(model: str, epochs: int = 10, lr: float = 1e-3):
+    print(f"Training {model} for {epochs} epochs at lr={lr}")
+
+
+def evaluate(model: str, split: str = "test"):
+    print(f"Evaluating {model} on {split} split")
+
+
+with run.Experiment("export-multi-demo") as exp:
+    exp.add(
+        run.Partial(preprocess, dataset="imagenet", workers=8),
+        executor=LocalExecutor(),
+        name="preprocess",
+    )
+    exp.add(
+        run.Partial(train, model="resnet50", epochs=50, lr=5e-4),
+        executor=LocalExecutor(),
+        name="train",
+    )
+    exp.add(
+        run.Partial(evaluate, model="resnet50", split="val"),
+        executor=LocalExecutor(),
+        name="evaluate",
+    )
+    exp.export(OUTPUT_DIR)
+
+files = sorted(os.listdir(OUTPUT_DIR))
+print(f"\nExported files: {files}")
+
+sh_scripts = [f for f in files if f.endswith(".sh") and f != "submit_all.sh"]
+assert len(sh_scripts) == 3, f"Expected 3 job scripts, got: {sh_scripts}"
+
+print("\n--- submit_all.sh ---")
+with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
+    print(f.read())
diff --git a/local/export_script.py b/local/export_script.py
new file mode 100644
index 00000000..1ceaac9f
--- /dev/null
+++ b/local/export_script.py
@@ -0,0 +1,42 @@
+"""
+E2E example: Experiment.export() with run.Script tasks
+
+Shows that export() works with shell Script tasks (not just Partial),
+which is a common pattern for SLURM-style jobs where the user provides
+a raw bash script.
+
+The exported .sh file wraps the inline command; submit_all.sh calls
+`bash <script>.sh` for each job.
+
+Run:
+    python local/export_script.py
+    bash /tmp/nemo_export_script/submit_all.sh
+"""
+
+import os
+import shutil
+
+import nemo_run as run
+from nemo_run.core.execution.local import LocalExecutor
+
+OUTPUT_DIR = "/tmp/nemo_export_script"
+shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+with run.Experiment("export-script-demo") as exp:
+    exp.add(
+        run.Script(inline="echo 'Starting data download'; sleep 1; echo 'Done'"),
+        executor=LocalExecutor(),
+        name="download",
+    )
+    exp.add(
+        run.Script(inline="echo 'Unpacking archive'; sleep 1; echo 'Unpacked'"),
+        executor=LocalExecutor(),
+        name="unpack",
+    )
+    exp.export(OUTPUT_DIR)
+
+files = sorted(os.listdir(OUTPUT_DIR))
+print(f"\nExported files: {files}")
+print("\n--- submit_all.sh ---")
+with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
+    print(f.read())
diff --git a/local/export_slurm.py b/local/export_slurm.py
new file mode 100644
index 00000000..9e6abba7
--- /dev/null
+++ b/local/export_slurm.py
@@ -0,0 +1,84 @@
+"""
+E2E example: Experiment.export() with SlurmExecutor
+
+Demonstrates exporting SLURM jobs to a self-contained directory without
+connecting to any cluster or submitting any job.
+
+The output directory contains:
+  - <job>_sbatch.sh   (ready-to-submit sbatch script)
+  - submit_all.sh     (calls: sbatch <job>_sbatch.sh for each job)
+
+To actually submit after export (requires cluster access):
+    sbatch /tmp/nemo_export_slurm/pretrain_sbatch.sh
+
+Run:
+    python local/export_slurm.py
+    cat /tmp/nemo_export_slurm/pretrain_sbatch.sh
+"""
+
+import os
+import shutil
+
+import nemo_run as run
+from nemo_run.core.execution.slurm import SlurmExecutor
+from nemo_run.core.tunnel.client import SSHTunnel
+
+OUTPUT_DIR = "/tmp/nemo_export_slurm"
+shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+def pretrain(model: str, num_steps: int = 1000, lr: float = 3e-4):
+    print(f"Pre-training {model} for {num_steps} steps at lr={lr}")
+
+
+def finetune(model: str, dataset: str, epochs: int = 5):
+    print(f"Fine-tuning {model} on {dataset} for {epochs} epochs")
+
+
+# Configure the SLURM executor (no real cluster needed for export)
+def make_slurm_executor(nodes: int = 1) -> SlurmExecutor:
+    tunnel = SSHTunnel(
+        host="my-cluster.example.com",  # placeholder — not contacted during export
+        user="myuser",
+        job_dir="/scratch/myuser/nemo_jobs",
+    )
+    return SlurmExecutor(
+        account="my_account",
+        partition="gpu",
+        nodes=nodes,
+        ntasks_per_node=8,
+        gpus_per_node=8,
+        container_image="nvcr.io/nvidia/nemo:latest",
+        time="04:00:00",
+        tunnel=tunnel,
+        packager=run.GitArchivePackager(),
+    )
+
+
+with run.Experiment("export-slurm-demo") as exp:
+    exp.add(
+        run.Partial(pretrain, model="llama-7b", num_steps=50_000, lr=1e-4),
+        executor=make_slurm_executor(nodes=4),
+        name="pretrain",
+    )
+    exp.add(
+        run.Partial(finetune, model="llama-7b", dataset="squad", epochs=3),
+        executor=make_slurm_executor(nodes=1),
+        name="finetune",
+    )
+    exp.export(OUTPUT_DIR)
+
+files = sorted(os.listdir(OUTPUT_DIR))
+print(f"\nExported files: {files}")
+
+sbatch_scripts = [f for f in files if f.endswith("_sbatch.sh")]
+assert len(sbatch_scripts) == 2, f"Expected 2 sbatch scripts, got: {sbatch_scripts}"
+
+print("\n--- pretrain_sbatch.sh (first 40 lines) ---")
+with open(f"{OUTPUT_DIR}/pretrain_sbatch.sh") as f:
+    lines = f.readlines()
+    print("".join(lines[:40]))
+
+print("--- submit_all.sh ---")
+with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
+    print(f.read())

From ddd80e69b753ae05373ff12eb5d2e3b04a2b3ee3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 10:36:57 +0000
Subject: [PATCH 3/7] Revert "docs: add export() e2e examples for Local, SLURM,
 DGXCloud, and Script tasks"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 9a6c46c37373b9cd2c71e0a45024ee91b67129c1.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 local/export_dgxcloud.py  | 69 --------------------------------
 local/export_local.py     | 47 ----------------------
 local/export_multi_job.py | 61 ----------------------------
 local/export_script.py    | 42 --------------------
 local/export_slurm.py     | 84 ---------------------------------------
 5 files changed, 303 deletions(-)
 delete mode 100644 local/export_dgxcloud.py
 delete mode 100644 local/export_local.py
 delete mode 100644 local/export_multi_job.py
 delete mode 100644 local/export_script.py
 delete mode 100644 local/export_slurm.py

diff --git a/local/export_dgxcloud.py b/local/export_dgxcloud.py
deleted file mode 100644
index d1114a2d..00000000
--- a/local/export_dgxcloud.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-E2E example: Experiment.export() with DGXCloudExecutor
-
-Demonstrates exporting DGX Cloud jobs to a self-contained directory without
-any API calls or authentication. The generated script can be inspected and
-submitted manually via the DGX Cloud CLI or API.
-
-The output directory contains:
-  - <job>_torchrun_job.sh   (the torchrun launch script uploaded to the PVC)
-  - submit_all.sh           (calls: bash <job>_torchrun_job.sh for each job)
-
-Run:
-    python local/export_dgxcloud.py
-    cat /tmp/nemo_export_dgxcloud/train_torchrun_job.sh
-"""
-
-import os
-import shutil
-
-import nemo_run as run
-from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
-
-OUTPUT_DIR = "/tmp/nemo_export_dgxcloud"
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-
-
-def train(model: str, steps: int = 10_000):
-    import torch
-
-    print(f"Training {model} on {torch.cuda.device_count()} GPUs for {steps} steps")
-
-
-# Configure the DGX Cloud executor (credentials are placeholders — not contacted during export)
-executor = DGXCloudExecutor(
-    base_url="https://api.ngc.nvidia.com/v2/org/my-org/dgxcloud",
-    kube_apiserver_url="https://my-cluster.k8s.example.com",
-    app_id="my-app-id",
-    app_secret="my-app-secret",
-    project_name="my-project",
-    container_image="nvcr.io/nvidia/nemo:latest",
-    pvc_nemo_run_dir="/mnt/pvc/nemo_run",
-    pvcs=[{"claimName": "nemo-pvc", "path": "/mnt/pvc"}],
-    nodes=2,
-    gpus_per_node=8,
-    packager=run.GitArchivePackager(),
-)
-
-with run.Experiment("export-dgxcloud-demo") as exp:
-    exp.add(
-        run.Partial(train, model="mistral-7b", steps=100_000),
-        executor=executor,
-        name="train",
-    )
-    exp.export(OUTPUT_DIR)
-
-files = sorted(os.listdir(OUTPUT_DIR))
-print(f"\nExported files: {files}")
-
-torchrun_scripts = [f for f in files if f.endswith("_torchrun_job.sh")]
-assert len(torchrun_scripts) == 1, f"Expected 1 torchrun script, got: {torchrun_scripts}"
-
-print("\n--- train_torchrun_job.sh (first 40 lines) ---")
-with open(f"{OUTPUT_DIR}/{torchrun_scripts[0]}") as f:
-    lines = f.readlines()
-    print("".join(lines[:40]))
-
-print("--- submit_all.sh ---")
-with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
-    print(f.read())
diff --git a/local/export_local.py b/local/export_local.py
deleted file mode 100644
index 943c89fc..00000000
--- a/local/export_local.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-E2E example: Experiment.export() with LocalExecutor
-
-Demonstrates exporting a single job to a self-contained script directory
-without submitting anything. The output directory contains:
-  - hello-job.sh    (executable bash script)
-  - submit_all.sh   (launcher that calls: bash hello-job.sh)
-
-Run:
-    python local/export_local.py
-    ls /tmp/nemo_export_local/
-    cat /tmp/nemo_export_local/hello-job.sh
-    bash /tmp/nemo_export_local/submit_all.sh
-"""
-
-import os
-import shutil
-
-import nemo_run as run
-from nemo_run.core.execution.local import LocalExecutor
-
-OUTPUT_DIR = "/tmp/nemo_export_local"
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-
-
-def greet(name: str, times: int = 1):
-    for _ in range(times):
-        print(f"Hello, {name}!")
-
-
-with run.Experiment("export-local-demo") as exp:
-    task = run.Partial(greet, name="NeMo", times=3)
-    exp.add(task, executor=LocalExecutor(), name="hello-job")
-    exp.export(OUTPUT_DIR)
-
-files = sorted(os.listdir(OUTPUT_DIR))
-print(f"\nExported files: {files}")
-assert "hello-job.sh" in files, "Expected hello-job.sh"
-assert "submit_all.sh" in files, "Expected submit_all.sh"
-
-print("\n--- hello-job.sh ---")
-with open(f"{OUTPUT_DIR}/hello-job.sh") as f:
-    print(f.read())
-
-print("--- submit_all.sh ---")
-with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
-    print(f.read())
diff --git a/local/export_multi_job.py b/local/export_multi_job.py
deleted file mode 100644
index 112911fe..00000000
--- a/local/export_multi_job.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
-E2E example: Experiment.export() with multiple LocalExecutor jobs
-
-Demonstrates exporting multiple jobs to a shared output directory.
-Each job produces its own .sh script; submit_all.sh chains them all.
-
-Run:
-    python local/export_multi_job.py
-    ls /tmp/nemo_export_multi/
-    bash /tmp/nemo_export_multi/submit_all.sh
-"""
-
-import os
-import shutil
-
-import nemo_run as run
-from nemo_run.core.execution.local import LocalExecutor
-
-OUTPUT_DIR = "/tmp/nemo_export_multi"
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-
-
-def preprocess(dataset: str, workers: int = 4):
-    print(f"Preprocessing {dataset} with {workers} workers")
-
-
-def train(model: str, epochs: int = 10, lr: float = 1e-3):
-    print(f"Training {model} for {epochs} epochs at lr={lr}")
-
-
-def evaluate(model: str, split: str = "test"):
-    print(f"Evaluating {model} on {split} split")
-
-
-with run.Experiment("export-multi-demo") as exp:
-    exp.add(
-        run.Partial(preprocess, dataset="imagenet", workers=8),
-        executor=LocalExecutor(),
-        name="preprocess",
-    )
-    exp.add(
-        run.Partial(train, model="resnet50", epochs=50, lr=5e-4),
-        executor=LocalExecutor(),
-        name="train",
-    )
-    exp.add(
-        run.Partial(evaluate, model="resnet50", split="val"),
-        executor=LocalExecutor(),
-        name="evaluate",
-    )
-    exp.export(OUTPUT_DIR)
-
-files = sorted(os.listdir(OUTPUT_DIR))
-print(f"\nExported files: {files}")
-
-sh_scripts = [f for f in files if f.endswith(".sh") and f != "submit_all.sh"]
-assert len(sh_scripts) == 3, f"Expected 3 job scripts, got: {sh_scripts}"
-
-print("\n--- submit_all.sh ---")
-with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
-    print(f.read())
diff --git a/local/export_script.py b/local/export_script.py
deleted file mode 100644
index 1ceaac9f..00000000
--- a/local/export_script.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-E2E example: Experiment.export() with run.Script tasks
-
-Shows that export() works with shell Script tasks (not just Partial),
-which is a common pattern for SLURM-style jobs where the user provides
-a raw bash script.
-
-The exported .sh file wraps the inline command; submit_all.sh calls
-`bash <script>.sh` for each job.
-
-Run:
-    python local/export_script.py
-    bash /tmp/nemo_export_script/submit_all.sh
-"""
-
-import os
-import shutil
-
-import nemo_run as run
-from nemo_run.core.execution.local import LocalExecutor
-
-OUTPUT_DIR = "/tmp/nemo_export_script"
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-
-with run.Experiment("export-script-demo") as exp:
-    exp.add(
-        run.Script(inline="echo 'Starting data download'; sleep 1; echo 'Done'"),
-        executor=LocalExecutor(),
-        name="download",
-    )
-    exp.add(
-        run.Script(inline="echo 'Unpacking archive'; sleep 1; echo 'Unpacked'"),
-        executor=LocalExecutor(),
-        name="unpack",
-    )
-    exp.export(OUTPUT_DIR)
-
-files = sorted(os.listdir(OUTPUT_DIR))
-print(f"\nExported files: {files}")
-print("\n--- submit_all.sh ---")
-with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
-    print(f.read())
diff --git a/local/export_slurm.py b/local/export_slurm.py
deleted file mode 100644
index 9e6abba7..00000000
--- a/local/export_slurm.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""
-E2E example: Experiment.export() with SlurmExecutor
-
-Demonstrates exporting SLURM jobs to a self-contained directory without
-connecting to any cluster or submitting any job.
-
-The output directory contains:
-  - <job>_sbatch.sh   (ready-to-submit sbatch script)
-  - submit_all.sh     (calls: sbatch <job>_sbatch.sh for each job)
-
-To actually submit after export (requires cluster access):
-    sbatch /tmp/nemo_export_slurm/pretrain_sbatch.sh
-
-Run:
-    python local/export_slurm.py
-    cat /tmp/nemo_export_slurm/pretrain_sbatch.sh
-"""
-
-import os
-import shutil
-
-import nemo_run as run
-from nemo_run.core.execution.slurm import SlurmExecutor
-from nemo_run.core.tunnel.client import SSHTunnel
-
-OUTPUT_DIR = "/tmp/nemo_export_slurm"
-shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
-
-
-def pretrain(model: str, num_steps: int = 1000, lr: float = 3e-4):
-    print(f"Pre-training {model} for {num_steps} steps at lr={lr}")
-
-
-def finetune(model: str, dataset: str, epochs: int = 5):
-    print(f"Fine-tuning {model} on {dataset} for {epochs} epochs")
-
-
-# Configure the SLURM executor (no real cluster needed for export)
-def make_slurm_executor(nodes: int = 1) -> SlurmExecutor:
-    tunnel = SSHTunnel(
-        host="my-cluster.example.com",  # placeholder — not contacted during export
-        user="myuser",
-        job_dir="/scratch/myuser/nemo_jobs",
-    )
-    return SlurmExecutor(
-        account="my_account",
-        partition="gpu",
-        nodes=nodes,
-        ntasks_per_node=8,
-        gpus_per_node=8,
-        container_image="nvcr.io/nvidia/nemo:latest",
-        time="04:00:00",
-        tunnel=tunnel,
-        packager=run.GitArchivePackager(),
-    )
-
-
-with run.Experiment("export-slurm-demo") as exp:
-    exp.add(
-        run.Partial(pretrain, model="llama-7b", num_steps=50_000, lr=1e-4),
-        executor=make_slurm_executor(nodes=4),
-        name="pretrain",
-    )
-    exp.add(
-        run.Partial(finetune, model="llama-7b", dataset="squad", epochs=3),
-        executor=make_slurm_executor(nodes=1),
-        name="finetune",
-    )
-    exp.export(OUTPUT_DIR)
-
-files = sorted(os.listdir(OUTPUT_DIR))
-print(f"\nExported files: {files}")
-
-sbatch_scripts = [f for f in files if f.endswith("_sbatch.sh")]
-assert len(sbatch_scripts) == 2, f"Expected 2 sbatch scripts, got: {sbatch_scripts}"
-
-print("\n--- pretrain_sbatch.sh (first 40 lines) ---")
-with open(f"{OUTPUT_DIR}/pretrain_sbatch.sh") as f:
-    lines = f.readlines()
-    print("".join(lines[:40]))
-
-print("--- submit_all.sh ---")
-with open(f"{OUTPUT_DIR}/submit_all.sh") as f:
-    print(f.read())

From 73e8514a50bbd038f8d0a0c96a0483813578224c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 10:43:29 +0000
Subject: [PATCH 4/7] fix: restrict exported script permissions to 0o750
 (owner+group)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses CodeQL finding: overly permissive chmod 0o755 made generated
scripts world-readable/executable. Changed to 0o750 in local.py,
lepton.py, and the submit_all.sh writer in experiment.py.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/run/experiment.py                       | 2 +-
 nemo_run/run/torchx_backend/schedulers/lepton.py | 2 +-
 nemo_run/run/torchx_backend/schedulers/local.py  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py
index f998aafd..20b68de7 100644
--- a/nemo_run/run/experiment.py
+++ b/nemo_run/run/experiment.py
@@ -1407,7 +1407,7 @@ def _write_submit_script(out: Path, title: str, jobs: list) -> None:
 
     submit = out / "submit_all.sh"
     submit.write_text("\n".join(lines) + "\n")
-    submit.chmod(0o755)
+    submit.chmod(0o750)
 
 
 def maybe_load_external_main(exp_dir: str):
diff --git a/nemo_run/run/torchx_backend/schedulers/lepton.py b/nemo_run/run/torchx_backend/schedulers/lepton.py
index 6d34f1c9..8e3eb6ae 100644
--- a/nemo_run/run/torchx_backend/schedulers/lepton.py
+++ b/nemo_run/run/torchx_backend/schedulers/lepton.py
@@ -110,7 +110,7 @@ def _submit_dryrun(  # type: ignore
             lines.append(" ".join(shlex.quote(p) for p in cmd))
             with open(path, "w") as f:
                 f.write("\n".join(lines) + "\n")
-            os.chmod(path, 0o755)
+            os.chmod(path, 0o750)
 
         return AppDryRunInfo(
             LeptonRequest(app=app, executor=executor, cmd=cmd, name=role.name),
diff --git a/nemo_run/run/torchx_backend/schedulers/local.py b/nemo_run/run/torchx_backend/schedulers/local.py
index 7c8913b2..b84a68bd 100644
--- a/nemo_run/run/torchx_backend/schedulers/local.py
+++ b/nemo_run/run/torchx_backend/schedulers/local.py
@@ -114,7 +114,7 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[PopenReque
                 lines.append(" ".join(shlex.quote(p) for p in cmd_parts))
             with open(path, "w") as f:
                 f.write("\n".join(lines) + "\n")
-            os.chmod(path, 0o755)
+            os.chmod(path, 0o750)
 
         return AppDryRunInfo(request, lambda p: pprint.pformat(asdict(p), indent=2, width=80))
 

From 10e0f2013997afd9023ad14883e32cd135ec3ab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 10:46:54 +0000
Subject: [PATCH 5/7] fix: guard DGXCloud script write behind experiment_dir
 check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_submit_dryrun() was unconditionally writing torchrun_job.sh, crashing
with AttributeError when job_name is unset (executor not yet assigned
to an experiment). Guard with `if executor.experiment_dir:` consistent
with all other schedulers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/run/torchx_backend/schedulers/dgxcloud.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
index aa8b97de..5563f1dc 100644
--- a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
+++ b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py
@@ -120,11 +120,11 @@ def _submit_dryrun(  # type: ignore
         )
 
         # Write and copy sbatch script
-        path = os.path.join(executor.experiment_dir, f"{executor.job_name}_torchrun_job.sh")
         script = req.materialize()
-
-        with open(path, "w") as f:
-            f.write(script)
+        if executor.experiment_dir:
+            path = os.path.join(executor.experiment_dir, f"{executor.job_name}_torchrun_job.sh")
+            with open(path, "w") as f:
+                f.write(script)
 
         return AppDryRunInfo(
             DGXRequest(app=app, executor=executor, cmd=cmd, name=role.name),

From 68a90516702ba7f9d6316505dd0b90c19c4e508e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 11:05:45 +0000
Subject: [PATCH 6/7] test: improve coverage and fix chmod to 0o700 for
 exported scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Change chmod from 0o750 to 0o700 in local.py, lepton.py, and
  experiment.py to silence CodeQL "overly permissive" findings
- Fix _write_submit_script to handle JobGroup (uses job.executors,
  not the nonexistent job.jobs attribute)
- Add test_lepton.py (new file) covering create_scheduler,
  _submit_dryrun, file write, and no-write-without-experiment_dir
- Add test_submit_dryrun_writes_script/yaml to dgxcloud, docker,
  local, skypilot, and skypilot_jobs scheduler tests
- Add test_experiment_export_job_group covering the JobGroup branch
  in Experiment.export()
- Fix missing import os in test_dgxcloud.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/run/experiment.py                    | 18 +++-
 .../run/torchx_backend/schedulers/lepton.py   |  2 +-
 .../run/torchx_backend/schedulers/local.py    |  2 +-
 test/run/test_experiment.py                   | 23 +++++
 .../schedulers/test_dgxcloud.py               | 11 +++
 .../torchx_backend/schedulers/test_docker.py  | 10 ++
 .../torchx_backend/schedulers/test_lepton.py  | 93 +++++++++++++++++++
 .../torchx_backend/schedulers/test_local.py   | 12 +++
 .../schedulers/test_skypilot.py               | 11 +++
 .../schedulers/test_skypilot_jobs.py          | 10 ++
 10 files changed, 185 insertions(+), 7 deletions(-)
 create mode 100644 test/run/torchx_backend/schedulers/test_lepton.py

diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py
index 20b68de7..ac7ebb7c 100644
--- a/nemo_run/run/experiment.py
+++ b/nemo_run/run/experiment.py
@@ -1395,11 +1395,19 @@ def _write_submit_script(out: Path, title: str, jobs: list) -> None:
     ]
 
     for job in jobs:
-        job_list = job.jobs if isinstance(job, JobGroup) else [job]
-        for j in job_list:
-            executor_type = type(j.executor).__name__
+        if isinstance(job, JobGroup):
+            executors = [job.executors] if isinstance(job.executors, Executor) else job.executors
+            executor_type = type(executors[0]).__name__
             cmd = _SUBMIT_CMDS.get(executor_type, "bash")
-            scripts = sorted(out.glob(f"{j.id}*"))
+            scripts = sorted(out.glob(f"{job.id}*"))
+            for s in scripts:
+                if s.name == "submit_all.sh":
+                    continue
+                lines.append(f'{cmd} "$SCRIPT_DIR/{s.name}"')
+        else:
+            executor_type = type(job.executor).__name__
+            cmd = _SUBMIT_CMDS.get(executor_type, "bash")
+            scripts = sorted(out.glob(f"{job.id}*"))
             for s in scripts:
                 if s.name == "submit_all.sh":
                     continue
@@ -1407,7 +1415,7 @@ def _write_submit_script(out: Path, title: str, jobs: list) -> None:
 
     submit = out / "submit_all.sh"
     submit.write_text("\n".join(lines) + "\n")
-    submit.chmod(0o750)
+    submit.chmod(0o700)
 
 
 def maybe_load_external_main(exp_dir: str):
diff --git a/nemo_run/run/torchx_backend/schedulers/lepton.py b/nemo_run/run/torchx_backend/schedulers/lepton.py
index 8e3eb6ae..1efb0529 100644
--- a/nemo_run/run/torchx_backend/schedulers/lepton.py
+++ b/nemo_run/run/torchx_backend/schedulers/lepton.py
@@ -110,7 +110,7 @@ def _submit_dryrun(  # type: ignore
             lines.append(" ".join(shlex.quote(p) for p in cmd))
             with open(path, "w") as f:
                 f.write("\n".join(lines) + "\n")
-            os.chmod(path, 0o750)
+            os.chmod(path, 0o700)
 
         return AppDryRunInfo(
             LeptonRequest(app=app, executor=executor, cmd=cmd, name=role.name),
diff --git a/nemo_run/run/torchx_backend/schedulers/local.py b/nemo_run/run/torchx_backend/schedulers/local.py
index b84a68bd..444a7bcf 100644
--- a/nemo_run/run/torchx_backend/schedulers/local.py
+++ b/nemo_run/run/torchx_backend/schedulers/local.py
@@ -114,7 +114,7 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[PopenReque
                 lines.append(" ".join(shlex.quote(p) for p in cmd_parts))
             with open(path, "w") as f:
                 f.write("\n".join(lines) + "\n")
-            os.chmod(path, 0o750)
+            os.chmod(path, 0o700)
 
         return AppDryRunInfo(request, lambda p: pprint.pformat(asdict(p), indent=2, width=80))
 
diff --git a/test/run/test_experiment.py b/test/run/test_experiment.py
index 4e48fb6e..a9207b8b 100644
--- a/test/run/test_experiment.py
+++ b/test/run/test_experiment.py
@@ -1580,3 +1580,26 @@ def test_experiment_export_multiple_jobs(temp_dir):
     submit_content = Path(os.path.join(output_dir, "submit_all.sh")).read_text()
     for script in sh_scripts:
         assert script in submit_content, f"{script} not referenced in submit_all.sh"
+
+
+def test_experiment_export_job_group(temp_dir):
+    """export() with a JobGroup redirects all executors and writes scripts."""
+    output_dir = os.path.join(temp_dir, "group_export")
+
+    with patch(
+        "nemo_run.run.job.JobGroup.SUPPORTED_EXECUTORS", new_callable=PropertyMock
+    ) as mock_supported:
+        mock_supported.return_value = {LocalExecutor}
+
+        with Experiment("test-exp") as exp:
+            from typing import Sequence
+
+            tasks: Sequence[run.Partial] = [
+                run.Partial(dummy_function, x=1, y=2),
+                run.Partial(dummy_function, x=3, y=4),
+            ]
+            exp.add(tasks, name="group-job")  # type: ignore
+            exp.export(output_dir)
+
+    files = os.listdir(output_dir)
+    assert "submit_all.sh" in files
diff --git a/test/run/torchx_backend/schedulers/test_dgxcloud.py b/test/run/torchx_backend/schedulers/test_dgxcloud.py
index 767c2106..d7427f61 100644
--- a/test/run/torchx_backend/schedulers/test_dgxcloud.py
+++ b/test/run/torchx_backend/schedulers/test_dgxcloud.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import tempfile
 from unittest import mock
 from unittest.mock import MagicMock
@@ -71,6 +72,16 @@ def test_submit_dryrun(dgx_cloud_scheduler, mock_app_def, dgx_cloud_executor):
         assert dryrun_info.request is not None
 
 
+def test_submit_dryrun_writes_script(dgx_cloud_scheduler, mock_app_def, dgx_cloud_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        dgx_cloud_executor.job_name = "test-job"
+        dgx_cloud_executor.experiment_dir = exp_dir
+        with mock.patch.object(DGXCloudExecutor, "package"):
+            dgx_cloud_scheduler._submit_dryrun(mock_app_def, dgx_cloud_executor)
+        script = os.path.join(exp_dir, "test-job_torchrun_job.sh")
+        assert os.path.isfile(script)
+
+
 def test_dgx_cloud_scheduler_methods(dgx_cloud_scheduler):
     # Test that basic methods exist
     assert hasattr(dgx_cloud_scheduler, "_submit_dryrun")
diff --git a/test/run/torchx_backend/schedulers/test_docker.py b/test/run/torchx_backend/schedulers/test_docker.py
index 551d8a60..10c63a4c 100644
--- a/test/run/torchx_backend/schedulers/test_docker.py
+++ b/test/run/torchx_backend/schedulers/test_docker.py
@@ -67,6 +67,16 @@ def test_submit_dryrun(docker_scheduler, mock_app_def, docker_executor):
         assert dryrun_info.request is not None
 
 
+def test_submit_dryrun_writes_yaml(docker_scheduler, mock_app_def, docker_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        docker_executor.job_name = "test-job"
+        docker_executor.experiment_dir = exp_dir
+        with mock.patch.object(DockerExecutor, "package"):
+            docker_scheduler._submit_dryrun(mock_app_def, docker_executor)
+        yaml_file = os.path.join(exp_dir, "test-job.yaml")
+        assert os.path.isfile(yaml_file)
+
+
 def test_check_docker_version_success():
     with mock.patch("subprocess.check_output") as mock_check_output:
         mock_check_output.return_value = b"Docker version 20.10.0, build abcdef\n"
diff --git a/test/run/torchx_backend/schedulers/test_lepton.py b/test/run/torchx_backend/schedulers/test_lepton.py
new file mode 100644
index 00000000..644edced
--- /dev/null
+++ b/test/run/torchx_backend/schedulers/test_lepton.py
@@ -0,0 +1,93 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+from unittest import mock
+
+import pytest
+from torchx.schedulers.api import AppDryRunInfo
+from torchx.specs import AppDef, Role
+
+from nemo_run.core.execution.lepton import LeptonExecutor
+from nemo_run.run.torchx_backend.schedulers.lepton import (
+    LeptonScheduler,
+    create_scheduler,
+)
+
+
+@pytest.fixture
+def mock_app_def():
+    return AppDef(name="test_app", roles=[Role(name="test_role", image="")])
+
+
+@pytest.fixture
+def lepton_executor():
+    return LeptonExecutor(
+        container_image="nvcr.io/nvidia/test:latest",
+        nemo_run_dir="/workspace/nemo_run",
+        job_dir=tempfile.mkdtemp(),
+    )
+
+
+@pytest.fixture
+def lepton_scheduler():
+    return create_scheduler(session_name="test_session")
+
+
+def test_create_scheduler():
+    scheduler = create_scheduler(session_name="test_session")
+    assert isinstance(scheduler, LeptonScheduler)
+    assert scheduler.session_name == "test_session"
+
+
+def test_lepton_scheduler_methods(lepton_scheduler):
+    assert hasattr(lepton_scheduler, "_submit_dryrun")
+    assert hasattr(lepton_scheduler, "schedule")
+    assert hasattr(lepton_scheduler, "describe")
+    assert hasattr(lepton_scheduler, "_cancel_existing")
+    assert hasattr(lepton_scheduler, "_validate")
+
+
+def test_submit_dryrun(lepton_scheduler, mock_app_def, lepton_executor):
+    with mock.patch.object(LeptonExecutor, "package") as mock_package:
+        mock_package.return_value = None
+
+        dryrun_info = lepton_scheduler._submit_dryrun(mock_app_def, lepton_executor)
+        assert isinstance(dryrun_info, AppDryRunInfo)
+        assert dryrun_info.request is not None
+
+
+def test_submit_dryrun_writes_script(lepton_scheduler, mock_app_def, lepton_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        lepton_executor.job_name = "test-job"
+        lepton_executor.experiment_dir = exp_dir
+        with mock.patch.object(LeptonExecutor, "package"):
+            lepton_scheduler._submit_dryrun(mock_app_def, lepton_executor)
+        script = os.path.join(exp_dir, "test-job.sh")
+        assert os.path.isfile(script)
+        content = open(script).read()
+        assert "#!/bin/bash" in content
+
+
+def test_submit_dryrun_no_file_without_experiment_dir(
+    lepton_scheduler, mock_app_def, lepton_executor, tmp_path, monkeypatch
+):
+    # experiment_dir is NOT set: run from an empty cwd so that any stray
+    # relative-path write by _submit_dryrun is caught by the assertion below.
+    monkeypatch.chdir(tmp_path)
+    with mock.patch.object(LeptonExecutor, "package"):
+        lepton_scheduler._submit_dryrun(mock_app_def, lepton_executor)
+    assert os.listdir(tmp_path) == []
diff --git a/test/run/torchx_backend/schedulers/test_local.py b/test/run/torchx_backend/schedulers/test_local.py
index 5220d9aa..bb112aa2 100644
--- a/test/run/torchx_backend/schedulers/test_local.py
+++ b/test/run/torchx_backend/schedulers/test_local.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import tempfile
 from unittest import mock
 
@@ -59,6 +60,17 @@ def test_submit_dryrun(local_scheduler, mock_app_def, local_executor):
     # assert callable(dryrun_info.fmt)
 
 
+def test_submit_dryrun_writes_script(local_scheduler, mock_app_def, local_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        local_executor.experiment_dir = exp_dir
+        local_scheduler._submit_dryrun(mock_app_def, local_executor)
+        script = os.path.join(exp_dir, f"{mock_app_def.name}.sh")
+        assert os.path.isfile(script)
+        content = open(script).read()
+        assert "#!/bin/bash" in content
+        assert oct(os.stat(script).st_mode)[-3:] == "700"
+
+
 @mock.patch("nemo_run.run.torchx_backend.schedulers.local._save_job_dir")
 def test_schedule(mock_save, local_scheduler, mock_app_def, local_executor):
     dryrun_info = local_scheduler._submit_dryrun(mock_app_def, local_executor)
diff --git a/test/run/torchx_backend/schedulers/test_skypilot.py b/test/run/torchx_backend/schedulers/test_skypilot.py
index d5fc751e..7f24caea 100644
--- a/test/run/torchx_backend/schedulers/test_skypilot.py
+++ b/test/run/torchx_backend/schedulers/test_skypilot.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import tempfile
 from unittest import mock
 
@@ -70,6 +71,16 @@ def test_submit_dryrun(skypilot_scheduler, mock_app_def, skypilot_executor):
         assert dryrun_info.request is not None
 
 
+def test_submit_dryrun_writes_yaml(skypilot_scheduler, mock_app_def, skypilot_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        skypilot_executor.job_name = "test-job"
+        skypilot_executor.experiment_dir = exp_dir
+        with mock.patch.object(SkypilotExecutor, "package"):
+            skypilot_scheduler._submit_dryrun(mock_app_def, skypilot_executor)
+        yaml_file = os.path.join(exp_dir, "test-job.yaml")
+        assert os.path.isfile(yaml_file)
+
+
 def test_schedule(skypilot_scheduler, mock_app_def, skypilot_executor):
     class MockHandle:
         def get_cluster_name(self):
diff --git a/test/run/torchx_backend/schedulers/test_skypilot_jobs.py b/test/run/torchx_backend/schedulers/test_skypilot_jobs.py
index 46c6d75e..70ba27f9 100644
--- a/test/run/torchx_backend/schedulers/test_skypilot_jobs.py
+++ b/test/run/torchx_backend/schedulers/test_skypilot_jobs.py
@@ -73,6 +73,16 @@ def test_submit_dryrun(skypilot_jobs_scheduler, mock_app_def, skypilot_jobs_exec
         assert dryrun_info.request is not None
 
 
+def test_submit_dryrun_writes_yaml(skypilot_jobs_scheduler, mock_app_def, skypilot_jobs_executor):
+    with tempfile.TemporaryDirectory() as exp_dir:
+        skypilot_jobs_executor.job_name = "test-job"
+        skypilot_jobs_executor.experiment_dir = exp_dir
+        with mock.patch.object(SkypilotJobsExecutor, "package"):
+            skypilot_jobs_scheduler._submit_dryrun(mock_app_def, skypilot_jobs_executor)
+        yaml_file = os.path.join(exp_dir, "test-job.yaml")
+        assert os.path.isfile(yaml_file)
+
+
 def test_schedule(skypilot_jobs_scheduler, mock_app_def, skypilot_jobs_executor):
     class MockHandle:
         def get_cluster_name(self):

From f6c04bf70847ae1f3c0e8f8a8c045e1fbb95c573 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 16 Mar 2026 11:10:51 +0000
Subject: [PATCH 7/7] fix: use context manager for open() in tests to satisfy
 CodeQL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes code-scanning alerts 564 and 565 ("File is not always closed"):
- test_lepton.py: use `with open(script) as f` instead of bare open()
- test_local.py: same fix

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 test/run/torchx_backend/schedulers/test_lepton.py | 3 ++-
 test/run/torchx_backend/schedulers/test_local.py  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/run/torchx_backend/schedulers/test_lepton.py b/test/run/torchx_backend/schedulers/test_lepton.py
index 644edced..bdd826f7 100644
--- a/test/run/torchx_backend/schedulers/test_lepton.py
+++ b/test/run/torchx_backend/schedulers/test_lepton.py
@@ -78,7 +78,8 @@ def test_submit_dryrun_writes_script(lepton_scheduler, mock_app_def, lepton_exec
             lepton_scheduler._submit_dryrun(mock_app_def, lepton_executor)
         script = os.path.join(exp_dir, "test-job.sh")
         assert os.path.isfile(script)
-        content = open(script).read()
+        with open(script) as f:
+            content = f.read()
         assert "#!/bin/bash" in content
 
 
diff --git a/test/run/torchx_backend/schedulers/test_local.py b/test/run/torchx_backend/schedulers/test_local.py
index bb112aa2..91d55346 100644
--- a/test/run/torchx_backend/schedulers/test_local.py
+++ b/test/run/torchx_backend/schedulers/test_local.py
@@ -66,7 +66,8 @@ def test_submit_dryrun_writes_script(local_scheduler, mock_app_def, local_execut
         local_scheduler._submit_dryrun(mock_app_def, local_executor)
         script = os.path.join(exp_dir, f"{mock_app_def.name}.sh")
         assert os.path.isfile(script)
-        content = open(script).read()
+        with open(script) as f:
+            content = f.read()
         assert "#!/bin/bash" in content
         assert oct(os.stat(script).st_mode)[-3:] == "700"