Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
216bf6a
Do not run CommandShell check during object creation
amaslenn Mar 20, 2026
0fb93cd
Update tests
amaslenn Mar 20, 2026
e75edc7
Add noqa: Vulture
amaslenn Mar 20, 2026
1567b71
Update copyright years
amaslenn Mar 20, 2026
65e7761
Make node placement correctly propagate M-Bridge
juntaowww Mar 6, 2026
4fbd785
Allow excluding nodes from group of nodes
juntaowww Mar 10, 2026
7b90e2d
Also init submodules when install git repos
juntaowww Mar 11, 2026
e8e453c
Add options to mount the repo in the container
juntaowww Mar 11, 2026
33dd1aa
Fix slurm gpu resource requesting
juntaowww Mar 11, 2026
3f61208
Update gb200 M-Bridge r0.3.0 qwen recipe
juntaowww Mar 11, 2026
0bbebb5
ruff check & ruff format
juntaowww Mar 11, 2026
638a0c1
Fix tests to reflect updates
juntaowww Mar 11, 2026
1d9aed4
Update copyright year
juntaowww Mar 11, 2026
fb24acd
Add back the defensive filter
juntaowww Mar 11, 2026
d549240
Enhance nodes allocation for edge cases
juntaowww Mar 11, 2026
5493528
Enhance tests by avoiding default names
juntaowww Mar 11, 2026
e298085
Make exclude nodes correctly propagate to M-Bridge
juntaowww Mar 11, 2026
b30a194
Improve error messages for excluding nodes
juntaowww Mar 16, 2026
17b5d1f
Improve exclude nodes APIs
juntaowww Mar 16, 2026
d09d889
Change additional_slurm_params separator to semi-colon
juntaowww Mar 16, 2026
a7e3dab
Make init_submodules optional
juntaowww Mar 16, 2026
6559932
Update configurations
juntaowww Mar 16, 2026
44904a0
Make extra_srun_args correctly propagate to M-Bridge
juntaowww Mar 16, 2026
f3488ae
Allow submission of jobs to resv nodes
juntaowww Mar 16, 2026
d2aa80b
Make the no-mount assertion less brittle
juntaowww Mar 16, 2026
84f4f56
Fix and improve the extra_srun_args propagation
juntaowww Mar 16, 2026
df4ce24
ruff format & copyright year
juntaowww Mar 16, 2026
72676cd
Fix init_submodules is silently bypassed on pre-existing clones
juntaowww Mar 17, 2026
99e6c23
Use shlex.split() instead of str.split()
juntaowww Mar 17, 2026
c42033e
Add missing regression: existing repo + init_submodules=True
juntaowww Mar 17, 2026
7b4fdb6
Export container-runtime env vars before Megatron-Bridge launcher
juntaowww Mar 19, 2026
ed1cd2f
Add supports_gpu_directives check
juntaowww Mar 19, 2026
abeb9b1
Fix tests
juntaowww Mar 19, 2026
fd50fce
Restructure configurations
juntaowww Mar 19, 2026
b17a2f5
Avoid real system calls
amaslenn Mar 20, 2026
f34a541
address linting issues
amaslenn Mar 20, 2026
9f95bf8
Revert unneeded changes
amaslenn Mar 20, 2026
9ab843a
Update copyright years
amaslenn Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "b200"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "gb200"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "gb300"
Expand All @@ -32,6 +31,7 @@ model_family_name = "qwen"
model_recipe_name = "qwen3_30b_a3b"
gpus_per_node = 4
num_gpus = 8
# mb = 4 # In case OOM, uncomment this for smaller micro-batch size
domain = "llm"
task = "pretrain"
compute_dtype = "fp8_mx"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ extra_container_mounts = []
[[git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "v0.3.0"
mount_as = "/opt/Megatron-Bridge"

[cmd_args]
gpu_type = "h100"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test scenario containing a single Megatron-Bridge Qwen 30B test entry.
name = "megatron_bridge_qwen_30b"

[[Tests]]
id = "megatron_bridge_qwen_30b"
test_name = "megatron_bridge_qwen_30b"
# NOTE(review): given as a string here; the scenario model types num_nodes as int | list[int] — confirm coercion is intended.
num_nodes = "2"

# Per-test git repo: Megatron-Bridge at ref "r0.3.0", exposed at the mount_as path,
# with init_submodules enabled so the installer initializes git submodules for the checkout.
[[Tests.git_repos]]
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
commit = "r0.3.0"
mount_as = "/opt/Megatron-Bridge"
init_submodules = true

# Prepends 3rdparty/Megatron-LM (under the mount_as path above) to PYTHONPATH.
# Presumably that directory is a git submodule, which would be why init_submodules is enabled — TODO confirm.
[Tests.extra_env_vars]
PYTHONPATH = "/opt/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"
3 changes: 2 additions & 1 deletion src/cloudai/_core/installables.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -91,6 +91,7 @@ class GitRepo(Installable, BaseModel):

url: str
commit: str
init_submodules: bool = False
installed_path: Optional[Path] = None
mount_as: Optional[str] = None

Expand Down
3 changes: 2 additions & 1 deletion src/cloudai/_core/test_scenario.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -62,6 +62,7 @@ class TestRun:
test: TestDefinition
num_nodes: Union[int, list[int]]
nodes: List[str]
exclude_nodes: List[str] = field(default_factory=list)
output_path: Path = Path("")
iterations: int = 1
current_iteration: int = 0
Expand Down
7 changes: 7 additions & 0 deletions src/cloudai/models/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ class TestRunModel(BaseModel):
test_name: Optional[str] = None
num_nodes: int | list[int] | None = None
nodes: list[str] = Field(default_factory=list)
exclude_nodes: list[str] = Field(
default_factory=list,
description=(
"Hostnames to exclude from the resolved node list. "
"Supports Slurm range syntax, e.g. ['node-048', 'node-[101-104]']."
),
)
weight: int = 0
iterations: int = 1
sol: Optional[float] = None
Expand Down
29 changes: 28 additions & 1 deletion src/cloudai/systems/kubernetes/kubernetes_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
verify_res = self._verify_commit(item.commit, repo_path)
if not verify_res.success:
return verify_res
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
return res
item.installed_path = repo_path
msg = f"Git repository already exists at {repo_path}."
logging.debug(msg)
return InstallStatusResult(True, msg)

res = self._clone_and_setup_repo(item, repo_path)
if not res.success:
return res

item.installed_path = repo_path
return InstallStatusResult(True)

def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
res = self._clone_repository(item.url, repo_path)
if not res.success:
return res
Expand All @@ -172,7 +184,14 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
rmtree(repo_path)
return res

item.installed_path = repo_path
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
if repo_path.exists():
rmtree(repo_path)
return res

return InstallStatusResult(True)

def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:
Expand Down Expand Up @@ -237,6 +256,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
return InstallStatusResult(True)

def _init_submodules(self, path: Path) -> InstallStatusResult:
    """Run ``git submodule update --init --recursive`` inside *path*.

    Args:
        path: Directory handed to git as its working directory.

    Returns:
        A successful ``InstallStatusResult`` when git exits with code 0;
        otherwise a failed one whose message carries git's captured stderr.
    """
    logging.debug(f"Initializing submodules in {path}")
    completed = subprocess.run(
        ["git", "submodule", "update", "--init", "--recursive"],
        cwd=str(path),
        capture_output=True,
        text=True,
    )
    # Success is the short path; anything else reports what git wrote to stderr.
    if completed.returncode == 0:
        return InstallStatusResult(True)
    return InstallStatusResult(False, f"Failed to initialize submodules: {completed.stderr}")

def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
try:
result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
Expand Down
10 changes: 9 additions & 1 deletion src/cloudai/systems/slurm/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,9 @@ def _append_nodes_related_directives(self, content: List[str]) -> Optional[Path]

content.append(f"#SBATCH -N {num_nodes}")

if self.test_run.exclude_nodes:
content.append(f"#SBATCH --exclude={','.join(self.test_run.exclude_nodes)}")
Comment on lines +427 to +428
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Persist exclude_nodes in the stored run spec.

#SBATCH --exclude= only exists in the generated batch script. store_test_run() still dumps the raw srun command, and TestRunDetails has no exclude_nodes field, so the saved run metadata can no longer reconstruct the actual allocation request.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/cloudai/systems/slurm/slurm_command_gen_strategy.py` around lines 427 -
428, The generated SBATCH use of exclude_nodes isn't persisted: add an
exclude_nodes attribute to the TestRunDetails/data model and include
self.test_run.exclude_nodes when building the stored run spec in
store_test_run() in slurm_command_gen_strategy.py so the saved metadata can
reconstruct allocation requests; update any serialization/deserialization logic
that creates or consumes TestRunDetails (and any callers of store_test_run()) to
read/write this new field and ensure tests or storage schema migrations are
adjusted accordingly.


return None

def _format_env_vars(self, env_vars: Dict[str, Any]) -> str:
Expand Down Expand Up @@ -465,12 +468,17 @@ def get_cached_nodes_spec(self) -> tuple[int, list[str]]:
str(self.test_run.step),
str(self.test_run.nnodes),
",".join(self.test_run.nodes),
",".join(self.test_run.exclude_nodes),
]
)

if cache_key in self._node_spec_cache:
logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")
return self._node_spec_cache[cache_key]

self._node_spec_cache[cache_key] = self.system.get_nodes_by_spec(self.test_run.nnodes, self.test_run.nodes)
num_nodes, node_list = self.system.get_nodes_by_spec(
self.test_run.nnodes, self.test_run.nodes, exclude_nodes=self.test_run.exclude_nodes or None
)

self._node_spec_cache[cache_key] = (num_nodes, node_list)
return self._node_spec_cache[cache_key]
29 changes: 28 additions & 1 deletion src/cloudai/systems/slurm/slurm_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,23 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
verify_res = self._verify_commit(item.commit, repo_path)
if not verify_res.success:
return verify_res
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
return res
item.installed_path = repo_path
msg = f"Git repository already exists at {repo_path}."
logging.debug(msg)
return InstallStatusResult(True, msg)

res = self._clone_and_setup_repo(item, repo_path)
if not res.success:
return res

item.installed_path = repo_path
return InstallStatusResult(True)
Comment on lines +212 to +226
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "== Base installer orchestration =="
BASE_INSTALLER_FILE="$(fd 'base_installer\.py$' src | head -n 1 || true)"
if [ -n "${BASE_INSTALLER_FILE}" ]; then
  sed -n '1,260p' "${BASE_INSTALLER_FILE}"
fi

echo
echo "== Git repo installed checks =="
rg -n -C3 'def (_is_git_repo_installed|is_installed_one)|is_installed_one\(' \
  src/cloudai/systems/slurm/slurm_installer.py \
  src/cloudai/systems/kubernetes/kubernetes_installer.py

Repository: NVIDIA/cloudai

Length of output: 13058


🏁 Script executed:

# Get full implementation of is_installed_one and _is_git_repo_installed in slurm_installer.py
echo "=== slurm_installer.py: is_installed_one() ===" 
sed -n '139,170p' src/cloudai/systems/slurm/slurm_installer.py

echo
echo "=== slurm_installer.py: _is_git_repo_installed() ===" 
sed -n '446,485p' src/cloudai/systems/slurm/slurm_installer.py

echo
echo "=== kubernetes_installer.py: is_installed_one() ===" 
sed -n '110,145p' src/cloudai/systems/kubernetes/kubernetes_installer.py

echo
echo "=== Check if init_submodules is validated in status checks ==="
rg -n 'init_submodules' src/cloudai/systems/slurm/slurm_installer.py src/cloudai/systems/kubernetes/kubernetes_installer.py

Repository: NVIDIA/cloudai

Length of output: 6552


🏁 Script executed:

# Verify if _verify_commit checks submodules state
echo "=== _verify_commit implementation ===" 
rg -n -A10 'def _verify_commit' src/cloudai/systems/slurm/slurm_installer.py | head -20

Repository: NVIDIA/cloudai

Length of output: 700


Add init_submodules validation to status check methods.

The _is_git_repo_installed() method (and is_installed_one() in Kubernetes) validates only repository existence and commit, but does not check if submodules are initialized when item.init_submodules is true. While the current orchestration doesn't pre-check status before installing, the incomplete status API creates a correctness issue: external calls to is_installed() or future optimizations that check status first will incorrectly report a repository as installed if it exists at the correct commit but lacks initialized submodules.

Update _is_git_repo_installed() and the Kubernetes is_installed_one() GitRepo branch to verify submodules state when applicable, similar to the checks already present in _install_one_git_repo() and _clone_and_setup_repo().

🧰 Tools
🪛 Ruff (0.15.6)

[warning] 218-218: debug() call on root logger

(LOG015)


[warning] 219-219: Boolean positional value in function call

(FBT003)


[warning] 226-226: Boolean positional value in function call

(FBT003)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/cloudai/systems/slurm/slurm_installer.py` around lines 212 - 226, The
status checks for Git repos currently only verify existence and commit but not
submodule initialization, so update _is_git_repo_installed() and the GitRepo
branch of is_installed_one() to also validate submodules when
item.init_submodules is true: call the same submodule-check logic used in
_install_one_git_repo()/_clone_and_setup_repo() (or factor that check into a
helper) to ensure submodules are initialized and at the correct state before
returning success; if the submodule check fails, return a failing
InstallStatusResult with an explanatory message.


def _clone_and_setup_repo(self, item: GitRepo, repo_path: Path) -> InstallStatusResult:
res = self._clone_repository(item.url, repo_path)
if not res.success:
return res
Expand All @@ -225,7 +237,14 @@ def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:
rmtree(repo_path)
return res

item.installed_path = repo_path
if item.init_submodules:
res = self._init_submodules(repo_path)
if not res.success:
logging.error(f"Submodule init failed, removing cloned repository at {repo_path}")
if repo_path.exists():
rmtree(repo_path)
return res

return InstallStatusResult(True)

def _install_python_executable(self, item: PythonExecutable) -> InstallStatusResult:
Expand Down Expand Up @@ -290,6 +309,14 @@ def _checkout_commit(self, commit_hash: str, path: Path) -> InstallStatusResult:
return InstallStatusResult(False, f"Failed to checkout commit {commit_hash}: {result.stderr}")
return InstallStatusResult(True)

def _init_submodules(self, path: Path) -> InstallStatusResult:
    """Initialize and update all git submodules, recursively, for the repo at *path*.

    Args:
        path: Directory used as the working directory for the git command.

    Returns:
        ``InstallStatusResult(True)`` if git returns exit code 0; otherwise a
        failed result whose message includes the stderr captured from git.
    """
    logging.debug(f"Initializing submodules in {path}")
    git_args = ["git", "submodule", "update", "--init", "--recursive"]
    proc = subprocess.run(git_args, cwd=str(path), capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        return InstallStatusResult(False, f"Failed to initialize submodules: {proc.stderr}")
    return InstallStatusResult(True)

def _verify_commit(self, ref: str, path: Path) -> InstallStatusResult:
try:
result = subprocess.run(["git", "rev-parse", "HEAD"], cwd=str(path), capture_output=True, text=True)
Expand Down
Loading
Loading