From 8279eab75b9ef4a3c136b4e4ff182c6bafbfbbe5 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 25 Mar 2026 12:16:35 +0100 Subject: [PATCH 1/4] using gpus-per-node from system if available --- .../megatron_bridge/megatron_bridge.py | 2 +- .../slurm_command_gen_strategy.py | 9 ++--- .../test_command_gen_strategy_slurm.py | 33 +++++++++++++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py index 7ff8f72b3..b88cf0523 100644 --- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py +++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py @@ -35,7 +35,7 @@ class MegatronBridgeCmdArgs(CmdArgs): time_limit: str = Field(default="00:05:00") container_image: str = Field(default="") num_gpus: int = Field(default=8) - gpus_per_node: int = Field(default=8) + gpus_per_node: int | None = None enable_vboost: bool | None = Field(default=False) dryrun: bool | None = Field(default=False) enable_nsys: bool | None = Field(default=False) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index abb0a1330..84e424df3 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -341,7 +341,7 @@ def add_field(field: str, flag: str, value: Any) -> None: if args.dryrun and "dryrun" in fields_set: parts.append("-d") add_field("num_gpus", "-ng", args.num_gpus) - add_field("gpus_per_node", "-gn", args.gpus_per_node) + add_field("gpus_per_node", "-gn", args.gpus_per_node or self.system.gpus_per_node) # Always provide a stable golden values filename so Megatron-Bridge writes parsed metrics to disk. add("--golden_values_path", GOLDEN_VALUES_FILENAME) if mounts: @@ -455,9 +455,10 @@ def add_field(field: str, flag: str, value: Any) -> None: additional_slurm_params: list[str] = [] - if args.gpus_per_node and self.system.supports_gpu_directives: - additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}") - additional_slurm_params.append(f"gres=gpu:{args.gpus_per_node}") + gpus_per_node = args.gpus_per_node or self.system.gpus_per_node + if gpus_per_node and self.system.supports_gpu_directives: + additional_slurm_params.append(f"gpus-per-node={gpus_per_node}") + additional_slurm_params.append(f"gres=gpu:{gpus_per_node}") _, node_list = self.get_cached_nodes_spec() if node_list: diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 6a977d95e..2aeb67794 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -377,16 +377,37 @@ def test_no_mount_as_skips_repo_container_mount( assert f"{repo_path.absolute()}:" not in wrapper_content assert ":/opt/Megatron-Bridge" not in wrapper_content - def test_gpus_per_node_passed_as_additional_slurm_param( - self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + @pytest.mark.parametrize( + ("cmd_args_gpus_per_node", "system_gpus_per_node", "expected_gpus"), + ( + (None, None, None), + (2, None, 2), + (2, 4, 2), + (None, 4, 4), + ), + ) + def test_gpus_per_node( + self, + configured_slurm_system: SlurmSystem, + make_test_run: Callable[..., TestRun], + cmd_args_gpus_per_node: int | None, + system_gpus_per_node: int | None, + expected_gpus: int | None, ) -> None: configured_slurm_system.supports_gpu_directives_cache = True - tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus") + configured_slurm_system.gpus_per_node = system_gpus_per_node + tr = make_test_run(cmd_args_overrides={"gpus_per_node": cmd_args_gpus_per_node}, output_subdir="out_gpus") cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) - assert "--additional_slurm_params" in wrapper_content - assert "gpus-per-node=2" in wrapper_content - assert "gres=gpu:2" in wrapper_content + + if expected_gpus is None: + assert "--additional_slurm_params" not in wrapper_content + assert "-gn" not in wrapper_content + else: + assert "--additional_slurm_params" in wrapper_content + assert f"gpus-per-node={expected_gpus}" in wrapper_content + assert f"gres=gpu:{expected_gpus}" in wrapper_content + assert f"-gn {expected_gpus}" in wrapper_content def test_gpus_per_node_skipped_when_gpu_directives_unsupported( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] From 545da11defbbde4d47d3b3a093cd3da10a4de292 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 25 Mar 2026 12:27:40 +0100 Subject: [PATCH 2/4] update mbridge configs to use system gpus-per-node --- .../megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml | 1 - .../megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml | 1 - .../megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml | 1 - .../megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml | 1 - 4 files changed, 4 deletions(-) diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml index 89cd95dd0..58a6d1dd7 100644 --- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "b200" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 8 num_gpus = 8 domain = "llm" task = "pretrain" diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index d0ca6e2fb..56f829352 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "gb200" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 4 num_gpus = 8 domain = "llm" task = "pretrain" diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml index f68899a08..675d395f7 100644 --- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "gb300" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 4 num_gpus = 8 # mb = 4 # In case OOM, uncomment this for smaller micro-batch size domain = "llm" diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml index 84c52f893..78e595eae 100644 --- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "h100" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 8 num_gpus = 16 domain = "llm" task = "pretrain" From d8c55fa2c36c1a4e746ce2c6d400153c87eadd79 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 25 Mar 2026 13:16:04 +0100 Subject: [PATCH 3/4] remove gpus-per-node from cmdargs --- .../workloads/megatron_bridge/megatron_bridge.py | 1 - .../megatron_bridge/slurm_command_gen_strategy.py | 9 ++++----- tests/ref_data/megatron-bridge.sbatch | 2 +- .../test_command_gen_strategy_slurm.py | 13 ++----------- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py index b88cf0523..4f6c07d78 100644 --- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py +++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py @@ -35,7 +35,6 @@ class MegatronBridgeCmdArgs(CmdArgs): time_limit: str = Field(default="00:05:00") container_image: str = Field(default="") num_gpus: int = Field(default=8) - gpus_per_node: int | None = None enable_vboost: bool | None = Field(default=False) dryrun: bool | None = Field(default=False) enable_nsys: bool | None = Field(default=False) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index 84e424df3..ef166eab1 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -341,7 +341,7 @@ def add_field(field: str, flag: str, value: Any) -> None: if args.dryrun and "dryrun" in fields_set: parts.append("-d") add_field("num_gpus", "-ng", args.num_gpus) - add_field("gpus_per_node", "-gn", args.gpus_per_node or self.system.gpus_per_node) + add_field("gpus_per_node", "-gn", self.system.gpus_per_node) # Always provide a stable golden values filename so Megatron-Bridge writes parsed metrics to disk. add("--golden_values_path", GOLDEN_VALUES_FILENAME) if mounts: @@ -455,10 +455,9 @@ def add_field(field: str, flag: str, value: Any) -> None: additional_slurm_params: list[str] = [] - gpus_per_node = args.gpus_per_node or self.system.gpus_per_node - if gpus_per_node and self.system.supports_gpu_directives: - additional_slurm_params.append(f"gpus-per-node={gpus_per_node}") - additional_slurm_params.append(f"gres=gpu:{gpus_per_node}") + if self.system.gpus_per_node and self.system.supports_gpu_directives: + additional_slurm_params.append(f"gpus-per-node={self.system.gpus_per_node}") + additional_slurm_params.append(f"gres=gpu:{self.system.gpus_per_node}") _, node_list = self.get_cached_nodes_spec() if node_list: diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch index 884b90091..637b27103 100644 --- a/tests/ref_data/megatron-bridge.sbatch +++ b/tests/ref_data/megatron-bridge.sbatch @@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then fi LAUNCH_RC=0 -NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$? +NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 8 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=8;gres=gpu:8' >>"$LOG" 2>&1 || LAUNCH_RC=$? JOB_ID="" diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 2aeb67794..8c2e96acd 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -377,26 +377,17 @@ def test_no_mount_as_skips_repo_container_mount( assert f"{repo_path.absolute()}:" not in wrapper_content assert ":/opt/Megatron-Bridge" not in wrapper_content - @pytest.mark.parametrize( - ("cmd_args_gpus_per_node", "system_gpus_per_node", "expected_gpus"), - ( - (None, None, None), - (2, None, 2), - (2, 4, 2), - (None, 4, 4), - ), - ) + @pytest.mark.parametrize(("system_gpus_per_node", "expected_gpus"), ((None, None), (4, 4))) def test_gpus_per_node( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun], - cmd_args_gpus_per_node: int | None, system_gpus_per_node: int | None, expected_gpus: int | None, ) -> None: configured_slurm_system.supports_gpu_directives_cache = True configured_slurm_system.gpus_per_node = system_gpus_per_node - tr = make_test_run(cmd_args_overrides={"gpus_per_node": cmd_args_gpus_per_node}, output_subdir="out_gpus") + tr = make_test_run(output_subdir="out_gpus") cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) From 9821af0c86ece1883f4ea3526d760c39b3c0a20d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 25 Mar 2026 18:54:19 +0100 Subject: [PATCH 4/4] remove unused param --- tests/test_acceptance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e62f173fe..5c601d5a2 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -564,7 +564,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - model_family_name="qwen3", model_recipe_name="30b_a3b", num_gpus=8, - gpus_per_node=4, ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3", "NCCL_DEBUG": "INFO"}, extra_container_mounts=[],