diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml index 89cd95dd0..58a6d1dd7 100644 --- a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "b200" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 8 num_gpus = 8 domain = "llm" task = "pretrain" diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml index d0ca6e2fb..56f829352 100644 --- a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "gb200" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 4 num_gpus = 8 domain = "llm" task = "pretrain" diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml index f68899a08..675d395f7 100644 --- a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "gb300" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 4 num_gpus = 8 # mb = 4 # In case OOM, uncomment this for smaller micro-batch size domain = "llm" diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml index 84c52f893..78e595eae 100644 --- a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml +++ b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml @@ -29,7 +29,6 @@ gpu_type = "h100" container_image = "nvcr.io#nvidia/nemo:26.02.00" model_family_name = "qwen" model_recipe_name = "qwen3_30b_a3b" -gpus_per_node = 8 num_gpus = 16 domain = "llm" task = "pretrain" diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py index 7ff8f72b3..4f6c07d78 100644 --- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py +++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py @@ -35,7 +35,6 @@ class MegatronBridgeCmdArgs(CmdArgs): time_limit: str = Field(default="00:05:00") container_image: str = Field(default="") num_gpus: int = Field(default=8) - gpus_per_node: int = Field(default=8) enable_vboost: bool | None = Field(default=False) dryrun: bool | None = Field(default=False) enable_nsys: bool | None = Field(default=False) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index abb0a1330..ef166eab1 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -341,7 +341,7 @@ def add_field(field: str, flag: str, value: Any) -> None: if args.dryrun and "dryrun" in fields_set: parts.append("-d") add_field("num_gpus", "-ng", args.num_gpus) - add_field("gpus_per_node", "-gn", args.gpus_per_node) + add_field("gpus_per_node", "-gn", self.system.gpus_per_node) # Always provide a stable golden values filename so Megatron-Bridge writes parsed metrics to disk. add("--golden_values_path", GOLDEN_VALUES_FILENAME) if mounts: @@ -455,9 +455,9 @@ def add_field(field: str, flag: str, value: Any) -> None: additional_slurm_params: list[str] = [] - if args.gpus_per_node and self.system.supports_gpu_directives: - additional_slurm_params.append(f"gpus-per-node={args.gpus_per_node}") - additional_slurm_params.append(f"gres=gpu:{args.gpus_per_node}") + if self.system.gpus_per_node and self.system.supports_gpu_directives: + additional_slurm_params.append(f"gpus-per-node={self.system.gpus_per_node}") + additional_slurm_params.append(f"gres=gpu:{self.system.gpus_per_node}") _, node_list = self.get_cached_nodes_spec() if node_list: diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch index 884b90091..637b27103 100644 --- a/tests/ref_data/megatron-bridge.sbatch +++ b/tests/ref_data/megatron-bridge.sbatch @@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then fi LAUNCH_RC=0 -NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$? +NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 8 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=8;gres=gpu:8' >>"$LOG" 2>&1 || LAUNCH_RC=$? JOB_ID="" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e62f173fe..5c601d5a2 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -564,7 +564,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - model_family_name="qwen3", model_recipe_name="30b_a3b", num_gpus=8, - gpus_per_node=4, ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3", "NCCL_DEBUG": "INFO"}, extra_container_mounts=[], diff --git a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py index 6a977d95e..8c2e96acd 100644 --- a/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py +++ b/tests/workloads/megatron_bridge/test_command_gen_strategy_slurm.py @@ -377,16 +377,28 @@ def test_no_mount_as_skips_repo_container_mount( assert f"{repo_path.absolute()}:" not in wrapper_content assert ":/opt/Megatron-Bridge" not in wrapper_content - def test_gpus_per_node_passed_as_additional_slurm_param( - self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun] + @pytest.mark.parametrize(("system_gpus_per_node", "expected_gpus"), ((None, None), (4, 4))) + def test_gpus_per_node( + self, + configured_slurm_system: SlurmSystem, + make_test_run: Callable[..., TestRun], + system_gpus_per_node: int | None, + expected_gpus: int | None, ) -> None: configured_slurm_system.supports_gpu_directives_cache = True - tr = make_test_run(cmd_args_overrides={"gpus_per_node": 2}, output_subdir="out_gpus") + configured_slurm_system.gpus_per_node = system_gpus_per_node + tr = make_test_run(output_subdir="out_gpus") cmd_gen = MegatronBridgeSlurmCommandGenStrategy(configured_slurm_system, tr) wrapper_content = self._wrapper_content(cmd_gen) - assert "--additional_slurm_params" in wrapper_content - assert "gpus-per-node=2" in wrapper_content - assert "gres=gpu:2" in wrapper_content + + if expected_gpus is None: + assert "--additional_slurm_params" not in wrapper_content + assert "-gn" not in wrapper_content + else: + assert "--additional_slurm_params" in wrapper_content + assert f"gpus-per-node={expected_gpus}" in wrapper_content + assert f"gres=gpu:{expected_gpus}" in wrapper_content + assert f"-gn {expected_gpus}" in wrapper_content def test_gpus_per_node_skipped_when_gpu_directives_unsupported( self, configured_slurm_system: SlurmSystem, make_test_run: Callable[..., TestRun]