diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py index 7ff8f72b3..e481017a1 100644 --- a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py +++ b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py @@ -32,7 +32,6 @@ class MegatronBridgeCmdArgs(CmdArgs): # Slurm/launcher-level gpu_type: str = Field(default="gb200") log_dir: str = Field(default="") - time_limit: str = Field(default="00:05:00") container_image: str = Field(default="") num_gpus: int = Field(default=8) gpus_per_node: int = Field(default=8) diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py index abb0a1330..fa7825bae 100644 --- a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py @@ -325,7 +325,7 @@ def add_field(field: str, flag: str, value: Any) -> None: add("-p", self.system.default_partition) add_field("gpu_type", "-g", args.gpu_type) add_field("log_dir", "-l", args.log_dir) - add_field("time_limit", "-t", args.time_limit) + add("-t", self.test_run.time_limit) if container_path: add_field("container_image", "-i", container_path) add_field("compute_dtype", "-c", args.compute_dtype) diff --git a/tests/ref_data/megatron-bridge.sbatch b/tests/ref_data/megatron-bridge.sbatch index 884b90091..e3d83952f 100644 --- a/tests/ref_data/megatron-bridge.sbatch +++ b/tests/ref_data/megatron-bridge.sbatch @@ -19,7 +19,7 @@ if [ "${WANDB_INSTALL_RC}" -ne 0 ]; then fi LAUNCH_RC=0 -NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$? +NEMORUN_HOME="__OUTPUT_DIR__/output" __INSTALL_DIR__/Run__main-venv/bin/python __INSTALL_DIR__/Megatron-Bridge__main/scripts/performance/setup_experiment.py -p main -t 00:20:00 -i __OUTPUT_DIR__/output/megatron_bridge_image.sqsh -hf dummy_token -ng 8 -gn 4 --golden_values_path cloudai_megatron_bridge_golden_values.json -cm __INSTALL_DIR__/Megatron-Bridge__main:/opt/Megatron-Bridge -cb 'export CUDA_VISIBLE_DEVICES=0,1,2,3' -cb 'export NCCL_DEBUG=INFO' -m qwen3 -mr 30b_a3b --detach false --additional_slurm_params 'gpus-per-node=4;gres=gpu:4' >>"$LOG" 2>&1 || LAUNCH_RC=$? JOB_ID="" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e62f173fe..928080551 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -173,8 +173,8 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -def create_test_run(partial_tr: partial[TestRun], name: str, test_definition: TestDefinition) -> TestRun: - tr = partial_tr(name=name, test=test_definition) +def create_test_run(partial_tr: partial[TestRun], name: str, test_definition: TestDefinition, **kwargs) -> TestRun: + tr = partial_tr(name=name, test=test_definition, **kwargs) return tr @@ -576,6 +576,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ) ], ), + time_limit="00:20:00", ), "vllm": lambda: create_test_run( partial_tr,