AMD-AGI · clairesonglee · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
@@ -1,6 +1,6 @@
 # Base image
 # FROM docker.io/rocm/megatron-lm:v25.9_gfx942
-FROM docker.io/rocm/primus:v25.9_gfx942
+FROM docker.io/rocm/pyt-megatron-lm-jax-nightly-private:pytorch_rocm7.0_20251024
 
 # Specify the commit of Primus-Turbo when building (docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .)
 ARG PRIMUS_TURBO_COMMIT

@@ -62,7 +62,7 @@ modules:
       # fused wgrad gemm and accumulation
       gradient_accumulation_fusion: false
       # recommended: set to `false` in fp8
-      moe_use_legacy_grouped_gemm: true
+      moe_use_legacy_grouped_gemm: false
       # fused topk router with aux score
       moe_use_fused_router_with_aux_score: false
       # pad 192/128 for deepseek attention

@@ -68,8 +68,8 @@ modules:
 
       # Turbo
       enable_primus_turbo: true
-      use_turbo_attention: true
-      use_turbo_grouped_mlp: true
+      use_turbo_attention: false
+      use_turbo_grouped_mlp: false
 
       # Cross entropy flags
       # cross_entropy_fusion_impl: "te"

@@ -0,0 +1,94 @@
+work_group: ${PRIMUS_TEAM:amd}
+user_name: ${PRIMUS_USER:root}
+exp_name: ${PRIMUS_EXP_NAME:mamba_370M-pretrain}
+workspace: ${PRIMUS_WORKSPACE:./output}
+
+modules:
+  pre_trainer:
+    framework: megatron
+    config: pre_trainer.yaml
+
+    # model config file to run
+    model: mamba_370M.yaml
+    overrides:
+      # logging
+      wandb_project: "Primus_Mamba_Pretrain"
+      # disable_wandb: false
+      # disable_tensorboard: false
+      stderr_sink_level: DEBUG
+
+      eval_iters: 0
+
+      log_avg_skip_iterations: 2
+      log_avg_reset_interval: 50
+
+      train_iters: 50
+      micro_batch_size: 4
+      global_batch_size: 256
+
+      seq_length: 2048
+      max_position_embeddings: 2048
+
+      lr: 3.0e-4
+      min_lr: 0.0
+      lr_warmup_iters: 50000  # NOTE(review): exceeds train_iters (50) — confirm intended
+      lr_decay_iters: 73192188  # NOTE(review): far beyond train_iters (50) — confirm intended
+      lr_decay_style: cosine
+      weight_decay: 0.1
+      adam_beta1: 0.9
+      adam_beta2: 0.95
+      eod_mask_loss: true
+      init_method_std: 0.02
+      norm_epsilon: 1.0e-5
+
+      # Mamba-specific: must provide spec
+      spec: ['megatron.core.models.mamba.mamba_layer_specs', 'mamba_stack_spec']
+
+      # Tokenizer
+      tokenizer_type: HuggingFaceTokenizer
+      tokenizer_model: meta-llama/Llama-3.2-1B
+
+      # Mamba SSM parameters (hybrid mode off; attention and MLP ratios are 0.0)
+      is_hybrid_model: false
+      hybrid_attention_ratio: 0.0
+      hybrid_mlp_ratio: 0.0
+      mamba_state_dim: 16
+      mamba_head_dim: 64
+      mamba_num_groups: 8
+
+      # parallelism (tensor/pipeline/expert model-parallel sizes are all 1)
+      tensor_model_parallel_size: 1
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 1
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      gradient_accumulation_fusion: false
+
+      # data (mock data enabled; dataset paths left null)
+      mock_data: true
+      train_data_path: null
+      valid_data_path: null
+      test_data_path: null
+
+      # checkpointing (load and save paths are null)
+      finetune: false
+      auto_continue_train: false
+      load: null
+      no_load_optim: null
+      no_load_rng: null
+      save: null
+      save_interval: 20000
+      no_save_optim: null
+      no_save_rng: null
+      disable_last_saving: true
+      ckpt_format: torch
+
+      # Turbo - may need to disable for Mamba if not supported
+      enable_primus_turbo: false
+      use_turbo_attention: false
+      use_turbo_grouped_mlp: false
+
+      # Cross entropy flags
+      # cross_entropy_fusion_impl: "native"
+      # cross_entropy_loss_fusion: false
+
@@ -0,0 +1,84 @@
+work_group: ${PRIMUS_TEAM:amd}
+user_name: ${PRIMUS_USER:root}
+exp_name: ${PRIMUS_EXP_NAME:mamba_hybrid_2.8B-pretrain}
+workspace: ${PRIMUS_WORKSPACE:./output}
+
+modules:
+  pre_trainer:
+    framework: megatron
+    config: pre_trainer.yaml
+
+    # model config file to run
+    model: mamba_hybrid_2.8B.yaml
+    overrides:
+      # logging
+      wandb_project: "Primus_Mamba_Hybrid_Pretrain"
+      stderr_sink_level: DEBUG
+
+      eval_iters: 0
+
+      log_avg_skip_iterations: 2
+      log_avg_reset_interval: 50
+
+      train_iters: 100
+      micro_batch_size: 2
+      global_batch_size: 128
+
+      seq_length: 4096
+      max_position_embeddings: 4096
+
+      lr: 2.0e-4
+      min_lr: 2.0e-5
+      lr_warmup_iters: 200  # NOTE(review): exceeds train_iters (100) — confirm intended
+      lr_decay_iters: 10000
+      lr_decay_style: cosine
+      weight_decay: 0.1
+      adam_beta1: 0.9
+      adam_beta2: 0.95
+      eod_mask_loss: true
+      init_method_std: 0.02
+      norm_epsilon: 1.0e-5
+
+      # Mamba-specific: must provide spec
+      spec: ['megatron.core.models.mamba.mamba_layer_specs', 'mamba_stack_spec']
+
+      # Tokenizer
+      tokenizer_type: HuggingFaceTokenizer
+      tokenizer_model: meta-llama/Llama-3.2-1B
+
+      # Hybrid Mamba+Attention parameters
+      is_hybrid_model: true
+      hybrid_attention_ratio: 0.125
+      hybrid_mlp_ratio: 0.0
+      mamba_state_dim: 16
+      mamba_head_dim: 64
+      mamba_num_groups: 8
+
+      # parallelism (tensor parallel size 2; pipeline and expert sizes 1)
+      tensor_model_parallel_size: 2
+      pipeline_model_parallel_size: 1
+      expert_model_parallel_size: 1
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      gradient_accumulation_fusion: true
+
+      # data (mock data enabled; dataset paths left null)
+      mock_data: true
+      train_data_path: null
+      valid_data_path: null
+      test_data_path: null
+
+      # checkpointing (load and save paths are null)
+      finetune: false
+      auto_continue_train: false
+      load: null
+      save: null
+      save_interval: 10000
+      disable_last_saving: true
+      ckpt_format: torch
+
+      # Turbo - disable for Mamba layers, but attention layers may benefit (all Turbo flags are off here)
+      enable_primus_turbo: false
+      use_turbo_attention: false
+      use_turbo_grouped_mlp: false
+
@@ -68,6 +68,8 @@ modules:
       # pad 192/128 for deepseek attention
       fused_padded_mla_attention: false
 
+      multi_latent_attention: true
+
       # ckpt
       finetune: false
       auto_continue_train: false

@@ -63,7 +63,7 @@ modules:
       # pad 192/128 for deepseek attention
       fused_padded_mla_attention: false
 
-      multi_latent_attention: false
+      # multi_latent_attention: true
 
       # ckpt
       finetune: false
@@ -80,7 +80,7 @@ modules:
       eval_iters: 0
 
       # Turbo
-      enable_primus_turbo: true
+      enable_primus_turbo: false
       use_turbo_attention: false
       use_turbo_grouped_mlp: false
 

@@ -63,9 +63,6 @@ modules:
       # pad 192/128 for deepseek attention
       fused_padded_mla_attention: false
 
-      # Performance toggles
-      #multi_latent_attention: false
-      #apply_rope_fusion: true
 
       # ckpt
       finetune: false
@@ -82,7 +79,7 @@ modules:
       eval_iters: 0
 
       # Turbo
-      enable_primus_turbo: true
+      enable_primus_turbo: false
       use_turbo_attention: false
       use_turbo_grouped_mlp: false
 

@@ -143,6 +143,7 @@ docker_podman_proxy run --rm \
     --env TORCHTITAN_PATH \
     --env MAXTEXT_PATH \
     --env BACKEND_PATH \
+    --env REBUILD_PRIMUS_TURBO \
     "${ENV_ARGS[@]}" \
     --ipc=host --network=host \
     --device=/dev/kfd --device=/dev/dri \

@@ -275,7 +275,31 @@ export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_
 export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0}
 
 # Note: Disable fp32 atomic if you find any accuracy issues.
-export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:0}
+export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0}
+
+# install primus turbo from source
+export REBUILD_PRIMUS_TURBO=${REBUILD_PRIMUS_TURBO:-0}
+if [ "$REBUILD_PRIMUS_TURBO" == "1" ]; then
+    LOG_INFO "Rebuilding Primus Turbo from source..."
+    mkdir -p "/workspace/turbo"
+    cd "/workspace/turbo"
+
+    # Clean up old directory if exists to avoid git clone conflicts
+    if [ -d "Primus-Turbo" ]; then
+        LOG_INFO "Removing existing Primus-Turbo directory..."
+        rm -rf Primus-Turbo
+    fi
+
+    git clone https://github.com/AMD-AGI/Primus-Turbo.git --recursive 
+    cd Primus-Turbo
+    pip3 install -r requirements.txt
+    # Set GPU_ARCHS to compile Turbo for multiple AMD GPU architectures.
+    GPU_ARCHS="gfx942;gfx950" pip3 install --no-build-isolation .
+    cd "${PRIMUS_PATH}"
+    LOG_INFO "Rebuilding Primus Turbo from source done."
+else
+    LOG_INFO "Skip Primus Turbo rebuild. REBUILD_PRIMUS_TURBO=$REBUILD_PRIMUS_TURBO"
+fi
 
 # nvte debug envs
 export NVTE_DEBUG=0 # 0, 1

@@ -57,5 +57,6 @@ srun -N "${NNODES}" \
           export NNODES=\${SLURM_NNODES}
           export NODE_RANK=\${SLURM_PROCID}
           export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE}
+          export REBUILD_PRIMUS_TURBO=\${REBUILD_PRIMUS_TURBO}
           bash ${SCRIPT_DIR}/run_local_pretrain.sh \"\$@\" 2>&1 | tee ${LOG_FILE}
      " bash "$@"
@@ -13,14 +13,14 @@ modules:
     model: deepseek_v3_16b.yaml
     overrides:
       profiling:
-        enable_profiling: false
+        enable_profiling: true
         save_traces_folder: "profile_trace"
         profile_freq: 10
         enable_memory_snapshot: false
         save_memory_snapshot_folder: "memory_snapshot"
 
       metrics:
-        log_freq: 10
+        log_freq: 1
         disable_color_printing: false
         enable_tensorboard: false
         save_tb_folder: "tb"
@@ -38,11 +38,12 @@ modules:
         min_lr_factor: 0.1
 
       training:
+        debug_moe_force_load_balance: true
         local_batch_size: 4
         seq_len: 4096
         max_norm: 1.0            # grad norm clipping
-        steps: 1000
-        dataset: "c4"            # supported datasets: c4_test (2K), c4 (177M)
+        steps: 15
+        dataset: "c4_test"            # supported datasets: c4_test (2K), c4 (177M)
 
       parallelism:
         data_parallel_replicate_degree: 1
@@ -69,12 +70,16 @@ modules:
 
       compile:
         enable: true
-        components: ["loss"]     # ["model", "loss"]
+        components: ["model", "loss"]     # ["model", "loss"]
 
       primus_turbo:
         enable_primus_turbo: true
+        use_turbo_mx_linear: false
+        use_turbo_float8_linear: true
         enable_attention_float8: false
-
+        use_turbo_grouped_mm: true
+        use_moe_fp8: false
+
       # quantize:
       #   linear:
       #     float8: