Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3de16a0
Disable torchtitan activation checkpointing for better 8B model perfo…
alfuyao1986 Oct 2, 2025
7d6d52b
Disable torchtitan activation checkpointing for better 8B model perfo…
alfuyao1986 Oct 2, 2025
18cd0be
Set default for MI300 best perf.
alfuyao1986 Oct 2, 2025
6075971
Update llama3.1_70B-BF16-pretrain.yaml drop log frequency to reduce l…
alfuyao1986 Oct 2, 2025
464d1ae
Update llama3.1_70B-FP8-pretrain.yaml drop log frequency to reduce lo…
alfuyao1986 Oct 2, 2025
05ed2ff
Update llama3.1_8B-BF16-pretrain.yaml drop log frequency to reduce lo…
alfuyao1986 Oct 2, 2025
aab4234
Update llama3.1_8B-FP8-pretrain.yaml drop log frequency to reduce log…
alfuyao1986 Oct 2, 2025
d1ec787
tw script update
clairesonglee Oct 6, 2025
5fcbe85
remove cluster-specific commands
clairesonglee Oct 8, 2025
e16b27b
update common perf arguments - ce fusion - moe gemms
vidushi8 Oct 9, 2025
5ca6561
Merge branch 'main' into release/v25.10
vidushi8 Nov 19, 2025
ba187e4
Revert "refactor(torchtitan): rollback Titan to 99c0cb2(20250907) and…
vidushi8 Nov 19, 2025
be3b984
torchtitan: tune FP8 configs and share quant settings
Xiaoming-AMD Nov 18, 2025
01f745d
update torchtitan yaml
vidushi8 Nov 19, 2025
e226b05
enable mla configs in DS models
vidushi8 Nov 19, 2025
9f94561
update fp8 llama3 70b tt yaml
vidushi8 Nov 19, 2025
299322e
update torctitan config to use real dataset
vidushi8 Nov 19, 2025
eabc2f8
update mi300 ds model yamls with mla
vidushi8 Nov 19, 2025
919f9f6
Revert "update mi300 ds model yamls with mla"
vidushi8 Nov 19, 2025
554be01
fix PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32 typo. It will now default to 0
vidushi8 Nov 20, 2025
116c196
Merge branch 'main' into release/v25.10
vidushi8 Nov 21, 2025
653f0ec
update torch profiler gzip to false
vidushi8 Nov 25, 2025
343b5fe
support turbo groupgemm in titan
Nov 19, 2025
04aa8ac
add turbo fp8 gemm and attn via Converter
llying-001 Nov 20, 2025
80f247a
update deepseek_v3 config and load balance config
llying-001 Nov 21, 2025
d48cd84
add classic attn but with issues
llying-001 Nov 21, 2025
18f37be
update classic attention args for deepseek_v3
llying-001 Nov 22, 2025
a835bb4
update config
llying-001 Nov 22, 2025
48fe128
support new turo fp8 api
JohnQinAMD Nov 25, 2025
ad43031
support install turbo from source
JohnQinAMD Nov 26, 2025
f89348d
add dsv3 config for MI355
JohnQinAMD Nov 26, 2025
d80b8d1
Merge dev/john/titan-ptc into release/v25.10
JohnQinAMD Nov 26, 2025
c892660
initial commit
clairesonglee Dec 10, 2025
d8ae27d
set self.lr_warmup_steps < self.lr_decay_steps
clairesonglee Dec 16, 2025
df0b00e
unwrap model to remove loss_mask parameter
clairesonglee Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Base image
# FROM docker.io/rocm/megatron-lm:v25.9_gfx942
FROM docker.io/rocm/primus:v25.9_gfx942
FROM docker.io/rocm/pyt-megatron-lm-jax-nightly-private:pytorch_rocm7.0_20251024

# Specify the Primus-Turbo commit when building (docker build --build-arg PRIMUS_TURBO_COMMIT=xxx .)
ARG PRIMUS_TURBO_COMMIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ modules:
# fused wgrad gemm and accumulation
gradient_accumulation_fusion: false
# recommended: set to `false` when using fp8
moe_use_legacy_grouped_gemm: true
moe_use_legacy_grouped_gemm: false
# fused topk router with aux score
moe_use_fused_router_with_aux_score: false
# pad 192/128 for deepseek attention
Expand Down
4 changes: 2 additions & 2 deletions examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ modules:

# Turbo
enable_primus_turbo: true
use_turbo_attention: true
use_turbo_grouped_mlp: true
use_turbo_attention: false
use_turbo_grouped_mlp: false

# Cross entropy flags
# cross_entropy_fusion_impl: "te"
Expand Down
94 changes: 94 additions & 0 deletions examples/megatron/configs/MI300X/mamba_370M-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Mamba 370M pretraining experiment for the Megatron backend (MI300X example config).
# Values written as ${VAR:default} are presumably resolved from the environment,
# with the text after ':' used as the fallback -- confirm against the config loader.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:mamba_370M-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model definition to load
    model: mamba_370M.yaml

    # everything below overrides values coming from the files referenced above
    overrides:
      # ---- logging ----
      wandb_project: 'Primus_Mamba_Pretrain'
      # disable_wandb: false
      # disable_tensorboard: false
      stderr_sink_level: DEBUG

      # evaluation is switched off (zero eval iterations)
      eval_iters: 0

      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      # ---- run length and batch sizes ----
      train_iters: 50
      micro_batch_size: 4
      global_batch_size: 256

      # ---- sequence ----
      seq_length: 2048
      max_position_embeddings: 2048

      # ---- optimizer / LR schedule ----
      # NOTE(review): lr_warmup_iters (50000) is far larger than train_iters (50),
      # so this run ends while still inside warmup -- presumably acceptable for a
      # short benchmark run; confirm before using this config for real training.
      lr: 3.0e-4
      min_lr: 0.0
      lr_warmup_iters: 50000
      lr_decay_iters: 73192188
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.02
      norm_epsilon: 1.0e-5

      # ---- Mamba layer spec (required): [module path, spec name] ----
      spec:
        - 'megatron.core.models.mamba.mamba_layer_specs'
        - 'mamba_stack_spec'

      # ---- tokenizer ----
      tokenizer_type: HuggingFaceTokenizer
      tokenizer_model: meta-llama/Llama-3.2-1B

      # ---- Mamba SSM parameters (hybrid mode off, both hybrid ratios 0.0) ----
      is_hybrid_model: false
      hybrid_attention_ratio: 0.0
      hybrid_mlp_ratio: 0.0
      mamba_state_dim: 16
      mamba_head_dim: 64
      mamba_num_groups: 8

      # ---- parallelism (all model-parallel sizes are 1) ----
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      overlap_grad_reduce: true
      overlap_param_gather: true
      gradient_accumulation_fusion: false

      # ---- data: mock data enabled, no dataset paths configured ----
      mock_data: true
      train_data_path: null
      valid_data_path: null
      test_data_path: null

      # ---- checkpointing: nothing is loaded, no save path, last save disabled ----
      finetune: false
      auto_continue_train: false
      load: null
      no_load_optim: null
      no_load_rng: null
      save: null
      save_interval: 20000
      no_save_optim: null
      no_save_rng: null
      disable_last_saving: true
      ckpt_format: torch

      # ---- Primus Turbo: every Turbo switch is off ----
      # Author's note: Turbo may need to stay disabled for Mamba if it is not supported.
      enable_primus_turbo: false
      use_turbo_attention: false
      use_turbo_grouped_mlp: false

      # ---- cross-entropy flags (left commented out) ----
      # cross_entropy_fusion_impl: "native"
      # cross_entropy_loss_fusion: false

Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Hybrid Mamba + attention 2.8B pretraining experiment for the Megatron backend.
# Values written as ${VAR:default} are presumably resolved from the environment,
# with the text after ':' used as the fallback -- confirm against the config loader.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:mamba_hybrid_2.8B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
  pre_trainer:
    framework: megatron
    config: pre_trainer.yaml

    # model definition to load
    model: mamba_hybrid_2.8B.yaml

    # everything below overrides values coming from the files referenced above
    overrides:
      # ---- logging ----
      wandb_project: 'Primus_Mamba_Hybrid_Pretrain'
      stderr_sink_level: DEBUG

      # evaluation is switched off (zero eval iterations)
      eval_iters: 0

      log_avg_skip_iterations: 2
      log_avg_reset_interval: 50

      # ---- run length and batch sizes ----
      train_iters: 100
      micro_batch_size: 2
      global_batch_size: 128

      # ---- sequence ----
      seq_length: 4096
      max_position_embeddings: 4096

      # ---- optimizer / LR schedule ----
      # NOTE(review): lr_warmup_iters (200) exceeds train_iters (100), so this run
      # ends while still inside warmup -- presumably fine for a short benchmark
      # run; confirm before using this config for real training.
      lr: 2.0e-4
      min_lr: 2.0e-5
      lr_warmup_iters: 200
      lr_decay_iters: 10000
      lr_decay_style: cosine
      weight_decay: 0.1
      adam_beta1: 0.9
      adam_beta2: 0.95
      eod_mask_loss: true
      init_method_std: 0.02
      norm_epsilon: 1.0e-5

      # ---- Mamba layer spec (required): [module path, spec name] ----
      spec:
        - 'megatron.core.models.mamba.mamba_layer_specs'
        - 'mamba_stack_spec'

      # ---- tokenizer ----
      tokenizer_type: HuggingFaceTokenizer
      tokenizer_model: meta-llama/Llama-3.2-1B

      # ---- hybrid Mamba + attention parameters ----
      # hybrid mode is on with an attention ratio of 0.125 and an MLP ratio of 0.0
      is_hybrid_model: true
      hybrid_attention_ratio: 0.125
      hybrid_mlp_ratio: 0.0
      mamba_state_dim: 16
      mamba_head_dim: 64
      mamba_num_groups: 8

      # ---- parallelism (tensor parallel 2; pipeline and expert parallel 1) ----
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      overlap_grad_reduce: true
      overlap_param_gather: true
      gradient_accumulation_fusion: true

      # ---- data: mock data enabled, no dataset paths configured ----
      mock_data: true
      train_data_path: null
      valid_data_path: null
      test_data_path: null

      # ---- checkpointing: nothing is loaded, no save path, last save disabled ----
      finetune: false
      auto_continue_train: false
      load: null
      save: null
      save_interval: 10000
      disable_last_saving: true
      ckpt_format: torch

      # ---- Primus Turbo: every Turbo switch is off ----
      # Author's note: kept disabled for the Mamba layers; the attention layers
      # may benefit from Turbo, but use_turbo_attention is also off here.
      enable_primus_turbo: false
      use_turbo_attention: false
      use_turbo_grouped_mlp: false

2 changes: 2 additions & 0 deletions examples/megatron/configs/MI355X/deepseek_v2-pretrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ modules:
# pad 192/128 for deepseek attention
fused_padded_mla_attention: false

multi_latent_attention: true

# ckpt
finetune: false
auto_continue_train: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ modules:
# pad 192/128 for deepseek attention
fused_padded_mla_attention: false

multi_latent_attention: false
#multi_latent_attention: true

# ckpt
finetune: false
Expand All @@ -80,7 +80,7 @@ modules:
eval_iters: 0

# Turbo
enable_primus_turbo: true
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,6 @@ modules:
# pad 192/128 for deepseek attention
fused_padded_mla_attention: false

# Performance toggles
#multi_latent_attention: false
#apply_rope_fusion: true

# ckpt
finetune: false
Expand All @@ -82,7 +79,7 @@ modules:
eval_iters: 0

# Turbo
enable_primus_turbo: true
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false

Expand Down
1 change: 1 addition & 0 deletions examples/run_local_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ docker_podman_proxy run --rm \
--env TORCHTITAN_PATH \
--env MAXTEXT_PATH \
--env BACKEND_PATH \
--env REBUILD_PRIMUS_TURBO \
"${ENV_ARGS[@]}" \
--ipc=host --network=host \
--device=/dev/kfd --device=/dev/dri \
Expand Down
26 changes: 25 additions & 1 deletion examples/run_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,31 @@ export NVTE_USE_OPTIMIZED_HIPIFIED_CAST_TRANSPOSE=${NVTE_USE_OPTIMIZED_HIPIFIED_
export NVTE_CK_USES_BWD_V3=${NVTE_CK_USES_BWD_V3:-0}

# Note: Disable fp32 atomics if you find any accuracy issues.
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:0}
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=${PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32:-0}

# Optionally rebuild and install Primus-Turbo from source.
#   REBUILD_PRIMUS_TURBO=1    clone Primus-Turbo into /workspace/turbo and pip-install it
#   any other value (default 0) skips the rebuild
# Every step is checked: if the clone or build fails the script aborts instead of
# logging success and continuing with whatever Primus-Turbo is already installed.
export REBUILD_PRIMUS_TURBO=${REBUILD_PRIMUS_TURBO:-0}
if [ "$REBUILD_PRIMUS_TURBO" == "1" ]; then
    LOG_INFO "Rebuilding Primus Turbo from source..."
    mkdir -p "/workspace/turbo" || { LOG_INFO "ERROR: cannot create /workspace/turbo"; exit 1; }
    # Abort if the cd fails so the steps below never run in the wrong directory.
    cd "/workspace/turbo" || { LOG_INFO "ERROR: cannot enter /workspace/turbo"; exit 1; }

    # Clean up old directory if exists to avoid git clone conflicts
    if [ -d "Primus-Turbo" ]; then
        LOG_INFO "Removing existing Primus-Turbo directory..."
        rm -rf Primus-Turbo || { LOG_INFO "ERROR: cannot remove existing Primus-Turbo directory"; exit 1; }
    fi

    git clone https://github.com/AMD-AGI/Primus-Turbo.git --recursive \
        || { LOG_INFO "ERROR: git clone of Primus-Turbo failed"; exit 1; }
    cd Primus-Turbo || { LOG_INFO "ERROR: cannot enter Primus-Turbo checkout"; exit 1; }
    pip3 install -r requirements.txt \
        || { LOG_INFO "ERROR: installing Primus-Turbo requirements failed"; exit 1; }
    # Set GPU_ARCHS to compile Turbo for multiple AMD GPU architectures.
    GPU_ARCHS="gfx942;gfx950" pip3 install --no-build-isolation . \
        || { LOG_INFO "ERROR: building/installing Primus-Turbo failed"; exit 1; }
    # Return to the Primus checkout before the rest of the script continues.
    cd "${PRIMUS_PATH}" || { LOG_INFO "ERROR: cannot return to ${PRIMUS_PATH}"; exit 1; }
    LOG_INFO "Rebuilding Primus Turbo from source done."
else
    LOG_INFO "Skip Primus Turbo rebuild. REBUILD_PRIMUS_TURBO=$REBUILD_PRIMUS_TURBO"
fi

# nvte debug envs
export NVTE_DEBUG=0 # 0, 1
Expand Down
1 change: 1 addition & 0 deletions examples/run_slurm_pretrain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,6 @@ srun -N "${NNODES}" \
export NNODES=\${SLURM_NNODES}
export NODE_RANK=\${SLURM_PROCID}
export GPUS_PER_NODE=\${SLURM_GPUS_ON_NODE}
export REBUILD_PRIMUS_TURBO=\${REBUILD_PRIMUS_TURBO}
bash ${SCRIPT_DIR}/run_local_pretrain.sh \"\$@\" 2>&1 | tee ${LOG_FILE}
" bash "$@"
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ modules:
model: deepseek_v3_16b.yaml
overrides:
profiling:
enable_profiling: false
enable_profiling: true
save_traces_folder: "profile_trace"
profile_freq: 10
enable_memory_snapshot: false
save_memory_snapshot_folder: "memory_snapshot"

metrics:
log_freq: 10
log_freq: 1
disable_color_printing: false
enable_tensorboard: false
save_tb_folder: "tb"
Expand All @@ -38,11 +38,12 @@ modules:
min_lr_factor: 0.1

training:
debug_moe_force_load_balance: true
local_batch_size: 4
seq_len: 4096
max_norm: 1.0 # grad norm clipping
steps: 1000
dataset: "c4" # supported datasets: c4_test (2K), c4 (177M)
steps: 15
dataset: "c4_test" # supported datasets: c4_test (2K), c4 (177M)

parallelism:
data_parallel_replicate_degree: 1
Expand All @@ -69,12 +70,16 @@ modules:

compile:
enable: true
components: ["loss"] # ["model", "loss"]
components: ["model", "loss"] # ["model", "loss"]

primus_turbo:
enable_primus_turbo: true
use_turbo_mx_linear: false
use_turbo_float8_linear: true
enable_attention_float8: false

use_turbo_grouped_mm: true
use_moe_fp8: false

# quantize:
# linear:
# float8:
Expand Down
Loading