Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
80e2d26
initial commit
clairesonglee Dec 10, 2025
d23d79f
set self.lr_warmup_steps < self.lr_decay_steps
clairesonglee Dec 16, 2025
3381850
unwrap model to remove loss_mask parameter
clairesonglee Dec 18, 2025
277f3e1
add zebra-llama (hybrid mla mamba model) support
Mingyuyang-1 Jan 13, 2026
11b22c6
add Zebra-Llama 3B configurations
Mingyuyang-1 Jan 22, 2026
1dec95e
add Zebra-Llama 1B configs and remove unused configs
Mingyuyang-1 Jan 23, 2026
34508db
remove unused configs
Mingyuyang-1 Jan 23, 2026
2f3ab49
Set submodule mamba to track enable-primus-hybrid-models branch
clairesonglee Jan 29, 2026
159e441
set moe_layer_freq default value of 1
clairesonglee Feb 3, 2026
f798e77
set final_logit_softcapping and router_logit_softcapping to null
clairesonglee Feb 3, 2026
d7f4faf
use mamba builder
clairesonglee Feb 4, 2026
4e4c445
Merge branch 'main' into clairlee/dev/hybrid
clairesonglee Feb 4, 2026
d5bdbb8
adjust zebra-llama architecture and training
Mingyuyang-1 Feb 5, 2026
0fdeabb
Merge branch 'main' into clairlee/dev/hybrid
clairesonglee Feb 5, 2026
6bc8f60
Potential fix for pull request finding 'Unused local variable'
clairesonglee Feb 5, 2026
7e43595
code lint with pre-commit
clairesonglee Feb 6, 2026
a99a28f
[Docs] & [Feature]: Add Post-Training Documentation and Update Qwen3_…
kailashg26 Feb 19, 2026
564fa38
set grad_accum_fusion=false for triton 3.6.0 compatibility
clairesonglee Feb 12, 2026
1028e65
refactor(megatron): add tokenizer override patch, move to new arch (#…
HuangWei-95 Feb 5, 2026
2edff4e
Fix: Add model_type detection for mamba/hybrid models in core runtime
clairesonglee Feb 19, 2026
3ff8294
ci(deterministic): add env for megatron ci test (#539)
HuangWei-95 Feb 6, 2026
50b30e0
Update Docker base image from v25.10 to v26.1 (#534)
WangLingxun Feb 6, 2026
e2f5faa
merge with main
clairesonglee Feb 20, 2026
07e32b9
Merge main into clairlee/dev/hybrid: Resolve conflicts and preserve m…
clairesonglee Feb 20, 2026
07a5462
resolve unit test error
clairesonglee Feb 20, 2026
f4e79ed
use gpt model provider by default for compatibility
clairesonglee Feb 20, 2026
1808caa
Merge branch 'release/v26.2' into hybrid/release/v26.2
clairesonglee Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
[submodule "third_party/Megatron-Bridge"]
path = third_party/Megatron-Bridge
url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
[submodule "third_party/mamba"]
path = third_party/mamba
url = https://github.com/AndreasKaratzas/mamba.git
branch = enable-primus-hybrid-models
[submodule "third_party/HummingbirdXT"]
path = third_party/HummingbirdXT
url = https://github.com/AMD-AGI/HummingbirdXT.git
85 changes: 85 additions & 0 deletions examples/megatron/configs/MI300X/mamba_370M-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Primus pretraining experiment config: Mamba 370M on MI300X.
# NOTE(review): nesting indentation appears to have been stripped by extraction;
# keys under `modules:` / `pre_trainer:` / `overrides:` are presumably indented
# in the real file — confirm against the repository copy before reusing.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:mamba_370M-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

# Module wiring: the megatron pre_trainer consumes pre_trainer.yaml plus the
# model definition below, with everything under `overrides` applied on top.
modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: mamba_370M.yaml
overrides:
# log
wandb_project: "Primus_Mamba_Pretrain"
# disable_wandb: false
# disable_tensorboard: false
stderr_sink_level: DEBUG

# eval_iters: 0 disables evaluation passes entirely.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run (50 iterations) — looks like a smoke/benchmark config.
train_iters: 50
micro_batch_size: 4
global_batch_size: 256

seq_length: 2048
max_position_embeddings: 2048

# LR schedule.
# NOTE(review): lr_warmup_iters (50000) far exceeds train_iters (50), so this
# run ends while still deep in warmup; and lr_decay_iters (73192188) looks like
# a token/sample count rather than an iteration count — confirm both are
# intentional for this smoke-test config.
lr: 3.0e-4
min_lr: 0.0
lr_warmup_iters: 50000
lr_decay_iters: 73192188
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true
init_method_std: 0.02
norm_epsilon: 1.0e-5

# Mamba-specific: must provide spec
spec: ['megatron.core.models.mamba.mamba_layer_specs', 'mamba_stack_spec']

# Tokenizer
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: EleutherAI/gpt-neox-20b

# parallel
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
# Kept false — per this PR's commit history, fusion is disabled for
# Triton 3.6.0 compatibility.
gradient_accumulation_fusion: false

# data
# Synthetic data only; the three dataset paths are intentionally null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No checkpoint is loaded or saved: load/save are null, last-iteration saving
# is disabled, and save_interval (20000) exceeds train_iters anyway.
finetune: false
auto_continue_train: false
load: null
no_load_optim: null
no_load_rng: null
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
disable_last_saving: true
ckpt_format: torch

# Turbo - may need to disable for Mamba if not supported
enable_primus_turbo: false
use_turbo_attention: false
use_turbo_grouped_mlp: false

# Cross entropy flags
# cross_entropy_fusion_impl: "native"
# cross_entropy_loss_fusion: false
70 changes: 70 additions & 0 deletions examples/megatron/configs/MI300X/zebra_llama_1B-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Primus pretraining experiment config: Zebra-Llama 1B (hybrid Mamba + MLA) on MI300X.
# NOTE(review): nesting indentation appears to have been stripped by extraction;
# keys under `modules:` / `overrides:` are presumably indented in the real file — confirm.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:zebra_llama_1B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: zebra_llama_1B.yaml
overrides:
# log
wandb_project: "Primus_Zebra_Llama_1B_Pretrain"
stderr_sink_level: DEBUG

# eval_iters: 0 disables evaluation passes entirely.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run (100 iterations) — looks like a smoke/benchmark config.
train_iters: 100
micro_batch_size: 2
global_batch_size: 16

seq_length: 8192
max_position_embeddings: 8192
original_max_position_embeddings: 8192

# LR schedule.
# NOTE(review): train_iters (100) < lr_warmup_iters (200), so the run ends
# while still in warmup and the cosine decay never starts — confirm intended.
lr: 2.0e-4
min_lr: 2.0e-5
lr_warmup_iters: 200
lr_decay_iters: 10000
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true

# Mamba-specific: must provide spec
# Use custom hybrid Mamba+MLA spec
spec: ['primus.backends.megatron.core.models.hybrid.hybrid_mamba_mla_layer_specs', 'hybrid_stack_spec']

# Tokenizer
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: meta-llama/Llama-3.2-1B

# parallel
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
# Kept false — per this PR's commit history, fusion is disabled for
# Triton 3.6.0 compatibility.
gradient_accumulation_fusion: false

# data
# Synthetic data only; the three dataset paths are intentionally null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No checkpoint is loaded or saved: load/save are null, last-iteration saving
# is disabled, and save_interval (10000) exceeds train_iters anyway.
finetune: false
auto_continue_train: false
load: null
save: null
save_interval: 10000
disable_last_saving: true
ckpt_format: torch
70 changes: 70 additions & 0 deletions examples/megatron/configs/MI300X/zebra_llama_3B-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Primus pretraining experiment config: Zebra-Llama 3B (hybrid Mamba + MLA) on MI300X.
# NOTE(review): nesting indentation appears to have been stripped by extraction;
# keys under `modules:` / `overrides:` are presumably indented in the real file — confirm.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:zebra_llama_3B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: zebra_llama_3B.yaml
overrides:
# log
wandb_project: "Primus_Zebra_Llama_3B_Pretrain"
stderr_sink_level: DEBUG

# eval_iters: 0 disables evaluation passes entirely.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run (100 iterations) — looks like a smoke/benchmark config.
train_iters: 100
micro_batch_size: 2
global_batch_size: 16

seq_length: 8192
max_position_embeddings: 8192
original_max_position_embeddings: 8192

# LR schedule.
# NOTE(review): train_iters (100) < lr_warmup_iters (200), so the run ends
# while still in warmup and the cosine decay never starts — confirm intended.
lr: 2.0e-4
min_lr: 2.0e-5
lr_warmup_iters: 200
lr_decay_iters: 10000
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true

# Mamba-specific: must provide spec
# Use custom hybrid Mamba+MLA spec
spec: ['primus.backends.megatron.core.models.hybrid.hybrid_mamba_mla_layer_specs', 'hybrid_stack_spec']

# Tokenizer
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: meta-llama/Llama-3.2-3B

# parallel
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
# Kept false — per this PR's commit history, fusion is disabled for
# Triton 3.6.0 compatibility.
gradient_accumulation_fusion: false

# data
# Synthetic data only; the three dataset paths are intentionally null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No checkpoint is loaded or saved: load/save are null, last-iteration saving
# is disabled, and save_interval (10000) exceeds train_iters anyway.
finetune: false
auto_continue_train: false
load: null
save: null
save_interval: 10000
disable_last_saving: true
ckpt_format: torch
70 changes: 70 additions & 0 deletions examples/megatron/configs/MI300X/zebra_llama_8B-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Primus pretraining experiment config: Zebra-Llama 8B (hybrid Mamba + MLA) on MI300X.
# NOTE(review): nesting indentation appears to have been stripped by extraction;
# keys under `modules:` / `overrides:` are presumably indented in the real file — confirm.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:zebra_llama_8B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: zebra_llama_8B.yaml
overrides:
# log
wandb_project: "Primus_Zebra_Llama_8B_Pretrain"
stderr_sink_level: DEBUG

# eval_iters: 0 disables evaluation passes entirely.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run (100 iterations) — looks like a smoke/benchmark config.
train_iters: 100
micro_batch_size: 2
global_batch_size: 16

seq_length: 8192
max_position_embeddings: 8192
original_max_position_embeddings: 8192

# LR schedule.
# NOTE(review): train_iters (100) < lr_warmup_iters (200), so the run ends
# while still in warmup and the cosine decay never starts — confirm intended.
lr: 2.0e-4
min_lr: 2.0e-5
lr_warmup_iters: 200
lr_decay_iters: 10000
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true

# Mamba-specific: must provide spec
# Use custom hybrid Mamba+MLA spec
spec: ['primus.backends.megatron.core.models.hybrid.hybrid_mamba_mla_layer_specs', 'hybrid_stack_spec']

# Tokenizer
# Note: 8B uses the Llama 3.1 tokenizer (1B/3B configs use Llama 3.2).
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: meta-llama/Llama-3.1-8B

# parallel
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
# Kept false — per this PR's commit history, fusion is disabled for
# Triton 3.6.0 compatibility.
gradient_accumulation_fusion: false

# data
# Synthetic data only; the three dataset paths are intentionally null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No checkpoint is loaded or saved: load/save are null, last-iteration saving
# is disabled, and save_interval (10000) exceeds train_iters anyway.
finetune: false
auto_continue_train: false
load: null
save: null
save_interval: 10000
disable_last_saving: true
ckpt_format: torch
70 changes: 70 additions & 0 deletions examples/megatron/configs/MI355X/zebra_llama_1B-pretrain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Primus pretraining experiment config: Zebra-Llama 1B (hybrid Mamba + MLA) on MI355X.
# Differs from the MI300X 1B config only in batch sizing
# (micro 16 / global 128 here vs micro 2 / global 16 there).
# NOTE(review): nesting indentation appears to have been stripped by extraction;
# keys under `modules:` / `overrides:` are presumably indented in the real file — confirm.
work_group: ${PRIMUS_TEAM:amd}
user_name: ${PRIMUS_USER:root}
exp_name: ${PRIMUS_EXP_NAME:zebra_llama_1B-pretrain}
workspace: ${PRIMUS_WORKSPACE:./output}

modules:
pre_trainer:
framework: megatron
config: pre_trainer.yaml

# model to run
model: zebra_llama_1B.yaml
overrides:
# log
wandb_project: "Primus_Zebra_Llama_1B_Pretrain"
stderr_sink_level: DEBUG

# eval_iters: 0 disables evaluation passes entirely.
eval_iters: 0

log_avg_skip_iterations: 2
log_avg_reset_interval: 50

# Short run (100 iterations) — looks like a smoke/benchmark config.
train_iters: 100
micro_batch_size: 16
global_batch_size: 128

seq_length: 8192
max_position_embeddings: 8192
original_max_position_embeddings: 8192

# LR schedule.
# NOTE(review): train_iters (100) < lr_warmup_iters (200), so the run ends
# while still in warmup and the cosine decay never starts — confirm intended.
lr: 2.0e-4
min_lr: 2.0e-5
lr_warmup_iters: 200
lr_decay_iters: 10000
lr_decay_style: cosine
weight_decay: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
eod_mask_loss: true

# Mamba-specific: must provide spec
# Use custom hybrid Mamba+MLA spec
spec: ['primus.backends.megatron.core.models.hybrid.hybrid_mamba_mla_layer_specs', 'hybrid_stack_spec']

# Tokenizer
tokenizer_type: HuggingFaceTokenizer
tokenizer_model: meta-llama/Llama-3.2-1B

# parallel
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
overlap_grad_reduce: true
overlap_param_gather: true
# Kept false — per this PR's commit history, fusion is disabled for
# Triton 3.6.0 compatibility.
gradient_accumulation_fusion: false

# data
# Synthetic data only; the three dataset paths are intentionally null.
mock_data: true
train_data_path: null
valid_data_path: null
test_data_path: null

# ckpt
# No checkpoint is loaded or saved: load/save are null, last-iteration saving
# is disabled, and save_interval (10000) exceeds train_iters anyway.
finetune: false
auto_continue_train: false
load: null
save: null
save_interval: 10000
disable_last_saving: true
ckpt_format: torch
Loading
Loading