From a66e02088415354074c25bc4a7f6564736f1e398 Mon Sep 17 00:00:00 2001 From: rkteddy <838474413@qq.com> Date: Sat, 28 Feb 2026 22:41:24 -0500 Subject: [PATCH] Fix upcycling state dict conversion for mixed dense/MoE models --- megatron/core/transformer/moe/upcycling_utils.py | 10 ++++++++-- megatron/training/training.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/moe/upcycling_utils.py b/megatron/core/transformer/moe/upcycling_utils.py index 4f9d2b7d835..72d1ffc0163 100644 --- a/megatron/core/transformer/moe/upcycling_utils.py +++ b/megatron/core/transformer/moe/upcycling_utils.py @@ -218,7 +218,10 @@ def _convert_key_value( value = src_dict[key] new_value = value_process_func(value) new_key = key.replace(key_replace_old, key_replace_new) - dist_dict[new_key] = new_value.clone() if hasattr(new_value, 'clone') else new_value + if ( + new_key in dist_dict + ): # Skip keys absent in target (e.g. dense layers in mixed dense/MoE) + dist_dict[new_key] = new_value.clone() if hasattr(new_value, 'clone') else new_value return _convert_key_value( @@ -257,7 +260,10 @@ def _expand_key_value( params = value_process_func(param) for idx in range(num_local_experts): new_key = key.replace(key_replace_old, key_replace_new).format(idx) - dist_dict[new_key] = params[ep_rank * num_local_experts + idx] + if ( + new_key in dist_dict + ): # Skip keys absent in target (e.g. dense layers in mixed dense/MoE) + dist_dict[new_key] = params[ep_rank * num_local_experts + idx] return if experts_type == ExpertsType.SequentialMLP: diff --git a/megatron/training/training.py b/megatron/training/training.py index 380a443f149..be4aa8dc12f 100644 --- a/megatron/training/training.py +++ b/megatron/training/training.py @@ -1569,7 +1569,7 @@ def setup_model_and_optimizer( unwrapped_model, dense_model_for_upcycling, load_kwargs={ - 'model': dense_model_for_upcycling, + 'ddp_model': dense_model_for_upcycling, 'optimizer': None, 'opt_param_scheduler': None, },