Commit eb089b3

support ernie4.5-fp4

1 parent: c329d92

File tree: 4 files changed (+10, −11 lines)

fastdeploy/model_executor/layers/quantization/nvfp4.py (4 additions, 0 deletions)

```diff
@@ -57,6 +57,7 @@ def __init__(
         kv_cache_quant_algo: str | None,
         exclude_modules: list[str],
         group_size: int = 16,
+        is_checkpoint_bf16: bool = False,
     ) -> None:
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
         if is_checkpoint_nvfp4_serialized:
@@ -72,6 +73,7 @@ def __init__(
         self.quant_max_bound = 6
         self.quant_min_bound = -6
         self.quant_round_type = 1
+        self.is_checkpoint_bf16 = is_checkpoint_bf16

     def name(self) -> str:
         return "modelopt_fp4"
@@ -406,6 +408,8 @@ def __init__(self, quant_config: ModelOptNvFp4Config):
         if self.backend == "none":
             raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.")

+        logger.info(f"Using {self.backend} for NVFP4 FusedMoE")
+
     def create_weights(self, layer, **extra_weight_attrs):
         """
         Triton MoE create weight process.
```
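The new `is_checkpoint_bf16` flag marks checkpoints whose weights are stored in bf16 rather than pre-serialized NVFP4, so they presumably need online quantization after loading. Below is a minimal sketch of how such a flag could be consumed; the stripped-down config and the `needs_online_quantization` helper are assumptions for illustration, not FastDeploy's actual code path:

```python
class ModelOptNvFp4Config:
    """Stripped-down stand-in for the config in nvfp4.py (fields from the diff)."""

    def __init__(
        self,
        is_checkpoint_nvfp4_serialized: bool,
        group_size: int = 16,
        is_checkpoint_bf16: bool = False,
    ) -> None:
        # True when the checkpoint already stores NVFP4 weights and scales.
        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
        self.group_size = group_size
        # New flag: True when the checkpoint stores plain bf16 weights.
        self.is_checkpoint_bf16 = is_checkpoint_bf16


def needs_online_quantization(cfg: ModelOptNvFp4Config) -> bool:
    # Hypothetical helper: a bf16 checkpoint carries no precomputed NVFP4
    # scales, so its weights would have to be quantized at load time.
    return cfg.is_checkpoint_bf16 and not cfg.is_checkpoint_nvfp4_serialized


cfg = ModelOptNvFp4Config(is_checkpoint_nvfp4_serialized=False, is_checkpoint_bf16=True)
assert needs_online_quantization(cfg)
```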

fastdeploy/model_executor/models/ernie4_5_moe.py (4 additions, 8 deletions)

```diff
@@ -17,7 +17,6 @@
 from __future__ import annotations

 import inspect
-import re
 from functools import partial
 from typing import Dict, Union

@@ -543,7 +542,6 @@ def load_weights(self, weights_iterator) -> None:

         from fastdeploy.model_executor.utils import (
             default_weight_loader,
-            process_weights_after_loading,
             rename_offline_ckpt_suffix_to_fd_suffix,
         )

@@ -590,8 +588,6 @@ def load_weights(self, weights_iterator) -> None:
         )
         params_dict = dict(self.named_parameters())

-        process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
-
         for loaded_weight_name, loaded_weight in weights_iterator:
             loaded_weight_name = loaded_weight_name.replace("model", "ernie")
             for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping:
@@ -620,10 +616,10 @@ def load_weights(self, weights_iterator) -> None:
                 else:
                     weight_loader(param, loaded_weight, shard_id)

-                model_sublayer_name = re.sub(
-                    r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name
-                )
-                process_weights_after_loading_fn(model_sublayer_name, param)
+        for name, sublayer in self.named_sublayers():
+            quant_method = getattr(sublayer, "quant_method", None)
+            if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
+                quant_method.process_weights_after_loading(sublayer)

         if self.tie_word_embeddings:
             self.lm_head.load_state_dict({self.lm_head.weight_key: self.ernie.embed_tokens.embeddings.weight})
```
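The load path no longer maps each parameter name back to its sublayer with a regex; instead, once the weight loop finishes, it sweeps `named_sublayers()` once and invokes `process_weights_after_loading` on any quant method that defines it. A self-contained sketch of that duck-typed sweep, with stand-in classes (`Sublayer`, `NvFp4QuantMethod`, `NoopQuantMethod`) in place of the real Paddle layers:

```python
class NvFp4QuantMethod:
    def process_weights_after_loading(self, layer) -> None:
        # e.g. repack / requantize layer weights now that every shard is loaded
        print(f"finalizing {layer.name}")


class NoopQuantMethod:
    pass  # defines no hook, so the sweep skips this layer


class Sublayer:
    def __init__(self, name: str, quant_method=None):
        self.name = name
        self.quant_method = quant_method


def finalize_weights(sublayers) -> None:
    # Duck-typed dispatch, mirroring the loop added in the diff: run the hook
    # only where the layer's quant method actually defines it.
    for layer in sublayers:
        quant_method = getattr(layer, "quant_method", None)
        if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
            quant_method.process_weights_after_loading(layer)


finalize_weights([Sublayer("moe.down_proj", NvFp4QuantMethod()), Sublayer("embed_tokens", NoopQuantMethod())])
```

Sweeping sublayers once decouples finalization from the parameter-name regex, so weight suffixes such as `cache_k_scale` and `cache_v_scale` no longer need to be enumerated in the loader.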

fastdeploy/model_executor/models/qwen3moe.py (0 additions, 3 deletions)

```diff
@@ -392,7 +392,6 @@ def load_weights(self, weights_iterator) -> None:
         ]
         expert_params_mapping = self.get_expert_mapping()
         params_dict = dict(self.named_parameters())
-        # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
         for loaded_weight_name, loaded_weight in weights_iterator:
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
@@ -426,8 +425,6 @@ def load_weights(self, weights_iterator) -> None:
                 weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
                 weight_loader(param, loaded_weight)

-                # model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name)
-                # process_weights_after_loading_fn(model_sublayer_name, param)
         for name, sublayer in self.named_sublayers():
             quant_method = getattr(sublayer, "quant_method", None)
             if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
```

fastdeploy/model_executor/utils.py (2 additions, 0 deletions)

```diff
@@ -371,6 +371,8 @@ def fn(loaded_weight_name, is_moe):
         # Can be extended to other offline quantization suffixes if needed.
         if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
             fd_suffix_map = fp8_suffix_map
+        else:
+            fd_suffix_map = {}
         for ckpt_suffix, fd_suffix in fd_suffix_map.items():
             if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
                 loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)
```
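The added `else` guarantees `fd_suffix_map` is always bound: for checkpoints that are not `block_wise_fp8` (such as the new bf16/NVFP4 path), the rename loop now iterates an empty map and leaves names untouched, instead of hitting an unbound or stale `fd_suffix_map`. A minimal reproduction of the fixed control flow; the contents of `fp8_suffix_map` here are a hypothetical example, not the real mapping:

```python
import re

# Hypothetical mapping for illustration; the real fp8_suffix_map lives in
# fastdeploy/model_executor/utils.py and may differ.
fp8_suffix_map = {"weight_scale_inv": "weight_scale"}


def rename_offline_ckpt_suffix(
    loaded_weight_name: str, is_moe: bool, moe_quant_type: str, dense_quant_type: str
) -> str:
    if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
        fd_suffix_map = fp8_suffix_map
    else:
        # The fix: bind an empty map so the loop below is a no-op for
        # non-fp8 checkpoints rather than raising NameError.
        fd_suffix_map = {}
    for ckpt_suffix, fd_suffix in fd_suffix_map.items():
        if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
            loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)
    return loaded_weight_name


# fp8 checkpoint: suffix is renamed; other checkpoints: name passes through.
assert rename_offline_ckpt_suffix("layers.0.weight_scale_inv", False, "", "block_wise_fp8") == "layers.0.weight_scale"
assert rename_offline_ckpt_suffix("layers.0.weight_scale_inv", False, "", "none") == "layers.0.weight_scale_inv"
```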
