Commit eb089b3

support ernie4.5-fp4

1 parent: c329d92

File tree: 4 files changed (+10, −11 lines)

fastdeploy/model_executor/layers/quantization/nvfp4.py (4 additions, 0 deletions)

```diff
@@ -57,6 +57,7 @@ def __init__(
         kv_cache_quant_algo: str | None,
         exclude_modules: list[str],
         group_size: int = 16,
+        is_checkpoint_bf16: bool = False,
     ) -> None:
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
         if is_checkpoint_nvfp4_serialized:
@@ -72,6 +73,7 @@ def __init__(
         self.quant_max_bound = 6
         self.quant_min_bound = -6
         self.quant_round_type = 1
+        self.is_checkpoint_bf16 = is_checkpoint_bf16

     def name(self) -> str:
         return "modelopt_fp4"
@@ -406,6 +408,8 @@ def __init__(self, quant_config: ModelOptNvFp4Config):
         if self.backend == "none":
             raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.")

+        logger.info(f"Using {self.backend} for NVFP4 FusedMoE")
+
     def create_weights(self, layer, **extra_weight_attrs):
         """
         Triton MoE create weight process.
```
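The new `is_checkpoint_bf16` flag marks checkpoints whose weights are stored in bf16 rather than pre-serialized NVFP4, so they presumably need online quantization after loading. Below is a minimal sketch of how such a flag could be consumed; the stripped-down config and the `needs_online_quantization` helper are assumptions for illustration, not FastDeploy's actual code path:

```python
class ModelOptNvFp4Config:
    """Stripped-down stand-in for the config in nvfp4.py (fields from the diff)."""

    def __init__(
        self,
        is_checkpoint_nvfp4_serialized: bool,
        group_size: int = 16,
        is_checkpoint_bf16: bool = False,
    ) -> None:
        # True when the checkpoint already stores NVFP4 weights and scales.
        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
        self.group_size = group_size
        # New flag: True when the checkpoint stores plain bf16 weights.
        self.is_checkpoint_bf16 = is_checkpoint_bf16


def needs_online_quantization(cfg: ModelOptNvFp4Config) -> bool:
    # Hypothetical helper: a bf16 checkpoint carries no precomputed NVFP4
    # scales, so its weights would have to be quantized at load time.
    return cfg.is_checkpoint_bf16 and not cfg.is_checkpoint_nvfp4_serialized


cfg = ModelOptNvFp4Config(is_checkpoint_nvfp4_serialized=False, is_checkpoint_bf16=True)
assert needs_online_quantization(cfg)
```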

fastdeploy/model_executor/models/ernie4_5_moe.py (4 additions, 8 deletions)

```diff
@@ -17,7 +17,6 @@
 from __future__ import annotations

 import inspect
-import re
 from functools import partial
 from typing import Dict, Union

@@ -543,7 +542,6 @@ def load_weights(self, weights_iterator) -> None:

         from fastdeploy.model_executor.utils import (
             default_weight_loader,
-            process_weights_after_loading,
             rename_offline_ckpt_suffix_to_fd_suffix,
         )

@@ -590,8 +588,6 @@ def load_weights(self, weights_iterator) -> None:
         )
         params_dict = dict(self.named_parameters())

-        process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
-
         for loaded_weight_name, loaded_weight in weights_iterator:
             loaded_weight_name = loaded_weight_name.replace("model", "ernie")
             for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping:
@@ -620,10 +616,10 @@ def load_weights(self, weights_iterator) -> None:
                 else:
                     weight_loader(param, loaded_weight, shard_id)

-                model_sublayer_name = re.sub(
-                    r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name
-                )
-                process_weights_after_loading_fn(model_sublayer_name, param)
+        for name, sublayer in self.named_sublayers():
+            quant_method = getattr(sublayer, "quant_method", None)
+            if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
+                quant_method.process_weights_after_loading(sublayer)

         if self.tie_word_embeddings:
             self.lm_head.load_state_dict({self.lm_head.weight_key: self.ernie.embed_tokens.embeddings.weight})
```
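The load path no longer maps each parameter name back to its sublayer with a regex; instead, once the weight loop finishes, it sweeps `named_sublayers()` once and invokes `process_weights_after_loading` on any quant method that defines it. A self-contained sketch of that duck-typed sweep, with stand-in classes (`Sublayer`, `NvFp4QuantMethod`, `NoopQuantMethod`) in place of the real Paddle layers:

```python
class NvFp4QuantMethod:
    def process_weights_after_loading(self, layer) -> None:
        # e.g. repack / requantize layer weights now that every shard is loaded
        print(f"finalizing {layer.name}")


class NoopQuantMethod:
    pass  # defines no hook, so the sweep skips this layer


class Sublayer:
    def __init__(self, name: str, quant_method=None):
        self.name = name
        self.quant_method = quant_method


def finalize_weights(sublayers) -> None:
    # Duck-typed dispatch, mirroring the loop added in the diff: run the hook
    # only where the layer's quant method actually defines it.
    for layer in sublayers:
        quant_method = getattr(layer, "quant_method", None)
        if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
            quant_method.process_weights_after_loading(layer)


finalize_weights([Sublayer("moe.down_proj", NvFp4QuantMethod()), Sublayer("embed_tokens", NoopQuantMethod())])
```

Sweeping sublayers once decouples finalization from the parameter-name regex, so weight suffixes such as `cache_k_scale` and `cache_v_scale` no longer need to be enumerated in the loader.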

fastdeploy/model_executor/models/qwen3moe.py (0 additions, 3 deletions)

```diff
@@ -392,7 +392,6 @@ def load_weights(self, weights_iterator) -> None:
         ]
         expert_params_mapping = self.get_expert_mapping()
         params_dict = dict(self.named_parameters())
-        # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
         for loaded_weight_name, loaded_weight in weights_iterator:
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
@@ -426,8 +425,6 @@ def load_weights(self, weights_iterator) -> None:
                 weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
                 weight_loader(param, loaded_weight)

-                # model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name)
-                # process_weights_after_loading_fn(model_sublayer_name, param)
         for name, sublayer in self.named_sublayers():
             quant_method = getattr(sublayer, "quant_method", None)
             if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"):
```

fastdeploy/model_executor/utils.py (2 additions, 0 deletions)

```diff
@@ -371,6 +371,8 @@ def fn(loaded_weight_name, is_moe):
         # Can be extended to other offline quantization suffixes if needed.
         if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
             fd_suffix_map = fp8_suffix_map
+        else:
+            fd_suffix_map = {}
         for ckpt_suffix, fd_suffix in fd_suffix_map.items():
             if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
                 loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)
```
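The added `else` guarantees `fd_suffix_map` is always bound: for checkpoints that are not `block_wise_fp8` (such as the new bf16/NVFP4 path), the rename loop now iterates an empty map and leaves names untouched, instead of hitting an unbound or stale `fd_suffix_map`. A minimal reproduction of the fixed control flow; the contents of `fp8_suffix_map` here are a hypothetical example, not the real mapping:

```python
import re

# Hypothetical mapping for illustration; the real fp8_suffix_map lives in
# fastdeploy/model_executor/utils.py and may differ.
fp8_suffix_map = {"weight_scale_inv": "weight_scale"}


def rename_offline_ckpt_suffix(
    loaded_weight_name: str, is_moe: bool, moe_quant_type: str, dense_quant_type: str
) -> str:
    if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
        fd_suffix_map = fp8_suffix_map
    else:
        # The fix: bind an empty map so the loop below is a no-op for
        # non-fp8 checkpoints rather than raising NameError.
        fd_suffix_map = {}
    for ckpt_suffix, fd_suffix in fd_suffix_map.items():
        if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
            loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)
    return loaded_weight_name


# fp8 checkpoint: suffix is renamed; other checkpoints: name passes through.
assert rename_offline_ckpt_suffix("layers.0.weight_scale_inv", False, "", "block_wise_fp8") == "layers.0.weight_scale"
assert rename_offline_ckpt_suffix("layers.0.weight_scale_inv", False, "", "none") == "layers.0.weight_scale_inv"
```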
