From 077b2ec52f78a306c8ecce44139010f563e552c4 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 11:29:35 +0000
Subject: [PATCH 1/7] add v1 support for bf16

---
 .../layers/backends/xpu/moe/fused_moe.py | 161 +++++++++++++-----
 fastdeploy/model_executor/utils.py       |   2 +-
 2 files changed, 121 insertions(+), 42 deletions(-)

diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index 540988266d1..9675a637ba8 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -29,6 +29,8 @@
     weight_quantize_xpu,
     xpu_moe_layer,
 )
+from fastdeploy.model_executor.utils import default_weight_loader, set_weight_attrs
+from fastdeploy.platforms import current_platform
 
 
 class XPUMoEMethod(MoEMethodBase):
@@ -61,78 +63,155 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         create weight process.
         """
-        self.up_gate_proj_weight_shape = [
-            layer.num_local_experts,
-            layer.moe_intermediate_size * 2,
-            layer.hidden_size,
-        ]
-        self.down_proj_weight_shape = [
-            layer.num_local_experts,
-            layer.hidden_size,
-            layer.moe_intermediate_size,
-        ]
-        if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
-            self.up_gate_proj_weight_shape[-1] //= 2
-            self.down_proj_weight_shape[-1] //= 2
-
-        setattr(
-            layer,
-            self.added_weight_attrs[0],
-            layer.create_parameter(
+        if layer.fd_config.load_config.load_choices == "default_v1" and self.quant_config.is_checkpoint_bf16:
+            if current_platform.is_cuda():
+                self.up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                    layer.moe_intermediate_size * 2,
+                ]
+                self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
+                extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
+            else:
+                self.up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                    layer.hidden_size,
+                ]
+                self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
+                extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
+
+            layer.up_gate_proj_weight = layer.create_parameter(
                 shape=self.up_gate_proj_weight_shape,
-                dtype=self.weight_dtype,
+                dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
-        setattr(
-            layer,
-            self.added_weight_attrs[1],
-            layer.create_parameter(
+            )
+
+            layer.down_proj_weight = layer.create_parameter(
                 shape=self.down_proj_weight_shape,
-                dtype=self.weight_dtype,
+                dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
+            )
 
-        if self.moe_quant_type in ["weight_only_int8", "w8a8", "weight_only_int4", "w4a8"]:
-            self.up_gate_proj_scale_shape = [
+            set_weight_attrs(
+                layer.up_gate_proj_weight,
+                {
+                    "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
+                    "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                },
+            )
+            set_weight_attrs(
+                layer.down_proj_weight,
+                {
+                    "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
+                    "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                },
+            )
+
+            if layer.with_bias:
+                layer.up_gate_proj_bias = layer.create_parameter(
+                    shape=[layer.num_experts, layer.moe_intermediate_size * 2],
+                    dtype=layer.weight_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                )
+
+                layer.down_proj_bias = layer.create_parameter(
+                    shape=[layer.num_experts, layer.hidden_size],
+                    dtype=layer.weight_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                )
+                set_weight_attrs(
+                    layer.up_gate_proj_bias,
+                    {
+                        "weight_loader": extra_weight_attrs.get(
+                            "weight_loader", default_weight_loader(layer.fd_config)
+                        ),
+                        "model_format": extra_weight_attrs.get("model_format", ""),
+                    },
+                )
+                set_weight_attrs(
+                    layer.down_proj_bias,
+                    {
+                        "weight_loader": extra_weight_attrs.get(
+                            "weight_loader", default_weight_loader(layer.fd_config)
+                        ),
+                        "model_format": extra_weight_attrs.get("model_format", ""),
+                    },
+                )
+
+        else:
+            self.up_gate_proj_weight_shape = [
                 layer.num_local_experts,
                 layer.moe_intermediate_size * 2,
+                layer.hidden_size,
             ]
-            self.down_proj_scale_shape = [
+            self.down_proj_weight_shape = [
                 layer.num_local_experts,
                 layer.hidden_size,
+                layer.moe_intermediate_size,
             ]
+            if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
+                self.up_gate_proj_weight_shape[-1] //= 2
+                self.down_proj_weight_shape[-1] //= 2
+
             setattr(
                 layer,
-                self.added_scale_attrs[0],
+                self.added_weight_attrs[0],
                 layer.create_parameter(
-                    shape=self.up_gate_proj_scale_shape,
-                    dtype=self.scale_dtype,
+                    shape=self.up_gate_proj_weight_shape,
+                    dtype=self.weight_dtype,
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
             setattr(
                 layer,
-                self.added_scale_attrs[1],
+                self.added_weight_attrs[1],
                 layer.create_parameter(
-                    shape=self.down_proj_scale_shape,
-                    dtype=self.scale_dtype,
+                    shape=self.down_proj_weight_shape,
+                    dtype=self.weight_dtype,
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
 
-        if self.moe_quant_type in ["w8a8", "w4a8"]:
-            for in_scale_name in self.added_in_scale_attrs:
+            if self.moe_quant_type in ["weight_only_int8", "w8a8", "weight_only_int4", "w4a8"]:
+                self.up_gate_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                ]
+                self.down_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                ]
                 setattr(
                     layer,
-                    in_scale_name,
+                    self.added_scale_attrs[0],
                     layer.create_parameter(
-                        shape=[layer.num_local_experts],
+                        shape=self.up_gate_proj_scale_shape,
                         dtype=self.scale_dtype,
                         default_initializer=paddle.nn.initializer.Constant(0),
                     ),
                 )
+                setattr(
+                    layer,
+                    self.added_scale_attrs[1],
+                    layer.create_parameter(
+                        shape=self.down_proj_scale_shape,
+                        dtype=self.scale_dtype,
+                        default_initializer=paddle.nn.initializer.Constant(0),
+                    ),
+                )
+
+            if self.moe_quant_type in ["w8a8", "w4a8"]:
+                for in_scale_name in self.added_in_scale_attrs:
+                    setattr(
+                        layer,
+                        in_scale_name,
+                        layer.create_parameter(
+                            shape=[layer.num_local_experts],
+                            dtype=self.scale_dtype,
+                            default_initializer=paddle.nn.initializer.Constant(0),
+                        ),
+                    )
 
     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index 4ef3f6e451a..cf481da4ea4 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -261,7 +261,7 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not current_platform.is_cuda():
+    if not current_platform.is_cuda() or not current_platform.is_xpu():
         _err_msg("v1loader currently does not support backends other than CUDA")
         return False
 
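PATCH 1 registers a SHARD_ID_TO_SHARDED_DIM mapping whose values differ between the CUDA and XPU branches. In both layouts the axis that tensor parallelism slices is the moe_intermediate_size axis; only its position inside the per-expert matrix changes, because the two layouts store the expert matrices transposed relative to each other. Below is a minimal NumPy sketch of that invariant, not FastDeploy loader code: tp, rank, and the shard helper are illustrative assumptions, the dim values are read as indexing the per-expert matrix (skipping the leading expert axis), and the fused gate/up axis is sliced as one block for brevity.

# Illustrative sketch only -- NumPy stand-in for the real weight loader.
import numpy as np

num_local_experts, hidden, inter, tp, rank = 2, 8, 16, 4, 0

# CUDA-style layout from the patch: up_gate_proj [E, hidden, inter * 2],
# down_proj [E, inter, hidden]; SHARD_ID_TO_SHARDED_DIM = {gate: 1, down: 0, up: 1}.
up_gate_cuda = np.zeros((num_local_experts, hidden, inter * 2))
down_cuda = np.zeros((num_local_experts, inter, hidden))

# XPU-style layout: per-expert matrices transposed, so the sharded dim flips:
# up_gate_proj [E, inter * 2, hidden], down_proj [E, hidden, inter];
# SHARD_ID_TO_SHARDED_DIM = {gate: 0, down: 1, up: 0}.
up_gate_xpu = np.zeros((num_local_experts, inter * 2, hidden))
down_xpu = np.zeros((num_local_experts, hidden, inter))

def shard(w: np.ndarray, dim: int, tp: int, rank: int) -> np.ndarray:
    """Keep this rank's 1/tp slice of per-expert axis `dim` (axis dim + 1 overall)."""
    n = w.shape[dim + 1] // tp
    return np.take(w, range(rank * n, (rank + 1) * n), axis=dim + 1)

# Whichever layout is used, it is the intermediate axis that gets split across ranks:
assert shard(up_gate_cuda, 1, tp, rank).shape == (num_local_experts, hidden, inter * 2 // tp)
assert shard(up_gate_xpu, 0, tp, rank).shape == (num_local_experts, inter * 2 // tp, hidden)
assert shard(down_cuda, 0, tp, rank).shape == (num_local_experts, inter // tp, hidden)
assert shard(down_xpu, 1, tp, rank).shape == (num_local_experts, hidden, inter // tp)
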
From 19af29266f03212a27d48c5c3bed1487421bfcc8 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 11:47:12 +0000
Subject: [PATCH 2/7] update

---
 fastdeploy/model_executor/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index cf481da4ea4..97aff592a67 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -261,8 +261,8 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not current_platform.is_cuda() or not current_platform.is_xpu():
-        _err_msg("v1loader currently does not support backends other than CUDA")
+    if not (current_platform.is_cuda() and current_platform.is_xpu()):
+        _err_msg("v1 loader currently only supports the GPU and XPU backends")
         return False
 
     if is_pre_sliced_weight(fd_config.model_config.model):
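PATCH 2 replaces one always-true guard with another: a platform is never CUDA and XPU at once, so `not (A and B)` behaves the same on real platforms as PATCH 1's `not A or not B` (the two are De Morgan equivalents). PATCH 3 and PATCH 5 below keep iterating; the intended predicate is "neither CUDA nor XPU", i.e. `not (A or B)`. A self-contained truth-table check (plain Python, nothing imported from FastDeploy):

def patch1(cuda, xpu):   # not A or not B: true on every real platform
    return not cuda or not xpu

def patch2(cuda, xpu):   # not (A and B): De Morgan-equal to patch1
    return not (cuda and xpu)

def patch3(cuda, xpu):   # not (A or not B): right for CUDA, wrong for XPU and others
    return not (cuda or not xpu)

def patch5(cuda, xpu):   # not (A or B): the intended "neither CUDA nor XPU"
    return not (cuda or xpu)

print("cuda  xpu   | p1 p2 p3 p5   (True = fall back to the v0 loader)")
for cuda, xpu in [(True, False), (False, True), (False, False)]:
    print(cuda, xpu, patch1(cuda, xpu), patch2(cuda, xpu), patch3(cuda, xpu), patch5(cuda, xpu))
# True  False | True True False False  -> only patch3/patch5 keep v1 on CUDA
# False True  | True True True  False  -> only patch5 keeps v1 on XPU
# False False | True True False True   -> patch3 wrongly keeps v1 on other backends
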
From 817ac3f4701a996aa1e871e25c19f599b5920093 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 11:53:48 +0000
Subject: [PATCH 3/7] update

---
 fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py | 2 +-
 fastdeploy/model_executor/utils.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
index 9675a637ba8..696cb7559d4 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -63,7 +63,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         create weight process.
         """
-        if layer.fd_config.load_config.load_choices == "default_v1" and self.quant_config.is_checkpoint_bf16:
+        if layer.fd_config.load_config.load_choices == "default_v1" and self.moe_quant_type in ["w16a16"]:
             if current_platform.is_cuda():
                 self.up_gate_proj_weight_shape = [
                     layer.num_local_experts,
diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index 97aff592a67..b6bf24881e2 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -261,7 +261,7 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not (current_platform.is_cuda() and current_platform.is_xpu()):
+    if not (current_platform.is_cuda() or not current_platform.is_xpu()):
         _err_msg("v1 loader currently only supports the GPU and XPU backends")
         return False
 

From 30e0bf6993f079d8d87def74b09ace356262cb9f Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 12:05:57 +0000
Subject: [PATCH 4/7] update

---
 fastdeploy/worker/worker_process.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 5c585d53dbd..123ba78c97b 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -823,6 +823,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     update_fd_config_for_mm(fd_config)
     if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
         fd_config.load_config.load_choices = "default"
+        print(f"fd_config.load_config.load_choices: {fd_config.load_config.load_choices}")
 
     architecture = fd_config.model_config.architectures[0]
     if "PaddleOCR" in architecture:

From 7143b33e805aac4402949cf7820084ddc4e8333c Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 12:07:09 +0000
Subject: [PATCH 5/7] update

---
 fastdeploy/model_executor/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index b6bf24881e2..28278d5654f 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -261,7 +261,7 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not (current_platform.is_cuda() or not current_platform.is_xpu()):
+    if not (current_platform.is_cuda() or current_platform.is_xpu()):
         _err_msg("v1 loader currently only supports the GPU and XPU backends")
         return False
 

From 336c419d4eb4e1c353b8225b803b4c502f39f7bd Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 12:08:36 +0000
Subject: [PATCH 6/7] update

---
 fastdeploy/worker/worker_process.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 123ba78c97b..5c585d53dbd 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -823,7 +823,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     update_fd_config_for_mm(fd_config)
     if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
         fd_config.load_config.load_choices = "default"
-        print(f"fd_config.load_config.load_choices: {fd_config.load_config.load_choices}")
 
     architecture = fd_config.model_config.architectures[0]
     if "PaddleOCR" in architecture:
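Besides the platform guard, PATCH 3 above also swapped the bf16 gate in create_weights from self.quant_config.is_checkpoint_bf16 to self.moe_quant_type in ["w16a16"]. The series does not say why; one plausible reading (an assumption, not stated in the patches) is that an unquantized bf16 checkpoint may carry no quant_config object at all, in which case the attribute access raises while the quant-type string is always present on the method. A hypothetical sketch of that failure mode:

# Hypothetical illustration; names mirror the patch, the None case is assumed.
class XPUMoEMethodSketch:
    def __init__(self, quant_config=None, moe_quant_type="w16a16"):
        self.quant_config = quant_config      # may be None for a bf16 checkpoint
        self.moe_quant_type = moe_quant_type  # always a string

    def use_v1_bf16_path(self, load_choices: str) -> bool:
        # PATCH 1's form would raise AttributeError when quant_config is None:
        #   load_choices == "default_v1" and self.quant_config.is_checkpoint_bf16
        # PATCH 3's form only consults the always-present quant-type string:
        return load_choices == "default_v1" and self.moe_quant_type in ["w16a16"]

assert XPUMoEMethodSketch().use_v1_bf16_path("default_v1") is True
assert XPUMoEMethodSketch(moe_quant_type="w4a8").use_v1_bf16_path("default_v1") is False
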
From ca133e5a80c4c368183398489efc1ebc00fb8726 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Fri, 31 Oct 2025 14:51:49 +0000
Subject: [PATCH 7/7] update code

---
 scripts/run_ci_xpu.sh          | 9 ++++++---
 tests/ci_use/XPU_45T/run_ep.py | 1 +
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh
index a0e5fef264b..5ab341a5c87 100644
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -56,7 +56,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 128 \
-    --quantization wint4 > server.log 2>&1 &
+    --quantization wint4 \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
@@ -123,7 +124,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 64 \
-    --quantization "W4A8" > server.log 2>&1 &
+    --quantization "W4A8" \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
@@ -193,7 +195,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --enable-mm \
     --mm-processor-kwargs '{"video_max_frames": 30}' \
     --limit-mm-per-prompt '{"image": 10, "video": 3}' \
-    --reasoning-parser ernie-45-vl > server.log 2>&1 &
+    --reasoning-parser ernie-45-vl \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
diff --git a/tests/ci_use/XPU_45T/run_ep.py b/tests/ci_use/XPU_45T/run_ep.py
index c82242aa394..e411396d69a 100644
--- a/tests/ci_use/XPU_45T/run_ep.py
+++ b/tests/ci_use/XPU_45T/run_ep.py
@@ -44,6 +44,7 @@ def test_fd_ep():
         quantization="wint4",
         engine_worker_queue_port=engine_worker_queue_port,
         max_num_seqs=8,
+        load_choices="default",
     )
 
     try:
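PATCH 7 pins the existing CI runs to the v0 loader (--load-choices default), so the v1 path is exercised the other way around: by passing default_v1 explicitly. A sketch of that opt-in through the same LLM entry point that tests/ci_use/XPU_45T/run_ep.py uses; the import path is assumed from FastDeploy's usual layout, and the model path, port, and parallel degree are placeholders rather than values from this series:

# Placeholder values throughout; mirrors the call shape in tests/ci_use/XPU_45T/run_ep.py.
from fastdeploy.entrypoints.llm import LLM

llm = LLM(
    model="/path/to/model",           # placeholder checkpoint path
    tensor_parallel_size=4,           # placeholder parallel degree
    quantization="wint4",
    engine_worker_queue_port=8002,    # placeholder port
    max_num_seqs=8,
    load_choices="default_v1",        # opt in to the v1 loader; "default" is the v0 path
)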