Commit 6e162c2

support v1 loader

1 parent: 3cbca75

File tree: 4 files changed, +132 −50 lines

4 files changed

+132
-50
lines changed

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
120 additions, 41 deletions

```diff
@@ -29,6 +29,8 @@
     weight_quantize_xpu,
     xpu_moe_layer,
 )
+from fastdeploy.model_executor.utils import default_weight_loader, set_weight_attrs
+from fastdeploy.platforms import current_platform
 
 
 class XPUMoEMethod(MoEMethodBase):
@@ -61,78 +63,155 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         create weight process.
         """
-        self.up_gate_proj_weight_shape = [
-            layer.num_local_experts,
-            layer.moe_intermediate_size * 2,
-            layer.hidden_size,
-        ]
-        self.down_proj_weight_shape = [
-            layer.num_local_experts,
-            layer.hidden_size,
-            layer.moe_intermediate_size,
-        ]
-        if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
-            self.up_gate_proj_weight_shape[-1] //= 2
-            self.down_proj_weight_shape[-1] //= 2
-
-        setattr(
-            layer,
-            self.added_weight_attrs[0],
-            layer.create_parameter(
+        if layer.fd_config.load_config.load_choices == "default_v1" and self.moe_quant_type in ["w16a16"]:
+            if current_platform.is_cuda():
+                self.up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                    layer.moe_intermediate_size * 2,
+                ]
+                self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
+                extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
+            else:
+                self.up_gate_proj_weight_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                    layer.hidden_size,
+                ]
+                self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
+                extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
+
+            layer.up_gate_proj_weight = layer.create_parameter(
                 shape=self.up_gate_proj_weight_shape,
-                dtype=self.weight_dtype,
+                dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
-        setattr(
-            layer,
-            self.added_weight_attrs[1],
-            layer.create_parameter(
+            )
+
+            layer.down_proj_weight = layer.create_parameter(
                 shape=self.down_proj_weight_shape,
-                dtype=self.weight_dtype,
+                dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
-            ),
-        )
+            )
 
-        if self.moe_quant_type in ["weight_only_int8", "w8a8", "weight_only_int4", "w4a8"]:
-            self.up_gate_proj_scale_shape = [
+            set_weight_attrs(
+                layer.up_gate_proj_weight,
+                {
+                    "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
+                    "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                },
+            )
+            set_weight_attrs(
+                layer.down_proj_weight,
+                {
+                    "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
+                    "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                },
+            )
+
+            if layer.with_bias:
+                layer.up_gate_proj_bias = layer.create_parameter(
+                    shape=[layer.num_experts, layer.moe_intermediate_size * 2],
+                    dtype=layer.weight_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                )
+
+                layer.down_proj_bias = layer.create_parameter(
+                    shape=[layer.num_experts, layer.hidden_size],
+                    dtype=layer.weight_dtype,
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                )
+                set_weight_attrs(
+                    layer.up_gate_proj_bias,
+                    {
+                        "weight_loader": extra_weight_attrs.get(
+                            "weight_loader", default_weight_loader(layer.fd_config)
+                        ),
+                        "model_format": extra_weight_attrs.get("model_format", ""),
+                    },
+                )
+                set_weight_attrs(
+                    layer.down_proj_bias,
+                    {
+                        "weight_loader": extra_weight_attrs.get(
+                            "weight_loader", default_weight_loader(layer.fd_config)
+                        ),
+                        "model_format": extra_weight_attrs.get("model_format", ""),
+                    },
+                )
+
+        else:
+            self.up_gate_proj_weight_shape = [
                 layer.num_local_experts,
                 layer.moe_intermediate_size * 2,
+                layer.hidden_size,
             ]
-            self.down_proj_scale_shape = [
+            self.down_proj_weight_shape = [
                 layer.num_local_experts,
                 layer.hidden_size,
+                layer.moe_intermediate_size,
             ]
+            if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
+                self.up_gate_proj_weight_shape[-1] //= 2
+                self.down_proj_weight_shape[-1] //= 2
+
             setattr(
                 layer,
-                self.added_scale_attrs[0],
+                self.added_weight_attrs[0],
                 layer.create_parameter(
-                    shape=self.up_gate_proj_scale_shape,
-                    dtype=self.scale_dtype,
+                    shape=self.up_gate_proj_weight_shape,
+                    dtype=self.weight_dtype,
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
             setattr(
                 layer,
-                self.added_scale_attrs[1],
+                self.added_weight_attrs[1],
                 layer.create_parameter(
-                    shape=self.down_proj_scale_shape,
-                    dtype=self.scale_dtype,
+                    shape=self.down_proj_weight_shape,
+                    dtype=self.weight_dtype,
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
 
-        if self.moe_quant_type in ["w8a8", "w4a8"]:
-            for in_scale_name in self.added_in_scale_attrs:
+            if self.moe_quant_type in ["weight_only_int8", "w8a8", "weight_only_int4", "w4a8"]:
+                self.up_gate_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                ]
+                self.down_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                ]
                 setattr(
                     layer,
-                    in_scale_name,
+                    self.added_scale_attrs[0],
                     layer.create_parameter(
-                        shape=[layer.num_local_experts],
+                        shape=self.up_gate_proj_scale_shape,
                         dtype=self.scale_dtype,
                         default_initializer=paddle.nn.initializer.Constant(0),
                     ),
                 )
+                setattr(
+                    layer,
+                    self.added_scale_attrs[1],
+                    layer.create_parameter(
+                        shape=self.down_proj_scale_shape,
+                        dtype=self.scale_dtype,
+                        default_initializer=paddle.nn.initializer.Constant(0),
+                    ),
+                )
+
+                if self.moe_quant_type in ["w8a8", "w4a8"]:
+                    for in_scale_name in self.added_in_scale_attrs:
+                        setattr(
+                            layer,
+                            in_scale_name,
+                            layer.create_parameter(
+                                shape=[layer.num_local_experts],
+                                dtype=self.scale_dtype,
+                                default_initializer=paddle.nn.initializer.Constant(0),
+                            ),
+                        )
 
     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
```
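
The new `SHARD_ID_TO_SHARDED_DIM` attribute is the piece the v1 loader consumes: for each logical shard of an expert ("gate", "up", "down") it names the axis along which that checkpoint tensor is split across tensor-parallel ranks, and the mapping flips with the weight layout (the CUDA branch stores `[hidden, 2 * intermediate]`, the XPU/default branch `[2 * intermediate, hidden]`). The following is a minimal illustrative sketch, not FastDeploy's actual `weight_loader`; the helper name and shapes are invented:

```python
import numpy as np

# CUDA-layout map from the diff above; the XPU/default layout flips the axes.
SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}

def tp_slice(loaded: np.ndarray, shard_id: str, tp_rank: int, tp_size: int) -> np.ndarray:
    """Return this rank's slice of one expert's checkpoint tensor (illustrative)."""
    dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
    block = loaded.shape[dim] // tp_size
    index = [slice(None)] * loaded.ndim
    index[dim] = slice(tp_rank * block, (tp_rank + 1) * block)
    return loaded[tuple(index)]

# An invented [hidden=8, intermediate=16] "up" weight split over 2 ranks:
full = np.arange(8 * 16, dtype=np.float32).reshape(8, 16)
print(tp_slice(full, "up", tp_rank=0, tp_size=2).shape)  # -> (8, 8)
```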

fastdeploy/model_executor/utils.py
2 additions, 2 deletions

```diff
@@ -261,8 +261,8 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not current_platform.is_cuda():
-        _err_msg("v1loader currently does not support backends other than CUDA")
+    if not (current_platform.is_cuda() or current_platform.is_xpu()):
+        _err_msg("v1loader currently only supports the gpu and xpu backends")
         return False
 
     if is_pre_sliced_weight(fd_config.model_config.model):
```
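
The effect of the relaxed check, as a minimal runnable sketch (the stand-in platform class is invented; only the CUDA-or-XPU condition comes from the diff):

```python
# Hypothetical stand-in for current_platform, just to exercise the check.
class FakePlatform:
    def __init__(self, kind: str):
        self.kind = kind
    def is_cuda(self) -> bool:
        return self.kind == "cuda"
    def is_xpu(self) -> bool:
        return self.kind == "xpu"

def v1_loader_allowed(platform) -> bool:
    # Mirrors the updated condition: only CUDA and XPU may use the v1 loader.
    return platform.is_cuda() or platform.is_xpu()

for kind in ("cuda", "xpu", "npu"):
    print(kind, v1_loader_allowed(FakePlatform(kind)))  # npu -> False, v0 fallback
```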

scripts/run_ci_xpu.sh
9 additions, 7 deletions

```diff
@@ -20,13 +20,12 @@ echo "uninstall org"
 python -m pip uninstall paddlepaddle-xpu -y
 python -m pip uninstall fastdeploy-xpu -y
 
-python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
+python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Xpu-P800-SelfBuiltPypiUse/latest/paddlepaddle_xpu-0.0.0-cp310-cp310-linux_x86_64.whl
 
 echo "build whl"
-bash custom_ops/xpu_ops/download_dependencies.sh develop
+bash custom_ops/xpu_ops/download_dependencies.sh stable
 export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
 export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
-
 bash build.sh || exit 1
 
 echo "pip others"
@@ -54,7 +53,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 128 \
-    --quantization wint4 > server.log 2>&1 &
+    --quantization wint4 \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
@@ -121,7 +121,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 64 \
-    --quantization "W4A8" > server.log 2>&1 &
+    --quantization "W4A8" \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
@@ -191,7 +192,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --enable-mm \
     --mm-processor-kwargs '{"video_max_frames": 30}' \
     --limit-mm-per-prompt '{"image": 10, "video": 3}' \
-    --reasoning-parser ernie-45-vl > server.log 2>&1 &
+    --reasoning-parser ernie-45-vl \
+    --load-choices default > server.log 2>&1 &
 
 sleep 60
 # health check
@@ -283,4 +285,4 @@ if [ ${ep_exit_code} -ne 0 ]; then
     cat log/workerlog.0
     echo "EP parallel tests failed, please check the PR code"
     exit 1
-fi
\ No newline at end of file
+fi
```
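
Note that all three server launches in this CI script now pass `--load-choices default` explicitly, pinning the existing XPU jobs to the v0 loader even though the commit teaches `v1_loader_support` to accept XPU; presumably a launch would opt in to the new path with `--load-choices default_v1`, the value checked in `create_weights` above.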

tests/ci_use/XPU_45T/run_ep.py
1 addition, 0 deletions

```diff
@@ -44,6 +44,7 @@ def test_fd_ep():
         quantization="wint4",
         engine_worker_queue_port=engine_worker_queue_port,
         max_num_seqs=8,
+        load_choices="default",
     )
 
     try:
```
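
The EP test pins its loader the same way. A hedged sketch of the call site (the import path, model path, and port value are assumptions for illustration; the keyword arguments mirror the diff):

```python
from fastdeploy import LLM  # import path assumed

llm = LLM(
    model="./ERNIE-4.5-checkpoint",  # placeholder model path
    quantization="wint4",
    engine_worker_queue_port=8002,   # arbitrary free port for the example
    max_num_seqs=8,
    load_choices="default",          # pin v0; "default_v1" opts in to the v1 loader
)
```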
