fastdeploy/model_executor/layers/quantization: 1 file changed, +2 -3 lines

 from .quant_base import QuantConfigBase, QuantMethodBase

 if has_flashinfer():
-    from flashinfer import fp4_quantize as scaled_fp4_quant  # need to use vllm version
+    from flashinfer import fp4_quantize
     from flashinfer import mm_fp4 as fp4_gemm


@@ -353,10 +353,9 @@ def apply(
        output_dtype = x.dtype

        # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
-       x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)
+       x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv)

        assert x_fp4.dtype == paddle.uint8
-       assert x_scale_interleaved.dtype == paddle.float8_e4m3fn
        assert layer.weight.dtype == paddle.uint8
        assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn
        assert layer.alpha.dtype == paddle.float32
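
For context, a minimal sketch of the call path this hunk sits in: the activation is quantized with flashinfer's fp4_quantize, then the packed FP4 tensor and its interleaved block scales go into fp4_gemm (flashinfer's mm_fp4, per the import above). Only the fp4_quantize call, the asserts, and the two imports come from the diff; the standalone apply signature and the fp4_gemm argument order below are assumptions for illustration.

# Sketch only: the fp4_gemm argument order is assumed, not taken from the
# diff; flashinfer's actual mm_fp4 signature may differ.
import paddle
from flashinfer import fp4_quantize
from flashinfer import mm_fp4 as fp4_gemm

def apply(layer, x):
    output_dtype = x.dtype

    # Quantize BF16/FP16 activations to packed FP4 plus an interleaved
    # block-scale tensor, using the layer's precomputed inverse input scale.
    x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv)
    assert x_fp4.dtype == paddle.uint8  # two FP4 values packed per byte

    # FP4 GEMM against the prequantized weight; alpha folds the global
    # input/weight scales back into the result, emitted in output_dtype.
    return fp4_gemm(
        x_fp4,
        layer.weight,
        x_scale_interleaved,
        layer.weight_scale_interleaved,
        layer.alpha,
        output_dtype,
    )

One plausible reading of the dropped assert is that the native fp4_quantize, unlike the vLLM-flavored scaled_fp4_quant alias it replaces, does not guarantee the returned block scales arrive as a float8_e4m3fn view, so the dtype check no longer holds.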