Skip to content

Commit 0594090

Browse files
committed
dense fp4 OK, cudagraph error
1 parent 7137054 commit 0594090

File tree

1 file changed

+2
-3
lines changed
  • fastdeploy/model_executor/layers/quantization

1 file changed

+2
-3
lines changed

fastdeploy/model_executor/layers/quantization/nvfp4.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
 from .quant_base import QuantConfigBase, QuantMethodBase
 
 if has_flashinfer():
-    from flashinfer import fp4_quantize as scaled_fp4_quant # need to use vllm version
+    from flashinfer import fp4_quantize
     from flashinfer import mm_fp4 as fp4_gemm
 
 
@@ -353,10 +353,9 @@ def apply(
         output_dtype = x.dtype
 
         # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)
+        x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv)
 
         assert x_fp4.dtype == paddle.uint8
-        assert x_scale_interleaved.dtype == paddle.float8_e4m3fn
         assert layer.weight.dtype == paddle.uint8
         assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn
         assert layer.alpha.dtype == paddle.float32

0 commit comments

Comments
 (0)