fix: fp8 quantization failure of qwen 2.5 VL 7B model (#10112)

Signed-off-by: PanJason <pyyjason@gmail.com>
This commit is contained in:
Yueyang Pan
2025-09-27 07:05:23 +02:00
committed by GitHub
parent 37f3325b06
commit 8260574729
5 changed files with 81 additions and 14 deletions

View File

@@ -393,13 +393,23 @@ class W8A8Int8LinearMethod(LinearMethodBase):
x.dtype,
True, # is_vnni
)
x_q, x_scale = per_token_quant_int8(x)
return int8_scaled_mm(
x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
x_q_2d = x_q.view(-1, x_q.shape[-1])
x_scale_2d = x_scale.view(-1, x_scale.shape[-1])
output_shape = [*x_q.shape[:-1], layer.weight.shape[1]]
output = int8_scaled_mm(
x_q_2d,
layer.weight,
x_scale_2d,
layer.weight_scale,
out_dtype=x.dtype,
bias=bias,
)
return output.view(output_shape)
class W8A8Int8MoEMethod(FusedMoEMethodBase):
"""MoE method for INT8.