fix: fp8 quantization failure of qwen 2.5 VL 7B model (#10112)
Signed-off-by: PanJason <pyyjason@gmail.com>
@@ -393,13 +393,23 @@ class W8A8Int8LinearMethod(LinearMethodBase):
                 x.dtype,
                 True,  # is_vnni
             )
 
         x_q, x_scale = per_token_quant_int8(x)
 
-        return int8_scaled_mm(
-            x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
-        )
+        x_q_2d = x_q.view(-1, x_q.shape[-1])
+        x_scale_2d = x_scale.view(-1, x_scale.shape[-1])
+        output_shape = [*x_q.shape[:-1], layer.weight.shape[1]]
+
+        output = int8_scaled_mm(
+            x_q_2d,
+            layer.weight,
+            x_scale_2d,
+            layer.weight_scale,
+            out_dtype=x.dtype,
+            bias=bias,
+        )
+
+        return output.view(output_shape)
 
 
 class W8A8Int8MoEMethod(FusedMoEMethodBase):
     """MoE method for INT8.
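The change above is purely about tensor rank: the quantized activations and their per-token scales are flattened to 2-D with view(-1, last_dim) before the int8_scaled_mm call (which this fix suggests only accepts 2-D inputs), and the result is reshaped back to the original leading dimensions. The sketch below is a minimal, hypothetical illustration of that flatten-and-restore pattern, not the actual sglang code: torch.matmul stands in for the int8_scaled_mm kernel, scaled_mm_any_rank is a made-up helper name, and quantization/scaling are deliberately omitted so only the shape handling is visible.

# Minimal sketch of the flatten-to-2-D / restore-shape pattern, assuming a
# kernel that only handles 2-D activations. torch.matmul is a stand-in for
# int8_scaled_mm; scaled_mm_any_rank is a hypothetical name for this example.
import torch


def scaled_mm_any_rank(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Collapse all leading dims (batch, sequence, ...) into one row dim,
    # mirroring x_q_2d = x_q.view(-1, x_q.shape[-1]) in the diff.
    x_2d = x.view(-1, x.shape[-1])

    # 2-D matmul; the real code passes the 2-D quantized input, the 2-D
    # per-token scales, the weight scales, out_dtype, and bias to the kernel.
    out_2d = torch.matmul(x_2d, weight)

    # Restore the original leading dims, mirroring output.view(output_shape).
    return out_2d.view(*x.shape[:-1], weight.shape[-1])


if __name__ == "__main__":
    w = torch.randn(64, 32)
    x_3d = torch.randn(2, 8, 64)  # e.g. [batch, seq_len, hidden] activations
    x_2d = torch.randn(8, 64)     # plain 2-D activations keep working too
    print(scaled_mm_any_rank(x_3d, w).shape)  # torch.Size([2, 8, 32])
    print(scaled_mm_any_rank(x_2d, w).shape)  # torch.Size([8, 32])

Using view rather than reshape keeps the flattening zero-copy, which assumes the activations are contiguous; deriving output_shape from x_q.shape[:-1] is what lets 2-D and higher-rank inputs (presumably the ones hit by the Qwen2.5-VL path this commit fixes) share a single code path.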