fix: fp8 quantization failure of qwen 2.5 VL 7B model (#10112)
Signed-off-by: PanJason <pyyjason@gmail.com>
@@ -393,13 +393,23 @@ class W8A8Int8LinearMethod(LinearMethodBase):
                 x.dtype,
                 True,  # is_vnni
             )
 
         x_q, x_scale = per_token_quant_int8(x)
 
-        return int8_scaled_mm(
-            x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
-        )
+        x_q_2d = x_q.view(-1, x_q.shape[-1])
+        x_scale_2d = x_scale.view(-1, x_scale.shape[-1])
+        output_shape = [*x_q.shape[:-1], layer.weight.shape[1]]
+
+        output = int8_scaled_mm(
+            x_q_2d,
+            layer.weight,
+            x_scale_2d,
+            layer.weight_scale,
+            out_dtype=x.dtype,
+            bias=bias,
+        )
+
+        return output.view(output_shape)
 
 
 class W8A8Int8MoEMethod(FusedMoEMethodBase):
     """MoE method for INT8.
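The change above is purely about tensor rank: the quantized activations and their per-token scales are flattened to 2-D with view(-1, last_dim) before the int8_scaled_mm call (which this fix suggests only accepts 2-D inputs), and the result is reshaped back to the original leading dimensions. The sketch below is a minimal, hypothetical illustration of that flatten-and-restore pattern, not the actual sglang code: torch.matmul stands in for the int8_scaled_mm kernel, scaled_mm_any_rank is a made-up helper name, and quantization/scaling are deliberately omitted so only the shape handling is visible.

# Minimal sketch of the flatten-to-2-D / restore-shape pattern, assuming a
# kernel that only handles 2-D activations. torch.matmul is a stand-in for
# int8_scaled_mm; scaled_mm_any_rank is a hypothetical name for this example.
import torch


def scaled_mm_any_rank(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Collapse all leading dims (batch, sequence, ...) into one row dim,
    # mirroring x_q_2d = x_q.view(-1, x_q.shape[-1]) in the diff.
    x_2d = x.view(-1, x.shape[-1])

    # 2-D matmul; the real code passes the 2-D quantized input, the 2-D
    # per-token scales, the weight scales, out_dtype, and bias to the kernel.
    out_2d = torch.matmul(x_2d, weight)

    # Restore the original leading dims, mirroring output.view(output_shape).
    return out_2d.view(*x.shape[:-1], weight.shape[-1])


if __name__ == "__main__":
    w = torch.randn(64, 32)
    x_3d = torch.randn(2, 8, 64)  # e.g. [batch, seq_len, hidden] activations
    x_2d = torch.randn(8, 64)     # plain 2-D activations keep working too
    print(scaled_mm_any_rank(x_3d, w).shape)  # torch.Size([2, 8, 32])
    print(scaled_mm_any_rank(x_2d, w).shape)  # torch.Size([8, 32])

Using view rather than reshape keeps the flattening zero-copy, which assumes the activations are contiguous; deriving output_shape from x_q.shape[:-1] is what lets 2-D and higher-rank inputs (presumably the ones hit by the Qwen2.5-VL path this commit fixes) share a single code path.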