Use sgl-kernel sgl_per_token_group_quant_int8 (#4971)
@@ -755,6 +755,9 @@ def invoke_fused_moe_kernel(
         from sglang.srt.layers.quantization.fp8_kernel import (
             sglang_per_token_group_quant_fp8,
         )
+        from sglang.srt.layers.quantization.int8_kernel import (
+            sglang_per_token_group_quant_int8,
+        )
     else:
         from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8

@@ -794,7 +797,10 @@ def invoke_fused_moe_kernel(
             # activation block-wise int8 quantization
             assert len(block_shape) == 2
             block_n, block_k = block_shape[0], block_shape[1]
-            A, A_scale = per_token_group_quant_int8(A, block_k)
+            if _is_cuda:
+                A, A_scale = sglang_per_token_group_quant_int8(A, block_k)
+            else:
+                A, A_scale = per_token_group_quant_int8(A, block_k)
             assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
             assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
             assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
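For readers unfamiliar with the block-wise activation quantization dispatched above, here is a rough eager-PyTorch sketch of what per-token-group int8 quantization computes: one int8 value per element and one float32 scale per group of block_k contiguous channels along the last dimension. It is only an illustration, not the code in this commit; the Triton kernel and the sgl_kernel CUDA op are the actual implementations, and the helper name per_token_group_quant_int8_ref is made up here.

import torch

def per_token_group_quant_int8_ref(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    # One int8 value per element, one float32 scale per `group_size`-wide group
    # along the last dimension (the layout the fused MoE kernel expects).
    assert x.shape[-1] % group_size == 0
    int8_max = torch.iinfo(torch.int8).max  # 127
    x_grouped = x.reshape(*x.shape[:-1], x.shape[-1] // group_size, group_size)
    # Scale = per-group absmax / 127, clamped so all-zero groups do not divide by zero.
    scale = x_grouped.abs().amax(dim=-1, keepdim=True).float().clamp_min(eps) / int8_max
    x_q = (x_grouped / scale).round().clamp(-128, 127).to(torch.int8)
    return x_q.reshape(x.shape), scale.squeeze(-1)

# Example: quantizing the activation A with block_k = 128, as invoke_fused_moe_kernel does.
A = torch.randn(4, 512)
A_q, A_scale = per_token_group_quant_int8_ref(A, group_size=128)
print(A_q.shape, A_q.dtype, A_scale.shape)  # torch.Size([4, 512]) torch.int8 torch.Size([4, 4])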
@@ -8,7 +8,11 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.utils import get_device_name
+from sglang.srt.utils import get_device_name, is_cuda
 
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import sgl_per_token_group_quant_int8
+
 logger = logging.getLogger(__name__)
 
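As a quick sanity check, something like the following throwaway sketch (not part of this commit) can confirm whether the sgl_kernel fast path guarded by this import is actually reachable on the current machine:

# Throwaway diagnostic: report which int8 group-quantization path sglang would take.
from sglang.srt.utils import is_cuda

if is_cuda():
    try:
        from sgl_kernel import sgl_per_token_group_quant_int8  # noqa: F401
        print("CUDA platform: sgl_kernel sgl_per_token_group_quant_int8 is available")
    except ImportError:
        print("CUDA platform, but sgl_kernel is missing; install or upgrade sgl-kernel")
else:
    print("non-CUDA platform: the Triton per_token_group_quant_int8 fallback is used")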
@@ -165,6 +169,33 @@ def per_token_group_quant_int8(
     return x_q, x_s
 
 
+def sglang_per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    sgl_per_token_group_quant_int8(x, x_q, x_s, group_size, eps, int8_min, int8_max)
+
+    return x_q, x_s
+
+
 @triton.jit
 def _w8a8_block_int8_matmul(
     # Pointers to inputs and output
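A hedged usage sketch of the new wrapper follows. It assumes a CUDA GPU and an sgl-kernel build that ships sgl_per_token_group_quant_int8; the shapes are illustrative, and the loose cross-check against the Triton fallback is an addition here, not part of this commit.

import torch

from sglang.srt.layers.quantization.int8_kernel import (
    per_token_group_quant_int8,         # existing Triton fallback
    sglang_per_token_group_quant_int8,  # new sgl_kernel-backed wrapper
)

x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")

# Both paths return an int8 tensor plus one float32 scale per 128-wide group.
x_q_cuda, x_s_cuda = sglang_per_token_group_quant_int8(x, 128)
x_q_triton, x_s_triton = per_token_group_quant_int8(x, 128)

assert x_q_cuda.dtype == torch.int8
assert x_s_cuda.shape == (8, 4096 // 128) and x_s_cuda.dtype == torch.float32

# Loose agreement check between the two backends; rounding near group boundaries may
# differ slightly, so compare dequantized values rather than raw int8 codes.
deq_cuda = x_q_cuda.float().view(8, -1, 128) * x_s_cuda.unsqueeze(-1)
deq_triton = x_q_triton.float().view(8, -1, 128) * x_s_triton.unsqueeze(-1)
print((deq_cuda - deq_triton).abs().max())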