Add int8 quant kernel (#2848)
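This commit adds a Triton kernel for symmetric per-token int8 quantization to sglang, together with a benchmark that checks its output against vLLM's scaled_int8_quant op and a torch.compile reference, and compares the speed of all three.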
benchmark/kernels/quantization/bench_int8_quant.py (new file)
@@ -0,0 +1,94 @@
import argparse

import torch
import triton
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant

from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8


@torch.compile(backend="inductor")
def torch_int8_quant(x):
    # Reference implementation: symmetric per-token quantization with one
    # fp32 scale per row, chosen so the row's absolute maximum maps to 127.
    int8_max = torch.iinfo(torch.int8).max

    abs_max = x.abs().max(dim=-1, keepdim=True).values
    # Clamp with the same 1e-10 floor as the Triton kernel so an all-zero
    # row does not divide by zero.
    scales = abs_max.to(torch.float32).clamp(min=1e-10) / float(int8_max)

    q_x = (x / scales).round().to(torch.int8)

    return q_x, scales


def _test_accuracy_once(M, K, input_dtype, device):
    x = torch.randn(M, K, dtype=input_dtype, device=device) * 5000
    out, scales, _ = vllm_scaled_int8_quant(x, symmetric=True)
    out1, scales1 = per_token_quant_int8(x)
    out2, scales2 = torch_int8_quant(x)
    # Rounding may differ by one int8 step between implementations, hence
    # atol=1 for the quantized values; the scales must agree.
    torch.testing.assert_close(out, out2, atol=1, rtol=0)
    torch.testing.assert_close(out, out1, atol=1, rtol=0)
    torch.testing.assert_close(scales, scales2)
    torch.testing.assert_close(scales1, scales2)
    print(f"M: {M}, K: {K}, type: {input_dtype} OK")


def test_accuracy():
    Ms = [1, 13, 128, 1024, 2048, 4096]
    Ks = [512, 1024, 2048, 8192]
    input_dtypes = [torch.float16, torch.bfloat16]
    for M in Ms:
        for K in Ks:
            for input_dtype in input_dtypes:
                _test_accuracy_once(M, K, input_dtype, "cuda")


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048],
        x_log=False,
        line_arg="provider",
        line_vals=["vllm op", "triton", "torch.compile"],
        line_names=["vllm op", "triton", "torch.compile"],
        styles=[("blue", "-"), ("orange", "-"), ("red", "-")],
        ylabel="ms",
        plot_name="int8 per token quant",
        args={},
    )
)
def benchmark(batch_size, provider):
    M, K = batch_size, 16384
    x = torch.randn(M, K, dtype=torch.float16, device="cuda") * 1000

    quantiles = [0.5, 0.2, 0.8]
    if provider == "vllm op":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: vllm_scaled_int8_quant(x, symmetric=True),
            quantiles=quantiles,
        )
    if provider == "triton":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: per_token_quant_int8(x),
            quantiles=quantiles,
        )
    if provider == "torch.compile":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: torch_int8_quant(x),
            quantiles=quantiles,
        )

    return ms, min_ms, max_ms


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--save_path",
        type=str,
        default="./bench_int8_quant_res",
        help="Path to save int8 quant benchmark results",
    )
    args = parser.parse_args()

    test_accuracy()

    benchmark.run(print_data=True, show_plots=True, save_path=args.save_path)
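To reproduce the comparison locally (assuming a CUDA device with working vllm and sglang installations), the benchmark script can be run directly; it executes the accuracy sweep first, then prints the timing table and saves it under --save_path:

    python benchmark/kernels/quantization/bench_int8_quant.py --save_path ./bench_int8_quant_res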
python/sglang/srt/layers/quantization/int8_kernel.py (new file)
@@ -0,0 +1,53 @@
import torch
import triton
import triton.language as tl


@triton.jit
def _per_token_quant_int8(
    x_ptr,
    xq_ptr,
    scale_ptr,
    stride_x,
    stride_xq,
    N,
    BLOCK: tl.constexpr,
):
    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
    # One program per row: each program quantizes one token's activations.
    row_id = tl.program_id(0)

    cols = tl.arange(0, BLOCK)
    mask = cols < N

    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
    # Symmetric scale: the row's absolute maximum maps to 127; the 1e-10
    # floor guards against division by zero on all-zero rows.
    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
    scale_x = absmax / 127
    x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8)

    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
    tl.store(scale_ptr + row_id, scale_x)


def per_token_quant_int8(x):
    """Symmetric per-token int8 quantization over the last dimension.

    Expects a contiguous tensor with at least two dimensions. Returns the
    int8 tensor and one fp32 scale per row, shaped x.shape[:-1] + (1,) so
    it broadcasts for dequantization.
    """
    M = x.numel() // x.shape[-1]
    N = x.shape[-1]
    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
    BLOCK = triton.next_power_of_2(N)
    # Heuristic for the number of warps: grow with the row width, capped at 8.
    num_warps = min(max(BLOCK // 256, 1), 8)

    assert x.is_contiguous()
    _per_token_quant_int8[(M,)](
        x,
        x_q,
        scales,
        stride_x=x.stride(-2),
        stride_xq=x_q.stride(-2),
        N=N,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )

    return x_q, scales
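For reference, a minimal usage sketch of the new helper, assuming a CUDA device; the dequantization step and the error check are illustrative and not part of this commit:

    import torch

    from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8

    # Quantize a small batch of activations; one fp32 scale per row (token).
    x = torch.randn(4, 1024, dtype=torch.float16, device="cuda")
    x_q, scales = per_token_quant_int8(x)

    # Dequantize: scales has shape (4, 1) and broadcasts across each row.
    x_deq = x_q.to(torch.float32) * scales

    # Round-to-nearest keeps every element within half a quantization step.
    max_err = (x_deq - x.to(torch.float32)).abs().max()
    assert max_err <= 0.5 * scales.max() + 1e-6
    print(f"max abs round-trip error: {max_err.item():.5f}")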