[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)

This commit is contained in:
PGFLMG
2025-10-12 05:04:57 +08:00
committed by GitHub
parent b5dcfd4154
commit 8fdcd98efe
19 changed files with 7936 additions and 1 deletion

View File

@@ -288,10 +288,19 @@ from sgl_kernel.moe import (
fp8_blockwise_scaled_grouped_mm,
moe_align_block_size,
moe_fused_gate,
moe_sum,
moe_sum_reduce,
prepare_moe_input,
topk_softmax,
)
from sgl_kernel.quantization import (
ggml_dequantize,
ggml_moe_a8,
ggml_moe_a8_vec,
ggml_moe_get_block_size,
ggml_mul_mat_a8,
ggml_mul_mat_vec_a8,
)
from sgl_kernel.sampling import (
min_p_sampling_from_probs,
top_k_mask_logits,

View File

@@ -48,6 +48,16 @@ def moe_sum_reduce(
)
def moe_sum(
    input_tensor: torch.Tensor,
    output_tensor: torch.Tensor,
):
    """Invoke the ``sgl_kernel.moe_sum`` custom op.

    ``output_tensor`` is passed through to the kernel; presumably the op
    writes its result into it in place (out-parameter convention) — the
    semantics are defined entirely by the registered custom op.
    """
    kernel = torch.ops.sgl_kernel.moe_sum.default
    kernel(input_tensor, output_tensor)
def moe_fused_gate(
input_tensor,
bias,

View File

@@ -0,0 +1,8 @@
from .gguf import (
ggml_dequantize,
ggml_moe_a8,
ggml_moe_a8_vec,
ggml_moe_get_block_size,
ggml_mul_mat_a8,
ggml_mul_mat_vec_a8,
)

View File

@@ -0,0 +1,62 @@
import torch
def ggml_dequantize(
    weight: torch.Tensor, quant_type: int, M: int, N: int, dtype: torch.dtype
) -> torch.Tensor:
    """Dequantize a GGUF/GGML-quantized weight tensor via the ``sgl_kernel`` op.

    Args:
        weight: Quantized weight data.
        quant_type: GGML quantization type id (interpreted by the kernel).
        M: First output dimension; must be positive.
        N: Second output dimension; must be positive.
        dtype: Desired dtype of the dequantized output.

    Returns:
        The dequantized tensor produced by ``sgl_kernel.ggml_dequantize``.

    Raises:
        ValueError: If ``M`` or ``N`` is not positive.
    """
    # Explicit raise instead of ``assert`` so the shape check survives
    # ``python -O`` (asserts are stripped under optimization).
    if M <= 0 or N <= 0:
        raise ValueError("GGUF weight Input shape must be of positive dimensions")
    return torch.ops.sgl_kernel.ggml_dequantize.default(weight, quant_type, M, N, dtype)
def ggml_mul_mat_vec_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Quantized matrix-vector multiply through the ``sgl_kernel`` GGML op.

    All argument semantics are defined by the registered custom op
    ``sgl_kernel.ggml_mul_mat_vec_a8``; this wrapper only dispatches.
    """
    kernel = torch.ops.sgl_kernel.ggml_mul_mat_vec_a8.default
    return kernel(weight, x, quant_type, row)
def ggml_mul_mat_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Quantized matrix-matrix multiply through the ``sgl_kernel`` GGML op.

    Pure dispatch wrapper; argument semantics are defined by the
    registered custom op ``sgl_kernel.ggml_mul_mat_a8``.
    """
    kernel = torch.ops.sgl_kernel.ggml_mul_mat_a8.default
    return kernel(weight, x, quant_type, row)
def ggml_moe_a8(
    input: torch.Tensor,
    weight: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_token_post_padded: torch.Tensor,
    type: int,
    row: int,
    topk: int,
    tokens: int,
) -> torch.Tensor:
    """GGML-quantized MoE matmul via the ``sgl_kernel.ggml_moe_a8`` custom op.

    Pure dispatch wrapper: every argument is forwarded unchanged to the
    kernel, which defines their semantics.

    NOTE: the parameter names ``input`` and ``type`` shadow builtins but are
    part of the public keyword-call interface, so they are kept as-is.
    """
    kernel = torch.ops.sgl_kernel.ggml_moe_a8.default
    return kernel(
        input,
        weight,
        sorted_token_ids,
        expert_ids,
        num_token_post_padded,
        type,
        row,
        topk,
        tokens,
    )
def ggml_moe_a8_vec(
    input: torch.Tensor,
    weight: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    type: int,
    row: int,
    tokens: int,
) -> torch.Tensor:
    """Vector variant of the GGML-quantized MoE matmul custom op.

    Forwards all arguments unchanged to ``sgl_kernel.ggml_moe_a8_vec``;
    ``input`` and ``type`` shadow builtins but are kept for keyword-call
    compatibility with existing callers.
    """
    kernel = torch.ops.sgl_kernel.ggml_moe_a8_vec.default
    return kernel(input, weight, topk_ids, top_k, type, row, tokens)
def ggml_moe_get_block_size(type: int) -> int:
    """Return the MoE kernel block size for a GGML quantization type id.

    ``type`` shadows the builtin but is part of the public interface.
    """
    get_block_size = torch.ops.sgl_kernel.ggml_moe_get_block_size.default
    return get_block_size(type)