[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)

This commit is contained in:
PGFLMG
2025-10-12 05:04:57 +08:00
committed by GitHub
parent b5dcfd4154
commit 8fdcd98efe
19 changed files with 7936 additions and 1 deletion

View File

@@ -288,10 +288,19 @@ from sgl_kernel.moe import (
fp8_blockwise_scaled_grouped_mm,
moe_align_block_size,
moe_fused_gate,
moe_sum,
moe_sum_reduce,
prepare_moe_input,
topk_softmax,
)
from sgl_kernel.quantization import (
ggml_dequantize,
ggml_moe_a8,
ggml_moe_a8_vec,
ggml_moe_get_block_size,
ggml_mul_mat_a8,
ggml_mul_mat_vec_a8,
)
from sgl_kernel.sampling import (
min_p_sampling_from_probs,
top_k_mask_logits,

View File

@@ -48,6 +48,16 @@ def moe_sum_reduce(
)
def moe_sum(
    input_tensor: torch.Tensor,
    output_tensor: torch.Tensor,
):
    """Invoke the ``sgl_kernel.moe_sum`` custom op.

    ``output_tensor`` is passed through to the kernel; presumably the op
    writes its result into it in place (out-parameter convention) — the
    semantics are defined entirely by the registered custom op.
    """
    kernel = torch.ops.sgl_kernel.moe_sum.default
    kernel(input_tensor, output_tensor)
def moe_fused_gate(
input_tensor,
bias,

View File

@@ -0,0 +1,8 @@
from .gguf import (
ggml_dequantize,
ggml_moe_a8,
ggml_moe_a8_vec,
ggml_moe_get_block_size,
ggml_mul_mat_a8,
ggml_mul_mat_vec_a8,
)

View File

@@ -0,0 +1,62 @@
import torch
def ggml_dequantize(
    weight: torch.Tensor, quant_type: int, M: int, N: int, dtype: torch.dtype
) -> torch.Tensor:
    """Dequantize a GGUF/GGML-quantized weight tensor via the ``sgl_kernel`` op.

    Args:
        weight: Quantized weight data.
        quant_type: GGML quantization type id (interpreted by the kernel).
        M: First output dimension; must be positive.
        N: Second output dimension; must be positive.
        dtype: Desired dtype of the dequantized output.

    Returns:
        The dequantized tensor produced by ``sgl_kernel.ggml_dequantize``.

    Raises:
        ValueError: If ``M`` or ``N`` is not positive.
    """
    # Explicit raise instead of ``assert`` so the shape check survives
    # ``python -O`` (asserts are stripped under optimization).
    if M <= 0 or N <= 0:
        raise ValueError("GGUF weight Input shape must be of positive dimensions")
    return torch.ops.sgl_kernel.ggml_dequantize.default(weight, quant_type, M, N, dtype)
def ggml_mul_mat_vec_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Quantized matrix-vector multiply through the ``sgl_kernel`` GGML op.

    All argument semantics are defined by the registered custom op
    ``sgl_kernel.ggml_mul_mat_vec_a8``; this wrapper only dispatches.
    """
    kernel = torch.ops.sgl_kernel.ggml_mul_mat_vec_a8.default
    return kernel(weight, x, quant_type, row)
def ggml_mul_mat_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Quantized matrix-matrix multiply through the ``sgl_kernel`` GGML op.

    Pure dispatch wrapper; argument semantics are defined by the
    registered custom op ``sgl_kernel.ggml_mul_mat_a8``.
    """
    kernel = torch.ops.sgl_kernel.ggml_mul_mat_a8.default
    return kernel(weight, x, quant_type, row)
def ggml_moe_a8(
    input: torch.Tensor,
    weight: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_token_post_padded: torch.Tensor,
    type: int,
    row: int,
    topk: int,
    tokens: int,
) -> torch.Tensor:
    """GGML-quantized MoE matmul via the ``sgl_kernel.ggml_moe_a8`` custom op.

    Pure dispatch wrapper: every argument is forwarded unchanged to the
    kernel, which defines their semantics.

    NOTE: the parameter names ``input`` and ``type`` shadow builtins but are
    part of the public keyword-call interface, so they are kept as-is.
    """
    kernel = torch.ops.sgl_kernel.ggml_moe_a8.default
    return kernel(
        input,
        weight,
        sorted_token_ids,
        expert_ids,
        num_token_post_padded,
        type,
        row,
        topk,
        tokens,
    )
def ggml_moe_a8_vec(
    input: torch.Tensor,
    weight: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    type: int,
    row: int,
    tokens: int,
) -> torch.Tensor:
    """Vector variant of the GGML-quantized MoE matmul custom op.

    Forwards all arguments unchanged to ``sgl_kernel.ggml_moe_a8_vec``;
    ``input`` and ``type`` shadow builtins but are kept for keyword-call
    compatibility with existing callers.
    """
    kernel = torch.ops.sgl_kernel.ggml_moe_a8_vec.default
    return kernel(input, weight, topk_ids, top_k, type, row, tokens)
def ggml_moe_get_block_size(type: int) -> int:
    """Return the MoE kernel block size for a GGML quantization type id.

    ``type`` shadows the builtin but is part of the public interface.
    """
    get_block_size = torch.ops.sgl_kernel.ggml_moe_get_block_size.default
    return get_block_size(type)