[7/n] decouple quantization impl from vllm dependency - gguf kernel (#11019)
This commit is contained in:
@@ -288,10 +288,19 @@ from sgl_kernel.moe import (
|
||||
fp8_blockwise_scaled_grouped_mm,
|
||||
moe_align_block_size,
|
||||
moe_fused_gate,
|
||||
moe_sum,
|
||||
moe_sum_reduce,
|
||||
prepare_moe_input,
|
||||
topk_softmax,
|
||||
)
|
||||
from sgl_kernel.quantization import (
|
||||
ggml_dequantize,
|
||||
ggml_moe_a8,
|
||||
ggml_moe_a8_vec,
|
||||
ggml_moe_get_block_size,
|
||||
ggml_mul_mat_a8,
|
||||
ggml_mul_mat_vec_a8,
|
||||
)
|
||||
from sgl_kernel.sampling import (
|
||||
min_p_sampling_from_probs,
|
||||
top_k_mask_logits,
|
||||
|
||||
@@ -48,6 +48,16 @@ def moe_sum_reduce(
|
||||
)
|
||||
|
||||
|
||||
def moe_sum(
    input_tensor: torch.Tensor,
    output_tensor: torch.Tensor,
):
    """Sum MoE expert outputs into ``output_tensor`` via the fused kernel.

    Thin dispatch to the ``sgl_kernel.moe_sum`` custom op; the op writes its
    result into ``output_tensor`` in place and this wrapper returns ``None``.

    Args:
        input_tensor: Per-expert values to be summed (layout defined by the
            kernel — confirm against the C++ op registration).
        output_tensor: Destination tensor the kernel writes into.
    """
    op = torch.ops.sgl_kernel.moe_sum.default
    op(input_tensor, output_tensor)
|
||||
|
||||
|
||||
def moe_fused_gate(
|
||||
input_tensor,
|
||||
bias,
|
||||
|
||||
8
sgl-kernel/python/sgl_kernel/quantization/__init__.py
Normal file
8
sgl-kernel/python/sgl_kernel/quantization/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from .gguf import (
|
||||
ggml_dequantize,
|
||||
ggml_moe_a8,
|
||||
ggml_moe_a8_vec,
|
||||
ggml_moe_get_block_size,
|
||||
ggml_mul_mat_a8,
|
||||
ggml_mul_mat_vec_a8,
|
||||
)
|
||||
62
sgl-kernel/python/sgl_kernel/quantization/gguf.py
Normal file
62
sgl-kernel/python/sgl_kernel/quantization/gguf.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import torch
|
||||
|
||||
|
||||
def ggml_dequantize(
    weight: torch.Tensor, quant_type: int, M: int, N: int, dtype: torch.dtype
):
    """Dequantize a GGUF/GGML-quantized weight tensor.

    Thin wrapper over the ``sgl_kernel.ggml_dequantize`` custom op.

    Args:
        weight: GGUF-quantized weight data.
        quant_type: GGML quantization type id (presumably one of the
            ``GGML_TYPE_*`` values — confirm against the kernel side).
        M: Number of output rows; must be positive.
        N: Number of output columns; must be positive.
        dtype: Desired dtype of the dequantized output.

    Returns:
        The dequantized tensor produced by the kernel.

    Raises:
        ValueError: If ``M`` or ``N`` is not positive.
    """
    # Validate with an explicit exception instead of `assert`: asserts are
    # stripped under `python -O`, which would silently drop this check.
    if M <= 0 or N <= 0:
        raise ValueError("GGUF weight Input shape must be of positive dimensions")
    return torch.ops.sgl_kernel.ggml_dequantize.default(weight, quant_type, M, N, dtype)
|
||||
|
||||
|
||||
def ggml_mul_mat_vec_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-vector product with a GGUF-quantized weight and int8 activation.

    Direct dispatch to the ``sgl_kernel.ggml_mul_mat_vec_a8`` custom op.

    Args:
        weight: GGUF-quantized weight matrix.
        x: Activation input.
        quant_type: GGML quantization type id of ``weight``.
        row: Row count passed through to the kernel.

    Returns:
        The tensor produced by the kernel.
    """
    op = torch.ops.sgl_kernel.ggml_mul_mat_vec_a8.default
    return op(weight, x, quant_type, row)
|
||||
|
||||
|
||||
def ggml_mul_mat_a8(
    weight: torch.Tensor, x: torch.Tensor, quant_type: int, row: int
) -> torch.Tensor:
    """Matrix-matrix product with a GGUF-quantized weight and int8 activation.

    Direct dispatch to the ``sgl_kernel.ggml_mul_mat_a8`` custom op.

    Args:
        weight: GGUF-quantized weight matrix.
        x: Activation input.
        quant_type: GGML quantization type id of ``weight``.
        row: Row count passed through to the kernel.

    Returns:
        The tensor produced by the kernel.
    """
    op = torch.ops.sgl_kernel.ggml_mul_mat_a8.default
    return op(weight, x, quant_type, row)
|
||||
|
||||
|
||||
def ggml_moe_a8(
    input: torch.Tensor,
    weight: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_token_post_padded: torch.Tensor,
    type: int,
    row: int,
    topk: int,
    tokens: int,
) -> torch.Tensor:
    """MoE grouped matmul over GGUF-quantized expert weights (int8 activations).

    Direct dispatch to the ``sgl_kernel.ggml_moe_a8`` custom op.

    NOTE: the parameter names ``input`` and ``type`` shadow builtins; they are
    kept as-is because callers may pass them as keyword arguments.

    Args:
        input: Activation input.
        weight: GGUF-quantized expert weights.
        sorted_token_ids: Token ids grouped by expert (presumably produced by
            ``moe_align_block_size`` — confirm against callers).
        expert_ids: Expert index per block.
        num_token_post_padded: Padded token count tensor.
        type: GGML quantization type id of ``weight``.
        row: Row count passed through to the kernel.
        topk: Number of experts routed per token.
        tokens: Number of input tokens.

    Returns:
        The tensor produced by the kernel.
    """
    op = torch.ops.sgl_kernel.ggml_moe_a8.default
    return op(
        input,
        weight,
        sorted_token_ids,
        expert_ids,
        num_token_post_padded,
        type,
        row,
        topk,
        tokens,
    )
|
||||
|
||||
|
||||
def ggml_moe_a8_vec(
    input: torch.Tensor,
    weight: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    type: int,
    row: int,
    tokens: int,
) -> torch.Tensor:
    """Vectorized MoE matmul over GGUF-quantized expert weights.

    Direct dispatch to the ``sgl_kernel.ggml_moe_a8_vec`` custom op.

    NOTE: the parameter names ``input`` and ``type`` shadow builtins; they are
    kept as-is because callers may pass them as keyword arguments.

    Args:
        input: Activation input.
        weight: GGUF-quantized expert weights.
        topk_ids: Routed expert ids per token.
        top_k: Number of experts routed per token.
        type: GGML quantization type id of ``weight``.
        row: Row count passed through to the kernel.
        tokens: Number of input tokens.

    Returns:
        The tensor produced by the kernel.
    """
    op = torch.ops.sgl_kernel.ggml_moe_a8_vec.default
    return op(input, weight, topk_ids, top_k, type, row, tokens)
|
||||
|
||||
|
||||
def ggml_moe_get_block_size(type: int) -> int:
    """Return the MoE kernel block size for a GGML quantization type.

    Direct dispatch to the ``sgl_kernel.ggml_moe_get_block_size`` custom op.
    The parameter name ``type`` shadows the builtin but is kept for
    keyword-argument compatibility with existing callers.
    """
    op = torch.ops.sgl_kernel.ggml_moe_get_block_size.default
    return op(type)
|
||||
Reference in New Issue
Block a user