[4/n] decouple quantization implementation from vLLM dependency (#9191)

Co-authored-by: AniZpZ <aniz1905@gmail.com>
Co-authored-by: Yineng Zhang <me@zhyncs.com>
Hongbo Xu
2025-08-15 03:05:46 +08:00
committed by GitHub
parent 63d82a776a
commit 2cc9eeab01
8 changed files with 37 additions and 74 deletions

@@ -35,22 +35,18 @@ from sglang.srt.layers.quantization.utils import get_scalar_types, replace_param
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
 
-try:
-    from vllm import _custom_ops as ops
-
-    warnings.warn(
-        f"Using kernels directly from vllm. This might lead to performance degradation or "
-        f"missing functionalities as certain kernels may not be optimized. "
-    )
-except ImportError:
-    ops = None
 
 from sglang.srt.utils import is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 
 if _is_cuda:
-    from sgl_kernel import awq_dequantize, fused_marlin_moe
+    from sgl_kernel import (
+        awq_dequantize,
+        awq_marlin_moe_repack,
+        awq_marlin_repack,
+        fused_marlin_moe,
+    )
 elif _is_hip:
     from sglang.srt.layers.quantization.awq_triton import (
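
One behavioral consequence of the hunk above, not spelled out in the commit message: the old code swallowed a missing vLLM (`ops = None`) and only failed later with an AttributeError at the first repack call, while the new code imports the kernels by name from sgl_kernel and fails immediately at import time if the package is absent. A toy sketch of the two failure modes, using a deliberately nonexistent module name (`some_missing_backend` is hypothetical and assumed not to be installed):

# Hypothetical demonstration only: `some_missing_backend` stands in for
# vllm / sgl_kernel and is assumed NOT to be installed.

# Old pattern (removed by this commit): swallow the ImportError, defer the failure.
try:
    from some_missing_backend import _custom_ops as ops
except ImportError:
    ops = None

try:
    ops.awq_marlin_repack  # blows up far from the import, as an AttributeError
except AttributeError as exc:
    print(f"deferred failure at the call site: {exc}")

# New pattern: import the kernel by name and fail fast at import time.
try:
    from some_missing_backend import awq_marlin_repack
except ImportError as exc:
    print(f"immediate failure at import time: {exc}")
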
@@ -519,7 +515,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         layer.workspace = marlin_make_workspace(device)
 
         # Repack weights from AWQ format to marlin format.
-        marlin_qweight = ops.awq_marlin_repack(
+        marlin_qweight = awq_marlin_repack(
             layer.qweight,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
@@ -687,7 +683,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             requires_grad=False,
         )
 
-        marlin_w13_qweight = ops.awq_marlin_moe_repack(
+        marlin_w13_qweight = awq_marlin_moe_repack(
             layer.w13_qweight,
             layer.w13_g_idx_sort_indices,
             size_k=layer.w13_qweight.shape[1],
@@ -696,7 +692,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
         )
         replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
 
-        marlin_w2_qweight = ops.awq_marlin_moe_repack(
+        marlin_w2_qweight = awq_marlin_moe_repack(
             layer.w2_qweight,
             layer.w2_g_idx_sort_indices,
             size_k=layer.w2_qweight.shape[1],
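
The three call-site hunks above follow a single mechanical pattern: the `ops.` prefix is dropped because `awq_marlin_repack` and `awq_marlin_moe_repack` are now module-level names imported from sgl_kernel on CUDA. Read as straight code, the last hunk ends up roughly as below; only the arguments visible in the hunk are reproduced, and anything the diff view cuts off is left elided:

# Post-change call site as shown in the hunk; further keyword arguments,
# if any, are cut off by the diff view and intentionally not reproduced here.
marlin_w2_qweight = awq_marlin_moe_repack(
    layer.w2_qweight,
    layer.w2_g_idx_sort_indices,
    size_k=layer.w2_qweight.shape[1],
    # ...
)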