[4/n] decouple quantization implementation from vLLM dependency (#9191)

Co-authored-by: AniZpZ <aniz1905@gmail.com>
Co-authored-by: Yineng Zhang <me@zhyncs.com>
Hongbo Xu
2025-08-15 03:05:46 +08:00
committed by GitHub
parent 63d82a776a
commit 2cc9eeab01
8 changed files with 37 additions and 74 deletions

@@ -35,22 +35,18 @@ from sglang.srt.layers.quantization.utils import get_scalar_types, replace_param
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
 
-try:
-    from vllm import _custom_ops as ops
-
-    warnings.warn(
-        f"Using kernels directly from vllm. This might lead to performance degradation or "
-        f"missing functionalities as certain kernels may not be optimized. "
-    )
-except ImportError:
-    ops = None
 
 from sglang.srt.utils import is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 
 if _is_cuda:
-    from sgl_kernel import awq_dequantize, fused_marlin_moe
+    from sgl_kernel import (
+        awq_dequantize,
+        awq_marlin_moe_repack,
+        awq_marlin_repack,
+        fused_marlin_moe,
+    )
 elif _is_hip:
     from sglang.srt.layers.quantization.awq_triton import (
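
One behavioral consequence of the hunk above, not spelled out in the commit message: the old code swallowed a missing vLLM (`ops = None`) and only failed later with an AttributeError at the first repack call, while the new code imports the kernels by name from sgl_kernel and fails immediately at import time if the package is absent. A toy sketch of the two failure modes, using a deliberately nonexistent module name (`some_missing_backend` is hypothetical and assumed not to be installed):

# Hypothetical demonstration only: `some_missing_backend` stands in for
# vllm / sgl_kernel and is assumed NOT to be installed.

# Old pattern (removed by this commit): swallow the ImportError, defer the failure.
try:
    from some_missing_backend import _custom_ops as ops
except ImportError:
    ops = None

try:
    ops.awq_marlin_repack  # blows up far from the import, as an AttributeError
except AttributeError as exc:
    print(f"deferred failure at the call site: {exc}")

# New pattern: import the kernel by name and fail fast at import time.
try:
    from some_missing_backend import awq_marlin_repack
except ImportError as exc:
    print(f"immediate failure at import time: {exc}")
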
@@ -519,7 +515,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         layer.workspace = marlin_make_workspace(device)
 
         # Repack weights from AWQ format to marlin format.
-        marlin_qweight = ops.awq_marlin_repack(
+        marlin_qweight = awq_marlin_repack(
             layer.qweight,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
@@ -687,7 +683,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             requires_grad=False,
         )
 
-        marlin_w13_qweight = ops.awq_marlin_moe_repack(
+        marlin_w13_qweight = awq_marlin_moe_repack(
             layer.w13_qweight,
             layer.w13_g_idx_sort_indices,
             size_k=layer.w13_qweight.shape[1],
@@ -696,7 +692,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
         )
         replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
 
-        marlin_w2_qweight = ops.awq_marlin_moe_repack(
+        marlin_w2_qweight = awq_marlin_moe_repack(
             layer.w2_qweight,
             layer.w2_g_idx_sort_indices,
             size_k=layer.w2_qweight.shape[1],
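
The three call-site hunks above follow a single mechanical pattern: the `ops.` prefix is dropped because `awq_marlin_repack` and `awq_marlin_moe_repack` are now module-level names imported from sgl_kernel on CUDA. Read as straight code, the last hunk ends up roughly as below; only the arguments visible in the hunk are reproduced, and anything the diff view cuts off is left elided:

# Post-change call site as shown in the hunk; further keyword arguments,
# if any, are cut off by the diff view and intentionally not reproduced here.
marlin_w2_qweight = awq_marlin_moe_repack(
    layer.w2_qweight,
    layer.w2_g_idx_sort_indices,
    size_k=layer.w2_qweight.shape[1],
    # ...
)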