[4/n] decouple quantization implementation from vLLM dependency (#9191)
Co-authored-by: AniZpZ <aniz1905@gmail.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
This commit is contained in:
@@ -35,22 +35,18 @@ from sglang.srt.layers.quantization.utils import get_scalar_types, replace_param
|
||||
if TYPE_CHECKING:
|
||||
from sglang.srt.layers.moe.topk import TopKOutput
|
||||
|
||||
try:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
warnings.warn(
|
||||
f"Using kernels directly from vllm. This might lead to performance degradation or "
|
||||
f"missing functionalities as certain kernels may not be optimized. "
|
||||
)
|
||||
except ImportError:
|
||||
ops = None
|
||||
|
||||
from sglang.srt.utils import is_cuda, is_hip
|
||||
|
||||
_is_cuda = is_cuda()
|
||||
_is_hip = is_hip()
|
||||
if _is_cuda:
|
||||
from sgl_kernel import awq_dequantize, fused_marlin_moe
|
||||
from sgl_kernel import (
|
||||
awq_dequantize,
|
||||
awq_marlin_moe_repack,
|
||||
awq_marlin_repack,
|
||||
fused_marlin_moe,
|
||||
)
|
||||
|
||||
|
||||
elif _is_hip:
|
||||
from sglang.srt.layers.quantization.awq_triton import (
|
||||
@@ -519,7 +515,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
|
||||
layer.workspace = marlin_make_workspace(device)
|
||||
|
||||
# Repack weights from AWQ format to marlin format.
|
||||
marlin_qweight = ops.awq_marlin_repack(
|
||||
marlin_qweight = awq_marlin_repack(
|
||||
layer.qweight,
|
||||
size_k=layer.input_size_per_partition,
|
||||
size_n=layer.output_size_per_partition,
|
||||
@@ -687,7 +683,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
requires_grad=False,
|
||||
)
|
||||
|
||||
marlin_w13_qweight = ops.awq_marlin_moe_repack(
|
||||
marlin_w13_qweight = awq_marlin_moe_repack(
|
||||
layer.w13_qweight,
|
||||
layer.w13_g_idx_sort_indices,
|
||||
size_k=layer.w13_qweight.shape[1],
|
||||
@@ -696,7 +692,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
|
||||
|
||||
marlin_w2_qweight = ops.awq_marlin_moe_repack(
|
||||
marlin_w2_qweight = awq_marlin_moe_repack(
|
||||
layer.w2_qweight,
|
||||
layer.w2_g_idx_sort_indices,
|
||||
size_k=layer.w2_qweight.shape[1],
|
||||
|
||||
Reference in New Issue
Block a user