From b3fa5dc3c8934b5697648606b8f4fbc522f7b932 Mon Sep 17 00:00:00 2001 From: Kyungmin Lee <30465912+lkm2835@users.noreply.github.com> Date: Wed, 2 Jul 2025 14:34:43 +0900 Subject: [PATCH] Fix GPTQMarlinMoE (#7697) --- python/sglang/srt/layers/quantization/gptq.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py index b032ff4c1..9e2b3e063 100644 --- a/python/sglang/srt/layers/quantization/gptq.py +++ b/python/sglang/srt/layers/quantization/gptq.py @@ -344,6 +344,10 @@ class GPTQMarlinConfig(QuantizationConfig): if (num_bits, sym) not in cls.TYPE_MAP: return False + assert ( + VLLM_AVAILABLE + ), "vllm is not installed, to use gptq_marlin, please install vllm" + return check_marlin_supported( quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size ) @@ -726,6 +730,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): g_idx2=layer.w2_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, sort_indices2=layer.w2_g_idx_sort_indices, - num_bits=self.quant_config.quant_type.size_bits, + quant_type_id=self.quant_config.quant_type.id, is_k_full=self.is_k_full, ).to(orig_dtype)