diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index d3583975d..132a0c31f 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -207,15 +207,11 @@ class FusedMoE(torch.nn.Module): gemm1_clamp_limit=gemm1_clamp_limit, ) - if quant_config is None: - self.quant_method: FusedMoEMethodBase = UnquantizedFusedMoEMethod( - self.use_triton_kernels - ) - else: - self.quant_method: FusedMoEMethodBase = quant_config.get_quant_method( - self, prefix - ) - assert self.quant_method is not None + self.quant_method: Optional[FusedMoEMethodBase] = None + if quant_config is not None: + self.quant_method = quant_config.get_quant_method(self, prefix) + if self.quant_method is None: + self.quant_method = UnquantizedFusedMoEMethod(self.use_triton_kernels) self.quant_method.create_weights( layer=self, diff --git a/python/sglang/srt/layers/quantization/quark/quark.py b/python/sglang/srt/layers/quantization/quark/quark.py index 6d5a66544..d0fbe74ef 100644 --- a/python/sglang/srt/layers/quantization/quark/quark.py +++ b/python/sglang/srt/layers/quantization/quark/quark.py @@ -65,7 +65,9 @@ class QuarkConfig(QuantizationConfig): if should_ignore_layer( prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping ): - return UnquantizedLinearMethod() + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + return None if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix)