From b58226510f7d568251d847b5dc63c05bf84e297c Mon Sep 17 00:00:00 2001
From: AniZpZ
Date: Thu, 3 Jul 2025 16:52:44 +0800
Subject: [PATCH] fix dsv3 fused proj check (#7738)

---
 python/sglang/srt/models/deepseek_v2.py | 29 +++++++++++++------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index d4b2d72f7..8e9c4c496 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -336,10 +336,12 @@ class DeepseekV2MoE(nn.Module):
                 else {}
             ),
         )
-        is_packed_weight = (
-            self.shared_experts.gate_up_proj.quant_method.quant_config.get_name()
-            in ["awq", "moe_wna16"]
-        )
+        is_packed_weight = hasattr(
+            self.shared_experts.gate_up_proj.quant_method, "quant_config"
+        ) and self.shared_experts.gate_up_proj.quant_method.quant_config.get_name() in {
+            "awq",
+            "moe_wna16",
+        }
         self.shared_experts_is_int8 = (
             not is_packed_weight
             and self.shared_experts.gate_up_proj.weight.dtype == torch.int8
@@ -891,21 +893,20 @@ class DeepseekV2AttentionMLA(nn.Module):
         # If we have self.fused_qkv_a_proj_with_mqa and we're running on CPU, we will choose the torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight kernel
         # which requires self.w_kc and self.w_vc to be packed.
         # If not, we will use torch.bmm and weight shouldn't be packed in this case
-        if (
-            hasattr(self, "fused_qkv_a_proj_with_mqa")
-            and _is_cpu
-            and _is_cpu_amx_available
-        ):
+        has_fused_proj = hasattr(self, "fused_qkv_a_proj_with_mqa")
+        if has_fused_proj and _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(
                 weight_names=["w_kc", "w_vc"], transpose_dims=[[1, 2], [1, 2]]
             )

         is_packed_weight = (
-            self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.get_name()
-            in ["awq", "moe_wna16"]
+            has_fused_proj
+            and hasattr(self.fused_qkv_a_proj_with_mqa.quant_method, "quant_config")
+            and self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.get_name()
+            in {"awq", "moe_wna16"}
         )
         self.use_min_latency_fused_a_gemm = (
-            hasattr(self, "fused_qkv_a_proj_with_mqa")
+            has_fused_proj
             and not is_packed_weight
             and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.bfloat16
             and self.fused_qkv_a_proj_with_mqa.weight.shape[0] == 2112
@@ -915,12 +916,12 @@ class DeepseekV2AttentionMLA(nn.Module):
         )

         self.qkv_proj_with_rope_is_int8 = (
-            hasattr(self, "fused_qkv_a_proj_with_mqa")
+            has_fused_proj
             and not is_packed_weight
             and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.int8
         )
         self.qkv_proj_with_rope_is_fp8 = (
-            hasattr(self, "fused_qkv_a_proj_with_mqa")
+            has_fused_proj
             and not is_packed_weight
             and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.float8_e4m3fn
         )
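
Note (not part of the patch): the old checks called quant_config.get_name() unconditionally,
which raises AttributeError when a layer's quant_method carries no quant_config (as can happen
for an unquantized layer), and the attention-side check also assumed fused_qkv_a_proj_with_mqa
exists. Below is a minimal, self-contained Python sketch of the guarded pattern the patch adopts.
The class and function names (UnquantizedMethod, AWQMethod, is_packed_weight) are illustrative
stand-ins, not sglang's real API.

    class UnquantizedMethod:
        """Stand-in for the quant method of an unquantized layer: no quant_config attribute."""


    class AWQConfig:
        """Stand-in quant config whose get_name() mimics AWQ."""

        def get_name(self) -> str:
            return "awq"


    class AWQMethod:
        """Stand-in quant method for an AWQ-quantized layer."""

        def __init__(self) -> None:
            self.quant_config = AWQConfig()


    def is_packed_weight(quant_method) -> bool:
        # Patched behaviour: only consult quant_config when it actually exists,
        # so layers without one simply report "not packed" instead of raising.
        return hasattr(quant_method, "quant_config") and quant_method.quant_config.get_name() in {
            "awq",
            "moe_wna16",
        }


    assert is_packed_weight(AWQMethod()) is True
    assert is_packed_weight(UnquantizedMethod()) is False  # the pre-patch check would raise here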