[Bugfix] Fix MTP profile-run error when the main model and the MTP model use different quantization (#4102)

### What this PR does / why we need it?

In PR https://github.com/vllm-project/vllm-ascend/pull/3420, the quantization type (`quant_type`) was initially stored in the `MoECommMethod` class. However, because `MoECommMethod` follows a singleton pattern, it could not accommodate models whose layers use different quantization approaches (e.g., MTP modules using floating-point computation while the main model uses quantized computation). This PR moves the quantization type to the `AscendFusedMoE` class and passes it to `MoECommMethod` as a parameter.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

```bash
export HCCL_BUFFSIZE=1024
export VLLM_VERSION=0.11.0
vllm serve /home/data/DeepSeek-R1_w8a8/ \
    --data-parallel-size 2 \
    --tensor-parallel-size 8 \
    --enable-expert-parallel \
    --served-model-name dsv3 \
    --max-model-len 32768 \
    --max-num-batched-tokens 4096 \
    --max-num-seqs 16 \
    --quantization ascend \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
```

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-11-13 11:02:31 +08:00
parent 17259cb265
commit 5093192769
6 changed files with 82 additions and 76 deletions
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -37,6 +37,11 @@ from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.ops.fused_moe.moe_comm_method import setup_moe_comm_method
+from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType
+from vllm_ascend.quantization.w4a8_dynamic import \
+    AscendW4A8DynamicFusedMoEMethod
+from vllm_ascend.quantization.w8a8_dynamic import \
+    AscendW8A8DynamicFusedMoEMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
                               is_enable_nz, npu_stream_switch,
                               shared_expert_dp_enabled,
@@ -289,7 +294,23 @@ class AscendFusedMoE(FusedMoE):

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

-        setup_moe_comm_method(self.moe_config, self.quant_method)
+        setup_moe_comm_method(self.moe_config)
+        self.quant_type = self._get_quant_type()
+
+    def _get_quant_type(self) -> QuantType:  # QuantType for this layer
+        quant_method = self.quant_method
+        if not hasattr(quant_method,
+                       "quant_method") or quant_method.quant_method is None:
+            return QuantType.NONE  # no inner quant_method to inspect
+
+        method = quant_method.quant_method  # unwrap the inner method object
+
+        if isinstance(method, AscendW8A8DynamicFusedMoEMethod):
+            return QuantType.W8A8
+        elif isinstance(method, AscendW4A8DynamicFusedMoEMethod):
+            return QuantType.W4A8
+        else:
+            return QuantType.NONE  # any other method type maps to NONE

    def update_expert_map(self, new_expert_map):
        self.expert_map = new_expert_map
@@ -334,7 +355,8 @@ class AscendFusedMoE(FusedMoE):
            hidden_states=hidden_states,
            router_logits=router_logits,
            replace_allreduce=forward_context.sp_enabled,
-            enable_shared_expert_dp=self.enable_shared_expert_dp)
+            enable_shared_expert_dp=self.enable_shared_expert_dp,
+            quant_type=self.quant_type)

        if isinstance(hidden_states, tuple):
            hidden_states, pertoken_scale = hidden_states