[Bugfix] Fix MTP profile-run error when the main model and the MTP model use different quantization (#4102)

### What this PR does / why we need it?

In PR https://github.com/vllm-project/vllm-ascend/pull/3420, we initially placed the quantization type (quant_type) in the MoECommMethod class. However, because MoECommMethod follows a singleton pattern, it could not accommodate models in which different layers use different quantization approaches (e.g., MTP modules using floating-point computation while the main model uses quantized computation). In this PR, we move the quantization type to the AscendFusedMoe class and pass it as a parameter to MoECommMethod.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

```bash
export HCCL_BUFFSIZE=1024
export VLLM_VERSION=0.11.0
vllm serve /home/data/DeepSeek-R1_w8a8/ \
  --data-parallel-size 2 \
  --tensor-parallel-size 8 \
  --enable-expert-parallel \
  --served-model-name dsv3 \
  --max-model-len 32768 \
  --max-num-batched-tokens 4096 \
  --max-num-seqs 16 \
  --quantization ascend \
  --trust-remote-code \
  --gpu-memory-utilization 0.9 \
  --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
```

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-11-13 11:02:31 +08:00
parent 17259cb265
commit 5093192769
6 changed files with 82 additions and 76 deletions
--- a/tests/ut/ops/test_moe_comm_method.py
+++ b/tests/ut/ops/test_moe_comm_method.py
@@ -7,6 +7,7 @@ from tests.ut.base import TestBase
 from vllm_ascend.ops.fused_moe.moe_comm_method import (AllGatherCommImpl,
                                                       AlltoAllCommImpl,
                                                       MC2CommImpl)
+from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType


 class TestMoECommMethod(TestBase):
@@ -67,7 +68,7 @@ class TestMoECommMethod(TestBase):

        # Verify prepare was called with correct arguments
        mock_pf_instance.prepare.assert_called_once_with(
-            hidden_states, router_logits, False, False)
+            hidden_states, router_logits, False, False, QuantType.NONE)

        # Test finalize method
        comm_impl.finalize(h_out,
@@ -115,7 +116,7 @@ class TestMoECommMethod(TestBase):

        # Verify prepare was called with correct arguments
        mock_pf_instance.prepare.assert_called_once_with(
-            hidden_states, router_logits, False, False)
+            hidden_states, router_logits, False, False, QuantType.NONE)

        # Test finalize method
        comm_impl.finalize(h_out,
@@ -165,7 +166,7 @@ class TestMoECommMethod(TestBase):

        # Verify prepare was called with correct arguments
        mock_pf_instance.prepare.assert_called_once_with(
-            hidden_states, router_logits, False, False)
+            hidden_states, router_logits, False, False, QuantType.NONE)

    @patch("vllm_ascend.ops.fused_moe.moe_comm_method.get_current_vllm_config")
    @patch("vllm_ascend.ops.fused_moe.moe_comm_method.get_forward_context")