[V1][BUGFIX][0.10.1] FIX mtp on main branch (#2632)

### What this PR does / why we need it?
Fix the MTP torchair bug caused by the torchair refactor and the MoE refactor

Depends on PRs:
fused moe fix: https://github.com/vllm-project/vllm-ascend/pull/2627 
torchair multi DP fix:
https://github.com/vllm-project/vllm-ascend/pull/2626

### Does this PR introduce _any_ user-facing change?
When DP is enabled, running the MTP online server requires disabling server log stats with `--disable-log-stats`, because the current metrics implementation does not support multi-DP.
### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
7c8271cd1e

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
This commit is contained in:
xuyexiong
2025-09-02 11:12:41 +08:00
committed by GitHub
parent fef18b60bc
commit 214b32a346
4 changed files with 125 additions and 4 deletions

View File

@@ -23,6 +23,8 @@ from pytest_mock import MockerFixture
from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
from vllm_ascend.ascend_forward_context import _get_fused_moe_state
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.quantization.quantizer import W8A8Quantizer
from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402
@@ -233,12 +235,28 @@ class TestTorchairAscendFusedMoe:
mock_quant_config = MagicMock()
mock_quant_method = MockFusedMoEMethod()
mock_quant_config.get_quant_method.return_value = mock_quant_method
mock_quant_config.is_layer_skipped_ascend.return_value = False
with patch(
'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer',
return_value=W8A8Quantizer):
moe = TorchairAscendFusedMoE(**default_moe_config,
quant_config=mock_quant_config)
assert moe.quant_method is not None
assert isinstance(moe.quant_method, AscendFusedMoEMethod)
def test_init_with_mixed_quant(self, mock_dist_env, default_moe_config):
    """A layer flagged as skipped by the quant config keeps the method the
    config hands back, and that method is the unquantized torchair one.

    NOTE(review): both asserts passing implies MockFusedMoEMethod is (or
    mocks as) a TorchairAscendUnquantizedFusedMoEMethod — confirm against
    the class definitions earlier in this file.
    """
    # Quant config that reports every layer as skipped (mixed-quant case).
    quant_config = MagicMock()
    skipped_method = MockFusedMoEMethod()
    quant_config.get_quant_method.return_value = skipped_method
    quant_config.is_layer_skipped_ascend.return_value = True

    moe = TorchairAscendFusedMoE(**default_moe_config,
                                 quant_config=quant_config)

    # The layer must adopt exactly the method the config returned...
    assert moe.quant_method is not None
    assert moe.quant_method == skipped_method
    # ...and that method must be the unquantized torchair implementation.
    assert isinstance(moe.quant_method,
                      TorchairAscendUnquantizedFusedMoEMethod)
@pytest.mark.parametrize(
"others_param",