[V1][BUGFIX][0.10.1] FIX mtp on main branch (#2632)

### What this PR does / why we need it? Fixes the MTP torchair bug caused by the torchair refactor and the MoE refactor. Depends on PRs: fused moe fix: https://github.com/vllm-project/vllm-ascend/pull/2627 torchair multi DP fix: https://github.com/vllm-project/vllm-ascend/pull/2626 ### Does this PR introduce _any_ user-facing change? When DP is enabled, running the MTP online server requires disabling the server's stats logging with `--disable-log-stats`, because the current metrics do not support multi-DP. ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: 7c8271cd1e Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-02 11:12:41 +08:00
parent fef18b60bc
commit 214b32a346
4 changed files with 125 additions and 4 deletions
--- a/vllm_ascend/worker/mtp_proposer_v1.py
+++ b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -18,6 +18,8 @@ from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
+from vllm_ascend.torchair.models.torchair_deepseek_mtp import \
+    TorchairDeepSeekMTP
 from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable

@@ -266,8 +268,12 @@ class MtpProposer:
        with set_default_torch_dtype(
                draft_model_config.dtype), set_current_vllm_config(
                    self.vllm_config):
-            self.model = CustomDeepSeekMTP(
-                vllm_config=self.vllm_config).to(target_device)
+            if self.torchair_graph_enabled:
+                self.model = TorchairDeepSeekMTP(
+                    vllm_config=self.vllm_config).to(target_device)
+            else:
+                self.model = CustomDeepSeekMTP(
+                    vllm_config=self.vllm_config).to(target_device)

        draft_attn_layer_names = (
            get_layers_from_vllm_config(self.vllm_config, Attention).keys() -