From 9f7de45b752bdc3e2427db99838cf12c88fc0b06 Mon Sep 17 00:00:00 2001 From: Nagisa125 <166619298+Nagisa125@users.noreply.github.com> Date: Fri, 31 Oct 2025 14:34:28 +0800 Subject: [PATCH] [Bugfix] fix MTP support for lmhead_tensor_parallel_size (#3921) ### What this PR does / why we need it? Fix the issue of MTP being enabled and setting lmhead_tensor_parallel_size=16 causing the inference to hang. Signed-off-by: wyh145 <1987244901@qq.com> --- vllm_ascend/ops/vocab_parallel_embedding.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/ops/vocab_parallel_embedding.py b/vllm_ascend/ops/vocab_parallel_embedding.py index 69be390..a89c228 100644 --- a/vllm_ascend/ops/vocab_parallel_embedding.py +++ b/vllm_ascend/ops/vocab_parallel_embedding.py @@ -51,7 +51,7 @@ class AscendVocabParallelEmbedding(VocabParallelEmbedding): prefix: str = ""): nn.Module.__init__(self) - if lmhead_tp_enable() and prefix.find("lm_head") != -1: + if lmhead_tp_enable() and prefix.find("head") != -1: self.comm_group = get_lmhead_tp_group() else: self.comm_group = get_tp_group() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d0237b5..bd76756 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2516,7 +2516,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor) if need_dummy_logits: - dummy_compute_logits(hidden_states) + self.drafter.model.compute_logits( + hidden_states[dummy_indices]) if self.in_profile_run and self.dynamic_eplb: self.model.clear_all_moe_loads() if not self.in_profile_run and self.dynamic_eplb: