[Bugfix] Fix MTP support for lmhead_tensor_parallel_size (#3915)

### What this PR does / why we need it?
Fix an issue where enabling MTP and setting
lmhead_tensor_parallel_size=16 caused inference to hang.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19
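
For context, a minimal reproduction sketch. The exact config keys (`lmhead_tensor_parallel_size` under `additional_config`, the `deepseek_mtp` speculative method) are assumptions based on typical vllm-ascend usage, not taken from this PR; verify against your version:

```python
# Hypothetical reproduction; key names are assumptions and should be
# checked against your vllm-ascend version.
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # any MTP-capable model
    tensor_parallel_size=16,
    # Enable multi-token-prediction (MTP) speculative decoding.
    speculative_config={"method": "deepseek_mtp",
                        "num_speculative_tokens": 1},
    # Route the LM head through its own tensor-parallel group.
    additional_config={"lmhead_tensor_parallel_size": 16},
)
# Before this fix, generation hung because the MTP drafter's LM head
# did not join the lmhead tensor-parallel communication group.
outputs = llm.generate(["Hello"])
```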

Signed-off-by: wyh145 <1987244901@qq.com>
Author: Nagisa125
Date: 2025-10-31 10:30:28 +08:00
Committed by: GitHub
Parent: 1966885be2
Commit: 6764777f00
2 changed files with 3 additions and 2 deletions


@@ -51,7 +51,7 @@ class AscendVocabParallelEmbedding(VocabParallelEmbedding):
                  prefix: str = ""):
         nn.Module.__init__(self)
-        if lmhead_tp_enable() and prefix.find("lm_head") != -1:
+        if lmhead_tp_enable() and prefix.find("head") != -1:
             self.comm_group = get_lmhead_tp_group()
         else:
             self.comm_group = get_tp_group()
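
Why this hunk fixes the hang (a hedged reading, since the PR body does not spell it out): the MTP drafter's output head appears to be registered under a prefix such as `shared_head.head` rather than `lm_head`, so the old substring check left it on the default TP group while the main LM head used the lmhead TP group, and their collectives deadlocked. A minimal sketch of the relaxed check; the example module paths are assumptions:

```python
# Sketch of the prefix check from the hunk above. The MTP module path
# ("...shared_head.head") is an assumption for illustration.
def uses_lmhead_tp_group(prefix: str) -> bool:
    # Old: prefix.find("lm_head") != -1 -> only the main head matched.
    # New: any "*head*" module is routed to the lmhead TP group.
    return prefix.find("head") != -1

assert uses_lmhead_tp_group("lm_head")                           # main model head
assert uses_lmhead_tp_group("model.layers.61.shared_head.head")  # MTP drafter (assumed name)
assert not uses_lmhead_tp_group("model.embed_tokens")            # embeddings unaffected
```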


@@ -2913,7 +2913,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                     aclgraph_runtime_mode=aclgraph_runtime_mode,
                     batch_descriptor=batch_descriptor)
                 if need_dummy_logits:
-                    dummy_compute_logits(hidden_states)
+                    self.drafter.model.compute_logits(
+                        hidden_states[dummy_indices])
             if self.in_profile_run and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if not self.in_profile_run and self.dynamic_eplb:
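
The second hunk keeps the dummy run's collectives symmetric. A hedged reading: `dummy_compute_logits` presumably ran the target model's LM head, while a real MTP draft step computes logits with the drafter's head; under lmhead tensor parallelism every rank must issue the same collectives, so the dummy pass now calls the drafter's head directly. A toy, runnable sketch of that invariant; the stub class and tensors are stand-ins, not vllm-ascend APIs:

```python
import torch

class StubDrafterHead:
    """Stands in for self.drafter.model; the real compute_logits issues
    collectives over the lmhead TP group, so every rank must call it the
    same number of times per dummy run or the group deadlocks."""

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return hidden_states @ torch.zeros(hidden_states.shape[-1], 8)

drafter_head = StubDrafterHead()
hidden_states = torch.zeros(4, 16)
dummy_indices = torch.tensor([0])
need_dummy_logits = True

if need_dummy_logits:
    # The fix: run the drafter's own head on the dummy rows, mirroring a
    # real MTP draft step, instead of the target model's head.
    logits = drafter_head.compute_logits(hidden_states[dummy_indices])
    assert logits.shape == (1, 8)
```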