diff --git a/vllm_ascend/ops/vocab_parallel_embedding.py b/vllm_ascend/ops/vocab_parallel_embedding.py index 69be390..a89c228 100644 --- a/vllm_ascend/ops/vocab_parallel_embedding.py +++ b/vllm_ascend/ops/vocab_parallel_embedding.py @@ -51,7 +51,7 @@ class AscendVocabParallelEmbedding(VocabParallelEmbedding): prefix: str = ""): nn.Module.__init__(self) - if lmhead_tp_enable() and prefix.find("lm_head") != -1: + if lmhead_tp_enable() and prefix.find("head") != -1: self.comm_group = get_lmhead_tp_group() else: self.comm_group = get_tp_group() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d0237b5..bd76756 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2516,7 +2516,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor) if need_dummy_logits: - dummy_compute_logits(hidden_states) + self.drafter.model.compute_logits( + hidden_states[dummy_indices]) if self.in_profile_run and self.dynamic_eplb: self.model.clear_all_moe_loads() if not self.in_profile_run and self.dynamic_eplb: