[Bugfix] Fix MTP support for lmhead_tensor_parallel_size (#3915)
### What this PR does / why we need it?
Fix the hang that occurred during inference when MTP is enabled together with
lmhead_tensor_parallel_size=16.
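Reading from the diff below, two changes appear to be involved: the prefix match in `AscendVocabParallelEmbedding` is loosened from `"lm_head"` to `"head"`, presumably so the MTP drafter's head module joins the same lmhead tensor-parallel group as the main model's `lm_head`, and the dummy-run path now invokes the drafter's `compute_logits`, so every rank enters the matching collective instead of blocking.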
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: wyh145 <1987244901@qq.com>
```diff
@@ -51,7 +51,7 @@ class AscendVocabParallelEmbedding(VocabParallelEmbedding):
                  prefix: str = ""):
         nn.Module.__init__(self)
 
-        if lmhead_tp_enable() and prefix.find("lm_head") != -1:
+        if lmhead_tp_enable() and prefix.find("head") != -1:
             self.comm_group = get_lmhead_tp_group()
         else:
             self.comm_group = get_tp_group()
```
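A minimal sketch of what the loosened substring match changes (not the vllm-ascend source; the drafter prefix shown is a hypothetical example):

```python
# Toy sketch of the prefix test in AscendVocabParallelEmbedding; the MTP
# drafter prefix below is a hypothetical example, not taken from vLLM.
def joins_lmhead_tp_group(prefix: str, needle: str) -> bool:
    return prefix.find(needle) != -1  # the test the diff changes

main_lm_head = "lm_head"
mtp_drafter_head = "model.layers.61.shared_head.head"  # hypothetical

# Before the fix: only the main model's head matched "lm_head", so the
# drafter's head fell back to the default TP group.
assert joins_lmhead_tp_group(main_lm_head, "lm_head")
assert not joins_lmhead_tp_group(mtp_drafter_head, "lm_head")

# After the fix: both heads match "head" and share the lmhead TP group.
assert joins_lmhead_tp_group(main_lm_head, "head")
assert joins_lmhead_tp_group(mtp_drafter_head, "head")
```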
```diff
@@ -2913,7 +2913,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
                 batch_descriptor=batch_descriptor)
             if need_dummy_logits:
-                dummy_compute_logits(hidden_states)
+                self.drafter.model.compute_logits(
+                    hidden_states[dummy_indices])
             if self.in_profile_run and self.dynamic_eplb:
                 self.model.clear_all_moe_loads()
             if not self.in_profile_run and self.dynamic_eplb:
```
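Why a skipped logits call can hang the whole group: collective ops block until every rank in the communication group joins them. A self-contained toy (gloo backend, CPU only; not vLLM code) that reproduces the pattern:

```python
# Toy reproduction of the hang pattern: an all_gather blocks until every
# rank in the group calls it, so a rank that skips its logits collective
# leaves the other ranks waiting forever. Names here are illustrative.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank: int, world_size: int, rank1_skips: bool):
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29501",
                            rank=rank, world_size=world_size)
    out = [torch.empty(1) for _ in range(world_size)]
    if not (rank1_skips and rank == 1):
        dist.all_gather(out, torch.tensor([float(rank)]))
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(2, False), nprocs=2)    # all ranks join: completes
    # mp.spawn(worker, args=(2, True), nprocs=2)   # rank 1 skips: rank 0 blocks forever
```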