[Bugfix] Fix PP+PCP and PP+flashcomm1 bugs (#5416)

- Fixed the computation of the final hidden_states when pipeline
parallel and prefill context parallel are enabled at the same time. Only
on the last PP rank are hidden_states required and of the right tensor type.
- Fixed the shape of intermediate_tensors in the dummy_run when pipeline
parallel and flashcomm1 are enabled. The intermediate_tensors should be
divided by tp_size; otherwise, the MoE layer will raise errors.
- Fixed the shape of self.intermediate_tensors so that the slice taken
from it is guaranteed to have sufficient space

- vLLM version: release/v0.13.0
- vLLM main:
81786c8774

---------

Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
This commit is contained in:
Jingchun Gao
2026-01-26 16:53:07 +08:00
committed by GitHub
parent 7d119df2a9
commit b390e0ef78

View File

@@ -1033,8 +1033,10 @@ class NPUModelRunner(GPUModelRunner):
hidden_states, IntermediateTensors):
hidden_states = self._all_gather_hidden_states_and_aux(
hidden_states)
return hidden_states if self.pcp_size == 1 else self.pcp_manager.get_restore_hidden_states(
hidden_states)
if self.pcp_size > 1 and get_pp_group().is_last_rank:
hidden_states = self.pcp_manager.get_restore_hidden_states(
hidden_states)
return hidden_states
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
num_valid_tokens):
@@ -2071,19 +2073,24 @@ class NPUModelRunner(GPUModelRunner):
else:
# When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
# otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
actual_tokens = num_tokens
intermediate_tokens = num_tokens_padded
if enable_sp():
tp_size = get_tensor_model_parallel_world_size()
actual_tokens = num_tokens // tp_size
intermediate_tokens = (num_tokens_padded + tp_size -
1) // tp_size
if self.intermediate_tensors is None:
max_actual_tokens = self.max_num_tokens
if enable_sp():
max_actual_tokens = (self.max_num_tokens + tp_size -
1) // tp_size
self.intermediate_tensors = (
self.model.make_empty_intermediate_tensors(
batch_size=actual_tokens,
batch_size=max_actual_tokens,
dtype=self.dtype,
device=self.device))
intermediate_tensors = IntermediateTensors({
k:
v[:num_tokens_padded]
v[:intermediate_tokens]
for k, v in self.intermediate_tensors.items()
})