[Bugfix] Fix PP+PCP and PP+flashcomm1 bugs (#5416)
- Fixed the computation of the final hidden_states when pipeline
parallel (PP) and prefill context parallel (PCP) are enabled at the same
time. The hidden_states are only required — and only have the correct
tensor type — on the last PP rank.
- Fixed the shape of intermediate_tensors in dummy_run when pipeline
parallel and flashcomm1 are enabled. The intermediate token count should be
divided by tp_size; otherwise, the MoE layer will raise errors.
- Fixed the shape of self.intermediate_tensors so that it provides
sufficient slice space.
- vLLM version: release/v0.13.0
- vLLM main:
81786c8774
---------
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
This commit is contained in:
@@ -1033,8 +1033,10 @@ class NPUModelRunner(GPUModelRunner):
                 hidden_states, IntermediateTensors):
             hidden_states = self._all_gather_hidden_states_and_aux(
                 hidden_states)
-        return hidden_states if self.pcp_size == 1 else self.pcp_manager.get_restore_hidden_states(
-            hidden_states)
+        if self.pcp_size > 1 and get_pp_group().is_last_rank:
+            hidden_states = self.pcp_manager.get_restore_hidden_states(
+                hidden_states)
+        return hidden_states

     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
@@ -2071,19 +2073,24 @@ class NPUModelRunner(GPUModelRunner):
         else:
+            # When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
+            # otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
             actual_tokens = num_tokens
+            intermediate_tokens = num_tokens_padded
             if enable_sp():
                 tp_size = get_tensor_model_parallel_world_size()
                 actual_tokens = num_tokens // tp_size
+                intermediate_tokens = (num_tokens_padded + tp_size -
+                                       1) // tp_size
             if self.intermediate_tensors is None:
+                max_actual_tokens = self.max_num_tokens
+                if enable_sp():
+                    max_actual_tokens = (self.max_num_tokens + tp_size -
+                                         1) // tp_size
                 self.intermediate_tensors = (
                     self.model.make_empty_intermediate_tensors(
-                        batch_size=actual_tokens,
+                        batch_size=max_actual_tokens,
                         dtype=self.dtype,
                         device=self.device))
             intermediate_tensors = IntermediateTensors({
                 k:
-                v[:num_tokens_padded]
+                v[:intermediate_tokens]
                 for k, v in self.intermediate_tensors.items()
             })

Reference in New Issue
Block a user