[Bugfix] Fix PP+PCP and PP+flashcomm1 bugs (#5416)
- Fixed the computation of the final hidden_states when pipeline parallel
(PP) and prefill context parallel (PCP) are enabled at the same time. Only
on the last PP rank are hidden_states required and of the right tensor type.
- Fixed the shape of intermediate_tensors in dummy_run when pipeline
parallel and flashcomm1 are enabled together. The number of tokens in
intermediate_tensors should be divided by tp_size (rounded up); otherwise,
the MoE will run into issues.
- Fixed the shape of self.intermediate_tensors (now allocated from
max_num_tokens) so that it has sufficient space for slicing.
- vLLM version: release/v0.13.0
- vLLM main:
81786c8774
---------
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
This commit is contained in:
@@ -1033,8 +1033,10 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
hidden_states, IntermediateTensors):
|
hidden_states, IntermediateTensors):
|
||||||
hidden_states = self._all_gather_hidden_states_and_aux(
|
hidden_states = self._all_gather_hidden_states_and_aux(
|
||||||
hidden_states)
|
hidden_states)
|
||||||
return hidden_states if self.pcp_size == 1 else self.pcp_manager.get_restore_hidden_states(
|
if self.pcp_size > 1 and get_pp_group().is_last_rank:
|
||||||
|
hidden_states = self.pcp_manager.get_restore_hidden_states(
|
||||||
hidden_states)
|
hidden_states)
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
|
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
|
||||||
num_valid_tokens):
|
num_valid_tokens):
|
||||||
@@ -2071,19 +2073,24 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
else:
|
else:
|
||||||
# When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
|
# When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
|
||||||
# otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
|
# otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
|
||||||
actual_tokens = num_tokens
|
intermediate_tokens = num_tokens_padded
|
||||||
if enable_sp():
|
if enable_sp():
|
||||||
tp_size = get_tensor_model_parallel_world_size()
|
tp_size = get_tensor_model_parallel_world_size()
|
||||||
actual_tokens = num_tokens // tp_size
|
intermediate_tokens = (num_tokens_padded + tp_size -
|
||||||
|
1) // tp_size
|
||||||
if self.intermediate_tensors is None:
|
if self.intermediate_tensors is None:
|
||||||
|
max_actual_tokens = self.max_num_tokens
|
||||||
|
if enable_sp():
|
||||||
|
max_actual_tokens = (self.max_num_tokens + tp_size -
|
||||||
|
1) // tp_size
|
||||||
self.intermediate_tensors = (
|
self.intermediate_tensors = (
|
||||||
self.model.make_empty_intermediate_tensors(
|
self.model.make_empty_intermediate_tensors(
|
||||||
batch_size=actual_tokens,
|
batch_size=max_actual_tokens,
|
||||||
dtype=self.dtype,
|
dtype=self.dtype,
|
||||||
device=self.device))
|
device=self.device))
|
||||||
intermediate_tensors = IntermediateTensors({
|
intermediate_tensors = IntermediateTensors({
|
||||||
k:
|
k:
|
||||||
v[:num_tokens_padded]
|
v[:intermediate_tokens]
|
||||||
for k, v in self.intermediate_tensors.items()
|
for k, v in self.intermediate_tensors.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user