[Bugfix] Fix PP+PCP and PP+flashcomm1 bugs (#5416)
- Fixed the computation of the final hidden_states when pipeline
parallel (PP) and prefill context parallel (PCP) are enabled at the same
time. The hidden_states are only required — and only have the correct
tensor type — on the last PP rank.
- Fixed the shape of intermediate_tensors in dummy_run when pipeline
parallel and flashcomm1 are enabled. The intermediate token count should be
divided by tp_size; otherwise, the MoE layer will raise errors.
- Fixed the shape of self.intermediate_tensors so that it provides
sufficient slice space.
- vLLM version: release/v0.13.0
- vLLM main:
81786c8774
---------
Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
This commit is contained in:
@@ -1033,8 +1033,10 @@ class NPUModelRunner(GPUModelRunner):
                 hidden_states, IntermediateTensors):
             hidden_states = self._all_gather_hidden_states_and_aux(
                 hidden_states)
-        return hidden_states if self.pcp_size == 1 else self.pcp_manager.get_restore_hidden_states(
-            hidden_states)
+        if self.pcp_size > 1 and get_pp_group().is_last_rank:
+            hidden_states = self.pcp_manager.get_restore_hidden_states(
+                hidden_states)
+        return hidden_states

     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
@@ -2071,19 +2073,24 @@ class NPUModelRunner(GPUModelRunner):
         else:
+            # When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
+            # otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
             actual_tokens = num_tokens
+            intermediate_tokens = num_tokens_padded
             if enable_sp():
                 tp_size = get_tensor_model_parallel_world_size()
                 actual_tokens = num_tokens // tp_size
+                intermediate_tokens = (num_tokens_padded + tp_size -
+                                       1) // tp_size
             if self.intermediate_tensors is None:
+                max_actual_tokens = self.max_num_tokens
+                if enable_sp():
+                    max_actual_tokens = (self.max_num_tokens + tp_size -
+                                         1) // tp_size
                 self.intermediate_tensors = (
                     self.model.make_empty_intermediate_tensors(
-                        batch_size=actual_tokens,
+                        batch_size=max_actual_tokens,
                         dtype=self.dtype,
                         device=self.device))
             intermediate_tensors = IntermediateTensors({
                 k:
-                v[:num_tokens_padded]
+                v[:intermediate_tokens]
                 for k, v in self.intermediate_tensors.items()
             })

Reference in New Issue
Block a user