From b390e0ef78afb31d5ee26fdd26e4f33e89d645b3 Mon Sep 17 00:00:00 2001
From: Jingchun Gao <63247409+gjc0824@users.noreply.github.com>
Date: Mon, 26 Jan 2026 16:53:07 +0800
Subject: [PATCH] [Bugfix] Fix PP+PCP and PP+flashcomm1 bugs (#5416)

- Fixed the computing of final hidden_states when enabling pipeline
parallel and prefill context parallel at the same time. Only in the last
PP rank, hidden_states are required and have right tensor type.
- Fixed the shape of intermediate_tensors in the dummy_run when enabling
pipeline parallel and flashcomm1. The intermediate_tensors should be
divided by tp_size. Otherwise, the moe will raise issues.
- Fixed the shape of self.intermediate_tensors for sufficient slice
space

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/81786c87748b0177111dfdc07af5351d8389baa1

---------

Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com>
---
 vllm_ascend/worker/model_runner_v1.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 6d823721..c63717c1 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1033,8 +1033,10 @@ class NPUModelRunner(GPUModelRunner):
                 hidden_states, IntermediateTensors):
             hidden_states = self._all_gather_hidden_states_and_aux(
                 hidden_states)
-        return hidden_states if self.pcp_size == 1 else self.pcp_manager.get_restore_hidden_states(
-            hidden_states)
+        if self.pcp_size > 1 and get_pp_group().is_last_rank:
+            hidden_states = self.pcp_manager.get_restore_hidden_states(
+                hidden_states)
+        return hidden_states
 
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
@@ -2071,19 +2073,24 @@ class NPUModelRunner(GPUModelRunner):
             else:
                 # When PP and flashcomm1 are enabled, during dummy_run the estimated space should divide num_tokens by tp_size;
                 # otherwise, on non-first PP ranks it would effectively perform an extra all-gather, leading to incorrect memory estimation and potentially causing OOM.
-                actual_tokens = num_tokens
+                intermediate_tokens = num_tokens_padded
                 if enable_sp():
                     tp_size = get_tensor_model_parallel_world_size()
-                    actual_tokens = num_tokens // tp_size
+                    intermediate_tokens = (num_tokens_padded + tp_size -
+                                           1) // tp_size
                 if self.intermediate_tensors is None:
+                    max_actual_tokens = self.max_num_tokens
+                    if enable_sp():
+                        max_actual_tokens = (self.max_num_tokens + tp_size -
+                                             1) // tp_size
                     self.intermediate_tensors = (
                         self.model.make_empty_intermediate_tensors(
-                            batch_size=actual_tokens,
+                            batch_size=max_actual_tokens,
                             dtype=self.dtype,
                             device=self.device))
                 intermediate_tensors = IntermediateTensors({
                     k:
-                    v[:num_tokens_padded]
+                    v[:intermediate_tokens]
                     for k, v in self.intermediate_tensors.items()
                 })