From c377e7393301f2ef7f7c92ec4397723b5fcc380b Mon Sep 17 00:00:00 2001 From: Qiu Date: Fri, 13 Mar 2026 10:27:23 +0800 Subject: [PATCH] Perf(PP): support PP with async scheduling. (#7136) ### What this PR does / why we need it? Following up on PR https://github.com/vllm-project/vllm/pull/32618, this PR provides async scheduling support for PP in vllm-ascend. --- - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d Signed-off-by: QiuChunshuo --- vllm_ascend/worker/model_runner_v1.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index cee31250..2c05ebbe 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1430,6 +1430,11 @@ class NPUModelRunner(GPUModelRunner): if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. + # Receive sampled token ids from the last PP rank when using + # async scheduling + pipeline parallelism so downstream code + # (e.g., PCP input preparation) can access them. + if self.use_async_scheduling and get_pp_group().world_size > 1: + self._pp_receive_prev_sampled_token_ids_to_input_batch() if not kv_connector_output: return None # noqa # In case of PP with kv transfer, we need to pass through the @@ -1564,6 +1569,14 @@ class NPUModelRunner(GPUModelRunner): global_stream().wait_event(self.sampling_done_event) self._update_states_after_model_execute(sampler_output.sampled_token_ids, scheduler_output) + # In async scheduling + PP, broadcast sampled token ids from the + # last PP rank so other PP ranks can receive them without going + # through the scheduler/engine IPC path. 
+ if self.use_async_scheduling: + pp = get_pp_group() + if pp.world_size > 1 and pp.is_last_rank: + self._pp_broadcast_prev_sampled_token_ids(sampler_output.sampled_token_ids) + if not self.use_async_scheduling: return model_runner_output return AsyncGPUModelRunnerOutput(