From d752c030e9e94621169237e4ed74fab81b72f415 Mon Sep 17 00:00:00 2001
From: weiguihua2
Date: Thu, 25 Dec 2025 11:58:52 +0800
Subject: [PATCH] [Bugfix] fix pcp 128K break (#5266)

### What this PR does / why we need it?

[Bugfix] Fix the issue where a 128K context does not work in long-sequence
scenarios. The issue is caused by not splitting the scheduled token count
(`self.max_num_tokens`) according to `pcp_size` during `profile_run`.

During `profile_run`, a warm-up is performed based on `self.max_num_tokens`.
When PCP is enabled, each PCP group only schedules up to
`self.max_num_tokens / pcp_size`, so the warm-up must use the split size.
After `profile_run` completes, the original scheduling size is restored.

This is a temporary workaround; once
https://github.com/vllm-project/vllm/pull/28988/files is implemented, this
part can be removed.

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: weiguihua2
---
 vllm_ascend/worker/model_runner_v1.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d67632b2..b6adeba0 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
             self._dummy_run(mc2_tokens_capacity,
                             with_prefill=True,
                             is_profile=True)
+        origin_max_num_tokens = self.max_num_tokens
+        # In the PCP scenario, profile_run must warm up with the split (per-group) token count.
+        # TODO: after vLLM's PCP support lands upstream, move this logic to the community implementation.
+        if self.pcp_size > 1:
+            self.max_num_tokens = math.ceil(self.max_num_tokens /
+                                            (self.pcp_size * 2)) * 2
         super().profile_run()
+        self.max_num_tokens = origin_max_num_tokens
 
     def eplb_warmup(self):
         if self.dynamic_eplb and not self.is_eplb_warmuped:
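
For reference, a minimal standalone sketch of the token-budget split that the second hunk applies during `profile_run`. The helper name `profile_token_budget` and the example numbers are illustrative only and are not part of this patch.

```python
import math


def profile_token_budget(max_num_tokens: int, pcp_size: int) -> int:
    """Per-PCP-group token budget for the profile_run warm-up (sketch).

    Mirrors the patched logic: when PCP is enabled, divide the global
    budget by pcp_size and round up to an even number of tokens.
    """
    if pcp_size <= 1:
        return max_num_tokens
    return math.ceil(max_num_tokens / (pcp_size * 2)) * 2


if __name__ == "__main__":
    # A 128K token budget split across 4 PCP ranks -> 32768 tokens per group.
    print(profile_token_budget(131072, 1))  # 131072 (PCP disabled, unchanged)
    print(profile_token_budget(131072, 4))  # 32768
    print(profile_token_budget(131070, 4))  # 32768 (rounded up to stay even)
```

As in the patch, the original `self.max_num_tokens` is restored after profiling so normal scheduling keeps the full budget.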