From d752c030e9e94621169237e4ed74fab81b72f415 Mon Sep 17 00:00:00 2001
From: weiguihua2
Date: Thu, 25 Dec 2025 11:58:52 +0800
Subject: [PATCH] [Bugfix] fix pcp 128K break (#5266)

### What this PR does / why we need it?

[Bugfix] Fix the issue where a 128K context does not work in long-sequence
scenarios. The issue is caused by not splitting the scheduled token count
(`self.max_num_tokens`) according to `pcp_size` during `profile_run`.

During `profile_run`, a warm-up is performed based on `self.max_num_tokens`.
When PCP is enabled, each PCP group only schedules up to
`self.max_num_tokens / pcp_size`, so the warm-up must use the split size.
After `profile_run` completes, the original scheduling size is restored.

This is a temporary workaround; once
https://github.com/vllm-project/vllm/pull/28988/files is implemented, this
part can be removed.

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: weiguihua2
---
 vllm_ascend/worker/model_runner_v1.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d67632b2..b6adeba0 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
             self._dummy_run(mc2_tokens_capacity,
                             with_prefill=True,
                             is_profile=True)
+        origin_max_num_tokens = self.max_num_tokens
+        # In the PCP scenario, profile_run must warm up with the split (per-group) token count.
+        # TODO: after vLLM's PCP support lands upstream, move this logic to the community implementation.
+        if self.pcp_size > 1:
+            self.max_num_tokens = math.ceil(self.max_num_tokens /
+                                            (self.pcp_size * 2)) * 2
         super().profile_run()
+        self.max_num_tokens = origin_max_num_tokens
 
     def eplb_warmup(self):
         if self.dynamic_eplb and not self.is_eplb_warmuped:
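
For reference, a minimal standalone sketch of the token-budget split that the second hunk applies during `profile_run`. The helper name `profile_token_budget` and the example numbers are illustrative only and are not part of this patch.

```python
import math


def profile_token_budget(max_num_tokens: int, pcp_size: int) -> int:
    """Per-PCP-group token budget for the profile_run warm-up (sketch).

    Mirrors the patched logic: when PCP is enabled, divide the global
    budget by pcp_size and round up to an even number of tokens.
    """
    if pcp_size <= 1:
        return max_num_tokens
    return math.ceil(max_num_tokens / (pcp_size * 2)) * 2


if __name__ == "__main__":
    # A 128K token budget split across 4 PCP ranks -> 32768 tokens per group.
    print(profile_token_budget(131072, 1))  # 131072 (PCP disabled, unchanged)
    print(profile_token_budget(131072, 4))  # 32768
    print(profile_token_budget(131070, 4))  # 32768 (rounded up to stay even)
```

As in the patch, the original `self.max_num_tokens` is restored after profiling so normal scheduling keeps the full budget.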