[Bugfix] fix pcp 128K break (#5266)
### What this PR does / why we need it?
[Bugfix] Fixes an issue where a 128K context breaks in long-sequence
scenarios. The root cause is that `num_tokens` is not split by
`pcp_size` during `profile_run`.
During `profile_run`, a warm-up is performed based on
`self.max_num_tokens`. When PCP is enabled, each PCP group schedules at
most `self.max_num_tokens / pcp_size`, so the warm-up must use the split
size; once `profile_run` completes, the original scheduling size is
restored.
This is a temporary workaround; once
https://github.com/vllm-project/vllm/pull/28988/files is implemented,
this logic can be removed.
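As a quick sanity check of the rounding, here is the per-group budget for a hypothetical 128K token budget split across 4 PCP ranks (the values are illustrative, not taken from the PR):

```python
import math

max_num_tokens = 131072  # hypothetical 128K token budget
pcp_size = 4             # hypothetical number of PCP ranks

# ceil(... / 2) * 2 keeps the per-group budget even while splitting by pcp_size.
per_group = math.ceil(max_num_tokens / (pcp_size * 2)) * 2
print(per_group)  # 32768 == 131072 / 4
```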
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
```diff
@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
         self._dummy_run(mc2_tokens_capacity,
                         with_prefill=True,
                         is_profile=True)
+        origin_max_num_tokens = self.max_num_tokens
+        # In the PCP scenario, the profile run must use the split sequence length.
+        # TODO: once the vLLM PCP feature lands upstream, move this logic to the community repo.
+        if self.pcp_size > 1:
+            self.max_num_tokens = math.ceil(self.max_num_tokens /
+                                            (self.pcp_size * 2)) * 2
         super().profile_run()
+        self.max_num_tokens = origin_max_num_tokens
 
     def eplb_warmup(self):
         if self.dynamic_eplb and not self.is_eplb_warmuped:
```
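For readers without the surrounding class, here is a self-contained toy version of the save/split/restore pattern in the hunk above (the class names and the stand-in base are hypothetical; the real logic lives in `NPUModelRunner.profile_run`):

```python
import math

class _BaseRunner:
    """Hypothetical stand-in for GPUModelRunner's profile_run."""
    def profile_run(self):
        print(f"warm-up with max_num_tokens={self.max_num_tokens}")

class _PcpRunner(_BaseRunner):
    """Toy sketch of the save/split/restore pattern from the patch."""
    def __init__(self, max_num_tokens, pcp_size):
        self.max_num_tokens = max_num_tokens
        self.pcp_size = pcp_size

    def profile_run(self):
        origin_max_num_tokens = self.max_num_tokens
        if self.pcp_size > 1:
            # Each PCP group schedules at most max_num_tokens / pcp_size;
            # rounding via ceil(... / 2) * 2 keeps the budget even.
            self.max_num_tokens = math.ceil(self.max_num_tokens /
                                            (self.pcp_size * 2)) * 2
        super().profile_run()
        # Restore the original scheduling size after the warm-up.
        self.max_num_tokens = origin_max_num_tokens

_PcpRunner(131072, 4).profile_run()  # warm-up with max_num_tokens=32768
```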