[Bugfix] fix pcp 128K break (#5266)

### What this PR does / why we need it?
[Bugfix] Fix the issue where a 128K context does not work in
long-sequence scenarios.

This issue is caused by not splitting `num_tokens` according to `pcp_size`
during `profile_run`.
During `profile_run`, a warm-up is performed based on
`self.max_num_tokens`. When PCP is enabled, each PCP group will only
schedule up to `self.max_num_tokens / pcp_size`. After `profile_run` is
completed, the original scheduling size needs to be restored.

This is a temporary workaround; once
https://github.com/vllm-project/vllm/pull/28988/files is implemented,
this part can be removed.

- vLLM version: release/v0.13.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
weiguihua2
2025-12-25 11:58:52 +08:00
committed by GitHub
parent 8caad0510d
commit d752c030e9

View File

@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
def __init__(self, vllm_config: VllmConfig, device: torch.device): def __init__(self, vllm_config: VllmConfig, device: torch.device):
with _torch_cuda_wrapper(): with _torch_cuda_wrapper():
super().__init__(vllm_config, device) super().__init__(vllm_config, device)
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
self.max_num_reqs = self.scheduler_config.max_num_seqs self.max_num_reqs = self.scheduler_config.max_num_seqs
self.dp_size = vllm_config.parallel_config.data_parallel_size self.dp_size = vllm_config.parallel_config.data_parallel_size
self.dp_rank = vllm_config.parallel_config.data_parallel_rank self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
self._dummy_run(mc2_tokens_capacity, self._dummy_run(mc2_tokens_capacity,
with_prefill=True, with_prefill=True,
is_profile=True) is_profile=True)
origin_max_num_tokens = self.max_num_tokens
# in the pcp scenario, the split sequence needs to be used for profile run
# TODO: after the vllm pcp function is launched, this logic needs to be brought up to the community
if self.pcp_size > 1:
self.max_num_tokens = math.ceil(self.max_num_tokens /
(self.pcp_size * 2)) * 2
super().profile_run() super().profile_run()
self.max_num_tokens = origin_max_num_tokens
def eplb_warmup(self): def eplb_warmup(self):
if self.dynamic_eplb and not self.is_eplb_warmuped: if self.dynamic_eplb and not self.is_eplb_warmuped: