[Bugfix] fix pcp 128K break (#5266)
### What this PR does / why we need it?
[Bugfix] Fixes an issue where a 128K context breaks in long-sequence
scenarios. The root cause is that `num_tokens` is not split by
`pcp_size` during `profile_run`.
During `profile_run`, a warm-up is performed based on
`self.max_num_tokens`. When PCP is enabled, each PCP group schedules at
most `self.max_num_tokens / pcp_size`, so the warm-up must use the split
size; once `profile_run` completes, the original scheduling size is
restored.
This is a temporary workaround; once
https://github.com/vllm-project/vllm/pull/28988/files is implemented,
this logic can be removed.
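As a quick sanity check of the rounding, here is the per-group budget for a hypothetical 128K token budget split across 4 PCP ranks (the values are illustrative, not taken from the PR):

```python
import math

max_num_tokens = 131072  # hypothetical 128K token budget
pcp_size = 4             # hypothetical number of PCP ranks

# ceil(... / 2) * 2 keeps the per-group budget even while splitting by pcp_size.
per_group = math.ceil(max_num_tokens / (pcp_size * 2)) * 2
print(per_group)  # 32768 == 131072 / 4
```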
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
```diff
@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
         self._dummy_run(mc2_tokens_capacity,
                         with_prefill=True,
                         is_profile=True)
+        origin_max_num_tokens = self.max_num_tokens
+        # In the PCP scenario, the profile run must use the split sequence length.
+        # TODO: once the vLLM PCP feature lands upstream, move this logic to the community repo.
+        if self.pcp_size > 1:
+            self.max_num_tokens = math.ceil(self.max_num_tokens /
+                                            (self.pcp_size * 2)) * 2
         super().profile_run()
+        self.max_num_tokens = origin_max_num_tokens
 
     def eplb_warmup(self):
         if self.dynamic_eplb and not self.is_eplb_warmuped:
```
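For readers without the surrounding class, here is a self-contained toy version of the save/split/restore pattern in the hunk above (the class names and the stand-in base are hypothetical; the real logic lives in `NPUModelRunner.profile_run`):

```python
import math

class _BaseRunner:
    """Hypothetical stand-in for GPUModelRunner's profile_run."""
    def profile_run(self):
        print(f"warm-up with max_num_tokens={self.max_num_tokens}")

class _PcpRunner(_BaseRunner):
    """Toy sketch of the save/split/restore pattern from the patch."""
    def __init__(self, max_num_tokens, pcp_size):
        self.max_num_tokens = max_num_tokens
        self.pcp_size = pcp_size

    def profile_run(self):
        origin_max_num_tokens = self.max_num_tokens
        if self.pcp_size > 1:
            # Each PCP group schedules at most max_num_tokens / pcp_size;
            # rounding via ceil(... / 2) * 2 keeps the budget even.
            self.max_num_tokens = math.ceil(self.max_num_tokens /
                                            (self.pcp_size * 2)) * 2
        super().profile_run()
        # Restore the original scheduling size after the warm-up.
        self.max_num_tokens = origin_max_num_tokens

_PcpRunner(131072, 4).profile_run()  # warm-up with max_num_tokens=32768
```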