diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d67632b2..b6adeba0 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -192,6 +192,7 @@ class NPUModelRunner(GPUModelRunner):
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
@@ -2194,7 +2195,14 @@ class NPUModelRunner(GPUModelRunner):
         self._dummy_run(mc2_tokens_capacity,
                         with_prefill=True,
                         is_profile=True)
+        origin_max_num_tokens = self.max_num_tokens
+        # In the PCP scenario, the split sequence must be used for the profile run.
+        # TODO: once the vLLM PCP feature is launched, upstream this logic to the community.
+        if self.pcp_size > 1:
+            self.max_num_tokens = math.ceil(self.max_num_tokens /
+                                            (self.pcp_size * 2)) * 2
         super().profile_run()
+        self.max_num_tokens = origin_max_num_tokens

     def eplb_warmup(self):
         if self.dynamic_eplb and not self.is_eplb_warmuped: