From b4d6672d018689430551f7ce2d115b4847bce239 Mon Sep 17 00:00:00 2001 From: rjg-lyh <83491835+rjg-lyh@users.noreply.github.com> Date: Thu, 22 May 2025 10:33:50 +0800 Subject: [PATCH] [BugFix] Fix chunked prefill bugs in engine v1 (#844) ### What this PR does / why we need it? Fix the bugs when running the deepseek model in engine v1. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI passed with newly added/existing tests. --------- Signed-off-by: rjg-lyh <1318825571@qq.com> --- vllm_ascend/platform.py | 3 +++ vllm_ascend/worker/model_runner_v1.py | 14 +++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 2d8834b..28cda89 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -204,6 +204,9 @@ class NPUPlatform(Platform): "ascend_scheduler_config", None) is not None: additional_scheduler_config = additional_config.get( "ascend_scheduler_config") + if vllm_config.scheduler_config.enable_chunked_prefill: + additional_scheduler_config[ + "enable_chunked_prefill"] = True from vllm_ascend.core.schedule_config import \ AscendSchedulerConfig ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 1803743..89c2348 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -120,6 +120,13 @@ class NPUModelRunner: self.max_num_tokens = self.scheduler_config.max_num_batched_tokens self.max_num_reqs = self.scheduler_config.max_num_seqs + additional_config = vllm_config.additional_config + if additional_config and additional_config.get( + "ascend_scheduler_config", None) is not None: + self.use_v0_scheduler = True + else: + self.use_v0_scheduler = False + self.graph_block_tables = np.zeros( (self.vllm_config.scheduler_config.max_num_seqs, (self.model_config.max_model_len + self.block_size - 1) // @@ -545,13 
+552,14 @@ class NPUModelRunner: block_offsets, out=self.slot_mapping_np[:total_num_scheduled_tokens]) - if self.chunked_prefill_enabled: - attn_state = AscendAttentionState.ChunkedPrefill - elif np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens): + if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens): attn_state = AscendAttentionState.PrefillNoCache # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache. elif np.all(num_scheduled_tokens == 1): attn_state = AscendAttentionState.DecodeOnly + # splitfuse + elif not self.use_v0_scheduler or self.chunked_prefill_enabled: + attn_state = AscendAttentionState.ChunkedPrefill else: attn_state = AscendAttentionState.PrefillCacheHit