[BugFix] Fix chunked prefill bugs in engine v1 (#844)
### What this PR does / why we need it? Fix the bugs that occur when running the DeepSeek model in engine v1. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI passed with the newly added and existing tests. --------- Signed-off-by: rjg-lyh <1318825571@qq.com>
This commit is contained in:
@@ -204,6 +204,9 @@ class NPUPlatform(Platform):
|
|||||||
"ascend_scheduler_config", None) is not None:
|
"ascend_scheduler_config", None) is not None:
|
||||||
additional_scheduler_config = additional_config.get(
|
additional_scheduler_config = additional_config.get(
|
||||||
"ascend_scheduler_config")
|
"ascend_scheduler_config")
|
||||||
|
if vllm_config.scheduler_config.enable_chunked_prefill:
|
||||||
|
additional_scheduler_config[
|
||||||
|
"enable_chunked_prefill"] = True
|
||||||
from vllm_ascend.core.schedule_config import \
|
from vllm_ascend.core.schedule_config import \
|
||||||
AscendSchedulerConfig
|
AscendSchedulerConfig
|
||||||
ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
|
ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
|
||||||
|
|||||||
@@ -120,6 +120,13 @@ class NPUModelRunner:
|
|||||||
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||||
self.max_num_reqs = self.scheduler_config.max_num_seqs
|
self.max_num_reqs = self.scheduler_config.max_num_seqs
|
||||||
|
|
||||||
|
additional_config = vllm_config.additional_config
|
||||||
|
if additional_config and additional_config.get(
|
||||||
|
"ascend_scheduler_config", None) is not None:
|
||||||
|
self.use_v0_scheduler = True
|
||||||
|
else:
|
||||||
|
self.use_v0_scheduler = False
|
||||||
|
|
||||||
self.graph_block_tables = np.zeros(
|
self.graph_block_tables = np.zeros(
|
||||||
(self.vllm_config.scheduler_config.max_num_seqs,
|
(self.vllm_config.scheduler_config.max_num_seqs,
|
||||||
(self.model_config.max_model_len + self.block_size - 1) //
|
(self.model_config.max_model_len + self.block_size - 1) //
|
||||||
@@ -545,13 +552,14 @@ class NPUModelRunner:
|
|||||||
block_offsets,
|
block_offsets,
|
||||||
out=self.slot_mapping_np[:total_num_scheduled_tokens])
|
out=self.slot_mapping_np[:total_num_scheduled_tokens])
|
||||||
|
|
||||||
if self.chunked_prefill_enabled:
|
if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
|
||||||
attn_state = AscendAttentionState.ChunkedPrefill
|
|
||||||
elif np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
|
|
||||||
attn_state = AscendAttentionState.PrefillNoCache
|
attn_state = AscendAttentionState.PrefillNoCache
|
||||||
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
|
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
|
||||||
elif np.all(num_scheduled_tokens == 1):
|
elif np.all(num_scheduled_tokens == 1):
|
||||||
attn_state = AscendAttentionState.DecodeOnly
|
attn_state = AscendAttentionState.DecodeOnly
|
||||||
|
# splitfuse
|
||||||
|
elif not self.use_v0_scheduler or self.chunked_prefill_enabled:
|
||||||
|
attn_state = AscendAttentionState.ChunkedPrefill
|
||||||
else:
|
else:
|
||||||
attn_state = AscendAttentionState.PrefillCacheHit
|
attn_state = AscendAttentionState.PrefillCacheHit
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user