Revert "drop ascend scheduler" (#4580)
Reverts vllm-project/vllm-ascend#4498.
- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
This commit is contained in:
@@ -330,6 +330,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# Ascend-specific configurations
|
||||
self.ascend_config = get_ascend_config()
|
||||
if self.ascend_config.ascend_scheduler_config.enabled:
|
||||
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
|
||||
else:
|
||||
self.chunked_prefill_enabled = True
|
||||
self.weight_prefetch_method = WeightPrefetchMethod(
|
||||
self.ascend_config.weight_prefetch_config)
|
||||
# Dump / PrecisionDebugger configuration now comes from AscendConfig
|
||||
@@ -1938,6 +1942,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
|
||||
num_valid_tokens):
|
||||
ascend_config = get_ascend_config()
|
||||
if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
|
||||
attn_state = AscendAttentionState.PrefillNoCache
|
||||
# We assume this is the decode stage: a prefill occurs, but only one token misses the cache.
|
||||
@@ -1954,7 +1959,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
attn_state = AscendAttentionState.ChunkedPrefill
|
||||
# splitfuse
|
||||
elif self.scheduler_config.enable_chunked_prefill:
|
||||
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
|
||||
attn_state = AscendAttentionState.ChunkedPrefill
|
||||
else:
|
||||
attn_state = AscendAttentionState.PrefillCacheHit
|
||||
|
||||
Reference in New Issue
Block a user