Revert "drop ascend scheduler" (#4580)

Reverts vllm-project/vllm-ascend#4498
- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2
This commit is contained in:
Mengqing Cao
2025-11-29 22:20:48 +08:00
committed by GitHub
parent 4dbe4fd123
commit 517fd9272d
52 changed files with 2948 additions and 85 deletions

View File

@@ -330,6 +330,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Ascend-specific configurations
self.ascend_config = get_ascend_config()
if self.ascend_config.ascend_scheduler_config.enabled:
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
else:
self.chunked_prefill_enabled = True
self.weight_prefetch_method = WeightPrefetchMethod(
self.ascend_config.weight_prefetch_config)
# Dump / PrecisionDebugger configuration now comes from AscendConfig
@@ -1938,6 +1942,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
num_valid_tokens):
ascend_config = get_ascend_config()
if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
attn_state = AscendAttentionState.PrefillNoCache
# Assume this is the decode stage: a prefill occurs, but only one token misses the cache.
@@ -1954,7 +1959,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else:
attn_state = AscendAttentionState.ChunkedPrefill
# splitfuse
elif self.scheduler_config.enable_chunked_prefill:
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
attn_state = AscendAttentionState.ChunkedPrefill
else:
attn_state = AscendAttentionState.PrefillCacheHit