Drop ascend scheduler (#4623)
It is safe to drop the Ascend scheduler now; the related tests and docs have already been removed.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -332,10 +332,6 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
 
         # Ascend-specific configurations
         self.ascend_config = get_ascend_config()
-        if self.ascend_config.ascend_scheduler_config.enabled:
-            self.chunked_prefill_enabled = self.scheduler_config.enable_chunked_prefill
-        else:
-            self.chunked_prefill_enabled = True
         self.weight_prefetch_method = WeightPrefetchMethod(
             self.ascend_config.weight_prefetch_config)
         # Dump / PrecisionDebugger configuration now comes from AscendConfig
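
For context, the removed branch forced chunked prefill on whenever the Ascend scheduler was disabled, and cached the result on the runner. A minimal before/after sketch of that initialization, using only names visible in the hunk above (the surrounding __init__ context is assumed):

    # Before: chunked prefill was gated on the Ascend scheduler config,
    # and forced on whenever that scheduler was disabled.
    if ascend_config.ascend_scheduler_config.enabled:
        chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
    else:
        chunked_prefill_enabled = True

    # After: with the Ascend scheduler gone, callers read vLLM's scheduler
    # config directly, so no cached self.chunked_prefill_enabled is needed.
    chunked_prefill_enabled = scheduler_config.enable_chunked_prefill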
@@ -1932,7 +1928,6 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
 
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
-        ascend_config = get_ascend_config()
         if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
             attn_state = AscendAttentionState.PrefillNoCache
         # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
@@ -1949,7 +1944,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
             else:
                 attn_state = AscendAttentionState.ChunkedPrefill
         # splitfuse
-        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
+        elif self.scheduler_config.enable_chunked_prefill:
             attn_state = AscendAttentionState.ChunkedPrefill
         else:
             attn_state = AscendAttentionState.PrefillCacheHit
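
Pulling the visible pieces together, attention-state selection now keys off vLLM's own scheduler config rather than the removed Ascend scheduler gate. A self-contained sketch of the branch order shown in these hunks (the enum stand-in and function signature are simplifications; the decode-stage branches elided by the diff are omitted):

    import numpy as np

    class AscendAttentionState:  # stand-in for the real enum, illustration only
        PrefillNoCache = "PrefillNoCache"
        ChunkedPrefill = "ChunkedPrefill"
        PrefillCacheHit = "PrefillCacheHit"

    def build_attn_state(seq_lens_np, num_scheduled_tokens, enable_chunked_prefill):
        if np.array_equal(seq_lens_np, num_scheduled_tokens):
            # Every scheduled token is new: pure prefill, nothing hits the cache.
            return AscendAttentionState.PrefillNoCache
        if enable_chunked_prefill:
            # splitfuse: follows self.scheduler_config.enable_chunked_prefill
            # directly after this change.
            return AscendAttentionState.ChunkedPrefill
        return AscendAttentionState.PrefillCacheHit

    # Example: part of each sequence is still unscheduled, chunked prefill on.
    print(build_attn_state(np.array([8, 8]), np.array([4, 4]), True))  # ChunkedPrefill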