[BugFix] Fix ascend scheduler bugs. (#822)
This PR fixes two bugs in AscendScheduler: 1. When running with high concurrency, the length of the running queue may exceed the max_num_seqs limit. 2. When some requests are preempted and recomputation is activated, the number of new tokens is computed incorrectly. Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -74,7 +74,7 @@ class AscendScheduler(Scheduler):
|
|||||||
|
|
||||||
# Schedule prefill requests first.
|
# Schedule prefill requests first.
|
||||||
while self.waiting and token_budget > 0:
|
while self.waiting and token_budget > 0:
|
||||||
if len(scheduled_new_reqs) == self.max_num_running_reqs:
|
if len(self.running) == self.max_num_running_reqs:
|
||||||
break
|
break
|
||||||
|
|
||||||
request = self.waiting[0]
|
request = self.waiting[0]
|
||||||
@@ -96,7 +96,7 @@ class AscendScheduler(Scheduler):
|
|||||||
# Get already-cached tokens.
|
# Get already-cached tokens.
|
||||||
computed_blocks, num_computed_tokens = (
|
computed_blocks, num_computed_tokens = (
|
||||||
self.kv_cache_manager.get_computed_blocks(request))
|
self.kv_cache_manager.get_computed_blocks(request))
|
||||||
num_new_tokens = request.num_prompt_tokens - num_computed_tokens
|
num_new_tokens = request.num_tokens - num_computed_tokens
|
||||||
if (0 < self.scheduler_config.long_prefill_token_threshold <
|
if (0 < self.scheduler_config.long_prefill_token_threshold <
|
||||||
num_new_tokens):
|
num_new_tokens):
|
||||||
num_new_tokens = (
|
num_new_tokens = (
|
||||||
|
|||||||
Reference in New Issue
Block a user