[v0.11.0][BugFix][P/D] Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache (#3686)
### What this PR does / why we need it?

Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache

Signed-off-by: underfituu <hzhucong@163.com>
This commit is contained in:
@@ -565,7 +565,7 @@ async def _handle_completions(api: str, request: Request):
|
|||||||
chunk_json = json.loads(chunk_str)
|
chunk_json = json.loads(chunk_str)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# if chunk is [done], skip it.
|
# if chunk is [done], skip it.
|
||||||
logger.warning(
|
logger.debug(
|
||||||
f"Skipping chunk: {chunk_str}")
|
f"Skipping chunk: {chunk_str}")
|
||||||
yield chunk
|
yield chunk
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -548,7 +548,7 @@ async def _handle_completions(api: str, request: Request):
|
|||||||
chunk_json = json.loads(chunk_str)
|
chunk_json = json.loads(chunk_str)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# if chunk is [done], skip it.
|
# if chunk is [done], skip it.
|
||||||
logger.warning(
|
logger.debug(
|
||||||
f"Skipping chunk: {chunk_str}")
|
f"Skipping chunk: {chunk_str}")
|
||||||
yield chunk
|
yield chunk
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -361,7 +361,7 @@ class RecomputeScheduler(SchedulerInterface):
|
|||||||
skipped_waiting_requests = create_request_queue(self.policy)
|
skipped_waiting_requests = create_request_queue(self.policy)
|
||||||
|
|
||||||
# Next, schedule the WAITING requests.
|
# Next, schedule the WAITING requests.
|
||||||
if not preempted_reqs:
|
if not preempted_reqs and not recomputed_reqs:
|
||||||
while self.waiting and token_budget > 0:
|
while self.waiting and token_budget > 0:
|
||||||
if len(self.running) == self.max_num_running_reqs:
|
if len(self.running) == self.max_num_running_reqs:
|
||||||
break
|
break
|
||||||
|
|||||||
Reference in New Issue
Block a user