[BugFix][P/D] Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache (#3641)
### What this PR does / why we need it?
Modify the recalculation logic to prevent waiting requests from filling
up the D node KVCache.
- vLLM version: v0.11.0rc3
- vLLM main: 17c540a993
Signed-off-by: underfituu <hzhucong@163.com>
This commit is contained in:
@@ -565,7 +565,7 @@ async def _handle_completions(api: str, request: Request):
|
||||
chunk_json = json.loads(chunk_str)
|
||||
except json.JSONDecodeError:
|
||||
# if chunk is [done], skip it.
|
||||
logger.warning(
|
||||
logger.debug(
|
||||
f"Skipping chunk: {chunk_str}")
|
||||
yield chunk
|
||||
continue
|
||||
|
||||
@@ -548,7 +548,7 @@ async def _handle_completions(api: str, request: Request):
|
||||
chunk_json = json.loads(chunk_str)
|
||||
except json.JSONDecodeError:
|
||||
# if chunk is [done], skip it.
|
||||
logger.warning(
|
||||
logger.debug(
|
||||
f"Skipping chunk: {chunk_str}")
|
||||
yield chunk
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user