From 292cf339c323cc89e266e5ce8ac6f47cdd1a631c Mon Sep 17 00:00:00 2001 From: hucong <33891520+underfituu@users.noreply.github.com> Date: Sat, 25 Oct 2025 09:14:20 +0800 Subject: [PATCH] [BugFix][P/D] Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache (#3641) ### What this PR does / why we need it? Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/17c540a993af88204ad1b78345c8a865cf58ce44 Signed-off-by: underfituu --- .../load_balance_proxy_layerwise_server_example.py | 2 +- .../load_balance_proxy_server_example.py | 2 +- vllm_ascend/core/recompute_scheduler.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py index 1336e5a3..c6001553 100644 --- a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py +++ b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py @@ -565,7 +565,7 @@ async def _handle_completions(api: str, request: Request): chunk_json = json.loads(chunk_str) except json.JSONDecodeError: # if chunk is [done], skip it. - logger.warning( + logger.debug( f"Skipping chunk: {chunk_str}") yield chunk continue diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py index 0e28deb8..880ed69e 100644 --- a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py +++ b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py @@ -548,7 +548,7 @@ async def _handle_completions(api: str, request: Request): chunk_json = json.loads(chunk_str) except json.JSONDecodeError: # if chunk is [done], skip it. - logger.warning( + logger.debug( f"Skipping chunk: {chunk_str}") yield chunk continue diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index 0240660a..14a5d273 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -362,7 +362,7 @@ class RecomputeScheduler(SchedulerInterface): skipped_waiting_requests = create_request_queue(self.policy) # Next, schedule the WAITING requests. - if not preempted_reqs: + if not preempted_reqs and not recomputed_reqs: while self.waiting and token_budget > 0: if len(self.running) == self.max_num_running_reqs: break