[BugFix][P/D] Modify the recalculation logic to prevent waiting requests from filling up the D node KVCache (#3641)

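### What this PR does / why we need it?

Modify the recalculation (recompute) logic to prevent waiting requests from filling up the D node KVCache: the scheduler now schedules WAITING requests only when there are neither preempted requests nor recomputed requests. The "Skipping chunk" message in the two example load-balance proxy servers is also lowered from warning to debug level.

- vLLM version: v0.11.0rc3
- vLLM main: 17c540a993

Signed-off-by: underfituu <hzhucong@163.com>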
2025-10-25 09:14:20 +08:00
parent 39b994a987
commit 292cf339c3
3 changed files with 3 additions and 3 deletions
--- a/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py
+++ b/examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py
@@ -565,7 +565,7 @@ async def _handle_completions(api: str, request: Request):
                            chunk_json = json.loads(chunk_str)
                        except json.JSONDecodeError:
                            # if chunk is [done], skip it.
-                            logger.warning(
+                            logger.debug(
                                f"Skipping chunk: {chunk_str}")
                            yield chunk
                            continue
--- a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py
+++ b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py
@@ -548,7 +548,7 @@ async def _handle_completions(api: str, request: Request):
                            chunk_json = json.loads(chunk_str)
                        except json.JSONDecodeError:
                            # if chunk is [done], skip it.
-                            logger.warning(
+                            logger.debug(
                                f"Skipping chunk: {chunk_str}")
                            yield chunk
                            continue
--- a/vllm_ascend/core/recompute_scheduler.py
+++ b/vllm_ascend/core/recompute_scheduler.py
@@ -362,7 +362,7 @@ class RecomputeScheduler(SchedulerInterface):
        skipped_waiting_requests = create_request_queue(self.policy)
        # Next, schedule the WAITING requests.
-        if not preempted_reqs:
+        if not preempted_reqs and not recomputed_reqs:
            while self.waiting and token_budget > 0:
                if len(self.running) == self.max_num_running_reqs:
                    break