Fix memory leak when aborting decode request in PD-Disagg (#9817)

Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
2025-08-30 14:36:03 +08:00
parent 8abe8deae6
commit 836873b99f
2 changed files with 4 additions and 13 deletions
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2378,6 +2378,10 @@ class Scheduler(
            # We still need to send something back to TokenizerManager to clean up the state.
            req = self.waiting_queue.pop(i)
            self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
+            # In disaggregation decode mode, a request in the waiting queue already has KV cache allocated, so hand it back to the tree cache to avoid leaking it.
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.tree_cache.cache_finished_req(req)
+
            logger.debug(f"Abort queued request. {req.rid=}")

        # Delete the requests in the grammar queue