From 836873b99f0000ca04d5bdefbef2b4a1235025b8 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <hnyls2002@gmail.com>
Date: Sat, 30 Aug 2025 14:36:03 +0800
Subject: [PATCH] Fix memory leak when aborting decode request in PD-Disagg
 (#9817)

Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
---
 python/sglang/srt/disaggregation/launch_lb.py | 13 -------------
 python/sglang/srt/managers/scheduler.py       |  4 ++++
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/disaggregation/launch_lb.py b/python/sglang/srt/disaggregation/launch_lb.py
index faa52f873..eb0be6573 100644
--- a/python/sglang/srt/disaggregation/launch_lb.py
+++ b/python/sglang/srt/disaggregation/launch_lb.py
@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
 
 @dataclasses.dataclass
 class LBArgs:
-    rust_lb: bool = False
     host: str = "0.0.0.0"
     port: int = 8000
     policy: str = "random"
@@ -17,11 +16,6 @@ class LBArgs:
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
-        parser.add_argument(
-            "--rust-lb",
-            action="store_true",
-            help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
-        )
         parser.add_argument(
             "--host",
             type=str,
@@ -92,7 +86,6 @@ class LBArgs:
         ]
 
         return cls(
-            rust_lb=args.rust_lb,
             host=args.host,
             port=args.port,
             policy=args.policy,
@@ -102,12 +95,6 @@ class LBArgs:
             timeout=args.timeout,
         )
 
-    def __post_init__(self):
-        if not self.rust_lb:
-            assert (
-                self.policy == "random"
-            ), "Only random policy is supported for Python load balancer"
-
 
 def main():
     parser = argparse.ArgumentParser(
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 54028ce65..f7de3275e 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2378,6 +2378,10 @@ class Scheduler(
             # We still need to send something back to TokenizerManager to clean up the state.
             req = self.waiting_queue.pop(i)
             self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
+            # In disaggregation decode mode, a queued request already holds allocated KV cache; hand it to the tree cache so it is not leaked on abort.
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.tree_cache.cache_finished_req(req)
+
             logger.debug(f"Abort queued request. {req.rid=}")
 
         # Delete the requests in the grammar queue