From 836873b99f0000ca04d5bdefbef2b4a1235025b8 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 30 Aug 2025 14:36:03 +0800 Subject: [PATCH] Fix memory leak when aborting decode request in PD-Disagg (#9817) Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com> --- python/sglang/srt/disaggregation/launch_lb.py | 13 ------------- python/sglang/srt/managers/scheduler.py | 4 ++++ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/python/sglang/srt/disaggregation/launch_lb.py b/python/sglang/srt/disaggregation/launch_lb.py index faa52f873..eb0be6573 100644 --- a/python/sglang/srt/disaggregation/launch_lb.py +++ b/python/sglang/srt/disaggregation/launch_lb.py @@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run @dataclasses.dataclass class LBArgs: - rust_lb: bool = False host: str = "0.0.0.0" port: int = 8000 policy: str = "random" @@ -17,11 +16,6 @@ class LBArgs: @staticmethod def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument( - "--rust-lb", - action="store_true", - help="Deprecated, please use SGLang Router instead, this argument will have no effect.", - ) parser.add_argument( "--host", type=str, @@ -92,7 +86,6 @@ class LBArgs: ] return cls( - rust_lb=args.rust_lb, host=args.host, port=args.port, policy=args.policy, @@ -102,12 +95,6 @@ class LBArgs: timeout=args.timeout, ) - def __post_init__(self): - if not self.rust_lb: - assert ( - self.policy == "random" - ), "Only random policy is supported for Python load balancer" - def main(): parser = argparse.ArgumentParser( diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 54028ce65..f7de3275e 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -2378,6 +2378,10 @@ class Scheduler( # We still need to send something back to TokenizerManager to clean up the state. req = self.waiting_queue.pop(i) self.send_to_tokenizer.send_pyobj(AbortReq(req.rid)) + # For disaggregation decode mode, the request in the waiting queue has KV cache allocated. + if self.disaggregation_mode == DisaggregationMode.DECODE: + self.tree_cache.cache_finished_req(req) + logger.debug(f"Abort queued request. {req.rid=}") # Delete the requests in the grammar queue