From 0305c5053feeb108e27368aa9d1e5bc4fb93e886 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan
Date: Sun, 3 Aug 2025 02:03:16 -0500
Subject: [PATCH] Reduce memory accumulation in long-running server (#8306)

Co-authored-by: Liangsheng Yin
---
 python/sglang/global_config.py          | 6 +++++-
 python/sglang/srt/managers/scheduler.py | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py
index c331e64db..f006bd94c 100644
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -30,7 +30,11 @@ class GlobalConfig:
         self.default_new_token_ratio_decay_steps = float(
             os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
         )
-
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
         # Runtime constants: others
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index d2298cf38..2a0b139f6 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2362,11 +2362,19 @@ class IdleSleeper:
 
     def __init__(self, sockets):
         self.poller = zmq.Poller()
+        self.last_empty_time = time.time()
         for s in sockets:
             self.poller.register(s, zmq.POLLIN)
 
     def maybe_sleep(self):
         self.poller.poll(1000)
+        if (
+            global_config.torch_empty_cache_interval > 0
+            and time.time() - self.last_empty_time
+            > global_config.torch_empty_cache_interval
+        ):
+            self.last_empty_time = time.time()
+            torch.cuda.empty_cache()
 
 
 def is_health_check_generate_req(recv_req):