From 0305c5053feeb108e27368aa9d1e5bc4fb93e886 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan <wtan45@wisc.edu>
Date: Sun, 3 Aug 2025 02:03:16 -0500
Subject: [PATCH] Reduce memory accumulation in long-running server (#8306)

Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
---
 python/sglang/global_config.py          | 6 +++++-
 python/sglang/srt/managers/scheduler.py | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py
index c331e64db..f006bd94c 100644
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -30,7 +30,11 @@ class GlobalConfig:
         self.default_new_token_ratio_decay_steps = float(
             os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
         )
-
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
         # Runtime constants: others
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index d2298cf38..2a0b139f6 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2362,11 +2362,19 @@ class IdleSleeper:
 
     def __init__(self, sockets):
         self.poller = zmq.Poller()
+        self.last_empty_time = time.monotonic()  # monotonic: immune to clock jumps
         for s in sockets:
             self.poller.register(s, zmq.POLLIN)
 
     def maybe_sleep(self):
         self.poller.poll(1000)
+        # Release unoccupied cached GPU memory at most once per interval;
+        # a non-positive interval (the default, -1) disables this.
+        interval = global_config.torch_empty_cache_interval
+        now = time.monotonic()  # elapsed-time clock, unaffected by wall-clock changes
+        if interval > 0 and now - self.last_empty_time > interval:
+            self.last_empty_time = now
+            torch.cuda.empty_cache()
 
 
 def is_health_check_generate_req(recv_req):