Reduce memory accumulation in long-running server (#8306)

Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
2025-08-03 02:03:16 -05:00
parent 8675bdf246
commit 0305c5053f
2 changed files with 13 additions and 1 deletion
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -30,7 +30,11 @@ class GlobalConfig:
        self.default_new_token_ratio_decay_steps = float(
            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
        )
-
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
        # Runtime constants: others
        self.retract_decode_steps = 20
        self.flashinfer_workspace_size = os.environ.get(