Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com> Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
2025-08-22 22:43:09 -07:00
parent 7e880286b5
commit 127d4b0d5e
8 changed files with 119 additions and 1 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -536,6 +536,22 @@ class Engine(EngineBase):
            self.tokenizer_manager.resume_memory_occupation(obj, None)
        )

+    def freeze_gc(self):
+        """
+        To maintain a high performance server with low latency, we want to reduce the
+        stalls caused by the garbage collector scanning through a large number of objects.
+
+        It is usually helpful to start the server and warm it up with real requests to
+        initialize many of the long-lived objects that do not need to be garbage collected.
+
+        After sufficient warmup, we can call this function to freeze the garbage collector
+        so that all objects created before this point are considered out of scope for garbage
+        collection.
+        """
+
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.freeze_gc())
+
    """
    Execute an RPC call on all scheduler processes.
    """
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -511,6 +511,18 @@ async def stop_profile_async():
    )


+@app.api_route("/freeze_gc", methods=["GET", "POST"])
+async def freeze_gc_async():
+    """
+    See engine.freeze_gc for more details.
+    """
+    await _global_state.tokenizer_manager.freeze_gc()
+    return Response(
+        content="Garbage collection frozen.\n",
+        status_code=200,
+    )
+
+
@app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
 async def start_expert_distribution_record_async():
    """Start recording the expert distribution. Clear the previous record if any."""