Support GC Freezing to improve latency & throughput (#9241)
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com> Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
@@ -536,6 +536,22 @@ class Engine(EngineBase):
|
||||
self.tokenizer_manager.resume_memory_occupation(obj, None)
|
||||
)
|
||||
|
||||
def freeze_gc(self):
    """
    Freeze the garbage collector once the server has been warmed up.

    A low-latency, high-throughput server benefits from fewer stalls caused by
    the garbage collector scanning through a large number of objects. The
    intended workflow is:

    1. Start the server and warm it up with real requests, so that many of the
       long-lived objects that never need to be collected are initialized.
    2. Call this function, so that all objects created before this point are
       considered out of scope for garbage collection.

    This is a synchronous wrapper: it runs the tokenizer manager's
    ``freeze_gc()`` awaitable on the event loop and returns once it completes.
    """
    asyncio.get_event_loop().run_until_complete(
        self.tokenizer_manager.freeze_gc()
    )
|
||||
|
||||
"""
|
||||
Execute an RPC call on all scheduler processes.
|
||||
"""
|
||||
|
||||
@@ -511,6 +511,18 @@ async def stop_profile_async():
|
||||
)
|
||||
|
||||
|
||||
@app.api_route("/freeze_gc", methods=["GET", "POST"])
async def freeze_gc_async():
    """
    HTTP entry point (GET or POST) for freezing the garbage collector.

    Awaits the tokenizer manager's ``freeze_gc()`` and then replies with a
    plain confirmation message and status 200. See engine.freeze_gc for the
    rationale and the recommended warmup procedure.
    """
    manager = _global_state.tokenizer_manager
    await manager.freeze_gc()
    return Response(content="Garbage collection frozen.\n", status_code=200)
|
||||
|
||||
|
||||
@app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
|
||||
async def start_expert_distribution_record_async():
|
||||
"""Start recording the expert distribution. Clear the previous record if any."""
|
||||
|
||||
Reference in New Issue
Block a user