Support GC Freezing to improve latency & throughput (#9241)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
Chanh Nguyen
2025-08-22 22:43:09 -07:00
committed by GitHub
parent 7e880286b5
commit 127d4b0d5e
8 changed files with 119 additions and 1 deletion

View File

@@ -78,6 +78,7 @@ from sglang.srt.managers.io_struct import (
ExpertDistributionReqOutput,
FlushCacheReqInput,
FlushCacheReqOutput,
FreezeGCReq,
GenerateReqInput,
GetInternalStateReq,
GetInternalStateReqOutput,
@@ -122,7 +123,9 @@ from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import (
configure_gc_warning,
dataclass_to_string_truncated,
freeze_gc,
get_bool_env_var,
get_zmq_socket,
kill_process_tree,
@@ -352,6 +355,10 @@ class TokenizerManager:
collect_tokens_histogram=self.server_args.collect_tokens_histogram,
)
# Configure GC warning
if self.server_args.gc_warning_threshold_secs > 0.0:
configure_gc_warning(self.server_args.gc_warning_threshold_secs)
# Communicators
self.init_weights_update_group_communicator = _Communicator(
self.send_to_scheduler, server_args.dp_size
@@ -446,6 +453,10 @@ class TokenizerManager:
ProfileReqOutput,
self.profile_communicator.handle_recv,
),
(
FreezeGCReq,
lambda x: None,
), # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
(
GetInternalStateReqOutput,
self.get_internal_state_communicator.handle_recv,
@@ -1359,6 +1370,12 @@ class TokenizerManager:
logging.info(f"Config logging: {obj=}")
self.log_request_metadata = self.get_log_request_metadata()
async def freeze_gc(self):
    """Freeze the garbage collector across the pipeline.

    Sends a ``FreezeGCReq`` to the scheduler (fire-and-forget over the
    ZMQ socket, no reply is awaited) so downstream components freeze
    their GC state, then freezes the tokenizer manager's own GC
    generations locally via the ``freeze_gc`` utility.

    Note: the scheduler may forward the request back through the
    detokenizer to this tokenizer manager; that echo is deliberately
    ignored by the ``(FreezeGCReq, lambda x: None)`` recv handler
    registered in the communicator table.
    """
    # Tell the scheduler first so the whole pipeline freezes together.
    self.send_to_scheduler.send_pyobj(FreezeGCReq())
    # Then freeze locally; the label identifies this process in logs.
    freeze_gc("Tokenizer Manager")
    # Implicit return: an async def returns None without an explicit
    # `return None` statement (removed as redundant).
def create_abort_task(self, obj: GenerateReqInput):
# Abort the request if the client is disconnected.
async def abort_request():