Support GC Freezing to improve latency & throughput (#9241)
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com> Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
@@ -31,10 +31,12 @@ from sglang.srt.managers.io_struct import (
|
||||
BatchMultimodalOut,
|
||||
BatchStrOut,
|
||||
BatchTokenIDOut,
|
||||
FreezeGCReq,
|
||||
)
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.utils import (
|
||||
configure_logger,
|
||||
freeze_gc,
|
||||
get_zmq_socket,
|
||||
kill_itself_when_parent_died,
|
||||
)
|
||||
@@ -100,6 +102,7 @@ class DetokenizerManager:
|
||||
(BatchEmbeddingOut, self.handle_batch_embedding_out),
|
||||
(BatchTokenIDOut, self.handle_batch_token_id_out),
|
||||
(BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
|
||||
(FreezeGCReq, self.handle_freeze_gc_req),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -108,7 +111,8 @@ class DetokenizerManager:
|
||||
while True:
|
||||
recv_obj = self.recv_from_scheduler.recv_pyobj()
|
||||
output = self._request_dispatcher(recv_obj)
|
||||
self.send_to_tokenizer.send_pyobj(output)
|
||||
if output is not None:
|
||||
self.send_to_tokenizer.send_pyobj(output)
|
||||
|
||||
def trim_matched_stop(
|
||||
self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
|
||||
@@ -247,6 +251,10 @@ class DetokenizerManager:
|
||||
cached_tokens=recv_obj.cached_tokens,
|
||||
)
|
||||
|
||||
def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
    """Handle a ``FreezeGCReq`` inside the detokenizer process.

    The request object is not inspected; receiving it is the trigger to
    call the ``freeze_gc`` helper with the label "Detokenizer Manager".

    Returns:
        None. There is no result object to pass on for this request.
    """
    freeze_gc("Detokenizer Manager")
|
||||
|
||||
|
||||
class LimitedCapacityDict(OrderedDict):
|
||||
def __init__(self, capacity: int, *args, **kwargs):
|
||||
|
||||
@@ -1005,6 +1005,11 @@ class ProfileReqOutput:
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass
class FreezeGCReq:
    """Field-less request used to trigger a GC freeze.

    It carries no data: the request dispatchers of the scheduler,
    detokenizer and tokenizer manager select a handler by this type alone.
    """
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConfigureLoggingReq:
|
||||
log_requests: Optional[bool] = None
|
||||
|
||||
@@ -72,6 +72,7 @@ from sglang.srt.managers.io_struct import (
|
||||
ExpertDistributionReqOutput,
|
||||
FlushCacheReqInput,
|
||||
FlushCacheReqOutput,
|
||||
FreezeGCReq,
|
||||
GetInternalStateReq,
|
||||
GetInternalStateReqOutput,
|
||||
GetWeightsByNameReqInput,
|
||||
@@ -145,6 +146,7 @@ from sglang.srt.utils import (
|
||||
configure_gc_logger,
|
||||
configure_logger,
|
||||
disable_request_logging,
|
||||
freeze_gc,
|
||||
get_available_gpu_memory,
|
||||
get_bool_env_var,
|
||||
get_zmq_socket,
|
||||
@@ -524,6 +526,7 @@ class Scheduler(
|
||||
(ResumeMemoryOccupationReqInput, self.resume_memory_occupation),
|
||||
(SlowDownReqInput, self.slow_down),
|
||||
(ProfileReq, self.profile),
|
||||
(FreezeGCReq, self.handle_freeze_gc),
|
||||
(GetInternalStateReq, self.get_internal_state),
|
||||
(SetInternalStateReq, self.set_internal_state),
|
||||
(RpcReqInput, self.handle_rpc_request),
|
||||
@@ -2469,6 +2472,12 @@ class Scheduler(
|
||||
if self.idle_sleeper is not None:
|
||||
self.idle_sleeper.maybe_sleep()
|
||||
|
||||
def handle_freeze_gc(self, recv_req: FreezeGCReq):
    """Freeze the scheduler's GC, then relay the request to the detokenizer.

    Args:
        recv_req: The incoming freeze request. It is forwarded unchanged.

    Returns:
        None. Nothing is produced for the sender of the request.
    """
    # Freeze locally first, then forward, matching the original ordering.
    freeze_gc("Scheduler")

    # Reuse the received object instead of building a new request.
    detokenizer_socket = self.send_to_detokenizer
    detokenizer_socket.send_pyobj(recv_req)
|
||||
|
||||
|
||||
class IdleSleeper:
|
||||
"""
|
||||
|
||||
@@ -78,6 +78,7 @@ from sglang.srt.managers.io_struct import (
|
||||
ExpertDistributionReqOutput,
|
||||
FlushCacheReqInput,
|
||||
FlushCacheReqOutput,
|
||||
FreezeGCReq,
|
||||
GenerateReqInput,
|
||||
GetInternalStateReq,
|
||||
GetInternalStateReqOutput,
|
||||
@@ -122,7 +123,9 @@ from sglang.srt.metrics.collector import TokenizerMetricsCollector
|
||||
from sglang.srt.sampling.sampling_params import SamplingParams
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.utils import (
|
||||
configure_gc_warning,
|
||||
dataclass_to_string_truncated,
|
||||
freeze_gc,
|
||||
get_bool_env_var,
|
||||
get_zmq_socket,
|
||||
kill_process_tree,
|
||||
@@ -352,6 +355,10 @@ class TokenizerManager:
|
||||
collect_tokens_histogram=self.server_args.collect_tokens_histogram,
|
||||
)
|
||||
|
||||
# Configure GC warning
|
||||
if self.server_args.gc_warning_threshold_secs > 0.0:
|
||||
configure_gc_warning(self.server_args.gc_warning_threshold_secs)
|
||||
|
||||
# Communicators
|
||||
self.init_weights_update_group_communicator = _Communicator(
|
||||
self.send_to_scheduler, server_args.dp_size
|
||||
@@ -446,6 +453,10 @@ class TokenizerManager:
|
||||
ProfileReqOutput,
|
||||
self.profile_communicator.handle_recv,
|
||||
),
|
||||
(
|
||||
FreezeGCReq,
|
||||
lambda x: None,
|
||||
),  # The scheduler may skip the detokenizer and forward FreezeGCReq straight back to the tokenizer manager; in that case we ignore it.
|
||||
(
|
||||
GetInternalStateReqOutput,
|
||||
self.get_internal_state_communicator.handle_recv,
|
||||
@@ -1359,6 +1370,12 @@ class TokenizerManager:
|
||||
logging.info(f"Config logging: {obj=}")
|
||||
self.log_request_metadata = self.get_log_request_metadata()
|
||||
|
||||
async def freeze_gc(self):
    """Ask the scheduler to freeze its GC, then freeze this process's GC.

    The request is sent before the local freeze. The coroutine does not
    wait for any reply.

    Returns:
        None.
    """
    request = FreezeGCReq()
    self.send_to_scheduler.send_pyobj(request)

    # Inside the method body, ``freeze_gc`` resolves to the module-level
    # helper imported from ``sglang.srt.utils``, not to this method.
    freeze_gc("Tokenizer Manager")
|
||||
|
||||
def create_abort_task(self, obj: GenerateReqInput):
|
||||
# Abort the request if the client is disconnected.
|
||||
async def abort_request():
|
||||
|
||||
Reference in New Issue
Block a user