Support GC Freezing to improve latency & throughput (#9241)
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com> Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
This commit is contained in:
@@ -31,10 +31,12 @@ from sglang.srt.managers.io_struct import (
|
||||
BatchMultimodalOut,
|
||||
BatchStrOut,
|
||||
BatchTokenIDOut,
|
||||
FreezeGCReq,
|
||||
)
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.utils import (
|
||||
configure_logger,
|
||||
freeze_gc,
|
||||
get_zmq_socket,
|
||||
kill_itself_when_parent_died,
|
||||
)
|
||||
@@ -100,6 +102,7 @@ class DetokenizerManager:
|
||||
(BatchEmbeddingOut, self.handle_batch_embedding_out),
|
||||
(BatchTokenIDOut, self.handle_batch_token_id_out),
|
||||
(BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
|
||||
(FreezeGCReq, self.handle_freeze_gc_req),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -108,7 +111,8 @@ class DetokenizerManager:
|
||||
while True:
|
||||
recv_obj = self.recv_from_scheduler.recv_pyobj()
|
||||
output = self._request_dispatcher(recv_obj)
|
||||
self.send_to_tokenizer.send_pyobj(output)
|
||||
if output is not None:
|
||||
self.send_to_tokenizer.send_pyobj(output)
|
||||
|
||||
def trim_matched_stop(
|
||||
self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
|
||||
@@ -247,6 +251,10 @@ class DetokenizerManager:
|
||||
cached_tokens=recv_obj.cached_tokens,
|
||||
)
|
||||
|
||||
def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
|
||||
freeze_gc("Detokenizer Manager")
|
||||
return None
|
||||
|
||||
|
||||
class LimitedCapacityDict(OrderedDict):
|
||||
def __init__(self, capacity: int, *args, **kwargs):
|
||||
|
||||
Reference in New Issue
Block a user