Support Multi Process Tokenizer Manager (#6555) (#8964)

Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
This commit is contained in:
ybyang
2025-09-01 16:00:13 +08:00
committed by GitHub
parent 4750cddf68
commit 5f77e1292d
11 changed files with 1030 additions and 80 deletions

View File

@@ -84,6 +84,8 @@ from sglang.srt.managers.io_struct import (
InitWeightsUpdateGroupReqInput,
LoadLoRAAdapterReqInput,
LoadLoRAAdapterReqOutput,
MultiTokenizerRegisterReq,
MultiTokenizerWarpper,
OpenSessionReqInput,
OpenSessionReqOutput,
ProfileReq,
@@ -257,7 +259,6 @@ class Scheduler(
# Init inter-process communication
context = zmq.Context(2)
self.idle_sleeper = None
if self.pp_rank == 0 and self.attn_tp_rank == 0:
self.recv_from_tokenizer = get_zmq_socket(
context, zmq.PULL, port_args.scheduler_input_ipc_name, False
@@ -540,6 +541,7 @@ class Scheduler(
(ExpertDistributionReq, self.expert_distribution_handle),
(LoadLoRAAdapterReqInput, self.load_lora_adapter),
(UnloadLoRAAdapterReqInput, self.unload_lora_adapter),
(MultiTokenizerRegisterReq, self.register_multi_tokenizer),
]
)
@@ -1101,6 +1103,17 @@ class Scheduler(
)
self.send_to_tokenizer.send_pyobj(abort_req)
continue
# If it is a MultiTokenizerWarpper, unwrap it and handle the inner request.
if isinstance(recv_req, MultiTokenizerWarpper):
worker_id = recv_req.worker_id
recv_req = recv_req.obj
output = self._request_dispatcher(recv_req)
if output is not None:
output = MultiTokenizerWarpper(worker_id, output)
self.send_to_tokenizer.send_pyobj(output)
continue
output = self._request_dispatcher(recv_req)
if output is not None:
if isinstance(output, RpcReqOutput):
@@ -2474,6 +2487,10 @@ class Scheduler(
result = self.tp_worker.unload_lora_adapter(recv_req)
return result
def register_multi_tokenizer(self, recv_req: MultiTokenizerRegisterReq):
    """Relay a ``MultiTokenizerRegisterReq`` to the detokenizer.

    The request object is pushed unchanged through
    ``self.send_to_detokenizer`` and then handed back to the caller.

    Args:
        recv_req: The request received by the scheduler.

    Returns:
        The same ``recv_req`` object that was passed in.
    """
    # Forward first, then return the request itself as this handler's result.
    detokenizer_socket = self.send_to_detokenizer
    detokenizer_socket.send_pyobj(recv_req)
    return recv_req
def slow_down(self, recv_req: SlowDownReqInput):
t = recv_req.forward_sleep_time
if t is not None and t <= 0: