[Performance] Batch Send from Tokenizer Manager. (#9436)
This commit is contained in:
committed by
GitHub
parent
3aec3d4f8b
commit
ea0696b924
@@ -67,6 +67,8 @@ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.layers.moe import initialize_moe_config
|
||||
from sglang.srt.managers.io_struct import (
|
||||
AbortReq,
|
||||
BatchTokenizedEmbeddingReqInput,
|
||||
BatchTokenizedGenerateReqInput,
|
||||
CloseSessionReqInput,
|
||||
ExpertDistributionReq,
|
||||
ExpertDistributionReqOutput,
|
||||
@@ -510,6 +512,8 @@ class Scheduler(
|
||||
[
|
||||
(TokenizedGenerateReqInput, self.handle_generate_request),
|
||||
(TokenizedEmbeddingReqInput, self.handle_embedding_request),
|
||||
(BatchTokenizedGenerateReqInput, self.handle_batch_generate_request),
|
||||
(BatchTokenizedEmbeddingReqInput, self.handle_batch_embedding_request),
|
||||
(FlushCacheReqInput, self.flush_cache_wrapped),
|
||||
(AbortReq, self.abort_request),
|
||||
(OpenSessionReqInput, self.open_session),
|
||||
@@ -1018,14 +1022,26 @@ class Scheduler(
|
||||
req
|
||||
for req in recv_reqs
|
||||
if isinstance(
|
||||
req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)
|
||||
req,
|
||||
(
|
||||
TokenizedGenerateReqInput,
|
||||
TokenizedEmbeddingReqInput,
|
||||
BatchTokenizedGenerateReqInput,
|
||||
BatchTokenizedEmbeddingReqInput,
|
||||
),
|
||||
)
|
||||
]
|
||||
control_reqs = [
|
||||
req
|
||||
for req in recv_reqs
|
||||
if not isinstance(
|
||||
req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput)
|
||||
req,
|
||||
(
|
||||
TokenizedGenerateReqInput,
|
||||
TokenizedEmbeddingReqInput,
|
||||
BatchTokenizedGenerateReqInput,
|
||||
BatchTokenizedEmbeddingReqInput,
|
||||
),
|
||||
)
|
||||
]
|
||||
else:
|
||||
@@ -1253,6 +1269,17 @@ class Scheduler(
|
||||
else:
|
||||
self._add_request_to_queue(req)
|
||||
|
||||
def handle_batch_generate_request(
|
||||
self,
|
||||
recv_req: BatchTokenizedGenerateReqInput,
|
||||
):
|
||||
"""Handle optimized batch generate request."""
|
||||
logger.debug(f"Processing batch generate request with {len(recv_req)} requests")
|
||||
|
||||
# Process each request in the batch
|
||||
for tokenized_req in recv_req:
|
||||
self.handle_generate_request(tokenized_req)
|
||||
|
||||
def _add_request_to_queue(self, req: Req):
|
||||
req.queue_time_start = time.perf_counter()
|
||||
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
||||
@@ -1335,6 +1362,19 @@ class Scheduler(
|
||||
req.logprob_start_len = len(req.origin_input_ids) - 1
|
||||
self._add_request_to_queue(req)
|
||||
|
||||
def handle_batch_embedding_request(
|
||||
self,
|
||||
recv_req: BatchTokenizedEmbeddingReqInput,
|
||||
):
|
||||
"""Handle optimized batch embedding request."""
|
||||
logger.debug(
|
||||
f"Processing batch embedding request with {len(recv_req)} requests"
|
||||
)
|
||||
|
||||
# Process each request in the batch
|
||||
for tokenized_req in recv_req:
|
||||
self.handle_embedding_request(tokenized_req)
|
||||
|
||||
def self_check_during_idle(self):
|
||||
self.check_memory()
|
||||
self.check_tree_cache()
|
||||
@@ -2513,7 +2553,15 @@ def is_health_check_generate_req(recv_req):
|
||||
|
||||
|
||||
def is_work_request(recv_req):
|
||||
return isinstance(recv_req, (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput))
|
||||
return isinstance(
|
||||
recv_req,
|
||||
(
|
||||
TokenizedGenerateReqInput,
|
||||
TokenizedEmbeddingReqInput,
|
||||
BatchTokenizedGenerateReqInput,
|
||||
BatchTokenizedEmbeddingReqInput,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def run_scheduler_process(
|
||||
|
||||
Reference in New Issue
Block a user