Support colocating requests (#7973)

This commit is contained in:
fzyzcjy
2025-07-29 13:51:49 +08:00
committed by GitHub
parent 59d0bf012f
commit 0ce84c822b
6 changed files with 179 additions and 6 deletions

View File

@@ -27,6 +27,7 @@ import threading
import time
import uuid
from collections import deque
from contextlib import nullcontext
from datetime import datetime
from http import HTTPStatus
from typing import (
@@ -69,6 +70,7 @@ from sglang.srt.managers.io_struct import (
BatchMultimodalOut,
BatchStrOut,
BatchTokenIDOut,
BlockReqType,
CloseSessionReqInput,
ConfigureLoggingReq,
EmbeddingReqInput,
@@ -114,6 +116,7 @@ from sglang.srt.managers.io_struct import (
)
from sglang.srt.managers.mm_utils import TensorTransportMode
from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
from sglang.srt.metrics.collector import TokenizerMetricsCollector
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
@@ -819,12 +822,21 @@ class TokenizerManager:
rids.append(tmp_obj.rid)
else:
# Sequential tokenization and processing
for i in range(batch_size):
tmp_obj = obj[i]
tokenized_obj = await self._tokenize_one_request(tmp_obj)
state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
generators.append(self._wait_one_response(tmp_obj, state, request))
rids.append(tmp_obj.rid)
with (
input_blocker_guard_region(send_to_scheduler=self.send_to_scheduler)
if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
else nullcontext()
):
for i in range(batch_size):
tmp_obj = obj[i]
tokenized_obj = await self._tokenize_one_request(tmp_obj)
state = self._send_one_request(
tmp_obj, tokenized_obj, created_time
)
generators.append(
self._wait_one_response(tmp_obj, state, request)
)
rids.append(tmp_obj.rid)
else:
# FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
if batch_size > 128: