[4/N]DP refactor: support watching mode get_load and shortest queue strategy (#10201)

This commit is contained in:
Liangsheng Yin
2025-09-15 10:06:08 +08:00
committed by GitHub
parent ca63f075b7
commit 305c9e8c2d
12 changed files with 202 additions and 44 deletions

View File

@@ -64,6 +64,7 @@ from sglang.srt.managers.io_struct import (
EmbeddingReqInput,
FreezeGCReq,
GenerateReqInput,
GetLoadReqInput,
HealthCheckOutput,
MultiTokenizerWrapper,
OpenSessionReqInput,
@@ -73,6 +74,7 @@ from sglang.srt.managers.io_struct import (
TokenizedGenerateReqInput,
UpdateWeightFromDiskReqInput,
UpdateWeightFromDiskReqOutput,
WatchLoadUpdateReq,
)
from sglang.srt.managers.mm_utils import TensorTransportMode
from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
@@ -1240,6 +1242,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
self.asyncio_tasks.add(
loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
)
self.asyncio_tasks.add(
loop.create_task(print_exception_wrapper(self.watch_load_thread))
)
def dump_requests_before_crash(self):
if self.crash_dump_performed:
@@ -1844,6 +1849,20 @@ class TokenizerManager(TokenizerCommunicatorMixin):
return scores
async def watch_load_thread(self):
    """Periodically fetch load information and forward it to the scheduler.

    Despite the name, this is a coroutine (it is scheduled as an asyncio
    task), not an OS thread. It returns immediately when
    ``server_args.dp_size`` is 1 or when ``server_args.load_balance_method``
    is ``"round_robin"``. Otherwise it loops forever: sleep for
    ``server_args.load_watch_interval``, await
    ``self.get_load_communicator(GetLoadReqInput())``, wrap the result in a
    ``WatchLoadUpdateReq`` and send it through ``self.send_to_scheduler``.
    """
    # Only for dp_controller when dp_size > 1
    server_args = self.server_args
    if server_args.dp_size == 1:
        return
    if server_args.load_balance_method == "round_robin":
        return

    while True:
        # The interval is read on every cycle, exactly as in the original loop.
        await asyncio.sleep(self.server_args.load_watch_interval)
        current_loads = await self.get_load_communicator(GetLoadReqInput())
        self.send_to_scheduler.send_pyobj(WatchLoadUpdateReq(loads=current_loads))
class ServerStatus(Enum):
Up = "Up"