Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)
Co-authored-by: Kan Wu <wukanustc@gmail.com>
This commit is contained in:
@@ -4,6 +4,7 @@ from typing import List
|
||||
import numpy as np
|
||||
import tqdm
|
||||
|
||||
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
|
||||
from sglang.srt.managers.io_struct import GenerateReqInput
|
||||
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
||||
|
||||
@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
|
||||
return decorator
|
||||
|
||||
|
||||
async def execute_warmups(
    disaggregation_mode: str,
    warmup_names: List[str],
    tokenizer_manager: TokenizerManager,
):
    """Run the requested custom warmups one after another.

    Args:
        disaggregation_mode: Forwarded unchanged as the first argument of
            every warmup coroutine.
        warmup_names: Names to look up in ``_warmup_registry``; processed in
            the order given.
        tokenizer_manager: Forwarded unchanged as the second argument of
            every warmup coroutine.

    A name that is not present in ``_warmup_registry`` is reported with a
    warning and skipped; it does not abort the remaining warmups.
    """
    for warmup_name in warmup_names:
        if warmup_name not in _warmup_registry:
            logger.warning(f"Could not find custom warmup {warmup_name}")
            continue
        logger.info(f"Running warmup {warmup_name}")
        # Each registered warmup is awaited exactly once, using the
        # two-argument (disaggregation_mode, tokenizer_manager) calling
        # convention that the registered coroutines declare.
        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)
|
||||
|
||||
|
||||
@warmup("voice_chat")
|
||||
async def voice_chat(tokenizer_manager: TokenizerManager):
|
||||
async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
|
||||
# this warms up the fused_moe triton kernels and caches them
|
||||
# if we don't do this we break real time inference for voice chat
|
||||
for i in tqdm.trange(1, 512):
|
||||
@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
|
||||
"min_p": 0.0,
|
||||
},
|
||||
)
|
||||
if disaggregation_mode != "null":
|
||||
generate_req_input.bootstrap_room = 0
|
||||
generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
|
||||
|
||||
await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
|
||||
|
||||
Reference in New Issue
Block a user