Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)

Co-authored-by: Kan Wu <wukanustc@gmail.com>
This commit is contained in:
Lianmin Zheng
2025-06-29 23:16:19 -07:00
committed by GitHub
parent c5131f7a2f
commit 22352d47a9
24 changed files with 626 additions and 160 deletions


@@ -126,8 +126,6 @@ def set_global_state(global_state: _GlobalState):
@asynccontextmanager
async def lifespan(fast_api_app: FastAPI):
server_args: ServerArgs = fast_api_app.server_args
# Initialize OpenAI serving handlers
fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
_global_state.tokenizer_manager, _global_state.template_manager
@@ -145,9 +143,12 @@ async def lifespan(fast_api_app: FastAPI):
_global_state.tokenizer_manager
)
server_args: ServerArgs = fast_api_app.server_args
if server_args.warmups is not None:
await execute_warmups(
server_args.warmups.split(","), _global_state.tokenizer_manager
server_args.disaggregation_mode,
server_args.warmups.split(","),
_global_state.tokenizer_manager,
)
logger.info("Warmup ended")
@@ -280,13 +281,17 @@ async def get_model_info():
"model_path": _global_state.tokenizer_manager.model_path,
"tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
"is_generation": _global_state.tokenizer_manager.is_generation,
"preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
}
return result
@app.get("/get_server_info")
async def get_server_info():
internal_states = await _global_state.tokenizer_manager.get_internal_state()
# Returns internal states, one per DP rank.
internal_states: List[Dict[Any, Any]] = (
await _global_state.tokenizer_manager.get_internal_state()
)
return {
**dataclasses.asdict(_global_state.tokenizer_manager.server_args),
**_global_state.scheduler_info,
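
Two API-visible changes land in this hunk: /get_model_info now also reports preferred_sampling_params, and get_internal_state returns one state dict per data-parallel (DP) rank instead of a single dict. A hedged client sketch against both endpoints (host/port follow the curl example below; the internal_states response key is inferred from the variable name, so the exact payload layout may differ):

import requests

BASE = "http://localhost:30000"  # assumed default host/port

# New field added by this commit; may be None when not configured.
info = requests.get(f"{BASE}/get_model_info").json()
print(info.get("preferred_sampling_params"))

# With dp_size > 1, internal states come back as one entry per DP rank.
server_info = requests.get(f"{BASE}/get_server_info").json()
for rank, state in enumerate(server_info.get("internal_states", [])):
    print(f"DP rank {rank}: {state}")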
@@ -300,6 +305,8 @@ async def get_load():
return await _global_state.tokenizer_manager.get_load()
# Example usage:
# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
@app.api_route("/set_internal_state", methods=["POST", "PUT"])
async def set_internal_state(obj: SetInternalStateReq, request: Request):
res = await _global_state.tokenizer_manager.set_internal_state(obj)
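
The same request as the curl example in the comment above, in Python form:

import requests

# Mirrors the curl invocation documented above /set_internal_state.
res = requests.post(
    "http://localhost:30000/set_internal_state",
    json={"server_args": {"max_micro_batch_size": 8}},
)
print(res.json())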
@@ -886,6 +893,15 @@ def launch_server(
add_prometheus_middleware(app)
enable_func_timer()
image_token_text = None
if (
tokenizer_manager.image_token_id is not None
and not server_args.skip_tokenizer_init
):
image_token_text = tokenizer_manager.tokenizer.decode(
[tokenizer_manager.image_token_id]
)
# Send a warmup request - we will create the thread and launch it
# in the lifespan after all other warmups have fired.
warmup_thread = threading.Thread(
@@ -893,7 +909,7 @@ def launch_server(
args=(
server_args,
pipe_finish_writer,
_global_state.tokenizer_manager.image_token_id,
image_token_text,
launch_callback,
),
)
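
Decoding the image token id into its text form once in the launch process, and passing the string to the warmup thread, means the warmup no longer needs tokenizer access (which is unavailable when skip_tokenizer_init is set). The decode step in isolation, as a standalone sketch (the model name and placeholder token are assumptions for illustration):

from transformers import AutoTokenizer

# Sketch: turn an image token id into its literal token text, as the
# launch path above now does before handing it to the warmup thread.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")  # assumed model
image_token_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")  # assumed token
image_token_text = tokenizer.decode([image_token_id])
print(image_token_text)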
@@ -1022,9 +1038,10 @@ def _wait_and_warmup(
return
# Debug print
# logger.info(f"{res.json()=}")
# logger.info(f"warmup request returns: {res.json()=}")
logger.info("The server is fired up and ready to roll!")
if pipe_finish_writer is not None:
pipe_finish_writer.send("ready")