Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)
Co-authored-by: Kan Wu <wukanustc@gmail.com>
@@ -126,8 +126,6 @@ def set_global_state(global_state: _GlobalState):
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args: ServerArgs = fast_api_app.server_args
-
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -145,9 +143,12 @@ async def lifespan(fast_api_app: FastAPI):
         _global_state.tokenizer_manager
     )
 
+    server_args: ServerArgs = fast_api_app.server_args
     if server_args.warmups is not None:
         await execute_warmups(
-            server_args.warmups.split(","), _global_state.tokenizer_manager
+            server_args.disaggregation_mode,
+            server_args.warmups.split(","),
+            _global_state.tokenizer_manager,
         )
         logger.info("Warmup ended")
 
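The warmup call now threads the server's disaggregation mode through to execute_warmups, so individual warmups can distinguish co-located serving from prefill/decode-disaggregated deployments. A minimal sketch of what a mode-aware dispatcher could look like; the registry and signature below are assumptions for illustration, not SGLang's actual implementation:

    from typing import Any, Awaitable, Callable, Dict, List

    # Hypothetical registry of named warmup coroutines; the real
    # execute_warmups in sglang may be organized differently.
    WARMUP_REGISTRY: Dict[str, Callable[..., Awaitable[Any]]] = {}

    async def execute_warmups(
        disaggregation_mode: str,
        warmup_names: List[str],
        tokenizer_manager,
    ) -> None:
        for name in warmup_names:
            # Pass the mode along so a warmup can skip or adapt itself
            # on dedicated prefill or decode servers.
            await WARMUP_REGISTRY[name](tokenizer_manager, disaggregation_mode)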
@@ -280,13 +281,17 @@ async def get_model_info():
         "model_path": _global_state.tokenizer_manager.model_path,
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
+        "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
     }
     return result
 
 
 @app.get("/get_server_info")
 async def get_server_info():
-    internal_states = await _global_state.tokenizer_manager.get_internal_state()
+    # Returns internal states per DP.
+    internal_states: List[Dict[Any, Any]] = (
+        await _global_state.tokenizer_manager.get_internal_state()
+    )
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
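Two observable API changes in this hunk: /get_model_info now surfaces preferred_sampling_params, and /get_server_info returns one internal-state dict per data-parallel (DP) rank. A quick client-side check with requests; the port is SGLang's default, and the internal_states response key is an assumption since the hunk truncates the return dict:

    import requests

    base = "http://localhost:30000"  # default SGLang server port

    info = requests.get(f"{base}/get_model_info").json()
    # New field from this commit; None unless preferred sampling
    # params were configured at launch.
    print(info.get("preferred_sampling_params"))

    server_info = requests.get(f"{base}/get_server_info").json()
    # Assumed key: a list with one entry per DP rank.
    print(len(server_info.get("internal_states", [])))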
@@ -300,6 +305,8 @@ async def get_load():
     return await _global_state.tokenizer_manager.get_load()
 
 
+# example usage:
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
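For completeness, a Python equivalent of the curl example added above; the payload shape comes straight from the diff:

    import requests

    resp = requests.post(
        "http://localhost:30000/set_internal_state",
        # Same payload as the curl example in the comment above.
        json={"server_args": {"max_micro_batch_size": 8}},
    )
    print(resp.json())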
@@ -886,6 +893,15 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()
 
+    image_token_text = None
+    if (
+        tokenizer_manager.image_token_id is not None
+        and not server_args.skip_tokenizer_init
+    ):
+        image_token_text = tokenizer_manager.tokenizer.decode(
+            [tokenizer_manager.image_token_id]
+        )
+
     # Send a warmup request - we will create the thread and launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
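The image token is now decoded to its text form once in launch_server, and only when a tokenizer actually exists (i.e. skip_tokenizer_init is unset), so the warmup thread receives the ready-made string instead of a raw id. A standalone illustration of that decode step with a Hugging Face tokenizer; the model and token names are illustrative, not taken from this diff:

    from transformers import AutoTokenizer

    # Decode a special token id to text once, mirroring
    # tokenizer_manager.tokenizer.decode([image_token_id]) above.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    image_token_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
    image_token_text = tokenizer.decode([image_token_id])
    print(image_token_text)  # "<|image_pad|>"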
@@ -893,7 +909,7 @@ def launch_server(
         args=(
             server_args,
             pipe_finish_writer,
-            _global_state.tokenizer_manager.image_token_id,
+            image_token_text,
             launch_callback,
         ),
     )
@@ -1022,9 +1038,10 @@ def _wait_and_warmup(
             return
 
     # Debug print
-    # logger.info(f"{res.json()=}")
+    # logger.info(f"warmup request returns: {res.json()=}")
 
     logger.info("The server is fired up and ready to roll!")
 
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("ready")
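_wait_and_warmup polls the server, fires a small warmup generation, and only then reports readiness over pipe_finish_writer. A simplified sketch of that handshake pattern; /health and /generate are real SGLang endpoints, but the control flow here is a condensed assumption, not the function itself:

    import time
    import requests

    def wait_and_warmup_sketch(base_url: str, pipe_finish_writer=None) -> None:
        # Poll until the HTTP server answers, then send one tiny
        # generation so the model code paths are warm before "ready".
        deadline = time.monotonic() + 300
        while time.monotonic() < deadline:
            try:
                if requests.get(f"{base_url}/health", timeout=5).ok:
                    break
            except requests.RequestException:
                time.sleep(1)
        res = requests.post(
            f"{base_url}/generate",
            json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
            timeout=600,
        )
        # logger.info(f"warmup request returns: {res.json()=}")
        if pipe_finish_writer is not None:
            pipe_finish_writer.send("ready")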