feat(example/fastapi): support --startup-timeout using Qwen3-Next-80B-A3B-Instruct as example (#11710)

Co-authored-by: chenan01 <chenan01@cheche-MacBook-Pro.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Kindyaa
2025-10-19 02:50:34 +08:00
committed by GitHub
parent f9a7d9b3dc
commit c44e985dc2

View File

@@ -4,7 +4,7 @@ FastAPI server example for text generation using SGLang Engine and demonstrating
 Starts the server, sends requests to it, and prints responses.
 Usage:
-    python fastapi_engine_inference.py --model-path Qwen/Qwen2.5-0.5B-Instruct --tp_size 1 --host 127.0.0.1 --port 8000
+    python fastapi_engine_inference.py --model-path Qwen/Qwen2.5-0.5B-Instruct --tp_size 1 --host 127.0.0.1 --port 8000 [--startup-timeout 60]
 """
 import os
@@ -160,6 +160,12 @@ if __name__ == "__main__":
 parser.add_argument("--port", type=int, default=8000)
 parser.add_argument("--model-path", type=str, default="Qwen/Qwen2.5-0.5B-Instruct")
 parser.add_argument("--tp_size", type=int, default=1)
parser.add_argument(
"--startup-timeout",
type=int,
default=60,
help="Time in seconds to wait for the server to be ready (default: %(default)s)",
)
 args = parser.parse_args()

 # Pass the model to the child uvicorn process via an env var
@@ -167,7 +173,7 @@ if __name__ == "__main__":
 os.environ["TP_SIZE"] = str(args.tp_size)

 # Start the server
-process = start_server(args)
+process = start_server(args, timeout=args.startup_timeout)

 # Define the prompts and sampling parameters
 prompts = [