2024-01-08 04:37:50 +00:00
|
|
|
"""SRT: SGLang Runtime"""
|
2024-02-11 05:50:13 -08:00
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
import asyncio
|
2024-03-11 20:06:52 +08:00
|
|
|
import dataclasses
|
2024-01-08 04:37:50 +00:00
|
|
|
import json
|
2024-05-12 06:41:32 -07:00
|
|
|
import logging
|
2024-01-08 04:37:50 +00:00
|
|
|
import multiprocessing as mp
|
2024-01-18 23:43:09 -08:00
|
|
|
import os
|
2024-01-08 04:37:50 +00:00
|
|
|
import sys
|
|
|
|
|
import threading
|
|
|
|
|
import time
|
2024-05-17 05:49:31 -07:00
|
|
|
from http import HTTPStatus
|
2024-05-27 21:24:10 -07:00
|
|
|
from typing import Optional
|
2024-01-08 04:37:50 +00:00
|
|
|
|
2024-05-12 04:54:07 -07:00
|
|
|
# Fix a bug of Python threading
|
2024-04-23 22:36:33 +08:00
|
|
|
# NOTE(review): replaces the private threading._register_atexit hook with a
# no-op that accepts and ignores any arguments — confirm which upstream
# threading issue this works around before removing it.
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
|
|
|
|
|
2024-01-21 15:17:30 -08:00
|
|
|
import aiohttp
|
2024-01-08 04:37:50 +00:00
|
|
|
import psutil
|
|
|
|
|
import requests
|
|
|
|
|
import uvicorn
|
|
|
|
|
import uvloop
|
2024-05-12 06:41:32 -07:00
|
|
|
from fastapi import FastAPI, Request
|
2024-05-13 15:56:00 -07:00
|
|
|
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
2024-04-22 22:38:09 +08:00
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
from sglang.backend.runtime_endpoint import RuntimeEndpoint
|
2024-02-09 10:13:02 +08:00
|
|
|
from sglang.srt.constrained import disable_cache
|
2024-01-21 15:17:30 -08:00
|
|
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
2024-01-08 04:37:50 +00:00
|
|
|
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
|
2024-05-12 06:41:32 -07:00
|
|
|
from sglang.srt.managers.io_struct import GenerateReqInput
|
2024-05-27 21:24:10 -07:00
|
|
|
from sglang.srt.managers.controller.manager_single import start_controller_process as start_controller_process_single
|
|
|
|
|
from sglang.srt.managers.controller.manager_multi import start_controller_process as start_controller_process_multi
|
2024-01-08 04:37:50 +00:00
|
|
|
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
2024-05-12 06:41:32 -07:00
|
|
|
from sglang.srt.openai_api_adapter import (
|
2024-05-14 22:40:46 +08:00
|
|
|
load_chat_template_for_openai_api,
|
|
|
|
|
v1_chat_completions,
|
|
|
|
|
v1_completions,
|
|
|
|
|
)
|
2024-05-27 21:24:10 -07:00
|
|
|
from sglang.srt.server_args import ModelPortArgs, PortArgs, ServerArgs
|
2024-05-11 20:55:00 -07:00
|
|
|
from sglang.srt.utils import (
|
2024-05-14 22:40:46 +08:00
|
|
|
API_KEY_HEADER_NAME,
|
|
|
|
|
APIKeyValidatorMiddleware,
|
2024-05-11 20:55:00 -07:00
|
|
|
allocate_init_ports,
|
|
|
|
|
assert_pkg_version,
|
2024-05-12 04:54:07 -07:00
|
|
|
enable_show_time_cost,
|
2024-05-11 20:55:00 -07:00
|
|
|
)
|
2024-05-16 18:07:30 -07:00
|
|
|
from sglang.utils import get_exception_traceback
|
|
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
# Install uvloop's policy as the asyncio event loop policy at import time.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
|
|
|
|
|
2024-03-11 05:24:24 -07:00
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
# FastAPI application object; the HTTP route handlers in this module are
# registered on it and launch_server() hands it to uvicorn.
app = FastAPI()
|
|
|
|
|
# Module-level TokenizerManager handle read by the route handlers; it stays
# None until launch_server() assigns it.
tokenizer_manager = None
|
|
|
|
|
|
|
|
|
|
|
2024-01-23 22:00:28 -05:00
|
|
|
@app.get("/health")
async def health() -> Response:
    """Liveness probe: reply with an empty HTTP 200 response."""
    ok_response = Response(status_code=200)
    return ok_response
|
|
|
|
|
|
|
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
@app.get("/get_model_info")
async def get_model_info():
    """Return a dict whose "model_path" entry is the tokenizer manager's model path."""
    return {"model_path": tokenizer_manager.model_path}
|
|
|
|
|
|
2024-01-18 23:43:09 -08:00
|
|
|
|
2024-03-11 20:06:52 +08:00
|
|
|
@app.get("/get_server_args")
async def get_server_args():
    """Return the tokenizer manager's server arguments converted to a plain dict."""
    current_args = tokenizer_manager.server_args
    return dataclasses.asdict(current_args)
|
|
|
|
|
|
|
|
|
|
|
2024-01-26 13:32:59 +08:00
|
|
|
@app.get("/flush_cache")
async def flush_cache():
    """Call tokenizer_manager.flush_cache() and acknowledge with an HTTP 200 text response."""
    tokenizer_manager.flush_cache()
    # The reply is sent unconditionally; the message itself points the caller
    # at the backend logs for the actual outcome.
    message = (
        "Cache flushed.\nPlease check backend logs for more details. "
        "(When there are running or waiting requests, the operation will not be performed.)\n"
    )
    return Response(content=message, status_code=200)
|
|
|
|
|
|
|
|
|
|
|
2024-05-17 05:49:31 -07:00
|
|
|
async def generate_request(obj: GenerateReqInput, request: Request):
    """Handle a /generate call, either as a single reply or as an event stream.

    When ``obj.stream`` is falsy, the first item produced by
    ``tokenizer_manager.generate_request`` is returned directly; a
    ``ValueError`` becomes a JSON error body with HTTP 400.

    When ``obj.stream`` is truthy, every produced item is sent as a
    ``data: <json>`` server-sent event, a ``ValueError`` is reported as one
    error event, and the stream always ends with ``data: [DONE]``.
    """
    if not obj.stream:
        try:
            first_output = await tokenizer_manager.generate_request(
                obj, request
            ).__anext__()
        except ValueError as err:
            return JSONResponse(
                {"error": {"message": str(err)}}, status_code=HTTPStatus.BAD_REQUEST
            )
        return first_output

    def _as_event(payload):
        # One server-sent-event frame carrying the JSON-encoded payload.
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    async def stream_results():
        try:
            async for item in tokenizer_manager.generate_request(obj, request):
                yield _as_event(item)
        except ValueError as err:
            yield _as_event({"error": {"message": str(err)}})
        yield "data: [DONE]\n\n"

    event_source = stream_results()
    # The abort task is attached as the response's background task.
    abort_task = tokenizer_manager.create_abort_task(obj)
    return StreamingResponse(
        event_source,
        media_type="text/event-stream",
        background=abort_task,
    )
|
|
|
|
|
|
2024-05-11 20:55:00 -07:00
|
|
|
|
2024-05-17 02:35:15 -07:00
|
|
|
# generate_request is registered without a decorator so the same handler can
# serve /generate for POST (this statement) and PUT (the next statement).
app.post("/generate")(generate_request)
|
|
|
|
|
app.put("/generate")(generate_request)
|
|
|
|
|
|
2024-05-11 20:55:00 -07:00
|
|
|
|
2024-01-08 04:37:50 +00:00
|
|
|
@app.post("/v1/completions")
async def openai_v1_completions(raw_request: Request):
    """Forward a /v1/completions request to the OpenAI adapter's v1_completions."""
    completion_response = await v1_completions(tokenizer_manager, raw_request)
    return completion_response
|
2024-01-08 04:37:50 +00:00
|
|
|
|
|
|
|
|
|
2024-01-18 23:43:09 -08:00
|
|
|
@app.post("/v1/chat/completions")
async def openai_v1_chat_completions(raw_request: Request):
    """Forward a /v1/chat/completions request to the OpenAI adapter's v1_chat_completions."""
    chat_response = await v1_chat_completions(tokenizer_manager, raw_request)
    return chat_response
|
2024-05-12 07:37:49 +08:00
|
|
|
|
2024-05-11 20:55:00 -07:00
|
|
|
|
2024-05-14 07:57:00 +08:00
|
|
|
def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_args=None):
    """Start the controller and detokenizer processes, then serve HTTP requests.

    The function configures logging and global switches, allocates ports,
    creates the global ``tokenizer_manager``, spawns the controller and
    detokenizer processes, waits for both to report ``"init ok"``, starts a
    background warmup thread, and finally blocks in ``uvicorn.run``.

    Args:
        server_args: Server configuration. ``port`` and ``additional_ports``
            are overwritten with the values returned by
            ``allocate_init_ports``.
        pipe_finish_writer: Object with a ``send`` method, or ``None``. After
            a successful warmup it receives ``"init ok"``; if the warmup
            fails it receives the formatted traceback instead.
        model_overide_args: Passed through unchanged to ``TokenizerManager``
            and to the controller process.

    Raises:
        SystemExit: With code 1 if the controller or the detokenizer does not
            report ``"init ok"``.
    """
    global tokenizer_manager

    logging.basicConfig(
        level=getattr(logging, server_args.log_level.upper()),
        format="%(message)s",
    )

    # Set global environments
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    if server_args.show_time_cost:
        enable_show_time_cost()
    if server_args.disable_disk_cache:
        disable_cache()
    if server_args.enable_flashinfer:
        assert_pkg_version("flashinfer", "0.0.4")
    if server_args.chat_template:
        # TODO: replace this with huggingface transformers template
        load_chat_template_for_openai_api(server_args.chat_template)

    # Allocate ports
    server_args.port, server_args.additional_ports = allocate_init_ports(
        server_args.port,
        server_args.additional_ports,
        server_args.tp_size,
        server_args.dp_size,
    )

    # Init local models port args.
    # Layout of additional_ports: [tokenizer, router, detokenizer] followed by
    # one group of (tp + 1) ports per data-parallel replica; the first port of
    # a group is the NCCL port and the remaining tp ports are the TP ports.
    ports = server_args.additional_ports
    tp = server_args.tp_size
    model_port_args = []
    for i in range(server_args.dp_size):
        base = 3 + i * (tp + 1)
        model_port_args.append(
            ModelPortArgs(
                nccl_port=ports[base],
                model_tp_ports=ports[base + 1 : base + tp + 1],
            )
        )
    port_args = PortArgs(
        tokenizer_port=ports[0],
        router_port=ports[1],
        detokenizer_port=ports[2],
        model_port_args=model_port_args,
    )

    # Launch processes
    tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
    pipe_router_reader, pipe_router_writer = mp.Pipe(duplex=False)
    pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)

    if server_args.dp_size == 1:
        start_process = start_controller_process_single
    else:
        start_process = start_controller_process_multi
    proc_router = mp.Process(
        target=start_process,
        args=(server_args, port_args, pipe_router_writer, model_overide_args),
    )
    proc_router.start()
    proc_detoken = mp.Process(
        target=start_detokenizer_process,
        args=(
            server_args,
            port_args,
            pipe_detoken_writer,
        ),
    )
    proc_detoken.start()

    # Wait for the model to finish loading
    router_init_state = pipe_router_reader.recv()
    detoken_init_state = pipe_detoken_reader.recv()

    if router_init_state != "init ok" or detoken_init_state != "init ok":
        proc_router.kill()
        proc_detoken.kill()
        print(
            f"Initialization failed. router_init_state: {router_init_state}", flush=True
        )
        print(
            f"Initialization failed. detoken_init_state: {detoken_init_state}",
            flush=True,
        )
        sys.exit(1)
    assert proc_router.is_alive() and proc_detoken.is_alive()

    # A truthy api_key is necessarily non-empty, so one check is enough.
    if server_args.api_key:
        app.add_middleware(APIKeyValidatorMiddleware, api_key=server_args.api_key)

    def _wait_and_warmup():
        """Poll until the HTTP server answers, then send warmup requests."""
        headers = {}
        url = server_args.url()
        if server_args.api_key:
            headers[API_KEY_HEADER_NAME] = server_args.api_key

        # Wait until the server is launched
        for _ in range(120):
            time.sleep(0.5)
            try:
                requests.get(url + "/get_model_info", timeout=5, headers=headers)
                break
            except requests.exceptions.RequestException:
                # Not reachable yet; try again on the next iteration.
                pass

        # Send a warmup request
        try:
            for _ in range(server_args.dp_size):
                res = requests.post(
                    url + "/generate",
                    json={
                        "text": "The capital city of France is",
                        "sampling_params": {
                            "temperature": 0,
                            "max_new_tokens": 16,
                        },
                    },
                    headers=headers,
                    timeout=600,
                )
                assert res.status_code == 200
        except Exception as e:
            # The exception must be bound here: the name used by the polling
            # loop's handler no longer exists once that handler has finished.
            if pipe_finish_writer is not None:
                pipe_finish_writer.send(get_exception_traceback())
            print(f"Initialization failed. warmup error: {e}")
            raise

        if pipe_finish_writer is not None:
            pipe_finish_writer.send("init ok")

    # The warmup runs in a thread because uvicorn.run() below blocks.
    t = threading.Thread(target=_wait_and_warmup)
    t.start()

    # Listen for requests
    try:
        uvicorn.run(
            app,
            host=server_args.host,
            port=server_args.port,
            log_level=server_args.log_level,
            timeout_keep_alive=5,
            loop="uvloop",
        )
    finally:
        t.join()
|
2024-01-08 04:37:50 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class Runtime:
    """Run ``launch_server`` in a child process and expose helpers to use it.

    Construction blocks until the child reports ``"init ok"`` through a pipe;
    any other outcome shuts the child down and raises ``RuntimeError``.
    """

    def __init__(
        self,
        log_level: str = "error",
        model_overide_args: Optional[dict] = None,
        *args,
        **kwargs,
    ):
        """See the arguments in server_args.py::ServerArgs"""
        # Assigned first so shutdown() / __del__ are safe even if one of the
        # following steps raises before the child process exists.
        self.pid = None

        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)

        # Pre-allocate ports
        self.server_args.port, self.server_args.additional_ports = allocate_init_ports(
            self.server_args.port,
            self.server_args.additional_ports,
            self.server_args.tp_size,
            self.server_args.dp_size,
        )

        self.url = self.server_args.url()
        self.generate_url = (
            f"http://{self.server_args.host}:{self.server_args.port}/generate"
        )

        pipe_reader, pipe_writer = mp.Pipe(duplex=False)
        proc = mp.Process(
            target=launch_server,
            args=(self.server_args, pipe_writer, model_overide_args),
        )
        proc.start()
        # Close the parent's copy of the write end so recv() raises EOFError
        # if the child exits without sending anything.
        pipe_writer.close()
        self.pid = proc.pid

        try:
            init_state = pipe_reader.recv()
        except EOFError:
            init_state = ""
        finally:
            # Only one message is ever read; release the read end.
            pipe_reader.close()

        if init_state != "init ok":
            self.shutdown()
            raise RuntimeError(
                "Initialization failed. Please see the error messages above."
            )

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
        """Kill the server process and all of its descendants.

        Safe to call repeatedly, and safe on an instance whose ``__init__``
        never got as far as starting the child process.
        """
        pid = getattr(self, "pid", None)
        if pid is None:
            return

        try:
            parent = psutil.Process(pid)
            children = parent.children(recursive=True)
        except psutil.NoSuchProcess:
            # The server is already gone; forget the stale pid.
            self.pid = None
            return

        for child in children:
            try:
                child.kill()
            except psutil.NoSuchProcess:
                # The child exited between being listed and being killed.
                pass
        psutil.wait_procs(children, timeout=5)

        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass
        else:
            parent.wait(timeout=5)
        self.pid = None

    def get_tokenizer(self):
        """Build a tokenizer from this runtime's tokenizer-related server args."""
        return get_tokenizer(
            self.server_args.tokenizer_path,
            tokenizer_mode=self.server_args.tokenizer_mode,
            trust_remote_code=self.server_args.trust_remote_code,
        )

    async def add_request(
        self,
        prompt: str,
        sampling_params,
    ):
        """Stream a generation from the server, yielding only newly added text.

        Args:
            prompt: Sent as the ``"text"`` field of the request.
            sampling_params: Sent as the ``"sampling_params"`` field.

        Yields:
            Each non-empty increment of ``data["text"]`` beyond what has
            already been yielded.
        """
        json_data = {
            "text": prompt,
            "sampling_params": sampling_params,
            "stream": True,
        }
        # Number of characters of data["text"] already yielded.
        pos = 0

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
            async with session.post(self.generate_url, json=json_data) as response:
                async for chunk, _ in response.content.iter_chunks():
                    chunk = chunk.decode("utf-8")
                    if chunk and chunk.startswith("data:"):
                        if chunk == "data: [DONE]\n\n":
                            break
                        # Drop the "data:" prefix and the newlines around the JSON.
                        data = json.loads(chunk[5:].strip("\n"))
                        cur = data["text"][pos:]
                        if cur:
                            yield cur
                        pos += len(cur)

    def __del__(self):
        """Best-effort cleanup of the server process when the object is collected."""
        self.shutdown()
|