Signed-off-by: ybyang <ybyang7@iflytek.com>
Signed-off-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Huang Long <121648372+LLLL114@users.noreply.github.com>
Co-authored-by: huanglong <huanglong@linux.alibaba.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
@@ -60,6 +60,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -814,18 +815,24 @@ def _launch_subprocesses(
         ),
     )
     detoken_proc.start()
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
 
-    # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
+        # Initialize templates
+        template_manager = None
+    else:
+        # Launch tokenizer process
+        tokenizer_manager = TokenizerManager(server_args, port_args)
 
-    # Initialize templates
-    template_manager = TemplateManager()
-    template_manager.initialize_templates(
-        tokenizer_manager=tokenizer_manager,
-        model_path=server_args.model_path,
-        chat_template=server_args.chat_template,
-        completion_template=server_args.completion_template,
-    )
+        # Initialize templates
+        template_manager = TemplateManager()
+        template_manager.initialize_templates(
+            tokenizer_manager=tokenizer_manager,
+            model_path=server_args.model_path,
+            chat_template=server_args.chat_template,
+            completion_template=server_args.completion_template,
+        )
 
     # Wait for the model to finish loading
     scheduler_infos = []
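
Note on the hunk above: in the router branch, template_manager is deliberately left as None. When tokenizer_worker_num > 1, each HTTP worker process builds its own TemplateManager inside init_multi_tokenizer(), which is added later in this commit. A minimal, self-contained sketch of that invariant, using illustrative names that are not sglang APIs:

from typing import Optional, Tuple

def launch_frontend(tokenizer_worker_num: int) -> Tuple[str, Optional[str]]:
    """Mirror of the branch above: router mode defers template setup."""
    if tokenizer_worker_num > 1:
        return "router", None  # each worker initializes templates itself
    return "tokenizer", "templates"  # single-process mode initializes here

frontend, templates = launch_frontend(4)
assert frontend == "router" and templates is None
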
@@ -23,6 +23,7 @@ import json
 import logging
+import multiprocessing as multiprocessing
 import os
 import tempfile
 import threading
 import time
 from http import HTTPStatus
@@ -91,11 +92,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    deserialize_data,
+    get_main_process_id,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
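
The multi_tokenizer_mixin helpers imported here are not shown in this diff. A minimal sketch of what the shared-memory plumbing plausibly looks like, assuming pickle plus the standard-library multiprocessing.shared_memory (the bodies are illustrative, not the actual sglang implementation):

import pickle
from multiprocessing import shared_memory

def write_to_shared_memory(name: str, obj) -> shared_memory.SharedMemory:
    # Serialize and publish under a well-known name, e.g. f"server_args_{pid}".
    data = pickle.dumps(obj)
    shm = shared_memory.SharedMemory(name=name, create=True, size=len(data))
    shm.buf[: len(data)] = data
    return shm  # the creator keeps the handle so it can unlink() at shutdown

def read_from_shared_memory(name: str):
    # Workers attach by name; pickle ignores any page-size padding at the end.
    shm = shared_memory.SharedMemory(name=name)
    obj = pickle.loads(bytes(shm.buf))
    shm.close()
    return obj
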
@@ -130,8 +138,79 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
+# Function to set up all middlewares for multi-tokenizer compatibility
+def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
+    """Setup all middlewares for both single and multi-process modes"""
+    worker_pid = os.getpid()
+
+    if api_key:
+        add_api_key_middleware(app, api_key)
+        logger.info(f"Worker {worker_pid} added API key middleware")
+
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+
+
+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
+    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
+    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
+    port_args, server_args = deserialize_data(port_args_data, server_args_data)
+    scheduler_info = scheduler_info_data
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    server_args = getattr(fast_api_app, "server_args", None)
+    if server_args is None:
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args = await init_multi_tokenizer()
+        setup_middlewares(
+            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
+        )
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
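
Why bootstrap inside lifespan: when uvicorn is started with workers > 1 it imports the app fresh in every worker process, so attributes set on app by the parent (such as server_args) are absent there. The getattr check above detects that case and rebuilds per-process state from shared memory. A stripped-down, self-contained sketch of the same pattern:

from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # In a worker process this attribute was never set by the parent,
    # so the worker initializes itself (stand-in for init_multi_tokenizer()).
    if getattr(app, "server_args", None) is None:
        app.server_args = object()  # load the real config from shared memory here
    yield

app = FastAPI(lifespan=lifespan)
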
@@ -191,7 +270,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-    yield
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")
 
 
 # Fast API
@@ -1078,9 +1165,19 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-        server_args=server_args
-    )
+    if server_args.tokenizer_worker_num > 1:
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
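
A side note on the ipc:// naming used above: tempfile.NamedTemporaryFile(delete=False) is only used to reserve a unique filesystem path, which ZMQ then uses as a UNIX-domain socket endpoint. A small sketch of the idea (unlinking the placeholder before bind is an extra precaution in this sketch, not something the diff shows):

import os
import tempfile

import zmq

path = tempfile.NamedTemporaryFile(delete=False).name  # unique, race-free path
os.unlink(path)  # let ZMQ create the socket file itself
ctx = zmq.Context()
sock = ctx.socket(zmq.PULL)
sock.bind(f"ipc://{path}")  # endpoint string matches the f"ipc://..." above
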
@@ -1089,42 +1186,75 @@ def launch_server(
         )
     )
 
-    # Add api key authorization
-    if server_args.api_key:
-        add_api_key_middleware(app, server_args.api_key)
+    if server_args.tokenizer_worker_num > 1:
+        port_args_shm, server_args_shm, scheduler_info_shm = (
+            write_data_for_multi_tokenizer(
+                port_args,
+                server_args,
+                scheduler_info,
+            )
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
 
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
 
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
-            server_args,
-            pipe_finish_writer,
-            launch_callback,
-        ),
-    )
-    app.warmup_thread = warmup_thread
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread
 
     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-        uvicorn.run(
-            app,
-            host=server_args.host,
-            port=server_args.port,
-            log_level=server_args.log_level_http or server_args.log_level,
-            timeout_keep_alive=5,
-            loop="uvloop",
-        )
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-        warmup_thread.join()
+        if server_args.tokenizer_worker_num > 1:
+            port_args_shm.unlink()
+            server_args_shm.unlink()
+            scheduler_info_shm.unlink()
+            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+        else:
+            warmup_thread.join()
 
 
 def _execute_server_warmup(
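
One detail worth calling out in the multi-tokenizer uvicorn.run call above: with workers > 1, uvicorn requires an application import string rather than an app object, because each worker process must re-import the module on its own. That is why this branch passes "sglang.srt.entrypoints.http_server:app" while the single-process branch passes app directly. A minimal sketch of the distinction:

import uvicorn

# Single process: passing an app object is fine.
# uvicorn.run(app, host="0.0.0.0", port=30000)

# Multiple workers: must be an import string so each forked worker
# can re-import the application module for itself.
if __name__ == "__main__":
    uvicorn.run("sglang.srt.entrypoints.http_server:app", port=30000, workers=4)
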