[router][grpc] Add serve_grpc to launch_server and log id for HealthCheck (#11564)

This commit is contained in:
Chang Su
2025-10-13 16:07:19 -07:00
committed by GitHub
parent 065ce81574
commit 887c2b4575
9 changed files with 68 additions and 93 deletions

View File

@@ -1,9 +1,9 @@
"""Launch the inference server."""
import asyncio
import os
import sys
from sglang.srt.entrypoints.http_server import launch_server
from sglang.srt.server_args import prepare_server_args
from sglang.srt.utils import kill_process_tree
@@ -11,6 +11,13 @@ if __name__ == "__main__":
server_args = prepare_server_args(sys.argv[1:])
try:
launch_server(server_args)
if server_args.grpc_mode:
from sglang.srt.entrypoints.grpc_server import serve_grpc
asyncio.run(serve_grpc(server_args))
else:
from sglang.srt.entrypoints.http_server import launch_server
launch_server(server_args)
finally:
kill_process_tree(os.getpid(), include_parent=False)

View File

@@ -22,8 +22,8 @@ from grpc_reflection.v1alpha import reflection
import sglang
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
from sglang.srt.managers.data_parallel_controller import (
run_data_parallel_controller_process,
)
@@ -68,6 +68,8 @@ def _launch_scheduler_process_only(
# Configure global environment
configure_logger(server_args)
server_args.check_server_args()
# Fix CUDA multiprocessing issues - must be called before any CUDA operations
mp.set_start_method("spawn", force=True)
# Allocate ports for inter-process communications
if port_args is None:
@@ -317,7 +319,8 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
Check the health of the inference server by sending a special request to generate one token.
Similar to HTTP server's /health endpoint.
"""
logger.info("Receive health check request")
rid = f"HEALTH_CHECK_{time.time()}"
logger.info(f"Receive health check request: {rid}")
if self.request_manager.gracefully_exit:
logger.info(
@@ -328,7 +331,6 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
)
# Create a special health check request
rid = f"HEALTH_CHECK_{time.time()}"
sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
sampling_params.normalize(tokenizer=None)
@@ -919,25 +921,3 @@ async def serve_grpc(
proc.join(timeout=1.0)
logger.info("All scheduler processes terminated")
def main():
"""Main entry point for standalone gRPC server."""
# Fix CUDA multiprocessing issues - must be called before any CUDA operations
mp.set_start_method("spawn", force=True)
parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
ServerArgs.add_cli_args(parser)
args = parser.parse_args()
server_args = ServerArgs.from_cli_args(args)
# Run server
asyncio.run(
serve_grpc(
server_args=server_args,
)
)
if __name__ == "__main__":
main()

View File

@@ -326,10 +326,7 @@ message EmbedError {
// Management Operations
// =====================
message HealthCheckRequest {
// Input for health test generation (must be tokenized)
TokenizedInput tokenized = 1;
}
message HealthCheckRequest {}
message HealthCheckResponse {
bool healthy = 1;

File diff suppressed because one or more lines are too long

View File

@@ -320,10 +320,8 @@ class EmbedError(_message.Message):
def __init__(self, message: _Optional[str] = ..., code: _Optional[str] = ..., details: _Optional[str] = ...) -> None: ...
class HealthCheckRequest(_message.Message):
__slots__ = ("tokenized",)
TOKENIZED_FIELD_NUMBER: _ClassVar[int]
tokenized: TokenizedInput
def __init__(self, tokenized: _Optional[_Union[TokenizedInput, _Mapping]] = ...) -> None: ...
__slots__ = ()
def __init__(self) -> None: ...
class HealthCheckResponse(_message.Message):
__slots__ = ("healthy", "message")

View File

@@ -194,6 +194,7 @@ class ServerArgs:
# HTTP server
host: str = "127.0.0.1"
port: int = 30000
grpc_mode: bool = False
skip_server_warmup: bool = False
warmups: Optional[str] = None
nccl_port: Optional[int] = None
@@ -1516,6 +1517,11 @@ class ServerArgs:
default=ServerArgs.port,
help="The port of the HTTP server.",
)
parser.add_argument(
"--grpc-mode",
action="store_true",
help="If set, use gRPC server instead of HTTP server.",
)
parser.add_argument(
"--skip-server-warmup",
action="store_true",