[router] Expose worker startup secs & Return error instead of panic for router init (#3016)

This commit is contained in:
Byron Hsu
2025-01-20 12:45:13 -08:00
committed by GitHub
parent 5dfcacfcb1
commit 0311ce8e1c
7 changed files with 124 additions and 47 deletions

View File

@@ -68,7 +68,7 @@ def run_server(server_args, dp_rank):
# create new process group
os.setpgrp()
setproctitle(f"sglang::server")
setproctitle("sglang::server")
# Set SGLANG_DP_RANK environment variable
os.environ["SGLANG_DP_RANK"] = str(dp_rank)
@@ -120,9 +120,26 @@ def find_available_ports(base_port: int, count: int) -> List[int]:
def cleanup_processes(processes: List[mp.Process]):
# Shut down every process in `processes` in two passes: first send SIGTERM to
# the process group identified by each process's pid, then join each process
# for up to 5 seconds and send SIGKILL to the group of any that is still alive.
#
# NOTE(review): this listing is a flattened diff (indentation and +/- markers
# are lost). The three statements right after the first `for` -- the
# "Terminating process" log, `process.terminate()`, and the "All processes
# terminated" log -- look like the pre-change body that the killpg-based lines
# below replace; confirm against the repository before relying on them.
for process in processes:
logger.info(f"Terminating process {process.pid}")
process.terminate()
logger.info("All processes terminated")
logger.info(f"Terminating process group {process.pid}")
try:
# The pid is passed as the process-group id. Presumably each worker made
# itself a group leader via os.setpgrp() in run_server -- verify that every
# entry in `processes` is started through that path.
os.killpg(process.pid, signal.SIGTERM)
except ProcessLookupError:
# Process group may already be terminated
pass
# Wait for processes to terminate
for process in processes:
# Bounded wait: join() returns after at most 5 seconds even if the process
# is still running, so is_alive() is checked afterwards.
process.join(timeout=5)
if process.is_alive():
logger.warning(
f"Process {process.pid} did not terminate gracefully, forcing kill"
)
try:
# Escalate to SIGKILL for the whole group.
os.killpg(process.pid, signal.SIGKILL)
except ProcessLookupError:
# The group was not found when signalling; nothing left to kill.
pass
logger.info("All process groups terminated")
def main():
@@ -173,7 +190,12 @@ def main():
]
# Start the router
router = launch_router(router_args)
try:
launch_router(router_args)
except Exception as e:
logger.error(f"Failed to start router: {e}")
cleanup_processes(server_processes)
sys.exit(1)
if __name__ == "__main__":