[router] Expose worker startup timeout (secs) & return an error instead of panicking on router init (#3016)
This commit is contained in:
@@ -33,6 +33,7 @@ class RouterArgs:
|
||||
|
||||
# Routing policy
|
||||
policy: str = "cache_aware"
|
||||
worker_startup_timeout_secs: int = 300
|
||||
cache_threshold: float = 0.5
|
||||
balance_abs_threshold: int = 32
|
||||
balance_rel_threshold: float = 1.0001
|
||||
@@ -87,6 +88,12 @@ class RouterArgs:
|
||||
choices=["random", "round_robin", "cache_aware"],
|
||||
help="Load balancing policy to use",
|
||||
)
|
||||
parser.add_argument(
|
||||
f"--{prefix}worker-startup-timeout-secs",
|
||||
type=int,
|
||||
default=RouterArgs.worker_startup_timeout_secs,
|
||||
help="Timeout in seconds for worker startup",
|
||||
)
|
||||
parser.add_argument(
|
||||
f"--{prefix}cache-threshold",
|
||||
type=float,
|
||||
@@ -147,6 +154,9 @@ class RouterArgs:
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
policy=getattr(args, f"{prefix}policy"),
|
||||
worker_startup_timeout_secs=getattr(
|
||||
args, f"{prefix}worker_startup_timeout_secs"
|
||||
),
|
||||
cache_threshold=getattr(args, f"{prefix}cache_threshold"),
|
||||
balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
|
||||
balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
|
||||
@@ -188,9 +198,10 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
|
||||
|
||||
router = Router(
|
||||
worker_urls=router_args.worker_urls,
|
||||
policy=policy_from_str(router_args.policy),
|
||||
host=router_args.host,
|
||||
port=router_args.port,
|
||||
policy=policy_from_str(router_args.policy),
|
||||
worker_startup_timeout_secs=router_args.worker_startup_timeout_secs,
|
||||
cache_threshold=router_args.cache_threshold,
|
||||
balance_abs_threshold=router_args.balance_abs_threshold,
|
||||
balance_rel_threshold=router_args.balance_rel_threshold,
|
||||
@@ -205,7 +216,7 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error starting router: {e}")
|
||||
return None
|
||||
raise e
|
||||
|
||||
|
||||
class CustomHelpFormatter(
|
||||
@@ -239,10 +250,7 @@ Examples:
|
||||
|
||||
def main() -> None:
|
||||
router_args = parse_router_args(sys.argv[1:])
|
||||
router = launch_router(router_args)
|
||||
|
||||
if router is None:
|
||||
sys.exit(1)
|
||||
launch_router(router_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -68,7 +68,7 @@ def run_server(server_args, dp_rank):
|
||||
# create new process group
|
||||
os.setpgrp()
|
||||
|
||||
setproctitle(f"sglang::server")
|
||||
setproctitle("sglang::server")
|
||||
# Set SGLANG_DP_RANK environment variable
|
||||
os.environ["SGLANG_DP_RANK"] = str(dp_rank)
|
||||
|
||||
@@ -120,9 +120,26 @@ def find_available_ports(base_port: int, count: int) -> List[int]:
|
||||
|
||||
def cleanup_processes(processes: List[mp.Process]):
|
||||
for process in processes:
|
||||
logger.info(f"Terminating process {process.pid}")
|
||||
process.terminate()
|
||||
logger.info("All processes terminated")
|
||||
logger.info(f"Terminating process group {process.pid}")
|
||||
try:
|
||||
os.killpg(process.pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
# Process group may already be terminated
|
||||
pass
|
||||
|
||||
# Wait for processes to terminate
|
||||
for process in processes:
|
||||
process.join(timeout=5)
|
||||
if process.is_alive():
|
||||
logger.warning(
|
||||
f"Process {process.pid} did not terminate gracefully, forcing kill"
|
||||
)
|
||||
try:
|
||||
os.killpg(process.pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
logger.info("All process groups terminated")
|
||||
|
||||
|
||||
def main():
|
||||
@@ -173,7 +190,12 @@ def main():
|
||||
]
|
||||
|
||||
# Start the router
|
||||
router = launch_router(router_args)
|
||||
try:
|
||||
launch_router(router_args)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start router: {e}")
|
||||
cleanup_processes(server_processes)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -17,6 +17,7 @@ class Router:
|
||||
- PolicyType.CacheAware: Distribute requests based on cache state and load balance
|
||||
host: Host address to bind the router server. Default: '127.0.0.1'
|
||||
port: Port number to bind the router server. Default: 3001
|
||||
worker_startup_timeout_secs: Timeout in seconds for worker startup. Default: 300
|
||||
cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
|
||||
if the match rate exceeds threshold, otherwise routes to the worker with the smallest
|
||||
tree. Default: 0.5
|
||||
@@ -37,6 +38,7 @@ class Router:
|
||||
policy: PolicyType = PolicyType.RoundRobin,
|
||||
host: str = "127.0.0.1",
|
||||
port: int = 3001,
|
||||
worker_startup_timeout_secs: int = 300,
|
||||
cache_threshold: float = 0.50,
|
||||
balance_abs_threshold: int = 32,
|
||||
balance_rel_threshold: float = 1.0001,
|
||||
@@ -50,6 +52,7 @@ class Router:
|
||||
policy=policy,
|
||||
host=host,
|
||||
port=port,
|
||||
worker_startup_timeout_secs=worker_startup_timeout_secs,
|
||||
cache_threshold=cache_threshold,
|
||||
balance_abs_threshold=balance_abs_threshold,
|
||||
balance_rel_threshold=balance_rel_threshold,
|
||||
|
||||
Reference in New Issue
Block a user