Rename rust folder to sgl-router (#2464)

Signed-off-by: Ata Fatahi <immrata@gmail.com>
2024-12-12 12:40:41 -05:00
parent 2673fa29d4
commit e3b3acfa6f
22 changed files with 13 additions and 13 deletions
--- a/sgl-router/py_src/sglang_router/init.py
+++ b/sgl-router/py_src/sglang_router/init.py
@@ -0,0 +1,11 @@
+# a lightweihgt wrapper on router with argument type and comments
+from sglang_router_rs import PolicyType
+
+# no wrapper on policy type => direct export
+from .router import Router
+
+__all__ = ["Router", "PolicyType"]
+
+from sglang_router.version import __version__
+
+__all__ += ["__version__"]
--- a/sgl-router/py_src/sglang_router/launch_router.py
+++ b/sgl-router/py_src/sglang_router/launch_router.py
@@ -0,0 +1,249 @@
+import argparse
+import dataclasses
+import logging
+import sys
+from typing import List, Optional
+
+from sglang_router import Router
+from sglang_router_rs import PolicyType
+
+
+def setup_logger():
+    logger = logging.getLogger("router")
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        "[Router (Python)] %(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
+
+
+@dataclasses.dataclass
+class RouterArgs:
+    # Worker configuration
+    worker_urls: List[str]
+    host: str = "127.0.0.1"
+    port: int = 30000
+
+    # Routing policy
+    policy: str = "cache_aware"
+    cache_threshold: float = 0.5
+    balance_abs_threshold: int = 32
+    balance_rel_threshold: float = 1.0001
+    eviction_interval: int = 60
+    max_tree_size: int = 2**24
+    max_payload_size: int = 4 * 1024 * 1024  # 4MB
+    verbose: bool = False
+
+    @staticmethod
+    def add_cli_args(
+        parser: argparse.ArgumentParser,
+        use_router_prefix: bool = False,
+        exclude_host_port: bool = False,
+    ):
+        """
+        Add router-specific arguments to an argument parser.
+
+        Args:
+            parser: The argument parser to add arguments to
+            use_router_prefix: If True, prefix all arguments with 'router-' to avoid conflicts
+            exclude_host_port: If True, don't add host and port arguments (used when inheriting from server)
+        """
+        prefix = "router-" if use_router_prefix else ""
+
+        # Worker configuration
+        if not exclude_host_port:
+            parser.add_argument(
+                "--host",
+                type=str,
+                default=RouterArgs.host,
+                help="Host address to bind the router server",
+            )
+            parser.add_argument(
+                "--port",
+                type=int,
+                default=RouterArgs.port,
+                help="Port number to bind the router server",
+            )
+
+        parser.add_argument(
+            "--worker-urls",
+            type=str,
+            nargs="+",
+            help="List of worker URLs (e.g., http://worker1:8000 http://worker2:8000)",
+        )
+
+        # Routing policy configuration
+        parser.add_argument(
+            f"--{prefix}policy",
+            type=str,
+            default=RouterArgs.policy,
+            choices=["random", "round_robin", "cache_aware"],
+            help="Load balancing policy to use",
+        )
+        parser.add_argument(
+            f"--{prefix}cache-threshold",
+            type=float,
+            default=RouterArgs.cache_threshold,
+            help="Cache threshold (0.0-1.0) for cache-aware routing",
+        )
+        parser.add_argument(
+            f"--{prefix}balance-abs-threshold",
+            type=int,
+            default=RouterArgs.balance_abs_threshold,
+            help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
+        )
+        parser.add_argument(
+            f"--{prefix}balance-rel-threshold",
+            type=float,
+            default=RouterArgs.balance_rel_threshold,
+            help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
+        )
+        parser.add_argument(
+            f"--{prefix}eviction-interval",
+            type=int,
+            default=RouterArgs.eviction_interval,
+            help="Interval in seconds between cache eviction operations",
+        )
+        parser.add_argument(
+            f"--{prefix}max-tree-size",
+            type=int,
+            default=RouterArgs.max_tree_size,
+            help="Maximum size of the approximation tree for cache-aware routing",
+        )
+        parser.add_argument(
+            f"--{prefix}max-payload-size",
+            type=int,
+            default=RouterArgs.max_payload_size,
+            help="Maximum payload size in bytes",
+        )
+        parser.add_argument(
+            f"--{prefix}verbose",
+            action="store_true",
+            help="Enable verbose logging",
+        )
+
+    @classmethod
+    def from_cli_args(
+        cls, args: argparse.Namespace, use_router_prefix: bool = False
+    ) -> "RouterArgs":
+        """
+        Create RouterArgs instance from parsed command line arguments.
+
+        Args:
+            args: Parsed command line arguments
+            use_router_prefix: If True, look for arguments with 'router-' prefix
+        """
+        prefix = "router_" if use_router_prefix else ""
+        return cls(
+            worker_urls=args.worker_urls,
+            host=args.host,
+            port=args.port,
+            policy=getattr(args, f"{prefix}policy"),
+            cache_threshold=getattr(args, f"{prefix}cache_threshold"),
+            balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
+            balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
+            eviction_interval=getattr(args, f"{prefix}eviction_interval"),
+            max_tree_size=getattr(args, f"{prefix}max_tree_size"),
+            max_payload_size=getattr(args, f"{prefix}max_payload_size"),
+            verbose=getattr(args, f"{prefix}verbose", False),
+        )
+
+
+def policy_from_str(policy_str: str) -> PolicyType:
+    """Convert policy string to PolicyType enum."""
+    policy_map = {
+        "random": PolicyType.Random,
+        "round_robin": PolicyType.RoundRobin,
+        "cache_aware": PolicyType.CacheAware,
+    }
+    return policy_map[policy_str]
+
+
+def launch_router(args: argparse.Namespace) -> Optional[Router]:
+    """
+    Launch the SGLang router with the configuration from parsed arguments.
+
+    Args:
+        args: Namespace object containing router configuration
+            Can be either raw argparse.Namespace or converted RouterArgs
+
+    Returns:
+        Router instance if successful, None if failed
+    """
+    logger = logging.getLogger("router")
+    try:
+        # Convert to RouterArgs if needed
+        if not isinstance(args, RouterArgs):
+            router_args = RouterArgs.from_cli_args(args)
+        else:
+            router_args = args
+
+        router = Router(
+            worker_urls=router_args.worker_urls,
+            policy=policy_from_str(router_args.policy),
+            host=router_args.host,
+            port=router_args.port,
+            cache_threshold=router_args.cache_threshold,
+            balance_abs_threshold=router_args.balance_abs_threshold,
+            balance_rel_threshold=router_args.balance_rel_threshold,
+            eviction_interval_secs=router_args.eviction_interval,
+            max_tree_size=router_args.max_tree_size,
+            max_payload_size=router_args.max_payload_size,
+            verbose=router_args.verbose,
+        )
+
+        router.start()
+        return router
+
+    except Exception as e:
+        logger.error(f"Error starting router: {e}")
+        return None
+
+
+class CustomHelpFormatter(
+    argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
+):
+    """Custom formatter that preserves both description formatting and shows defaults"""
+
+    pass
+
+
+def parse_router_args(args: List[str]) -> RouterArgs:
+    """Parse command line arguments and return RouterArgs instance."""
+    parser = argparse.ArgumentParser(
+        description="""SGLang Router - High-performance request distribution across worker nodes
+
+Usage:
+This launcher enables starting a router with individual worker instances. It is useful for
+multi-node setups or when you want to start workers and router separately.
+
+Examples:
+  python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
+  python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000 --cache-threshold 0.7 --balance-abs-threshold 64 --balance-rel-threshold 1.2
+
+    """,
+        formatter_class=CustomHelpFormatter,
+    )
+
+    RouterArgs.add_cli_args(parser, use_router_prefix=False)
+    return RouterArgs.from_cli_args(parser.parse_args(args), use_router_prefix=False)
+
+
+def main() -> None:
+    logger = setup_logger()
+    router_args = parse_router_args(sys.argv[1:])
+    router = launch_router(router_args)
+
+    if router is None:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/sgl-router/py_src/sglang_router/launch_server.py
+++ b/sgl-router/py_src/sglang_router/launch_server.py
@@ -0,0 +1,180 @@
+import argparse
+import copy
+import logging
+import multiprocessing as mp
+import os
+import random
+import signal
+import sys
+import time
+from typing import List
+
+import requests
+from setproctitle import setproctitle
+from sglang_router.launch_router import RouterArgs, launch_router
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_port_available
+
+
+def setup_logger():
+    logger = logging.getLogger("router")
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        "[Router (Python)] %(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
+
+
+logger = setup_logger()
+
+
+# Create new process group
+def run_server(server_args, dp_rank):
+    """
+    Note:
+
+    1. Without os.setpgrp(), all processes share the same PGID. When you press Ctrl+C, the terminal sends SIGINT to all processes in the group simultaneously.
+    This can cause leaf processes to terminate first, which messes up the cleaning order and produces orphaned processes.
+
+    Terminal (PGID=100)
+    └── Main Python Process (PGID=100)
+        └── Server Process 1 (PGID=100)
+            └── Scheduler 1
+            └── Detokenizer 1
+        └── Server Process 2 (PGID=100)
+            └── Scheduler 2
+            └── Detokenizer 2
+
+    2. With os.setpgrp(), the main Python process and its children are in a separate group. Now:
+
+    Terminal (PGID=100)
+    └── Main Python Process (PGID=200)
+        └── Server Process 1 (PGID=300)
+            └── Scheduler 1
+            └── Detokenizer 1
+        └── Server Process 2 (PGID=400)
+            └── Scheduler 2
+            └── Detokenizer 2
+    """
+    # create new process group
+    os.setpgrp()
+
+    setproctitle(f"sglang::server")
+    # Set SGLANG_DP_RANK environment variable
+    os.environ["SGLANG_DP_RANK"] = str(dp_rank)
+
+    launch_server(server_args)
+
+
+def launch_server_process(
+    server_args: ServerArgs, worker_port: int, dp_id: int
+) -> mp.Process:
+    """Launch a single server process with the given args and port."""
+    server_args = copy.deepcopy(server_args)
+    server_args.port = worker_port
+    server_args.base_gpu_id = dp_id * server_args.tp_size
+    server_args.dp_size = 1
+
+    proc = mp.Process(target=run_server, args=(server_args, dp_id))
+    proc.start()
+    return proc
+
+
+def wait_for_server_health(host: str, port: int, timeout: int = 300) -> bool:
+    """Wait for server to be healthy by checking /health endpoint."""
+    start_time = time.time()
+    url = f"http://{host}:{port}/health"
+
+    while time.time() - start_time < timeout:
+        try:
+            response = requests.get(url, timeout=5)
+            if response.status_code == 200:
+                return True
+        except requests.exceptions.RequestException:
+            pass
+        time.sleep(1)
+    return False
+
+
+def find_available_ports(base_port: int, count: int) -> List[int]:
+    """Find consecutive available ports starting from base_port."""
+    available_ports = []
+    current_port = base_port
+
+    while len(available_ports) < count:
+        if is_port_available(current_port):
+            available_ports.append(current_port)
+        current_port += random.randint(100, 1000)
+
+    return available_ports
+
+
+def cleanup_processes(processes: List[mp.Process]):
+    for process in processes:
+        logger.info(f"Terminating process {process.pid}")
+        process.terminate()
+    logger.info("All processes terminated")
+
+
+def main():
+    # CUDA runtime isn't fork-safe, which can lead to subtle bugs or crashes
+    mp.set_start_method("spawn")
+
+    parser = argparse.ArgumentParser(
+        description="Launch SGLang router and server processes"
+    )
+
+    ServerArgs.add_cli_args(parser)
+    RouterArgs.add_cli_args(parser, use_router_prefix=True, exclude_host_port=True)
+    parser.add_argument(
+        "--router-dp-worker-base-port",
+        type=int,
+        default=31000,
+        help="Base port number for data parallel workers",
+    )
+
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    router_args = RouterArgs.from_cli_args(args, use_router_prefix=True)
+
+    # Find available ports for workers
+    worker_ports = find_available_ports(
+        args.router_dp_worker_base_port, server_args.dp_size
+    )
+
+    # Start server processes
+    server_processes = []
+
+    for i, worker_port in enumerate(worker_ports):
+        logger.info(f"Launching DP server process {i} on port {worker_port}")
+        proc = launch_server_process(server_args, worker_port, i)
+        server_processes.append(proc)
+
+    signal.signal(signal.SIGINT, lambda sig, frame: cleanup_processes(server_processes))
+    signal.signal(
+        signal.SIGTERM, lambda sig, frame: cleanup_processes(server_processes)
+    )
+    signal.signal(
+        signal.SIGQUIT, lambda sig, frame: cleanup_processes(server_processes)
+    )
+
+    # Update router args with worker URLs
+    router_args.worker_urls = [
+        f"http://{server_args.host}:{port}" for port in worker_ports
+    ]
+
+    # Start the router
+    router = launch_router(router_args)
+
+
+if __name__ == "__main__":
+    main()
--- a/sgl-router/py_src/sglang_router/router.py
+++ b/sgl-router/py_src/sglang_router/router.py
@@ -0,0 +1,67 @@
+from typing import List, Optional
+
+from sglang_router_rs import PolicyType
+from sglang_router_rs import Router as _Router
+
+
+class Router:
+    """
+    A high-performance router for distributing requests across worker nodes.
+
+    Args:
+        worker_urls: List of URLs for worker nodes that will handle requests. Each URL should include
+            the protocol, host, and port (e.g., ['http://worker1:8000', 'http://worker2:8000'])
+        policy: Load balancing policy to use. Options:
+            - PolicyType.Random: Randomly select workers
+            - PolicyType.RoundRobin: Distribute requests in round-robin fashion
+            - PolicyType.CacheAware: Distribute requests based on cache state and load balance
+        host: Host address to bind the router server. Default: '127.0.0.1'
+        port: Port number to bind the router server. Default: 3001
+        cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
+            if the match rate exceeds threshold, otherwise routes to the worker with the smallest
+            tree. Default: 0.5
+        balance_abs_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
+            AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32
+        balance_rel_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
+            AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001
+        eviction_interval_secs: Interval in seconds between cache eviction operations in cache-aware
+            routing. Default: 60
+        max_payload_size: Maximum payload size in bytes. Default: 4MB
+        max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
+        verbose: Enable verbose logging. Default: False
+    """
+
+    def __init__(
+        self,
+        worker_urls: List[str],
+        policy: PolicyType = PolicyType.RoundRobin,
+        host: str = "127.0.0.1",
+        port: int = 3001,
+        cache_threshold: float = 0.50,
+        balance_abs_threshold: int = 32,
+        balance_rel_threshold: float = 1.0001,
+        eviction_interval_secs: int = 60,
+        max_tree_size: int = 2**24,
+        max_payload_size: int = 4 * 1024 * 1024,  # 4MB
+        verbose: bool = False,
+    ):
+        self._router = _Router(
+            worker_urls=worker_urls,
+            policy=policy,
+            host=host,
+            port=port,
+            cache_threshold=cache_threshold,
+            balance_abs_threshold=balance_abs_threshold,
+            balance_rel_threshold=balance_rel_threshold,
+            eviction_interval_secs=eviction_interval_secs,
+            max_tree_size=max_tree_size,
+            max_payload_size=max_payload_size,
+            verbose=verbose,
+        )
+
+    def start(self) -> None:
+        """Start the router server.
+
+        This method blocks until the server is shut down.
+        """
+        self._router.start()
--- a/sgl-router/py_src/sglang_router/version.py
+++ b/sgl-router/py_src/sglang_router/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.1"