[router] allow one router to support different model families and serving mode (#10244)

2025-09-12 19:18:27 -04:00
parent 321fecab74
commit 2f173ea074
28 changed files with 3528 additions and 837 deletions
--- a/sgl-router/py_src/sglang_router/router.py
+++ b/sgl-router/py_src/sglang_router/router.py
@@ -46,6 +46,9 @@ class Router:
        max_payload_size: Maximum payload size in bytes. Default: 256MB
        max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
        dp_aware: Enable data parallelism aware schedule. Default: False
+        enable_igw: Enable IGW (Inference-Gateway) mode for multi-model support. When enabled,
+            the router can manage multiple models simultaneously with per-model load balancing
+            policies. Default: False
        api_key: The api key used for the authorization with the worker.
            Useful when the dp aware scheduling strategy is enabled.
            Default: None
--- a/sgl-router/py_src/sglang_router/router_args.py
+++ b/sgl-router/py_src/sglang_router/router_args.py
@@ -34,6 +34,7 @@ class RouterArgs:
    max_tree_size: int = 2**26
    max_payload_size: int = 512 * 1024 * 1024  # 512MB default for large batches
    dp_aware: bool = False
+    enable_igw: bool = False  # Enable IGW (Inference-Gateway) mode for multi-model support
    api_key: Optional[str] = None
    log_dir: Optional[str] = None
    log_level: Optional[str] = None
@@ -227,6 +228,11 @@ class RouterArgs:
            action="store_true",
            help="Enable data parallelism aware schedule",
        )
+        parser.add_argument(
+            f"--{prefix}enable-igw",
+            action="store_true",
+            help="Enable IGW (Inference-Gateway) mode for multi-model support",
+        )
        parser.add_argument(
            f"--{prefix}api-key",
            type=str,