[Misc] add service discovery for sgl router
This commit is contained in:
762
sgl-router/Cargo.lock
generated
762
sgl-router/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,9 @@ tracing = "0.1"
|
|||||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
|
||||||
tracing-log = "0.2"
|
tracing-log = "0.2"
|
||||||
tracing-appender = "0.2.3"
|
tracing-appender = "0.2.3"
|
||||||
|
kube = { version = "0.88.1", features = ["runtime", "derive"] }
|
||||||
|
k8s-openapi = { version = "0.21.0", features = ["v1_29"] }
|
||||||
|
futures = "0.3"
|
||||||
[profile.release]
|
[profile.release]
|
||||||
lto = "thin"
|
lto = "thin"
|
||||||
codegen-units = 1
|
codegen-units = 1
|
||||||
|
|||||||
@@ -81,6 +81,41 @@ router = Router(
|
|||||||
|
|
||||||
Use the `--verbose` flag with the CLI for more detailed logs.
|
Use the `--verbose` flag with the CLI for more detailed logs.
|
||||||
|
|
||||||
|
### Kubernetes Service Discovery
|
||||||
|
|
||||||
|
SGL Router supports automatic service discovery for worker nodes in Kubernetes environments. When enabled, the router will automatically:
|
||||||
|
|
||||||
|
- Discover and add worker pods with matching labels
|
||||||
|
- Remove unhealthy or deleted worker pods
|
||||||
|
- Dynamically adjust the worker pool based on pod health and availability
|
||||||
|
|
||||||
|
#### Command Line Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m sglang_router.launch_router \
|
||||||
|
--service-discovery \
|
||||||
|
--selector app=sglang-worker role=inference \
|
||||||
|
--service-discovery-port 8000 \
|
||||||
|
--service-discovery-namespace default
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Service Discovery Arguments
|
||||||
|
|
||||||
|
- `--service-discovery`: Enable Kubernetes service discovery feature
|
||||||
|
- `--selector`: One or more label key-value pairs for pod selection (format: key1=value1 key2=value2)
|
||||||
|
- `--service-discovery-port`: Port to use when generating worker URLs (default: 80)
|
||||||
|
- `--service-discovery-namespace`: Optional. Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)
|
||||||
|
|
||||||
|
#### RBAC Requirements
|
||||||
|
|
||||||
|
When using service discovery, you must configure proper Kubernetes RBAC permissions:
|
||||||
|
|
||||||
|
- **If using namespace-scoped discovery** (with `--service-discovery-namespace`):
|
||||||
|
Set up a ServiceAccount, Role, and RoleBinding with permissions to list/watch pods in that namespace
|
||||||
|
|
||||||
|
- **If watching all namespaces** (without specifying namespace):
|
||||||
|
Set up a ServiceAccount, ClusterRole, and ClusterRoleBinding with permissions to list/watch pods at the cluster level
|
||||||
|
|
||||||
### Troubleshooting
|
### Troubleshooting
|
||||||
|
|
||||||
1. If rust analyzer is not working in VSCode, set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml` in your repo. For example:
|
1. If rust analyzer is not working in VSCode, set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml` in your repo. For example:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import argparse
|
|||||||
import dataclasses
|
import dataclasses
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from typing import List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from sglang_router import Router
|
from sglang_router import Router
|
||||||
from sglang_router_rs import PolicyType
|
from sglang_router_rs import PolicyType
|
||||||
@@ -43,6 +43,11 @@ class RouterArgs:
|
|||||||
max_payload_size: int = 4 * 1024 * 1024 # 4MB
|
max_payload_size: int = 4 * 1024 * 1024 # 4MB
|
||||||
verbose: bool = False
|
verbose: bool = False
|
||||||
log_dir: Optional[str] = None
|
log_dir: Optional[str] = None
|
||||||
|
# Service discovery configuration
|
||||||
|
service_discovery: bool = False
|
||||||
|
selector: Dict[str, str] = dataclasses.field(default_factory=dict)
|
||||||
|
service_discovery_port: int = 80
|
||||||
|
service_discovery_namespace: Optional[str] = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_cli_args(
|
def add_cli_args(
|
||||||
@@ -149,6 +154,28 @@ class RouterArgs:
|
|||||||
default=None,
|
default=None,
|
||||||
help="Directory to store log files. If not specified, logs are only output to console.",
|
help="Directory to store log files. If not specified, logs are only output to console.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
f"--{prefix}service-discovery",
|
||||||
|
action="store_true",
|
||||||
|
help="Enable Kubernetes service discovery",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
f"--{prefix}selector",
|
||||||
|
type=str,
|
||||||
|
nargs="+",
|
||||||
|
help="Label selector for Kubernetes service discovery (format: key1=value1 key2=value2)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
f"--{prefix}service-discovery-port",
|
||||||
|
type=int,
|
||||||
|
default=RouterArgs.service_discovery_port,
|
||||||
|
help="Port to use for discovered worker pods",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
f"--{prefix}service-discovery-namespace",
|
||||||
|
type=str,
|
||||||
|
help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_cli_args(
|
def from_cli_args(
|
||||||
@@ -182,8 +209,26 @@ class RouterArgs:
|
|||||||
max_payload_size=getattr(args, f"{prefix}max_payload_size"),
|
max_payload_size=getattr(args, f"{prefix}max_payload_size"),
|
||||||
verbose=getattr(args, f"{prefix}verbose", False),
|
verbose=getattr(args, f"{prefix}verbose", False),
|
||||||
log_dir=getattr(args, f"{prefix}log_dir", None),
|
log_dir=getattr(args, f"{prefix}log_dir", None),
|
||||||
|
service_discovery=getattr(args, f"{prefix}service_discovery", False),
|
||||||
|
selector=cls._parse_selector(getattr(args, f"{prefix}selector", None)),
|
||||||
|
service_discovery_port=getattr(args, f"{prefix}service_discovery_port"),
|
||||||
|
service_discovery_namespace=getattr(
|
||||||
|
args, f"{prefix}service_discovery_namespace", None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _parse_selector(selector_list):
|
||||||
|
if not selector_list:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
selector = {}
|
||||||
|
for item in selector_list:
|
||||||
|
if "=" in item:
|
||||||
|
key, value = item.split("=", 1)
|
||||||
|
selector[key] = value
|
||||||
|
return selector
|
||||||
|
|
||||||
|
|
||||||
def policy_from_str(policy_str: str) -> PolicyType:
|
def policy_from_str(policy_str: str) -> PolicyType:
|
||||||
"""Convert policy string to PolicyType enum."""
|
"""Convert policy string to PolicyType enum."""
|
||||||
@@ -229,6 +274,10 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
|
|||||||
max_payload_size=router_args.max_payload_size,
|
max_payload_size=router_args.max_payload_size,
|
||||||
verbose=router_args.verbose,
|
verbose=router_args.verbose,
|
||||||
log_dir=router_args.log_dir,
|
log_dir=router_args.log_dir,
|
||||||
|
service_discovery=router_args.service_discovery,
|
||||||
|
selector=router_args.selector,
|
||||||
|
service_discovery_port=router_args.service_discovery_port,
|
||||||
|
service_discovery_namespace=router_args.service_discovery_namespace,
|
||||||
)
|
)
|
||||||
|
|
||||||
router.start()
|
router.start()
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from sglang_router_rs import PolicyType
|
from sglang_router_rs import PolicyType
|
||||||
from sglang_router_rs import Router as _Router
|
from sglang_router_rs import Router as _Router
|
||||||
@@ -32,6 +32,14 @@ class Router:
|
|||||||
max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
|
max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
|
||||||
verbose: Enable verbose logging. Default: False
|
verbose: Enable verbose logging. Default: False
|
||||||
log_dir: Directory to store log files. If None, logs are only output to console. Default: None
|
log_dir: Directory to store log files. If None, logs are only output to console. Default: None
|
||||||
|
service_discovery: Enable Kubernetes service discovery. When enabled, the router will
|
||||||
|
automatically discover worker pods based on the selector. Default: False
|
||||||
|
selector: Dictionary mapping of label keys to values for Kubernetes pod selection.
|
||||||
|
Example: {"app": "sglang-worker"}. Default: {}
|
||||||
|
service_discovery_port: Port to use for service discovery. The router will generate
|
||||||
|
worker URLs using this port. Default: 80
|
||||||
|
service_discovery_namespace: Kubernetes namespace to watch for pods. If not provided,
|
||||||
|
watches pods across all namespaces (requires cluster-wide permissions). Default: None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -50,7 +58,14 @@ class Router:
|
|||||||
max_payload_size: int = 4 * 1024 * 1024, # 4MB
|
max_payload_size: int = 4 * 1024 * 1024, # 4MB
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
log_dir: Optional[str] = None,
|
log_dir: Optional[str] = None,
|
||||||
|
service_discovery: bool = False,
|
||||||
|
selector: Dict[str, str] = None,
|
||||||
|
service_discovery_port: int = 80,
|
||||||
|
service_discovery_namespace: Optional[str] = None,
|
||||||
):
|
):
|
||||||
|
if selector is None:
|
||||||
|
selector = {}
|
||||||
|
|
||||||
self._router = _Router(
|
self._router = _Router(
|
||||||
worker_urls=worker_urls,
|
worker_urls=worker_urls,
|
||||||
policy=policy,
|
policy=policy,
|
||||||
@@ -66,6 +81,10 @@ class Router:
|
|||||||
max_payload_size=max_payload_size,
|
max_payload_size=max_payload_size,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
log_dir=log_dir,
|
log_dir=log_dir,
|
||||||
|
service_discovery=service_discovery,
|
||||||
|
selector=selector,
|
||||||
|
service_discovery_port=service_discovery_port,
|
||||||
|
service_discovery_namespace=service_discovery_namespace,
|
||||||
)
|
)
|
||||||
|
|
||||||
def start(self) -> None:
|
def start(self) -> None:
|
||||||
|
|||||||
@@ -38,6 +38,10 @@ class TestLaunchRouter(unittest.TestCase):
|
|||||||
max_payload_size=4 * 1024 * 1024, # 4MB
|
max_payload_size=4 * 1024 * 1024, # 4MB
|
||||||
verbose=False,
|
verbose=False,
|
||||||
log_dir=None,
|
log_dir=None,
|
||||||
|
service_discovery=False,
|
||||||
|
selector=None,
|
||||||
|
service_discovery_port=80,
|
||||||
|
service_discovery_namespace=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def create_router_args(self, **kwargs):
|
def create_router_args(self, **kwargs):
|
||||||
@@ -79,6 +83,23 @@ class TestLaunchRouter(unittest.TestCase):
|
|||||||
args = self.create_router_args(worker_urls=[])
|
args = self.create_router_args(worker_urls=[])
|
||||||
self.run_router_process(args)
|
self.run_router_process(args)
|
||||||
|
|
||||||
|
def test_launch_router_with_service_discovery(self):
|
||||||
|
# Test router startup with service discovery enabled and a single label selector
|
||||||
|
args = self.create_router_args(
|
||||||
|
worker_urls=[], service_discovery=True, selector=["app=test-worker"]
|
||||||
|
)
|
||||||
|
self.run_router_process(args)
|
||||||
|
|
||||||
|
def test_launch_router_with_service_discovery_namespace(self):
|
||||||
|
# Test router startup with service discovery enabled and namespace specified
|
||||||
|
args = self.create_router_args(
|
||||||
|
worker_urls=[],
|
||||||
|
service_discovery=True,
|
||||||
|
selector=["app=test-worker"],
|
||||||
|
service_discovery_namespace="test-namespace",
|
||||||
|
)
|
||||||
|
self.run_router_process(args)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -24,6 +24,10 @@ def popen_launch_router(
|
|||||||
max_payload_size: int = None,
|
max_payload_size: int = None,
|
||||||
api_key: str = None,
|
api_key: str = None,
|
||||||
log_dir: str = None,
|
log_dir: str = None,
|
||||||
|
service_discovery: bool = False,
|
||||||
|
selector: list = None,
|
||||||
|
service_discovery_port: int = 80,
|
||||||
|
service_discovery_namespace: str = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Launch the router server process.
|
Launch the router server process.
|
||||||
@@ -37,6 +41,10 @@ def popen_launch_router(
|
|||||||
max_payload_size: Maximum payload size in bytes
|
max_payload_size: Maximum payload size in bytes
|
||||||
api_key: API key for the router
|
api_key: API key for the router
|
||||||
log_dir: Directory to store log files. If None, logs are only output to console.
|
log_dir: Directory to store log files. If None, logs are only output to console.
|
||||||
|
service_discovery: Enable Kubernetes service discovery
|
||||||
|
selector: List of label selectors in format ["key1=value1", "key2=value2"]
|
||||||
|
service_discovery_port: Port to use for service discovery
|
||||||
|
service_discovery_namespace: Kubernetes namespace to watch for pods. If None, watches all namespaces.
|
||||||
"""
|
"""
|
||||||
_, host, port = base_url.split(":")
|
_, host, port = base_url.split(":")
|
||||||
host = host[2:]
|
host = host[2:]
|
||||||
@@ -65,6 +73,20 @@ def popen_launch_router(
|
|||||||
if max_payload_size is not None:
|
if max_payload_size is not None:
|
||||||
command.extend(["--router-max-payload-size", str(max_payload_size)])
|
command.extend(["--router-max-payload-size", str(max_payload_size)])
|
||||||
|
|
||||||
|
if service_discovery:
|
||||||
|
command.append("--router-service-discovery")
|
||||||
|
|
||||||
|
if selector:
|
||||||
|
command.extend(["--router-selector"] + selector)
|
||||||
|
|
||||||
|
if service_discovery_port != 80:
|
||||||
|
command.extend(["--router-service-discovery-port", str(service_discovery_port)])
|
||||||
|
|
||||||
|
if service_discovery_namespace:
|
||||||
|
command.extend(
|
||||||
|
["--router-service-discovery-namespace", service_discovery_namespace]
|
||||||
|
)
|
||||||
|
|
||||||
if log_dir is not None:
|
if log_dir is not None:
|
||||||
command.extend(["--log-dir", log_dir])
|
command.extend(["--log-dir", log_dir])
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
pub mod logging;
|
pub mod logging;
|
||||||
|
use std::collections::HashMap;
|
||||||
pub mod router;
|
pub mod router;
|
||||||
pub mod server;
|
pub mod server;
|
||||||
|
pub mod service_discovery;
|
||||||
pub mod tree;
|
pub mod tree;
|
||||||
|
|
||||||
#[pyclass(eq)]
|
#[pyclass(eq)]
|
||||||
@@ -29,6 +31,10 @@ struct Router {
|
|||||||
max_payload_size: usize,
|
max_payload_size: usize,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
log_dir: Option<String>,
|
log_dir: Option<String>,
|
||||||
|
service_discovery: bool,
|
||||||
|
selector: HashMap<String, String>,
|
||||||
|
service_discovery_port: u16,
|
||||||
|
service_discovery_namespace: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
@@ -49,6 +55,10 @@ impl Router {
|
|||||||
max_payload_size = 4 * 1024 * 1024,
|
max_payload_size = 4 * 1024 * 1024,
|
||||||
verbose = false,
|
verbose = false,
|
||||||
log_dir = None,
|
log_dir = None,
|
||||||
|
service_discovery = false,
|
||||||
|
selector = HashMap::new(),
|
||||||
|
service_discovery_port = 80,
|
||||||
|
service_discovery_namespace = None
|
||||||
))]
|
))]
|
||||||
fn new(
|
fn new(
|
||||||
worker_urls: Vec<String>,
|
worker_urls: Vec<String>,
|
||||||
@@ -65,6 +75,10 @@ impl Router {
|
|||||||
max_payload_size: usize,
|
max_payload_size: usize,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
log_dir: Option<String>,
|
log_dir: Option<String>,
|
||||||
|
service_discovery: bool,
|
||||||
|
selector: HashMap<String, String>,
|
||||||
|
service_discovery_port: u16,
|
||||||
|
service_discovery_namespace: Option<String>,
|
||||||
) -> PyResult<Self> {
|
) -> PyResult<Self> {
|
||||||
Ok(Router {
|
Ok(Router {
|
||||||
host,
|
host,
|
||||||
@@ -81,6 +95,10 @@ impl Router {
|
|||||||
max_payload_size,
|
max_payload_size,
|
||||||
verbose,
|
verbose,
|
||||||
log_dir,
|
log_dir,
|
||||||
|
service_discovery,
|
||||||
|
selector,
|
||||||
|
service_discovery_port,
|
||||||
|
service_discovery_namespace,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -105,6 +123,19 @@ impl Router {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Create service discovery config if enabled
|
||||||
|
let service_discovery_config = if self.service_discovery {
|
||||||
|
Some(service_discovery::ServiceDiscoveryConfig {
|
||||||
|
enabled: true,
|
||||||
|
selector: self.selector.clone(),
|
||||||
|
check_interval: std::time::Duration::from_secs(60),
|
||||||
|
port: self.service_discovery_port,
|
||||||
|
namespace: self.service_discovery_namespace.clone(),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
actix_web::rt::System::new().block_on(async move {
|
actix_web::rt::System::new().block_on(async move {
|
||||||
server::startup(server::ServerConfig {
|
server::startup(server::ServerConfig {
|
||||||
host: self.host.clone(),
|
host: self.host.clone(),
|
||||||
@@ -114,6 +145,7 @@ impl Router {
|
|||||||
verbose: self.verbose,
|
verbose: self.verbose,
|
||||||
max_payload_size: self.max_payload_size,
|
max_payload_size: self.max_payload_size,
|
||||||
log_dir: self.log_dir.clone(),
|
log_dir: self.log_dir.clone(),
|
||||||
|
service_discovery_config,
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
|
.map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
|
||||||
|
|||||||
@@ -240,6 +240,15 @@ impl Router {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get a reference to the worker URLs shared across threads
|
||||||
|
pub fn get_worker_urls(&self) -> Arc<RwLock<Vec<String>>> {
|
||||||
|
match self {
|
||||||
|
Router::RoundRobin { worker_urls, .. } => Arc::clone(worker_urls),
|
||||||
|
Router::Random { worker_urls, .. } => Arc::clone(worker_urls),
|
||||||
|
Router::CacheAware { worker_urls, .. } => Arc::clone(worker_urls),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn wait_for_healthy_workers(
|
fn wait_for_healthy_workers(
|
||||||
worker_urls: &[String],
|
worker_urls: &[String],
|
||||||
timeout_secs: u64,
|
timeout_secs: u64,
|
||||||
|
|||||||
@@ -1,26 +1,30 @@
|
|||||||
use crate::logging::{self, LoggingConfig};
|
use crate::logging::{self, LoggingConfig};
|
||||||
use crate::router::PolicyConfig;
|
use crate::router::PolicyConfig;
|
||||||
use crate::router::Router;
|
use crate::router::Router;
|
||||||
|
use crate::service_discovery::{start_service_discovery, ServiceDiscoveryConfig};
|
||||||
use actix_web::{
|
use actix_web::{
|
||||||
error, get, post, web, App, Error, HttpRequest, HttpResponse, HttpServer, Responder,
|
error, get, post, web, App, Error, HttpRequest, HttpResponse, HttpServer, Responder,
|
||||||
};
|
};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
|
use reqwest::Client;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tracing::{info, Level};
|
use tokio::spawn;
|
||||||
|
use tracing::{error, info, warn, Level};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct AppState {
|
pub struct AppState {
|
||||||
router: Router,
|
router: Router,
|
||||||
client: reqwest::Client,
|
client: Client,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AppState {
|
impl AppState {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
worker_urls: Vec<String>,
|
worker_urls: Vec<String>,
|
||||||
client: reqwest::Client,
|
client: Client,
|
||||||
policy_config: PolicyConfig,
|
policy_config: PolicyConfig,
|
||||||
) -> Result<Self, String> {
|
) -> Result<Self, String> {
|
||||||
// Create router based on policy
|
// Create router based on policy
|
||||||
@@ -149,6 +153,7 @@ pub struct ServerConfig {
|
|||||||
pub verbose: bool,
|
pub verbose: bool,
|
||||||
pub max_payload_size: usize,
|
pub max_payload_size: usize,
|
||||||
pub log_dir: Option<String>,
|
pub log_dir: Option<String>,
|
||||||
|
pub service_discovery_config: Option<ServiceDiscoveryConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
|
pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
|
||||||
@@ -180,7 +185,15 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
|
|||||||
config.max_payload_size / (1024 * 1024)
|
config.max_payload_size / (1024 * 1024)
|
||||||
);
|
);
|
||||||
|
|
||||||
let client = reqwest::Client::builder()
|
// Log service discovery status
|
||||||
|
if let Some(service_discovery_config) = &config.service_discovery_config {
|
||||||
|
info!("🚧 Service discovery enabled");
|
||||||
|
info!("🚧 Selector: {:?}", service_discovery_config.selector);
|
||||||
|
} else {
|
||||||
|
info!("🚧 Service discovery disabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
let client = Client::builder()
|
||||||
.pool_idle_timeout(Some(Duration::from_secs(50)))
|
.pool_idle_timeout(Some(Duration::from_secs(50)))
|
||||||
.build()
|
.build()
|
||||||
.expect("Failed to create HTTP client");
|
.expect("Failed to create HTTP client");
|
||||||
@@ -194,6 +207,30 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
|
|||||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?,
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Start the service discovery if enabled
|
||||||
|
if let Some(service_discovery_config) = config.service_discovery_config {
|
||||||
|
if service_discovery_config.enabled {
|
||||||
|
let worker_urls = Arc::clone(&app_state.router.get_worker_urls());
|
||||||
|
|
||||||
|
match start_service_discovery(service_discovery_config, worker_urls).await {
|
||||||
|
Ok(handle) => {
|
||||||
|
info!("✅ Service discovery started successfully");
|
||||||
|
|
||||||
|
// Spawn a task to handle the service discovery thread
|
||||||
|
spawn(async move {
|
||||||
|
if let Err(e) = handle.await {
|
||||||
|
error!("Service discovery task failed: {:?}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to start service discovery: {}", e);
|
||||||
|
warn!("Continuing without service discovery");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
info!("✅ Serving router on {}:{}", config.host, config.port);
|
info!("✅ Serving router on {}:{}", config.host, config.port);
|
||||||
info!("✅ Serving workers on {:?}", config.worker_urls);
|
info!("✅ Serving workers on {:?}", config.worker_urls);
|
||||||
|
|
||||||
|
|||||||
285
sgl-router/src/service_discovery.rs
Normal file
285
sgl-router/src/service_discovery.rs
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
use futures::{StreamExt, TryStreamExt};
|
||||||
|
use k8s_openapi::api::core::v1::Pod;
|
||||||
|
use kube::{
|
||||||
|
api::Api,
|
||||||
|
runtime::watcher::{watcher, Config},
|
||||||
|
runtime::WatchStreamExt,
|
||||||
|
Client,
|
||||||
|
};
|
||||||
|
use log::{error, info, warn};
|
||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::sync::{Arc, Mutex, RwLock};
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::task;
|
||||||
|
use tokio::time;
|
||||||
|
|
||||||
|
/// Settings that control Kubernetes-based worker discovery.
#[derive(Debug, Clone)]
pub struct ServiceDiscoveryConfig {
    /// Whether discovery should run at all.
    pub enabled: bool,
    /// Label key/value pairs a pod must carry to be treated as a worker.
    pub selector: HashMap<String, String>,
    /// Delay before the pod watcher is restarted after it stops.
    pub check_interval: Duration,
    /// Port used when building a worker URL from a pod IP.
    pub port: u16,
    /// Namespace to watch; `None` watches pods in all namespaces.
    pub namespace: Option<String>,
}

impl Default for ServiceDiscoveryConfig {
    /// Discovery disabled, empty selector, 60 second restart delay, port 80,
    /// all namespaces.
    fn default() -> Self {
        Self {
            enabled: false,
            selector: HashMap::new(),
            check_interval: Duration::from_secs(60),
            port: 80,
            namespace: None,
        }
    }
}
|
||||||
|
|
||||||
|
/// Represents a Kubernetes pod's information used for worker management
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct PodInfo {
|
||||||
|
pub name: String,
|
||||||
|
pub ip: String,
|
||||||
|
pub status: String,
|
||||||
|
pub is_ready: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PodInfo {
|
||||||
|
pub fn from_pod(pod: &Pod) -> Option<Self> {
|
||||||
|
let name = pod.metadata.name.clone()?;
|
||||||
|
let status = pod.status.clone()?;
|
||||||
|
let pod_ip = status.pod_ip?;
|
||||||
|
|
||||||
|
let is_ready = if let Some(conditions) = &status.conditions {
|
||||||
|
conditions
|
||||||
|
.iter()
|
||||||
|
.any(|condition| condition.type_ == "Ready" && condition.status == "True")
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
|
||||||
|
let pod_status = status.phase.unwrap_or_else(|| "Unknown".to_string());
|
||||||
|
|
||||||
|
Some(PodInfo {
|
||||||
|
name,
|
||||||
|
ip: pod_ip,
|
||||||
|
status: pod_status,
|
||||||
|
is_ready,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if the pod is in a state where it can accept traffic
|
||||||
|
pub fn is_healthy(&self) -> bool {
|
||||||
|
self.is_ready && self.status == "Running"
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generates a worker URL for this pod
|
||||||
|
pub fn worker_url(&self, port: u16) -> String {
|
||||||
|
format!("http://{}:{}", self.ip, port)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn start_service_discovery(
|
||||||
|
config: ServiceDiscoveryConfig,
|
||||||
|
worker_urls: Arc<RwLock<Vec<String>>>,
|
||||||
|
) -> Result<task::JoinHandle<()>, kube::Error> {
|
||||||
|
// Don't initialize anything if service discovery is disabled
|
||||||
|
if !config.enabled {
|
||||||
|
// Return a generic error when service discovery is disabled
|
||||||
|
return Err(kube::Error::Api(kube::error::ErrorResponse {
|
||||||
|
status: "Disabled".to_string(),
|
||||||
|
message: "Service discovery is disabled".to_string(),
|
||||||
|
reason: "ConfigurationError".to_string(),
|
||||||
|
code: 400,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize Kubernetes client
|
||||||
|
let client = Client::try_default().await?;
|
||||||
|
|
||||||
|
// Construct label selector string from map
|
||||||
|
let label_selector = config
|
||||||
|
.selector
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| format!("{}={}", k, v))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",");
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Starting Kubernetes service discovery with selector: {}",
|
||||||
|
label_selector
|
||||||
|
);
|
||||||
|
|
||||||
|
// Create the task that will run in the background
|
||||||
|
let handle = task::spawn(async move {
|
||||||
|
// We'll track pods we've already added to avoid duplicates
|
||||||
|
let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
|
||||||
|
|
||||||
|
// Create a watcher for pods
|
||||||
|
let pods: Api<Pod> = if let Some(namespace) = &config.namespace {
|
||||||
|
Api::namespaced(client, namespace)
|
||||||
|
} else {
|
||||||
|
Api::all(client)
|
||||||
|
};
|
||||||
|
|
||||||
|
info!("Kubernetes service discovery initialized successfully");
|
||||||
|
|
||||||
|
// Create an Arc for the selector map
|
||||||
|
let selector = Arc::new(config.selector);
|
||||||
|
let port = config.port;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Create a watcher with the proper parameters according to the kube-rs API
|
||||||
|
let watcher_config = Config::default();
|
||||||
|
let watcher_stream = watcher(pods.clone(), watcher_config).applied_objects();
|
||||||
|
|
||||||
|
// Clone Arcs for the closures
|
||||||
|
let selector_clone = Arc::clone(&selector);
|
||||||
|
let tracked_pods_clone = Arc::clone(&tracked_pods);
|
||||||
|
let worker_urls_clone = Arc::clone(&worker_urls);
|
||||||
|
|
||||||
|
// Apply label selector filter separately since we can't do it directly with the watcher anymore
|
||||||
|
let filtered_stream = watcher_stream.filter_map(move |obj_res| {
|
||||||
|
let selector_inner = Arc::clone(&selector_clone);
|
||||||
|
|
||||||
|
async move {
|
||||||
|
match obj_res {
|
||||||
|
Ok(pod) => {
|
||||||
|
// Only process pods matching our label selector
|
||||||
|
if pod.metadata.labels.as_ref().map_or(false, |labels| {
|
||||||
|
// Check if the pod has all the labels from our selector
|
||||||
|
selector_inner.iter().all(|(k, v)| {
|
||||||
|
labels.get(k).map_or(false, |label_value| label_value == v)
|
||||||
|
})
|
||||||
|
}) {
|
||||||
|
Some(Ok(pod))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Clone again for the next closure
|
||||||
|
let tracked_pods_clone2 = Arc::clone(&tracked_pods_clone);
|
||||||
|
let worker_urls_clone2 = Arc::clone(&worker_urls_clone);
|
||||||
|
|
||||||
|
match filtered_stream
|
||||||
|
.try_for_each(move |pod| {
|
||||||
|
let tracked_pods_inner = Arc::clone(&tracked_pods_clone2);
|
||||||
|
let worker_urls_inner = Arc::clone(&worker_urls_clone2);
|
||||||
|
|
||||||
|
async move {
|
||||||
|
if let Some(pod_info) = PodInfo::from_pod(&pod) {
|
||||||
|
if pod.metadata.deletion_timestamp.is_some() {
|
||||||
|
handle_pod_deletion(
|
||||||
|
&pod_info,
|
||||||
|
tracked_pods_inner,
|
||||||
|
worker_urls_inner,
|
||||||
|
port,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
} else {
|
||||||
|
handle_pod_event(
|
||||||
|
&pod_info,
|
||||||
|
tracked_pods_inner,
|
||||||
|
worker_urls_inner,
|
||||||
|
port,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(err) => {
|
||||||
|
error!("Error in Kubernetes watcher: {}", err);
|
||||||
|
// Wait a bit before retrying
|
||||||
|
time::sleep(Duration::from_secs(5)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the watcher exits for some reason, wait a bit before restarting
|
||||||
|
warn!(
|
||||||
|
"Kubernetes watcher exited, restarting in {} seconds",
|
||||||
|
config.check_interval.as_secs()
|
||||||
|
);
|
||||||
|
time::sleep(config.check_interval).await;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(handle)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn handle_pod_event(
|
||||||
|
pod_info: &PodInfo,
|
||||||
|
tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
|
||||||
|
worker_urls: Arc<RwLock<Vec<String>>>,
|
||||||
|
port: u16,
|
||||||
|
) {
|
||||||
|
let worker_url = pod_info.worker_url(port);
|
||||||
|
|
||||||
|
// Check if pod is already tracked
|
||||||
|
let already_tracked = {
|
||||||
|
let tracker = tracked_pods.lock().unwrap();
|
||||||
|
tracker.contains(pod_info)
|
||||||
|
};
|
||||||
|
|
||||||
|
// If pod is healthy and not already tracked, add it
|
||||||
|
if pod_info.is_healthy() {
|
||||||
|
if !already_tracked {
|
||||||
|
info!(
|
||||||
|
"Adding healthy pod {} ({}) as worker",
|
||||||
|
pod_info.name, pod_info.ip
|
||||||
|
);
|
||||||
|
|
||||||
|
// Add URL to worker list
|
||||||
|
let mut urls = worker_urls.write().unwrap();
|
||||||
|
if !urls.contains(&worker_url) {
|
||||||
|
urls.push(worker_url.clone());
|
||||||
|
info!("Added new worker URL: {}", worker_url);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track this pod
|
||||||
|
let mut tracker = tracked_pods.lock().unwrap();
|
||||||
|
tracker.insert(pod_info.clone());
|
||||||
|
}
|
||||||
|
} else if already_tracked {
|
||||||
|
// If pod was healthy before but not anymore, remove it
|
||||||
|
handle_pod_deletion(pod_info, tracked_pods, worker_urls, port).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn handle_pod_deletion(
|
||||||
|
pod_info: &PodInfo,
|
||||||
|
tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
|
||||||
|
worker_urls: Arc<RwLock<Vec<String>>>,
|
||||||
|
port: u16,
|
||||||
|
) {
|
||||||
|
let worker_url = pod_info.worker_url(port);
|
||||||
|
|
||||||
|
// Remove the pod from our tracking
|
||||||
|
let was_tracked = {
|
||||||
|
let mut tracker = tracked_pods.lock().unwrap();
|
||||||
|
tracker.remove(pod_info)
|
||||||
|
};
|
||||||
|
|
||||||
|
if was_tracked {
|
||||||
|
info!(
|
||||||
|
"Removing pod {} ({}) from workers",
|
||||||
|
pod_info.name, pod_info.ip
|
||||||
|
);
|
||||||
|
|
||||||
|
// Remove URL from worker list
|
||||||
|
let mut urls = worker_urls.write().unwrap();
|
||||||
|
if let Some(idx) = urls.iter().position(|url| url == &worker_url) {
|
||||||
|
urls.remove(idx);
|
||||||
|
info!("Removed worker URL: {}", worker_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user