[Router] Fix k8s Service Discovery (#6766)
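Service discovery previously cloned the router's worker-URL list (an Arc<RwLock<Vec<String>>> obtained via get_worker_urls()) and pushed or removed URLs in it directly, bypassing the router's own add/remove logic. AppState now holds the router in an Arc, start_service_discovery receives that Arc<Router>, and pod events are applied through router.add_worker / router.remove_worker.

Condensed sketch of the new per-pod path (the register_pod helper name is invented for illustration; PodInfo, worker_url, is_healthy, add_worker and remove_worker are the items actually touched by this diff, with PodInfo assumed to be in scope as in service_discovery.rs):

    use crate::router::Router;
    use std::sync::Arc;
    use tracing::{error, info};

    // Illustrative wrapper condensing handle_pod_event / handle_pod_deletion:
    // every mutation of the worker set now goes through the shared router.
    async fn register_pod(router: Arc<Router>, pod_info: &PodInfo, port: u16) {
        let worker_url = pod_info.worker_url(port); // e.g. "http://10.0.0.1:8080"
        if pod_info.is_healthy() {
            match router.add_worker(&worker_url).await {
                Ok(msg) => info!("Router add_worker: {}", msg),
                Err(e) => error!("Failed to add worker {} to router: {}", worker_url, e),
            }
        } else {
            // Deleted or no-longer-ready pods are deregistered the same way.
            router.remove_worker(&worker_url);
        }
    }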
Co-authored-by: Simo Lin <linsimo.mark@gmail.com>
@@ -21,7 +21,7 @@ serde_json = "1.0"
 pyo3 = { version = "0.22.5", features = ["extension-module"] }
 dashmap = "6.1.0"
 http = "1.1.0"
-tokio = "1.42.0"
+tokio = { version = "1.42.0", features = ["macros", "rt-multi-thread"] }
 # Added for enhanced logging system
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
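Note on the tokio change above: the "macros" feature provides the #[tokio::test] attribute used by the async tests added at the end of this diff, and "rt-multi-thread" enables the multi-threaded runtime flavor. A minimal, self-contained illustration (test name and values are invented):

    // Requires tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
    #[tokio::test]
    async fn runs_async_assertions() {
        // The attribute builds a runtime and blocks on this async body.
        let value = tokio::spawn(async { 21 * 2 }).await.unwrap();
        assert_eq!(value, 42);
    }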
@@ -18,7 +18,7 @@ use tracing::{error, info, warn, Level};
 
 #[derive(Debug)]
 pub struct AppState {
-    router: Router,
+    router: Arc<Router>,
     client: Client,
 }
 
@@ -29,7 +29,7 @@ impl AppState {
         policy_config: PolicyConfig,
     ) -> Result<Self, String> {
         // Create router based on policy
-        let router = Router::new(worker_urls, policy_config)?;
+        let router = Arc::new(Router::new(worker_urls, policy_config)?);
        Ok(Self { router, client })
     }
 }
@@ -218,24 +218,23 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
         .build()
         .expect("Failed to create HTTP client");
 
-    let app_state = web::Data::new(
-        AppState::new(
-            config.worker_urls.clone(),
-            client.clone(), // Clone the client here
-            config.policy_config.clone(),
-        )
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?,
-    );
+    let app_state_init = AppState::new(
+        config.worker_urls.clone(),
+        client.clone(),
+        config.policy_config.clone(),
+    )
+    .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let router_arc = Arc::clone(&app_state_init.router);
+    let app_state = web::Data::new(app_state_init);
 
     // Start the service discovery if enabled
     if let Some(service_discovery_config) = config.service_discovery_config {
         if service_discovery_config.enabled {
-            let worker_urls = Arc::clone(&app_state.router.get_worker_urls());
-
-            match start_service_discovery(service_discovery_config, worker_urls).await {
+            info!("🚧 Initializing Kubernetes service discovery");
+            // Pass the Arc<Router> directly
+            match start_service_discovery(service_discovery_config, router_arc).await {
                 Ok(handle) => {
                     info!("✅ Service discovery started successfully");
 
                     // Spawn a task to handle the service discovery thread
                     spawn(async move {
                         if let Err(e) = handle.await {
@@ -252,7 +251,10 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
     }
 
     info!("✅ Serving router on {}:{}", config.host, config.port);
-    info!("✅ Serving workers on {:?}", config.worker_urls);
+    info!(
+        "✅ Serving workers on {:?}",
+        app_state.router.get_worker_urls().read().unwrap()
+    );
 
     HttpServer::new(move || {
         App::new()
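start_service_discovery spawns the watcher task internally and returns its JoinHandle, so startup() only parks another task on that handle to log if the watcher ever exits. A trimmed sketch of that caller side, assuming tokio::spawn; the spawn_discovery wrapper name and error messages are invented for illustration:

    use std::sync::Arc;
    use tracing::{error, info};

    // Hypothetical wrapper around the wiring shown in the startup() hunk above.
    async fn spawn_discovery(config: ServiceDiscoveryConfig, router_arc: Arc<Router>) {
        match start_service_discovery(config, router_arc).await {
            Ok(handle) => {
                info!("Service discovery started");
                // The watcher already runs on its own task; just observe its exit.
                tokio::spawn(async move {
                    if let Err(e) = handle.await {
                        error!("Service discovery task failed: {}", e);
                    }
                });
            }
            Err(e) => error!("Failed to start Kubernetes service discovery: {}", e),
        }
    }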
@@ -1,3 +1,5 @@
+use crate::router::Router;
+
 use futures::{StreamExt, TryStreamExt};
 use k8s_openapi::api::core::v1::Pod;
 use kube::{
@@ -7,11 +9,12 @@ use kube::{
     Client,
 };
 use std::collections::{HashMap, HashSet};
-use std::sync::{Arc, Mutex, RwLock};
+use std::sync::{Arc, Mutex};
 use std::time::Duration;
 use tokio::task;
 use tokio::time;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
 
 /// Represents the service discovery configuration
 #[derive(Debug, Clone)]
@@ -81,7 +84,7 @@ impl PodInfo {
 
 pub async fn start_service_discovery(
     config: ServiceDiscoveryConfig,
-    worker_urls: Arc<RwLock<Vec<String>>>,
+    router: Arc<Router>,
 ) -> Result<task::JoinHandle<()>, kube::Error> {
     // Don't initialize anything if service discovery is disabled
     if !config.enabled {
@@ -136,7 +139,6 @@ pub async fn start_service_discovery(
     // Clone Arcs for the closures
     let selector_clone = Arc::clone(&selector);
     let tracked_pods_clone = Arc::clone(&tracked_pods);
-    let worker_urls_clone = Arc::clone(&worker_urls);
 
     // Apply label selector filter separately since we can't do it directly with the watcher anymore
     let filtered_stream = watcher_stream.filter_map(move |obj_res| {
@@ -164,12 +166,12 @@ pub async fn start_service_discovery(
 
     // Clone again for the next closure
     let tracked_pods_clone2 = Arc::clone(&tracked_pods_clone);
-    let worker_urls_clone2 = Arc::clone(&worker_urls_clone);
+    let router_clone = Arc::clone(&router);
 
     match filtered_stream
         .try_for_each(move |pod| {
            let tracked_pods_inner = Arc::clone(&tracked_pods_clone2);
-            let worker_urls_inner = Arc::clone(&worker_urls_clone2);
+            let router_inner = Arc::clone(&router_clone);
 
            async move {
                if let Some(pod_info) = PodInfo::from_pod(&pod) {
@@ -177,18 +179,13 @@ pub async fn start_service_discovery(
                        handle_pod_deletion(
                            &pod_info,
                            tracked_pods_inner,
-                            worker_urls_inner,
+                            router_inner,
                            port,
                        )
                        .await;
                    } else {
-                        handle_pod_event(
-                            &pod_info,
-                            tracked_pods_inner,
-                            worker_urls_inner,
-                            port,
-                        )
-                        .await;
+                        handle_pod_event(&pod_info, tracked_pods_inner, router_inner, port)
+                            .await;
                    }
                }
                Ok(())
@@ -219,7 +216,7 @@ pub async fn start_service_discovery(
 async fn handle_pod_event(
     pod_info: &PodInfo,
     tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
-    worker_urls: Arc<RwLock<Vec<String>>>,
+    router: Arc<Router>,
     port: u16,
 ) {
     let worker_url = pod_info.worker_url(port);
@@ -234,52 +231,275 @@ async fn handle_pod_event(
     if pod_info.is_healthy() {
         if !already_tracked {
             info!(
-                "Adding healthy pod {} ({}) as worker",
-                pod_info.name, pod_info.ip
+                "Healthy pod found: {}. Adding worker: {}",
+                pod_info.name, worker_url
             );
 
-            // Add URL to worker list
-            let mut urls = worker_urls.write().unwrap();
-            if !urls.contains(&worker_url) {
-                urls.push(worker_url.clone());
-                info!("Added new worker URL: {}", worker_url);
+            match router.add_worker(&worker_url).await {
+                Ok(msg) => {
+                    info!("Router add_worker: {}", msg);
+                    let mut tracker = tracked_pods.lock().unwrap();
+                    tracker.insert(pod_info.clone());
+                }
+                Err(e) => error!("Failed to add worker {} to router: {}", worker_url, e),
             }
-
-            // Track this pod
-            let mut tracker = tracked_pods.lock().unwrap();
-            tracker.insert(pod_info.clone());
         }
     } else if already_tracked {
         // If pod was healthy before but not anymore, remove it
-        handle_pod_deletion(pod_info, tracked_pods, worker_urls, port).await;
+        handle_pod_deletion(pod_info, tracked_pods, router, port).await;
     }
 }
 
 async fn handle_pod_deletion(
     pod_info: &PodInfo,
     tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
-    worker_urls: Arc<RwLock<Vec<String>>>,
+    router: Arc<Router>,
     port: u16,
 ) {
     let worker_url = pod_info.worker_url(port);
+    let mut tracked = tracked_pods.lock().unwrap();
 
-    // Remove the pod from our tracking
-    let was_tracked = {
-        let mut tracker = tracked_pods.lock().unwrap();
-        tracker.remove(pod_info)
-    };
-
-    if was_tracked {
+    if tracked.remove(pod_info) {
         info!(
-            "Removing pod {} ({}) from workers",
-            pod_info.name, pod_info.ip
+            "Pod deleted: {}. Removing worker: {}",
+            pod_info.name, worker_url
         );
-
-        // Remove URL from worker list
-        let mut urls = worker_urls.write().unwrap();
-        if let Some(idx) = urls.iter().position(|url| url == &worker_url) {
-            urls.remove(idx);
-            info!("Removed worker URL: {}", worker_url);
-        }
+        router.remove_worker(&worker_url);
+    } else {
+        // This case might occur if a pod is deleted before it was ever marked healthy and added.
+        // Or if the event is duplicated. No action needed on the router if it wasn't tracked (and thus not added).
+        debug!(
+            "Pod deletion event for untracked/already removed pod: {}. Worker URL: {}",
+            pod_info.name, worker_url
+        );
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::router::Router;
+    use k8s_openapi::api::core::v1::{Pod, PodCondition, PodSpec, PodStatus};
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time;
+    use std::sync::RwLock;
+
+    // Helper function to create a Pod for testing PodInfo::from_pod
+    fn create_k8s_pod(
+        name: Option<&str>,
+        ip: Option<&str>,
+        phase: Option<&str>,
+        ready_status: Option<&str>,
+        deletion_timestamp: Option<Time>,
+    ) -> Pod {
+        let mut pod = Pod {
+            metadata: ObjectMeta {
+                name: name.map(String::from),
+                deletion_timestamp,
+                ..Default::default()
+            },
+            spec: Some(PodSpec::default()),
+            status: None,
+        };
+
+        if ip.is_some() || phase.is_some() || ready_status.is_some() {
+            let mut pod_status = PodStatus {
+                pod_ip: ip.map(String::from),
+                phase: phase.map(String::from),
+                conditions: None,
+                ..Default::default()
+            };
+
+            if let Some(status_str) = ready_status {
+                let condition = PodCondition {
+                    type_: "Ready".to_string(),
+                    status: status_str.to_string(),
+                    last_probe_time: None,
+                    last_transition_time: None,
+                    message: None,
+                    reason: None,
+                };
+                pod_status.conditions = Some(vec![condition]);
+            }
+            pod.status = Some(pod_status);
+        }
+        pod
+    }
+
+    // Helper to create a Router instance for testing event handlers
+    fn create_test_router() -> Arc<Router> {
+        let worker_urls = Arc::new(RwLock::new(Vec::new()));
+        Arc::new(Router::Random {
+            worker_urls,
+            timeout_secs: 5,
+            interval_secs: 1,
+        })
+    }
+
+    #[test]
+    fn test_service_discovery_config_default() {
+        let config = ServiceDiscoveryConfig::default();
+        assert!(!config.enabled);
+        assert!(config.selector.is_empty());
+        assert_eq!(config.check_interval, Duration::from_secs(60));
+        assert_eq!(config.port, 80);
+        assert!(config.namespace.is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_valid() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            Some("True"),
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod).unwrap();
+        assert_eq!(pod_info.name, "test-pod");
+        assert_eq!(pod_info.ip, "10.0.0.1");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_not_ready() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            Some("False"),
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod).unwrap();
+        assert!(!pod_info.is_ready);
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_no_conditions() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            None,
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod).unwrap();
+        assert!(!pod_info.is_ready);
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_name() {
+        let k8s_pod = create_k8s_pod(None, Some("10.0.0.1"), Some("Running"), Some("True"), None);
+        assert!(PodInfo::from_pod(&k8s_pod).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_ip() {
+        let k8s_pod = create_k8s_pod(Some("test-pod"), None, Some("Running"), Some("True"), None);
+        assert!(PodInfo::from_pod(&k8s_pod).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_status_phase() {
+        let k8s_pod = create_k8s_pod(Some("test-pod"), Some("10.0.0.1"), None, Some("True"), None);
+        let pod_info = PodInfo::from_pod(&k8s_pod).unwrap();
+        assert_eq!(pod_info.status, "Unknown");
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_no_status_object() {
+        let mut k8s_pod = create_k8s_pod(Some("test-pod"), None, None, None, None);
+        k8s_pod.status = None;
+        assert!(PodInfo::from_pod(&k8s_pod).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_is_healthy() {
+        let healthy_pod = PodInfo {
+            name: "p1".into(),
+            ip: "1.1.1.1".into(),
+            status: "Running".into(),
+            is_ready: true,
+        };
+        assert!(healthy_pod.is_healthy());
+
+        let not_ready_pod = PodInfo {
+            name: "p2".into(),
+            ip: "1.1.1.2".into(),
+            status: "Running".into(),
+            is_ready: false,
+        };
+        assert!(!not_ready_pod.is_healthy());
+
+        let not_running_pod = PodInfo {
+            name: "p3".into(),
+            ip: "1.1.1.3".into(),
+            status: "Pending".into(),
+            is_ready: true,
+        };
+        assert!(!not_running_pod.is_healthy());
+    }
+
+    #[test]
+    fn test_pod_info_worker_url() {
+        let pod_info = PodInfo {
+            name: "p1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+        };
+        assert_eq!(pod_info.worker_url(8080), "http://1.2.3.4:8080");
+    }
+
+    #[tokio::test]
+    async fn test_handle_pod_event_add_unhealthy_pod() {
+        let router = create_test_router();
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Pending".into(),
+            is_ready: false,
+        };
+        let port = 8080u16;
+
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+        )
+        .await;
+
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+        assert!(!router
+            .get_worker_urls()
+            .read()
+            .unwrap()
+            .contains(&pod_info.worker_url(port)));
+    }
+
+    #[tokio::test]
+    async fn test_handle_pod_deletion_non_existing_pod() {
+        let router = create_test_router();
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+        };
+        let port = 8080u16;
+
+        handle_pod_deletion(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+        )
+        .await;
+
+        assert!(tracked_pods.lock().unwrap().is_empty());
+        assert!(router.get_worker_urls().read().unwrap().is_empty());
+    }
+}