[router] consolidate worker load monitoring (#10894)

This commit is contained in:
Simo Lin
2025-09-25 09:59:30 -04:00
committed by GitHub
parent 77830a265e
commit d511b2d905
7 changed files with 199 additions and 232 deletions

View File

@@ -25,5 +25,5 @@ pub use worker::{
Worker, WorkerFactory, WorkerLoadGuard, WorkerType,
};
pub use worker_builder::{BasicWorkerBuilder, DPAwareWorkerBuilder};
pub use worker_manager::{DpInfo, ServerInfo, WorkerManager};
pub use worker_manager::{DpInfo, LoadMonitor, ServerInfo, WorkerManager};
pub use worker_registry::{WorkerId, WorkerRegistry, WorkerRegistryStats};

View File

@@ -23,6 +23,8 @@ use serde_json::Value;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{watch, Mutex};
use tokio::task::JoinHandle;
use tracing::{debug, error, info, warn};
static HTTP_CLIENT: Lazy<reqwest::Client> = Lazy::new(|| {
@@ -1177,6 +1179,139 @@ impl WorkerManager {
}
}
/// Load monitoring service that periodically fetches worker loads
///
/// A background tokio task (spawned by `start`) fetches loads through
/// `WorkerManager::get_all_worker_loads`, passes them to every PowerOfTwo
/// policy returned by the policy registry, and publishes the latest map on a
/// `watch` channel.
pub struct LoadMonitor {
/// Registry handed to `WorkerManager::get_all_worker_loads` on every round.
worker_registry: Arc<WorkerRegistry>,
/// Registry queried each round for the PowerOfTwo policies to update.
policy_registry: Arc<PolicyRegistry>,
/// HTTP client handed to the load fetch.
client: reqwest::Client,
/// Period of the monitoring loop's timer.
interval: Duration,
/// Sending half of the watch channel carrying the latest load map
/// (`load_info.worker` -> `load_info.load`).
tx: watch::Sender<HashMap<String, isize>>,
/// Receiving half kept on the monitor; `subscribe` hands out clones of it.
rx: watch::Receiver<HashMap<String, isize>>,
/// Handle of the spawned monitoring task; `None` until `start` is called
/// and again after `stop`.
monitor_handle: Arc<Mutex<Option<JoinHandle<()>>>>,
}
impl LoadMonitor {
    /// Create a new load monitor.
    ///
    /// The monitor is created idle; call [`LoadMonitor::start`] to spawn the
    /// background task.
    ///
    /// `interval_secs` is the polling period in seconds. A value of `0` is
    /// raised to `1`, because `tokio::time::interval` panics on a zero period
    /// and that panic would otherwise kill the monitoring task as soon as it
    /// is started.
    pub fn new(
        worker_registry: Arc<WorkerRegistry>,
        policy_registry: Arc<PolicyRegistry>,
        client: reqwest::Client,
        interval_secs: u64,
    ) -> Self {
        if interval_secs == 0 {
            warn!("Load monitoring interval of 0s is not supported, using 1s instead");
        }
        let (tx, rx) = watch::channel(HashMap::new());

        Self {
            worker_registry,
            policy_registry,
            client,
            interval: Duration::from_secs(interval_secs.max(1)),
            tx,
            rx,
            monitor_handle: Arc::new(Mutex::new(None)),
        }
    }

    /// Start monitoring worker loads.
    ///
    /// Does nothing if a monitoring task is already running. A stored handle
    /// whose task has already finished (the loop never returns on its own, so
    /// this means it panicked or was aborted) is replaced by a fresh task
    /// instead of blocking every later start.
    pub async fn start(&self) {
        let mut handle_guard = self.monitor_handle.lock().await;
        if let Some(existing) = handle_guard.as_ref() {
            if !existing.is_finished() {
                debug!("Load monitoring already running");
                return;
            }
            warn!("Previous load monitoring task has exited, restarting it");
        }

        info!(
            "Starting load monitoring with interval: {:?}",
            self.interval
        );

        // The task must own everything it touches, so clone the shared handles.
        let handle = tokio::spawn(Self::monitor_loop(
            Arc::clone(&self.worker_registry),
            Arc::clone(&self.policy_registry),
            self.client.clone(),
            self.interval,
            self.tx.clone(),
        ));

        *handle_guard = Some(handle);
    }

    /// Stop monitoring worker loads.
    ///
    /// Aborts the background task, if any, and waits for it to wind down.
    /// Calling this while no task is stored is a no-op.
    pub async fn stop(&self) {
        let mut handle_guard = self.monitor_handle.lock().await;
        if let Some(handle) = handle_guard.take() {
            info!("Stopping load monitoring");
            handle.abort();
            // The join result (cancellation or panic) carries no useful
            // information here; awaiting only ensures the task is gone.
            let _ = handle.await;
        }
    }

    /// Get a receiver for load updates.
    ///
    /// The returned receiver is a clone of the monitor's own receiver.
    pub fn subscribe(&self) -> watch::Receiver<HashMap<String, isize>> {
        self.rx.clone()
    }

    /// The main monitoring loop.
    ///
    /// On every timer tick: skip the round when no PowerOfTwo policy is
    /// registered, otherwise fetch all worker loads, hand them to each
    /// PowerOfTwo policy and publish them on the watch channel. The loop
    /// never returns; it ends only when its task is aborted.
    async fn monitor_loop(
        worker_registry: Arc<WorkerRegistry>,
        policy_registry: Arc<PolicyRegistry>,
        client: reqwest::Client,
        interval: Duration,
        tx: watch::Sender<HashMap<String, isize>>,
    ) {
        let mut interval_timer = tokio::time::interval(interval);
        // A fetch round can take longer than `interval`; skip the missed
        // ticks rather than firing a burst of back-to-back fetches.
        interval_timer.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            interval_timer.tick().await;

            let power_of_two_policies = policy_registry.get_all_power_of_two_policies();
            if power_of_two_policies.is_empty() {
                debug!("No PowerOfTwo policies found, skipping load fetch");
                continue;
            }

            let result = WorkerManager::get_all_worker_loads(&worker_registry, &client).await;
            let loads: HashMap<String, isize> = result
                .loads
                .into_iter()
                .map(|load_info| (load_info.worker, load_info.load))
                .collect();

            if loads.is_empty() {
                warn!("No loads fetched from workers");
                continue;
            }

            debug!(
                "Fetched loads from {} workers, updating {} PowerOfTwo policies",
                loads.len(),
                power_of_two_policies.len()
            );
            for policy in &power_of_two_policies {
                policy.update_loads(&loads);
            }

            // A send error is deliberately ignored: the policies above have
            // already been updated directly.
            let _ = tx.send(loads);
        }
    }

    /// Check if monitoring is currently active.
    ///
    /// Returns `true` only while a task is stored and has not finished, so a
    /// task that died (e.g. from a panic) is not reported as running.
    pub async fn is_running(&self) -> bool {
        let handle_guard = self.monitor_handle.lock().await;
        handle_guard
            .as_ref()
            .map_or(false, |handle| !handle.is_finished())
    }
}
impl Drop for LoadMonitor {
    /// Best-effort cleanup: abort the monitoring task if its handle can be
    /// taken without waiting for the lock.
    fn drop(&mut self) {
        // `drop` is synchronous, so only a non-waiting `try_lock` is possible;
        // if the lock is currently held nothing is aborted here.
        let pending = self
            .monitor_handle
            .try_lock()
            .ok()
            .and_then(|mut slot| slot.take());

        if let Some(task) = pending {
            task.abort();
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;