[router] address worker load tracking consistency (#9523)

Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
This commit is contained in:
Simo Lin
2025-08-26 06:40:51 -07:00
committed by GitHub
parent 0936c766ed
commit 3578eb1e9b
3 changed files with 95 additions and 2 deletions

View File

@@ -55,6 +55,12 @@ pub trait Worker: Send + Sync + fmt::Debug {
/// Decrement the load counter
fn decrement_load(&self);
/// Reset the load counter to 0 (for sync/recovery).
///
/// The default implementation is a no-op, so implementors are not required
/// to provide this method. Implementors that maintain a load counter should
/// override it so that the counter is actually cleared.
fn reset_load(&self) {
// Intentionally empty: by default there is nothing to reset.
// Workers that track load should override this.
}
/// Get the number of processed requests
fn processed_requests(&self) -> usize;
@@ -364,6 +370,10 @@ impl Worker for BasicWorker {
.ok();
}
/// Reset this worker's load counter to 0.
fn reset_load(&self) {
// Unconditionally overwrite the counter with 0, whatever its current value.
// `Ordering::Relaxed` is used, matching the read in `processed_requests`.
self.load_counter.store(0, Ordering::Relaxed);
}
/// Return the current value of `processed_counter`.
fn processed_requests(&self) -> usize {
// Read with `Ordering::Relaxed`; the value is a point-in-time snapshot.
self.processed_counter.load(Ordering::Relaxed)
}
@@ -449,6 +459,10 @@ impl Worker for DPAwareWorker {
self.base_worker.decrement_load();
}
/// Reset the load counter by delegating to the wrapped `base_worker`.
fn reset_load(&self) {
self.base_worker.reset_load();
}
/// Return the processed-request count reported by the wrapped `base_worker`.
fn processed_requests(&self) -> usize {
self.base_worker.processed_requests()
}
@@ -825,6 +839,10 @@ pub fn start_health_checker(
let mut interval =
tokio::time::interval(tokio::time::Duration::from_secs(check_interval_secs));
// Counter for periodic load reset (every 10 health check cycles)
let mut check_count = 0u64;
const LOAD_RESET_INTERVAL: u64 = 10;
loop {
interval.tick().await;
@@ -834,6 +852,8 @@ pub fn start_health_checker(
break;
}
check_count += 1;
// Check health of all workers
let workers_to_check = match workers.read() {
Ok(guard) => guard.iter().map(|w| w.clone_worker()).collect::<Vec<_>>(),
@@ -843,6 +863,22 @@ pub fn start_health_checker(
}
};
// Periodically reset load counters to prevent drift.
// Only do this when every worker's reported load is at or below a small
// threshold, i.e. the workers appear to be (nearly) idle.
if check_count.is_multiple_of(LOAD_RESET_INTERVAL) {
let max_load = workers_to_check.iter().map(|w| w.load()).max().unwrap_or(0);
// Only reset if load appears to be very low (likely drift)
if max_load <= 2 {
tracing::debug!(
"Resetting load counters to prevent drift (max_load: {})",
max_load
);
for worker in &workers_to_check {
worker.reset_load();
}
}
}
// Perform health checks concurrently
let health_checks = workers_to_check.iter().map(|worker| {
let worker_url = worker.url().to_string();