[router] cleanup worker health check to return early (#11310)
This commit is contained in:
@@ -9,11 +9,14 @@ use serde_json;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
use std::sync::{Arc, LazyLock};
|
use std::sync::{Arc, LazyLock};
|
||||||
|
use std::time::Duration;
|
||||||
|
use std::time::Instant;
|
||||||
use tokio::sync::{Mutex, RwLock};
|
use tokio::sync::{Mutex, RwLock};
|
||||||
|
use tokio::time;
|
||||||
|
|
||||||
static WORKER_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
|
static WORKER_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
|
||||||
reqwest::Client::builder()
|
reqwest::Client::builder()
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
.timeout(Duration::from_secs(30))
|
||||||
.build()
|
.build()
|
||||||
.expect("Failed to create worker HTTP client")
|
.expect("Failed to create worker HTTP client")
|
||||||
});
|
});
|
||||||
@@ -227,6 +230,8 @@ pub trait Worker: Send + Sync + fmt::Debug {
|
|||||||
async fn reset_grpc_client(&self) -> WorkerResult<()> {
|
async fn reset_grpc_client(&self) -> WorkerResult<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
async fn grpc_health_check(&self) -> WorkerResult<bool>;
|
||||||
|
async fn http_health_check(&self) -> WorkerResult<bool>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Connection mode for worker communication
|
/// Connection mode for worker communication
|
||||||
@@ -407,66 +412,9 @@ impl Worker for BasicWorker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn check_health_async(&self) -> WorkerResult<()> {
|
async fn check_health_async(&self) -> WorkerResult<()> {
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
let health_result = match &self.metadata.connection_mode {
|
let health_result = match &self.metadata.connection_mode {
|
||||||
ConnectionMode::Http => {
|
ConnectionMode::Http => self.http_health_check().await?,
|
||||||
let url = self.normalised_url()?;
|
ConnectionMode::Grpc { .. } => self.grpc_health_check().await?,
|
||||||
let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
|
|
||||||
let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
|
|
||||||
|
|
||||||
let mut request = WORKER_CLIENT.get(&health_url).timeout(timeout);
|
|
||||||
|
|
||||||
if let Some(ref api_key) = self.metadata.api_key {
|
|
||||||
request = request.header("Authorization", format!("Bearer {}", api_key));
|
|
||||||
}
|
|
||||||
|
|
||||||
match request.send().await {
|
|
||||||
Ok(response) => response.status().is_success(),
|
|
||||||
Err(_) => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ConnectionMode::Grpc { .. } => {
|
|
||||||
// Use the new get_grpc_client() method for lazy initialization
|
|
||||||
match self.get_grpc_client().await {
|
|
||||||
Ok(Some(grpc_client)) => {
|
|
||||||
let mut client = grpc_client.lock().await;
|
|
||||||
match client.health_check().await {
|
|
||||||
Ok(response) => {
|
|
||||||
tracing::debug!(
|
|
||||||
"gRPC health check succeeded for {}: healthy={}",
|
|
||||||
self.metadata.url,
|
|
||||||
response.healthy
|
|
||||||
);
|
|
||||||
response.healthy
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
tracing::warn!(
|
|
||||||
"gRPC health check RPC failed for {}: {:?}",
|
|
||||||
self.metadata.url,
|
|
||||||
e
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(None) => {
|
|
||||||
tracing::error!(
|
|
||||||
"Worker {} is not a gRPC worker but has gRPC connection mode",
|
|
||||||
self.metadata.url
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
tracing::error!(
|
|
||||||
"Failed to get gRPC client for worker {}: {:?}",
|
|
||||||
self.metadata.url,
|
|
||||||
e
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
if health_result {
|
if health_result {
|
||||||
@@ -594,6 +542,61 @@ impl Worker for BasicWorker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn grpc_health_check(&self) -> WorkerResult<bool> {
|
||||||
|
let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
|
||||||
|
let maybe = self.get_grpc_client().await?;
|
||||||
|
let Some(grpc_client) = maybe else {
|
||||||
|
tracing::error!(
|
||||||
|
"Worker {} is not a gRPC worker but connection mode is gRPC",
|
||||||
|
self.metadata.url
|
||||||
|
);
|
||||||
|
return Ok(false);
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut client = grpc_client.lock().await;
|
||||||
|
match time::timeout(timeout, client.health_check()).await {
|
||||||
|
Ok(Ok(resp)) => {
|
||||||
|
tracing::debug!(
|
||||||
|
"gRPC health OK for {}: healthy={}",
|
||||||
|
self.metadata.url,
|
||||||
|
resp.healthy
|
||||||
|
);
|
||||||
|
Ok(resp.healthy)
|
||||||
|
}
|
||||||
|
Ok(Err(err)) => {
|
||||||
|
tracing::warn!("gRPC health RPC error for {}: {err:?}", self.metadata.url);
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
tracing::warn!("gRPC health timed out for {}", self.metadata.url);
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn http_health_check(&self) -> WorkerResult<bool> {
|
||||||
|
let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
|
||||||
|
|
||||||
|
let url = self.normalised_url()?;
|
||||||
|
let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
|
||||||
|
|
||||||
|
let mut req = WORKER_CLIENT.get(health_url).timeout(timeout);
|
||||||
|
if let Some(api_key) = &self.metadata.api_key {
|
||||||
|
req = req.bearer_auth(api_key);
|
||||||
|
}
|
||||||
|
|
||||||
|
match req.send().await {
|
||||||
|
Ok(resp) => Ok(resp.status().is_success()),
|
||||||
|
Err(err) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"HTTP health check failed for {}: {err:?}",
|
||||||
|
self.metadata.url
|
||||||
|
);
|
||||||
|
Ok(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A DP-aware worker that handles data-parallel routing
|
/// A DP-aware worker that handles data-parallel routing
|
||||||
@@ -730,6 +733,14 @@ impl Worker for DPAwareWorker {
|
|||||||
async fn reset_grpc_client(&self) -> WorkerResult<()> {
|
async fn reset_grpc_client(&self) -> WorkerResult<()> {
|
||||||
self.base_worker.reset_grpc_client().await
|
self.base_worker.reset_grpc_client().await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn grpc_health_check(&self) -> WorkerResult<bool> {
|
||||||
|
self.base_worker.grpc_health_check().await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn http_health_check(&self) -> WorkerResult<bool> {
|
||||||
|
self.base_worker.http_health_check().await
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Worker factory for creating workers of different types
|
/// Worker factory for creating workers of different types
|
||||||
@@ -755,10 +766,8 @@ impl WorkerFactory {
|
|||||||
/// Static health validation before creating a worker
|
/// Static health validation before creating a worker
|
||||||
/// This replaces wait_for_worker_health in handlers
|
/// This replaces wait_for_worker_health in handlers
|
||||||
pub async fn validate_health(url: &str, timeout_secs: u64) -> WorkerResult<()> {
|
pub async fn validate_health(url: &str, timeout_secs: u64) -> WorkerResult<()> {
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
let start_time = Instant::now();
|
let start_time = Instant::now();
|
||||||
let timeout = std::time::Duration::from_secs(timeout_secs);
|
let timeout = Duration::from_secs(timeout_secs);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if start_time.elapsed() > timeout {
|
if start_time.elapsed() > timeout {
|
||||||
@@ -775,7 +784,7 @@ impl WorkerFactory {
|
|||||||
// API key authentication is handled in the worker instance's check_health_async method
|
// API key authentication is handled in the worker instance's check_health_async method
|
||||||
match WORKER_CLIENT
|
match WORKER_CLIENT
|
||||||
.get(format!("{}/health", url))
|
.get(format!("{}/health", url))
|
||||||
.timeout(std::time::Duration::from_secs(5))
|
.timeout(Duration::from_secs(5))
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
@@ -795,7 +804,7 @@ impl WorkerFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -891,8 +900,7 @@ pub fn start_health_checker(
|
|||||||
let shutdown_clone = shutdown.clone();
|
let shutdown_clone = shutdown.clone();
|
||||||
|
|
||||||
let handle = tokio::spawn(async move {
|
let handle = tokio::spawn(async move {
|
||||||
let mut interval =
|
let mut interval = tokio::time::interval(Duration::from_secs(check_interval_secs));
|
||||||
tokio::time::interval(tokio::time::Duration::from_secs(check_interval_secs));
|
|
||||||
|
|
||||||
// Counter for periodic load reset (every 10 health check cycles)
|
// Counter for periodic load reset (every 10 health check cycles)
|
||||||
let mut check_count = 0u64;
|
let mut check_count = 0u64;
|
||||||
|
|||||||
@@ -953,113 +953,105 @@ impl WorkerManager {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Mark all workers as unhealthy initially
|
||||||
info!(
|
info!(
|
||||||
"Marking {} workers as unhealthy before initial health checks",
|
"Marking {} workers as unhealthy before health checks",
|
||||||
workers.len()
|
workers.len()
|
||||||
);
|
);
|
||||||
for worker in &workers {
|
for worker in &workers {
|
||||||
worker.set_healthy(false);
|
worker.set_healthy(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
|
||||||
"Performing initial health checks for {} workers",
|
|
||||||
workers.len()
|
|
||||||
);
|
|
||||||
let health_check_futures: Vec<_> = workers
|
|
||||||
.iter()
|
|
||||||
.map(|worker| {
|
|
||||||
let w = worker.clone();
|
|
||||||
let url = worker.url().to_string();
|
|
||||||
async move {
|
|
||||||
match w.check_health_async().await {
|
|
||||||
Ok(_) => {
|
|
||||||
w.set_healthy(true);
|
|
||||||
debug!(
|
|
||||||
"Worker {} passed initial health check and marked healthy",
|
|
||||||
url
|
|
||||||
);
|
|
||||||
Ok(url)
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Worker {} failed initial health check: {}", url, e);
|
|
||||||
Err(url)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let health_results = future::join_all(health_check_futures).await;
|
|
||||||
let failed_checks: Vec<_> = health_results.into_iter().filter_map(|r| r.err()).collect();
|
|
||||||
|
|
||||||
if !failed_checks.is_empty() {
|
|
||||||
info!(
|
|
||||||
"Initial health check: {} workers failed: {:?}",
|
|
||||||
failed_checks.len(),
|
|
||||||
failed_checks
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
|
// 1. Filter unhealthy workers
|
||||||
let workers = registry.get_all();
|
let workers = registry.get_all();
|
||||||
let healthy_workers: Vec<_> = workers
|
|
||||||
.iter()
|
|
||||||
.filter(|w| w.is_healthy())
|
|
||||||
.map(|w| w.url().to_string())
|
|
||||||
.collect();
|
|
||||||
let unhealthy_workers: Vec<_> = workers
|
let unhealthy_workers: Vec<_> = workers
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|w| !w.is_healthy())
|
.filter(|w| !w.is_healthy())
|
||||||
.map(|w| w.url().to_string())
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// 2. If all workers are healthy, return immediately
|
||||||
if unhealthy_workers.is_empty() {
|
if unhealthy_workers.is_empty() {
|
||||||
|
let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
|
||||||
info!(
|
info!(
|
||||||
"All {} workers are healthy: {:?}",
|
"All {} workers are healthy: {:?}",
|
||||||
workers.len(),
|
workers.len(),
|
||||||
healthy_workers
|
healthy_urls
|
||||||
);
|
);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check timeout
|
||||||
if start_time.elapsed() > timeout {
|
if start_time.elapsed() > timeout {
|
||||||
|
let healthy_workers: Vec<_> = workers
|
||||||
|
.iter()
|
||||||
|
.filter(|w| w.is_healthy())
|
||||||
|
.map(|w| w.url().to_string())
|
||||||
|
.collect();
|
||||||
|
let unhealthy_urls: Vec<_> = unhealthy_workers
|
||||||
|
.iter()
|
||||||
|
.map(|w| w.url().to_string())
|
||||||
|
.collect();
|
||||||
|
|
||||||
error!(
|
error!(
|
||||||
"Workers failed to become healthy after {}s. Unhealthy: {:?}, Healthy: {:?}",
|
"Workers failed to become healthy after {}s. Unhealthy: {:?}, Healthy: {:?}",
|
||||||
timeout_secs, unhealthy_workers, healthy_workers
|
timeout_secs, unhealthy_urls, healthy_workers
|
||||||
);
|
);
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
"Workers failed to become healthy after {}s. Unhealthy: {:?}",
|
"Workers failed to become healthy after {}s. Unhealthy: {:?}",
|
||||||
timeout_secs, unhealthy_workers
|
timeout_secs, unhealthy_urls
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let unhealthy_urls: Vec<_> = unhealthy_workers
|
||||||
|
.iter()
|
||||||
|
.map(|w| w.url().to_string())
|
||||||
|
.collect();
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Waiting for {} workers to become healthy. Unhealthy: {:?}",
|
"Waiting for {} workers to become healthy. Unhealthy: {:?}",
|
||||||
unhealthy_workers.len(),
|
unhealthy_workers.len(),
|
||||||
unhealthy_workers
|
unhealthy_urls
|
||||||
);
|
);
|
||||||
|
|
||||||
let unhealthy_workers_to_check = workers
|
// 3. Check health of all unhealthy workers in parallel
|
||||||
|
let health_check_futures: Vec<_> = unhealthy_workers
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|w| !w.is_healthy())
|
.map(|worker| {
|
||||||
.cloned()
|
let w = worker.clone();
|
||||||
.collect::<Vec<_>>();
|
let url = worker.url().to_string();
|
||||||
|
async move {
|
||||||
for worker in unhealthy_workers_to_check {
|
match w.check_health_async().await {
|
||||||
let url = worker.url().to_string();
|
Ok(_) => {
|
||||||
match worker.check_health_async().await {
|
w.set_healthy(true);
|
||||||
Ok(_) => {
|
debug!("Worker {} now healthy", url);
|
||||||
if !worker.is_healthy() {
|
}
|
||||||
worker.set_healthy(true);
|
Err(e) => {
|
||||||
debug!("Worker {} now healthy after health check", url);
|
debug!("Worker {} health check failed: {}", url, e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
})
|
||||||
debug!("Worker {} health check failed: {}", url, e);
|
.collect();
|
||||||
}
|
|
||||||
}
|
future::join_all(health_check_futures).await;
|
||||||
|
|
||||||
|
// 4. Check if all workers are now healthy after health checks
|
||||||
|
let still_unhealthy: Vec<_> = workers.iter().filter(|w| !w.is_healthy()).collect();
|
||||||
|
|
||||||
|
// 5. If all workers are now healthy, return immediately without sleeping
|
||||||
|
if still_unhealthy.is_empty() {
|
||||||
|
let healthy_urls: Vec<_> = workers.iter().map(|w| w.url().to_string()).collect();
|
||||||
|
info!(
|
||||||
|
"All {} workers are healthy: {:?}",
|
||||||
|
workers.len(),
|
||||||
|
healthy_urls
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 6. Otherwise, sleep before next iteration
|
||||||
tokio::time::sleep(check_interval).await;
|
tokio::time::sleep(check_interval).await;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user