[router] allow more health check configuration (#9198)

This commit is contained in:
Simo Lin
2025-08-15 08:07:45 -07:00
committed by GitHub
parent 0c8594e67d
commit 21b8846066
15 changed files with 398 additions and 166 deletions

View File

@@ -49,6 +49,8 @@ pub struct RouterConfig {
/// Disable circuit breaker (overrides circuit_breaker.failure_threshold to u32::MAX when true)
#[serde(default)]
pub disable_circuit_breaker: bool,
/// Health check configuration
pub health_check: HealthCheckConfig,
}
/// Routing mode configuration
@@ -183,7 +185,7 @@ impl Default for DiscoveryConfig {
enabled: false,
namespace: None,
port: 8000,
check_interval_secs: 60,
check_interval_secs: 120,
selector: HashMap::new(),
prefill_selector: HashMap::new(),
decode_selector: HashMap::new(),
@@ -212,17 +214,44 @@ pub struct RetryConfig {
impl Default for RetryConfig {
// NOTE(review): this block is shown as a diff rendering without +/- markers, so
// every field below appears twice. Presumably the first group of five values is
// the removed side and the second group is the added side — confirm against the
// commit before relying on either set.
/// Returns the baseline retry settings used when none are supplied.
fn default() -> Self {
Self {
// First occurrence of each field (presumably the pre-change defaults).
max_retries: 3,
initial_backoff_ms: 100,
max_backoff_ms: 10000,
backoff_multiplier: 2.0,
jitter_factor: 0.1,
// Second occurrence of each field (presumably the post-change defaults).
max_retries: 5,
initial_backoff_ms: 50,
max_backoff_ms: 30000,
backoff_multiplier: 1.5,
jitter_factor: 0.2,
}
}
}
/// Returns the default jitter factor for retries.
///
/// The two literals match the two `jitter_factor` values listed in
/// `RetryConfig::default()`.
// NOTE(review): this is a diff rendering without +/- markers; `0.1` is presumably
// the removed value and `0.2` its replacement — confirm against the commit.
fn default_retry_jitter_factor() -> f32 {
0.1
0.2
}
/// Health check configuration for worker monitoring.
///
/// The `Default` implementation for this type yields `failure_threshold = 3`,
/// `success_threshold = 2`, `timeout_secs = 5`, `check_interval_secs = 60` and
/// `endpoint = "/health"`.
// NOTE(review): no `#[serde(default)]` is visible on these fields, so
// deserialization presumably requires all five to be present — confirm whether
// that is intended for existing configuration files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckConfig {
/// Number of consecutive failed health checks before a worker is marked unhealthy.
pub failure_threshold: u32,
/// Number of consecutive successful health checks before a worker is marked healthy.
pub success_threshold: u32,
/// Timeout for a single health check request, in seconds.
pub timeout_secs: u64,
/// Interval between health checks, in seconds.
pub check_interval_secs: u64,
/// Health check endpoint path (for example `"/health"`).
pub endpoint: String,
}
impl Default for HealthCheckConfig {
fn default() -> Self {
Self {
failure_threshold: 3,
success_threshold: 2,
timeout_secs: 5,
check_interval_secs: 60,
endpoint: "/health".to_string(),
}
}
}
/// Circuit breaker configuration for worker reliability
@@ -241,10 +270,10 @@ pub struct CircuitBreakerConfig {
impl Default for CircuitBreakerConfig {
// NOTE(review): this block is shown as a diff rendering without +/- markers, so
// every field below appears twice. Presumably the first group of four values is
// the removed side and the second group is the added side — confirm against the
// commit before relying on either set.
/// Returns the baseline circuit breaker settings used when none are supplied.
fn default() -> Self {
Self {
// First occurrence of each field (presumably the pre-change defaults).
failure_threshold: 5,
success_threshold: 2,
timeout_duration_secs: 30,
window_duration_secs: 60,
// Second occurrence of each field (presumably the post-change defaults).
failure_threshold: 10,
success_threshold: 3,
timeout_duration_secs: 60,
window_duration_secs: 120,
}
}
}
@@ -276,10 +305,10 @@ impl Default for RouterConfig {
policy: PolicyConfig::Random,
host: "127.0.0.1".to_string(),
port: 3001,
max_payload_size: 268_435_456, // 256MB
request_timeout_secs: 3600, // 1 hour to match Python mini LB
worker_startup_timeout_secs: 300,
worker_startup_check_interval_secs: 10,
max_payload_size: 536_870_912, // 512MB
request_timeout_secs: 1800, // 30 minutes
worker_startup_timeout_secs: 600,
worker_startup_check_interval_secs: 30,
dp_aware: false,
api_key: None,
discovery: None,
@@ -287,12 +316,13 @@ impl Default for RouterConfig {
log_dir: None,
log_level: None,
request_id_headers: None,
max_concurrent_requests: 64,
max_concurrent_requests: 256,
cors_allowed_origins: vec![],
retry: RetryConfig::default(),
circuit_breaker: CircuitBreakerConfig::default(),
disable_retries: false,
disable_circuit_breaker: false,
health_check: HealthCheckConfig::default(),
}
}
}
@@ -365,10 +395,10 @@ mod tests {
assert!(matches!(config.policy, PolicyConfig::Random));
assert_eq!(config.host, "127.0.0.1");
assert_eq!(config.port, 3001);
assert_eq!(config.max_payload_size, 268_435_456);
assert_eq!(config.request_timeout_secs, 3600);
assert_eq!(config.worker_startup_timeout_secs, 300);
assert_eq!(config.worker_startup_check_interval_secs, 10);
assert_eq!(config.max_payload_size, 536_870_912);
assert_eq!(config.request_timeout_secs, 1800);
assert_eq!(config.worker_startup_timeout_secs, 600);
assert_eq!(config.worker_startup_check_interval_secs, 30);
assert!(config.discovery.is_none());
assert!(config.metrics.is_none());
assert!(config.log_dir.is_none());
@@ -425,6 +455,7 @@ mod tests {
circuit_breaker: CircuitBreakerConfig::default(),
disable_retries: false,
disable_circuit_breaker: false,
health_check: HealthCheckConfig::default(),
};
let json = serde_json::to_string(&config).unwrap();
@@ -614,7 +645,7 @@ mod tests {
assert!(!config.enabled);
assert!(config.namespace.is_none());
assert_eq!(config.port, 8000);
assert_eq!(config.check_interval_secs, 60);
assert_eq!(config.check_interval_secs, 120);
assert!(config.selector.is_empty());
assert!(config.prefill_selector.is_empty());
assert!(config.decode_selector.is_empty());
@@ -856,6 +887,7 @@ mod tests {
circuit_breaker: CircuitBreakerConfig::default(),
disable_retries: false,
disable_circuit_breaker: false,
health_check: HealthCheckConfig::default(),
};
assert!(config.mode.is_pd_mode());
@@ -911,6 +943,7 @@ mod tests {
circuit_breaker: CircuitBreakerConfig::default(),
disable_retries: false,
disable_circuit_breaker: false,
health_check: HealthCheckConfig::default(),
};
assert!(!config.mode.is_pd_mode());
@@ -962,6 +995,7 @@ mod tests {
circuit_breaker: CircuitBreakerConfig::default(),
disable_retries: false,
disable_circuit_breaker: false,
health_check: HealthCheckConfig::default(),
};
assert!(config.has_service_discovery());