[router] allow more health check configuration (#9198)
This commit is contained in:
@@ -49,6 +49,8 @@ pub struct RouterConfig {
|
||||
/// Disable circuit breaker (overrides circuit_breaker.failure_threshold to u32::MAX when true)
|
||||
#[serde(default)]
|
||||
pub disable_circuit_breaker: bool,
|
||||
/// Health check configuration
|
||||
pub health_check: HealthCheckConfig,
|
||||
}
|
||||
|
||||
/// Routing mode configuration
|
||||
@@ -183,7 +185,7 @@ impl Default for DiscoveryConfig {
|
||||
enabled: false,
|
||||
namespace: None,
|
||||
port: 8000,
|
||||
check_interval_secs: 60,
|
||||
check_interval_secs: 120,
|
||||
selector: HashMap::new(),
|
||||
prefill_selector: HashMap::new(),
|
||||
decode_selector: HashMap::new(),
|
||||
@@ -212,17 +214,44 @@ pub struct RetryConfig {
|
||||
impl Default for RetryConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_retries: 3,
|
||||
initial_backoff_ms: 100,
|
||||
max_backoff_ms: 10000,
|
||||
backoff_multiplier: 2.0,
|
||||
jitter_factor: 0.1,
|
||||
max_retries: 5,
|
||||
initial_backoff_ms: 50,
|
||||
max_backoff_ms: 30000,
|
||||
backoff_multiplier: 1.5,
|
||||
jitter_factor: 0.2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_retry_jitter_factor() -> f32 {
|
||||
0.1
|
||||
0.2
|
||||
}
|
||||
|
||||
/// Health check configuration for worker monitoring
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HealthCheckConfig {
|
||||
/// Number of consecutive failures before marking unhealthy
|
||||
pub failure_threshold: u32,
|
||||
/// Number of consecutive successes before marking healthy
|
||||
pub success_threshold: u32,
|
||||
/// Timeout for health check requests in seconds
|
||||
pub timeout_secs: u64,
|
||||
/// Interval between health checks in seconds
|
||||
pub check_interval_secs: u64,
|
||||
/// Health check endpoint path
|
||||
pub endpoint: String,
|
||||
}
|
||||
|
||||
impl Default for HealthCheckConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
failure_threshold: 3,
|
||||
success_threshold: 2,
|
||||
timeout_secs: 5,
|
||||
check_interval_secs: 60,
|
||||
endpoint: "/health".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Circuit breaker configuration for worker reliability
|
||||
@@ -241,10 +270,10 @@ pub struct CircuitBreakerConfig {
|
||||
impl Default for CircuitBreakerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
failure_threshold: 5,
|
||||
success_threshold: 2,
|
||||
timeout_duration_secs: 30,
|
||||
window_duration_secs: 60,
|
||||
failure_threshold: 10,
|
||||
success_threshold: 3,
|
||||
timeout_duration_secs: 60,
|
||||
window_duration_secs: 120,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -276,10 +305,10 @@ impl Default for RouterConfig {
|
||||
policy: PolicyConfig::Random,
|
||||
host: "127.0.0.1".to_string(),
|
||||
port: 3001,
|
||||
max_payload_size: 268_435_456, // 256MB
|
||||
request_timeout_secs: 3600, // 1 hour to match Python mini LB
|
||||
worker_startup_timeout_secs: 300,
|
||||
worker_startup_check_interval_secs: 10,
|
||||
max_payload_size: 536_870_912, // 512MB
|
||||
request_timeout_secs: 1800, // 30 minutes
|
||||
worker_startup_timeout_secs: 600,
|
||||
worker_startup_check_interval_secs: 30,
|
||||
dp_aware: false,
|
||||
api_key: None,
|
||||
discovery: None,
|
||||
@@ -287,12 +316,13 @@ impl Default for RouterConfig {
|
||||
log_dir: None,
|
||||
log_level: None,
|
||||
request_id_headers: None,
|
||||
max_concurrent_requests: 64,
|
||||
max_concurrent_requests: 256,
|
||||
cors_allowed_origins: vec![],
|
||||
retry: RetryConfig::default(),
|
||||
circuit_breaker: CircuitBreakerConfig::default(),
|
||||
disable_retries: false,
|
||||
disable_circuit_breaker: false,
|
||||
health_check: HealthCheckConfig::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -365,10 +395,10 @@ mod tests {
|
||||
assert!(matches!(config.policy, PolicyConfig::Random));
|
||||
assert_eq!(config.host, "127.0.0.1");
|
||||
assert_eq!(config.port, 3001);
|
||||
assert_eq!(config.max_payload_size, 268_435_456);
|
||||
assert_eq!(config.request_timeout_secs, 3600);
|
||||
assert_eq!(config.worker_startup_timeout_secs, 300);
|
||||
assert_eq!(config.worker_startup_check_interval_secs, 10);
|
||||
assert_eq!(config.max_payload_size, 536_870_912);
|
||||
assert_eq!(config.request_timeout_secs, 1800);
|
||||
assert_eq!(config.worker_startup_timeout_secs, 600);
|
||||
assert_eq!(config.worker_startup_check_interval_secs, 30);
|
||||
assert!(config.discovery.is_none());
|
||||
assert!(config.metrics.is_none());
|
||||
assert!(config.log_dir.is_none());
|
||||
@@ -425,6 +455,7 @@ mod tests {
|
||||
circuit_breaker: CircuitBreakerConfig::default(),
|
||||
disable_retries: false,
|
||||
disable_circuit_breaker: false,
|
||||
health_check: HealthCheckConfig::default(),
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&config).unwrap();
|
||||
@@ -614,7 +645,7 @@ mod tests {
|
||||
assert!(!config.enabled);
|
||||
assert!(config.namespace.is_none());
|
||||
assert_eq!(config.port, 8000);
|
||||
assert_eq!(config.check_interval_secs, 60);
|
||||
assert_eq!(config.check_interval_secs, 120);
|
||||
assert!(config.selector.is_empty());
|
||||
assert!(config.prefill_selector.is_empty());
|
||||
assert!(config.decode_selector.is_empty());
|
||||
@@ -856,6 +887,7 @@ mod tests {
|
||||
circuit_breaker: CircuitBreakerConfig::default(),
|
||||
disable_retries: false,
|
||||
disable_circuit_breaker: false,
|
||||
health_check: HealthCheckConfig::default(),
|
||||
};
|
||||
|
||||
assert!(config.mode.is_pd_mode());
|
||||
@@ -911,6 +943,7 @@ mod tests {
|
||||
circuit_breaker: CircuitBreakerConfig::default(),
|
||||
disable_retries: false,
|
||||
disable_circuit_breaker: false,
|
||||
health_check: HealthCheckConfig::default(),
|
||||
};
|
||||
|
||||
assert!(!config.mode.is_pd_mode());
|
||||
@@ -962,6 +995,7 @@ mod tests {
|
||||
circuit_breaker: CircuitBreakerConfig::default(),
|
||||
disable_retries: false,
|
||||
disable_circuit_breaker: false,
|
||||
health_check: HealthCheckConfig::default(),
|
||||
};
|
||||
|
||||
assert!(config.has_service_discovery());
|
||||
|
||||
Reference in New Issue
Block a user