[router] router circuit breaker core (#8941)

This commit is contained in:
Simo Lin
2025-08-08 09:20:22 -07:00
committed by GitHub
parent 9020f7fc32
commit 61a4680494
13 changed files with 818 additions and 18 deletions

View File

@@ -51,6 +51,7 @@ impl RouterFactory {
ctx.router_config.dp_aware,
ctx.router_config.api_key.clone(),
ctx.router_config.retry.clone(),
ctx.router_config.circuit_breaker.clone(),
)?;
Ok(Box::new(router))
@@ -81,6 +82,7 @@ impl RouterFactory {
ctx.router_config.worker_startup_timeout_secs,
ctx.router_config.worker_startup_check_interval_secs,
ctx.router_config.retry.clone(),
ctx.router_config.circuit_breaker.clone(),
)?;
Ok(Box::new(router))

View File

@@ -1,8 +1,8 @@
// PD (Prefill-Decode) Router Implementation
// This module handles routing for disaggregated prefill-decode systems
use super::pd_types::{api_path, PDRouterError};
use crate::config::types::RetryConfig;
use crate::core::{HealthChecker, Worker, WorkerFactory, WorkerLoadGuard};
use crate::config::types::{CircuitBreakerConfig as ConfigCircuitBreakerConfig, RetryConfig};
use crate::core::{CircuitBreakerConfig, HealthChecker, Worker, WorkerFactory, WorkerLoadGuard};
use crate::metrics::RouterMetrics;
use crate::openai_api_types::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
use crate::policies::LoadBalancingPolicy;
@@ -41,6 +41,7 @@ pub struct PDRouter {
// Dedicated client for prefill fire-and-forget (non-logprob) requests
pub prefill_client: Client,
pub retry_config: RetryConfig,
pub circuit_breaker_config: CircuitBreakerConfig,
_prefill_health_checker: Option<HealthChecker>,
_decode_health_checker: Option<HealthChecker>,
}
@@ -68,8 +69,12 @@ impl PDRouter {
// Wait for the new server to be healthy
self.wait_for_server_health(&url).await?;
// Create Worker for the new prefill server
let worker = WorkerFactory::create_prefill(url.clone(), bootstrap_port);
// Create Worker for the new prefill server with circuit breaker configuration
let worker = WorkerFactory::create_prefill_with_config(
url.clone(),
bootstrap_port,
self.circuit_breaker_config.clone(),
);
// Add to prefill workers list
let mut workers = self
@@ -99,8 +104,11 @@ impl PDRouter {
// Wait for the new server to be healthy
self.wait_for_server_health(&url).await?;
// Create Worker for the new decode server
let worker = WorkerFactory::create_decode(url.clone());
// Create Worker for the new decode server with circuit breaker configuration
let worker = WorkerFactory::create_decode_with_config(
url.clone(),
self.circuit_breaker_config.clone(),
);
// Add to decode workers list
let mut workers = self
@@ -189,16 +197,31 @@ impl PDRouter {
timeout_secs: u64,
interval_secs: u64,
retry_config: RetryConfig,
circuit_breaker_config: ConfigCircuitBreakerConfig,
) -> Result<Self, String> {
// Convert the config-layer CircuitBreakerConfig (timeout/window given as whole seconds) into the core CircuitBreakerConfig (timeout/window as std::time::Duration)
let core_cb_config = CircuitBreakerConfig {
failure_threshold: circuit_breaker_config.failure_threshold,
success_threshold: circuit_breaker_config.success_threshold,
timeout_duration: std::time::Duration::from_secs(
circuit_breaker_config.timeout_duration_secs,
),
window_duration: std::time::Duration::from_secs(
circuit_breaker_config.window_duration_secs,
),
};
// Convert URLs to Worker trait objects
let prefill_workers: Vec<Box<dyn Worker>> = prefill_urls
.into_iter()
.map(|(url, port)| WorkerFactory::create_prefill(url, port))
.map(|(url, port)| {
WorkerFactory::create_prefill_with_config(url, port, core_cb_config.clone())
})
.collect();
let decode_workers: Vec<Box<dyn Worker>> = decode_urls
.into_iter()
.map(WorkerFactory::create_decode)
.map(|url| WorkerFactory::create_decode_with_config(url, core_cb_config.clone()))
.collect();
// Wait for PD workers to be healthy (skip if empty - for service discovery mode)
@@ -280,6 +303,7 @@ impl PDRouter {
client,
prefill_client,
retry_config,
circuit_breaker_config: core_cb_config,
_prefill_health_checker: Some(prefill_health_checker),
_decode_health_checker: Some(decode_health_checker),
})
@@ -1848,6 +1872,7 @@ mod tests {
client: Client::new(),
prefill_client: Client::new(),
retry_config: RetryConfig::default(),
circuit_breaker_config: CircuitBreakerConfig::default(),
_prefill_health_checker: None,
_decode_health_checker: None,
}

View File

@@ -1,5 +1,5 @@
use crate::config::types::RetryConfig;
use crate::core::{HealthChecker, Worker, WorkerFactory};
use crate::config::types::{CircuitBreakerConfig as ConfigCircuitBreakerConfig, RetryConfig};
use crate::core::{CircuitBreakerConfig, HealthChecker, Worker, WorkerFactory};
use crate::metrics::RouterMetrics;
use crate::openai_api_types::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
use crate::policies::LoadBalancingPolicy;
@@ -42,6 +42,7 @@ pub struct Router {
dp_aware: bool,
api_key: Option<String>,
retry_config: RetryConfig,
circuit_breaker_config: CircuitBreakerConfig,
_worker_loads: Arc<tokio::sync::watch::Receiver<HashMap<String, isize>>>,
_load_monitor_handle: Option<Arc<tokio::task::JoinHandle<()>>>,
_health_checker: Option<HealthChecker>,
@@ -58,6 +59,7 @@ impl Router {
dp_aware: bool,
api_key: Option<String>,
retry_config: RetryConfig,
circuit_breaker_config: ConfigCircuitBreakerConfig,
) -> Result<Self, String> {
// Update active workers gauge
RouterMetrics::set_active_workers(worker_urls.len());
@@ -75,10 +77,24 @@ impl Router {
worker_urls
};
// Convert the config-layer CircuitBreakerConfig (timeout/window given as whole seconds) into the core CircuitBreakerConfig (timeout/window as std::time::Duration)
let core_cb_config = CircuitBreakerConfig {
failure_threshold: circuit_breaker_config.failure_threshold,
success_threshold: circuit_breaker_config.success_threshold,
timeout_duration: std::time::Duration::from_secs(
circuit_breaker_config.timeout_duration_secs,
),
window_duration: std::time::Duration::from_secs(
circuit_breaker_config.window_duration_secs,
),
};
// Create Worker trait objects from URLs
let workers: Vec<Box<dyn Worker>> = worker_urls
.iter()
.map(|url| WorkerFactory::create_regular(url.clone()))
.map(|url| {
WorkerFactory::create_regular_with_config(url.clone(), core_cb_config.clone())
})
.collect();
// Initialize policy with workers if needed (e.g., for cache-aware)
@@ -125,6 +141,7 @@ impl Router {
dp_aware,
api_key,
retry_config,
circuit_breaker_config: core_cb_config,
_worker_loads: worker_loads,
_load_monitor_handle: load_monitor_handle,
_health_checker: Some(health_checker),
@@ -752,7 +769,10 @@ impl Router {
continue;
}
info!("Added worker: {}", dp_url);
let new_worker = WorkerFactory::create_regular(dp_url.to_string());
let new_worker = WorkerFactory::create_regular_with_config(
dp_url.to_string(),
self.circuit_breaker_config.clone(),
);
workers_guard.push(new_worker);
worker_added = true;
}
@@ -764,7 +784,10 @@ impl Router {
return Err(format!("Worker {} already exists", worker_url));
}
info!("Added worker: {}", worker_url);
let new_worker = WorkerFactory::create_regular(worker_url.to_string());
let new_worker = WorkerFactory::create_regular_with_config(
worker_url.to_string(),
self.circuit_breaker_config.clone(),
);
workers_guard.push(new_worker);
}
@@ -1223,6 +1246,7 @@ mod tests {
api_key: None,
client: Client::new(),
retry_config: RetryConfig::default(),
circuit_breaker_config: CircuitBreakerConfig::default(),
_worker_loads: Arc::new(rx),
_load_monitor_handle: None,
_health_checker: None,