[router] router circuit breaker core (#8941)

This commit is contained in:
Simo Lin
2025-08-08 09:20:22 -07:00
committed by GitHub
parent 9020f7fc32
commit 61a4680494
13 changed files with 818 additions and 18 deletions

View File

@@ -1,4 +1,4 @@
use super::{WorkerError, WorkerResult};
use super::{CircuitBreaker, CircuitBreakerConfig, WorkerError, WorkerResult};
use async_trait::async_trait;
use futures;
use serde_json;
@@ -66,6 +66,19 @@ pub trait Worker: Send + Sync + fmt::Debug {
/// Clone the worker (for trait objects)
fn clone_worker(&self) -> Box<dyn Worker>;
/// Get the circuit breaker for this worker.
///
/// The default `is_available` and `record_outcome` methods of this trait
/// both go through this accessor, so an implementor only has to expose its
/// breaker here to get that behaviour.
fn circuit_breaker(&self) -> &CircuitBreaker;
/// Report whether this worker may currently be handed a request.
///
/// Returns `true` only when `is_healthy()` holds and the circuit breaker's
/// `can_execute()` also returns `true`. The breaker is not consulted at all
/// for an unhealthy worker.
fn is_available(&self) -> bool {
    if !self.is_healthy() {
        return false;
    }
    self.circuit_breaker().can_execute()
}
/// Report the result of a request that was sent to this worker.
///
/// `success` is forwarded unchanged to the worker's circuit breaker.
fn record_outcome(&self, success: bool) {
    let breaker = self.circuit_breaker();
    breaker.record_outcome(success);
}
// === DP-aware methods ===
/// Check if this worker is DP-aware
@@ -172,6 +185,7 @@ pub struct BasicWorker {
load_counter: Arc<AtomicUsize>,
processed_counter: Arc<AtomicUsize>,
healthy: Arc<AtomicBool>,
circuit_breaker: CircuitBreaker,
}
impl BasicWorker {
@@ -188,6 +202,7 @@ impl BasicWorker {
load_counter: Arc::new(AtomicUsize::new(0)),
processed_counter: Arc::new(AtomicUsize::new(0)),
healthy: Arc::new(AtomicBool::new(true)),
circuit_breaker: CircuitBreaker::new(),
}
}
@@ -201,6 +216,11 @@ impl BasicWorker {
self
}
/// Builder-style setter: replace this worker's circuit breaker with a new
/// one constructed from `config`, then hand the worker back for chaining.
pub fn with_circuit_breaker_config(mut self, config: CircuitBreakerConfig) -> Self {
    let breaker = CircuitBreaker::with_config(config);
    self.circuit_breaker = breaker;
    self
}
pub fn normalised_url(&self) -> WorkerResult<&str> {
if self.url().contains("@") {
// Need to extract the URL from "http://host:port@dp_rank"
@@ -304,6 +324,10 @@ impl Worker for BasicWorker {
fn clone_worker(&self) -> Box<dyn Worker> {
Box::new(self.clone())
}
fn circuit_breaker(&self) -> &CircuitBreaker {
&self.circuit_breaker
}
}
/// A DP-aware worker that handles data-parallel routing
@@ -421,6 +445,10 @@ impl Worker for DPAwareWorker {
Box::new(self.clone())
}
/// Borrow the circuit breaker of the wrapped base worker; this type keeps
/// no breaker of its own in this accessor.
fn circuit_breaker(&self) -> &CircuitBreaker {
    let inner = &self.base_worker;
    inner.circuit_breaker()
}
// DP-aware specific implementations
fn is_dp_aware(&self) -> bool {
@@ -469,6 +497,17 @@ impl WorkerFactory {
Box::new(BasicWorker::new(url, WorkerType::Regular))
}
/// Build a boxed `WorkerType::Regular` worker for `url` whose circuit
/// breaker is constructed from `circuit_breaker_config`.
pub fn create_regular_with_config(
    url: String,
    circuit_breaker_config: CircuitBreakerConfig,
) -> Box<dyn Worker> {
    let worker = BasicWorker::new(url, WorkerType::Regular);
    Box::new(worker.with_circuit_breaker_config(circuit_breaker_config))
}
/// Create a prefill worker with optional bootstrap port
pub fn create_prefill(url: String, bootstrap_port: Option<u16>) -> Box<dyn Worker> {
Box::new(BasicWorker::new(
@@ -477,11 +516,34 @@ impl WorkerFactory {
))
}
/// Build a boxed `WorkerType::Prefill` worker for `url`, carrying the given
/// optional `bootstrap_port`, whose circuit breaker is constructed from
/// `circuit_breaker_config`.
pub fn create_prefill_with_config(
    url: String,
    bootstrap_port: Option<u16>,
    circuit_breaker_config: CircuitBreakerConfig,
) -> Box<dyn Worker> {
    let worker_type = WorkerType::Prefill { bootstrap_port };
    let worker = BasicWorker::new(url, worker_type);
    Box::new(worker.with_circuit_breaker_config(circuit_breaker_config))
}
/// Build a boxed `WorkerType::Decode` worker for `url`.
pub fn create_decode(url: String) -> Box<dyn Worker> {
    let worker = BasicWorker::new(url, WorkerType::Decode);
    Box::new(worker)
}
/// Build a boxed `WorkerType::Decode` worker for `url` whose circuit
/// breaker is constructed from `circuit_breaker_config`.
pub fn create_decode_with_config(
    url: String,
    circuit_breaker_config: CircuitBreakerConfig,
) -> Box<dyn Worker> {
    let worker = BasicWorker::new(url, WorkerType::Decode);
    Box::new(worker.with_circuit_breaker_config(circuit_breaker_config))
}
/// Create workers from URLs with automatic type detection
pub fn create_from_urls(
regular_urls: Vec<String>,
@@ -796,6 +858,7 @@ pub fn start_health_checker(
mod tests {
use super::*;
use std::sync::RwLock;
use std::thread;
use std::time::Duration;
use tokio::time::timeout;
@@ -1574,6 +1637,94 @@ mod tests {
assert_eq!(workers[1].url(), "http://w2:8080");
}
// ===== Circuit Breaker Integration Tests =====
#[test]
fn test_worker_circuit_breaker() {
    let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);

    // A freshly built worker is usable and its circuit starts closed.
    assert!(worker.is_available());
    assert_eq!(
        worker.circuit_breaker().state(),
        crate::core::CircuitState::Closed
    );

    // Two failures stay below the default threshold of five.
    for _ in 0..2 {
        worker.record_outcome(false);
    }
    assert!(worker.is_available());

    // Three more bring the total to five, which trips the breaker.
    for _ in 0..3 {
        worker.record_outcome(false);
    }

    // The worker is now unavailable purely because of the breaker: the
    // health flag is untouched while the breaker refuses execution.
    assert!(!worker.is_available());
    assert!(worker.is_healthy());
    assert!(!worker.circuit_breaker().can_execute());
}
#[test]
// Walks a worker built with a custom breaker config through a full cycle:
// closed -> unavailable after two failures -> half-open after the timeout
// -> closed again after a single success.
fn test_worker_with_circuit_breaker_config() {
// Low thresholds and a 100 ms timeout keep the test short; the sleep below
// (150 ms) must stay longer than `timeout_duration`.
let config = crate::core::CircuitBreakerConfig {
failure_threshold: 2,
success_threshold: 1,
timeout_duration: Duration::from_millis(100),
window_duration: Duration::from_secs(60),
};
let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular)
.with_circuit_breaker_config(config);
// Should open after 2 failures
worker.record_outcome(false);
assert!(worker.is_available());
worker.record_outcome(false);
assert!(!worker.is_available());
// Wait for timeout
// This is a blocking sleep on the test thread (plain `#[test]`, not async).
thread::sleep(Duration::from_millis(150));
// Should be half-open
// NOTE(review): the state check deliberately follows `is_available()`;
// presumably `can_execute()` is what moves the breaker from Open to
// HalfOpen -- confirm against CircuitBreaker before reordering these.
assert!(worker.is_available());
assert_eq!(
worker.circuit_breaker().state(),
crate::core::CircuitState::HalfOpen
);
// Success should close it
// One success is enough because `success_threshold` is 1 in this config.
worker.record_outcome(true);
assert_eq!(
worker.circuit_breaker().state(),
crate::core::CircuitState::Closed
);
}
#[test]
fn test_dp_aware_worker_circuit_breaker() {
    let dp_worker =
        DPAwareWorker::new("http://worker:8080".to_string(), 0, 2, WorkerType::Regular);

    // A new DP-aware worker exposes a breaker and starts out available.
    assert!(dp_worker.is_available());

    // Five consecutive failures are reported through the DP-aware wrapper.
    (0..5).for_each(|_| dp_worker.record_outcome(false));

    // The breaker has opened, so the worker is no longer offered.
    assert!(!dp_worker.is_available());
    let state = dp_worker.circuit_breaker().state();
    assert_eq!(state, crate::core::CircuitState::Open);
}
// ===== Integration tests =====
#[tokio::test]