[router] router circuit breaker core (#8941)
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
use super::{WorkerError, WorkerResult};
|
||||
use super::{CircuitBreaker, CircuitBreakerConfig, WorkerError, WorkerResult};
|
||||
use async_trait::async_trait;
|
||||
use futures;
|
||||
use serde_json;
|
||||
@@ -66,6 +66,19 @@ pub trait Worker: Send + Sync + fmt::Debug {
|
||||
/// Clone the worker (for trait objects)
|
||||
fn clone_worker(&self) -> Box<dyn Worker>;
|
||||
|
||||
/// Get the circuit breaker for this worker
|
||||
fn circuit_breaker(&self) -> &CircuitBreaker;
|
||||
|
||||
/// Check if the worker is available (healthy + circuit closed/half-open)
|
||||
fn is_available(&self) -> bool {
|
||||
self.is_healthy() && self.circuit_breaker().can_execute()
|
||||
}
|
||||
|
||||
/// Record the outcome of a request to this worker
|
||||
fn record_outcome(&self, success: bool) {
|
||||
self.circuit_breaker().record_outcome(success);
|
||||
}
|
||||
|
||||
// === DP-aware methods ===
|
||||
|
||||
/// Check if this worker is DP-aware
|
||||
@@ -172,6 +185,7 @@ pub struct BasicWorker {
|
||||
load_counter: Arc<AtomicUsize>,
|
||||
processed_counter: Arc<AtomicUsize>,
|
||||
healthy: Arc<AtomicBool>,
|
||||
circuit_breaker: CircuitBreaker,
|
||||
}
|
||||
|
||||
impl BasicWorker {
|
||||
@@ -188,6 +202,7 @@ impl BasicWorker {
|
||||
load_counter: Arc::new(AtomicUsize::new(0)),
|
||||
processed_counter: Arc::new(AtomicUsize::new(0)),
|
||||
healthy: Arc::new(AtomicBool::new(true)),
|
||||
circuit_breaker: CircuitBreaker::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,6 +216,11 @@ impl BasicWorker {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_circuit_breaker_config(mut self, config: CircuitBreakerConfig) -> Self {
|
||||
self.circuit_breaker = CircuitBreaker::with_config(config);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn normalised_url(&self) -> WorkerResult<&str> {
|
||||
if self.url().contains("@") {
|
||||
// Need to extract the URL from "http://host:port@dp_rank"
|
||||
@@ -304,6 +324,10 @@ impl Worker for BasicWorker {
|
||||
fn clone_worker(&self) -> Box<dyn Worker> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
|
||||
fn circuit_breaker(&self) -> &CircuitBreaker {
|
||||
&self.circuit_breaker
|
||||
}
|
||||
}
|
||||
|
||||
/// A DP-aware worker that handles data-parallel routing
|
||||
@@ -421,6 +445,10 @@ impl Worker for DPAwareWorker {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
|
||||
fn circuit_breaker(&self) -> &CircuitBreaker {
|
||||
self.base_worker.circuit_breaker()
|
||||
}
|
||||
|
||||
// DP-aware specific implementations
|
||||
|
||||
fn is_dp_aware(&self) -> bool {
|
||||
@@ -469,6 +497,17 @@ impl WorkerFactory {
|
||||
Box::new(BasicWorker::new(url, WorkerType::Regular))
|
||||
}
|
||||
|
||||
/// Create a regular worker with custom circuit breaker configuration
|
||||
pub fn create_regular_with_config(
|
||||
url: String,
|
||||
circuit_breaker_config: CircuitBreakerConfig,
|
||||
) -> Box<dyn Worker> {
|
||||
Box::new(
|
||||
BasicWorker::new(url, WorkerType::Regular)
|
||||
.with_circuit_breaker_config(circuit_breaker_config),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a prefill worker with optional bootstrap port
|
||||
pub fn create_prefill(url: String, bootstrap_port: Option<u16>) -> Box<dyn Worker> {
|
||||
Box::new(BasicWorker::new(
|
||||
@@ -477,11 +516,34 @@ impl WorkerFactory {
|
||||
))
|
||||
}
|
||||
|
||||
/// Create a prefill worker with custom circuit breaker configuration
|
||||
pub fn create_prefill_with_config(
|
||||
url: String,
|
||||
bootstrap_port: Option<u16>,
|
||||
circuit_breaker_config: CircuitBreakerConfig,
|
||||
) -> Box<dyn Worker> {
|
||||
Box::new(
|
||||
BasicWorker::new(url, WorkerType::Prefill { bootstrap_port })
|
||||
.with_circuit_breaker_config(circuit_breaker_config),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a decode worker
|
||||
pub fn create_decode(url: String) -> Box<dyn Worker> {
|
||||
Box::new(BasicWorker::new(url, WorkerType::Decode))
|
||||
}
|
||||
|
||||
/// Create a decode worker with custom circuit breaker configuration
|
||||
pub fn create_decode_with_config(
|
||||
url: String,
|
||||
circuit_breaker_config: CircuitBreakerConfig,
|
||||
) -> Box<dyn Worker> {
|
||||
Box::new(
|
||||
BasicWorker::new(url, WorkerType::Decode)
|
||||
.with_circuit_breaker_config(circuit_breaker_config),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create workers from URLs with automatic type detection
|
||||
pub fn create_from_urls(
|
||||
regular_urls: Vec<String>,
|
||||
@@ -796,6 +858,7 @@ pub fn start_health_checker(
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tokio::time::timeout;
|
||||
|
||||
@@ -1574,6 +1637,94 @@ mod tests {
|
||||
assert_eq!(workers[1].url(), "http://w2:8080");
|
||||
}
|
||||
|
||||
// ===== Circuit Breaker Integration Tests =====
|
||||
|
||||
#[test]
|
||||
fn test_worker_circuit_breaker() {
|
||||
let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
|
||||
|
||||
// Initial state should be available
|
||||
assert!(worker.is_available());
|
||||
assert_eq!(
|
||||
worker.circuit_breaker().state(),
|
||||
crate::core::CircuitState::Closed
|
||||
);
|
||||
|
||||
// Record some failures
|
||||
worker.record_outcome(false);
|
||||
worker.record_outcome(false);
|
||||
|
||||
// Still available (default threshold is 5)
|
||||
assert!(worker.is_available());
|
||||
|
||||
// Record more failures to open circuit
|
||||
worker.record_outcome(false);
|
||||
worker.record_outcome(false);
|
||||
worker.record_outcome(false);
|
||||
|
||||
// Circuit should be open, worker not available
|
||||
assert!(!worker.is_available());
|
||||
assert!(worker.is_healthy()); // Still healthy
|
||||
assert!(!worker.circuit_breaker().can_execute()); // But circuit is open
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_worker_with_circuit_breaker_config() {
|
||||
let config = crate::core::CircuitBreakerConfig {
|
||||
failure_threshold: 2,
|
||||
success_threshold: 1,
|
||||
timeout_duration: Duration::from_millis(100),
|
||||
window_duration: Duration::from_secs(60),
|
||||
};
|
||||
|
||||
let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular)
|
||||
.with_circuit_breaker_config(config);
|
||||
|
||||
// Should open after 2 failures
|
||||
worker.record_outcome(false);
|
||||
assert!(worker.is_available());
|
||||
worker.record_outcome(false);
|
||||
assert!(!worker.is_available());
|
||||
|
||||
// Wait for timeout
|
||||
thread::sleep(Duration::from_millis(150));
|
||||
|
||||
// Should be half-open
|
||||
assert!(worker.is_available());
|
||||
assert_eq!(
|
||||
worker.circuit_breaker().state(),
|
||||
crate::core::CircuitState::HalfOpen
|
||||
);
|
||||
|
||||
// Success should close it
|
||||
worker.record_outcome(true);
|
||||
assert_eq!(
|
||||
worker.circuit_breaker().state(),
|
||||
crate::core::CircuitState::Closed
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dp_aware_worker_circuit_breaker() {
|
||||
let dp_worker =
|
||||
DPAwareWorker::new("http://worker:8080".to_string(), 0, 2, WorkerType::Regular);
|
||||
|
||||
// Should have circuit breaker
|
||||
assert!(dp_worker.is_available());
|
||||
|
||||
// Record failures
|
||||
for _ in 0..5 {
|
||||
dp_worker.record_outcome(false);
|
||||
}
|
||||
|
||||
// Should not be available
|
||||
assert!(!dp_worker.is_available());
|
||||
assert_eq!(
|
||||
dp_worker.circuit_breaker().state(),
|
||||
crate::core::CircuitState::Open
|
||||
);
|
||||
}
|
||||
|
||||
// ===== Integration tests =====
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
Reference in New Issue
Block a user