[pd-router] add retry and circuit breaker for pd router (#9051)

This commit is contained in:
Simo Lin
2025-08-11 05:53:26 -07:00
committed by GitHub
parent a6452b7188
commit 6f81a710f7
4 changed files with 236 additions and 177 deletions

View File

@@ -16,7 +16,7 @@ pub use circuit_breaker::{
CircuitBreaker, CircuitBreakerConfig, CircuitBreakerStats, CircuitState,
};
pub use error::{WorkerError, WorkerResult};
pub use retry::{BackoffCalculator, RetryError, RetryExecutor};
pub use retry::{is_retryable_status, BackoffCalculator, RetryError, RetryExecutor};
pub use worker::{
start_health_checker, BasicWorker, DPAwareWorker, HealthChecker, Worker, WorkerCollection,
WorkerFactory, WorkerLoadGuard, WorkerType,

View File

@@ -1,9 +1,23 @@
use crate::config::types::RetryConfig;
use axum::http::StatusCode;
use axum::response::Response;
use rand::Rng;
use std::time::Duration;
use tracing::debug;
/// Reports whether an HTTP status code represents a failure worth retrying.
///
/// Returns `true` for 408 Request Timeout, 429 Too Many Requests,
/// 500 Internal Server Error, 502 Bad Gateway, 503 Service Unavailable and
/// 504 Gateway Timeout; every other status yields `false`.
pub fn is_retryable_status(status: StatusCode) -> bool {
    // Closed set of statuses treated as retryable.
    const RETRYABLE: [StatusCode; 6] = [
        StatusCode::REQUEST_TIMEOUT,
        StatusCode::TOO_MANY_REQUESTS,
        StatusCode::INTERNAL_SERVER_ERROR,
        StatusCode::BAD_GATEWAY,
        StatusCode::SERVICE_UNAVAILABLE,
        StatusCode::GATEWAY_TIMEOUT,
    ];

    RETRYABLE.contains(&status)
}
/// Computes exponential backoff with optional jitter.
///
/// This is a field-less unit type: it holds no state of its own, and the
/// delay computation lives in its `impl` block.
#[derive(Debug, Clone)]
pub struct BackoffCalculator;
@@ -21,8 +35,8 @@ impl BackoffCalculator {
// Apply jitter in range [-j, +j]
let jitter = config.jitter_factor.max(0.0).min(1.0);
if jitter > 0.0 {
let mut rng = rand::thread_rng();
let jitter_scale: f32 = rng.gen_range(-jitter..=jitter);
let mut rng = rand::rng();
let jitter_scale: f32 = rng.random_range(-jitter..=jitter);
let jitter_ms = (delay_ms as f32 * jitter_scale)
.round()
.max(-(delay_ms as f32));