[router] consolidate health endpoints and flush cache (#10876)

This commit is contained in:
Simo Lin
2025-09-24 18:23:21 -04:00
committed by GitHub
parent 4a87ba217f
commit 7a06ef984d
12 changed files with 220 additions and 424 deletions

View File

@@ -829,25 +829,6 @@ impl RouterTrait for Router {
self
}
/// Reports the aggregate health of all registered workers.
///
/// Responds with `200 OK` when no worker in the registry reports itself as
/// unhealthy, and with `503 Service Unavailable` plus the list of unhealthy
/// worker URLs otherwise. The incoming request is not inspected.
async fn health(&self, _req: Request<Body>) -> Response {
    let workers = self.worker_registry.get_all();

    // Gather the URL of every worker that currently fails its health flag.
    let mut unhealthy_servers = Vec::new();
    for worker in workers.iter() {
        if !worker.is_healthy() {
            unhealthy_servers.push(worker.url().to_string());
        }
    }

    if !unhealthy_servers.is_empty() {
        let message = format!("Unhealthy servers: {:?}", unhealthy_servers);
        return (StatusCode::SERVICE_UNAVAILABLE, message).into_response();
    }

    (StatusCode::OK, "All servers healthy").into_response()
}
/// Handles the generation health check by delegating to `proxy_get_request`
/// with the `"health_generate"` route; the response produced by that call is
/// returned unchanged.
async fn health_generate(&self, req: Request<Body>) -> Response {
self.proxy_get_request(req, "health_generate").await
}
@@ -972,68 +953,6 @@ impl RouterTrait for Router {
}
}
async fn flush_cache(&self) -> Response {
// Get all workers
let workers = self.worker_registry.get_all();
let worker_urls: Vec<String> = workers.iter().map(|w| w.url().to_string()).collect();
// Send requests to all workers concurrently without headers
let mut tasks = Vec::new();
for worker_url in &worker_urls {
// Get the worker's API key if available
let api_key = self
.worker_registry
.get_by_url(worker_url)
.and_then(|w| w.api_key().clone());
let worker_url = if self.dp_aware {
// Need to extract the URL from "http://host:port@dp_rank"
let (worker_url_prefix, _dp_rank) = match Self::extract_dp_rank(worker_url) {
Ok(tup) => tup,
Err(e) => {
error!("Failed to extract dp_rank: {}", e);
return (
StatusCode::INTERNAL_SERVER_ERROR,
format!("Failed to extract dp_rank: {}", e),
)
.into_response();
}
};
worker_url_prefix
} else {
worker_url
};
let mut request_builder = self.client.post(format!("{}/flush_cache", worker_url));
if let Some(key) = api_key {
request_builder =
request_builder.header("Authorization", format!("Bearer {}", key));
}
tasks.push(request_builder.send());
}
// Wait for all responses
let results = futures_util::future::join_all(tasks).await;
// Check if all succeeded
let all_success = results.iter().all(|r| {
r.as_ref()
.map(|res| res.status().is_success())
.unwrap_or(false)
});
if all_success {
(StatusCode::OK, "Cache flushed on all servers").into_response()
} else {
(
StatusCode::INTERNAL_SERVER_ERROR,
"Cache flush failed on one or more servers",
)
.into_response()
}
}
async fn get_worker_loads(&self) -> Response {
let urls_with_key = self.worker_registry.get_all_urls_with_api_key();
let mut loads = Vec::new();
@@ -1056,32 +975,6 @@ impl RouterTrait for Router {
/// Returns the identifier of this router implementation: the static string `"regular"`.
fn router_type(&self) -> &'static str {
"regular"
}
/// Reports whether this router is ready to serve traffic.
///
/// The regular router counts as ready as soon as at least one registered worker
/// is healthy. A ready router answers with a JSON body containing `status`,
/// `healthy_workers` and `total_workers`; otherwise the answer is
/// `503 Service Unavailable` with a JSON body containing `status`, `reason`
/// and `total_workers`.
fn readiness(&self) -> Response {
    let workers = self.worker_registry.get_all();
    let total_workers = workers.len();
    let healthy_count = workers.iter().filter(|w| w.is_healthy()).count();

    // Guard clause: without a single healthy worker the router cannot serve.
    if healthy_count == 0 {
        let not_ready = serde_json::json!({
            "status": "not_ready",
            "reason": "no healthy workers available",
            "total_workers": total_workers
        });
        return (StatusCode::SERVICE_UNAVAILABLE, Json(not_ready)).into_response();
    }

    let ready = serde_json::json!({
        "status": "ready",
        "healthy_workers": healthy_count,
        "total_workers": total_workers
    });
    Json(ready).into_response()
}
}
#[cfg(test)]