[router] improve router logs and request id header (#8415)

This commit is contained in:
Simo Lin
2025-07-27 19:30:19 -07:00
committed by GitHub
parent dd487e5553
commit fe6a445d1e
17 changed files with 426 additions and 128 deletions

View File

@@ -1,5 +1,6 @@
use crate::core::{HealthChecker, Worker, WorkerFactory};
use crate::metrics::RouterMetrics;
use crate::middleware::get_request_id;
use crate::policies::LoadBalancingPolicy;
use actix_web::http::header::{HeaderValue, CONTENT_TYPE};
use actix_web::{HttpRequest, HttpResponse};
@@ -134,32 +135,26 @@ impl Router {
match sync_client.get(&format!("{}/health", url)).send() {
Ok(res) => {
if !res.status().is_success() {
let msg = format!(
"Worker heatlh check is pending with status {}",
res.status()
);
info!("{}", msg);
all_healthy = false;
unhealthy_workers.push((url, msg));
unhealthy_workers.push((url, format!("status: {}", res.status())));
}
}
Err(_) => {
let msg = format!("Worker is not ready yet");
info!("{}", msg);
all_healthy = false;
unhealthy_workers.push((url, msg));
unhealthy_workers.push((url, "not ready".to_string()));
}
}
}
if all_healthy {
info!("All workers are healthy");
info!("All {} workers are healthy", worker_urls.len());
return Ok(());
} else {
info!("Initializing workers:");
for (url, reason) in &unhealthy_workers {
info!(" {} - {}", url, reason);
}
debug!(
"Waiting for {} workers to become healthy ({} unhealthy)",
worker_urls.len(),
unhealthy_workers.len()
);
thread::sleep(Duration::from_secs(interval_secs));
}
}
@@ -181,6 +176,7 @@ impl Router {
route: &str,
req: &HttpRequest,
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
let mut request_builder = client.get(format!("{}{}", worker_url, route));
@@ -202,14 +198,32 @@ impl Router {
match res.bytes().await {
Ok(body) => HttpResponse::build(status).body(body.to_vec()),
Err(e) => HttpResponse::InternalServerError()
.body(format!("Failed to read response body: {}", e)),
Err(e) => {
error!(
request_id = %request_id,
worker_url = %worker_url,
route = %route,
error = %e,
"Failed to read response body"
);
HttpResponse::InternalServerError()
.body(format!("Failed to read response body: {}", e))
}
}
}
Err(e) => HttpResponse::InternalServerError().body(format!(
"Failed to send request to worker {}: {}",
worker_url, e
)),
Err(e) => {
error!(
request_id = %request_id,
worker_url = %worker_url,
route = %route,
error = %e,
"Failed to send request to worker"
);
HttpResponse::InternalServerError().body(format!(
"Failed to send request to worker {}: {}",
worker_url, e
))
}
};
// Record request metrics
@@ -231,6 +245,7 @@ impl Router {
route: &str,
req: &HttpRequest,
) -> HttpResponse {
let request_id = get_request_id(req);
const MAX_REQUEST_RETRIES: u32 = 3;
const MAX_TOTAL_RETRIES: u32 = 6;
let mut total_retries = 0;
@@ -260,17 +275,23 @@ impl Router {
}
warn!(
"Request to {} failed (attempt {}/{})",
worker_url,
request_retries + 1,
MAX_REQUEST_RETRIES
request_id = %request_id,
route = %route,
worker_url = %worker_url,
attempt = request_retries + 1,
max_attempts = MAX_REQUEST_RETRIES,
"Request failed"
);
request_retries += 1;
total_retries += 1;
if request_retries == MAX_REQUEST_RETRIES {
warn!("Removing failed worker: {}", worker_url);
warn!(
request_id = %request_id,
worker_url = %worker_url,
"Removing failed worker"
);
self.remove_worker(&worker_url);
break;
}
@@ -293,6 +314,7 @@ impl Router {
typed_req: &T,
route: &str,
) -> HttpResponse {
let request_id = get_request_id(req);
// Handle retries like the original implementation
let start = Instant::now();
const MAX_REQUEST_RETRIES: u32 = 3;
@@ -357,17 +379,19 @@ impl Router {
}
warn!(
"Generate request to {} failed (attempt {}/{})",
worker_url,
request_retries + 1,
MAX_REQUEST_RETRIES
request_id = %request_id,
"Generate request failed route={} worker_url={} attempt={} max_attempts={}",
route, worker_url, request_retries + 1, MAX_REQUEST_RETRIES
);
request_retries += 1;
total_retries += 1;
if request_retries == MAX_REQUEST_RETRIES {
warn!("Removing failed worker: {}", worker_url);
warn!(
request_id = %request_id,
"Removing failed worker after typed request failures worker_url={}", worker_url
);
self.remove_worker(&worker_url);
break;
}
@@ -402,13 +426,9 @@ impl Router {
is_stream: bool,
load_incremented: bool, // Whether load was incremented for this request
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
// Debug: Log what we're sending
if let Ok(json_str) = serde_json::to_string_pretty(typed_req) {
debug!("Sending request to {}: {}", route, json_str);
}
let mut request_builder = client
.post(format!("{}{}", worker_url, route))
.json(typed_req); // Use json() directly with typed request
@@ -424,7 +444,11 @@ impl Router {
let res = match request_builder.send().await {
Ok(res) => res,
Err(e) => {
error!("Failed to send request to {}: {}", worker_url, e);
error!(
request_id = %request_id,
"Failed to send typed request worker_url={} route={} error={}",
worker_url, route, e
);
// Decrement load on error if it was incremented
if load_incremented {
@@ -497,7 +521,6 @@ impl Router {
&worker_url,
worker.load(),
);
debug!("Streaming is done!!")
}
}
}
@@ -536,7 +559,6 @@ impl Router {
match client.get(&format!("{}/health", worker_url)).send().await {
Ok(res) => {
if res.status().is_success() {
info!("Worker {} health check passed", worker_url);
let mut workers_guard = self.workers.write().unwrap();
if workers_guard.iter().any(|w| w.url() == worker_url) {
return Err(format!("Worker {} already exists", worker_url));
@@ -560,8 +582,8 @@ impl Router {
return Ok(format!("Successfully added worker: {}", worker_url));
} else {
info!(
"Worker {} health check is pending with status: {}.",
debug!(
"Worker {} health check pending - status: {}",
worker_url,
res.status()
);
@@ -576,10 +598,7 @@ impl Router {
}
}
Err(e) => {
info!(
"Worker {} health check is pending with error: {}",
worker_url, e
);
debug!("Worker {} health check pending - error: {}", worker_url, e);
// if the url does not have http or https prefix, warn users
if !worker_url.starts_with("http://") && !worker_url.starts_with("https://") {
@@ -611,7 +630,6 @@ impl Router {
.downcast_ref::<crate::policies::CacheAwarePolicy>()
{
cache_aware.remove_worker(worker_url);
info!("Removed worker from tree: {}", worker_url);
}
}
@@ -675,7 +693,6 @@ impl Router {
for url in &worker_urls {
if let Some(load) = Self::get_worker_load_static(&client, url).await {
loads.insert(url.clone(), load);
debug!("Worker {} load: {}", url, load);
}
}