[router] improve router logs and request id header (#8415)

This commit is contained in:
Simo Lin
2025-07-27 19:30:19 -07:00
committed by GitHub
parent dd487e5553
commit fe6a445d1e
17 changed files with 426 additions and 128 deletions

View File

@@ -5,6 +5,7 @@ use super::pd_types::{api_path, Bootstrap, ChatReqInput, GenerateReqInput, PDRou
use super::request_adapter::ToPdRequest;
use crate::core::{HealthChecker, Worker, WorkerFactory, WorkerLoadGuard};
use crate::metrics::RouterMetrics;
use crate::middleware::get_request_id;
use crate::openai_api_types::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
use crate::policies::LoadBalancingPolicy;
use crate::tree::Tree;
@@ -16,7 +17,6 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant};
use tracing::{debug, error, info, warn};
use uuid::Uuid;
#[derive(Debug)]
pub struct PDRouter {
@@ -307,8 +307,8 @@ impl PDRouter {
mut typed_req: GenerateReqInput,
route: &str,
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
let _request_id = Uuid::new_v4();
// Get stream flag and return_logprob flag before moving the request
let is_stream = typed_req.stream;
@@ -328,7 +328,10 @@ impl PDRouter {
let (prefill, decode) = match self.select_pd_pair(client, request_text).await {
Ok(pair) => pair,
Err(e) => {
error!("Failed to select PD pair: {}", e);
error!(
request_id = %request_id,
"Failed to select PD pair error={}", e
);
RouterMetrics::record_pd_error("server_selection");
return HttpResponse::ServiceUnavailable()
.body(format!("No available servers: {}", e));
@@ -337,15 +340,17 @@ impl PDRouter {
// Log routing decision
info!(
"PD routing: {} -> prefill={}, decode={}",
route,
prefill.url(),
decode.url()
request_id = %request_id,
"PD routing decision route={} prefill_url={} decode_url={}",
route, prefill.url(), decode.url()
);
// Add bootstrap info using the trait method
if let Err(e) = typed_req.add_bootstrap_info(prefill.as_ref()) {
error!("Failed to add bootstrap info: {}", e);
error!(
request_id = %request_id,
"Failed to add bootstrap info error={}", e
);
RouterMetrics::record_pd_error("bootstrap_injection");
return HttpResponse::InternalServerError()
.body(format!("Bootstrap injection failed: {}", e));
@@ -355,7 +360,10 @@ impl PDRouter {
let json_with_bootstrap = match serde_json::to_value(&typed_req) {
Ok(json) => json,
Err(e) => {
error!("Failed to serialize request: {}", e);
error!(
request_id = %request_id,
"Failed to serialize request error={}", e
);
return HttpResponse::InternalServerError().body("Failed to serialize request");
}
};
@@ -383,6 +391,7 @@ impl PDRouter {
mut typed_req: ChatReqInput,
route: &str,
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
// Get stream flag and return_logprob flag before moving the request
@@ -406,7 +415,10 @@ impl PDRouter {
let (prefill, decode) = match self.select_pd_pair(client, request_text).await {
Ok(pair) => pair,
Err(e) => {
error!("Failed to select PD pair: {}", e);
error!(
request_id = %request_id,
"Failed to select PD pair error={}", e
);
RouterMetrics::record_pd_error("server_selection");
return HttpResponse::ServiceUnavailable()
.body(format!("No available servers: {}", e));
@@ -415,15 +427,17 @@ impl PDRouter {
// Log routing decision
info!(
"PD routing: {} -> prefill={}, decode={}",
route,
prefill.url(),
decode.url()
request_id = %request_id,
"PD routing decision route={} prefill_url={} decode_url={}",
route, prefill.url(), decode.url()
);
// Add bootstrap info using the trait method
if let Err(e) = typed_req.add_bootstrap_info(prefill.as_ref()) {
error!("Failed to add bootstrap info: {}", e);
error!(
request_id = %request_id,
"Failed to add bootstrap info error={}", e
);
RouterMetrics::record_pd_error("bootstrap_injection");
return HttpResponse::InternalServerError()
.body(format!("Bootstrap injection failed: {}", e));
@@ -433,7 +447,10 @@ impl PDRouter {
let json_with_bootstrap = match serde_json::to_value(&typed_req) {
Ok(json) => json,
Err(e) => {
error!("Failed to serialize request: {}", e);
error!(
request_id = %request_id,
"Failed to serialize request error={}", e
);
return HttpResponse::InternalServerError().body("Failed to serialize request");
}
};
@@ -461,6 +478,7 @@ impl PDRouter {
mut typed_req: CompletionRequest,
route: &str,
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
// Get stream flag and return_logprob flag before moving the request
@@ -477,7 +495,10 @@ impl PDRouter {
let (prefill, decode) = match self.select_pd_pair(client, request_text).await {
Ok(pair) => pair,
Err(e) => {
error!("Failed to select PD pair: {}", e);
error!(
request_id = %request_id,
"Failed to select PD pair error={}", e
);
RouterMetrics::record_pd_error("server_selection");
return HttpResponse::ServiceUnavailable()
.body(format!("No available servers: {}", e));
@@ -486,15 +507,17 @@ impl PDRouter {
// Log routing decision
info!(
"PD routing: {} -> prefill={}, decode={}",
route,
prefill.url(),
decode.url()
request_id = %request_id,
"PD routing decision route={} prefill_url={} decode_url={}",
route, prefill.url(), decode.url()
);
// Add bootstrap info using the trait method
if let Err(e) = typed_req.add_bootstrap_info(prefill.as_ref()) {
error!("Failed to add bootstrap info: {}", e);
error!(
request_id = %request_id,
"Failed to add bootstrap info error={}", e
);
RouterMetrics::record_pd_error("bootstrap_injection");
return HttpResponse::InternalServerError()
.body(format!("Bootstrap injection failed: {}", e));
@@ -504,7 +527,10 @@ impl PDRouter {
let json_with_bootstrap = match serde_json::to_value(&typed_req) {
Ok(json) => json,
Err(e) => {
error!("Failed to serialize request: {}", e);
error!(
request_id = %request_id,
"Failed to serialize request error={}", e
);
return HttpResponse::InternalServerError().body("Failed to serialize request");
}
};
@@ -538,6 +564,7 @@ impl PDRouter {
return_logprob: bool,
start_time: Instant,
) -> HttpResponse {
let request_id = get_request_id(req);
// Update load tracking for both workers
let _guard = WorkerLoadGuard::new_multi(vec![prefill, decode]);
@@ -578,9 +605,9 @@ impl PDRouter {
if !status.is_success() {
RouterMetrics::record_pd_decode_error(decode.url());
error!(
"Decode server {} returned error status: {}",
decode.url(),
status
request_id = %request_id,
"Decode server returned error status decode_url={} status={}",
decode.url(), status
);
// Return the error response from decode server
@@ -598,9 +625,9 @@ impl PDRouter {
// Log prefill errors for debugging
if let Err(e) = &prefill_result {
error!(
"Prefill server {} failed (non-critical): {}",
prefill.url(),
e
request_id = %request_id,
"Prefill server failed (non-critical) prefill_url={} error={}",
prefill.url(), e
);
RouterMetrics::record_pd_prefill_error(prefill.url());
}
@@ -684,7 +711,12 @@ impl PDRouter {
}
}
Err(e) => {
error!("Decode request failed: {}", e);
error!(
request_id = %request_id,
decode_url = %decode.url(),
error = %e,
"Decode request failed"
);
RouterMetrics::record_pd_decode_error(decode.url());
HttpResponse::BadGateway().body(format!("Decode server error: {}", e))
}

View File

@@ -1,5 +1,6 @@
use crate::core::{HealthChecker, Worker, WorkerFactory};
use crate::metrics::RouterMetrics;
use crate::middleware::get_request_id;
use crate::policies::LoadBalancingPolicy;
use actix_web::http::header::{HeaderValue, CONTENT_TYPE};
use actix_web::{HttpRequest, HttpResponse};
@@ -134,32 +135,26 @@ impl Router {
match sync_client.get(&format!("{}/health", url)).send() {
Ok(res) => {
if !res.status().is_success() {
let msg = format!(
"Worker heatlh check is pending with status {}",
res.status()
);
info!("{}", msg);
all_healthy = false;
unhealthy_workers.push((url, msg));
unhealthy_workers.push((url, format!("status: {}", res.status())));
}
}
Err(_) => {
let msg = format!("Worker is not ready yet");
info!("{}", msg);
all_healthy = false;
unhealthy_workers.push((url, msg));
unhealthy_workers.push((url, "not ready".to_string()));
}
}
}
if all_healthy {
info!("All workers are healthy");
info!("All {} workers are healthy", worker_urls.len());
return Ok(());
} else {
info!("Initializing workers:");
for (url, reason) in &unhealthy_workers {
info!(" {} - {}", url, reason);
}
debug!(
"Waiting for {} workers to become healthy ({} unhealthy)",
worker_urls.len(),
unhealthy_workers.len()
);
thread::sleep(Duration::from_secs(interval_secs));
}
}
@@ -181,6 +176,7 @@ impl Router {
route: &str,
req: &HttpRequest,
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
let mut request_builder = client.get(format!("{}{}", worker_url, route));
@@ -202,14 +198,32 @@ impl Router {
match res.bytes().await {
Ok(body) => HttpResponse::build(status).body(body.to_vec()),
Err(e) => HttpResponse::InternalServerError()
.body(format!("Failed to read response body: {}", e)),
Err(e) => {
error!(
request_id = %request_id,
worker_url = %worker_url,
route = %route,
error = %e,
"Failed to read response body"
);
HttpResponse::InternalServerError()
.body(format!("Failed to read response body: {}", e))
}
}
}
Err(e) => HttpResponse::InternalServerError().body(format!(
"Failed to send request to worker {}: {}",
worker_url, e
)),
Err(e) => {
error!(
request_id = %request_id,
worker_url = %worker_url,
route = %route,
error = %e,
"Failed to send request to worker"
);
HttpResponse::InternalServerError().body(format!(
"Failed to send request to worker {}: {}",
worker_url, e
))
}
};
// Record request metrics
@@ -231,6 +245,7 @@ impl Router {
route: &str,
req: &HttpRequest,
) -> HttpResponse {
let request_id = get_request_id(req);
const MAX_REQUEST_RETRIES: u32 = 3;
const MAX_TOTAL_RETRIES: u32 = 6;
let mut total_retries = 0;
@@ -260,17 +275,23 @@ impl Router {
}
warn!(
"Request to {} failed (attempt {}/{})",
worker_url,
request_retries + 1,
MAX_REQUEST_RETRIES
request_id = %request_id,
route = %route,
worker_url = %worker_url,
attempt = request_retries + 1,
max_attempts = MAX_REQUEST_RETRIES,
"Request failed"
);
request_retries += 1;
total_retries += 1;
if request_retries == MAX_REQUEST_RETRIES {
warn!("Removing failed worker: {}", worker_url);
warn!(
request_id = %request_id,
worker_url = %worker_url,
"Removing failed worker"
);
self.remove_worker(&worker_url);
break;
}
@@ -293,6 +314,7 @@ impl Router {
typed_req: &T,
route: &str,
) -> HttpResponse {
let request_id = get_request_id(req);
// Handle retries like the original implementation
let start = Instant::now();
const MAX_REQUEST_RETRIES: u32 = 3;
@@ -357,17 +379,19 @@ impl Router {
}
warn!(
"Generate request to {} failed (attempt {}/{})",
worker_url,
request_retries + 1,
MAX_REQUEST_RETRIES
request_id = %request_id,
"Generate request failed route={} worker_url={} attempt={} max_attempts={}",
route, worker_url, request_retries + 1, MAX_REQUEST_RETRIES
);
request_retries += 1;
total_retries += 1;
if request_retries == MAX_REQUEST_RETRIES {
warn!("Removing failed worker: {}", worker_url);
warn!(
request_id = %request_id,
"Removing failed worker after typed request failures worker_url={}", worker_url
);
self.remove_worker(&worker_url);
break;
}
@@ -402,13 +426,9 @@ impl Router {
is_stream: bool,
load_incremented: bool, // Whether load was incremented for this request
) -> HttpResponse {
let request_id = get_request_id(req);
let start = Instant::now();
// Debug: Log what we're sending
if let Ok(json_str) = serde_json::to_string_pretty(typed_req) {
debug!("Sending request to {}: {}", route, json_str);
}
let mut request_builder = client
.post(format!("{}{}", worker_url, route))
.json(typed_req); // Use json() directly with typed request
@@ -424,7 +444,11 @@ impl Router {
let res = match request_builder.send().await {
Ok(res) => res,
Err(e) => {
error!("Failed to send request to {}: {}", worker_url, e);
error!(
request_id = %request_id,
"Failed to send typed request worker_url={} route={} error={}",
worker_url, route, e
);
// Decrement load on error if it was incremented
if load_incremented {
@@ -497,7 +521,6 @@ impl Router {
&worker_url,
worker.load(),
);
debug!("Streaming is done!!")
}
}
}
@@ -536,7 +559,6 @@ impl Router {
match client.get(&format!("{}/health", worker_url)).send().await {
Ok(res) => {
if res.status().is_success() {
info!("Worker {} health check passed", worker_url);
let mut workers_guard = self.workers.write().unwrap();
if workers_guard.iter().any(|w| w.url() == worker_url) {
return Err(format!("Worker {} already exists", worker_url));
@@ -560,8 +582,8 @@ impl Router {
return Ok(format!("Successfully added worker: {}", worker_url));
} else {
info!(
"Worker {} health check is pending with status: {}.",
debug!(
"Worker {} health check pending - status: {}",
worker_url,
res.status()
);
@@ -576,10 +598,7 @@ impl Router {
}
}
Err(e) => {
info!(
"Worker {} health check is pending with error: {}",
worker_url, e
);
debug!("Worker {} health check pending - error: {}", worker_url, e);
// if the url does not have http or https prefix, warn users
if !worker_url.starts_with("http://") && !worker_url.starts_with("https://") {
@@ -611,7 +630,6 @@ impl Router {
.downcast_ref::<crate::policies::CacheAwarePolicy>()
{
cache_aware.remove_worker(worker_url);
info!("Removed worker from tree: {}", worker_url);
}
}
@@ -675,7 +693,6 @@ impl Router {
for url in &worker_urls {
if let Some(load) = Self::get_worker_load_static(&client, url).await {
loads.insert(url.clone(), load);
debug!("Worker {} load: {}", url, load);
}
}