[router] add health checking in router init (#2393)
This commit is contained in:
2
rust/Cargo.lock
generated
2
rust/Cargo.lock
generated
@@ -851,6 +851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1986,6 +1987,7 @@ dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"encoding_rs",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2 0.4.6",
|
||||
|
||||
@@ -19,7 +19,7 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
clap = { version = "4.4", features = ["derive"] }
|
||||
bytes = "1.8.0"
|
||||
rand = "0.8.5"
|
||||
reqwest = { version = "0.12.8", features = ["stream"] }
|
||||
reqwest = { version = "0.12.8", features = ["stream", "blocking"] }
|
||||
futures-util = "0.3"
|
||||
serde_json = "1.0"
|
||||
pyo3 = { version = "0.22.5", features = ["extension-module"] }
|
||||
|
||||
@@ -167,13 +167,6 @@ def main():
|
||||
signal.SIGQUIT, lambda sig, frame: cleanup_processes(server_processes)
|
||||
)
|
||||
|
||||
for port in worker_ports:
|
||||
if not wait_for_server_health(server_args.host, port):
|
||||
logger.error(f"Server on port {port} failed to become healthy")
|
||||
break
|
||||
|
||||
logger.info("All servers are healthy. Starting router...")
|
||||
|
||||
# Update router args with worker URLs
|
||||
router_args.worker_urls = [
|
||||
f"http://{server_args.host}:{port}" for port in worker_ports
|
||||
|
||||
@@ -93,7 +93,7 @@ pub enum Router {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum PolicyConfig {
|
||||
RandomConfig,
|
||||
RoundRobinConfig,
|
||||
@@ -127,9 +127,14 @@ fn get_text_from_request(body: &Bytes, route: &str) -> String {
|
||||
|
||||
return "".to_string();
|
||||
}
|
||||
|
||||
impl Router {
|
||||
pub fn new(worker_urls: Vec<String>, policy_config: PolicyConfig) -> Self {
|
||||
match policy_config {
|
||||
pub fn new(worker_urls: Vec<String>, policy_config: PolicyConfig) -> Result<Self, String> {
|
||||
// Wait until all workers are healthy
|
||||
Self::wait_for_healthy_workers(&worker_urls, 300, 10)?;
|
||||
|
||||
// Create router based on policy...
|
||||
Ok(match policy_config {
|
||||
PolicyConfig::RandomConfig => Router::Random {
|
||||
worker_urls: Arc::new(RwLock::new(worker_urls)),
|
||||
},
|
||||
@@ -196,7 +201,7 @@ impl Router {
|
||||
_eviction_thread: Some(eviction_thread),
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_first(&self) -> Option<String> {
|
||||
@@ -213,6 +218,59 @@ impl Router {
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_for_healthy_workers(
|
||||
worker_urls: &[String],
|
||||
timeout_secs: u64,
|
||||
interval_secs: u64,
|
||||
) -> Result<(), String> {
|
||||
let start_time = std::time::Instant::now();
|
||||
let sync_client = reqwest::blocking::Client::new();
|
||||
|
||||
loop {
|
||||
if start_time.elapsed() > Duration::from_secs(timeout_secs) {
|
||||
return Err(format!(
|
||||
"Timeout {}s waiting for workers to become healthy",
|
||||
timeout_secs
|
||||
));
|
||||
}
|
||||
|
||||
let mut all_healthy = true;
|
||||
let mut unhealthy_workers = Vec::new();
|
||||
|
||||
for url in worker_urls {
|
||||
match sync_client.get(&format!("{}/health", url)).send() {
|
||||
Ok(res) => {
|
||||
if !res.status().is_success() {
|
||||
info!(
|
||||
"Worker {} health check is pending with status: {}.",
|
||||
url,
|
||||
res.status()
|
||||
);
|
||||
all_healthy = false;
|
||||
unhealthy_workers.push((url, format!("Status: {}", res.status())));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
info!("Worker {} health check is pending with error: {}", url, e);
|
||||
all_healthy = false;
|
||||
unhealthy_workers.push((url, format!("Error: {}", e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if all_healthy {
|
||||
info!("All workers are healthy");
|
||||
return Ok(());
|
||||
} else {
|
||||
info!("Unhealthy workers:");
|
||||
for (url, reason) in &unhealthy_workers {
|
||||
info!(" {} - {}", url, reason);
|
||||
}
|
||||
thread::sleep(Duration::from_secs(interval_secs));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn dispatch(
|
||||
&self,
|
||||
client: &reqwest::Client,
|
||||
@@ -386,7 +444,7 @@ impl Router {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn add_worker(&self, worker_url: String) -> HttpResponse {
|
||||
pub async fn add_worker(&self, worker_url: String) -> Result<String, String> {
|
||||
let interval_secs = 10; // check every 10 seconds
|
||||
let timeout_secs = 300; // 5 minutes
|
||||
|
||||
@@ -395,7 +453,7 @@ impl Router {
|
||||
|
||||
loop {
|
||||
if start_time.elapsed() > Duration::from_secs(timeout_secs) {
|
||||
return HttpResponse::InternalServerError().body(format!(
|
||||
return Err(format!(
|
||||
"Timeout {}s waiting for worker {} to become healthy",
|
||||
timeout_secs, worker_url
|
||||
));
|
||||
@@ -411,19 +469,40 @@ impl Router {
|
||||
info!("Worker {} health check passed", worker_url);
|
||||
let mut urls = worker_urls.write().unwrap();
|
||||
if urls.contains(&worker_url) {
|
||||
return HttpResponse::BadRequest()
|
||||
.body(format!("Worker {} already exists", worker_url));
|
||||
return Err(format!("Worker {} already exists", worker_url));
|
||||
}
|
||||
info!("Added worker: {}", worker_url);
|
||||
urls.push(worker_url.clone());
|
||||
}
|
||||
}
|
||||
return HttpResponse::Ok()
|
||||
.body(format!("Successfully added worker: {}", worker_url));
|
||||
|
||||
// If cache aware, initialize the queues for the new worker
|
||||
if let Router::CacheAware {
|
||||
running_queue,
|
||||
processed_queue,
|
||||
tree,
|
||||
..
|
||||
} = self
|
||||
{
|
||||
// Add worker to running queue with initial count of 0
|
||||
running_queue.lock().unwrap().insert(worker_url.clone(), 0);
|
||||
|
||||
// Add worker to processed queue with initial count of 0
|
||||
processed_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(worker_url.clone(), 0);
|
||||
|
||||
// Add worker to tree
|
||||
tree.lock().unwrap().insert(&"".to_string(), &worker_url);
|
||||
}
|
||||
|
||||
return Ok(format!("Successfully added worker: {}", worker_url));
|
||||
} else {
|
||||
info!(
|
||||
"Worker {} health check failed with status: {}. The worker might still be starting up.",
|
||||
worker_url, res.status()
|
||||
"Worker {} health check is pending with status: {}.",
|
||||
worker_url,
|
||||
res.status()
|
||||
);
|
||||
// if the url does not have http or https prefix, warn users
|
||||
if !worker_url.starts_with("http://") && !worker_url.starts_with("https://")
|
||||
@@ -436,7 +515,10 @@ impl Router {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
info!("Worker {} health check failed: {}", worker_url, e);
|
||||
info!(
|
||||
"Worker {} health check is pending with error: {}",
|
||||
worker_url, e
|
||||
);
|
||||
|
||||
// if the url does not have http or https prefix, warn users
|
||||
if !worker_url.starts_with("http://") && !worker_url.starts_with("https://") {
|
||||
@@ -463,9 +545,20 @@ impl Router {
|
||||
}
|
||||
|
||||
// if cache aware, remove the worker from the tree
|
||||
if let Router::CacheAware { tree, .. } = self {
|
||||
if let Router::CacheAware {
|
||||
tree,
|
||||
running_queue,
|
||||
processed_queue,
|
||||
..
|
||||
} = self
|
||||
{
|
||||
tree.lock().unwrap().remove_tenant(&worker_url);
|
||||
info!("Removed worker from tree: {}", worker_url);
|
||||
running_queue.lock().unwrap().remove(&worker_url);
|
||||
processed_queue.lock().unwrap().remove(&worker_url);
|
||||
info!(
|
||||
"Removed worker from tree and cleaned up queues: {}",
|
||||
worker_url
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,10 @@ impl AppState {
|
||||
policy_config: PolicyConfig,
|
||||
) -> Self {
|
||||
// Create router based on policy
|
||||
let router = Router::new(worker_urls, policy_config);
|
||||
let router = match Router::new(worker_urls, policy_config) {
|
||||
Ok(router) => router,
|
||||
Err(error) => panic!("Failed to create router: {}", error),
|
||||
};
|
||||
|
||||
Self { router, client }
|
||||
}
|
||||
@@ -141,7 +144,11 @@ async fn add_worker(
|
||||
.body("Worker URL required. Provide 'url' query parameter")
|
||||
}
|
||||
};
|
||||
data.router.add_worker(worker_url).await
|
||||
|
||||
match data.router.add_worker(worker_url).await {
|
||||
Ok(message) => HttpResponse::Ok().body(message),
|
||||
Err(error) => HttpResponse::BadRequest().body(error),
|
||||
}
|
||||
}
|
||||
|
||||
#[post("/remove_worker")]
|
||||
@@ -187,20 +194,20 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
|
||||
)
|
||||
.init();
|
||||
|
||||
info!("Starting server on {}:{}", config.host, config.port);
|
||||
info!("Worker URLs: {:?}", config.worker_urls);
|
||||
info!("Policy Config: {:?}", config.policy_config);
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.build()
|
||||
.expect("Failed to create HTTP client");
|
||||
|
||||
let app_state = web::Data::new(AppState::new(
|
||||
config.worker_urls,
|
||||
config.worker_urls.clone(),
|
||||
client,
|
||||
config.policy_config,
|
||||
config.policy_config.clone(),
|
||||
));
|
||||
|
||||
info!("✅ Starting router on {}:{}", config.host, config.port);
|
||||
info!("✅ Serving Worker URLs: {:?}", config.worker_urls);
|
||||
info!("✅ Policy Config: {:?}", config.policy_config);
|
||||
|
||||
HttpServer::new(move || {
|
||||
App::new()
|
||||
.app_data(app_state.clone())
|
||||
|
||||
Reference in New Issue
Block a user