[router] allow one router to support different model families and serving mode (#10244)

This commit is contained in:
Simo Lin
2025-09-12 19:18:27 -04:00
committed by GitHub
parent 321fecab74
commit 2f173ea074
28 changed files with 3528 additions and 837 deletions

View File

@@ -27,9 +27,9 @@ use tracing::{info, warn};
#[allow(dead_code)] // Fields will be used once implementation is complete
pub struct GrpcPDRouter {
/// Prefill worker connections
prefill_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
prefill_workers: Arc<RwLock<Vec<Arc<dyn Worker>>>>,
/// Decode worker connections
decode_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
decode_workers: Arc<RwLock<Vec<Arc<dyn Worker>>>>,
/// gRPC clients for prefill workers
prefill_grpc_clients: Arc<RwLock<HashMap<String, SglangSchedulerClient>>>,
/// gRPC clients for decode workers
@@ -127,7 +127,7 @@ impl GrpcPDRouter {
}
// Create Prefill Worker trait objects with gRPC connection mode
let prefill_workers: Vec<Box<dyn Worker>> = prefill_urls
let prefill_workers: Vec<Arc<dyn Worker>> = prefill_urls
.iter()
.map(|(url, bootstrap_port)| {
let worker = BasicWorker::with_connection_mode(
@@ -147,12 +147,12 @@ impl GrpcPDRouter {
failure_threshold: ctx.router_config.health_check.failure_threshold,
success_threshold: ctx.router_config.health_check.success_threshold,
});
Box::new(worker) as Box<dyn Worker>
Arc::new(worker) as Arc<dyn Worker>
})
.collect();
// Create Decode Worker trait objects with gRPC connection mode
let decode_workers: Vec<Box<dyn Worker>> = decode_urls
let decode_workers: Vec<Arc<dyn Worker>> = decode_urls
.iter()
.map(|url| {
let worker = BasicWorker::with_connection_mode(
@@ -168,7 +168,7 @@ impl GrpcPDRouter {
failure_threshold: ctx.router_config.health_check.failure_threshold,
success_threshold: ctx.router_config.health_check.success_threshold,
});
Box::new(worker) as Box<dyn Worker>
Arc::new(worker) as Arc<dyn Worker>
})
.collect();
@@ -269,6 +269,7 @@ impl RouterTrait for GrpcPDRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::GenerateRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -277,6 +278,7 @@ impl RouterTrait for GrpcPDRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::ChatCompletionRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -285,6 +287,7 @@ impl RouterTrait for GrpcPDRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::CompletionRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -293,6 +296,7 @@ impl RouterTrait for GrpcPDRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::ResponsesRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -305,6 +309,7 @@ impl RouterTrait for GrpcPDRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::RerankRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}

View File

@@ -27,7 +27,7 @@ use tracing::{info, warn};
#[allow(dead_code)] // Fields will be used once implementation is complete
pub struct GrpcRouter {
/// Worker connections
workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
workers: Arc<RwLock<Vec<Arc<dyn Worker>>>>,
/// gRPC clients for each worker
grpc_clients: Arc<RwLock<HashMap<String, SglangSchedulerClient>>>,
/// Load balancing policy
@@ -103,7 +103,7 @@ impl GrpcRouter {
}
// Create Worker trait objects with gRPC connection mode
let mut workers: Vec<Box<dyn Worker>> = Vec::new();
let mut workers: Vec<Arc<dyn Worker>> = Vec::new();
// Move clients from the HashMap to the workers
for url in &worker_urls {
@@ -123,7 +123,7 @@ impl GrpcRouter {
})
.with_grpc_client(client);
workers.push(Box::new(worker) as Box<dyn Worker>);
workers.push(Arc::new(worker) as Arc<dyn Worker>);
} else {
warn!("No gRPC client for worker {}, skipping", url);
}
@@ -202,6 +202,7 @@ impl RouterTrait for GrpcRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::GenerateRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -210,6 +211,7 @@ impl RouterTrait for GrpcRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::ChatCompletionRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -218,6 +220,7 @@ impl RouterTrait for GrpcRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::CompletionRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -226,6 +229,7 @@ impl RouterTrait for GrpcRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::ResponsesRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
@@ -238,6 +242,7 @@ impl RouterTrait for GrpcRouter {
&self,
_headers: Option<&HeaderMap>,
_body: &crate::protocols::spec::RerankRequest,
_model_id: Option<&str>,
) -> Response {
(StatusCode::NOT_IMPLEMENTED).into_response()
}