[router]: Add Embedding routing logic (#10129)
Signed-off-by: Jintao Zhang <zhangjintao9020@gmail.com> Co-authored-by: Waël Boukhobza <wawa_wael@live.fr>
This commit is contained in:
@@ -309,7 +309,12 @@ impl RouterTrait for GrpcPDRouter {
|
||||
(StatusCode::NOT_IMPLEMENTED).into_response()
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
|
||||
/// Embeddings route for the router named in the enclosing hunk header
/// (`impl RouterTrait for GrpcPDRouter`).
///
/// All three arguments are underscore-prefixed and never read. The method
/// unconditionally answers with `StatusCode::NOT_IMPLEMENTED`, converted into
/// a `Response` without any accompanying message string.
async fn route_embeddings(
|
||||
&self,
|
||||
_headers: Option<&HeaderMap>,
|
||||
_body: &crate::protocols::spec::EmbeddingRequest,
|
||||
_model_id: Option<&str>,
|
||||
) -> Response {
|
||||
// Placeholder: nothing is routed, only the 501 status is returned.
(StatusCode::NOT_IMPLEMENTED).into_response()
|
||||
}
|
||||
|
||||
|
||||
@@ -242,7 +242,12 @@ impl RouterTrait for GrpcRouter {
|
||||
(StatusCode::NOT_IMPLEMENTED).into_response()
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
|
||||
/// Embeddings route for the router named in the enclosing hunk header
/// (`impl RouterTrait for GrpcRouter`).
///
/// The headers, request body and model id are accepted to satisfy the trait
/// signature but are never read. The method always answers with
/// `StatusCode::NOT_IMPLEMENTED` and attaches no message string.
async fn route_embeddings(
|
||||
&self,
|
||||
_headers: Option<&HeaderMap>,
|
||||
_body: &crate::protocols::spec::EmbeddingRequest,
|
||||
_model_id: Option<&str>,
|
||||
) -> Response {
|
||||
// Placeholder: nothing is routed, only the 501 status is returned.
(StatusCode::NOT_IMPLEMENTED).into_response()
|
||||
}
|
||||
|
||||
|
||||
@@ -395,7 +395,12 @@ impl super::super::RouterTrait for OpenAIRouter {
|
||||
}
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
|
||||
async fn route_embeddings(
|
||||
&self,
|
||||
_headers: Option<&HeaderMap>,
|
||||
_body: &crate::protocols::spec::EmbeddingRequest,
|
||||
_model_id: Option<&str>,
|
||||
) -> Response {
|
||||
(
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
"Embeddings endpoint not implemented for OpenAI backend",
|
||||
|
||||
@@ -1938,8 +1938,17 @@ impl RouterTrait for PDRouter {
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
|
||||
todo!()
|
||||
/// Embeddings route for the router named in the enclosing hunk header
/// (`impl RouterTrait for PDRouter`).
///
/// None of the arguments are read. The method always answers with
/// `StatusCode::NOT_IMPLEMENTED` paired with a fixed explanatory string, so
/// callers get an HTTP-level error response rather than a panic.
async fn route_embeddings(
|
||||
&self,
|
||||
_headers: Option<&HeaderMap>,
|
||||
_body: &crate::protocols::spec::EmbeddingRequest,
|
||||
_model_id: Option<&str>,
|
||||
) -> Response {
|
||||
// (status, message) tuple converted into the response.
(
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
"Embeddings endpoint not implemented for PD router",
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn route_rerank(
|
||||
|
||||
@@ -6,8 +6,8 @@ use crate::core::{
|
||||
use crate::metrics::RouterMetrics;
|
||||
use crate::policies::{LoadBalancingPolicy, PolicyRegistry};
|
||||
use crate::protocols::spec::{
|
||||
ChatCompletionRequest, CompletionRequest, GenerateRequest, GenerationRequest, RerankRequest,
|
||||
RerankResponse, RerankResult, ResponsesRequest,
|
||||
ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, GenerationRequest,
|
||||
RerankRequest, RerankResponse, RerankResult, ResponsesRequest,
|
||||
};
|
||||
use crate::routers::header_utils;
|
||||
use crate::routers::{RouterTrait, WorkerManagement};
|
||||
@@ -1430,8 +1430,28 @@ impl RouterTrait for Router {
|
||||
self.route_post_empty_request(headers, &endpoint).await
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
|
||||
todo!()
|
||||
/// Routes an embeddings request and records embeddings-specific metrics.
///
/// The request is handed to `route_typed_request` together with the fixed
/// route string "/v1/embeddings" and the caller-supplied `model_id`. The
/// response from that call is returned unchanged.
///
/// Metrics, based on the returned status:
/// * success status: `RouterMetrics::record_embeddings_request` is called,
///   then `RouterMetrics::record_embeddings_duration` with the time elapsed
///   since just before the routing call;
/// * any other status: `RouterMetrics::record_embeddings_error` is called
///   with the label `http_<numeric status>`; no duration is recorded on this
///   path.
async fn route_embeddings(
|
||||
&self,
|
||||
headers: Option<&HeaderMap>,
|
||||
body: &EmbeddingRequest,
|
||||
model_id: Option<&str>,
|
||||
) -> Response {
|
||||
// Record embeddings-specific metrics in addition to general request metrics
// The timer starts before routing, so the measured span covers the whole
// `route_typed_request` call.
|
||||
let start = Instant::now();
|
||||
let res = self
|
||||
.route_typed_request(headers, body, "/v1/embeddings", model_id)
|
||||
.await;
|
||||
|
||||
// Embedding specific metrics
|
||||
if res.status().is_success() {
|
||||
RouterMetrics::record_embeddings_request();
|
||||
RouterMetrics::record_embeddings_duration(start.elapsed());
|
||||
} else {
|
||||
// Error label is derived from the numeric HTTP status, e.g. "http_500".
let error_type = format!("http_{}", res.status().as_u16());
|
||||
RouterMetrics::record_embeddings_error(&error_type);
|
||||
}
|
||||
|
||||
// Hand the routed response back to the caller untouched.
res
|
||||
}
|
||||
|
||||
async fn route_rerank(
|
||||
|
||||
@@ -10,7 +10,8 @@ use axum::{
|
||||
use std::fmt::Debug;
|
||||
|
||||
use crate::protocols::spec::{
|
||||
ChatCompletionRequest, CompletionRequest, GenerateRequest, RerankRequest, ResponsesRequest,
|
||||
ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
|
||||
ResponsesRequest,
|
||||
};
|
||||
|
||||
pub mod factory;
|
||||
@@ -123,7 +124,13 @@ pub trait RouterTrait: Send + Sync + Debug + WorkerManagement {
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn route_embeddings(&self, headers: Option<&HeaderMap>, body: Body) -> Response;
|
||||
/// Route embedding requests (OpenAI-compatible /v1/embeddings)
///
/// Required trait method: it has no default body, so every implementor must
/// provide one.
///
/// * `headers` - optional request headers.
/// * `body` - the embedding request, passed by reference.
/// * `model_id` - optional model identifier.
///   NOTE(review): its relation to any model field inside `body` is not
///   defined here; confirm against implementors.
///
/// Returns the `Response` to send back to the client.
|
||||
async fn route_embeddings(
|
||||
&self,
|
||||
headers: Option<&HeaderMap>,
|
||||
body: &EmbeddingRequest,
|
||||
model_id: Option<&str>,
|
||||
) -> Response;
|
||||
|
||||
async fn route_rerank(
|
||||
&self,
|
||||
|
||||
@@ -7,7 +7,8 @@
|
||||
use crate::config::RouterConfig;
|
||||
use crate::core::{CircuitBreakerConfig, Worker, WorkerFactory, WorkerRegistry};
|
||||
use crate::protocols::spec::{
|
||||
ChatCompletionRequest, CompletionRequest, GenerateRequest, RerankRequest, ResponsesRequest,
|
||||
ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
|
||||
ResponsesRequest,
|
||||
};
|
||||
use crate::protocols::worker_spec::{
|
||||
ServerInfo, WorkerApiResponse, WorkerConfigRequest, WorkerErrorResponse, WorkerInfo,
|
||||
@@ -665,22 +666,6 @@ impl RouterTrait for RouterManager {
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn get_response(&self, _headers: Option<&HeaderMap>, _response_id: &str) -> Response {
|
||||
(
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
"responses api not yet implemented in inference gateway mode",
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn cancel_response(&self, _headers: Option<&HeaderMap>, _response_id: &str) -> Response {
|
||||
(
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
"responses api not yet implemented in inference gateway mode",
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
async fn delete_response(&self, _headers: Option<&HeaderMap>, _response_id: &str) -> Response {
|
||||
(
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
@@ -701,17 +686,51 @@ impl RouterTrait for RouterManager {
|
||||
.into_response()
|
||||
}
|
||||
|
||||
/// Route embeddings request
|
||||
async fn route_embeddings(&self, headers: Option<&HeaderMap>, body: Body) -> Response {
|
||||
// Try to select a router based on headers
|
||||
async fn get_response(&self, headers: Option<&HeaderMap>, response_id: &str) -> Response {
|
||||
let router = self.select_router_for_request(headers, None);
|
||||
|
||||
if let Some(router) = router {
|
||||
router.route_embeddings(headers, body).await
|
||||
router.get_response(headers, response_id).await
|
||||
} else {
|
||||
(
|
||||
StatusCode::NOT_FOUND,
|
||||
"No router available for embeddings request",
|
||||
format!("No router available to get response '{}'", response_id),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
}
|
||||
|
||||
/// Forwards a cancel-response call to a selected router.
///
/// A router is chosen with `select_router_for_request(headers, None)`, i.e.
/// with `None` as the second argument. If one is returned, the call is
/// delegated to that router's `cancel_response` with the same `headers` and
/// `response_id`. Otherwise the result is `StatusCode::NOT_FOUND` with a
/// message that embeds `response_id`.
async fn cancel_response(&self, headers: Option<&HeaderMap>, response_id: &str) -> Response {
|
||||
let router = self.select_router_for_request(headers, None);
|
||||
if let Some(router) = router {
|
||||
router.cancel_response(headers, response_id).await
|
||||
} else {
|
||||
// No router could be selected: answer 404 and name the response id.
(
|
||||
StatusCode::NOT_FOUND,
|
||||
format!("No router available to cancel response '{}'", response_id),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
}
|
||||
|
||||
/// Route embeddings request
|
||||
async fn route_embeddings(
|
||||
&self,
|
||||
headers: Option<&HeaderMap>,
|
||||
body: &EmbeddingRequest,
|
||||
_model_id: Option<&str>,
|
||||
) -> Response {
|
||||
// Select router based on headers and model
|
||||
let router = self.select_router_for_request(headers, Some(&body.model));
|
||||
|
||||
if let Some(router) = router {
|
||||
router
|
||||
.route_embeddings(headers, body, Some(&body.model))
|
||||
.await
|
||||
} else {
|
||||
// Return 404 when the specified model is not found
|
||||
(
|
||||
StatusCode::NOT_FOUND,
|
||||
format!("Model '{}' not found or no router available", body.model),
|
||||
)
|
||||
.into_response()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user