[router] Basic OAI Response api (#10346)

This commit is contained in:
Keyang Ru
2025-09-11 20:56:17 -07:00
committed by GitHub
parent 27778010fc
commit a23bdeaf04
9 changed files with 245 additions and 5 deletions

View File

@@ -289,6 +289,14 @@ impl RouterTrait for GrpcPDRouter {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
/// Responses endpoint stub for the gRPC PD router.
///
/// Both arguments are ignored; the reply is always `501 Not Implemented`.
async fn route_responses(
    &self,
    _headers: Option<&HeaderMap>,
    _body: &crate::protocols::spec::ResponsesRequest,
) -> Response {
    let status = StatusCode::NOT_IMPLEMENTED;
    status.into_response()
}
/// Embeddings endpoint stub for the gRPC PD router.
///
/// Both arguments are ignored; the reply is always `501 Not Implemented`.
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
    let status = StatusCode::NOT_IMPLEMENTED;
    status.into_response()
}

View File

@@ -222,6 +222,14 @@ impl RouterTrait for GrpcRouter {
(StatusCode::NOT_IMPLEMENTED).into_response()
}
/// Responses endpoint stub for the gRPC router.
///
/// Both arguments are ignored; the reply is always `501 Not Implemented`.
async fn route_responses(
    &self,
    _headers: Option<&HeaderMap>,
    _body: &crate::protocols::spec::ResponsesRequest,
) -> Response {
    let status = StatusCode::NOT_IMPLEMENTED;
    status.into_response()
}
/// Embeddings endpoint stub for the gRPC router.
///
/// Both arguments are ignored; the reply is always `501 Not Implemented`.
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
    let status = StatusCode::NOT_IMPLEMENTED;
    status.into_response()
}

View File

@@ -333,6 +333,18 @@ impl super::super::RouterTrait for OpenAIRouter {
.into_response()
}
/// Responses endpoint stub for the OpenAI router.
///
/// Both arguments are ignored; the reply is `501 Not Implemented` with a
/// short plain-text explanation as the body.
async fn route_responses(
    &self,
    _headers: Option<&HeaderMap>,
    _body: &crate::protocols::spec::ResponsesRequest,
) -> Response {
    let message = "Responses endpoint not implemented for OpenAI router";
    (StatusCode::NOT_IMPLEMENTED, message).into_response()
}
async fn flush_cache(&self) -> Response {
(
StatusCode::NOT_IMPLEMENTED,

View File

@@ -9,8 +9,8 @@ use crate::core::{
use crate::metrics::RouterMetrics;
use crate::policies::LoadBalancingPolicy;
use crate::protocols::spec::{
ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateRequest, StringOrArray,
UserMessageContent,
ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateRequest, ResponsesRequest,
StringOrArray, UserMessageContent,
};
use crate::routers::header_utils;
use crate::routers::{RouterTrait, WorkerManagement};
@@ -1930,6 +1930,18 @@ impl RouterTrait for PDRouter {
self.execute_dual_dispatch(headers, body, context).await
}
/// Responses endpoint stub for the PD router.
///
/// Both arguments are ignored; the reply is `501 Not Implemented` with a
/// short plain-text explanation as the body.
async fn route_responses(
    &self,
    _headers: Option<&HeaderMap>,
    _body: &ResponsesRequest,
) -> Response {
    let message = "Responses endpoint not implemented for PD router";
    (StatusCode::NOT_IMPLEMENTED, message).into_response()
}
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
todo!()
}

View File

@@ -6,7 +6,7 @@ use crate::core::{
use crate::metrics::RouterMetrics;
use crate::policies::LoadBalancingPolicy;
use crate::protocols::spec::{
ChatCompletionRequest, CompletionRequest, GenerateRequest, GenerationRequest,
ChatCompletionRequest, CompletionRequest, GenerateRequest, GenerationRequest, ResponsesRequest,
};
use crate::routers::header_utils;
use crate::routers::{RouterTrait, WorkerManagement};
@@ -1210,6 +1210,15 @@ impl RouterTrait for Router {
.await
}
/// Route a responses request by delegating to `route_typed_request` with the
/// `/v1/responses` route string.
async fn route_responses(
    &self,
    headers: Option<&HeaderMap>,
    body: &ResponsesRequest,
) -> Response {
    let endpoint = "/v1/responses";
    self.route_typed_request(headers, body, endpoint).await
}
async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
todo!()
}

View File

@@ -9,7 +9,9 @@ use axum::{
};
use std::fmt::Debug;
use crate::protocols::spec::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
use crate::protocols::spec::{
ChatCompletionRequest, CompletionRequest, GenerateRequest, ResponsesRequest,
};
pub mod factory;
pub mod grpc;
@@ -78,6 +80,13 @@ pub trait RouterTrait: Send + Sync + Debug + WorkerManagement {
body: &CompletionRequest,
) -> Response;
/// Route a responses request.
///
/// `headers` carries the incoming request headers when the caller has them
/// (`None` otherwise); `body` is the already-typed [`ResponsesRequest`].
/// The returned `Response` is whatever the implementing router produces.
async fn route_responses(
&self,
headers: Option<&HeaderMap>,
body: &ResponsesRequest,
) -> Response;
/// Route an embeddings request. Unlike the typed routes above, `body` is
/// handed over as an owned `Body` rather than a parsed request struct.
async fn route_embeddings(&self, headers: Option<&HeaderMap>, body: Body) -> Response;
/// Route a rerank request. `body` is handed over as an owned `Body` rather
/// than a parsed request struct.
async fn route_rerank(&self, headers: Option<&HeaderMap>, body: Body) -> Response;

View File

@@ -2,7 +2,9 @@ use crate::config::RouterConfig;
use crate::logging::{self, LoggingConfig};
use crate::metrics::{self, PrometheusConfig};
use crate::middleware::TokenBucket;
use crate::protocols::spec::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
use crate::protocols::spec::{
ChatCompletionRequest, CompletionRequest, GenerateRequest, ResponsesRequest,
};
use crate::reasoning_parser::ParserFactory;
use crate::routers::{RouterFactory, RouterTrait};
use crate::service_discovery::{start_service_discovery, ServiceDiscoveryConfig};
@@ -150,6 +152,14 @@ async fn v1_completions(
state.router.route_completion(Some(&headers), &body).await
}
/// Handler for `POST /v1/responses`.
///
/// Extracts the shared app state, the request headers and the JSON body as a
/// `ResponsesRequest`, then hands them to the configured router.
async fn v1_responses(
    State(state): State<Arc<AppState>>,
    headers: http::HeaderMap,
    Json(body): Json<ResponsesRequest>,
) -> Response {
    let router = &state.router;
    router.route_responses(Some(&headers), &body).await
}
// Worker management endpoints
async fn add_worker(
State(state): State<Arc<AppState>>,
@@ -227,6 +237,7 @@ pub fn build_app(
.route("/generate", post(generate))
.route("/v1/chat/completions", post(v1_chat_completions))
.route("/v1/completions", post(v1_completions))
.route("/v1/responses", post(v1_responses))
.route_layer(axum::middleware::from_fn_with_state(
app_state.clone(),
crate::middleware::concurrency_limit_middleware,