[router] Implement OpenAI Responses API specification (#9367)
@@ -5,3 +5,4 @@ pub mod chat;
 pub mod common;
 pub mod completions;
 pub mod errors;
+pub mod responses;
sgl-router/src/protocols/openai/responses/mod.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
// Responses API module

pub mod request;
pub mod response;
pub mod types;

// Re-export main types for convenience
pub use request::ResponsesRequest;
pub use response::ResponsesResponse;
pub use types::*;
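The re-exports above let callers name the core types from `responses::` directly instead of spelling out each submodule; a tiny sketch (crate name as used by the test file at the end of this diff):

```rust
// Both forms resolve to the same types thanks to the re-exports in mod.rs.
use sglang_router_rs::protocols::openai::responses::{ResponsesRequest, ResponsesResponse};
// ...is equivalent to the fully qualified paths:
// use sglang_router_rs::protocols::openai::responses::request::ResponsesRequest;
// use sglang_router_rs::protocols::openai::responses::response::ResponsesResponse;
```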
sgl-router/src/protocols/openai/responses/request.rs (new file, 300 lines)
@@ -0,0 +1,300 @@
// Responses API request types

use crate::protocols::common::{GenerationRequest, StringOrArray};
use crate::protocols::openai::responses::types::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

fn generate_request_id() -> String {
    format!("resp_{}", uuid::Uuid::new_v4().simple())
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ResponsesRequest {
    // ============= Core OpenAI API fields =============
    /// Run the request in the background
    #[serde(default)]
    pub background: bool,

    /// Fields to include in the response
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include: Option<Vec<IncludeField>>,

    /// Input content - can be a string or structured items
    pub input: ResponseInput,

    /// System instructions for the model
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// Maximum number of output tokens
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_output_tokens: Option<u32>,

    /// Maximum number of tool calls
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tool_calls: Option<u32>,

    /// Additional metadata
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<HashMap<String, serde_json::Value>>,

    /// Model to use (optional to match vLLM)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// Whether to enable parallel tool calls
    #[serde(default = "default_true")]
    pub parallel_tool_calls: bool,

    /// ID of previous response to continue from
    #[serde(skip_serializing_if = "Option::is_none")]
    pub previous_response_id: Option<String>,

    /// Reasoning configuration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning: Option<ResponseReasoningParam>,

    /// Service tier
    #[serde(default)]
    pub service_tier: ServiceTier,

    /// Whether to store the response
    #[serde(default = "default_true")]
    pub store: bool,

    /// Whether to stream the response
    #[serde(default)]
    pub stream: bool,

    /// Temperature for sampling
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Tool choice behavior
    #[serde(default)]
    pub tool_choice: ToolChoice,

    /// Available tools
    #[serde(default)]
    pub tools: Vec<ResponseTool>,

    /// Number of top logprobs to return
    #[serde(default)]
    pub top_logprobs: u32,

    /// Top-p sampling parameter
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Truncation behavior
    #[serde(default)]
    pub truncation: Truncation,

    /// User identifier
    #[serde(skip_serializing_if = "Option::is_none")]
    pub user: Option<String>,

    // ============= SGLang Extensions =============
    /// Request ID
    #[serde(default = "generate_request_id")]
    pub request_id: String,

    /// Request priority
    #[serde(default)]
    pub priority: i32,

    /// Frequency penalty
    #[serde(default)]
    pub frequency_penalty: f32,

    /// Presence penalty
    #[serde(default)]
    pub presence_penalty: f32,

    /// Stop sequences
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop: Option<StringOrArray>,

    /// Top-k sampling parameter
    #[serde(default = "default_top_k")]
    pub top_k: i32,

    /// Min-p sampling parameter
    #[serde(default)]
    pub min_p: f32,

    /// Repetition penalty
    #[serde(default = "default_repetition_penalty")]
    pub repetition_penalty: f32,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum ResponseInput {
    Text(String),
    Items(Vec<ResponseInputOutputItem>),
}

fn default_top_k() -> i32 {
    -1
}

fn default_repetition_penalty() -> f32 {
    1.0
}

fn default_true() -> bool {
    true
}

impl ResponsesRequest {
    /// Default sampling parameters
    const DEFAULT_TEMPERATURE: f32 = 0.7;
    const DEFAULT_TOP_P: f32 = 1.0;

    /// Convert to sampling parameters for generation
    pub fn to_sampling_params(
        &self,
        default_max_tokens: u32,
        default_params: Option<HashMap<String, serde_json::Value>>,
    ) -> HashMap<String, serde_json::Value> {
        let mut params = HashMap::new();

        // Use max_output_tokens if available
        let max_tokens = if let Some(max_output) = self.max_output_tokens {
            std::cmp::min(max_output, default_max_tokens)
        } else {
            default_max_tokens
        };

        // Reserve one token to avoid exceeding the context length
        let max_tokens = max_tokens.saturating_sub(1);

        // Temperature
        let temperature = self.temperature.unwrap_or_else(|| {
            default_params
                .as_ref()
                .and_then(|p| p.get("temperature"))
                .and_then(|v| v.as_f64())
                .map(|v| v as f32)
                .unwrap_or(Self::DEFAULT_TEMPERATURE)
        });

        // Top-p
        let top_p = self.top_p.unwrap_or_else(|| {
            default_params
                .as_ref()
                .and_then(|p| p.get("top_p"))
                .and_then(|v| v.as_f64())
                .map(|v| v as f32)
                .unwrap_or(Self::DEFAULT_TOP_P)
        });

        params.insert(
            "max_new_tokens".to_string(),
            serde_json::Value::Number(serde_json::Number::from(max_tokens)),
        );
        params.insert(
            "temperature".to_string(),
            serde_json::Value::Number(serde_json::Number::from_f64(temperature as f64).unwrap()),
        );
        params.insert(
            "top_p".to_string(),
            serde_json::Value::Number(serde_json::Number::from_f64(top_p as f64).unwrap()),
        );
        params.insert(
            "frequency_penalty".to_string(),
            serde_json::Value::Number(
                serde_json::Number::from_f64(self.frequency_penalty as f64).unwrap(),
            ),
        );
        params.insert(
            "presence_penalty".to_string(),
            serde_json::Value::Number(
                serde_json::Number::from_f64(self.presence_penalty as f64).unwrap(),
            ),
        );
        params.insert(
            "top_k".to_string(),
            serde_json::Value::Number(serde_json::Number::from(self.top_k)),
        );
        params.insert(
            "min_p".to_string(),
            serde_json::Value::Number(serde_json::Number::from_f64(self.min_p as f64).unwrap()),
        );
        params.insert(
            "repetition_penalty".to_string(),
            serde_json::Value::Number(
                serde_json::Number::from_f64(self.repetition_penalty as f64).unwrap(),
            ),
        );

        if let Some(ref stop) = self.stop {
            match serde_json::to_value(stop) {
                Ok(value) => params.insert("stop".to_string(), value),
                Err(_) => params.insert("stop".to_string(), serde_json::Value::Null),
            };
        }

        // Apply any additional default parameters
        if let Some(default_params) = default_params {
            for (key, value) in default_params {
                params.entry(key).or_insert(value);
            }
        }

        params
    }
}

impl GenerationRequest for ResponsesRequest {
    fn is_stream(&self) -> bool {
        self.stream
    }

    fn get_model(&self) -> Option<&str> {
        self.model.as_deref()
    }

    fn extract_text_for_routing(&self) -> String {
        match &self.input {
            ResponseInput::Text(text) => text.clone(),
            ResponseInput::Items(items) => items
                .iter()
                .filter_map(|item| match item {
                    ResponseInputOutputItem::Message { content, .. } => {
                        let texts: Vec<String> = content
                            .iter()
                            .map(|part| match part {
                                ResponseContentPart::OutputText { text, .. } => text.clone(),
                            })
                            .collect();
                        if texts.is_empty() {
                            None
                        } else {
                            Some(texts.join(" "))
                        }
                    }
                    ResponseInputOutputItem::Reasoning { content, .. } => {
                        let texts: Vec<String> = content
                            .iter()
                            .map(|part| match part {
                                ResponseReasoningContent::ReasoningText { text } => text.clone(),
                            })
                            .collect();
                        if texts.is_empty() {
                            None
                        } else {
                            Some(texts.join(" "))
                        }
                    }
                    ResponseInputOutputItem::FunctionToolCall { arguments, .. } => {
                        Some(arguments.clone())
                    }
                })
                .collect::<Vec<String>>()
                .join(" "),
        }
    }
}
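Every field except `input` is either an `Option` or carries a serde default, so a minimal JSON body deserializes into a fully populated request. A small sketch (not part of the commit) illustrating the defaulting behavior; the crate path follows the test file at the end of this diff:

```rust
use sglang_router_rs::protocols::openai::responses::request::{ResponseInput, ResponsesRequest};

#[test]
fn minimal_json_body_uses_serde_defaults() {
    // Only `input` is required; every other field falls back to its default.
    let request: ResponsesRequest =
        serde_json::from_str(r#"{"input": "Hello, world!"}"#).expect("minimal body should parse");

    assert!(matches!(request.input, ResponseInput::Text(_)));
    assert!(!request.background); // #[serde(default)] -> false
    assert!(request.store && request.parallel_tool_calls); // default_true()
    assert_eq!(request.top_k, -1); // default_top_k()
    assert_eq!(request.repetition_penalty, 1.0); // default_repetition_penalty()
    assert!(request.request_id.starts_with("resp_")); // generate_request_id()
    assert!(request.model.is_none() && request.temperature.is_none());
}
```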
sgl-router/src/protocols/openai/responses/response.rs (new file, 280 lines)
@@ -0,0 +1,280 @@
// Responses API response types

use crate::protocols::openai::responses::request::ResponsesRequest;
use crate::protocols::openai::responses::types::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

fn generate_response_id() -> String {
    format!("resp_{}", uuid::Uuid::new_v4().simple())
}

fn current_timestamp() -> i64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
        .as_secs() as i64
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ResponsesResponse {
    /// Response ID
    #[serde(default = "generate_response_id")]
    pub id: String,

    /// Object type
    #[serde(default = "default_object_type")]
    pub object: String,

    /// Creation timestamp
    #[serde(default = "current_timestamp")]
    pub created_at: i64,

    /// Model name
    pub model: String,

    /// Output items
    #[serde(default)]
    pub output: Vec<ResponseOutputItem>,

    /// Response status
    pub status: ResponseStatus,

    /// Usage information
    #[serde(skip_serializing_if = "Option::is_none")]
    pub usage: Option<UsageInfo>,

    /// Whether parallel tool calls are enabled
    #[serde(default = "default_true")]
    pub parallel_tool_calls: bool,

    /// Tool choice setting
    #[serde(default = "default_tool_choice")]
    pub tool_choice: String,

    /// Available tools
    #[serde(default)]
    pub tools: Vec<ResponseTool>,
}

fn default_object_type() -> String {
    "response".to_string()
}

fn default_true() -> bool {
    true
}

fn default_tool_choice() -> String {
    "auto".to_string()
}

impl ResponsesResponse {
    /// Create a response from a request
    #[allow(clippy::too_many_arguments)]
    pub fn from_request(
        request: &ResponsesRequest,
        _sampling_params: &HashMap<String, serde_json::Value>,
        model_name: String,
        created_time: i64,
        output: Vec<ResponseOutputItem>,
        status: ResponseStatus,
        usage: Option<UsageInfo>,
    ) -> Self {
        Self {
            id: request.request_id.clone(),
            object: "response".to_string(),
            created_at: created_time,
            model: model_name,
            output,
            status,
            usage,
            parallel_tool_calls: request.parallel_tool_calls,
            tool_choice: match request.tool_choice {
                ToolChoice::Auto => "auto".to_string(),
                ToolChoice::Required => "required".to_string(),
                ToolChoice::None => "none".to_string(),
            },
            tools: request.tools.clone(),
        }
    }

    /// Create a new response with default values
    pub fn new(request_id: String, model: String, status: ResponseStatus) -> Self {
        Self {
            id: request_id,
            object: "response".to_string(),
            created_at: current_timestamp(),
            model,
            output: Vec::new(),
            status,
            usage: None,
            parallel_tool_calls: true,
            tool_choice: "auto".to_string(),
            tools: Vec::new(),
        }
    }

    /// Add an output item to the response
    pub fn add_output(&mut self, item: ResponseOutputItem) {
        self.output.push(item);
    }

    /// Set the usage information
    pub fn set_usage(&mut self, usage: UsageInfo) {
        self.usage = Some(usage);
    }

    /// Update the status
    pub fn set_status(&mut self, status: ResponseStatus) {
        self.status = status;
    }

    /// Check if the response is complete
    pub fn is_complete(&self) -> bool {
        matches!(self.status, ResponseStatus::Completed)
    }

    /// Check if the response is in progress
    pub fn is_in_progress(&self) -> bool {
        matches!(self.status, ResponseStatus::InProgress)
    }

    /// Check if the response failed
    pub fn is_failed(&self) -> bool {
        matches!(self.status, ResponseStatus::Failed)
    }

    /// Check if the response was cancelled
    pub fn is_cancelled(&self) -> bool {
        matches!(self.status, ResponseStatus::Cancelled)
    }

    /// Check if the response is queued
    pub fn is_queued(&self) -> bool {
        matches!(self.status, ResponseStatus::Queued)
    }

    /// Convert usage to OpenAI Responses API format
    pub fn usage_in_response_format(
        &self,
    ) -> Option<crate::protocols::openai::responses::types::ResponseUsage> {
        self.usage.as_ref().map(|usage| usage.to_response_usage())
    }

    /// Get the response as a JSON value with usage in response format
    pub fn to_response_format(&self) -> serde_json::Value {
        let mut response = serde_json::to_value(self).unwrap_or(serde_json::Value::Null);

        // Convert usage to response format if present
        if let Some(usage) = &self.usage {
            if let Ok(usage_value) = serde_json::to_value(usage.to_response_usage()) {
                response["usage"] = usage_value;
            }
        }

        response
    }
}

// ============= Helper Functions =============

impl ResponseOutputItem {
    /// Create a new message output item
    pub fn new_message(
        id: String,
        role: String,
        content: Vec<ResponseContentPart>,
        status: String,
    ) -> Self {
        Self::Message {
            id,
            role,
            content,
            status,
        }
    }

    /// Create a new reasoning output item
    pub fn new_reasoning(
        id: String,
        summary: Vec<String>,
        content: Vec<ResponseReasoningContent>,
        status: Option<String>,
    ) -> Self {
        Self::Reasoning {
            id,
            summary,
            content,
            status,
        }
    }

    /// Create a new function tool call output item
    pub fn new_function_tool_call(
        id: String,
        name: String,
        arguments: String,
        output: Option<String>,
        status: String,
    ) -> Self {
        Self::FunctionToolCall {
            id,
            name,
            arguments,
            output,
            status,
        }
    }
}

impl ResponseContentPart {
    /// Create a new text content part
    pub fn new_text(
        text: String,
        annotations: Vec<String>,
        logprobs: Option<crate::protocols::openai::common::ChatLogProbs>,
    ) -> Self {
        Self::OutputText {
            text,
            annotations,
            logprobs,
        }
    }
}

impl ResponseReasoningContent {
    /// Create a new reasoning text content
    pub fn new_reasoning_text(text: String) -> Self {
        Self::ReasoningText { text }
    }
}

impl UsageInfo {
    /// Create a new usage info with token counts
    pub fn new(prompt_tokens: u32, completion_tokens: u32, reasoning_tokens: Option<u32>) -> Self {
        Self {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
            reasoning_tokens,
            prompt_tokens_details: None,
        }
    }

    /// Create usage info with cached token details
    pub fn new_with_cached(
        prompt_tokens: u32,
        completion_tokens: u32,
        reasoning_tokens: Option<u32>,
        cached_tokens: u32,
    ) -> Self {
        Self {
            prompt_tokens,
            completion_tokens,
            total_tokens: prompt_tokens + completion_tokens,
            reasoning_tokens,
            prompt_tokens_details: Some(PromptTokenUsageInfo { cached_tokens }),
        }
    }
}
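A small sketch (not part of the commit) showing how the builder-style helpers compose and how `to_response_format` rewrites `usage` into the Responses API shape (`input_tokens`/`output_tokens`) even though the struct stores the internal `UsageInfo` field names:

```rust
use sglang_router_rs::protocols::openai::responses::response::ResponsesResponse;
use sglang_router_rs::protocols::openai::responses::types::{
    ResponseContentPart, ResponseOutputItem, ResponseStatus, UsageInfo,
};

#[test]
fn build_and_serialize_a_response() {
    let mut response = ResponsesResponse::new(
        "resp_demo".to_string(),
        "demo-model".to_string(),
        ResponseStatus::InProgress,
    );

    // Append one assistant message, record usage, then mark the response done.
    response.add_output(ResponseOutputItem::new_message(
        "msg_1".to_string(),
        "assistant".to_string(),
        vec![ResponseContentPart::new_text("Hi there!".to_string(), vec![], None)],
        "completed".to_string(),
    ));
    response.set_usage(UsageInfo::new(12, 4, None));
    response.set_status(ResponseStatus::Completed);
    assert!(response.is_complete());

    // `usage` is re-serialized in the OpenAI Responses format.
    let json = response.to_response_format();
    assert_eq!(json["usage"]["input_tokens"], 12);
    assert_eq!(json["usage"]["output_tokens"], 4);
    assert_eq!(json["output"][0]["type"], "message");
}
```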
sgl-router/src/protocols/openai/responses/types.rs (new file, 296 lines)
@@ -0,0 +1,296 @@
// Supporting types for Responses API

use crate::protocols::openai::common::ChatLogProbs;
use serde::{Deserialize, Serialize};

// ============= Tool Definitions =============

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ResponseTool {
    #[serde(rename = "type")]
    pub r#type: ResponseToolType,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ResponseToolType {
    WebSearchPreview,
    CodeInterpreter,
}

// ============= Reasoning Configuration =============

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ResponseReasoningParam {
    #[serde(default = "default_reasoning_effort")]
    pub effort: Option<ReasoningEffort>,
}

fn default_reasoning_effort() -> Option<ReasoningEffort> {
    Some(ReasoningEffort::Medium)
}

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ReasoningEffort {
    Low,
    Medium,
    High,
}

// ============= Input/Output Items =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum ResponseInputOutputItem {
    #[serde(rename = "message")]
    Message {
        id: String,
        role: String,
        content: Vec<ResponseContentPart>,
        #[serde(skip_serializing_if = "Option::is_none")]
        status: Option<String>,
    },
    #[serde(rename = "reasoning")]
    Reasoning {
        id: String,
        #[serde(skip_serializing_if = "Vec::is_empty")]
        summary: Vec<String>,
        content: Vec<ResponseReasoningContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        status: Option<String>,
    },
    #[serde(rename = "function_tool_call")]
    FunctionToolCall {
        id: String,
        name: String,
        arguments: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        output: Option<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        status: Option<String>,
    },
}

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum ResponseContentPart {
    #[serde(rename = "output_text")]
    OutputText {
        text: String,
        #[serde(skip_serializing_if = "Vec::is_empty")]
        annotations: Vec<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        logprobs: Option<ChatLogProbs>,
    },
}

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum ResponseReasoningContent {
    #[serde(rename = "reasoning_text")]
    ReasoningText { text: String },
}

// ============= Output Items for Response =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum ResponseOutputItem {
    #[serde(rename = "message")]
    Message {
        id: String,
        role: String,
        content: Vec<ResponseContentPart>,
        status: String,
    },
    #[serde(rename = "reasoning")]
    Reasoning {
        id: String,
        #[serde(skip_serializing_if = "Vec::is_empty")]
        summary: Vec<String>,
        content: Vec<ResponseReasoningContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        status: Option<String>,
    },
    #[serde(rename = "function_tool_call")]
    FunctionToolCall {
        id: String,
        name: String,
        arguments: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        output: Option<String>,
        status: String,
    },
}

// ============= Service Tier =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ServiceTier {
    Auto,
    Default,
    Flex,
    Scale,
    Priority,
}

impl Default for ServiceTier {
    fn default() -> Self {
        Self::Auto
    }
}

// ============= Tool Choice =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolChoice {
    Auto,
    Required,
    None,
}

impl Default for ToolChoice {
    fn default() -> Self {
        Self::Auto
    }
}

// ============= Truncation =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Truncation {
    Auto,
    Disabled,
}

impl Default for Truncation {
    fn default() -> Self {
        Self::Disabled
    }
}

// ============= Response Status =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ResponseStatus {
    Queued,
    InProgress,
    Completed,
    Failed,
    Cancelled,
}

// ============= Include Fields =============

#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IncludeField {
    #[serde(rename = "code_interpreter_call.outputs")]
    CodeInterpreterCallOutputs,
    #[serde(rename = "computer_call_output.output.image_url")]
    ComputerCallOutputImageUrl,
    #[serde(rename = "file_search_call.results")]
    FileSearchCallResults,
    #[serde(rename = "message.input_image.image_url")]
    MessageInputImageUrl,
    #[serde(rename = "message.output_text.logprobs")]
    MessageOutputTextLogprobs,
    #[serde(rename = "reasoning.encrypted_content")]
    ReasoningEncryptedContent,
}

// ============= Usage Info =============

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct UsageInfo {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_tokens: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt_tokens_details: Option<PromptTokenUsageInfo>,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct PromptTokenUsageInfo {
    pub cached_tokens: u32,
}

// ============= Response Usage Format =============

/// OpenAI Responses API usage format (different from standard UsageInfo)
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ResponseUsage {
    pub input_tokens: u32,
    pub output_tokens: u32,
    pub total_tokens: u32,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_tokens_details: Option<InputTokensDetails>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_tokens_details: Option<OutputTokensDetails>,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct InputTokensDetails {
    pub cached_tokens: u32,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct OutputTokensDetails {
    pub reasoning_tokens: u32,
}

impl UsageInfo {
    /// Convert to OpenAI Responses API format
    pub fn to_response_usage(&self) -> ResponseUsage {
        ResponseUsage {
            input_tokens: self.prompt_tokens,
            output_tokens: self.completion_tokens,
            total_tokens: self.total_tokens,
            input_tokens_details: self.prompt_tokens_details.as_ref().map(|details| {
                InputTokensDetails {
                    cached_tokens: details.cached_tokens,
                }
            }),
            output_tokens_details: self.reasoning_tokens.map(|tokens| OutputTokensDetails {
                reasoning_tokens: tokens,
            }),
        }
    }
}

impl From<UsageInfo> for ResponseUsage {
    fn from(usage: UsageInfo) -> Self {
        usage.to_response_usage()
    }
}

impl ResponseUsage {
    /// Convert back to standard UsageInfo format
    pub fn to_usage_info(&self) -> UsageInfo {
        UsageInfo {
            prompt_tokens: self.input_tokens,
            completion_tokens: self.output_tokens,
            total_tokens: self.total_tokens,
            reasoning_tokens: self
                .output_tokens_details
                .as_ref()
                .map(|details| details.reasoning_tokens),
            prompt_tokens_details: self.input_tokens_details.as_ref().map(|details| {
                PromptTokenUsageInfo {
                    cached_tokens: details.cached_tokens,
                }
            }),
        }
    }
}
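The item enums are internally tagged on `type`, so each variant serializes to the flat object shape the Responses API uses, and the `skip_serializing_if` attributes keep empty collections and `None` fields off the wire. A small sketch (not part of the commit) of the resulting JSON, using the constructors from response.rs:

```rust
use sglang_router_rs::protocols::openai::responses::types::{
    ResponseContentPart, ResponseOutputItem,
};

#[test]
fn output_items_serialize_with_a_type_tag() {
    let item = ResponseOutputItem::new_message(
        "msg_1".to_string(),
        "assistant".to_string(),
        vec![ResponseContentPart::new_text("Hello".to_string(), vec![], None)],
        "completed".to_string(),
    );

    let json = serde_json::to_value(&item).unwrap();
    assert_eq!(json["type"], "message");
    assert_eq!(json["content"][0]["type"], "output_text");
    // Empty `annotations` and the missing `logprobs` are skipped entirely.
    assert!(json["content"][0].get("annotations").is_none());
    assert!(json["content"][0].get("logprobs").is_none());
}
```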
sgl-router/tests/responses_api_test.rs (new file, 208 lines)
@@ -0,0 +1,208 @@
// Integration test for Responses API

use sglang_router_rs::protocols::common::GenerationRequest;
use sglang_router_rs::protocols::openai::responses::request::ResponseInput;
use sglang_router_rs::protocols::openai::responses::*;

#[test]
fn test_responses_request_creation() {
    let request = ResponsesRequest {
        background: false,
        include: None,
        input: ResponseInput::Text("Hello, world!".to_string()),
        instructions: Some("Be helpful".to_string()),
        max_output_tokens: Some(100),
        max_tool_calls: None,
        metadata: None,
        model: Some("test-model".to_string()),
        parallel_tool_calls: true,
        previous_response_id: None,
        reasoning: Some(ResponseReasoningParam {
            effort: Some(ReasoningEffort::Medium),
        }),
        service_tier: ServiceTier::Auto,
        store: true,
        stream: false,
        temperature: Some(0.7),
        tool_choice: ToolChoice::Auto,
        tools: vec![ResponseTool {
            r#type: ResponseToolType::WebSearchPreview,
        }],
        top_logprobs: 5,
        top_p: Some(0.9),
        truncation: Truncation::Disabled,
        user: Some("test-user".to_string()),
        request_id: "resp_test123".to_string(),
        priority: 0,
        frequency_penalty: 0.0,
        presence_penalty: 0.0,
        stop: None,
        top_k: -1,
        min_p: 0.0,
        repetition_penalty: 1.0,
    };

    // Test GenerationRequest trait implementation
    assert!(!request.is_stream());
    assert_eq!(request.get_model(), Some("test-model"));
    let routing_text = request.extract_text_for_routing();
    assert_eq!(routing_text, "Hello, world!");
}

#[test]
fn test_sampling_params_conversion() {
    let request = ResponsesRequest {
        background: false,
        include: None,
        input: ResponseInput::Text("Test".to_string()),
        instructions: None,
        max_output_tokens: Some(50),
        max_tool_calls: None,
        metadata: None,
        model: Some("test-model".to_string()),
        parallel_tool_calls: true, // Use default true
        previous_response_id: None,
        reasoning: None,
        service_tier: ServiceTier::Auto,
        store: true, // Use default true
        stream: false,
        temperature: Some(0.8),
        tool_choice: ToolChoice::Auto,
        tools: vec![],
        top_logprobs: 0, // Use default 0
        top_p: Some(0.95),
        truncation: Truncation::Auto,
        user: None,
        request_id: "resp_test456".to_string(),
        priority: 0,
        frequency_penalty: 0.1,
        presence_penalty: 0.2,
        stop: None,
        top_k: 10,
        min_p: 0.05,
        repetition_penalty: 1.1,
    };

    let params = request.to_sampling_params(1000, None);

    // Check that parameters are converted correctly
    assert!(params.contains_key("temperature"));
    assert!(params.contains_key("top_p"));
    assert!(params.contains_key("frequency_penalty"));
    assert!(params.contains_key("max_new_tokens"));
}

#[test]
fn test_responses_response_creation() {
    let response = ResponsesResponse::new(
        "resp_test789".to_string(),
        "test-model".to_string(),
        ResponseStatus::Completed,
    );

    assert_eq!(response.id, "resp_test789");
    assert_eq!(response.model, "test-model");
    assert!(response.is_complete());
    assert!(!response.is_in_progress());
    assert!(!response.is_failed());
}

#[test]
fn test_usage_conversion() {
    let usage_info = UsageInfo::new_with_cached(15, 25, Some(8), 3);
    let response_usage = usage_info.to_response_usage();

    assert_eq!(response_usage.input_tokens, 15);
    assert_eq!(response_usage.output_tokens, 25);
    assert_eq!(response_usage.total_tokens, 40);

    // Check details are converted correctly
    assert!(response_usage.input_tokens_details.is_some());
    assert_eq!(
        response_usage
            .input_tokens_details
            .as_ref()
            .unwrap()
            .cached_tokens,
        3
    );

    assert!(response_usage.output_tokens_details.is_some());
    assert_eq!(
        response_usage
            .output_tokens_details
            .as_ref()
            .unwrap()
            .reasoning_tokens,
        8
    );

    // Test reverse conversion
    let back_to_usage = response_usage.to_usage_info();
    assert_eq!(back_to_usage.prompt_tokens, 15);
    assert_eq!(back_to_usage.completion_tokens, 25);
    assert_eq!(back_to_usage.reasoning_tokens, Some(8));
}

#[test]
fn test_reasoning_param_default() {
    let param = ResponseReasoningParam {
        effort: Some(ReasoningEffort::Medium),
    };

    // Test JSON serialization/deserialization preserves default
    let json = serde_json::to_string(&param).unwrap();
    let parsed: ResponseReasoningParam = serde_json::from_str(&json).unwrap();

    assert!(matches!(parsed.effort, Some(ReasoningEffort::Medium)));
}

#[test]
fn test_json_serialization() {
    let request = ResponsesRequest {
        background: true,
        include: None,
        input: ResponseInput::Text("Test input".to_string()),
        instructions: Some("Test instructions".to_string()),
        max_output_tokens: Some(200),
        max_tool_calls: Some(5),
        metadata: None,
        model: Some("gpt-4".to_string()),
        parallel_tool_calls: false,
        previous_response_id: None,
        reasoning: Some(ResponseReasoningParam {
            effort: Some(ReasoningEffort::High),
        }),
        service_tier: ServiceTier::Priority,
        store: false,
        stream: true,
        temperature: Some(0.9),
        tool_choice: ToolChoice::Required,
        tools: vec![ResponseTool {
            r#type: ResponseToolType::CodeInterpreter,
        }],
        top_logprobs: 10,
        top_p: Some(0.8),
        truncation: Truncation::Auto,
        user: Some("test_user".to_string()),
        request_id: "resp_comprehensive_test".to_string(),
        priority: 1,
        frequency_penalty: 0.3,
        presence_penalty: 0.4,
        stop: None,
        top_k: 50,
        min_p: 0.1,
        repetition_penalty: 1.2,
    };

    // Test that everything can be serialized to JSON and back
    let json = serde_json::to_string(&request).expect("Serialization should work");
    let parsed: ResponsesRequest =
        serde_json::from_str(&json).expect("Deserialization should work");

    assert_eq!(parsed.request_id, "resp_comprehensive_test");
    assert_eq!(parsed.model, Some("gpt-4".to_string()));
    assert!(parsed.background);
    assert!(parsed.stream);
    assert_eq!(parsed.tools.len(), 1);
}
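Since these tests live under sgl-router/tests/, they build as their own integration-test binary; from the sgl-router directory, `cargo test --test responses_api_test` runs just this file.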