[router][grpc] Support v1/responses API (#11926)
@@ -778,7 +778,9 @@ class OpenAIServingResponses(OpenAIServingChat):
         # Update the status to "cancelled"
         response.status = "cancelled"

         # Abort the request
+        # The response_id is the same as the rid used when submitting the request
+        self.tokenizer_manager.abort_request(rid=response_id)

         if task := self.background_tasks.get(response_id):
             task.cancel()
             try:
@@ -52,6 +52,9 @@ pub type ConversationMetadata = JsonMap<String, Value>;
 /// Input payload for creating a conversation
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct NewConversation {
+    /// Optional conversation ID (if None, a random ID will be generated)
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub id: Option<ConversationId>,
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub metadata: Option<ConversationMetadata>,
 }
@@ -68,7 +71,7 @@ pub struct Conversation {
 impl Conversation {
     pub fn new(new_conversation: NewConversation) -> Self {
         Self {
-            id: ConversationId::new(),
+            id: new_conversation.id.unwrap_or_default(),
             created_at: Utc::now(),
             metadata: new_conversation.metadata,
         }
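Note: a minimal, self-contained sketch of the id-override semantics introduced here, using a plain `String` as a stand-in for `ConversationId` (whose `Default` impl is assumed to generate a fresh random ID):

```rust
use serde::Deserialize;
use std::time::{SystemTime, UNIX_EPOCH};

#[derive(Debug, Deserialize, Default)]
struct NewConversation {
    #[serde(default)]
    id: Option<String>, // stand-in for Option<ConversationId>
}

fn generated_id() -> String {
    // Stand-in for the random ID a real ConversationId::default() would produce.
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .subsec_nanos();
    format!("conv_{nanos:08x}")
}

fn main() {
    // Client supplied an ID: it is preserved.
    let with_id: NewConversation = serde_json::from_str(r#"{"id": "conv_abc123"}"#).unwrap();
    assert_eq!(with_id.id.as_deref(), Some("conv_abc123"));

    // No ID in the payload: a fresh one is generated, mirroring
    // `new_conversation.id.unwrap_or_default()` above.
    let without_id: NewConversation = serde_json::from_str("{}").unwrap();
    let id = without_id.id.unwrap_or_else(generated_id);
    assert!(id.starts_with("conv_"));
    println!("resolved id: {id}");
}
```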
@@ -180,21 +180,49 @@ impl McpClientManager {
         let backoff = ExponentialBackoffBuilder::new()
             .with_initial_interval(Duration::from_secs(1))
             .with_max_interval(Duration::from_secs(30))
-            .with_max_elapsed_time(Some(Duration::from_secs(120)))
+            .with_max_elapsed_time(Some(Duration::from_secs(30)))
             .build();

         backoff::future::retry(backoff, || async {
             match Self::connect_server_impl(config).await {
                 Ok(client) => Ok(client),
                 Err(e) => {
-                    tracing::warn!("Failed to connect to '{}', retrying: {}", config.name, e);
-                    Err(backoff::Error::transient(e))
+                    if Self::is_permanent_error(&e) {
+                        tracing::error!(
+                            "Permanent error connecting to '{}': {} - not retrying",
+                            config.name,
+                            e
+                        );
+                        Err(backoff::Error::permanent(e))
+                    } else {
+                        tracing::warn!("Failed to connect to '{}', retrying: {}", config.name, e);
+                        Err(backoff::Error::transient(e))
+                    }
                 }
             }
         })
         .await
     }

+    /// Determine if an error is permanent (should not retry) or transient (should retry)
+    fn is_permanent_error(error: &McpError) -> bool {
+        match error {
+            McpError::Config(_) => true,
+            McpError::Auth(_) => true,
+            McpError::ServerNotFound(_) => true,
+            McpError::Transport(_) => true,
+            McpError::ConnectionFailed(msg) => {
+                msg.contains("initialize")
+                    || msg.contains("connection closed")
+                    || msg.contains("connection refused")
+                    || msg.contains("invalid URL")
+                    || msg.contains("not found")
+            }
+            // Tool-related errors shouldn't occur during connection
+            _ => false,
+        }
+    }

     /// Internal implementation of server connection
     async fn connect_server_impl(
         config: &McpServerConfig,
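Note: the permanent/transient split above maps directly onto the `backoff` crate's two error constructors. A self-contained sketch of the pattern (assuming `backoff = { version = "0.4", features = ["tokio"] }`; the error type and connect function are stand-ins, not the router's real ones):

```rust
use std::{
    sync::atomic::{AtomicU32, Ordering},
    time::Duration,
};

use backoff::ExponentialBackoffBuilder;

#[derive(Debug)]
#[allow(dead_code)]
enum ConnectError {
    InvalidUrl,  // permanent: retrying cannot help
    Unreachable, // transient: worth retrying
}

fn is_permanent(e: &ConnectError) -> bool {
    matches!(e, ConnectError::InvalidUrl)
}

// Stand-in for connect_server_impl: fails twice, then succeeds.
async fn connect_once(attempt: u32) -> Result<&'static str, ConnectError> {
    if attempt < 3 {
        Err(ConnectError::Unreachable)
    } else {
        Ok("connected")
    }
}

#[tokio::main]
async fn main() {
    let policy = ExponentialBackoffBuilder::new()
        .with_initial_interval(Duration::from_millis(10))
        .with_max_elapsed_time(Some(Duration::from_secs(5)))
        .build();

    let attempts = AtomicU32::new(0);
    let result = backoff::future::retry(policy, || async {
        let n = attempts.fetch_add(1, Ordering::SeqCst) + 1;
        match connect_once(n).await {
            Ok(s) => Ok(s),
            // permanent() stops the retry loop immediately;
            // transient() lets the policy schedule another attempt.
            Err(e) if is_permanent(&e) => Err(backoff::Error::permanent(e)),
            Err(e) => Err(backoff::Error::transient(e)),
        }
    })
    .await;

    println!("{result:?} after {} attempts", attempts.load(Ordering::SeqCst));
}
```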
@@ -411,6 +411,14 @@ fn default_repetition_penalty() -> f32 {
     1.0
 }

+fn default_temperature() -> Option<f32> {
+    Some(1.0)
+}
+
+fn default_top_p() -> Option<f32> {
+    Some(1.0)
+}
+
 // ============================================================================
 // Request/Response Types
 // ============================================================================
@@ -477,7 +485,10 @@ pub struct ResponsesRequest {
     pub stream: Option<bool>,

     /// Temperature for sampling
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(
+        default = "default_temperature",
+        skip_serializing_if = "Option::is_none"
+    )]
     pub temperature: Option<f32>,

     /// Tool choice behavior
@@ -493,7 +504,7 @@ pub struct ResponsesRequest {
     pub top_logprobs: Option<u32>,

     /// Top-p sampling parameter
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default = "default_top_p", skip_serializing_if = "Option::is_none")]
     pub top_p: Option<f32>,

     /// Truncation behavior
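Note: with `default = "default_temperature"`, serde applies the default only when the field is *absent* from the payload; an explicit JSON `null` still deserializes to `None`. A self-contained sketch with a stand-in struct:

```rust
use serde::Deserialize;

fn default_temperature() -> Option<f32> {
    Some(1.0)
}

#[derive(Debug, Deserialize)]
struct Params {
    #[serde(default = "default_temperature")]
    temperature: Option<f32>,
}

fn main() {
    let missing: Params = serde_json::from_str("{}").unwrap();
    assert_eq!(missing.temperature, Some(1.0)); // field absent -> default applies

    let explicit_null: Params = serde_json::from_str(r#"{"temperature": null}"#).unwrap();
    assert_eq!(explicit_null.temperature, None); // null -> None, default NOT applied

    let set: Params = serde_json::from_str(r#"{"temperature": 0.2}"#).unwrap();
    assert_eq!(set.temperature, Some(0.2));
}
```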
@@ -6,6 +6,7 @@ pub mod context;
 pub mod pd_router;
 pub mod pipeline;
 pub mod processing;
+pub mod responses;
 pub mod router;
 pub mod streaming;
 pub mod utils;
@@ -4,6 +4,8 @@
 //! that transform a RequestContext through its lifecycle.

 use std::{
+    borrow::Cow,
+    collections::HashMap,
     sync::Arc,
     time::{Instant, SystemTime, UNIX_EPOCH},
 };
@@ -12,15 +14,20 @@ use async_trait::async_trait;
 use axum::response::{IntoResponse, Response};
 use proto::DisaggregatedParams;
 use rand::Rng;
+use tokio::sync::RwLock;
 use tracing::{debug, error, warn};
 use uuid::Uuid;

-use super::{context::*, processing, streaming, utils};
+use super::{context::*, processing, responses::BackgroundTaskInfo, streaming, utils};
 use crate::{
     core::{ConnectionMode, Worker, WorkerRegistry, WorkerType},
     grpc_client::proto,
     policies::PolicyRegistry,
-    protocols::{chat::ChatCompletionRequest, common::InputIds, generate::GenerateRequest},
+    protocols::{
+        chat::{ChatCompletionRequest, ChatCompletionResponse},
+        common::InputIds,
+        generate::GenerateRequest,
+    },
     reasoning_parser::ParserFactory as ReasoningParserFactory,
     tokenizer::traits::Tokenizer,
     tool_parser::ParserFactory as ToolParserFactory,
@@ -131,7 +138,7 @@ impl PreparationStage {
             token_ids,
             processed_messages: Some(processed_messages),
             tool_constraints: tool_call_constraint,
-            filtered_request: if matches!(body_ref, std::borrow::Cow::Owned(_)) {
+            filtered_request: if matches!(body_ref, Cow::Owned(_)) {
                 Some(body_ref.into_owned())
             } else {
                 None
@@ -1090,4 +1097,86 @@ impl RequestPipeline {
             None => utils::internal_error_static("No response produced"),
         }
     }
+
+    /// Execute chat pipeline for responses endpoint (Result-based for easier composition)
+    ///
+    /// This is used by the responses module and returns Result instead of Response.
+    /// It also supports background mode cancellation via background_tasks.
+    pub async fn execute_chat_for_responses(
+        &self,
+        request: Arc<ChatCompletionRequest>,
+        headers: Option<http::HeaderMap>,
+        model_id: Option<String>,
+        components: Arc<SharedComponents>,
+        response_id: Option<String>,
+        background_tasks: Option<Arc<RwLock<HashMap<String, BackgroundTaskInfo>>>>,
+    ) -> Result<ChatCompletionResponse, String> {
+        let mut ctx = RequestContext::for_chat(request, headers, model_id, components);
+
+        // Execute each stage in sequence
+        for (idx, stage) in self.stages.iter().enumerate() {
+            match stage.execute(&mut ctx).await {
+                Ok(Some(_response)) => {
+                    // Streaming not supported for responses sync mode
+                    return Err("Streaming is not supported in this context".to_string());
+                }
+                Ok(None) => {
+                    let stage_name = stage.name();
+
+                    // After ClientAcquisitionStage, store client for background task cancellation
+                    if stage_name == "ClientAcquisition" {
+                        if let (Some(ref clients), Some(ref resp_id), Some(ref tasks)) =
+                            (&ctx.state.clients, &response_id, &background_tasks)
+                        {
+                            let client_to_store = match clients {
+                                ClientSelection::Single { client } => client.clone(),
+                                ClientSelection::Dual { decode, .. } => decode.clone(),
+                            };
+
+                            if let Some(task_info) = tasks.write().await.get_mut(resp_id.as_str()) {
+                                *task_info.client.write().await = Some(client_to_store);
+                                debug!("Stored client for response_id: {}", resp_id);
+                            }
+                        }
+                    }
+
+                    // After DispatchMetadataStage, store grpc_request_id for background task cancellation
+                    if stage_name == "DispatchMetadata" {
+                        if let (Some(ref dispatch), Some(ref resp_id), Some(ref tasks)) =
+                            (&ctx.state.dispatch, &response_id, &background_tasks)
+                        {
+                            let grpc_request_id = dispatch.request_id.clone();
+
+                            if let Some(task_info) = tasks.write().await.get_mut(resp_id.as_str()) {
+                                task_info.grpc_request_id = grpc_request_id.clone();
+                                debug!("Stored grpc_request_id for response_id: {}", resp_id);
+                            }
+                        }
+                    }
+
+                    // Continue to next stage
+                    continue;
+                }
+                Err(response) => {
+                    // Error occurred
+                    error!(
+                        "Stage {} ({}) failed with status {}",
+                        idx + 1,
+                        stage.name(),
+                        response.status()
+                    );
+                    return Err(format!("Pipeline stage {} failed", stage.name()));
+                }
+            }
+        }
+
+        // Extract final response
+        match ctx.state.response.final_response {
+            Some(FinalResponse::Chat(response)) => Ok(response),
+            Some(FinalResponse::Generate(_)) => {
+                Err("Internal error: wrong response type".to_string())
+            }
+            None => Err("No response produced".to_string()),
+        }
+    }
 }
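Note: a dependency-free sketch of the control flow in `execute_chat_for_responses` — `Ok(None)` advances to the next stage (with per-stage hooks keyed on the stage name), `Ok(Some(_))` would be an early streamed response and is rejected in this context, and `Err` aborts the pipeline. Stage names and types here are stand-ins, not the router's real ones:

```rust
struct Ctx {
    final_response: Option<String>,
}

type StageResult = Result<Option<String>, String>;

fn run_pipeline(stages: &[(&str, fn(&mut Ctx) -> StageResult)]) -> Result<String, String> {
    let mut ctx = Ctx { final_response: None };
    for (name, stage) in stages {
        match stage(&mut ctx) {
            Ok(Some(_early)) => return Err("streaming not supported here".to_string()),
            Ok(None) => continue, // hook per-stage side effects on `name` here if needed
            Err(e) => return Err(format!("stage {name} failed: {e}")),
        }
    }
    ctx.final_response
        .ok_or_else(|| "no response produced".to_string())
}

fn main() {
    let stages: &[(&str, fn(&mut Ctx) -> StageResult)] = &[
        ("Preparation", |_| Ok(None)),
        ("Dispatch", |ctx| {
            ctx.final_response = Some("done".to_string());
            Ok(None)
        }),
    ];
    println!("{:?}", run_pipeline(stages));
}
```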
@@ -408,10 +408,7 @@ impl ResponseProcessor {
                             tool_type: "function".to_string(),
                             function: FunctionCallResponse {
                                 name: tc.function.name,
-                                arguments: Some(
-                                    serde_json::to_string(&tc.function.arguments)
-                                        .unwrap_or_else(|_| "{}".to_string()),
-                                ),
+                                arguments: Some(tc.function.arguments),
                             },
                         }
                     })
sgl-router/src/routers/grpc/responses/conversions.rs (new file, 365 lines)
@@ -0,0 +1,365 @@
//! Conversion utilities for translating between /v1/responses and /v1/chat/completions formats
//!
//! This module implements the conversion approach where:
//! 1. ResponsesRequest → ChatCompletionRequest (for backend processing)
//! 2. ChatCompletionResponse → ResponsesResponse (for client response)
//!
//! This allows the gRPC router to reuse the existing chat pipeline infrastructure
//! without requiring Python backend changes.

use crate::protocols::{
    chat::{ChatCompletionRequest, ChatCompletionResponse, ChatMessage, UserMessageContent},
    common::{FunctionCallResponse, StreamOptions, ToolCall, UsageInfo},
    responses::{
        ResponseContentPart, ResponseInput, ResponseInputOutputItem, ResponseOutputItem,
        ResponseStatus, ResponsesRequest, ResponsesResponse, ResponsesUsage,
    },
};

/// Convert a ResponsesRequest to ChatCompletionRequest for processing through the chat pipeline
///
/// # Conversion Logic
/// - `input` (text/items) → `messages` (chat messages)
/// - `instructions` → system message (prepended)
/// - `max_output_tokens` → `max_completion_tokens`
/// - Tool-related fields are passed through
/// - Response-specific fields (previous_response_id, conversation) are handled by router
pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest, String> {
    let mut messages = Vec::new();

    // 1. Add system message if instructions provided
    if let Some(instructions) = &req.instructions {
        messages.push(ChatMessage::System {
            content: instructions.clone(),
            name: None,
        });
    }

    // 2. Convert input to chat messages
    match &req.input {
        ResponseInput::Text(text) => {
            // Simple text input → user message
            messages.push(ChatMessage::User {
                content: UserMessageContent::Text(text.clone()),
                name: None,
            });
        }
        ResponseInput::Items(items) => {
            // Structured items → convert each to appropriate chat message
            for item in items {
                match item {
                    ResponseInputOutputItem::Message { role, content, .. } => {
                        // Extract text from content parts
                        let text = extract_text_from_content(content);

                        match role.as_str() {
                            "user" => {
                                messages.push(ChatMessage::User {
                                    content: UserMessageContent::Text(text),
                                    name: None,
                                });
                            }
                            "assistant" => {
                                messages.push(ChatMessage::Assistant {
                                    content: Some(text),
                                    name: None,
                                    tool_calls: None,
                                    reasoning_content: None,
                                });
                            }
                            "system" => {
                                messages.push(ChatMessage::System {
                                    content: text,
                                    name: None,
                                });
                            }
                            _ => {
                                // Unknown role, treat as user message
                                messages.push(ChatMessage::User {
                                    content: UserMessageContent::Text(text),
                                    name: None,
                                });
                            }
                        }
                    }
                    ResponseInputOutputItem::FunctionToolCall {
                        id,
                        name,
                        arguments,
                        output,
                        ..
                    } => {
                        // Tool call from history - add as assistant message with tool call
                        // followed by tool response if output exists

                        // Add assistant message with tool_calls (the LLM's decision)
                        messages.push(ChatMessage::Assistant {
                            content: None,
                            name: None,
                            tool_calls: Some(vec![ToolCall {
                                id: id.clone(),
                                tool_type: "function".to_string(),
                                function: FunctionCallResponse {
                                    name: name.clone(),
                                    arguments: Some(arguments.clone()),
                                },
                            }]),
                            reasoning_content: None,
                        });

                        // Add tool result message if output exists
                        if let Some(output_text) = output {
                            messages.push(ChatMessage::Tool {
                                content: output_text.clone(),
                                tool_call_id: id.clone(),
                            });
                        }
                    }
                    ResponseInputOutputItem::Reasoning { content, .. } => {
                        // Reasoning content - add as assistant message with reasoning_content
                        let reasoning_text = content
                            .iter()
                            .map(|c| match c {
                                crate::protocols::responses::ResponseReasoningContent::ReasoningText { text } => {
                                    text.as_str()
                                }
                            })
                            .collect::<Vec<_>>()
                            .join("\n");

                        messages.push(ChatMessage::Assistant {
                            content: None,
                            name: None,
                            tool_calls: None,
                            reasoning_content: Some(reasoning_text),
                        });
                    }
                }
            }
        }
    }

    // Ensure we have at least one message
    if messages.is_empty() {
        return Err("Request must contain at least one message".to_string());
    }

    // 3. Build ChatCompletionRequest
    let is_streaming = req.stream.unwrap_or(false);

    Ok(ChatCompletionRequest {
        messages,
        model: req.model.clone().unwrap_or_else(|| "default".to_string()),
        temperature: req.temperature,
        max_completion_tokens: req.max_output_tokens,
        stream: is_streaming,
        stream_options: if is_streaming {
            Some(StreamOptions {
                include_usage: Some(true),
            })
        } else {
            None
        },
        parallel_tool_calls: req.parallel_tool_calls,
        top_logprobs: req.top_logprobs,
        top_p: req.top_p,
        skip_special_tokens: true, // Always skip special tokens // TODO: except for gpt-oss
        // Note: tools and tool_choice will be handled separately for MCP transformation
        tools: None,       // Will be set by caller if needed
        tool_choice: None, // Will be set by caller if needed
        ..Default::default()
    })
}

/// Extract text content from ResponseContentPart array
fn extract_text_from_content(content: &[ResponseContentPart]) -> String {
    content
        .iter()
        .filter_map(|part| match part {
            ResponseContentPart::InputText { text } => Some(text.as_str()),
            ResponseContentPart::OutputText { text, .. } => Some(text.as_str()),
            _ => None,
        })
        .collect::<Vec<_>>()
        .join("")
}

/// Convert a ChatCompletionResponse to ResponsesResponse
///
/// # Conversion Logic
/// - `id` → `id` (pass through)
/// - `model` → `model` (pass through)
/// - `choices[0].message` → `output` array (convert to ResponseOutputItem::Message)
/// - `choices[0].finish_reason` → determines `status` (stop/length → Completed)
/// - `created` timestamp → `created_at`
pub fn chat_to_responses(
    chat_resp: &ChatCompletionResponse,
    original_req: &ResponsesRequest,
) -> Result<ResponsesResponse, String> {
    // Extract the first choice (responses API doesn't support n>1)
    let choice = chat_resp
        .choices
        .first()
        .ok_or_else(|| "Chat response contains no choices".to_string())?;

    // Convert assistant message to output items
    let mut output: Vec<ResponseOutputItem> = Vec::new();

    // Convert message content to output item
    if let Some(content) = &choice.message.content {
        if !content.is_empty() {
            output.push(ResponseOutputItem::Message {
                id: format!("msg_{}", chat_resp.id),
                role: "assistant".to_string(),
                content: vec![ResponseContentPart::OutputText {
                    text: content.clone(),
                    annotations: vec![],
                    logprobs: choice.logprobs.clone(),
                }],
                status: "completed".to_string(),
            });
        }
    }

    // Convert reasoning content if present (O1-style models)
    if let Some(reasoning) = &choice.message.reasoning_content {
        if !reasoning.is_empty() {
            output.push(ResponseOutputItem::Reasoning {
                id: format!("reasoning_{}", chat_resp.id),
                summary: vec![],
                content: vec![
                    crate::protocols::responses::ResponseReasoningContent::ReasoningText {
                        text: reasoning.clone(),
                    },
                ],
                status: Some("completed".to_string()),
            });
        }
    }

    // Convert tool calls if present
    if let Some(tool_calls) = &choice.message.tool_calls {
        for tool_call in tool_calls {
            output.push(ResponseOutputItem::FunctionToolCall {
                id: tool_call.id.clone(),
                name: tool_call.function.name.clone(),
                arguments: tool_call.function.arguments.clone().unwrap_or_default(),
                output: None, // Tool hasn't been executed yet
                status: "in_progress".to_string(),
            });
        }
    }

    // Determine response status based on finish_reason
    let status = match choice.finish_reason.as_deref() {
        Some("stop") | Some("length") => ResponseStatus::Completed,
        Some("tool_calls") => ResponseStatus::InProgress, // Waiting for tool execution
        Some("failed") | Some("error") => ResponseStatus::Failed,
        _ => ResponseStatus::Completed, // Default to completed
    };

    // Convert usage from Usage to UsageInfo, then wrap in ResponsesUsage
    let usage = chat_resp.usage.as_ref().map(|u| {
        let usage_info = UsageInfo {
            prompt_tokens: u.prompt_tokens,
            completion_tokens: u.completion_tokens,
            total_tokens: u.total_tokens,
            reasoning_tokens: u
                .completion_tokens_details
                .as_ref()
                .and_then(|d| d.reasoning_tokens),
            prompt_tokens_details: None, // Chat response doesn't have this
        };
        ResponsesUsage::Classic(usage_info)
    });

    // Generate response
    Ok(ResponsesResponse {
        id: chat_resp.id.clone(),
        object: "response".to_string(),
        created_at: chat_resp.created as i64,
        status,
        error: None,
        incomplete_details: None,
        instructions: original_req.instructions.clone(),
        max_output_tokens: original_req.max_output_tokens,
        model: chat_resp.model.clone(),
        output,
        parallel_tool_calls: original_req.parallel_tool_calls.unwrap_or(true),
        previous_response_id: original_req.previous_response_id.clone(),
        reasoning: None, // TODO: Map reasoning effort if needed
        store: original_req.store.unwrap_or(true),
        temperature: original_req.temperature,
        text: None,
        tool_choice: "auto".to_string(), // TODO: Map from original request
        tools: original_req.tools.clone().unwrap_or_default(),
        top_p: original_req.top_p,
        truncation: None,
        usage,
        user: None, // No user field in chat response
        metadata: original_req.metadata.clone().unwrap_or_default(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_input_conversion() {
        let req = ResponsesRequest {
            input: ResponseInput::Text("Hello, world!".to_string()),
            instructions: Some("You are a helpful assistant.".to_string()),
            model: Some("gpt-4".to_string()),
            temperature: Some(0.7),
            ..Default::default()
        };

        let chat_req = responses_to_chat(&req).unwrap();
        assert_eq!(chat_req.messages.len(), 2); // system + user
        assert_eq!(chat_req.model, "gpt-4");
        assert_eq!(chat_req.temperature, Some(0.7));
    }

    #[test]
    fn test_items_input_conversion() {
        let req = ResponsesRequest {
            input: ResponseInput::Items(vec![
                ResponseInputOutputItem::Message {
                    id: "msg_1".to_string(),
                    role: "user".to_string(),
                    content: vec![ResponseContentPart::InputText {
                        text: "Hello!".to_string(),
                    }],
                    status: None,
                },
                ResponseInputOutputItem::Message {
                    id: "msg_2".to_string(),
                    role: "assistant".to_string(),
                    content: vec![ResponseContentPart::OutputText {
                        text: "Hi there!".to_string(),
                        annotations: vec![],
                        logprobs: None,
                    }],
                    status: None,
                },
            ]),
            ..Default::default()
        };

        let chat_req = responses_to_chat(&req).unwrap();
        assert_eq!(chat_req.messages.len(), 2); // user + assistant
    }

    #[test]
    fn test_empty_input_error() {
        let req = ResponsesRequest {
            input: ResponseInput::Text("".to_string()),
            ..Default::default()
        };

        // Empty text should still create a user message, so this should succeed
        let result = responses_to_chat(&req);
        assert!(result.is_ok());
    }
}
sgl-router/src/routers/grpc/responses/handlers.rs (new file, 1290 lines — diff suppressed because it is too large)
sgl-router/src/routers/grpc/responses/mod.rs (new file, 20 lines)
@@ -0,0 +1,20 @@
//! gRPC Router `/v1/responses` endpoint implementation
//!
//! This module handles all responses-specific logic including:
//! - Request validation
//! - Conversation history and response chain loading
//! - Background mode execution
//! - Streaming support
//! - MCP tool loop wrapper
//! - Response persistence

// Module declarations
mod conversions;
mod handlers;
pub mod streaming;
pub mod tool_loop;
pub mod types;

// Public exports
pub use handlers::{cancel_response_impl, get_response_impl, route_responses};
pub use types::BackgroundTaskInfo;
sgl-router/src/routers/grpc/responses/streaming.rs (new file, 574 lines)
@@ -0,0 +1,574 @@
//! Streaming infrastructure for /v1/responses endpoint

use std::collections::HashMap;

use bytes::Bytes;
use serde_json::json;
use tokio::sync::mpsc;
use uuid::Uuid;

use crate::protocols::chat::ChatCompletionStreamResponse;

pub(super) enum OutputItemType {
    Message,
    McpListTools,
    McpCall,
    Reasoning,
}

/// Status of an output item
#[derive(Debug, Clone, PartialEq)]
enum ItemStatus {
    InProgress,
    Completed,
}

/// State tracking for a single output item
#[derive(Debug, Clone)]
struct OutputItemState {
    output_index: usize,
    status: ItemStatus,
}

// ============================================================================
// Streaming Event Emitter
// ============================================================================

/// OpenAI-compatible event emitter for /v1/responses streaming
///
/// Manages state and sequence numbers to emit proper event types:
/// - response.created
/// - response.in_progress
/// - response.output_item.added
/// - response.content_part.added
/// - response.output_text.delta (multiple)
/// - response.output_text.done
/// - response.content_part.done
/// - response.output_item.done
/// - response.completed
/// - response.mcp_list_tools.in_progress
/// - response.mcp_list_tools.completed
/// - response.mcp_call.in_progress
/// - response.mcp_call_arguments.delta
/// - response.mcp_call_arguments.done
/// - response.mcp_call.completed
/// - response.mcp_call.failed
pub(super) struct ResponseStreamEventEmitter {
    sequence_number: u64,
    response_id: String,
    model: String,
    created_at: u64,
    message_id: String,
    accumulated_text: String,
    has_emitted_created: bool,
    has_emitted_in_progress: bool,
    has_emitted_output_item_added: bool,
    has_emitted_content_part_added: bool,
    // MCP call tracking
    mcp_call_accumulated_args: HashMap<String, String>,
    // Output item tracking
    output_items: Vec<OutputItemState>,
    next_output_index: usize,
    current_message_output_index: Option<usize>, // Tracks output_index of current message
    current_item_id: Option<String>,             // Tracks item_id of current item
}

impl ResponseStreamEventEmitter {
    pub(super) fn new(response_id: String, model: String, created_at: u64) -> Self {
        let message_id = format!("msg_{}", Uuid::new_v4());

        Self {
            sequence_number: 0,
            response_id,
            model,
            created_at,
            message_id,
            accumulated_text: String::new(),
            has_emitted_created: false,
            has_emitted_in_progress: false,
            has_emitted_output_item_added: false,
            has_emitted_content_part_added: false,
            mcp_call_accumulated_args: HashMap::new(),
            output_items: Vec::new(),
            next_output_index: 0,
            current_message_output_index: None,
            current_item_id: None,
        }
    }

    fn next_sequence(&mut self) -> u64 {
        let seq = self.sequence_number;
        self.sequence_number += 1;
        seq
    }

    pub(super) fn emit_created(&mut self) -> serde_json::Value {
        self.has_emitted_created = true;
        json!({
            "type": "response.created",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "created_at": self.created_at,
                "status": "in_progress",
                "model": self.model,
                "output": []
            }
        })
    }

    pub(super) fn emit_in_progress(&mut self) -> serde_json::Value {
        self.has_emitted_in_progress = true;
        json!({
            "type": "response.in_progress",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "status": "in_progress"
            }
        })
    }

    pub(super) fn emit_content_part_added(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        self.has_emitted_content_part_added = true;
        json!({
            "type": "response.content_part.added",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "part": {
                "type": "text",
                "text": ""
            }
        })
    }

    pub(super) fn emit_text_delta(
        &mut self,
        delta: &str,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        self.accumulated_text.push_str(delta);
        json!({
            "type": "response.output_text.delta",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "delta": delta
        })
    }

    pub(super) fn emit_text_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_text.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "text": self.accumulated_text.clone()
        })
    }

    pub(super) fn emit_content_part_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.content_part.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "part": {
                "type": "text",
                "text": self.accumulated_text.clone()
            }
        })
    }

    pub(super) fn emit_completed(
        &mut self,
        usage: Option<&serde_json::Value>,
    ) -> serde_json::Value {
        let mut response = json!({
            "type": "response.completed",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "created_at": self.created_at,
                "status": "completed",
                "model": self.model,
                "output": [{
                    "id": self.message_id.clone(),
                    "type": "message",
                    "role": "assistant",
                    "content": [{
                        "type": "text",
                        "text": self.accumulated_text.clone()
                    }]
                }]
            }
        });

        if let Some(usage_val) = usage {
            response["response"]["usage"] = usage_val.clone();
        }

        response
    }

    // ========================================================================
    // MCP Event Emission Methods
    // ========================================================================

    pub(super) fn emit_mcp_list_tools_in_progress(
        &mut self,
        output_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_list_tools.in_progress",
            "sequence_number": self.next_sequence(),
            "output_index": output_index
        })
    }

    pub(super) fn emit_mcp_list_tools_completed(
        &mut self,
        output_index: usize,
        tools: &[crate::mcp::ToolInfo],
    ) -> serde_json::Value {
        let tool_items: Vec<_> = tools
            .iter()
            .map(|t| {
                json!({
                    "name": t.name,
                    "description": t.description,
                    "input_schema": t.parameters.clone().unwrap_or_else(|| json!({
                        "type": "object",
                        "properties": {},
                        "required": []
                    }))
                })
            })
            .collect();

        json!({
            "type": "response.mcp_list_tools.completed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "tools": tool_items
        })
    }

    pub(super) fn emit_mcp_call_in_progress(
        &mut self,
        output_index: usize,
        item_id: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.in_progress",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id
        })
    }

    pub(super) fn emit_mcp_call_arguments_delta(
        &mut self,
        output_index: usize,
        item_id: &str,
        delta: &str,
    ) -> serde_json::Value {
        // Accumulate arguments for this call
        self.mcp_call_accumulated_args
            .entry(item_id.to_string())
            .or_default()
            .push_str(delta);

        json!({
            "type": "response.mcp_call_arguments.delta",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "delta": delta
        })
    }

    pub(super) fn emit_mcp_call_arguments_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        arguments: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call_arguments.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "arguments": arguments
        })
    }

    pub(super) fn emit_mcp_call_completed(
        &mut self,
        output_index: usize,
        item_id: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.completed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id
        })
    }

    pub(super) fn emit_mcp_call_failed(
        &mut self,
        output_index: usize,
        item_id: &str,
        error: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.failed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "error": error
        })
    }

    // ========================================================================
    // Output Item Wrapper Events
    // ========================================================================

    /// Emit response.output_item.added event
    pub(super) fn emit_output_item_added(
        &mut self,
        output_index: usize,
        item: &serde_json::Value,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_item.added",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item": item
        })
    }

    /// Emit response.output_item.done event
    pub(super) fn emit_output_item_done(
        &mut self,
        output_index: usize,
        item: &serde_json::Value,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_item.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item": item
        })
    }

    /// Generate unique ID for item type
    fn generate_item_id(prefix: &str) -> String {
        format!("{}_{}", prefix, Uuid::new_v4().to_string().replace("-", ""))
    }

    /// Allocate next output index and track item
    pub(super) fn allocate_output_index(&mut self, item_type: OutputItemType) -> (usize, String) {
        let index = self.next_output_index;
        self.next_output_index += 1;

        let id_prefix = match &item_type {
            OutputItemType::McpListTools => "mcpl",
            OutputItemType::McpCall => "mcp",
            OutputItemType::Message => "msg",
            OutputItemType::Reasoning => "rs",
        };

        let id = Self::generate_item_id(id_prefix);

        self.output_items.push(OutputItemState {
            output_index: index,
            status: ItemStatus::InProgress,
        });

        (index, id)
    }

    /// Mark output item as completed
    pub(super) fn complete_output_item(&mut self, output_index: usize) {
        if let Some(item) = self
            .output_items
            .iter_mut()
            .find(|i| i.output_index == output_index)
        {
            item.status = ItemStatus::Completed;
        }
    }

    /// Emit reasoning item wrapper events (added + done)
    ///
    /// Reasoning items in OpenAI format are simple placeholders emitted between tool iterations.
    /// They don't have streaming content - just wrapper events with empty/null content.
    pub(super) fn emit_reasoning_item(
        &mut self,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
        reasoning_content: Option<String>,
    ) -> Result<(), String> {
        // Allocate output index and generate ID
        let (output_index, item_id) = self.allocate_output_index(OutputItemType::Reasoning);

        // Build reasoning item structure
        let item = json!({
            "id": item_id,
            "type": "reasoning",
            "summary": [],
            "content": reasoning_content,
            "encrypted_content": null,
            "status": null
        });

        // Emit output_item.added
        let added_event = self.emit_output_item_added(output_index, &item);
        self.send_event(&added_event, tx)?;

        // Immediately emit output_item.done (no streaming for reasoning)
        let done_event = self.emit_output_item_done(output_index, &item);
        self.send_event(&done_event, tx)?;

        // Mark as completed
        self.complete_output_item(output_index);

        Ok(())
    }

    /// Process a chunk and emit appropriate events
    pub(super) fn process_chunk(
        &mut self,
        chunk: &ChatCompletionStreamResponse,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
    ) -> Result<(), String> {
        // Process content if present
        if let Some(choice) = chunk.choices.first() {
            if let Some(content) = &choice.delta.content {
                if !content.is_empty() {
                    // Allocate output_index and item_id for this message item (once per message)
                    if self.current_item_id.is_none() {
                        let (output_index, item_id) =
                            self.allocate_output_index(OutputItemType::Message);

                        // Build message item structure
                        let item = json!({
                            "id": item_id,
                            "type": "message",
                            "role": "assistant",
                            "content": []
                        });

                        // Emit output_item.added
                        let event = self.emit_output_item_added(output_index, &item);
                        self.send_event(&event, tx)?;
                        self.has_emitted_output_item_added = true;

                        // Store for subsequent events
                        self.current_item_id = Some(item_id);
                        self.current_message_output_index = Some(output_index);
                    }

                    let output_index = self.current_message_output_index.unwrap();
                    let item_id = self.current_item_id.clone().unwrap(); // Clone to avoid borrow checker issues
                    let content_index = 0; // Single content part for now

                    // Emit content_part.added before first delta
                    if !self.has_emitted_content_part_added {
                        let event =
                            self.emit_content_part_added(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;
                        self.has_emitted_content_part_added = true;
                    }

                    // Emit text delta
                    let event =
                        self.emit_text_delta(content, output_index, &item_id, content_index);
                    self.send_event(&event, tx)?;
                }
            }

            // Check for finish_reason to emit completion events
            if let Some(reason) = &choice.finish_reason {
                if reason == "stop" || reason == "length" {
                    let output_index = self.current_message_output_index.unwrap();
                    let item_id = self.current_item_id.clone().unwrap(); // Clone to avoid borrow checker issues
                    let content_index = 0;

                    // Emit closing events
                    if self.has_emitted_content_part_added {
                        let event = self.emit_text_done(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;
                        let event =
                            self.emit_content_part_done(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;
                    }

                    if self.has_emitted_output_item_added {
                        // Build complete message item for output_item.done
                        let item = json!({
                            "id": item_id,
                            "type": "message",
                            "role": "assistant",
                            "content": [{
                                "type": "text",
                                "text": self.accumulated_text.clone()
                            }]
                        });
                        let event = self.emit_output_item_done(output_index, &item);
                        self.send_event(&event, tx)?;
                    }

                    // Mark item as completed
                    self.complete_output_item(output_index);
                }
            }
        }

        Ok(())
    }

    pub(super) fn send_event(
        &self,
        event: &serde_json::Value,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
    ) -> Result<(), String> {
        let event_json = serde_json::to_string(event)
            .map_err(|e| format!("Failed to serialize event: {}", e))?;

        if tx
            .send(Ok(Bytes::from(format!("data: {}\n\n", event_json))))
            .is_err()
        {
            return Err("Client disconnected".to_string());
        }

        Ok(())
    }
}
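Note: a self-contained sketch of the wire format `send_event` produces — each JSON event carries a monotonically increasing `sequence_number` and is framed as an SSE `data:` block. The channel setup and event payloads here are stand-ins:

```rust
use bytes::Bytes;
use serde_json::json;
use tokio::sync::mpsc;

// Frame one JSON event as an SSE data block, as send_event does.
fn frame(event: &serde_json::Value) -> Bytes {
    Bytes::from(format!("data: {}\n\n", event))
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<Result<Bytes, std::io::Error>>();

    let mut seq = 0u64;
    let mut next = || {
        let s = seq;
        seq += 1;
        s
    };

    for event in [
        json!({"type": "response.created", "sequence_number": next()}),
        json!({"type": "response.output_text.delta", "sequence_number": next(), "delta": "Hi"}),
        json!({"type": "response.completed", "sequence_number": next()}),
    ] {
        tx.send(Ok(frame(&event))).expect("receiver alive");
    }
    drop(tx); // close the stream

    while let Some(Ok(bytes)) = rx.recv().await {
        print!("{}", String::from_utf8_lossy(&bytes));
    }
}
```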
sgl-router/src/routers/grpc/responses/tool_loop.rs (new file, 1114 lines — diff suppressed because it is too large)
sgl-router/src/routers/grpc/responses/types.rs (new file, 18 lines)
@@ -0,0 +1,18 @@
//! Type definitions for /v1/responses endpoint

use std::sync::Arc;

use tokio::{sync::RwLock, task::JoinHandle};

/// Information stored for background tasks to enable end-to-end cancellation
///
/// This struct enables cancelling both the Rust task AND the Python scheduler processing.
/// The client field is lazily initialized during pipeline execution.
pub struct BackgroundTaskInfo {
    /// Tokio task handle for aborting the Rust task
    pub handle: JoinHandle<()>,
    /// gRPC request_id sent to Python scheduler (chatcmpl-* prefix)
    pub grpc_request_id: String,
    /// gRPC client for sending abort requests to Python (set after client acquisition)
    pub client: Arc<RwLock<Option<crate::grpc_client::SglangSchedulerClient>>>,
}
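Note: a minimal, self-contained sketch of the bookkeeping pattern `BackgroundTaskInfo` supports — register the `JoinHandle` under the response id, then abort it on cancel. The real struct also carries the lazily set scheduler client so the Python side can be aborted too; the names below are stand-ins:

```rust
use std::{collections::HashMap, sync::Arc, time::Duration};

use tokio::{sync::RwLock, task::JoinHandle};

struct TaskInfo {
    handle: JoinHandle<()>,
    // In the router this entry also carries a lazily initialized gRPC client
    // so the backend scheduler can be told to abort `grpc_request_id`.
    grpc_request_id: String,
}

type Tasks = Arc<RwLock<HashMap<String, TaskInfo>>>;

async fn cancel(tasks: &Tasks, response_id: &str) {
    if let Some(info) = tasks.write().await.remove(response_id) {
        // Abort the local tokio task; a real implementation would also send
        // an abort for `info.grpc_request_id` to the backend.
        info.handle.abort();
        println!("aborted {response_id} (rid={})", info.grpc_request_id);
    }
}

#[tokio::main]
async fn main() {
    let tasks: Tasks = Arc::new(RwLock::new(HashMap::new()));

    let handle = tokio::spawn(async {
        tokio::time::sleep(Duration::from_secs(60)).await; // long-running work
    });
    tasks.write().await.insert(
        "resp_123".to_string(),
        TaskInfo {
            handle,
            grpc_request_id: "chatcmpl-abc".to_string(),
        },
    );

    cancel(&tasks, "resp_123").await;
}
```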
@@ -1,6 +1,6 @@
 // gRPC Router Implementation

-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};

 use async_trait::async_trait;
 use axum::{
@@ -9,12 +9,20 @@ use axum::{
     http::{HeaderMap, StatusCode},
     response::{IntoResponse, Response},
 };
+use tokio::sync::RwLock;
 use tracing::debug;

-use super::{context::SharedComponents, pipeline::RequestPipeline};
+use super::{
+    context::SharedComponents,
+    pipeline::RequestPipeline,
+    responses::{self, BackgroundTaskInfo},
+};
 use crate::{
     config::types::RetryConfig,
     core::WorkerRegistry,
+    data_connector::{
+        SharedConversationItemStorage, SharedConversationStorage, SharedResponseStorage,
+    },
     policies::PolicyRegistry,
     protocols::{
         chat::ChatCompletionRequest,
@@ -48,6 +56,14 @@ pub struct GrpcRouter {
     configured_tool_parser: Option<String>,
     pipeline: RequestPipeline,
     shared_components: Arc<SharedComponents>,
+    // Storage backends for /v1/responses support
+    response_storage: SharedResponseStorage,
+    conversation_storage: SharedConversationStorage,
+    conversation_item_storage: SharedConversationItemStorage,
+    // Optional MCP manager for tool execution (enabled via SGLANG_MCP_CONFIG env var)
+    mcp_manager: Option<Arc<crate::mcp::McpClientManager>>,
+    // Background task handles for cancellation support (includes gRPC client for Python abort)
+    background_tasks: Arc<RwLock<HashMap<String, BackgroundTaskInfo>>>,
 }

 impl GrpcRouter {
@@ -73,6 +89,31 @@ impl GrpcRouter {
         let worker_registry = ctx.worker_registry.clone();
         let policy_registry = ctx.policy_registry.clone();

+        // Extract storage backends from context
+        let response_storage = ctx.response_storage.clone();
+        let conversation_storage = ctx.conversation_storage.clone();
+        let conversation_item_storage = ctx.conversation_item_storage.clone();
+
+        // Optional MCP manager activation via env var path (config-driven gate)
+        let mcp_manager = match std::env::var("SGLANG_MCP_CONFIG").ok() {
+            Some(path) if !path.trim().is_empty() => {
+                match crate::mcp::McpConfig::from_file(&path).await {
+                    Ok(cfg) => match crate::mcp::McpClientManager::new(cfg).await {
+                        Ok(mgr) => Some(Arc::new(mgr)),
+                        Err(err) => {
+                            tracing::warn!("Failed to initialize MCP manager: {}", err);
+                            None
+                        }
+                    },
+                    Err(err) => {
+                        tracing::warn!("Failed to load MCP config from '{}': {}", path, err);
+                        None
+                    }
+                }
+            }
+            _ => None,
+        };
+
         // Create shared components for pipeline
         let shared_components = Arc::new(SharedComponents {
             tokenizer: tokenizer.clone(),
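Note: the env-var gate above degrades to `None` on any failure instead of failing router startup. A dependency-free sketch of the same shape (the loader is a stand-in, not the real `McpConfig` API):

```rust
use std::sync::Arc;

struct McpManager; // stand-in

fn load_manager(path: &str) -> Result<McpManager, String> {
    if path.ends_with(".json") {
        Ok(McpManager)
    } else {
        Err("unsupported config".into())
    }
}

fn main() {
    let mcp_manager: Option<Arc<McpManager>> = match std::env::var("SGLANG_MCP_CONFIG").ok() {
        Some(path) if !path.trim().is_empty() => match load_manager(&path) {
            Ok(mgr) => Some(Arc::new(mgr)),
            Err(err) => {
                eprintln!("Failed to initialize MCP manager: {err}");
                None // feature disabled, startup continues
            }
        },
        _ => None, // env var unset or empty: feature off
    };
    println!("MCP enabled: {}", mcp_manager.is_some());
}
```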
@@ -104,6 +145,11 @@ impl GrpcRouter {
             configured_tool_parser: ctx.configured_tool_parser.clone(),
             pipeline,
             shared_components,
+            response_storage,
+            conversation_storage,
+            conversation_item_storage,
+            mcp_manager,
+            background_tasks: Arc::new(RwLock::new(HashMap::new())),
         })
     }
@@ -217,24 +263,45 @@ impl RouterTrait for GrpcRouter {

     async fn route_responses(
         &self,
-        _headers: Option<&HeaderMap>,
-        _body: &ResponsesRequest,
-        _model_id: Option<&str>,
+        headers: Option<&HeaderMap>,
+        body: &ResponsesRequest,
+        model_id: Option<&str>,
     ) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+        // Use responses module for ALL requests (streaming and non-streaming)
+        // Responses module handles:
+        // - Request validation (previous_response_id XOR conversation)
+        // - Loading response chain / conversation history from storage
+        // - Conversion: ResponsesRequest → ChatCompletionRequest
+        // - Execution through chat pipeline stages
+        // - Conversion: ChatCompletionResponse → ResponsesResponse
+        // - Response persistence
+        // - MCP tool loop wrapper (future)
+        responses::route_responses(
+            &self.pipeline,
+            Arc::new(body.clone()),
+            headers.cloned(),
+            model_id.map(|s| s.to_string()),
+            self.shared_components.clone(),
+            self.response_storage.clone(),
+            self.conversation_storage.clone(),
+            self.conversation_item_storage.clone(),
+            self.background_tasks.clone(),
+        )
+        .await
     }

     async fn get_response(
         &self,
         _headers: Option<&HeaderMap>,
-        _response_id: &str,
+        response_id: &str,
         _params: &ResponsesGetParams,
     ) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+        responses::get_response_impl(&self.response_storage, response_id).await
     }

-    async fn cancel_response(&self, _headers: Option<&HeaderMap>, _response_id: &str) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+    async fn cancel_response(&self, _headers: Option<&HeaderMap>, response_id: &str) -> Response {
+        responses::cancel_response_impl(&self.response_storage, &self.background_tasks, response_id)
+            .await
     }

     async fn route_classify(
@@ -62,7 +62,10 @@ pub(super) async fn create_conversation(
         None => None,
     };

-    let new_conv = NewConversation { metadata };
+    let new_conv = NewConversation {
+        id: None, // Generate random ID (OpenAI behavior for POST /v1/conversations)
+        metadata,
+    };

     match conversation_storage.create_conversation(new_conv).await {
         Ok(conversation) => {
@@ -952,7 +955,7 @@ fn item_to_json(item: &crate::data_connector::conversation_items::ConversationIt
 // ============================================================================

 /// Persist conversation items (delegates to persist_items_with_storages)
-pub(super) async fn persist_conversation_items(
+pub async fn persist_conversation_items(
     conversation_storage: Arc<dyn ConversationStorage>,
     item_storage: Arc<dyn ConversationItemStorage>,
     response_storage: Arc<dyn ResponseStorage>,
@@ -129,7 +129,7 @@ impl FunctionCallInProgress {
 // ============================================================================

 /// Build a request-scoped MCP manager from request tools, if present.
-pub(super) async fn mcp_manager_from_request_tools(
+pub async fn mcp_manager_from_request_tools(
     tools: &[ResponseTool],
 ) -> Option<Arc<McpClientManager>> {
     let tool = tools
@@ -7,8 +7,8 @@
 //! - Multi-turn tool execution loops
 //! - SSE (Server-Sent Events) streaming

-mod conversations;
-mod mcp;
+pub mod conversations;
+pub mod mcp;
 mod responses;
 mod router;
 mod streaming;