[router][grpc] Fix streaming bugs: empty tool names, state pollution, and panics (#11373)

2025-10-09 03:53:23 -07:00
parent a4b424c632
commit ab926dd697
33 changed files with 1145 additions and 534 deletions
--- a/sgl-router/src/routers/grpc/context.rs
+++ b/sgl-router/src/routers/grpc/context.rs
@@ -15,10 +15,10 @@ use crate::grpc_client::{proto, SglangSchedulerClient};
 use crate::protocols::spec::{
    ChatCompletionRequest, ChatCompletionResponse, GenerateRequest, GenerateResponse,
 };
-use crate::reasoning_parser::ReasoningParserFactory;
+use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
 use crate::tokenizer::stop::StopSequenceDecoder;
 use crate::tokenizer::traits::Tokenizer;
-use crate::tool_parser::ToolParserFactory;
+use crate::tool_parser::ParserFactory as ToolParserFactory;

 // ============================================================================
 // Core Context Types
--- a/sgl-router/src/routers/grpc/pd_router.rs
+++ b/sgl-router/src/routers/grpc/pd_router.rs
@@ -7,11 +7,11 @@ use crate::protocols::spec::{
    ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
    ResponsesGetParams, ResponsesRequest,
 };
-use crate::reasoning_parser::ReasoningParserFactory;
+use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
 use crate::routers::RouterTrait;
 use crate::server::AppContext;
 use crate::tokenizer::traits::Tokenizer;
-use crate::tool_parser::ToolParserFactory;
+use crate::tool_parser::ParserFactory as ToolParserFactory;
 use async_trait::async_trait;
 use axum::{
    body::Body,
--- a/sgl-router/src/routers/grpc/processing.rs
+++ b/sgl-router/src/routers/grpc/processing.rs
@@ -13,10 +13,10 @@ use crate::protocols::spec::{
    ChatChoice, ChatCompletionMessage, ChatCompletionRequest, FunctionCallResponse, ToolCall,
    ToolChoice, ToolChoiceValue,
 };
-use crate::reasoning_parser::ReasoningParserFactory;
+use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
 use crate::tokenizer::stop::{SequenceDecoderOutput, StopSequenceDecoder};
 use crate::tokenizer::traits::Tokenizer;
-use crate::tool_parser::ToolParserFactory;
+use crate::tool_parser::ParserFactory as ToolParserFactory;

 use super::utils;

--- a/sgl-router/src/routers/grpc/router.rs
+++ b/sgl-router/src/routers/grpc/router.rs
@@ -18,11 +18,11 @@ use crate::protocols::spec::{
    ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
    ResponsesGetParams, ResponsesRequest,
 };
-use crate::reasoning_parser::ReasoningParserFactory;
+use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
 use crate::routers::RouterTrait;
 use crate::server::AppContext;
 use crate::tokenizer::traits::Tokenizer;
-use crate::tool_parser::ToolParserFactory;
+use crate::tool_parser::ParserFactory as ToolParserFactory;

 /// gRPC router implementation for SGLang
 #[derive(Clone)]
--- a/sgl-router/src/routers/grpc/streaming.rs
+++ b/sgl-router/src/routers/grpc/streaming.rs
@@ -34,8 +34,8 @@ use tokio::sync::mpsc;
 #[derive(Clone)]
 pub struct StreamingProcessor {
    tokenizer: Arc<dyn Tokenizer>,
-    tool_parser_factory: crate::tool_parser::ToolParserFactory,
-    reasoning_parser_factory: crate::reasoning_parser::ReasoningParserFactory,
+    tool_parser_factory: crate::tool_parser::ParserFactory,
+    reasoning_parser_factory: crate::reasoning_parser::ParserFactory,
    configured_tool_parser: Option<String>,
    configured_reasoning_parser: Option<String>,
 }
@@ -43,8 +43,8 @@ pub struct StreamingProcessor {
 impl StreamingProcessor {
    pub fn new(
        tokenizer: Arc<dyn Tokenizer>,
-        tool_parser_factory: crate::tool_parser::ToolParserFactory,
-        reasoning_parser_factory: crate::reasoning_parser::ReasoningParserFactory,
+        tool_parser_factory: crate::tool_parser::ParserFactory,
+        reasoning_parser_factory: crate::reasoning_parser::ParserFactory,
        configured_tool_parser: Option<String>,
        configured_reasoning_parser: Option<String>,
    ) -> Self {
@@ -195,6 +195,47 @@ impl StreamingProcessor {
        let created = dispatch.created;
        let system_fingerprint = dispatch.weight_version.as_deref();

+        // Check parser availability once upfront (log warning only once per request)
+        let reasoning_parser_available = if separate_reasoning {
+            if let Some(parser_name) = self.configured_reasoning_parser.as_ref() {
+                self.reasoning_parser_factory
+                    .registry()
+                    .has_parser(parser_name)
+            } else {
+                self.reasoning_parser_factory
+                    .registry()
+                    .has_parser_for_model(model)
+            }
+        } else {
+            false
+        };
+
+        let tool_parser_available = if tools.is_some() {
+            if let Some(parser_name) = self.configured_tool_parser.as_ref() {
+                self.tool_parser_factory.registry().has_parser(parser_name)
+            } else {
+                self.tool_parser_factory
+                    .registry()
+                    .has_parser_for_model(model)
+            }
+        } else {
+            false
+        };
+
+        if separate_reasoning && !reasoning_parser_available {
+            warn!(
+                "No reasoning parser found for model '{}', skipping reasoning parsing",
+                model
+            );
+        }
+
+        if tools.is_some() && !tool_parser_available {
+            warn!(
+                "No tool parser found for model '{}', skipping tool call parsing",
+                model
+            );
+        }
+
        // Phase 2: Main streaming loop
        while let Some(response) = grpc_stream.next().await {
            let gen_response = response.map_err(|e| format!("Stream error: {}", e))?;
@@ -276,7 +317,7 @@ impl StreamingProcessor {
                    stream_buffer.push_str(&delta);

                    // Reasoning content handling
-                    let in_reasoning = if separate_reasoning {
+                    let in_reasoning = if separate_reasoning && reasoning_parser_available {
                        let (normal_text, reasoning_chunk, in_reasoning) = self
                            .process_reasoning_stream(
                                &delta,
@@ -303,8 +344,12 @@ impl StreamingProcessor {
                    let tool_choice_enabled =
                        !matches!(tool_choice, Some(ToolChoice::Value(ToolChoiceValue::None)));

-                    if !in_reasoning && tool_choice_enabled && tools.is_some() {
-                        let (should_skip, tool_chunks) = self
+                    if !in_reasoning
+                        && tool_choice_enabled
+                        && tools.is_some()
+                        && tool_parser_available
+                    {
+                        let tool_chunks = self
                            .process_tool_calls_stream(
                                &delta,
                                index,
@@ -325,10 +370,9 @@ impl StreamingProcessor {
                                .map_err(|_| "Failed to send tool call chunk".to_string())?;
                        }

-                        // Continue to process the next chunk as we have tool chunks
-                        if should_skip {
-                            continue;
-                        }
+                        // Always skip regular content when tool parsing is active
+                        // Parser either emitted chunks or buffered content
+                        continue;
                    }

                    // Regular content emission
@@ -963,13 +1007,15 @@ impl StreamingProcessor {
        created: u64,
        system_fingerprint: Option<&str>,
    ) -> (String, Option<ChatCompletionStreamResponse>, bool) {
-        // Get or create parser for this index
+        // Create fresh parser for this index (not pooled, to avoid state pollution)
        reasoning_parsers.entry(index).or_insert_with(|| {
-            utils::get_reasoning_parser(
+            let parser = utils::create_reasoning_parser(
                &self.reasoning_parser_factory,
                self.configured_reasoning_parser.as_ref(),
                model,
            )
+            .expect("Parser should be available - checked upfront");
+            Arc::new(tokio::sync::Mutex::new(parser))
        });

        if let Some(pooled_parser) = reasoning_parsers.get(&index) {
@@ -1034,20 +1080,23 @@ impl StreamingProcessor {
        created: u64,
        system_fingerprint: Option<&str>,
        history_tool_calls_count: usize,
-    ) -> (bool, Vec<ChatCompletionStreamResponse>) {
+    ) -> Vec<ChatCompletionStreamResponse> {
        let mut chunks = Vec::new();

-        // Get or create parser for this index
+        // Create fresh parser for this index (not pooled, to avoid state pollution)
        tool_parsers.entry(index).or_insert_with(|| {
-            utils::get_tool_parser(
+            let parser = utils::create_tool_parser(
                &self.tool_parser_factory,
                self.configured_tool_parser.as_ref(),
                model,
            )
+            .expect("Parser should be available - checked upfront");
+            Arc::new(tokio::sync::Mutex::new(parser))
        });

        if let Some(pooled_parser) = tool_parsers.get(&index) {
            let mut parser = pooled_parser.lock().await;
+
            match parser.parse_incremental(delta, tools).await {
                Ok(crate::tool_parser::StreamingParseResult { normal_text, calls }) => {
                    // Emit normal text if present
@@ -1129,8 +1178,7 @@ impl StreamingProcessor {
                        });
                    }

-                    // If we emitted chunks, skip regular content
-                    return (!chunks.is_empty(), chunks);
+                    return chunks;
                }
                Err(e) => {
                    error!("Tool call parsing error: {}", e);
@@ -1138,7 +1186,7 @@ impl StreamingProcessor {
            }
        }

-        (false, chunks)
+        chunks
    }

    /// Format a response as SSE chunk into a reusable buffer
--- a/sgl-router/src/routers/grpc/utils.rs
+++ b/sgl-router/src/routers/grpc/utils.rs
@@ -677,13 +677,12 @@ pub fn generate_tool_call_id(
 ///
 /// If a parser name is explicitly configured, use that parser.
 /// Otherwise, auto-detect based on the model name.
+/// Get a pooled reasoning parser (for non-streaming where state doesn't matter)
 pub fn get_reasoning_parser(
-    reasoning_parser_factory: &crate::reasoning_parser::ReasoningParserFactory,
+    reasoning_parser_factory: &crate::reasoning_parser::ParserFactory,
    configured_parser: Option<&String>,
    model: &str,
 ) -> crate::reasoning_parser::PooledParser {
-    use tracing::warn;
-
    if let Some(parser_name) = configured_parser {
        // Use configured parser if specified
        reasoning_parser_factory
@@ -702,17 +701,40 @@ pub fn get_reasoning_parser(
    }
 }

+/// Create a fresh reasoning parser instance (for streaming where state isolation is needed)
+pub fn create_reasoning_parser(
+    reasoning_parser_factory: &crate::reasoning_parser::ParserFactory,
+    configured_parser: Option<&String>,
+    model: &str,
+) -> Option<Box<dyn crate::reasoning_parser::ReasoningParser>> {
+    if let Some(parser_name) = configured_parser {
+        // Use configured parser if specified
+        reasoning_parser_factory
+            .registry()
+            .create_parser(parser_name)
+            .or_else(|| {
+                warn!(
+                    "Configured reasoning parser '{}' not found, falling back to model-based selection",
+                    parser_name
+                );
+                reasoning_parser_factory.registry().create_for_model(model)
+            })
+    } else {
+        // Auto-detect based on model
+        reasoning_parser_factory.registry().create_for_model(model)
+    }
+}
+
 /// Get the appropriate tool parser for a model
 ///
 /// If a parser name is explicitly configured, use that parser.
 /// Otherwise, auto-detect based on the model name.
+/// Get a pooled tool parser (for non-streaming where state doesn't matter)
 pub fn get_tool_parser(
-    tool_parser_factory: &crate::tool_parser::ToolParserFactory,
+    tool_parser_factory: &crate::tool_parser::ParserFactory,
    configured_parser: Option<&String>,
    model: &str,
-) -> crate::tool_parser::PooledToolParser {
-    use tracing::warn;
-
+) -> crate::tool_parser::PooledParser {
    if let Some(parser_name) = configured_parser {
        // Use configured parser if specified
        tool_parser_factory
@@ -731,6 +753,30 @@ pub fn get_tool_parser(
    }
 }

+/// Create a fresh tool parser instance (for streaming where state isolation is needed)
+pub fn create_tool_parser(
+    tool_parser_factory: &crate::tool_parser::ParserFactory,
+    configured_parser: Option<&String>,
+    model: &str,
+) -> Option<Box<dyn crate::tool_parser::ToolParser>> {
+    if let Some(parser_name) = configured_parser {
+        // Use configured parser if specified
+        tool_parser_factory
+            .registry()
+            .create_parser(parser_name)
+            .or_else(|| {
+                warn!(
+                    "Configured tool parser '{}' not found, falling back to model-based selection",
+                    parser_name
+                );
+                tool_parser_factory.registry().create_for_model(model)
+            })
+    } else {
+        // Auto-detect based on model
+        tool_parser_factory.registry().create_for_model(model)
+    }
+}
+
 /// Convert proto::OutputLogProbs to OpenAI ChatLogProbs format
 ///
 /// This function decodes token IDs using the tokenizer and builds the logprobs structure