[router][grpc] Fix streaming bugs: empty tool names, state pollution, and panics (#11373)
This commit is contained in:
@@ -15,10 +15,10 @@ use crate::grpc_client::{proto, SglangSchedulerClient};
|
||||
use crate::protocols::spec::{
|
||||
ChatCompletionRequest, ChatCompletionResponse, GenerateRequest, GenerateResponse,
|
||||
};
|
||||
use crate::reasoning_parser::ReasoningParserFactory;
|
||||
use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
|
||||
use crate::tokenizer::stop::StopSequenceDecoder;
|
||||
use crate::tokenizer::traits::Tokenizer;
|
||||
use crate::tool_parser::ToolParserFactory;
|
||||
use crate::tool_parser::ParserFactory as ToolParserFactory;
|
||||
|
||||
// ============================================================================
|
||||
// Core Context Types
|
||||
|
||||
@@ -7,11 +7,11 @@ use crate::protocols::spec::{
|
||||
ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
|
||||
ResponsesGetParams, ResponsesRequest,
|
||||
};
|
||||
use crate::reasoning_parser::ReasoningParserFactory;
|
||||
use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
|
||||
use crate::routers::RouterTrait;
|
||||
use crate::server::AppContext;
|
||||
use crate::tokenizer::traits::Tokenizer;
|
||||
use crate::tool_parser::ToolParserFactory;
|
||||
use crate::tool_parser::ParserFactory as ToolParserFactory;
|
||||
use async_trait::async_trait;
|
||||
use axum::{
|
||||
body::Body,
|
||||
|
||||
@@ -13,10 +13,10 @@ use crate::protocols::spec::{
|
||||
ChatChoice, ChatCompletionMessage, ChatCompletionRequest, FunctionCallResponse, ToolCall,
|
||||
ToolChoice, ToolChoiceValue,
|
||||
};
|
||||
use crate::reasoning_parser::ReasoningParserFactory;
|
||||
use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
|
||||
use crate::tokenizer::stop::{SequenceDecoderOutput, StopSequenceDecoder};
|
||||
use crate::tokenizer::traits::Tokenizer;
|
||||
use crate::tool_parser::ToolParserFactory;
|
||||
use crate::tool_parser::ParserFactory as ToolParserFactory;
|
||||
|
||||
use super::utils;
|
||||
|
||||
|
||||
@@ -18,11 +18,11 @@ use crate::protocols::spec::{
|
||||
ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
|
||||
ResponsesGetParams, ResponsesRequest,
|
||||
};
|
||||
use crate::reasoning_parser::ReasoningParserFactory;
|
||||
use crate::reasoning_parser::ParserFactory as ReasoningParserFactory;
|
||||
use crate::routers::RouterTrait;
|
||||
use crate::server::AppContext;
|
||||
use crate::tokenizer::traits::Tokenizer;
|
||||
use crate::tool_parser::ToolParserFactory;
|
||||
use crate::tool_parser::ParserFactory as ToolParserFactory;
|
||||
|
||||
/// gRPC router implementation for SGLang
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -34,8 +34,8 @@ use tokio::sync::mpsc;
|
||||
#[derive(Clone)]
|
||||
pub struct StreamingProcessor {
|
||||
tokenizer: Arc<dyn Tokenizer>,
|
||||
tool_parser_factory: crate::tool_parser::ToolParserFactory,
|
||||
reasoning_parser_factory: crate::reasoning_parser::ReasoningParserFactory,
|
||||
tool_parser_factory: crate::tool_parser::ParserFactory,
|
||||
reasoning_parser_factory: crate::reasoning_parser::ParserFactory,
|
||||
configured_tool_parser: Option<String>,
|
||||
configured_reasoning_parser: Option<String>,
|
||||
}
|
||||
@@ -43,8 +43,8 @@ pub struct StreamingProcessor {
|
||||
impl StreamingProcessor {
|
||||
pub fn new(
|
||||
tokenizer: Arc<dyn Tokenizer>,
|
||||
tool_parser_factory: crate::tool_parser::ToolParserFactory,
|
||||
reasoning_parser_factory: crate::reasoning_parser::ReasoningParserFactory,
|
||||
tool_parser_factory: crate::tool_parser::ParserFactory,
|
||||
reasoning_parser_factory: crate::reasoning_parser::ParserFactory,
|
||||
configured_tool_parser: Option<String>,
|
||||
configured_reasoning_parser: Option<String>,
|
||||
) -> Self {
|
||||
@@ -195,6 +195,47 @@ impl StreamingProcessor {
|
||||
let created = dispatch.created;
|
||||
let system_fingerprint = dispatch.weight_version.as_deref();
|
||||
|
||||
// Check parser availability once upfront (log warning only once per request)
|
||||
let reasoning_parser_available = if separate_reasoning {
|
||||
if let Some(parser_name) = self.configured_reasoning_parser.as_ref() {
|
||||
self.reasoning_parser_factory
|
||||
.registry()
|
||||
.has_parser(parser_name)
|
||||
} else {
|
||||
self.reasoning_parser_factory
|
||||
.registry()
|
||||
.has_parser_for_model(model)
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let tool_parser_available = if tools.is_some() {
|
||||
if let Some(parser_name) = self.configured_tool_parser.as_ref() {
|
||||
self.tool_parser_factory.registry().has_parser(parser_name)
|
||||
} else {
|
||||
self.tool_parser_factory
|
||||
.registry()
|
||||
.has_parser_for_model(model)
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if separate_reasoning && !reasoning_parser_available {
|
||||
warn!(
|
||||
"No reasoning parser found for model '{}', skipping reasoning parsing",
|
||||
model
|
||||
);
|
||||
}
|
||||
|
||||
if tools.is_some() && !tool_parser_available {
|
||||
warn!(
|
||||
"No tool parser found for model '{}', skipping tool call parsing",
|
||||
model
|
||||
);
|
||||
}
|
||||
|
||||
// Phase 2: Main streaming loop
|
||||
while let Some(response) = grpc_stream.next().await {
|
||||
let gen_response = response.map_err(|e| format!("Stream error: {}", e))?;
|
||||
@@ -276,7 +317,7 @@ impl StreamingProcessor {
|
||||
stream_buffer.push_str(&delta);
|
||||
|
||||
// Reasoning content handling
|
||||
let in_reasoning = if separate_reasoning {
|
||||
let in_reasoning = if separate_reasoning && reasoning_parser_available {
|
||||
let (normal_text, reasoning_chunk, in_reasoning) = self
|
||||
.process_reasoning_stream(
|
||||
&delta,
|
||||
@@ -303,8 +344,12 @@ impl StreamingProcessor {
|
||||
let tool_choice_enabled =
|
||||
!matches!(tool_choice, Some(ToolChoice::Value(ToolChoiceValue::None)));
|
||||
|
||||
if !in_reasoning && tool_choice_enabled && tools.is_some() {
|
||||
let (should_skip, tool_chunks) = self
|
||||
if !in_reasoning
|
||||
&& tool_choice_enabled
|
||||
&& tools.is_some()
|
||||
&& tool_parser_available
|
||||
{
|
||||
let tool_chunks = self
|
||||
.process_tool_calls_stream(
|
||||
&delta,
|
||||
index,
|
||||
@@ -325,10 +370,9 @@ impl StreamingProcessor {
|
||||
.map_err(|_| "Failed to send tool call chunk".to_string())?;
|
||||
}
|
||||
|
||||
// Continue to process the next chunk as we have tool chunks
|
||||
if should_skip {
|
||||
continue;
|
||||
}
|
||||
// Always skip regular content when tool parsing is active
|
||||
// Parser either emitted chunks or buffered content
|
||||
continue;
|
||||
}
|
||||
|
||||
// Regular content emission
|
||||
@@ -963,13 +1007,15 @@ impl StreamingProcessor {
|
||||
created: u64,
|
||||
system_fingerprint: Option<&str>,
|
||||
) -> (String, Option<ChatCompletionStreamResponse>, bool) {
|
||||
// Get or create parser for this index
|
||||
// Create fresh parser for this index (not pooled, to avoid state pollution)
|
||||
reasoning_parsers.entry(index).or_insert_with(|| {
|
||||
utils::get_reasoning_parser(
|
||||
let parser = utils::create_reasoning_parser(
|
||||
&self.reasoning_parser_factory,
|
||||
self.configured_reasoning_parser.as_ref(),
|
||||
model,
|
||||
)
|
||||
.expect("Parser should be available - checked upfront");
|
||||
Arc::new(tokio::sync::Mutex::new(parser))
|
||||
});
|
||||
|
||||
if let Some(pooled_parser) = reasoning_parsers.get(&index) {
|
||||
@@ -1034,20 +1080,23 @@ impl StreamingProcessor {
|
||||
created: u64,
|
||||
system_fingerprint: Option<&str>,
|
||||
history_tool_calls_count: usize,
|
||||
) -> (bool, Vec<ChatCompletionStreamResponse>) {
|
||||
) -> Vec<ChatCompletionStreamResponse> {
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
// Get or create parser for this index
|
||||
// Create fresh parser for this index (not pooled, to avoid state pollution)
|
||||
tool_parsers.entry(index).or_insert_with(|| {
|
||||
utils::get_tool_parser(
|
||||
let parser = utils::create_tool_parser(
|
||||
&self.tool_parser_factory,
|
||||
self.configured_tool_parser.as_ref(),
|
||||
model,
|
||||
)
|
||||
.expect("Parser should be available - checked upfront");
|
||||
Arc::new(tokio::sync::Mutex::new(parser))
|
||||
});
|
||||
|
||||
if let Some(pooled_parser) = tool_parsers.get(&index) {
|
||||
let mut parser = pooled_parser.lock().await;
|
||||
|
||||
match parser.parse_incremental(delta, tools).await {
|
||||
Ok(crate::tool_parser::StreamingParseResult { normal_text, calls }) => {
|
||||
// Emit normal text if present
|
||||
@@ -1129,8 +1178,7 @@ impl StreamingProcessor {
|
||||
});
|
||||
}
|
||||
|
||||
// If we emitted chunks, skip regular content
|
||||
return (!chunks.is_empty(), chunks);
|
||||
return chunks;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Tool call parsing error: {}", e);
|
||||
@@ -1138,7 +1186,7 @@ impl StreamingProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
(false, chunks)
|
||||
chunks
|
||||
}
|
||||
|
||||
/// Format a response as SSE chunk into a reusable buffer
|
||||
|
||||
@@ -677,13 +677,12 @@ pub fn generate_tool_call_id(
|
||||
///
|
||||
/// If a parser name is explicitly configured, use that parser.
|
||||
/// Otherwise, auto-detect based on the model name.
|
||||
/// Get a pooled reasoning parser (for non-streaming where state doesn't matter)
|
||||
pub fn get_reasoning_parser(
|
||||
reasoning_parser_factory: &crate::reasoning_parser::ReasoningParserFactory,
|
||||
reasoning_parser_factory: &crate::reasoning_parser::ParserFactory,
|
||||
configured_parser: Option<&String>,
|
||||
model: &str,
|
||||
) -> crate::reasoning_parser::PooledParser {
|
||||
use tracing::warn;
|
||||
|
||||
if let Some(parser_name) = configured_parser {
|
||||
// Use configured parser if specified
|
||||
reasoning_parser_factory
|
||||
@@ -702,17 +701,40 @@ pub fn get_reasoning_parser(
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a fresh reasoning parser instance (for streaming where state isolation is needed)
|
||||
pub fn create_reasoning_parser(
|
||||
reasoning_parser_factory: &crate::reasoning_parser::ParserFactory,
|
||||
configured_parser: Option<&String>,
|
||||
model: &str,
|
||||
) -> Option<Box<dyn crate::reasoning_parser::ReasoningParser>> {
|
||||
if let Some(parser_name) = configured_parser {
|
||||
// Use configured parser if specified
|
||||
reasoning_parser_factory
|
||||
.registry()
|
||||
.create_parser(parser_name)
|
||||
.or_else(|| {
|
||||
warn!(
|
||||
"Configured reasoning parser '{}' not found, falling back to model-based selection",
|
||||
parser_name
|
||||
);
|
||||
reasoning_parser_factory.registry().create_for_model(model)
|
||||
})
|
||||
} else {
|
||||
// Auto-detect based on model
|
||||
reasoning_parser_factory.registry().create_for_model(model)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the appropriate tool parser for a model
|
||||
///
|
||||
/// If a parser name is explicitly configured, use that parser.
|
||||
/// Otherwise, auto-detect based on the model name.
|
||||
/// Get a pooled tool parser (for non-streaming where state doesn't matter)
|
||||
pub fn get_tool_parser(
|
||||
tool_parser_factory: &crate::tool_parser::ToolParserFactory,
|
||||
tool_parser_factory: &crate::tool_parser::ParserFactory,
|
||||
configured_parser: Option<&String>,
|
||||
model: &str,
|
||||
) -> crate::tool_parser::PooledToolParser {
|
||||
use tracing::warn;
|
||||
|
||||
) -> crate::tool_parser::PooledParser {
|
||||
if let Some(parser_name) = configured_parser {
|
||||
// Use configured parser if specified
|
||||
tool_parser_factory
|
||||
@@ -731,6 +753,30 @@ pub fn get_tool_parser(
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a fresh tool parser instance (for streaming where state isolation is needed)
|
||||
pub fn create_tool_parser(
|
||||
tool_parser_factory: &crate::tool_parser::ParserFactory,
|
||||
configured_parser: Option<&String>,
|
||||
model: &str,
|
||||
) -> Option<Box<dyn crate::tool_parser::ToolParser>> {
|
||||
if let Some(parser_name) = configured_parser {
|
||||
// Use configured parser if specified
|
||||
tool_parser_factory
|
||||
.registry()
|
||||
.create_parser(parser_name)
|
||||
.or_else(|| {
|
||||
warn!(
|
||||
"Configured tool parser '{}' not found, falling back to model-based selection",
|
||||
parser_name
|
||||
);
|
||||
tool_parser_factory.registry().create_for_model(model)
|
||||
})
|
||||
} else {
|
||||
// Auto-detect based on model
|
||||
tool_parser_factory.registry().create_for_model(model)
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert proto::OutputLogProbs to OpenAI ChatLogProbs format
|
||||
///
|
||||
/// This function decodes token IDs using the tokenizer and builds the logprobs structure
|
||||
|
||||
Reference in New Issue
Block a user