From f556ac8bd8f6cfad85ce4da6d6b10c775cb43278 Mon Sep 17 00:00:00 2001 From: Simo Lin Date: Fri, 22 Aug 2025 12:13:04 -0700 Subject: [PATCH] [router] add json tool parser (#9516) --- sgl-router/src/tool_parser/json_parser.rs | 390 +++++++++++++ sgl-router/src/tool_parser/mod.rs | 3 +- sgl-router/src/tool_parser/registry.rs | 28 +- sgl-router/src/tool_parser/tests.rs | 637 ++++++++++++++++++++++ 4 files changed, 1049 insertions(+), 9 deletions(-) create mode 100644 sgl-router/src/tool_parser/json_parser.rs diff --git a/sgl-router/src/tool_parser/json_parser.rs b/sgl-router/src/tool_parser/json_parser.rs new file mode 100644 index 000000000..4dd7efc64 --- /dev/null +++ b/sgl-router/src/tool_parser/json_parser.rs @@ -0,0 +1,390 @@ +use async_trait::async_trait; +use regex::Regex; +use serde_json::Value; + +use crate::tool_parser::{ + errors::{ToolParserError, ToolParserResult}, + partial_json::PartialJson, + state::ParseState, + traits::ToolParser, + types::{FunctionCall, StreamResult, ToolCall}, +}; + +/// JSON format parser for tool calls +/// +/// Handles various JSON formats for function calling: +/// - Single tool call: {"name": "fn", "arguments": {...}} +/// - Multiple tool calls: [{"name": "fn1", "arguments": {...}}, ...] +/// - With parameters instead of arguments: {"name": "fn", "parameters": {...}} +/// +/// Supports configurable token markers for different models +pub struct JsonParser { + /// Token(s) that mark the start of tool calls + start_tokens: Vec, + /// Token(s) that mark the end of tool calls + end_tokens: Vec, + /// Separator between multiple tool calls (reserved for future use) + _separator: String, + /// Parser for handling incomplete JSON during streaming + partial_json: PartialJson, + /// Regex patterns for extracting content between tokens + extractors: Vec, +} + +impl JsonParser { + /// Create a new JSON parser with default configuration + pub fn new() -> Self { + Self::with_config( + vec![], // No wrapper tokens by default + vec![], + ", ".to_string(), + ) + } + + /// Create a parser with custom token configuration + pub fn with_config( + start_tokens: Vec, + end_tokens: Vec, + separator: String, + ) -> Self { + // Build extraction patterns for each token pair + let extractors = start_tokens + .iter() + .zip(end_tokens.iter()) + .filter_map(|(start, end)| { + if !start.is_empty() && !end.is_empty() { + // Use (?s) flag to enable DOTALL mode so . matches newlines + let pattern = + format!(r"(?s){}(.*?){}", regex::escape(start), regex::escape(end)); + Regex::new(&pattern).ok() + } else { + None + } + }) + .collect(); + + Self { + start_tokens, + end_tokens, + _separator: separator, + partial_json: PartialJson::default(), + extractors, + } + } + + /// Extract JSON content from text, handling wrapper tokens if configured + fn extract_json_content<'a>(&self, text: &'a str) -> &'a str { + let mut content = text.trim(); + + // Try each extractor pattern + for extractor in &self.extractors { + if let Some(captures) = extractor.captures(content) { + if let Some(matched) = captures.get(1) { + content = matched.as_str().trim(); + break; + } + } + } + + // Handle special case where there's a start token but no end token + for (start, end) in self.start_tokens.iter().zip(self.end_tokens.iter()) { + if !start.is_empty() && end.is_empty() { + content = content.strip_prefix(start).unwrap_or(content); + } + } + + content + } + + /// Parse a single JSON object into a ToolCall + fn parse_single_object(&self, obj: &Value) -> ToolParserResult> { + // Check if this looks like a tool call + let name = obj + .get("name") + .or_else(|| obj.get("function")) + .and_then(|v| v.as_str()); + + if let Some(name) = name { + // Get arguments - support both "arguments" and "parameters" keys + let empty_obj = Value::Object(serde_json::Map::new()); + let args = obj + .get("arguments") + .or_else(|| obj.get("parameters")) + .unwrap_or(&empty_obj); + + // Convert arguments to JSON string + let arguments = serde_json::to_string(args) + .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?; + + // Generate a unique ID if not provided + let id = obj + .get("id") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| format!("call_{}", uuid::Uuid::new_v4())); + + Ok(Some(ToolCall { + id, + r#type: "function".to_string(), + function: FunctionCall { + name: name.to_string(), + arguments, + }, + })) + } else { + Ok(None) + } + } + + /// Parse JSON value(s) into tool calls + fn parse_json_value(&self, value: &Value) -> ToolParserResult> { + let mut tools = Vec::new(); + + match value { + Value::Array(arr) => { + // Parse each element in the array + for item in arr { + if let Some(tool) = self.parse_single_object(item)? { + tools.push(tool); + } + } + } + Value::Object(_) => { + // Single tool call + if let Some(tool) = self.parse_single_object(value)? { + tools.push(tool); + } + } + _ => { + // Not a valid tool call format + return Ok(vec![]); + } + } + + Ok(tools) + } + + /// Check if text contains potential tool call markers + fn has_tool_markers(&self, text: &str) -> bool { + // If no start tokens configured, check for JSON structure + if self.start_tokens.is_empty() { + // For JSON, we just need to see the start of an object or array + return text.contains('{') || text.contains('['); + } + + // Check for any start token + self.start_tokens.iter().any(|token| text.contains(token)) + } +} + +impl Default for JsonParser { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl ToolParser for JsonParser { + async fn parse_complete(&self, text: &str) -> ToolParserResult> { + // Extract JSON content from wrapper tokens if present + let json_content = self.extract_json_content(text); + + // Try to parse as JSON + match serde_json::from_str::(json_content) { + Ok(value) => self.parse_json_value(&value), + Err(_) => { + // Not valid JSON, return empty + Ok(vec![]) + } + } + } + + async fn parse_incremental( + &self, + chunk: &str, + state: &mut ParseState, + ) -> ToolParserResult { + state.buffer.push_str(chunk); + + // Check if we have potential tool calls + if !self.has_tool_markers(&state.buffer) { + // No tool markers, return as incomplete + return Ok(StreamResult::Incomplete); + } + + // Extract JSON content + let json_content = self.extract_json_content(&state.buffer); + + // Try to parse with partial JSON parser + match self.partial_json.parse_value(json_content) { + Ok((value, consumed)) => { + // Check if we have a complete JSON structure + if consumed == json_content.len() { + // Complete JSON, parse tool calls + let tools = self.parse_json_value(&value)?; + if !tools.is_empty() { + // Clear buffer since we consumed everything + state.buffer.clear(); + + // Return the first tool as complete (simplified for Phase 2) + if let Some(tool) = tools.into_iter().next() { + return Ok(StreamResult::ToolComplete(tool)); + } + } + } else { + // Partial JSON, try to extract tool name + if let Some(name) = value.get("name").and_then(|v| v.as_str()) { + // Simple implementation for Phase 2 + // Just return the tool name once we see it + if !state.in_string { + state.in_string = true; // Use as a flag for "name sent" + return Ok(StreamResult::ToolName { + index: 0, + name: name.to_string(), + }); + } + + // Check for complete arguments + if let Some(args) = + value.get("arguments").or_else(|| value.get("parameters")) + { + if let Ok(args_str) = serde_json::to_string(args) { + // Return arguments as a single update + return Ok(StreamResult::ToolArguments { + index: 0, + arguments: args_str, + }); + } + } + } + } + } + Err(_) => { + // Failed to parse even as partial JSON + // Keep buffering + } + } + + Ok(StreamResult::Incomplete) + } + + fn detect_format(&self, text: &str) -> bool { + // Check if text contains JSON-like structure + if self.has_tool_markers(text) { + // Try to extract and parse + let json_content = self.extract_json_content(text); + + // Check if it looks like valid JSON for tool calls + if let Ok(value) = serde_json::from_str::(json_content) { + match value { + Value::Object(ref obj) => { + // Check for tool call structure + obj.contains_key("name") || obj.contains_key("function") + } + Value::Array(ref arr) => { + // Check if array contains tool-like objects + arr.iter().any(|v| { + v.as_object().is_some_and(|o| { + o.contains_key("name") || o.contains_key("function") + }) + }) + } + _ => false, + } + } else { + false + } + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_parse_single_tool_call() { + let parser = JsonParser::new(); + let input = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "get_weather"); + } + + #[tokio::test] + async fn test_parse_multiple_tool_calls() { + let parser = JsonParser::new(); + let input = r#"[ + {"name": "get_weather", "arguments": {"location": "SF"}}, + {"name": "search", "arguments": {"query": "news"}} + ]"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0].function.name, "get_weather"); + assert_eq!(result[1].function.name, "search"); + } + + #[tokio::test] + async fn test_parse_with_parameters_key() { + let parser = JsonParser::new(); + let input = r#"{"name": "calculate", "parameters": {"x": 10, "y": 20}}"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "calculate"); + assert!(result[0].function.arguments.contains("10")); + } + + #[tokio::test] + async fn test_parse_with_wrapper_tokens() { + let parser = JsonParser::with_config( + vec!["".to_string()], + vec!["".to_string()], + ", ".to_string(), + ); + + let input = r#"{"name": "test", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test"); + } + + #[test] + fn test_detect_format() { + let parser = JsonParser::new(); + + assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#)); + assert!(parser.detect_format(r#"[{"name": "test"}]"#)); + assert!(!parser.detect_format("plain text")); + assert!(!parser.detect_format(r#"{"key": "value"}"#)); + } + + #[tokio::test] + async fn test_streaming_parse() { + // Phase 2 simplified streaming test + // Just verify that streaming eventually produces a complete tool call + let parser = JsonParser::new(); + let mut state = ParseState::new(); + + // Send complete JSON in one go (simplified for Phase 2) + let full_json = r#"{"name": "get_weather", "arguments": {"location": "SF"}}"#; + + let result = parser + .parse_incremental(full_json, &mut state) + .await + .unwrap(); + + // Should get a complete tool immediately with complete JSON + match result { + StreamResult::ToolComplete(tool) => { + assert_eq!(tool.function.name, "get_weather"); + assert!(tool.function.arguments.contains("SF")); + } + _ => panic!("Expected ToolComplete for complete JSON input"), + } + } +} diff --git a/sgl-router/src/tool_parser/mod.rs b/sgl-router/src/tool_parser/mod.rs index 9545e4de0..01d42385f 100644 --- a/sgl-router/src/tool_parser/mod.rs +++ b/sgl-router/src/tool_parser/mod.rs @@ -1,8 +1,8 @@ /// Tool parser module for handling function/tool calls in model outputs /// /// This module provides infrastructure for parsing tool calls from various model formats. -/// Phase 1 focuses on core infrastructure: types, traits, registry, and partial JSON parsing. pub mod errors; +pub mod json_parser; pub mod partial_json; pub mod registry; pub mod state; @@ -14,6 +14,7 @@ mod tests; // Re-export commonly used types pub use errors::{ToolParserError, ToolParserResult}; +pub use json_parser::JsonParser; pub use registry::ParserRegistry; pub use state::{ParsePhase, ParseState}; pub use traits::{PartialJsonParser, ToolParser}; diff --git a/sgl-router/src/tool_parser/registry.rs b/sgl-router/src/tool_parser/registry.rs index aca354e7c..11153dfd5 100644 --- a/sgl-router/src/tool_parser/registry.rs +++ b/sgl-router/src/tool_parser/registry.rs @@ -1,3 +1,4 @@ +use crate::tool_parser::json_parser::JsonParser; use crate::tool_parser::traits::ToolParser; use std::collections::HashMap; use std::sync::Arc; @@ -21,6 +22,9 @@ impl ParserRegistry { default_parser: "json".to_string(), }; + // Register default parsers + registry.register_default_parsers(); + // Register default model mappings registry.register_default_mappings(); @@ -75,6 +79,14 @@ impl ParserRegistry { .collect() } + /// Register default parsers + fn register_default_parsers(&mut self) { + // JSON parser - most common format + self.register_parser("json", Arc::new(JsonParser::new())); + + // Note: Additional parsers (mistral, qwen, llama) will be added in later phases + } + /// Register default model mappings fn register_default_mappings(&mut self) { // OpenAI models @@ -85,16 +97,16 @@ impl ParserRegistry { // Anthropic models self.map_model("claude-*", "json"); - // Mistral models - self.map_model("mistral-*", "mistral"); - self.map_model("mixtral-*", "mistral"); + // Mistral models (will use json until mistral parser is implemented) + self.map_model("mistral-*", "json"); + self.map_model("mixtral-*", "json"); - // Qwen models - self.map_model("qwen*", "qwen"); + // Qwen models (will use json until qwen parser is implemented) + self.map_model("qwen*", "json"); - // Llama models - self.map_model("llama-*", "llama"); - self.map_model("meta-llama-*", "llama"); + // Llama models (will use json until llama parser is implemented) + self.map_model("llama-*", "json"); + self.map_model("meta-llama-*", "json"); // Other models default to JSON self.map_model("gemini-*", "json"); diff --git a/sgl-router/src/tool_parser/tests.rs b/sgl-router/src/tool_parser/tests.rs index e13c614a0..2635e0350 100644 --- a/sgl-router/src/tool_parser/tests.rs +++ b/sgl-router/src/tool_parser/tests.rs @@ -1,7 +1,9 @@ use super::*; +use crate::tool_parser::json_parser::JsonParser; use crate::tool_parser::partial_json::{ compute_diff, find_common_prefix, is_complete_json, PartialJson, }; +use crate::tool_parser::traits::ToolParser; #[test] fn test_parse_state_new() { @@ -247,3 +249,638 @@ fn test_partial_tool_call() { assert!(partial.name_sent); assert_eq!(partial.streamed_args, r#"{"key": "#); } + +#[tokio::test] +async fn test_json_parser_complete_single() { + let parser = JsonParser::new(); + + // Test single tool call with arguments + let input = r#"{"name": "get_weather", "arguments": {"location": "San Francisco", "units": "celsius"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "get_weather"); + assert!(result[0].function.arguments.contains("San Francisco")); + assert!(result[0].function.arguments.contains("celsius")); +} + +#[tokio::test] +async fn test_json_parser_complete_array() { + let parser = JsonParser::new(); + + // Test array of tool calls + let input = r#"[ + {"name": "get_weather", "arguments": {"location": "SF"}}, + {"name": "get_news", "arguments": {"query": "technology"}} + ]"#; + + let result = parser.parse_complete(input).await.unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0].function.name, "get_weather"); + assert_eq!(result[1].function.name, "get_news"); +} + +#[tokio::test] +async fn test_json_parser_with_parameters() { + let parser = JsonParser::new(); + + // Test with "parameters" instead of "arguments" + let input = r#"{"name": "calculate", "parameters": {"x": 10, "y": 20, "operation": "add"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "calculate"); + assert!(result[0].function.arguments.contains("10")); + assert!(result[0].function.arguments.contains("20")); + assert!(result[0].function.arguments.contains("add")); +} + +#[tokio::test] +async fn test_json_parser_with_tokens() { + // Test with custom wrapper tokens + let parser = JsonParser::with_config( + vec!["[TOOL_CALLS] [".to_string()], + vec!["]".to_string()], + ", ".to_string(), + ); + + let input = r#"[TOOL_CALLS] [{"name": "search", "arguments": {"query": "rust programming"}}]"#; + let result = parser.parse_complete(input).await.unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "search"); +} + +#[tokio::test] +async fn test_multiline_json_with_tokens() { + // Test that regex with (?s) flag properly handles multi-line JSON + let parser = JsonParser::with_config( + vec!["".to_string()], + vec!["".to_string()], + ", ".to_string(), + ); + + // Pretty-printed multi-line JSON + let input = r#"{ + "name": "get_weather", + "arguments": { + "location": "San Francisco", + "units": "celsius", + "include_forecast": true + } +}"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "get_weather"); + assert!(result[0].function.arguments.contains("San Francisco")); + assert!(result[0].function.arguments.contains("celsius")); + assert!(result[0].function.arguments.contains("true")); +} + +#[tokio::test] +async fn test_multiline_json_array() { + // Test multi-line JSON array without wrapper tokens + let parser = JsonParser::new(); + + let input = r#"[ + { + "name": "function1", + "arguments": { + "param1": "value1", + "param2": 42 + } + }, + { + "name": "function2", + "parameters": { + "data": [1, 2, 3], + "flag": false + } + } +]"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0].function.name, "function1"); + assert_eq!(result[1].function.name, "function2"); + assert!(result[0].function.arguments.contains("value1")); + assert!(result[1].function.arguments.contains("[1,2,3]")); +} + +#[test] +fn test_json_parser_format_detection() { + let parser = JsonParser::new(); + + // Should detect valid tool call formats + assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#)); + assert!(parser.detect_format(r#"{"name": "test", "parameters": {"x": 1}}"#)); + assert!(parser.detect_format(r#"[{"name": "test"}]"#)); + + // Should not detect non-tool formats + assert!(!parser.detect_format("plain text")); + assert!(!parser.detect_format(r#"{"key": "value"}"#)); + assert!(!parser.detect_format(r#"{"data": {"nested": true}}"#)); +} + +#[tokio::test] +async fn test_json_parser_streaming() { + // Phase 2 simplified streaming test + let parser = JsonParser::new(); + let mut state = ParseState::new(); + + // Test with complete JSON (simplified for Phase 2) + let full_json = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#; + + let result = parser + .parse_incremental(full_json, &mut state) + .await + .unwrap(); + + match result { + StreamResult::ToolComplete(tool) => { + assert_eq!(tool.function.name, "get_weather"); + assert!(tool.function.arguments.contains("San Francisco")); + } + _ => panic!("Expected ToolComplete for complete JSON"), + } +} + +#[tokio::test] +async fn test_registry_with_json_parser() { + let registry = ParserRegistry::new(); + + // JSON parser should be registered by default + assert!(registry.has_parser("json")); + + // Should get JSON parser for OpenAI models + let parser = registry.get_parser("gpt-4-turbo").unwrap(); + + // Test that the parser works + let input = r#"{"name": "test", "arguments": {"x": 1}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test"); +} + +#[tokio::test] +async fn test_json_parser_invalid_input() { + let parser = JsonParser::new(); + + // Invalid JSON should return empty results + assert_eq!(parser.parse_complete("not json").await.unwrap().len(), 0); + assert_eq!(parser.parse_complete("{invalid}").await.unwrap().len(), 0); + assert_eq!(parser.parse_complete("").await.unwrap().len(), 0); +} + +#[tokio::test] +async fn test_json_parser_empty_arguments() { + let parser = JsonParser::new(); + + // Tool call with no arguments + let input = r#"{"name": "get_time"}"#; + let result = parser.parse_complete(input).await.unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "get_time"); + assert_eq!(result[0].function.arguments, "{}"); +} + +#[cfg(test)] +mod failure_cases { + use super::*; + + #[tokio::test] + async fn test_malformed_tool_missing_name() { + let parser = JsonParser::new(); + + // Missing name field + let input = r#"{"arguments": {"x": 1}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 0, "Should return empty for tool without name"); + + // Empty name + let input = r#"{"name": "", "arguments": {"x": 1}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1, "Should accept empty name string"); + assert_eq!(result[0].function.name, ""); + } + + #[tokio::test] + async fn test_invalid_arguments_json() { + let parser = JsonParser::new(); + + // Arguments is a string instead of object + let input = r#"{"name": "test", "arguments": "not an object"}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + // Should serialize the string as JSON + assert!(result[0].function.arguments.contains("not an object")); + + // Arguments is a number + let input = r#"{"name": "test", "arguments": 42}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.arguments, "42"); + + // Arguments is null + let input = r#"{"name": "test", "arguments": null}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.arguments, "null"); + } + + #[tokio::test] + async fn test_broken_wrapper_tokens() { + let parser = JsonParser::with_config( + vec!["".to_string()], + vec!["".to_string()], + ", ".to_string(), + ); + + // Missing end token + let input = r#"{"name": "test", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!( + result.len(), + 0, + "Should fail to parse without complete wrapper" + ); + + // Missing start token - parser looks for complete wrapper, so this won't parse + let input = r#"{"name": "test", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!( + result.len(), + 0, + "Should not parse JSON with incomplete wrapper" + ); + + // Mismatched tokens + let input = r#"{"name": "test", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 0, "Should fail with mismatched tokens"); + } + + #[tokio::test] + async fn test_invalid_json_structures() { + let parser = JsonParser::new(); + + // Trailing comma + let input = r#"{"name": "test", "arguments": {"x": 1,}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 0, "Should reject JSON with trailing comma"); + + // Missing quotes on keys + let input = r#"{name: "test", arguments: {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 0, "Should reject invalid JSON syntax"); + + // Unclosed object + let input = r#"{"name": "test", "arguments": {"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 0, "Should reject incomplete JSON"); + } +} + +#[cfg(test)] +mod edge_cases { + use super::*; + + #[tokio::test] + async fn test_unicode_in_names_and_arguments() { + let parser = JsonParser::new(); + + // Unicode in function name + let input = r#"{"name": "获取天气", "arguments": {"location": "北京"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "获取天气"); + assert!(result[0].function.arguments.contains("北京")); + + // Emoji in arguments + let input = r#"{"name": "send_message", "arguments": {"text": "Hello 👋 World 🌍"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("👋")); + assert!(result[0].function.arguments.contains("🌍")); + } + + #[tokio::test] + async fn test_escaped_characters() { + let parser = JsonParser::new(); + + // Escaped quotes in arguments + let input = r#"{"name": "echo", "arguments": {"text": "He said \"hello\""}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains(r#"\"hello\""#)); + + // Escaped backslashes + let input = r#"{"name": "path", "arguments": {"dir": "C:\\Users\\test"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("\\\\")); + + // Newlines and tabs + let input = r#"{"name": "format", "arguments": {"text": "line1\nline2\ttabbed"}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("\\n")); + assert!(result[0].function.arguments.contains("\\t")); + } + + #[tokio::test] + async fn test_very_large_payloads() { + let parser = JsonParser::new(); + + // Large arguments object + let mut large_args = r#"{"name": "process", "arguments": {"#.to_string(); + for i in 0..1000 { + large_args.push_str(&format!(r#""field_{}": "value_{}","#, i, i)); + } + large_args.push_str(r#""final": "value"}}"#); + + let result = parser.parse_complete(&large_args).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "process"); + assert!(result[0].function.arguments.contains("field_999")); + + // Large array of tool calls + let mut large_array = "[".to_string(); + for i in 0..100 { + if i > 0 { + large_array.push(','); + } + large_array.push_str(&format!(r#"{{"name": "func_{}", "arguments": {{}}}}"#, i)); + } + large_array.push(']'); + + let result = parser.parse_complete(&large_array).await.unwrap(); + assert_eq!(result.len(), 100); + assert_eq!(result[99].function.name, "func_99"); + } + + #[tokio::test] + async fn test_mixed_array_tools_and_non_tools() { + let parser = JsonParser::new(); + + // Array with both tool calls and non-tool objects + let input = r#"[ + {"name": "tool1", "arguments": {}}, + {"not_a_tool": "just_data"}, + {"name": "tool2", "parameters": {"x": 1}}, + {"key": "value", "another": "field"} + ]"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 2, "Should only parse valid tool calls"); + assert_eq!(result[0].function.name, "tool1"); + assert_eq!(result[1].function.name, "tool2"); + } + + #[tokio::test] + async fn test_duplicate_keys_in_json() { + let parser = JsonParser::new(); + + // JSON with duplicate keys (last one wins in most parsers) + let input = r#"{"name": "first", "name": "second", "arguments": {"x": 1, "x": 2}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!( + result[0].function.name, "second", + "Last duplicate key should win" + ); + assert!( + result[0].function.arguments.contains("2"), + "Last duplicate value should win" + ); + } + + #[tokio::test] + async fn test_null_values_in_arguments() { + let parser = JsonParser::new(); + + // Null values in arguments + let input = r#"{"name": "test", "arguments": {"required": "value", "optional": null}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("null")); + + // Array with null + let input = r#"{"name": "test", "arguments": {"items": [1, null, "three"]}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("null")); + } + + #[tokio::test] + async fn test_multiple_token_pairs_with_conflicts() { + // Test with overlapping token patterns + let parser = JsonParser::with_config( + vec!["<<".to_string(), "".to_string()], + vec![">>".to_string(), "".to_string()], + ", ".to_string(), + ); + + // First pattern + let input = r#"<<{"name": "test1", "arguments": {}}>>"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test1"); + + // Second pattern + let input = r#"{"name": "test2", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test2"); + + // Nested patterns (should use first match) + let input = r#"<{"name": "test3", "arguments": {}}>"#; + let result = parser.parse_complete(input).await.unwrap(); + // This is tricky - depends on regex behavior + // The parser should handle this gracefully + assert!(result.len() <= 1, "Should not parse multiple times"); + } + + #[tokio::test] + async fn test_streaming_with_partial_chunks() { + let parser = JsonParser::new(); + + // Test 1: Very incomplete JSON (just opening brace) should return Incomplete + let mut state1 = ParseState::new(); + let partial = r#"{"#; + let result = parser + .parse_incremental(partial, &mut state1) + .await + .unwrap(); + assert!( + matches!(result, StreamResult::Incomplete), + "Should return Incomplete for just opening brace" + ); + + // Test 2: Complete JSON should return ToolComplete + let mut state2 = ParseState::new(); + let complete = r#"{"name": "get_weather", "arguments": {"location": "SF"}}"#; + let result = parser + .parse_incremental(complete, &mut state2) + .await + .unwrap(); + + match result { + StreamResult::ToolComplete(tool) => { + assert_eq!(tool.function.name, "get_weather"); + let args: serde_json::Value = + serde_json::from_str(&tool.function.arguments).unwrap(); + assert_eq!(args["location"], "SF"); + } + _ => panic!("Expected ToolComplete for complete JSON"), + } + + // Test 3: Partial JSON with name - Phase 2 behavior + // The PartialJson parser can complete partial JSON by filling in missing values + let mut state3 = ParseState::new(); + let partial_with_name = r#"{"name": "test", "argum"#; + let result = parser + .parse_incremental(partial_with_name, &mut state3) + .await + .unwrap(); + + match result { + StreamResult::ToolComplete(tool) => { + assert_eq!(tool.function.name, "test"); + // Arguments will be empty object since "argum" is incomplete + assert_eq!(tool.function.arguments, "{}"); + } + StreamResult::ToolName { name, .. } => { + assert_eq!(name, "test"); + } + StreamResult::Incomplete => { + // Also acceptable if parser decides to wait + } + _ => panic!("Unexpected result for partial JSON with name"), + } + } + + #[tokio::test] + async fn test_special_json_values() { + let parser = JsonParser::new(); + + // Boolean values + let input = r#"{"name": "toggle", "arguments": {"enabled": true, "disabled": false}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("true")); + assert!(result[0].function.arguments.contains("false")); + + // Numbers (including float and negative) + let input = r#"{"name": "calc", "arguments": {"int": 42, "float": 3.14, "negative": -17}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("42")); + assert!(result[0].function.arguments.contains("3.14")); + assert!(result[0].function.arguments.contains("-17")); + + // Empty arrays and objects + let input = r#"{"name": "test", "arguments": {"empty_arr": [], "empty_obj": {}}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("[]")); + assert!(result[0].function.arguments.contains("{}")); + } + + #[tokio::test] + async fn test_function_field_alternative() { + let parser = JsonParser::new(); + + // Using "function" instead of "name" + let input = r#"{"function": "test_func", "arguments": {"x": 1}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test_func"); + + // Both "name" and "function" present (name should take precedence) + let input = r#"{"name": "primary", "function": "secondary", "arguments": {}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "primary"); + } + + #[tokio::test] + async fn test_whitespace_handling() { + let parser = JsonParser::new(); + + // Extra whitespace everywhere + let input = r#" { + "name" : "test" , + "arguments" : { + "key" : "value" + } + } "#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "test"); + + // Minified JSON (no whitespace) + let input = r#"{"name":"compact","arguments":{"a":1,"b":2}}"#; + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, "compact"); + } +} + +#[cfg(test)] +mod stress_tests { + use super::*; + + #[tokio::test] + async fn test_deeply_nested_arguments() { + let parser = JsonParser::new(); + + // Deeply nested structure + let input = r#"{ + "name": "nested", + "arguments": { + "level1": { + "level2": { + "level3": { + "level4": { + "level5": { + "value": "deep" + } + } + } + } + } + } + }"#; + + let result = parser.parse_complete(input).await.unwrap(); + assert_eq!(result.len(), 1); + assert!(result[0].function.arguments.contains("deep")); + } + + #[tokio::test] + async fn test_concurrent_parser_usage() { + // Test that parser can be used concurrently + let parser = std::sync::Arc::new(JsonParser::new()); + + let mut handles = vec![]; + + for i in 0..10 { + let parser_clone = parser.clone(); + let handle = tokio::spawn(async move { + let input = format!(r#"{{"name": "func_{}", "arguments": {{}}}}"#, i); + let result = parser_clone.parse_complete(&input).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].function.name, format!("func_{}", i)); + }); + handles.push(handle); + } + + for handle in handles { + handle.await.unwrap(); + } + } +}