[router] add ut for mistral, llama, pythonic, and streaming tool parser (#9632)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
2025-08-25 22:02:15 -07:00
parent dc1decc6af
commit e2e378caba
11 changed files with 2371 additions and 0 deletions
--- a/sgl-router/tests/tool_parser_edge_cases.rs
+++ b/sgl-router/tests/tool_parser_edge_cases.rs
@@ -0,0 +1,330 @@
+//! Edge Cases and Error Handling Tests
+//!
+//! Tests for malformed input, edge cases, and error recovery
+
+use sglang_router_rs::tool_parser::{
+    JsonParser, MistralParser, ParseState, ParserRegistry, PythonicParser, QwenParser,
+    StreamResult, ToolParser,
+};
+
+#[tokio::test]
+async fn test_empty_input() {
+    let registry = ParserRegistry::new();
+    let parsers = vec!["json", "mistral", "qwen", "pythonic", "llama"];
+
+    for parser_name in parsers {
+        let parser = registry
+            .get_parser(&format!("test-{}", parser_name))
+            .unwrap();
+        let result = parser.parse_complete("").await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Parser {} should return empty for empty input",
+            parser_name
+        );
+    }
+}
+
+#[tokio::test]
+async fn test_plain_text_no_tools() {
+    let plain_text = "This is just a regular response with no tool calls whatsoever.";
+
+    let json_parser = JsonParser::new();
+    assert_eq!(
+        json_parser.parse_complete(plain_text).await.unwrap().len(),
+        0
+    );
+
+    let mistral_parser = MistralParser::new();
+    assert_eq!(
+        mistral_parser
+            .parse_complete(plain_text)
+            .await
+            .unwrap()
+            .len(),
+        0
+    );
+
+    let qwen_parser = QwenParser::new();
+    assert_eq!(
+        qwen_parser.parse_complete(plain_text).await.unwrap().len(),
+        0
+    );
+
+    let pythonic_parser = PythonicParser::new();
+    assert_eq!(
+        pythonic_parser
+            .parse_complete(plain_text)
+            .await
+            .unwrap()
+            .len(),
+        0
+    );
+}
+
+#[tokio::test]
+async fn test_incomplete_json() {
+    let json_parser = JsonParser::new();
+
+    let incomplete_cases = vec![
+        r#"{"name": "test""#,                 // Missing closing brace
+        r#"{"name": "test", "arguments":"#,   // Incomplete arguments
+        r#"{"name": "test", "arguments": {"#, // Incomplete nested object
+    ];
+
+    for input in incomplete_cases {
+        let result = json_parser.parse_complete(input).await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Should not parse incomplete JSON: {}",
+            input
+        );
+    }
+
+    // This case might actually parse because [{"name": "test"}] is complete
+    // The trailing comma suggests more items but the first item is valid
+    let _result = json_parser
+        .parse_complete(r#"[{"name": "test"},"#)
+        .await
+        .unwrap();
+    // This could parse the first element or return empty - implementation dependent
+}
+
+#[tokio::test]
+async fn test_malformed_mistral() {
+    let parser = MistralParser::new();
+
+    let malformed_cases = vec![
+        "[TOOL_CALLS]",                // Missing array
+        "[TOOL_CALLS] {",              // Not an array
+        "[TOOL_CALLS] [",              // Incomplete array
+        "[TOOL_CALLS] [{]",            // Invalid JSON in array
+        "[TOOL_CALLS] [{\"name\": }]", // Invalid value
+    ];
+
+    for input in malformed_cases {
+        // Parser might return error or empty vec for malformed input
+        if let Ok(result) = parser.parse_complete(input).await {
+            assert_eq!(
+                result.len(),
+                0,
+                "Should not parse malformed Mistral: {}",
+                input
+            );
+        }
+        // Error is also acceptable for malformed input
+    }
+}
+
+#[tokio::test]
+async fn test_missing_required_fields() {
+    let json_parser = JsonParser::new();
+
+    // Missing name field
+    let input = r#"{"arguments": {"x": 1}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse without name field");
+
+    // Name is not a string
+    let input = r#"{"name": 123, "arguments": {}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse with non-string name");
+}
+
+#[tokio::test]
+async fn test_very_long_strings() {
+    let json_parser = JsonParser::new();
+
+    let long_string = "x".repeat(10000);
+    let input = format!(
+        r#"{{"name": "test", "arguments": {{"data": "{}"}}}}"#,
+        long_string
+    );
+
+    let result = json_parser.parse_complete(&input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["data"].as_str().unwrap().len(), 10000);
+}
+
+#[tokio::test]
+async fn test_unicode_edge_cases() {
+    let json_parser = JsonParser::new();
+
+    // Various Unicode characters including emojis, CJK, RTL text
+    let input = r#"{"name": "translate", "arguments": {"text": "Hello 世界 🌍 مرحبا עולם"}}"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Hello 世界 🌍 مرحبا עולם");
+}
+
+#[tokio::test]
+async fn test_nested_brackets_in_strings() {
+    // Test that parsers correctly handle brackets within string literals
+
+    let mistral_parser = MistralParser::new();
+    let input = r#"[TOOL_CALLS] [{"name": "echo", "arguments": {"text": "Array: [1, 2, 3]"}}]"#;
+    let result = mistral_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Array: [1, 2, 3]");
+
+    let pythonic_parser = PythonicParser::new();
+    let input = r#"[echo(text="List: [a, b, c]")]"#;
+    let result = pythonic_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "List: [a, b, c]");
+}
+
+#[tokio::test]
+async fn test_multiple_formats_in_text() {
+    // Test that parsers don't get confused by other formats in the text
+
+    let json_parser = JsonParser::new();
+    let input = r#"
+    Here's some text with [TOOL_CALLS] that shouldn't trigger.
+    {"name": "actual_tool", "arguments": {}}
+    And some more text with <tool_call> tags.
+    "#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "actual_tool");
+}
+
+#[tokio::test]
+async fn test_escaped_characters() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{"name": "write", "arguments": {"content": "Line 1\nLine 2\r\nLine 3\tTabbed\\Backslash\"Quote"}}"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    let content = args["content"].as_str().unwrap();
+    assert!(content.contains('\n'));
+    assert!(content.contains('\t'));
+    assert!(content.contains('\\'));
+    assert!(content.contains('"'));
+}
+
+#[tokio::test]
+async fn test_numeric_edge_cases() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{
+        "name": "calculate",
+        "arguments": {
+            "int": 42,
+            "float": 123.456,
+            "scientific": 1.23e-4,
+            "negative": -999,
+            "zero": 0,
+            "large": 9007199254740991
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["int"], 42);
+    assert_eq!(args["float"], 123.456);
+    assert_eq!(args["scientific"], 0.000123);
+    assert_eq!(args["negative"], -999);
+    assert_eq!(args["zero"], 0);
+    assert_eq!(args["large"], 9007199254740991i64);
+}
+
+#[tokio::test]
+async fn test_null_and_boolean_values() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{
+        "name": "configure",
+        "arguments": {
+            "enabled": true,
+            "disabled": false,
+            "optional": null
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["enabled"], true);
+    assert_eq!(args["disabled"], false);
+    assert_eq!(args["optional"], serde_json::Value::Null);
+}
+
+#[tokio::test]
+async fn test_partial_token_at_buffer_boundary() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // Test case that would fail with the bug:
+    // Send exactly "<tool" which is a 5-character prefix of "<tool_call>\n"
+    let result = parser.parse_incremental("<tool", &mut state).await.unwrap();
+    assert!(matches!(result, StreamResult::Incomplete));
+    assert_eq!(state.buffer, "<tool");
+
+    // Complete the token
+    let result = parser
+        .parse_incremental(
+            "_call>\n{\"name\": \"test\", \"arguments\": {}}\n</tool_call>",
+            &mut state,
+        )
+        .await
+        .unwrap();
+
+    // Should successfully parse after completing
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test");
+        }
+        _ => {
+            // In Phase 2 simplified streaming, might get Incomplete
+            // The important thing is it didn't fail to recognize the partial token
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_exact_prefix_lengths() {
+    let parser = QwenParser::new();
+
+    // Test various exact prefix lengths that would be missed by exclusive range
+    let test_cases = vec![
+        ("<", 1),            // 1-char prefix
+        ("<t", 2),           // 2-char prefix
+        ("<tool", 5),        // 5-char prefix (the main bug case)
+        ("<tool_call", 10),  // 10-char prefix
+        ("<tool_call>", 11), // 11-char prefix (full start without \n)
+    ];
+
+    for (prefix, expected_len) in test_cases {
+        let mut state = ParseState::new();
+        let result = parser.parse_incremental(prefix, &mut state).await.unwrap();
+        assert!(
+            matches!(result, StreamResult::Incomplete),
+            "Prefix '{}' (len {}) should be incomplete",
+            prefix,
+            expected_len
+        );
+        assert_eq!(
+            state.buffer, prefix,
+            "Buffer should contain the prefix '{}'",
+            prefix
+        );
+    }
+}