From e483c1eae514f9a1ef36526ba435f37ef159aeb5 Mon Sep 17 00:00:00 2001
From: Simo Lin <linsimo.mark@gmail.com>
Date: Fri, 17 Oct 2025 10:21:00 -0700
Subject: [PATCH] [router] Fix UTF-8 Boundary Panic in Stop Sequence Decoder
 (#11766)

---
 sgl-router/src/tokenizer/stop.rs | 170 ++++++++++++++++++++++++++++---
 1 file changed, 158 insertions(+), 12 deletions(-)
diff --git a/sgl-router/src/tokenizer/stop.rs b/sgl-router/src/tokenizer/stop.rs
index 14f68793d..ef0630e82 100644
--- a/sgl-router/src/tokenizer/stop.rs
+++ b/sgl-router/src/tokenizer/stop.rs
@@ -149,29 +149,45 @@ impl StopSequenceDecoder {
 
         // Check for partial matches: is the end of jail_buffer the start of any stop_seq?
         // This handles stop sequences split across tokens
-        let mut longest_partial = 0;
+        let buffer_len = self.jail_buffer.len();
+        let mut best_split_pos: Option<usize> = None;
+
         for stop_seq in self
             .config
             .stop_sequences
             .iter()
             .chain(&self.config.visible_stop_sequences)
         {
-            // Check suffixes of jail_buffer that match prefixes of stop_seq
-            // We check up to stop_seq.len() - 1 to avoid rechecking exact matches
-            let max_len = self.jail_buffer.len().min(stop_seq.len() - 1);
-            for len in 1..=max_len {
-                let suffix = &self.jail_buffer[self.jail_buffer.len() - len..];
-                if stop_seq.starts_with(suffix) {
-                    longest_partial = longest_partial.max(len);
+            let stop_len = stop_seq.len();
+
+            if stop_len <= 1 || buffer_len == 0 {
+                continue;
+            }
+
+            let max_len = buffer_len.min(stop_len - 1);
+
+            for len in (1..=max_len).rev() {
+                let suffix_start = buffer_len - len;
+
+                if !self.jail_buffer.is_char_boundary(suffix_start) {
+                    continue;
+                }
+
+                let suffix = &self.jail_buffer[suffix_start..];
+
+                if stop_seq.starts_with(suffix)
+                    && best_split_pos.is_none_or(|current| suffix_start < current)
+                {
+                    best_split_pos = Some(suffix_start);
+                    break;
                 }
             }
         }
 
-        if longest_partial > 0 {
+        if let Some(split_pos) = best_split_pos {
             // Hold the partial match, flush the rest
-            let split_pos = self.jail_buffer.len() - longest_partial;
-            let to_output = self.jail_buffer[..split_pos].to_string();
-            self.jail_buffer = self.jail_buffer[split_pos..].to_string();
+            // Drain [0..split_pos] as output, keep [split_pos..] in jail_buffer
+            let to_output = self.jail_buffer.drain(..split_pos).collect::<String>();
 
             if to_output.is_empty() {
                 Ok(SequenceDecoderOutput::Held)
@@ -457,4 +473,134 @@ mod tests {
             ));
         }
     }
+
+    #[test]
+    fn test_utf8_multibyte_character_boundaries() {
+        // This test verifies the fix for the UTF-8 boundary panic
+        // The panic occurred when trying to slice jail_buffer at a byte index
+        // that was in the middle of a multi-byte UTF-8 character (e.g., '×')
+        use crate::tokenizer::mock::MockTokenizer;
+
+        let tokenizer = Arc::new(MockTokenizer::new());
+
+        // Configure stop sequence with a multi-byte character
+        let config = StopSequenceConfig::default().with_stop_sequence(" ×");
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Simulate the scenario: jail_buffer will contain " ×" (space + multiplication sign)
+        // The '×' character is UTF-8 encoded as bytes [0xC3, 0x97] (2 bytes)
+        // When checking for partial matches, we must not slice in the middle of these bytes
+
+        // This should not panic - the fix ensures we only slice at char boundaries
+        let result = decoder.process_token(1); // Will add some text to jail_buffer
+        assert!(result.is_ok());
+
+        // Even with multi-byte UTF-8 characters in the buffer, processing should work
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_delta_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside 'Δ' (bytes 0..2) of `Δ`
+        // 'Δ' (U+0394 GREEK CAPITAL LETTER DELTA) is encoded as [0xCE, 0x94] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("Δ");
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_degree_character() {
+        // Test for: byte index 1 is not a char boundary; it is inside '°' (bytes 0..2) of `°`
+        // '°' (U+00B0 DEGREE SIGN) is encoded as [0xC2, 0xB0] (2 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("°");
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_triangle_character() {
+        // Test for: byte index 4 is not a char boundary; it is inside '∆' (bytes 2..5) of ` (∆`
+        // '∆' (U+2206 INCREMENT) is encoded as [0xE2, 0x88, 0x86] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" (∆");
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_en_dash_character() {
+        // Test for: byte index 3 is not a char boundary; it is inside '–' (bytes 1..4) of ` –`
+        // '–' (U+2013 EN DASH) is encoded as [0xE2, 0x80, 0x93] (3 bytes)
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence(" –");
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens - should not panic when checking partial matches
+        let result = decoder.process_token(1);
+        assert!(result.is_ok());
+        let result = decoder.process_token(2);
+        assert!(result.is_ok());
+        let result = decoder.process_token(3);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_utf8_multibyte_various_characters() {
+        // Comprehensive test with multiple multi-byte UTF-8 characters
+        // Tests 2-byte, 3-byte, and 4-byte UTF-8 sequences
+        let test_cases = vec![
+            ("×", "multiplication sign - 2 bytes"),
+            ("Δ", "Greek Delta - 2 bytes"),
+            ("°", "degree sign - 2 bytes"),
+            ("∆", "increment - 3 bytes"),
+            ("–", "en dash - 3 bytes"),
+            ("€", "euro sign - 3 bytes"),
+            ("中", "Chinese character - 3 bytes"),
+            ("🚀", "rocket emoji - 4 bytes"),
+            ("💡", "lightbulb emoji - 4 bytes"),
+        ];
+
+        for (stop_char, description) in test_cases {
+            let tokenizer = Arc::new(MockTokenizer::new());
+            let config = StopSequenceConfig::default().with_stop_sequence(stop_char);
+
+            let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+            // Process multiple tokens - should not panic
+            for token_id in 1..=5 {
+                let result = decoder.process_token(token_id);
+                assert!(
+                    result.is_ok(),
+                    "Failed on {} with token {}",
+                    description,
+                    token_id
+                );
+            }
+        }
+    }
 }