From 7e831efee8c0ab00f1c5d77f8293dee0602fdbe9 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Date: Thu, 31 Jul 2025 21:49:45 -0700
Subject: [PATCH] Fix chat template handling for OpenAI serving (#8635)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
---
 python/sglang/srt/conversation.py             | 112 ------------------
 .../sglang/srt/managers/template_manager.py   |  47 ++++----
 .../sglang/srt/managers/tokenizer_manager.py  |   4 +-
 .../srt/multimodal/processors/gemma3n.py      |   7 --
 test/srt/test_vision_openai_server_a.py       |  19 ++-
 test/srt/test_vision_openai_server_b.py       |  26 +++-
 test/srt/test_vision_openai_server_common.py  |  24 ++--
 7 files changed, 83 insertions(+), 156 deletions(-)
diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py
index c34527591..1d1340a1d 100644
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -954,20 +954,6 @@ register_conv_template(
     )
 )
 
-register_conv_template(
-    Conversation(
-        name="mimo-vl",
-        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
-        system_template="<|im_start|>system\n{system_message}",
-        roles=("<|im_start|>user", "<|im_start|>assistant"),
-        sep="<|im_end|>\n",
-        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
-        stop_str=["<|im_end|>"],
-        image_token="<|vision_start|><|image_pad|><|vision_end|>",
-    )
-)
-
-
 register_conv_template(
     Conversation(
         name="qwen2-audio",
@@ -981,51 +967,11 @@ register_conv_template(
     )
 )
 
-register_conv_template(
-    Conversation(
-        name="llama_4_vision",
-        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
-        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
-        roles=("user", "assistant"),
-        sep_style=SeparatorStyle.LLAMA4,
-        sep="",
-        stop_str="<|eot|>",
-        image_token="<|image|>",
-    )
-)
-
-register_conv_template(
-    Conversation(
-        name="step3-vl",
-        system_message="<｜begin▁of▁sentence｜>You are a helpful assistant",
-        system_template="{system_message}\n",
-        roles=(
-            "<|BOT|>user\n",
-            "<|BOT|>assistant\n<think>\n",
-        ),
-        sep="<|EOT|>",
-        sep_style=SeparatorStyle.NO_COLON_SINGLE,
-        stop_str="<|EOT|>",
-        image_token="<im_patch>",
-        # add_bos=True,
-    )
-)
-
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
-    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
-        return "interns1"
-
-
-@register_conv_template_matching_function
-def match_llama_vision(model_path: str):
-    if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
-        return "llama_3_vision"
-    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
-        return "llama_4_vision"
 
 
 @register_conv_template_matching_function
@@ -1040,22 +986,6 @@ def match_vicuna(model_path: str):
         return "vicuna_v1.1"
 
 
-@register_conv_template_matching_function
-def match_llama2_chat(model_path: str):
-    if re.search(
-        r"llama-2.*chat|codellama.*instruct",
-        model_path,
-        re.IGNORECASE,
-    ):
-        return "llama-2"
-
-
-@register_conv_template_matching_function
-def match_mistral(model_path: str):
-    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
-        return "mistral"
-
-
 @register_conv_template_matching_function
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
@@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str):
 
 @register_conv_template_matching_function
 def match_qwen_chat_ml(model_path: str):
-    if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE):
-        return "gme-qwen2-vl"
-    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
-        return "qwen2-vl"
-    if re.search(r"qwen.*audio", model_path, re.IGNORECASE):
-        return "qwen2-audio"
     if re.search(
         r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
         model_path,
@@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str):
         return "chatml-llava"
 
 
-@register_conv_template_matching_function
-def match_gemma3_instruct(model_path: str):
-    if re.search(r"gemma-3.*it", model_path, re.IGNORECASE):
-        return "gemma-it"
-
-
 @register_conv_template_matching_function
 def match_openbmb_minicpm(model_path: str):
     if re.search(r"minicpm-v", model_path, re.IGNORECASE):
@@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str):
         return "minicpmo"
 
 
-@register_conv_template_matching_function
-def match_moonshot_kimivl(model_path: str):
-    if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
-        return "kimi-vl"
-
-
-@register_conv_template_matching_function
-def match_devstral(model_path: str):
-    if re.search(r"devstral", model_path, re.IGNORECASE):
-        return "devstral"
-
-
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
-
-
-@register_conv_template_matching_function
-def match_vila(model_path: str):
-    if re.search(r"vila", model_path, re.IGNORECASE):
-        return "chatml"
-
-
-@register_conv_template_matching_function
-def match_mimo_vl(model_path: str):
-    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
-        return "mimo-vl"
-
-
-# @register_conv_template_matching_function
-# def match_step3(model_path: str):
-#     if re.search(r"step3", model_path, re.IGNORECASE):
-#         return "step3-vl"
diff --git a/python/sglang/srt/managers/template_manager.py b/python/sglang/srt/managers/template_manager.py
index e340f65f0..c98e3d14a 100644
--- a/python/sglang/srt/managers/template_manager.py
+++ b/python/sglang/srt/managers/template_manager.py
@@ -84,26 +84,27 @@ class TemplateManager:
         if chat_template_arg:
             self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
         else:
-            # Try HuggingFace template first
-            hf_template = self._resolve_hf_chat_template(tokenizer_manager)
-            if hf_template:
-                self._jinja_template_content_format = (
-                    detect_jinja_template_content_format(hf_template)
-                )
-                logger.info(
-                    f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
-                )
-                return
-
-            # Fallback to SGLang template guessing
+            # Guess chat template from model path
             self.guess_chat_template_from_model_path(model_path)
 
-            # Set default format if no template was found
+            # If no pre-defined template was found, fallback to HuggingFace template
             if self._chat_template_name is None:
-                self._jinja_template_content_format = "string"
-                logger.info(
-                    "No chat template found, defaulting to 'string' content format"
-                )
+                # Fall back to the template bundled with the HF processor/tokenizer
+                hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+                if hf_template:
+                    # propagate the resolved template (possibly the processor's) to the tokenizer
+                    tokenizer_manager.tokenizer.chat_template = hf_template
+                    self._jinja_template_content_format = (
+                        detect_jinja_template_content_format(hf_template)
+                    )
+                    logger.info(
+                        f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                    )
+                    return
+
+            # No HF template was applied (none found, or a pre-defined one matched): use string format
+            self._jinja_template_content_format = "string"
+            logger.info("No chat template found, defaulting to 'string' content format")
 
     def _load_explicit_chat_template(
         self, tokenizer_manager, chat_template_arg: str
@@ -257,13 +258,15 @@ class TemplateManager:
 
         Returns the chat template string if found, None otherwise.
         """
-        tokenizer = tokenizer_manager.tokenizer
-
-        # Try to get AutoTokenizer chat template
         try:
-            return tokenizer.get_chat_template()
+            if processor := tokenizer_manager.processor:
+                if hasattr(processor, "chat_template") and processor.chat_template:
+                    return processor.chat_template
+            if tokenizer := tokenizer_manager.tokenizer:
+                if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+                    return tokenizer.chat_template
         except Exception as e:
-            logger.debug(f"Error getting chat template via get_chat_template(): {e}")
+            logger.debug(f"Error getting chat template: {e}")
 
         logger.debug("No HuggingFace chat template found")
         return None
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 9250c6866..46fd967e5 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -225,10 +225,10 @@ class TokenizerManager:
                 self.tokenizer = get_tokenizer_from_processor(self.processor)
                 os.environ["TOKENIZERS_PARALLELISM"] = "false"
         else:
-            self.mm_processor = None
+            self.mm_processor = self.processor = None
 
             if server_args.skip_tokenizer_init:
-                self.tokenizer = self.processor = None
+                self.tokenizer = None
             else:
                 self.tokenizer = get_tokenizer(
                     server_args.tokenizer_path,
diff --git a/python/sglang/srt/multimodal/processors/gemma3n.py b/python/sglang/srt/multimodal/processors/gemma3n.py
index 4bfbcaffa..9ea8b8be3 100644
--- a/python/sglang/srt/multimodal/processors/gemma3n.py
+++ b/python/sglang/srt/multimodal/processors/gemma3n.py
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 
-import re
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)
 
     async def process_mm_data_async(
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
index 4c41e2fec..9d69b918c 100644
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -31,6 +31,8 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
             other_args=[
                 "--mem-fraction-static",
                 "0.35",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -53,6 +55,8 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
             other_args=[
                 "--mem-fraction-static",
                 "0.35",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -76,6 +80,8 @@ class TestVLMContextLengthIssue(CustomTestCase):
                 "--context-length",
                 "300",
                 "--mem-fraction-static=0.75",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -149,6 +155,8 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.35",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -164,7 +172,11 @@ class TestInternVL2_5Server(TestOpenAIVisionServer):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--trust-remote-code"],
+            other_args=[
+                "--trust-remote-code",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
         )
         cls.base_url += "/v1"
 
@@ -183,6 +195,8 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.65",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -207,10 +221,13 @@ class TestMimoVLServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.6",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
 
 
 if __name__ == "__main__":
+    del TestOpenAIVisionServer
     unittest.main()
diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
index 533312aaf..81dc2c6b7 100644
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -23,6 +23,8 @@ class TestPixtralServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.70",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -45,6 +47,8 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.75",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -67,7 +71,8 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--context-length",
                 "4096",
-                "--disable-cuda-graph",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -90,6 +95,8 @@ class TestJanusProServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--mem-fraction-static",
                 "0.35",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -120,6 +127,10 @@ class TestJanusProServer(TestOpenAIVisionServer):
 #                 "0.8",
 #                 "--tp-size=8",
 #                 "--context-length=8192",
+#                 "--mm-attention-backend",
+#                 "fa3",
+#                 "--cuda-graph-max-bs",
+#                 "4",
 #             ],
 #         )
 #         cls.base_url += "/v1"
@@ -143,6 +154,8 @@ class TestGemma3itServer(TestOpenAIVisionServer):
                 "--mem-fraction-static",
                 "0.70",
                 "--enable-multimodal",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -154,7 +167,7 @@ class TestGemma3itServer(TestOpenAIVisionServer):
 class TestGemma3nServer(TestOpenAIVisionServer):
     @classmethod
     def setUpClass(cls):
-        cls.model = "google/gemma-3n-E2B-it"
+        cls.model = "google/gemma-3n-E4B-it"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
@@ -166,7 +179,7 @@ class TestGemma3nServer(TestOpenAIVisionServer):
                 "--mem-fraction-static",
                 "0.70",
                 "--cuda-graph-max-bs",
-                "1",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -193,6 +206,8 @@ class TestKimiVLServer(TestOpenAIVisionServer):
                 "4096",
                 "--dtype",
                 "bfloat16",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -233,6 +248,8 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
                 "--lora-paths",
                 f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora",
                 f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
@@ -277,10 +294,13 @@ class TestVILAServer(TestOpenAIVisionServer):
                 "--trust-remote-code",
                 "--context-length=65536",
                 f"--revision={cls.revision}",
+                "--cuda-graph-max-bs",
+                "4",
             ],
         )
         cls.base_url += "/v1"
 
 
 if __name__ == "__main__":
+    del TestOpenAIVisionServer
     unittest.main()
diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py
index 2f7e404cb..d652c6be1 100644
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -71,7 +71,7 @@ class TestOpenAIVisionServer(CustomTestCase):
                         },
                         {
                             "type": "text",
-                            "text": "Describe this image in a very short sentence.",
+                            "text": "Describe this image in a sentence.",
                         },
                     ],
                 },
@@ -119,7 +119,7 @@ class TestOpenAIVisionServer(CustomTestCase):
                         },
                         {
                             "type": "text",
-                            "text": "Describe this image in a very short sentence.",
+                            "text": "Describe this image in a sentence.",
                         },
                     ],
                 },
@@ -455,7 +455,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         content.append(
             {
                 "type": "text",
-                "text": "Describe this image in a very short sentence.",
+                "text": "Describe this image in a sentence.",
             }
         )
 
@@ -528,14 +528,20 @@ class TestOpenAIVisionServer(CustomTestCase):
         # a fragment of Trump's speech
         audio_response = self.get_audio_response(
             AUDIO_TRUMP_SPEECH_URL,
-            "I have an audio sample. Please repeat the person's words",
+            "Listen to this audio and write down the audio transcription in English.",
             category="speech",
         )
-        assert "thank you" in audio_response
-        assert "it's a privilege to be here" in audio_response
-        assert "leader" in audio_response
-        assert "science" in audio_response
-        assert "art" in audio_response
+        check_list = [
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in audio_response
+            ), f"audio_response: ｜{audio_response}｜ should contain ｜{check_word}｜"
 
     def _test_audio_ambient_completion(self):
         # bird song