From 7e831efee8c0ab00f1c5d77f8293dee0602fdbe9 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com> Date: Thu, 31 Jul 2025 21:49:45 -0700 Subject: [PATCH] Fix chat template handling for OpenAI serving (#8635) Signed-off-by: Xinyuan Tong Signed-off-by: Xinyuan Tong --- python/sglang/srt/conversation.py | 112 ------------------ .../sglang/srt/managers/template_manager.py | 47 ++++---- .../sglang/srt/managers/tokenizer_manager.py | 4 +- .../srt/multimodal/processors/gemma3n.py | 7 -- test/srt/test_vision_openai_server_a.py | 19 ++- test/srt/test_vision_openai_server_b.py | 26 +++- test/srt/test_vision_openai_server_common.py | 24 ++-- 7 files changed, 83 insertions(+), 156 deletions(-) diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index c34527591..1d1340a1d 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -954,20 +954,6 @@ register_conv_template( ) ) -register_conv_template( - Conversation( - name="mimo-vl", - system_message="You are MiMo, an AI assistant developed by Xiaomi.", - system_template="<|im_start|>system\n{system_message}", - roles=("<|im_start|>user", "<|im_start|>assistant"), - sep="<|im_end|>\n", - sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, - stop_str=["<|im_end|>"], - image_token="<|vision_start|><|image_pad|><|vision_end|>", - ) -) - - register_conv_template( Conversation( name="qwen2-audio", @@ -981,51 +967,11 @@ register_conv_template( ) ) -register_conv_template( - Conversation( - name="llama_4_vision", - system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.", - system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>", - roles=("user", "assistant"), - sep_style=SeparatorStyle.LLAMA4, - sep="", - stop_str="<|eot|>", - image_token="<|image|>", - ) -) - -register_conv_template( - Conversation( - name="step3-vl", - system_message="<|begin▁of▁sentence|>You are a helpful assistant", - system_template="{system_message}\n", - roles=( - "<|BOT|>user\n", - "<|BOT|>assistant\n\n", - ), - sep="<|EOT|>", - sep_style=SeparatorStyle.NO_COLON_SINGLE, - stop_str="<|EOT|>", - image_token="", - # add_bos=True, - ) -) - @register_conv_template_matching_function def match_internvl(model_path: str): if re.search(r"internvl", model_path, re.IGNORECASE): return "internvl-2-5" - if re.search(r"intern.*s1", model_path, re.IGNORECASE): - return "interns1" - - -@register_conv_template_matching_function -def match_llama_vision(model_path: str): - if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE): - return "llama_3_vision" - if re.search(r"llama.*4.*", model_path, re.IGNORECASE): - return "llama_4_vision" @register_conv_template_matching_function @@ -1040,22 +986,6 @@ def match_vicuna(model_path: str): return "vicuna_v1.1" -@register_conv_template_matching_function -def match_llama2_chat(model_path: str): - if re.search( - r"llama-2.*chat|codellama.*instruct", - model_path, - re.IGNORECASE, - ): - return "llama-2" - - -@register_conv_template_matching_function -def match_mistral(model_path: str): - if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE): - return "mistral" - - @register_conv_template_matching_function def match_deepseek_vl(model_path: str): if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE): @@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str): @register_conv_template_matching_function def match_qwen_chat_ml(model_path: str): - if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE): - return "gme-qwen2-vl" - if re.search(r"qwen.*vl", model_path, re.IGNORECASE): - return "qwen2-vl" - if re.search(r"qwen.*audio", model_path, re.IGNORECASE): - return "qwen2-audio" if re.search( r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2", model_path, @@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str): return "chatml-llava" -@register_conv_template_matching_function -def match_gemma3_instruct(model_path: str): - if re.search(r"gemma-3.*it", model_path, re.IGNORECASE): - return "gemma-it" - - @register_conv_template_matching_function def match_openbmb_minicpm(model_path: str): if re.search(r"minicpm-v", model_path, re.IGNORECASE): @@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str): return "minicpmo" -@register_conv_template_matching_function -def match_moonshot_kimivl(model_path: str): - if re.search(r"kimi.*vl", model_path, re.IGNORECASE): - return "kimi-vl" - - -@register_conv_template_matching_function -def match_devstral(model_path: str): - if re.search(r"devstral", model_path, re.IGNORECASE): - return "devstral" - - @register_conv_template_matching_function def match_phi_4_mm(model_path: str): if "phi-4-multimodal" in model_path.lower(): return "phi-4-mm" - - -@register_conv_template_matching_function -def match_vila(model_path: str): - if re.search(r"vila", model_path, re.IGNORECASE): - return "chatml" - - -@register_conv_template_matching_function -def match_mimo_vl(model_path: str): - if re.search(r"mimo.*vl", model_path, re.IGNORECASE): - return "mimo-vl" - - -# @register_conv_template_matching_function -# def match_step3(model_path: str): -# if re.search(r"step3", model_path, re.IGNORECASE): -# return "step3-vl" diff --git a/python/sglang/srt/managers/template_manager.py b/python/sglang/srt/managers/template_manager.py index e340f65f0..c98e3d14a 100644 --- a/python/sglang/srt/managers/template_manager.py +++ b/python/sglang/srt/managers/template_manager.py @@ -84,26 +84,27 @@ class TemplateManager: if chat_template_arg: self._load_explicit_chat_template(tokenizer_manager, chat_template_arg) else: - # Try HuggingFace template first - hf_template = self._resolve_hf_chat_template(tokenizer_manager) - if hf_template: - self._jinja_template_content_format = ( - detect_jinja_template_content_format(hf_template) - ) - logger.info( - f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}" - ) - return - - # Fallback to SGLang template guessing + # Guess chat template from model path self.guess_chat_template_from_model_path(model_path) - # Set default format if no template was found + # If no pre-defined template was found, fallback to HuggingFace template if self._chat_template_name is None: - self._jinja_template_content_format = "string" - logger.info( - "No chat template found, defaulting to 'string' content format" - ) + # Try HuggingFace template first + hf_template = self._resolve_hf_chat_template(tokenizer_manager) + if hf_template: + # override the chat template + tokenizer_manager.tokenizer.chat_template = hf_template + self._jinja_template_content_format = ( + detect_jinja_template_content_format(hf_template) + ) + logger.info( + f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}" + ) + return + + # Default to string content format if no template was found + self._jinja_template_content_format = "string" + logger.info("No chat template found, defaulting to 'string' content format") def _load_explicit_chat_template( self, tokenizer_manager, chat_template_arg: str @@ -257,13 +258,15 @@ class TemplateManager: Returns the chat template string if found, None otherwise. """ - tokenizer = tokenizer_manager.tokenizer - - # Try to get AutoTokenizer chat template try: - return tokenizer.get_chat_template() + if processor := tokenizer_manager.processor: + if hasattr(processor, "chat_template") and processor.chat_template: + return processor.chat_template + if tokenizer := tokenizer_manager.tokenizer: + if hasattr(tokenizer, "chat_template") and tokenizer.chat_template: + return tokenizer.chat_template except Exception as e: - logger.debug(f"Error getting chat template via get_chat_template(): {e}") + logger.debug(f"Error getting chat template: {e}") logger.debug("No HuggingFace chat template found") return None diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 9250c6866..46fd967e5 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -225,10 +225,10 @@ class TokenizerManager: self.tokenizer = get_tokenizer_from_processor(self.processor) os.environ["TOKENIZERS_PARALLELISM"] = "false" else: - self.mm_processor = None + self.mm_processor = self.processor = None if server_args.skip_tokenizer_init: - self.tokenizer = self.processor = None + self.tokenizer = None else: self.tokenizer = get_tokenizer( server_args.tokenizer_path, diff --git a/python/sglang/srt/multimodal/processors/gemma3n.py b/python/sglang/srt/multimodal/processors/gemma3n.py index 4bfbcaffa..9ea8b8be3 100644 --- a/python/sglang/srt/multimodal/processors/gemma3n.py +++ b/python/sglang/srt/multimodal/processors/gemma3n.py @@ -12,7 +12,6 @@ # limitations under the License. # ============================================================================== -import re from typing import Dict, List, Optional, Union from sglang.srt.managers.multimodal_processor import ( @@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor): self.mm_tokens = MultimodalSpecialTokens( image_token="", image_token_id=hf_config.image_token_id, - image_token_regex=re.compile( - r"(?:(?:)*)?" - ), audio_token="", audio_token_id=hf_config.audio_token_id, - audio_token_regex=re.compile( - r"(?:(?:)*)?" - ), ).build(_processor) async def process_mm_data_async( diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py index 4c41e2fec..9d69b918c 100644 --- a/test/srt/test_vision_openai_server_a.py +++ b/test/srt/test_vision_openai_server_a.py @@ -31,6 +31,8 @@ class TestQwen2VLServer(TestOpenAIVisionServer): other_args=[ "--mem-fraction-static", "0.35", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -53,6 +55,8 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer): other_args=[ "--mem-fraction-static", "0.35", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -76,6 +80,8 @@ class TestVLMContextLengthIssue(CustomTestCase): "--context-length", "300", "--mem-fraction-static=0.75", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -149,6 +155,8 @@ class TestMinicpmvServer(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.35", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -164,7 +172,11 @@ class TestInternVL2_5Server(TestOpenAIVisionServer): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--trust-remote-code"], + other_args=[ + "--trust-remote-code", + "--cuda-graph-max-bs", + "4", + ], ) cls.base_url += "/v1" @@ -183,6 +195,8 @@ class TestMinicpmoServer(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.65", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -207,10 +221,13 @@ class TestMimoVLServer(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.6", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" if __name__ == "__main__": + del TestOpenAIVisionServer unittest.main() diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py index 533312aaf..81dc2c6b7 100644 --- a/test/srt/test_vision_openai_server_b.py +++ b/test/srt/test_vision_openai_server_b.py @@ -23,6 +23,8 @@ class TestPixtralServer(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.70", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -45,6 +47,8 @@ class TestMistral3_1Server(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.75", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -67,7 +71,8 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer): "--trust-remote-code", "--context-length", "4096", - "--disable-cuda-graph", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -90,6 +95,8 @@ class TestJanusProServer(TestOpenAIVisionServer): "--trust-remote-code", "--mem-fraction-static", "0.35", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -120,6 +127,10 @@ class TestJanusProServer(TestOpenAIVisionServer): # "0.8", # "--tp-size=8", # "--context-length=8192", +# "--mm-attention-backend", +# "fa3", +# "--cuda-graph-max-bs", +# "4", # ], # ) # cls.base_url += "/v1" @@ -143,6 +154,8 @@ class TestGemma3itServer(TestOpenAIVisionServer): "--mem-fraction-static", "0.70", "--enable-multimodal", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -154,7 +167,7 @@ class TestGemma3itServer(TestOpenAIVisionServer): class TestGemma3nServer(TestOpenAIVisionServer): @classmethod def setUpClass(cls): - cls.model = "google/gemma-3n-E2B-it" + cls.model = "google/gemma-3n-E4B-it" cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( @@ -166,7 +179,7 @@ class TestGemma3nServer(TestOpenAIVisionServer): "--mem-fraction-static", "0.70", "--cuda-graph-max-bs", - "1", + "4", ], ) cls.base_url += "/v1" @@ -193,6 +206,8 @@ class TestKimiVLServer(TestOpenAIVisionServer): "4096", "--dtype", "bfloat16", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -233,6 +248,8 @@ class TestPhi4MMServer(TestOpenAIVisionServer): "--lora-paths", f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora", f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" @@ -277,10 +294,13 @@ class TestVILAServer(TestOpenAIVisionServer): "--trust-remote-code", "--context-length=65536", f"--revision={cls.revision}", + "--cuda-graph-max-bs", + "4", ], ) cls.base_url += "/v1" if __name__ == "__main__": + del TestOpenAIVisionServer unittest.main() diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index 2f7e404cb..d652c6be1 100644 --- a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -71,7 +71,7 @@ class TestOpenAIVisionServer(CustomTestCase): }, { "type": "text", - "text": "Describe this image in a very short sentence.", + "text": "Describe this image in a sentence.", }, ], }, @@ -119,7 +119,7 @@ class TestOpenAIVisionServer(CustomTestCase): }, { "type": "text", - "text": "Describe this image in a very short sentence.", + "text": "Describe this image in a sentence.", }, ], }, @@ -455,7 +455,7 @@ class TestOpenAIVisionServer(CustomTestCase): content.append( { "type": "text", - "text": "Describe this image in a very short sentence.", + "text": "Describe this image in a sentence.", } ) @@ -528,14 +528,20 @@ class TestOpenAIVisionServer(CustomTestCase): # a fragment of Trump's speech audio_response = self.get_audio_response( AUDIO_TRUMP_SPEECH_URL, - "I have an audio sample. Please repeat the person's words", + "Listen to this audio and write down the audio transcription in English.", category="speech", ) - assert "thank you" in audio_response - assert "it's a privilege to be here" in audio_response - assert "leader" in audio_response - assert "science" in audio_response - assert "art" in audio_response + check_list = [ + "thank you", + "it's a privilege to be here", + "leader", + "science", + "art", + ] + for check_word in check_list: + assert ( + check_word in audio_response + ), f"audio_response: |{audio_response}| should contain |{check_word}|" def _test_audio_ambient_completion(self): # bird song