diff --git a/docs/supported_models/multimodal_language_models.md b/docs/supported_models/multimodal_language_models.md
index a691c0967..665d8de7e 100644
--- a/docs/supported_models/multimodal_language_models.md
+++ b/docs/supported_models/multimodal_language_models.md
@@ -38,3 +38,4 @@ in the GitHub search bar.
 | **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
 | **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
 | **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. Currently, it supports only text and vision modalities in SGLang. |
+| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py
index c2608f1f1..facd82637 100644
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -921,6 +921,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="mimo-vl",
+        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 
 register_conv_template(
     Conversation(
@@ -1049,3 +1062,9 @@ def match_phi_4_mm(model_path: str):
 def match_vila(model_path: str):
     if re.search(r"vila", model_path, re.IGNORECASE):
         return "chatml"
+
+
+@register_conv_template_matching_function
+def match_mimo_vl(model_path: str):
+    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
+        return "mimo-vl"
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
index a4a2e770d..5c30eec1c 100644
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -185,5 +185,25 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
         self._test_audio_ambient_completion()
 
 
+class TestMimoVLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "XiaomiMiMo/MiMo-VL-7B-RL"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.6",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
 if __name__ == "__main__":
     unittest.main()