diff --git a/docs/supported_models/vision_language_models.md b/docs/supported_models/vision_language_models.md
new file mode 100644
index 000000000..be0baba78
--- /dev/null
+++ b/docs/supported_models/vision_language_models.md
@@ -0,0 +1,60 @@
+# Vision Language Models
+
+These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with visual encoders and require a specific chat template for handling vision prompts.
+
+## Example Launch Command
+
+```shell
+# Replace the model path with any supported HuggingFace identifier or local path.
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+## Support Matrix
+
+| Model Family (Variants) | Example HuggingFace Identifier | Chat Template | Description |
+|--------------------------------|--------------------------------------------------|----------------------|----------------------------------------------------------------------------------------|
+| **Qwen-VL** (Qwen2 series) | `Qwen/Qwen2.5-VL-7B-Instruct` | `qwen2-vl` | Alibaba’s vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content. |
+| **DeepSeek-VL2** | `deepseek-ai/deepseek-vl2` | `deepseek-vl2` | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs. |
+| **Janus-Pro** (1B, 7B) | `deepseek-ai/Janus-Pro-7B` | `janus-pro` | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture with separate visual encoding paths, enhancing performance in both tasks. |
+| **MiniCPM-V / MiniCPM-o** | `openbmb/MiniCPM-V-2_6` | `minicpmv` | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices. |
+| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks. |
+| **Pixtral** (12B, 124B) | `mistral-community/pixtral-12b` | `mistral` | Pixtral is a vision-language model from Mistral AI that can process both text and images. |
+| **LLaVA** (v1.5 & v1.6) | *e.g.* `liuhaotian/llava-v1.5-13b` | `vicuna_v1.1` | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g., LLaMA2 13B) for following multimodal instruction prompts. |
+| **LLaVA-NeXT** (8B, 72B) | `lmms-lab/llava-next-72b` | `chatml-llava` | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks. |
+| **LLaVA-OneVision** | `lmms-lab/llava-onevision-qwen2-7b-ov` | `chatml-llava` | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format. |
+| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3’s larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
+| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
+| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral Small 3.1 is a multimodal model that can generate text from text or image inputs. It also supports tool calling and structured output. |
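+
+## Example Query
+
+Once the server is running, you can query it through the OpenAI-compatible vision API. Below is a minimal sketch using the `openai` Python client; the image URL and the question are placeholders, and `max_tokens` is an arbitrary choice:
+
+```python
+from openai import OpenAI
+
+# Point the client at the server launched above; no real API key is required.
+client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")
+
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-11B-Vision-Instruct",  # the served model path
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this image?"},
+                {
+                    "type": "image_url",
+                    # Placeholder image; any reachable URL (or data URI) works.
+                    "image_url": {"url": "https://example.com/image.png"},
+                },
+            ],
+        }
+    ],
+    max_tokens=128,
+)
+print(response.choices[0].message.content)
+```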
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 623d38a47..a3c9e4ed9 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -549,6 +549,7 @@ multimodal_model_archs = [
     "LlavaVidForCausalLM",
     "MiniCPMO",
     "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
diff --git a/python/sglang/srt/managers/multimodal_processors/llava.py b/python/sglang/srt/managers/multimodal_processors/llava.py
index c3190c697..5da52c11e 100644
--- a/python/sglang/srt/managers/multimodal_processors/llava.py
+++ b/python/sglang/srt/managers/multimodal_processors/llava.py
@@ -20,6 +20,7 @@ from sglang.srt.models.llava import (
     LlavaQwenForCausalLM,
 )
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
+from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback

@@ -176,10 +177,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):

 class LlavaMultimodalProcessor(BaseMultimodalProcessor):
     """
-    This is a wrapper class used to identify the multimodal processor for Llava architecture models.
+    This is a wrapper class used to identify the multimodal processor for Llava-architecture vision models.
     """

-    models = [LlavaForConditionalGeneration]
+    models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]

     def _get_sgl_processor_cls(self, model_type: str):
         if hf_name := HF_MAPPING_NAMES.get(model_type):
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index 5077211d4..62ed1d374 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -135,7 +135,6 @@ class LlavaBaseForCausalLM(nn.Module):
         """
         image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
         # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden states.
-
         selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
         if self.vision_feature_select_strategy in ["default", "patch"]:
             selected_image_feature = selected_image_feature[:, 1:]
@@ -146,7 +145,6 @@
             f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
         )
         image_features = self.multi_modal_projector(selected_image_feature)
-
         return image_features

     @torch.no_grad()
@@ -613,6 +611,10 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):

     MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector

+    @property
+    def dtype(self):
+        return self.torch_dtype
+
     def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
         if hasattr(self.vision_tower, "pad_input_ids"):
             return self.vision_tower.pad_input_ids(input_ids, image_inputs)
@@ -672,11 +674,17 @@
         assert hasattr(config, "text_config")
         assert hasattr(config, "vision_config")
         self.config = config
-        self.text_config = config.text_config
-        self.vision_config = config.vision_config
+        self.text_config = self.config.text_config
+        self.vision_config = self.config.vision_config
+        self.torch_dtype = getattr(self.config, "torch_dtype", None)
+
+        if not getattr(self.text_config, "torch_dtype", None):
+            self.text_config.torch_dtype = self.torch_dtype
+        if not getattr(self.vision_config, "torch_dtype", None):
+            self.vision_config.torch_dtype = self.torch_dtype

         if not hasattr(self.config, "vocab_size"):
-            self.config.vocab_size = self.config.text_config.vocab_size
+            self.config.vocab_size = self.text_config.vocab_size
         if not hasattr(self.config, "image_aspect_ratio"):
             self.config.image_aspect_ratio = "anyres"
         if not hasattr(self.config, "image_grid_pinpoints"):
@@ -697,39 +705,39 @@
         if not hasattr(self.config, "projector_hidden_act"):
             self.config.projector_hidden_act = "gelu"

-        self.vision_feature_layer = getattr(config, "vision_feature_layer", -1)
+        self.vision_feature_layer = getattr(self.config, "vision_feature_layer", -1)
         self.vision_feature_select_strategy = getattr(
-            config, "vision_feature_select_strategy", "full"
+            self.config, "vision_feature_select_strategy", "full"
         )
-        self.image_size = self.config.vision_config.image_size
-        self.patch_size = self.config.vision_config.patch_size
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size

-        self.mm_patch_merge_type = config.mm_patch_merge_type
-        self.image_aspect_ratio = config.image_aspect_ratio
-        self.image_grid_pinpoints = config.image_grid_pinpoints
+        self.mm_patch_merge_type = self.config.mm_patch_merge_type
+        self.image_aspect_ratio = self.config.image_aspect_ratio
+        self.image_grid_pinpoints = self.config.image_grid_pinpoints

         self.image_feature_len = int((self.image_size // self.patch_size) ** 2)

         self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)

         language_model_cls = self._get_sgl_model_cls(
-            config.text_config, AutoModelForCausalLM
+            self.text_config, AutoModelForCausalLM
         )
-        vision_model_cls = self._get_sgl_model_cls(config.vision_config, AutoModel)
+        vision_model_cls = self._get_sgl_model_cls(self.vision_config, AutoModel)

         self.language_model = language_model_cls(
-            config.text_config,
+            self.text_config,
             quant_config=quant_config,
             prefix=add_prefix("language_model", prefix),
         )
         self.vision_tower = vision_model_cls(
-            config.vision_config,
+            self.vision_config,
             quant_config=quant_config,
             prefix=add_prefix("vision_tower", prefix),
         )

-        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+        if "unpad" in getattr(self.config, "mm_patch_merge_type", ""):
             self.language_model.model.image_newline = nn.Parameter(
-                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+                torch.empty(self.text_config.hidden_size, dtype=self.torch_dtype)
             )

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
diff --git a/python/sglang/srt/models/mistral.py b/python/sglang/srt/models/mistral.py
index 70d2ca3e0..b0c6beaab 100644
--- a/python/sglang/srt/models/mistral.py
+++ b/python/sglang/srt/models/mistral.py
@@ -13,6 +13,12 @@
 # ==============================================================================
 """Inference-only Mistral model."""

+from typing import List
+
+import torch
+from transformers.models.mistral3.modeling_mistral3 import Mistral3MultiModalProjector
+
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
 from sglang.srt.models.llama import LlamaForCausalLM


@@ -20,4 +26,65 @@ class MistralForCausalLM(LlamaForCausalLM):
     pass


-EntryClass = MistralForCausalLM
+class Mistral3ForConditionalGeneration:
+    MULTIMODAL_PROJECTOR_TYPE = Mistral3MultiModalProjector
+
+    def __init__(self, **kwargs):
+        # Import the inner class lazily to avoid a circular import with llava.py.
+        from sglang.srt.models.llava import LlavaForConditionalGeneration
+
+        # Override the config: Mistral's projector includes a patch merger,
+        # so image borders don't require padding.
+        kwargs["config"].vision_config.pad_image_border = False
+
+        self.inner = LlavaForConditionalGeneration(**kwargs)
+        self.inner.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(
+            kwargs["config"]
+        )
+        self.inner.get_image_feature = self.get_image_feature
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract features from image inputs.
+
+        Args:
+            items: List of MultimodalDataItem objects containing image data.
+                Note that an item may hold a single image or multiple images.
+
+        Returns:
+            torch.Tensor: features from the image inputs, concatenated along dim 0.
+        """
+        features = []
+        for item in items:
+            # Within each item, pixel_values is assumed to be batched.
+            pixel_values, image_sizes = item.pixel_values, item.image_sizes
+            image_outputs = self.vision_tower(
+                pixel_values, image_sizes, output_hidden_states=True
+            )
+            selected_image_feature = image_outputs.hidden_states[
+                self.vision_feature_layer
+            ]
+
+            if self.vision_feature_select_strategy in ["default", "patch"]:
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.vision_feature_select_strategy == "full":
+                pass  # "full" keeps the feature sequence unchanged
+            else:
+                raise ValueError(
+                    f"Unexpected select feature strategy: {self.vision_feature_select_strategy}"
+                )
+            features.append(
+                self.multi_modal_projector(
+                    selected_image_feature.squeeze(0), image_sizes
+                )
+            )
+        return torch.cat(features, dim=0)
+
+    def __getattr__(self, name):
+        # Delegate attributes not defined on the wrapper to the inner model.
+        return getattr(self.inner, name)
+
+    def __call__(self, *args, **kwargs):
+        return self.inner(*args, **kwargs)
+
+
+EntryClass = [MistralForCausalLM, Mistral3ForConditionalGeneration]
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index be55f4bcd..895e19f5a 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -664,6 +664,28 @@ class TestPixtralServer(TestOpenAIVisionServer):
         pass


+class TestMistral3_1Server(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.8",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def test_video_chat_completion(self):
+        pass
+
+
 class TestDeepseekVL2Server(TestOpenAIVisionServer):
     @classmethod
     def setUpClass(cls):