model(vlm): mistral 3.1 (#5099)
Co-authored-by: KivenChen <sleigh-queue-0y@icloud.com>
This commit is contained in:
29
docs/supported_models/vision_language_models.md
Normal file
29
docs/supported_models/vision_language_models.md
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Vision Language Models
|
||||||
|
|
||||||
|
These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with visual encoders and require a specific chat template for handling vision prompts.
|
||||||
|
|
||||||
|
## Example Launch Command
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# --model-path accepts a Hugging Face model ID or a local path
python3 -m sglang.launch_server \
|
||||||
|
--model-path meta-llama/Llama-3.2-11B-Vision-Instruct \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 30000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Support Matrix
|
||||||
|
|
||||||
|
| Model Family (Variants) | Example HuggingFace Identifier | Chat Template | Description |
|
||||||
|
|--------------------------------|--------------------------------------------------|----------------------|----------------------------------------------------------------------------------------|
|
||||||
|
| **Qwen-VL** (Qwen2 series) | `Qwen/Qwen2.5-VL-7B-Instruct` | `qwen2-vl` | Alibaba’s vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content. |
|
||||||
|
| **DeepSeek-VL2** | `deepseek-ai/deepseek-vl2` | `deepseek-vl2` | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs. |
|
||||||
|
| **Janus-Pro** (1B, 7B) | `deepseek-ai/Janus-Pro-7B` | `janus-pro` | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |
|
||||||
|
| **MiniCPM-V / MiniCPM-o** | `openbmb/MiniCPM-V-2_6` | `minicpmv` | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices. |
|
||||||
|
| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks. |
|
||||||
|
| **Pixtral** (12B, 124B) | `mistral-community/pixtral-12b` | `mistral` | Pixtral is a vision-language model from Mistral AI that can process both text and images. |
|
||||||
|
| **LLaVA** (v1.5 & v1.6) | *e.g.* `liuhaotian/llava-v1.5-13b` | `vicuna_v1.1` | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts. |
|
||||||
|
| **LLaVA-NeXT** (8B, 72B) | `lmms-lab/llava-next-72b` | `chatml-llava` | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks. |
|
||||||
|
| **LLaVA-OneVision** | `lmms-lab/llava-onevision-qwen2-7b-ov` | `chatml-llava` | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format. |
|
||||||
|
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3’s larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
|
||||||
|
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
|
||||||
|
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or image inputs. It also supports tool calling and structured output. |
|
||||||
@@ -549,6 +549,7 @@ multimodal_model_archs = [
|
|||||||
"LlavaVidForCausalLM",
|
"LlavaVidForCausalLM",
|
||||||
"MiniCPMO",
|
"MiniCPMO",
|
||||||
"MiniCPMV",
|
"MiniCPMV",
|
||||||
|
"Mistral3ForConditionalGeneration",
|
||||||
"MultiModalityCausalLM",
|
"MultiModalityCausalLM",
|
||||||
"MllamaForConditionalGeneration",
|
"MllamaForConditionalGeneration",
|
||||||
"Qwen2VLForConditionalGeneration",
|
"Qwen2VLForConditionalGeneration",
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from sglang.srt.models.llava import (
|
|||||||
LlavaQwenForCausalLM,
|
LlavaQwenForCausalLM,
|
||||||
)
|
)
|
||||||
from sglang.srt.models.llavavid import LlavaVidForCausalLM
|
from sglang.srt.models.llavavid import LlavaVidForCausalLM
|
||||||
|
from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
|
||||||
from sglang.srt.utils import load_image, logger
|
from sglang.srt.utils import load_image, logger
|
||||||
from sglang.utils import get_exception_traceback
|
from sglang.utils import get_exception_traceback
|
||||||
|
|
||||||
@@ -176,10 +177,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
|
|||||||
|
|
||||||
class LlavaMultimodalProcessor(BaseMultimodalProcessor):
|
class LlavaMultimodalProcessor(BaseMultimodalProcessor):
|
||||||
"""
|
"""
|
||||||
This is a wrapper class used to identify the multimodal processor for Llava architecture models.
|
This is a wrapper class used to identify the multimodal processor for Llava architectures' vision model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
models = [LlavaForConditionalGeneration]
|
models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]
|
||||||
|
|
||||||
def _get_sgl_processor_cls(self, model_type: str):
|
def _get_sgl_processor_cls(self, model_type: str):
|
||||||
if hf_name := HF_MAPPING_NAMES.get(model_type):
|
if hf_name := HF_MAPPING_NAMES.get(model_type):
|
||||||
|
|||||||
@@ -135,7 +135,6 @@ class LlavaBaseForCausalLM(nn.Module):
|
|||||||
"""
|
"""
|
||||||
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||||
# NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
|
# NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
|
||||||
|
|
||||||
selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
|
selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
|
||||||
if self.vision_feature_select_strategy in ["default", "patch"]:
|
if self.vision_feature_select_strategy in ["default", "patch"]:
|
||||||
selected_image_feature = selected_image_feature[:, 1:]
|
selected_image_feature = selected_image_feature[:, 1:]
|
||||||
@@ -146,7 +145,6 @@ class LlavaBaseForCausalLM(nn.Module):
|
|||||||
f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
|
f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
|
||||||
)
|
)
|
||||||
image_features = self.multi_modal_projector(selected_image_feature)
|
image_features = self.multi_modal_projector(selected_image_feature)
|
||||||
|
|
||||||
return image_features
|
return image_features
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
@@ -613,6 +611,10 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
|
|||||||
|
|
||||||
MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector
|
MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector
|
||||||
|
|
||||||
|
@property
|
||||||
|
def dtype(self):
|
||||||
|
return self.torch_dtype
|
||||||
|
|
||||||
def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
|
def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
|
||||||
if hasattr(self.vision_tower, "pad_input_ids"):
|
if hasattr(self.vision_tower, "pad_input_ids"):
|
||||||
return self.vision_tower.pad_input_ids(input_ids, image_inputs)
|
return self.vision_tower.pad_input_ids(input_ids, image_inputs)
|
||||||
@@ -672,11 +674,17 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
|
|||||||
assert hasattr(config, "text_config")
|
assert hasattr(config, "text_config")
|
||||||
assert hasattr(config, "vision_config")
|
assert hasattr(config, "vision_config")
|
||||||
self.config = config
|
self.config = config
|
||||||
self.text_config = config.text_config
|
self.text_config = self.config.text_config
|
||||||
self.vision_config = config.vision_config
|
self.vision_config = self.config.vision_config
|
||||||
|
self.torch_dtype = getattr(self.config, "torch_dtype")
|
||||||
|
|
||||||
|
if not getattr(self.text_config, "torch_dtype"):
|
||||||
|
self.text_config.torch_dtype = self.torch_dtype
|
||||||
|
if not getattr(self.vision_config, "torch_dtype"):
|
||||||
|
self.vision_config.torch_dtype = self.torch_dtype
|
||||||
|
|
||||||
if not hasattr(self.config, "vocab_size"):
|
if not hasattr(self.config, "vocab_size"):
|
||||||
self.config.vocab_size = self.config.text_config.vocab_size
|
self.config.vocab_size = self.text_config.vocab_size
|
||||||
if not hasattr(self.config, "image_aspect_ratio"):
|
if not hasattr(self.config, "image_aspect_ratio"):
|
||||||
self.config.image_aspect_ratio = "anyres"
|
self.config.image_aspect_ratio = "anyres"
|
||||||
if not hasattr(self.config, "image_grid_pinpoints"):
|
if not hasattr(self.config, "image_grid_pinpoints"):
|
||||||
@@ -697,39 +705,39 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
|
|||||||
if not hasattr(self.config, "projector_hidden_act"):
|
if not hasattr(self.config, "projector_hidden_act"):
|
||||||
self.config.projector_hidden_act = "gelu"
|
self.config.projector_hidden_act = "gelu"
|
||||||
|
|
||||||
self.vision_feature_layer = getattr(config, "vision_feature_layer", -1)
|
self.vision_feature_layer = getattr(self.config, "vision_feature_layer", -1)
|
||||||
self.vision_feature_select_strategy = getattr(
|
self.vision_feature_select_strategy = getattr(
|
||||||
config, "vision_feature_select_strategy", "full"
|
self.config, "vision_feature_select_strategy", "full"
|
||||||
)
|
)
|
||||||
self.image_size = self.config.vision_config.image_size
|
self.image_size = self.vision_config.image_size
|
||||||
self.patch_size = self.config.vision_config.patch_size
|
self.patch_size = self.vision_config.patch_size
|
||||||
|
|
||||||
self.mm_patch_merge_type = config.mm_patch_merge_type
|
self.mm_patch_merge_type = self.config.mm_patch_merge_type
|
||||||
self.image_aspect_ratio = config.image_aspect_ratio
|
self.image_aspect_ratio = self.config.image_aspect_ratio
|
||||||
self.image_grid_pinpoints = config.image_grid_pinpoints
|
self.image_grid_pinpoints = self.config.image_grid_pinpoints
|
||||||
|
|
||||||
self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
|
self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
|
||||||
|
|
||||||
self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)
|
self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)
|
||||||
|
|
||||||
language_model_cls = self._get_sgl_model_cls(
|
language_model_cls = self._get_sgl_model_cls(
|
||||||
config.text_config, AutoModelForCausalLM
|
self.text_config, AutoModelForCausalLM
|
||||||
)
|
)
|
||||||
vision_model_cls = self._get_sgl_model_cls(config.vision_config, AutoModel)
|
vision_model_cls = self._get_sgl_model_cls(self.vision_config, AutoModel)
|
||||||
self.language_model = language_model_cls(
|
self.language_model = language_model_cls(
|
||||||
config.text_config,
|
self.text_config,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
prefix=add_prefix("language_model", prefix),
|
prefix=add_prefix("language_model", prefix),
|
||||||
)
|
)
|
||||||
self.vision_tower = vision_model_cls(
|
self.vision_tower = vision_model_cls(
|
||||||
config.vision_config,
|
self.vision_config,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
prefix=add_prefix("vision_tower", prefix),
|
prefix=add_prefix("vision_tower", prefix),
|
||||||
)
|
)
|
||||||
|
|
||||||
if "unpad" in getattr(config, "mm_patch_merge_type", ""):
|
if "unpad" in getattr(self.config, "mm_patch_merge_type", ""):
|
||||||
self.language_model.model.image_newline = nn.Parameter(
|
self.language_model.model.image_newline = nn.Parameter(
|
||||||
torch.empty(config.text_config.hidden_size, dtype=torch.float16)
|
torch.empty(self.text_config.hidden_size, dtype=self.torch_dtype)
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
|
def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
|
||||||
|
|||||||
@@ -13,6 +13,12 @@
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
"""Inference-only Mistral model."""
|
"""Inference-only Mistral model."""
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers.models.mistral3.modeling_mistral3 import Mistral3MultiModalProjector
|
||||||
|
|
||||||
|
from sglang.srt.managers.schedule_batch import MultimodalDataItem
|
||||||
from sglang.srt.models.llama import LlamaForCausalLM
|
from sglang.srt.models.llama import LlamaForCausalLM
|
||||||
|
|
||||||
|
|
||||||
@@ -20,4 +26,68 @@ class MistralForCausalLM(LlamaForCausalLM):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
EntryClass = MistralForCausalLM
|
class Mistral3ForConditionalGeneration:
    """Mistral 3 vision-language model implemented as a thin proxy over Llava.

    The instance builds a ``LlavaForConditionalGeneration`` (stored as
    ``self.inner``), swaps its projector for ``MULTIMODAL_PROJECTOR_TYPE`` and
    replaces its ``get_image_feature`` with the implementation defined here.
    Every attribute that is not defined on this class is forwarded to
    ``self.inner``, and calling the instance calls ``self.inner``.
    """

    MULTIMODAL_PROJECTOR_TYPE = Mistral3MultiModalProjector

    def __init__(self, **kwargs):
        """Build the wrapped Llava model.

        Args:
            **kwargs: Forwarded unchanged to ``LlavaForConditionalGeneration``.
                Must contain ``config`` (passed by keyword); its
                ``vision_config.pad_image_border`` is overwritten in place.
        """
        # Lazy import of the inner class to bypass a circular import.
        from sglang.srt.models.llava import LlavaForConditionalGeneration

        config = kwargs["config"]

        # override config: mistral's projector adds patchmerger that doesn't require padding
        config.vision_config.pad_image_border = False

        self.inner = LlavaForConditionalGeneration(**kwargs)
        self.inner.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)
        # Route the inner model's feature extraction through this class.
        self.inner.get_image_feature = self.get_image_feature

    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
        """Extract features from image inputs.

        Args:
            items: List of MultimodalDataItem objects containing image data
                Note that an item can be either "image" or "multi-images"

        Returns:
            torch.Tensor: features from image inputs, concatenated

        Raises:
            ValueError: if ``vision_feature_select_strategy`` is not one of
                "default", "patch" or "full".
        """
        features = []
        for item in items:
            # in each item, we assume pixel_values is always batched
            pixel_values, image_sizes = item.pixel_values, item.image_sizes
            # vision_tower / vision_feature_* / multi_modal_projector are not
            # defined on this class; they resolve on self.inner via __getattr__.
            image_outputs = self.vision_tower(
                pixel_values, image_sizes, output_hidden_states=True
            )
            selected_image_feature = image_outputs.hidden_states[
                self.vision_feature_layer
            ]

            strategy = self.vision_feature_select_strategy
            if strategy in ("default", "patch"):
                # Drop the first position along dim 1
                # (presumably a class token -- TODO confirm).
                selected_image_feature = selected_image_feature[:, 1:]
            elif strategy != "full":
                raise ValueError(f"Unexpected select feature: {strategy}")
            # "full" keeps the hidden state untouched.

            features.append(
                self.multi_modal_projector(
                    selected_image_feature.squeeze(0), image_sizes
                )
            )
        return torch.cat(features, dim=0)

    def __getattr__(self, name):
        """Forward lookups that fail on the wrapper to the inner model.

        ``__getattr__`` only runs after normal attribute lookup has failed, so
        reaching it with ``name == "inner"`` means ``inner`` has not been set
        (e.g. ``__init__`` raised, or the object was created via ``__new__``
        during copy/unpickle). Forwarding in that state would look up
        ``self.inner`` again and recurse without bound, so raise instead.
        """
        if name == "inner":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self.inner, name)

    def __hasattr__(self, name):
        """Report whether the inner model has ``name``.

        NOTE: Python has no ``__hasattr__`` hook; the built-in ``hasattr()``
        goes through ``__getattr__`` above. This method is kept only so that
        any explicit caller keeps working.
        """
        return hasattr(self.inner, name)

    def __call__(self, *args, **kwargs):
        """Invoke the inner model with the given arguments."""
        return self.inner(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
EntryClass = [MistralForCausalLM, Mistral3ForConditionalGeneration]
|
||||||
|
|||||||
@@ -664,6 +664,28 @@ class TestPixtralServer(TestOpenAIVisionServer):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestMistral3_1Server(TestOpenAIVisionServer):
    """Runs the OpenAI-compatible vision server tests against Mistral Small 3.1."""

    @classmethod
    def setUpClass(cls):
        """Launch one server process for the whole class and target its /v1 API."""
        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Extra command-line flags handed to the server launcher.
        launch_flags = ["--trust-remote-code", "--mem-fraction-static", "0.8"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

        # The server is launched on the bare URL; the suffix is appended afterwards.
        cls.base_url = cls.base_url + "/v1"

    def test_video_chat_completion(self):
        # Intentionally a no-op (presumably disables a base-class video test
        # for this model -- TODO confirm against TestOpenAIVisionServer).
        pass
|
||||||
|
|
||||||
|
|
||||||
class TestDeepseekVL2Server(TestOpenAIVisionServer):
|
class TestDeepseekVL2Server(TestOpenAIVisionServer):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
|
|||||||
Reference in New Issue
Block a user