diff --git a/python/sglang/srt/configs/deepseekvl2.py b/python/sglang/srt/configs/deepseekvl2.py
index a04549224..9d2f3de59 100644
--- a/python/sglang/srt/configs/deepseekvl2.py
+++ b/python/sglang/srt/configs/deepseekvl2.py
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
         tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
             messages,
             pil_images[image_index : image_index + image_token_cnt],
-            bos=False,
+            bos=True,
             eos=True,
             cropping=len(pil_images) <= 2,
             max_req_input_len=max_req_input_len,
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index f066c5b1b..13dacee37 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -162,7 +162,9 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py
index 8e9129224..3fd45b467 100644
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -463,6 +463,30 @@ def generate_embedding_convs(
     return convs
 
 
+# Models for which the system automatically prepends modality tokens when the
+# media inputs outnumber the tokens in the prompt (e.g. 3 images but only 2 tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+
+    # Any placeholder already present in the text prompt is left as is
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+
+    # NOTE: For now we always add the missing modality_token at the front of
+    # the prompt. This may become customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -520,6 +544,12 @@ def generate_chat_conv(
                 if conv.name != "qwen2-vl"
                 else conv.image_token
             )
+            add_token_as_needed: bool = (
+                conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+            )
+            if add_token_as_needed:
+                image_token = ""
+
             audio_token = conv.audio_token
             for content in message.content:
                 if content.type == "text":
@@ -533,7 +563,10 @@ def generate_chat_conv(
                 elif content.type == "audio_url":
                     real_content += audio_token
                     conv.append_audio(content.audio_url.url)
-
+            if add_token_as_needed:
+                real_content = _get_full_multimodal_text_prompt(
+                    conv.image_token, num_image_url, real_content
+                )
             conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
             parsed_content = ""
diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py
index 1664f17ff..95bfe001a 100644
--- a/python/sglang/srt/models/deepseek.py
+++ b/python/sglang/srt/models/deepseek.py
@@ -382,8 +382,14 @@ class DeepseekModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -416,14 +422,18 @@ class DeepseekForCausalLM(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config)
 
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
     @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch)
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.logits_processor(
             input_ids, hidden_states, self.lm_head, forward_batch
         )
diff --git a/python/sglang/srt/models/deepseek_vl2.py b/python/sglang/srt/models/deepseek_vl2.py
index c67d8da02..c0af78aca 100644
--- a/python/sglang/srt/models/deepseek_vl2.py
+++ b/python/sglang/srt/models/deepseek_vl2.py
@@ -18,6 +18,7 @@ from sglang.srt.managers.mm_utils import (
 from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek import DeepseekForCausalLM
 from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
 
 
@@ -189,7 +190,11 @@ class DeepseekVL2ForCausalLM(nn.Module):
 
         # ----------- language model ------------
         language_config = config.language_config
-        self.language_model = DeepseekV2ForCausalLM(language_config)
+        if language_config.use_mla:
+            self.language_model = DeepseekV2ForCausalLM(language_config)
+        else:
+            # deepseek-vl2-tiny does not support MLA
+            self.language_model = DeepseekForCausalLM(language_config)
 
     def _init_vision_module(
         self, vision_config, quant_config: Optional[QuantizationConfig]
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index efed5fdb9..8f5eed220 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -654,6 +654,30 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer):
         pass
 
 
+class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-vl2-tiny"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--chat-template",
+                "deepseek-vl2",
+                "--context-length",
+                "4096",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def test_video_chat_completion(self):
+        pass
+
+
 class TestJanusProServer(TestOpenAIVisionServer):
     @classmethod
     def setUpClass(cls):
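
Review note: the sketch below isolates the placeholder-supplement helper added to conversation.py. The function body is taken from the patch; the "<image>" token and the sample prompt are illustrative stand-ins, not values the patch itself defines.

    def _get_full_multimodal_text_prompt(
        modality_token: str, modality_count: int, text_prompt: str
    ) -> str:
        # Placeholders already present in the prompt stay where they are;
        # only the shortfall is prepended at the front.
        left: int = modality_count - text_prompt.count(modality_token)
        if left < 0:
            raise ValueError(
                f"Found more '{modality_token}' placeholders in input prompt than "
                "actual multimodal data items."
            )
        return "\n".join([modality_token] * left + [text_prompt])

    # Three images but a single "<image>" placeholder: two tokens are prepended.
    print(_get_full_multimodal_text_prompt("<image>", 3, "<image>\nDescribe these."))
    # -> "<image>\n<image>\n<image>\nDescribe these."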
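
Relatedly, the model_config.py change leans on getattr's default: a text config that never defines use_mla keeps the existing MLA path, while a config that explicitly sets use_mla=False (per the in-diff comment, deepseek-vl2-tiny) falls through to the non-MLA DeepseekForCausalLM. A minimal sketch of that guard, with SimpleNamespace standing in for the real HF text config:

    from types import SimpleNamespace

    legacy_cfg = SimpleNamespace()             # no use_mla attribute defined
    tiny_cfg = SimpleNamespace(use_mla=False)  # deepseek-vl2-tiny's language config

    print(getattr(legacy_cfg, "use_mla", True))  # True  -> MLA attention arch kept
    print(getattr(tiny_cfg, "use_mla", True))    # False -> DeepseekForCausalLM path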