[Fix]: support deepseek-vl2-tiny model (#5552)
Co-authored-by: bppps <zouyu.zzx@alibaba-inc.com>
This commit is contained in:
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
|
|||||||
tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
|
tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
|
||||||
messages,
|
messages,
|
||||||
pil_images[image_index : image_index + image_token_cnt],
|
pil_images[image_index : image_index + image_token_cnt],
|
||||||
bos=False,
|
bos=True,
|
||||||
eos=True,
|
eos=True,
|
||||||
cropping=len(pil_images) <= 2,
|
cropping=len(pil_images) <= 2,
|
||||||
max_req_input_len=max_req_input_len,
|
max_req_input_len=max_req_input_len,
|
||||||
|
|||||||
@@ -162,7 +162,9 @@ class ModelConfig:
|
|||||||
self.attention_arch = AttentionArch.MLA
|
self.attention_arch = AttentionArch.MLA
|
||||||
self.kv_lora_rank = self.hf_config.kv_lora_rank
|
self.kv_lora_rank = self.hf_config.kv_lora_rank
|
||||||
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
||||||
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
|
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
|
||||||
|
self.hf_text_config, "use_mla", True
|
||||||
|
):
|
||||||
self.head_dim = 256
|
self.head_dim = 256
|
||||||
self.attention_arch = AttentionArch.MLA
|
self.attention_arch = AttentionArch.MLA
|
||||||
self.kv_lora_rank = self.hf_text_config.kv_lora_rank
|
self.kv_lora_rank = self.hf_text_config.kv_lora_rank
|
||||||
|
|||||||
@@ -463,6 +463,30 @@ def generate_embedding_convs(
|
|||||||
return convs
|
return convs
|
||||||
|
|
||||||
|
|
||||||
|
# Models for which the system automatically prepends modality tokens to the prompt
|
||||||
|
# when there are more media inputs than modality tokens in it (e.g. 3 images but only 2 <image> tokens)
|
||||||
|
_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
|
||||||
|
|
||||||
|
|
||||||
|
# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
|
||||||
|
def _get_full_multimodal_text_prompt(
|
||||||
|
modality_token: str, modality_count: int, text_prompt: str
|
||||||
|
) -> str:
|
||||||
|
"""Combine multimodal prompts for a multimodal language model."""
|
||||||
|
|
||||||
|
# For any existing placeholder in the text prompt, we leave it as is
|
||||||
|
left: int = modality_count - text_prompt.count(modality_token)
|
||||||
|
if left < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Found more '{modality_token}' placeholders in input prompt than "
|
||||||
|
"actual multimodal data items."
|
||||||
|
)
|
||||||
|
|
||||||
|
# NOTE: For now we always add missing modality_token at the front of
|
||||||
|
# the prompt. This may change to be customizable in the future.
|
||||||
|
return "\n".join([modality_token] * left + [text_prompt])
|
||||||
|
|
||||||
|
|
||||||
def generate_chat_conv(
|
def generate_chat_conv(
|
||||||
request: ChatCompletionRequest, template_name: str
|
request: ChatCompletionRequest, template_name: str
|
||||||
) -> Conversation:
|
) -> Conversation:
|
||||||
@@ -520,6 +544,12 @@ def generate_chat_conv(
|
|||||||
if conv.name != "qwen2-vl"
|
if conv.name != "qwen2-vl"
|
||||||
else conv.image_token
|
else conv.image_token
|
||||||
)
|
)
|
||||||
|
add_token_as_needed: bool = (
|
||||||
|
conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
|
||||||
|
)
|
||||||
|
if add_token_as_needed:
|
||||||
|
image_token = ""
|
||||||
|
|
||||||
audio_token = conv.audio_token
|
audio_token = conv.audio_token
|
||||||
for content in message.content:
|
for content in message.content:
|
||||||
if content.type == "text":
|
if content.type == "text":
|
||||||
@@ -533,7 +563,10 @@ def generate_chat_conv(
|
|||||||
elif content.type == "audio_url":
|
elif content.type == "audio_url":
|
||||||
real_content += audio_token
|
real_content += audio_token
|
||||||
conv.append_audio(content.audio_url.url)
|
conv.append_audio(content.audio_url.url)
|
||||||
|
if add_token_as_needed:
|
||||||
|
real_content = _get_full_multimodal_text_prompt(
|
||||||
|
conv.image_token, num_image_url, real_content
|
||||||
|
)
|
||||||
conv.append_message(conv.roles[0], real_content)
|
conv.append_message(conv.roles[0], real_content)
|
||||||
elif msg_role == "assistant":
|
elif msg_role == "assistant":
|
||||||
parsed_content = ""
|
parsed_content = ""
|
||||||
|
|||||||
@@ -382,8 +382,14 @@ class DeepseekModel(nn.Module):
|
|||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
forward_batch: ForwardBatch,
|
forward_batch: ForwardBatch,
|
||||||
|
input_embeds: torch.Tensor = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.embed_tokens(input_ids)
|
|
||||||
|
if input_embeds is None:
|
||||||
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
|
else:
|
||||||
|
hidden_states = input_embeds
|
||||||
|
|
||||||
residual = None
|
residual = None
|
||||||
for i in range(len(self.layers)):
|
for i in range(len(self.layers)):
|
||||||
layer = self.layers[i]
|
layer = self.layers[i]
|
||||||
@@ -416,14 +422,18 @@ class DeepseekForCausalLM(nn.Module):
|
|||||||
)
|
)
|
||||||
self.logits_processor = LogitsProcessor(config)
|
self.logits_processor = LogitsProcessor(config)
|
||||||
|
|
||||||
|
def get_input_embeddings(self) -> nn.Embedding:
    """Return the ``embed_tokens`` module held by ``self.model``."""
    token_embedding = self.model.embed_tokens
    return token_embedding
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
forward_batch: ForwardBatch,
|
forward_batch: ForwardBatch,
|
||||||
|
input_embeds: torch.Tensor = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
hidden_states = self.model(input_ids, positions, forward_batch)
|
hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
|
||||||
return self.logits_processor(
|
return self.logits_processor(
|
||||||
input_ids, hidden_states, self.lm_head, forward_batch
|
input_ids, hidden_states, self.lm_head, forward_batch
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from sglang.srt.managers.mm_utils import (
|
|||||||
from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
|
from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||||
|
from sglang.srt.models.deepseek import DeepseekForCausalLM
|
||||||
from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
|
from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
|
||||||
|
|
||||||
|
|
||||||
@@ -189,7 +190,11 @@ class DeepseekVL2ForCausalLM(nn.Module):
|
|||||||
|
|
||||||
# ----------- language model ------------
|
# ----------- language model ------------
|
||||||
language_config = config.language_config
|
language_config = config.language_config
|
||||||
self.language_model = DeepseekV2ForCausalLM(language_config)
|
if language_config.use_mla:
|
||||||
|
self.language_model = DeepseekV2ForCausalLM(language_config)
|
||||||
|
else:
|
||||||
|
# deepseek-vl2-tiny forbids mla
|
||||||
|
self.language_model = DeepseekForCausalLM(language_config)
|
||||||
|
|
||||||
def _init_vision_module(
|
def _init_vision_module(
|
||||||
self, vision_config, quant_config: Optional[QuantizationConfig]
|
self, vision_config, quant_config: Optional[QuantizationConfig]
|
||||||
|
|||||||
@@ -654,6 +654,30 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
    """Run the shared vision-server tests against ``deepseek-ai/deepseek-vl2-tiny``."""

    @classmethod
    def setUpClass(cls):
        """Launch a server for the tiny model and record its connection details on the class."""
        cls.model = "deepseek-ai/deepseek-vl2-tiny"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Extra command-line flags handed to the server launcher.
        launch_flags = [
            "--trust-remote-code",
            "--chat-template",
            "deepseek-vl2",
            "--context-length",
            "4096",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

        # The launcher receives the bare URL; "/v1" is appended only afterwards.
        cls.base_url = cls.base_url + "/v1"

    def test_video_chat_completion(self):
        """Overridden with an empty body.

        NOTE(review): presumably disables an inherited video test for this
        model; confirm against the base class.
        """
        pass
|
||||||
|
|
||||||
|
|
||||||
class TestJanusProServer(TestOpenAIVisionServer):
|
class TestJanusProServer(TestOpenAIVisionServer):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
|
|||||||
Reference in New Issue
Block a user