vlm: support video as an input modality (#5888)
@@ -3,7 +3,6 @@ Unit tests for Jinja chat template utils.
 """

 import unittest
-from unittest.mock import patch

 from sglang.srt.jinja_template_utils import (
     detect_jinja_template_content_format,
@@ -76,11 +75,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that image_data was extracted
@@ -111,11 +111,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "string", image_data, audio_data, modalities
+            msg_dict, "string", image_data, video_data, audio_data, modalities
         )

         # For string format, should flatten to text only
@@ -139,11 +140,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that audio_data was extracted
@@ -162,11 +164,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         msg_dict = {"role": "user", "content": "Hello world"}

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Should pass through unchanged
@@ -188,11 +191,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that modalities was extracted
@@ -209,11 +213,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "string", image_data, audio_data, modalities
+            msg_dict, "string", image_data, video_data, audio_data, modalities
         )

         # None values should be filtered out

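For orientation, a minimal sketch of how the extended helper is exercised after this change. The call signature (with `video_data` inserted between `image_data` and `audio_data`) comes from the updated tests above; the `video_url` part shape is taken from the server tests later in this diff, and the idea that matching URLs land in `video_data` is an assumption by analogy with `image_data` and `audio_data`:

    from sglang.srt.jinja_template_utils import process_content_for_template_format

    msg_dict = {
        "role": "user",
        "content": [
            # Hypothetical URL, used only for illustration.
            {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
            {"type": "text", "text": "Describe the clip."},
        ],
    }

    image_data, video_data, audio_data, modalities = [], [], [], []

    # New signature: video_data now sits between image_data and audio_data.
    result = process_content_for_template_format(
        msg_dict, "openai", image_data, video_data, audio_data, modalities
    )
    # Assumption: video_data collects the extracted video URL(s), mirroring how
    # image_data and audio_data are populated in the tests above.
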
@@ -35,6 +35,9 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

+    def test_video_chat_completion(self):
+        self._test_video_chat_completion()
+

 class TestQwen2_5_VLServer(TestOpenAIVisionServer):
     @classmethod
@@ -54,6 +57,9 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

+    def test_video_chat_completion(self):
+        self._test_video_chat_completion()
+

 class TestVLMContextLengthIssue(CustomTestCase):
     @classmethod

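The base class keeps the actual video check as a private helper, `_test_video_chat_completion`, so each model suite opts in explicitly with a two-line public wrapper, as Qwen2-VL and Qwen2.5-VL do above. A hypothetical new suite would opt in the same way:

    class TestSomeVideoCapableServer(TestOpenAIVisionServer):
        # Hypothetical suite name; real suites also define setUpClass to launch
        # their model, as in the classes above.
        def test_video_chat_completion(self):
            self._test_video_chat_completion()
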
@@ -93,7 +93,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

-    def test_video_chat_completion(self):
+    def test_video_images_chat_completion(self):
         pass

     def test_single_image_chat_completion(self):
@@ -170,7 +170,7 @@ class TestKimiVLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

-    def test_video_chat_completion(self):
+    def test_video_images_chat_completion(self):
         pass


@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0

-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed

@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu

-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
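`prepare_video_images_messages` feeds the model a fixed number of uniformly sampled frames rather than the raw video. A minimal, self-contained sketch of that sampling step, assuming frames are JPEG-encoded and base64'd for the `data:image/jpeg;base64,{}` URLs used below (only `VideoReader`, `cpu`, and the `np.linspace` sampling appear verbatim in the hunk above; the encoding details are an assumption):

    import base64
    import io

    import numpy as np
    from decord import VideoReader, cpu
    from PIL import Image


    def sample_frames_as_base64(video_path, max_frames_num=10):
        # Uniformly sample up to max_frames_num frame indices across the clip.
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(
            0, total_frame_num - 1, max_frames_num, dtype=int
        )
        frames = vr.get_batch(uniform_sampled_frames.tolist()).asnumpy()  # (N, H, W, 3)

        base64_frames = []
        for frame in frames:
            # Assumption: JPEG-encode each frame for a data: URL.
            buf = io.BytesIO()
            Image.fromarray(frame).save(buf, format="JPEG")
            base64_frames.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
        return base64_frames
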
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }

         for base64_frame in base64_frames:
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):

         return messages

-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
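The two helpers now produce two distinct request shapes: `prepare_video_images_messages` sends a list of `image_url` parts (one per sampled frame, each tagged `"modalities": "image"`), while the new `prepare_video_messages` sends the video as its own content type. A minimal sketch of the direct-video message, with a hypothetical local path:

    video_path = "/tmp/videos/example_clip.mp4"  # hypothetical local file

    messages = [
        {
            "role": "user",
            "content": [
                # New in this PR: video is a first-class content part,
                # not a sequence of base64 frames.
                {"type": "video_url", "video_url": {"url": f"{video_path}"}},
                {"type": "text", "text": "Please describe the video in detail."},
            ],
        }
    ]
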
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
             f.write(response.content)
         return file_path

-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)

         client = openai.Client(api_key=self.api_key, base_url=self.base_url)

-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)

         response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):

         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
         ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response

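End to end, `_test_video_chat_completion` simply pushes such a message through the OpenAI-compatible endpoint of a launched sglang server. A condensed sketch (the API key and base URL are placeholders; in the tests they come from the server fixture):

    import openai

    client = openai.Client(api_key="sk-placeholder", base_url="http://127.0.0.1:30000/v1")

    response = client.chat.completions.create(
        model="default",
        messages=messages,  # the video_url message built in the sketch above
        temperature=0,
        max_tokens=1024,
        stream=False,
    )
    print(response.choices[0].message.content)
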
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModel, AutoProcessor, AutoTokenizer

 from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):


-# TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
-# class TestMiniCPMVLogits(VisionLLMLogitsBase):
-#     @classmethod
-#     def setUpClass(cls):
-#         super().setUpClass()
-#         cls.model_path = "openbmb/MiniCPM-V-2_6"
-#         cls.tokenizer = AutoTokenizer.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.processor = AutoProcessor.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.chat_template = "minicpmv"
-#
-#         cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         cls.hf_model = (
-#             AutoModel.from_pretrained(
-#                 cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
-#             )
-#             .eval()
-#             .to(cls.device)
-#         )
-#         init_embedding_cache(0)
-#
-#     async def test_vlm_embedding_output(self):
-#         """
-#         Compares the embedding output of vlm
-#         """
-#         inputs = self.get_processor_output()
-#
-#         with torch.no_grad():
-#             # hf
-#             model_inputs = {
-#                 "input_ids": inputs.input_ids,
-#                 "image_bound": inputs.image_bound,
-#                 "pixel_values": inputs.pixel_values,
-#                 "tgt_sizes": inputs.tgt_sizes,
-#             }
-#             (hf_output, _) = self.hf_model.get_vllm_embedding(
-#                 model_inputs,
-#             )
-#             hf_output = hf_output.squeeze(0)
-#
-#             # sglang
-#             model = self.get_sglang_model()
-#             input_ids = inputs["input_ids"].to(self.device).flatten()
-#
-#             pixel_values = inputs["pixel_values"]
-#             tgt_sizes = inputs["tgt_sizes"]
-#             pixel_values_flat: List[torch.Tensor] = []
-#             tgt_sizes_flat: List[torch.Tensor] = []
-#             for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-#                 # per image
-#                 if len(pixel_b) != len(tgt_b):
-#                     raise ValueError(
-#                         "Inconsistent N lengths, found: "
-#                         f"{len(pixel_b)} vs {len(tgt_b)}"
-#                     )
-#                 for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-#                     pixel_values_flat += [pixel_n]
-#                     tgt_sizes_flat += [tgt_n]
-#
-#             im_start_id, im_end_id = (
-#                 self.tokenizer.im_start_id,
-#                 self.tokenizer.im_end_id,
-#             )
-#             slice_start_id, slice_end_id = (
-#                 self.tokenizer.slice_start_id,
-#                 self.tokenizer.slice_end_id,
-#             )
-#
-#             image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
-#             )
-#             slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
-#             )
-#             image_offsets.extend(slice_offsets)
-#             image_offsets = sorted(image_offsets)
-#
-#             sglang_output = embed_mm_inputs(
-#                 mm_inputs_list=[
-#                     MultimodalInputs(
-#                         mm_items=[
-#                             MultimodalDataItem(
-#                                 pixel_values=pixel_values_flat,
-#                                 image_offsets=image_offsets,
-#                                 tgt_size=tgt_sizes_flat,
-#                                 modality=Modality.IMAGE,
-#                                 pad_value=self.processor.tokenizer.unk_token_id,
-#                             )
-#                         ]
-#                     ),
-#                 ],
-#                 extend_prefix_lens=[0],
-#                 extend_seq_lens=[input_ids.shape[0]],
-#                 input_ids=input_ids,
-#                 input_embedding=model.get_input_embeddings(),
-#                 image_data_embedding_func=model.get_image_feature,
-#                 placeholder_tokens={
-#                     Modality.IMAGE: self.processor.tokenizer.unk_token_id,
-#                 },
-#             )
-#
-#             self.compare_outputs(sglang_output, hf_output)
+class TestMiniCPMVLogits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                pixel_values=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)
+