vlm: support video as an input modality (#5888)
This commit is contained in:
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
assert response.usage.completion_tokens > 0
|
||||
assert response.usage.total_tokens > 0
|
||||
|
||||
def prepare_video_messages(self, video_path):
|
||||
def prepare_video_images_messages(self, video_path):
|
||||
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
|
||||
# the size of the video embeds differs from the `modality` argument when preprocessed
|
||||
|
||||
@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
# from transformers import AutoTokenizer
|
||||
from decord import VideoReader, cpu
|
||||
|
||||
max_frames_num = 20
|
||||
max_frames_num = 10
|
||||
vr = VideoReader(video_path, ctx=cpu(0))
|
||||
total_frame_num = len(vr)
|
||||
uniform_sampled_frames = np.linspace(
|
||||
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
frame_format = {
|
||||
"type": "image_url",
|
||||
"image_url": {"url": "data:image/jpeg;base64,{}"},
|
||||
"modalities": "video",
|
||||
"modalities": "image",
|
||||
}
|
||||
|
||||
for base64_frame in base64_frames:
|
||||
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
|
||||
return messages
|
||||
|
||||
def prepare_video_messages_video_direct(self, video_path):
|
||||
def prepare_video_messages(self, video_path):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"video:{video_path}"},
|
||||
"modalities": "video",
|
||||
"type": "video_url",
|
||||
"video_url": {"url": f"{video_path}"},
|
||||
},
|
||||
{"type": "text", "text": "Please describe the video in detail."},
|
||||
],
|
||||
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
f.write(response.content)
|
||||
return file_path
|
||||
|
||||
def test_video_chat_completion(self):
|
||||
# this test samples frames of video as input, but not video directly
|
||||
def test_video_images_chat_completion(self):
|
||||
url = VIDEO_JOBS_URL
|
||||
file_path = self.get_or_download_file(url)
|
||||
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
|
||||
messages = self.prepare_video_images_messages(file_path)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="default",
|
||||
messages=messages,
|
||||
temperature=0,
|
||||
max_tokens=1024,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
video_response = response.choices[0].message.content
|
||||
|
||||
print("-" * 30)
|
||||
print(f"Video images response:\n{video_response}")
|
||||
print("-" * 30)
|
||||
|
||||
# Add assertions to validate the video response
|
||||
assert (
|
||||
"iPod" in video_response
|
||||
or "device" in video_response
|
||||
or "microphone" in video_response
|
||||
), video_response
|
||||
assert (
|
||||
"man" in video_response
|
||||
or "person" in video_response
|
||||
or "individual" in video_response
|
||||
or "speaker" in video_response
|
||||
), video_response
|
||||
assert (
|
||||
"present" in video_response
|
||||
or "examine" in video_response
|
||||
or "display" in video_response
|
||||
or "hold" in video_response
|
||||
)
|
||||
assert "black" in video_response or "dark" in video_response
|
||||
self.assertIsNotNone(video_response)
|
||||
self.assertGreater(len(video_response), 0)
|
||||
|
||||
def _test_video_chat_completion(self):
|
||||
url = VIDEO_JOBS_URL
|
||||
file_path = self.get_or_download_file(url)
|
||||
|
||||
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
|
||||
|
||||
# messages = self.prepare_video_messages_video_direct(file_path)
|
||||
messages = self.prepare_video_messages(file_path)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):
|
||||
|
||||
# Add assertions to validate the video response
|
||||
assert (
|
||||
"iPod" in video_response or "device" in video_response
|
||||
"iPod" in video_response
|
||||
or "device" in video_response
|
||||
or "microphone" in video_response
|
||||
), f"video_response: {video_response}, should contain 'iPod' or 'device'"
|
||||
assert (
|
||||
"man" in video_response
|
||||
|
||||
Reference in New Issue
Block a user