vlm: support video as an input modality (#5888)

This commit is contained in:
Mick
2025-07-10 14:48:35 +08:00
committed by GitHub
parent 4ed57807c2
commit b5e3d6031c
42 changed files with 887 additions and 524 deletions

View File

@@ -3,7 +3,6 @@ Unit tests for Jinja chat template utils.
"""
import unittest
from unittest.mock import patch
from sglang.srt.jinja_template_utils import (
detect_jinja_template_content_format,
@@ -76,11 +75,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "openai", image_data, audio_data, modalities
+ msg_dict, "openai", image_data, video_data, audio_data, modalities
)
# Check that image_data was extracted
@@ -111,11 +111,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "string", image_data, audio_data, modalities
+ msg_dict, "string", image_data, video_data, audio_data, modalities
)
# For string format, should flatten to text only
@@ -139,11 +140,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "openai", image_data, audio_data, modalities
+ msg_dict, "openai", image_data, video_data, audio_data, modalities
)
# Check that audio_data was extracted
@@ -162,11 +164,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
msg_dict = {"role": "user", "content": "Hello world"}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "openai", image_data, audio_data, modalities
+ msg_dict, "openai", image_data, video_data, audio_data, modalities
)
# Should pass through unchanged
@@ -188,11 +191,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "openai", image_data, audio_data, modalities
+ msg_dict, "openai", image_data, video_data, audio_data, modalities
)
# Check that modalities was extracted
@@ -209,11 +213,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
}
image_data = []
+ video_data = []
audio_data = []
modalities = []
result = process_content_for_template_format(
- msg_dict, "string", image_data, audio_data, modalities
+ msg_dict, "string", image_data, video_data, audio_data, modalities
)
# None values should be filtered out