diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index 58c55c0f8..e3c8edc92 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py +++ b/python/sglang/srt/multimodal/processors/glm4v.py @@ -2,7 +2,6 @@ import re from typing import List, Union from decord import VideoReader -from transformers.video_utils import VideoMetadata from sglang.srt.layers.rotary_embedding import MRotaryEmbedding from sglang.srt.models.glm4v import Glm4vForConditionalGeneration @@ -66,17 +65,18 @@ class Glm4vImageProcessor(SGLangBaseProcessor): total_num_frames = len(vr) duration = total_num_frames / video_fps if video_fps else 0 - metadata = VideoMetadata( - total_num_frames=int(total_num_frames), - fps=float(video_fps), - duration=float(duration), - video_backend="decord", - ) - # Extract all frames indices = list(range(total_num_frames)) frames = vr.get_batch(indices).asnumpy() - metadata.frames_indices = indices + + # Return metadata as dict so transformers can properly create VideoMetadata objects + metadata = { + "total_num_frames": int(total_num_frames), + "fps": float(video_fps), + "duration": float(duration), + "video_backend": "decord", + "frames_indices": indices, + } return frames, metadata diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py index fd952f82f..6c2fa86d5 100644 --- a/test/srt/test_vision_openai_server_b.py +++ b/test/srt/test_vision_openai_server_b.py @@ -217,31 +217,27 @@ class TestKimiVLServer(ImageOpenAITestMixin): pass -# Skip for ci test -# class TestGLM41VServer(TestOpenAIVisionServer): -# @classmethod -# def setUpClass(cls): -# cls.model = "zai-org/GLM-4.1V-9B-Thinking" -# cls.base_url = DEFAULT_URL_FOR_TEST -# cls.api_key = "sk-123456" -# cls.process = popen_launch_server( -# cls.model, -# cls.base_url, -# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, -# other_args=[ -# "--trust-remote-code", -# "--mem-fraction-static", -# "0.68", -# "--cuda-graph-max-bs", -# "4", -# "--reasoning-parser", -# "glm45", -# ], -# ) -# cls.base_url += "/v1" - -# def test_video_chat_completion(self): -# self._test_video_chat_completion() +class TestGLM41VServer(ImageOpenAITestMixin, VideoOpenAITestMixin): + @classmethod + def setUpClass(cls): + cls.model = "zai-org/GLM-4.1V-9B-Thinking" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--mem-fraction-static", + "0.68", + "--cuda-graph-max-bs", + "4", + "--reasoning-parser", + "glm45", + ], + ) + cls.base_url += "/v1" if __name__ == "__main__":