[BugFix][Qwen3-VL]: add metadata for video in qwen3-vl (#11377)

2025-10-22 06:36:01 +08:00
parent 9792b9d7e3
commit fde2decf8b
1 changed files with 25 additions and 7 deletions
--- a/python/sglang/srt/multimodal/processors/qwen_vl.py
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -214,7 +214,14 @@ async def preprocess_video(
        interpolation=InterpolationMode.BICUBIC,
        antialias=True,
    ).float()
-    return video
+    video_metadata = {
        "fps": video_fps,
        "duration": total_frames / video_fps,
        "total_num_frames": total_frames,
        "frames_indices": idx,
        "video_backend": "torchvision",
    }
    return video, video_metadata
 # Compatible with Qwen-VL & Qwen-Omni Series
@@ -279,14 +286,25 @@ class QwenVLImageProcessor(SGLangBaseProcessor):
            resize_tasks = [resize_image_async(image) for image in base_output.images]
            base_output.images = await asyncio.gather(*resize_tasks)
        video_metadata = None
        if base_output.videos:
-            base_output.videos = [
+            video_results = await asyncio.gather(
-                await preprocess_video(video) for video in base_output.videos
+                *[preprocess_video(video) for video in base_output.videos]
-            ]
+            )
            base_output.videos, video_metadata = map(list, zip(*video_results))
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
-            base_output, self.mm_tokens
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
-        )
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
                base_output,
                self.mm_tokens,
                video_metadata=video_metadata,
                do_sample_frames=False,
            )
        else:
            mm_items, input_ids, ret = self.process_and_combine_mm_data(
                base_output, self.mm_tokens
            )
        audio_feature_lengths = None