From fde2decf8b5989d7ab172ffe258f8c99eb1e643a Mon Sep 17 00:00:00 2001
From: Zheng Wengang <zwg0606@gmail.com>
Date: Wed, 22 Oct 2025 06:36:01 +0800
Subject: [PATCH] [BugFix][Qwen3-VL]: add metadata for video in qwen3-vl
 (#11377)

---
 .../srt/multimodal/processors/qwen_vl.py      | 32 +++++++++++++++----
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py
index 21787b392..1bf12708a 100644
--- a/python/sglang/srt/multimodal/processors/qwen_vl.py
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -214,7 +214,14 @@ async def preprocess_video(
         interpolation=InterpolationMode.BICUBIC,
         antialias=True,
     ).float()
-    return video
+    video_metadata = {
+        "fps": video_fps,
+        "duration": total_frames / video_fps,
+        "total_num_frames": total_frames,
+        "frames_indices": idx,
+        "video_backend": "torchvision",
+    }
+    return video, video_metadata
 
 
 # Compatible with Qwen-VL & Qwen-Omni Series
@@ -279,14 +286,25 @@ class QwenVLImageProcessor(SGLangBaseProcessor):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
+        video_metadata = None
         if base_output.videos:
-            base_output.videos = [
-                await preprocess_video(video) for video in base_output.videos
-            ]
+            video_results = await asyncio.gather(
+                *[preprocess_video(video) for video in base_output.videos]
+            )
+            base_output.videos, video_metadata = map(list, zip(*video_results))
 
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
-        )
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output,
+                self.mm_tokens,
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+            )
+        else:
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output, self.mm_tokens
+            )
 
         audio_feature_lengths = None