From b5e3d6031c33fe84be861ccb6d0fcfe2c21d062a Mon Sep 17 00:00:00 2001 From: Mick Date: Thu, 10 Jul 2025 14:48:35 +0800 Subject: [PATCH] vlm: support video as an input modality (#5888) --- python/sglang/srt/conversation.py | 23 +- .../sglang/srt/entrypoints/openai/protocol.py | 11 + .../srt/entrypoints/openai/serving_chat.py | 7 + python/sglang/srt/jinja_template_utils.py | 8 + python/sglang/srt/managers/io_struct.py | 27 +- python/sglang/srt/managers/mm_utils.py | 149 +++----- python/sglang/srt/managers/schedule_batch.py | 21 +- .../srt/model_executor/forward_batch_info.py | 14 +- .../sglang/srt/models/deepseek_janus_pro.py | 2 +- python/sglang/srt/models/deepseek_vl2.py | 2 +- python/sglang/srt/models/gemma3_mm.py | 2 +- python/sglang/srt/models/gemma3n_mm.py | 9 +- python/sglang/srt/models/internvl.py | 10 +- python/sglang/srt/models/kimi_vl.py | 10 +- python/sglang/srt/models/llava.py | 4 +- python/sglang/srt/models/llavavid.py | 2 +- python/sglang/srt/models/minicpmo.py | 3 +- python/sglang/srt/models/minicpmv.py | 2 +- python/sglang/srt/models/mllama4.py | 17 +- python/sglang/srt/models/phi4mm.py | 10 +- python/sglang/srt/models/qwen2_5_vl.py | 17 +- python/sglang/srt/models/qwen2_vl.py | 13 +- python/sglang/srt/models/vila.py | 10 +- .../multimodal/processors/base_processor.py | 328 +++++++++++------- .../multimodal/processors/deepseek_vl_v2.py | 2 +- .../srt/multimodal/processors/gemma3.py | 6 +- .../srt/multimodal/processors/gemma3n.py | 2 +- .../srt/multimodal/processors/internvl.py | 2 +- .../srt/multimodal/processors/janus_pro.py | 2 +- .../srt/multimodal/processors/kimi_vl.py | 2 +- .../srt/multimodal/processors/minicpm.py | 7 +- .../srt/multimodal/processors/mllama4.py | 2 +- .../srt/multimodal/processors/phi4mm.py | 2 +- .../srt/multimodal/processors/pixtral.py | 2 +- .../srt/multimodal/processors/qwen_vl.py | 283 ++++++++++----- .../sglang/srt/multimodal/processors/vila.py | 2 +- python/sglang/srt/utils.py | 85 +++-- test/srt/test_jinja_template_utils.py | 19 +- test/srt/test_vision_openai_server_a.py | 6 + test/srt/test_vision_openai_server_b.py | 4 +- test/srt/test_vision_openai_server_common.py | 65 +++- test/srt/test_vlm_accuracy.py | 217 ++++++------ 42 files changed, 887 insertions(+), 524 deletions(-) diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index facd82637..c085c4423 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -88,9 +88,11 @@ class Conversation: stop_str: Union[str, List[str]] = None # The string that represents an image token in the prompt image_token: str = "" + video_token: str = "