From 3a911b854dfa9fb2999f1e6bb7db8957aeb89b11 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com> Date: Mon, 30 Jun 2025 23:14:48 -0700 Subject: [PATCH] Refactor mm processors and Enable mixed modality processing (#7629) Signed-off-by: Xinyuan Tong --- python/sglang/srt/managers/mm_utils.py | 78 ++--- python/sglang/srt/managers/schedule_batch.py | 9 + .../sglang/srt/managers/tokenizer_manager.py | 25 +- python/sglang/srt/models/deepseek_vl2.py | 8 +- python/sglang/srt/models/gemma3n_mm.py | 24 +- python/sglang/srt/models/kimi_vl.py | 3 +- python/sglang/srt/models/mllama4.py | 5 +- python/sglang/srt/models/phi4mm.py | 4 +- python/sglang/srt/models/pixtral.py | 10 +- python/sglang/srt/models/qwen2_5_vl.py | 4 +- python/sglang/srt/models/qwen2_vl.py | 5 +- python/sglang/srt/models/vila.py | 11 +- .../multimodal/processors/base_processor.py | 309 ++++++++---------- .../sglang/srt/multimodal/processors/clip.py | 11 +- .../multimodal/processors/deepseek_vl_v2.py | 9 +- .../srt/multimodal/processors/gemma3.py | 9 +- .../srt/multimodal/processors/gemma3n.py | 21 +- .../srt/multimodal/processors/internvl.py | 7 - .../srt/multimodal/processors/janus_pro.py | 6 - .../srt/multimodal/processors/kimi_vl.py | 9 +- .../sglang/srt/multimodal/processors/llava.py | 6 - .../srt/multimodal/processors/minicpm.py | 9 +- .../sglang/srt/multimodal/processors/mlama.py | 12 +- .../srt/multimodal/processors/mllama4.py | 3 - .../srt/multimodal/processors/phi4mm.py | 12 +- .../srt/multimodal/processors/pixtral.py | 6 - .../srt/multimodal/processors/qwen_vl.py | 10 +- .../sglang/srt/multimodal/processors/vila.py | 38 +-- 28 files changed, 235 insertions(+), 428 deletions(-) diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py index d05df897f..94abc80df 100644 --- a/python/sglang/srt/managers/mm_utils.py +++ b/python/sglang/srt/managers/mm_utils.py @@ -125,74 +125,38 @@ class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPa e.g. ...., or