vlm: support video as an input modality (#5888)

2025-07-10 14:48:35 +08:00
parent 4ed57807c2
commit b5e3d6031c
42 changed files with 887 additions and 524 deletions
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -787,7 +787,9 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
            forward_batch=forward_batch,
            get_embedding=get_embedding,
            language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
            placeholder_tokens=None,  # using mm_item.pad_value
            positions=positions,
        )