Support precomputed multimodal features for Qwen-VL and Gemma3 models. (#6136)

Co-authored-by: Yury Sulsky <ysulsky@tesla.com>
2025-05-16 12:26:15 -07:00
parent c23a7072b6
commit f19a9204cd
14 changed files with 592 additions and 125 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -47,6 +47,7 @@ from sglang.srt.managers.io_struct import (
    EmbeddingReqInput,
    GenerateReqInput,
    GetWeightsByNameReqInput,
+    ImageDataItem,
    InitWeightsUpdateGroupReqInput,
    ReleaseMemoryOccupationReqInput,
    ResumeMemoryOccupationReqInput,
@@ -150,9 +151,9 @@ class Engine(EngineBase):
        # See also python/sglang/srt/utils.py:load_image for more details.
        image_data: Optional[
            Union[
-                List[List[Union[Image, str]]],
-                List[Union[Image, str]],
-                Union[Image, str],
+                List[List[ImageDataItem]],
+                List[ImageDataItem],
+                ImageDataItem,
            ]
        ] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,
@@ -221,9 +222,9 @@ class Engine(EngineBase):
        # See also python/sglang/srt/utils.py:load_image for more details.
        image_data: Optional[
            Union[
-                List[List[Union[Image, str]]],
-                List[Union[Image, str]],
-                Union[Image, str],
+                List[List[ImageDataItem]],
+                List[ImageDataItem],
+                ImageDataItem,
            ]
        ] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,