[feat&refactor] Enhance multimodal input support with refactor io_struct (#4938)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
2025-04-08 14:48:07 -07:00
parent f8194b267c
commit d09a51f1f6
4 changed files with 811 additions and 104 deletions
--- a/python/sglang/srt/entrypoints/verl_engine.py
+++ b/python/sglang/srt/entrypoints/verl_engine.py
@@ -16,6 +16,7 @@ from typing import Dict, List, Optional, Tuple, Union

 import torch
 import torch.distributed as dist
+from PIL.Image import Image
 from torch.distributed.tensor import DeviceMesh, DTensor

 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
@@ -56,9 +57,19 @@ class VerlEngine:
        sampling_params: Optional[Union[List[Dict], Dict]] = None,
        # The token ids for text; one can either specify text or input_ids.
        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        # The image input. It can be a file name, a url, or base64 encoded string.
-        # See also python/sglang/srt/utils.py:load_image.
-        image_data: Optional[Union[List[str], str]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[
+            Union[
+                List[List[Union[Image, str]]],
+                List[Union[Image, str]],
+                Union[Image, str],
+            ]
+        ] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,
        logprob_start_len: Optional[Union[List[int], int]] = None,
        top_logprobs_num: Optional[Union[List[int], int]] = None,