[VLM RLHF] Take Image input for verl vlm rollout (#4915)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: GeLee <leege233@gmail.com>
2025-04-01 20:03:17 -07:00
parent 12047f5e94
commit 9eb49e878b
4 changed files with 18 additions and 12 deletions
--- a/python/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -139,8 +139,6 @@ class BaseMultimodalProcessor(ABC):
        else:
            multimodal_tokens.image_token = multimodal_tokens.image_token

-        assert isinstance(prompt, str)
-
        if isinstance(prompt, list) and return_text:
            assert len(prompt) and isinstance(prompt[0], int)
            prompt = self._processor.tokenizer.decode(prompt)
@@ -204,7 +202,16 @@ class BaseMultimodalProcessor(ABC):
                            continue

                    image_sizes += frames[0].size * len(frames)
-                    hashes += [hash(image_file)] * len(frames)
+
+                    # Generate a hashable value for the image file
+                    if isinstance(image_file, Image.Image):
+                        # For PIL.Image objects, use the ID as a hashable value
+                        hash_value = hash(id(image_file))
+                    else:
+                        # For other types (strings, etc.), use the regular hash
+                        hash_value = hash(image_file)
+
+                    hashes += [hash_value] * len(frames)
                    images += frames
                    image_index += 1
                    if frames_to_process != 0:
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -5,7 +5,7 @@ from typing import List, Union
 import torch
 from PIL import Image

-from sglang.srt.managers.multimodal_processor import (
+from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (