[Feat] Add modalities for vision server when handling pixel values for llava (#1346)

2024-09-09 17:07:34 +08:00
parent 8e6bdf851c
commit 662ecd9368
11 changed files with 40 additions and 2 deletions
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -50,6 +50,8 @@ class GenerateReqInput:
    return_text_in_logprobs: bool = False
    # Whether to stream output.
    stream: bool = False
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None

    def post_init(self):
        if (self.text is None and self.input_ids is None) or (
@@ -177,6 +179,8 @@ class TokenizedGenerateReqInput:
    top_logprobs_num: int
    # Whether to stream output
    stream: bool
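+    # Modalities of the input images. NOTE(review): this field is spelled `modalites`, but GenerateReqInput spells it `modalities` -- confirm callers before renaming.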
+    modalites: Optional[List[str]] = None


@dataclass