[Feat] Add modalities for vision server when handling pixel values for llava (#1346)

This commit is contained in:
Kaichen Zhang - NTU
2024-09-09 17:07:34 +08:00
committed by GitHub
parent 8e6bdf851c
commit 662ecd9368
11 changed files with 40 additions and 2 deletions

View File

@@ -50,6 +50,8 @@ class GenerateReqInput:
return_text_in_logprobs: bool = False
# Whether to stream output.
stream: bool = False
# The modalities of the image data; each entry is one of: image, multi-images, video.
modalities: Optional[List[str]] = None
def post_init(self):
if (self.text is None and self.input_ids is None) or (
@@ -177,6 +179,8 @@ class TokenizedGenerateReqInput:
top_logprobs_num: int
# Whether to stream output
stream: bool
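# Modalities of the input images.
# NOTE(review): this field is spelled `modalites`, whereas GenerateReqInput uses `modalities` — confirm whether the spelling is intentional before renaming, since callers may depend on it.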
modalites: Optional[List[str]] = None
@dataclass