[Feat] Add modalities for vision server when handling pixel values for llava (#1346)
This commit is contained in:
committed by
GitHub
parent
8e6bdf851c
commit
662ecd9368
@@ -50,6 +50,8 @@ class GenerateReqInput:
|
||||
return_text_in_logprobs: bool = False
|
||||
# Whether to stream output.
|
||||
stream: bool = False
|
||||
# The modalities of the image data: [image, multi-images, video]
|
||||
modalities: Optional[List[str]] = None
|
||||
|
||||
def post_init(self):
|
||||
if (self.text is None and self.input_ids is None) or (
|
||||
@@ -177,6 +179,8 @@ class TokenizedGenerateReqInput:
|
||||
top_logprobs_num: int
|
||||
# Whether to stream output
|
||||
stream: bool
|
||||
# Modalities of the input images (NOTE: this field is spelled `modalites`, whereas GenerateReqInput spells it `modalities`)
|
||||
modalites: Optional[List[str]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Reference in New Issue
Block a user