[Feat] Add modalities for vision server when handling pixel values for llava (#1346)

This commit is contained in:
Kaichen Zhang - NTU
2024-09-09 17:07:34 +08:00
committed by GitHub
parent 8e6bdf851c
commit 662ecd9368
11 changed files with 40 additions and 2 deletions

View File

@@ -71,6 +71,7 @@ class Conversation:
# Stop criteria (the default one is EOS token)
stop_str: Union[str, List[str]] = None
image_data: Optional[List[str]] = None
modalities: Optional[List[str]] = None
def get_prompt(self) -> str:
"""Get the prompt for generation."""
@@ -379,6 +380,7 @@ def generate_chat_conv(
sep2=conv.sep2,
stop_str=conv.stop_str,
image_data=[],
modalities=[],
)
if isinstance(request.messages, str):
@@ -408,6 +410,7 @@ def generate_chat_conv(
for content in message.content:
if content.type == "image_url":
num_image_url += 1
conv.modalities.append(content.modalities)
if num_image_url > 1:
image_token = "<image>"
else: