[Feat] Add modalities for vision server when handling pixel values for llava (#1346)

This commit is contained in:
Kaichen Zhang - NTU
2024-09-09 17:07:34 +08:00
committed by GitHub
parent 8e6bdf851c
commit 662ecd9368
11 changed files with 40 additions and 2 deletions

View File

@@ -93,12 +93,14 @@ def multi_image_stream_request_test(client):
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
},
"modalities": "multi-images",
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
},
"modalities": "multi-images",
},
{
"type": "text",
@@ -218,6 +220,7 @@ def prepare_video_messages(video_path):
frame_format = {
"type": "image_url",
"image_url": {"url": "data:image/jpeg;base64,{}"},
"modalities": "video",
}
for base64_frame in base64_frames: