feat: update multimodal data handling in engine entrypoint (#8002)
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
@@ -46,9 +46,9 @@ from sglang.srt.managers.io_struct import (
|
||||
EmbeddingReqInput,
|
||||
GenerateReqInput,
|
||||
GetWeightsByNameReqInput,
|
||||
ImageDataItem,
|
||||
InitWeightsUpdateGroupReqInput,
|
||||
LoadLoRAAdapterReqInput,
|
||||
MultimodalDataInputFormat,
|
||||
ReleaseMemoryOccupationReqInput,
|
||||
ResumeMemoryOccupationReqInput,
|
||||
RpcReqInput,
|
||||
@@ -148,13 +148,9 @@ class Engine(EngineBase):
|
||||
# - List of images (one per request in a batch)
|
||||
# - List of lists of images (multiple images per request)
|
||||
# See also python/sglang/srt/utils.py:load_image for more details.
|
||||
image_data: Optional[
|
||||
Union[
|
||||
List[List[ImageDataItem]],
|
||||
List[ImageDataItem],
|
||||
ImageDataItem,
|
||||
]
|
||||
] = None,
|
||||
image_data: Optional[MultimodalDataInputFormat] = None,
|
||||
audio_data: Optional[MultimodalDataInputFormat] = None,
|
||||
video_data: Optional[MultimodalDataInputFormat] = None,
|
||||
return_logprob: Optional[Union[List[bool], bool]] = False,
|
||||
logprob_start_len: Optional[Union[List[int], int]] = None,
|
||||
top_logprobs_num: Optional[Union[List[int], int]] = None,
|
||||
@@ -187,6 +183,8 @@ class Engine(EngineBase):
|
||||
input_ids=input_ids,
|
||||
sampling_params=sampling_params,
|
||||
image_data=image_data,
|
||||
audio_data=audio_data,
|
||||
video_data=video_data,
|
||||
return_logprob=return_logprob,
|
||||
logprob_start_len=logprob_start_len,
|
||||
top_logprobs_num=top_logprobs_num,
|
||||
@@ -231,13 +229,9 @@ class Engine(EngineBase):
|
||||
# - List of images (one per request in a batch)
|
||||
# - List of lists of images (multiple images per request)
|
||||
# See also python/sglang/srt/utils.py:load_image for more details.
|
||||
image_data: Optional[
|
||||
Union[
|
||||
List[List[ImageDataItem]],
|
||||
List[ImageDataItem],
|
||||
ImageDataItem,
|
||||
]
|
||||
] = None,
|
||||
image_data: Optional[MultimodalDataInputFormat] = None,
|
||||
audio_data: Optional[MultimodalDataInputFormat] = None,
|
||||
video_data: Optional[MultimodalDataInputFormat] = None,
|
||||
return_logprob: Optional[Union[List[bool], bool]] = False,
|
||||
logprob_start_len: Optional[Union[List[int], int]] = None,
|
||||
top_logprobs_num: Optional[Union[List[int], int]] = None,
|
||||
@@ -272,6 +266,8 @@ class Engine(EngineBase):
|
||||
input_ids=input_ids,
|
||||
sampling_params=sampling_params,
|
||||
image_data=image_data,
|
||||
audio_data=audio_data,
|
||||
video_data=video_data,
|
||||
return_logprob=return_logprob,
|
||||
logprob_start_len=logprob_start_len,
|
||||
top_logprobs_num=top_logprobs_num,
|
||||
@@ -295,19 +291,20 @@ class Engine(EngineBase):
|
||||
def encode(
|
||||
self,
|
||||
prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
|
||||
image_data: Optional[
|
||||
Union[
|
||||
List[List[Union[Image, str]]],
|
||||
List[Union[Image, str]],
|
||||
Union[Image, str],
|
||||
]
|
||||
] = None,
|
||||
image_data: Optional[MultimodalDataInputFormat] = None,
|
||||
audio_data: Optional[MultimodalDataInputFormat] = None,
|
||||
video_data: Optional[MultimodalDataInputFormat] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
The arguments of this function are the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
|
||||
Please refer to `EmbeddingReqInput` for the documentation.
|
||||
"""
|
||||
obj = EmbeddingReqInput(text=prompt, image_data=image_data)
|
||||
obj = EmbeddingReqInput(
|
||||
text=prompt,
|
||||
image_data=image_data,
|
||||
audio_data=audio_data,
|
||||
video_data=video_data,
|
||||
)
|
||||
loop = asyncio.get_event_loop()
|
||||
generator = self.tokenizer_manager.generate_request(obj, None)
|
||||
ret = loop.run_until_complete(generator.__anext__())
|
||||
@@ -316,7 +313,9 @@ class Engine(EngineBase):
|
||||
async def async_encode(
|
||||
self,
|
||||
prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
|
||||
image_data: Optional[Union[List[str], str]] = None,
|
||||
image_data: Optional[MultimodalDataInputFormat] = None,
|
||||
audio_data: Optional[MultimodalDataInputFormat] = None,
|
||||
video_data: Optional[MultimodalDataInputFormat] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Asynchronous version of encode method.
|
||||
@@ -324,7 +323,12 @@ class Engine(EngineBase):
|
||||
The arguments of this function are the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
|
||||
Please refer to `EmbeddingReqInput` for the documentation.
|
||||
"""
|
||||
obj = EmbeddingReqInput(text=prompt, image_data=image_data)
|
||||
obj = EmbeddingReqInput(
|
||||
text=prompt,
|
||||
image_data=image_data,
|
||||
audio_data=audio_data,
|
||||
video_data=video_data,
|
||||
)
|
||||
generator = self.tokenizer_manager.generate_request(obj, None)
|
||||
return await generator.__anext__()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user