feat: update multimodal data handling in engine entrypoint (#8002)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
Xinyuan Tong
2025-07-15 00:12:22 -07:00
committed by GitHub
parent c268c11c71
commit 6e923dbd30
3 changed files with 55 additions and 42 deletions

View File

@@ -8,7 +8,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from sglang.srt.managers.io_struct import (
EmbeddingReqInput,
GenerateReqInput,
-ImageDataItem,
+ImageDataInputItem,
)
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.models.vila import VILAForConditionalGeneration
@@ -42,7 +42,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
async def process_mm_data_async(
self,
-image_data: Optional[ImageDataItem | List[ImageDataItem]],
+image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
input_text: str | List[int],
request_obj: GenerateReqInput | EmbeddingReqInput,
max_req_input_len: int,