vlm: support video as an input modality (#5888)

This commit is contained in:
Mick
2025-07-10 14:48:35 +08:00
committed by GitHub
parent 4ed57807c2
commit b5e3d6031c
42 changed files with 887 additions and 524 deletions

View File

@@ -267,6 +267,10 @@ class ChatCompletionMessageContentImageURL(BaseModel):
detail: Optional[Literal["auto", "low", "high"]] = "auto"
# Payload model for a "video_url" content part: wraps the video's location.
class ChatCompletionMessageContentVideoURL(BaseModel):
# Location of the video, as a plain string.
# NOTE(review): accepted forms (http URL, data URI, local path) are not visible here — confirm.
url: str
# Payload model for an "audio_url" content part: wraps the audio's location.
class ChatCompletionMessageContentAudioURL(BaseModel):
# Location of the audio, as a plain string.
# NOTE(review): accepted forms (http URL, data URI, local path) are not visible here — confirm.
url: str
@@ -277,6 +281,11 @@ class ChatCompletionMessageContentImagePart(BaseModel):
modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
# One element of a chat message's content list that references a video.
class ChatCompletionMessageContentVideoPart(BaseModel):
# Discriminator: the only permitted value is the literal "video_url".
type: Literal["video_url"]
# Nested object carrying the video's `url` string.
video_url: ChatCompletionMessageContentVideoURL
# One element of a chat message's content list that references an audio clip.
class ChatCompletionMessageContentAudioPart(BaseModel):
# Discriminator: the only permitted value is the literal "audio_url".
type: Literal["audio_url"]
# Nested object carrying the audio's `url` string.
audio_url: ChatCompletionMessageContentAudioURL
@@ -285,6 +294,7 @@ class ChatCompletionMessageContentAudioPart(BaseModel):
# Type alias for any single content part of a chat message:
# a text, image, video, or audio part.
ChatCompletionMessageContentPart = Union[
ChatCompletionMessageContentTextPart,
ChatCompletionMessageContentImagePart,
ChatCompletionMessageContentVideoPart,
ChatCompletionMessageContentAudioPart,
]
@@ -629,6 +639,7 @@ class MessageProcessingResult:
prompt_ids: Union[str, List[int]]
image_data: Optional[Any]
audio_data: Optional[Any]
video_data: Optional[Any]
modalities: List[str]
stop: List[str]
tool_call_constraint: Optional[Any] = None

View File

@@ -82,6 +82,7 @@ class OpenAIServingChat(OpenAIServingBase):
adapted_request = GenerateReqInput(
**prompt_kwargs,
image_data=processed_messages.image_data,
video_data=processed_messages.video_data,
audio_data=processed_messages.audio_data,
sampling_params=sampling_params,
return_logprob=request.logprobs,
@@ -143,6 +144,7 @@ class OpenAIServingChat(OpenAIServingBase):
prompt_ids = []
openai_compatible_messages = []
image_data = []
video_data = []
audio_data = []
modalities = []
@@ -158,6 +160,7 @@ class OpenAIServingChat(OpenAIServingBase):
msg_dict,
template_content_format,
image_data,
video_data,
audio_data,
modalities,
)
@@ -214,11 +217,13 @@ class OpenAIServingChat(OpenAIServingBase):
stop = request.stop
image_data = image_data if image_data else None
audio_data = audio_data if audio_data else None
video_data = video_data if video_data else None
modalities = modalities if modalities else []
return MessageProcessingResult(
prompt=prompt,
prompt_ids=prompt_ids,
image_data=image_data,
video_data=video_data,
audio_data=audio_data,
modalities=modalities,
stop=stop,
@@ -260,6 +265,7 @@ class OpenAIServingChat(OpenAIServingBase):
prompt = conv.get_prompt()
image_data = conv.image_data if conv.image_data else None
video_data = conv.video_data if conv.video_data else None
audio_data = conv.audio_data if conv.audio_data else None
modalities = conv.modalities if conv.modalities else []
stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
@@ -277,6 +283,7 @@ class OpenAIServingChat(OpenAIServingBase):
prompt=prompt,
prompt_ids=prompt_ids,
image_data=image_data,
video_data=video_data,
audio_data=audio_data,
modalities=modalities,
stop=stop,