vlm: support video as an input modality (#5888)
This commit is contained in:
@@ -267,6 +267,10 @@ class ChatCompletionMessageContentImageURL(BaseModel):
|
||||
detail: Optional[Literal["auto", "low", "high"]] = "auto"
|
||||
|
||||
|
||||
class ChatCompletionMessageContentVideoURL(BaseModel):
    """Video reference carried inside a ``video_url`` chat content part.

    Mirrors the shape of the audio URL model in this module: a single
    required string field.
    """

    # Location of the video. NOTE(review): presumably an http(s) URL, a
    # local path, or a data URI -- the accepted schemes are decided by the
    # downstream loader, which is not visible here; confirm before relying
    # on any particular form.
    url: str
|
||||
|
||||
|
||||
class ChatCompletionMessageContentAudioURL(BaseModel):
|
||||
url: str
|
||||
|
||||
@@ -277,6 +281,11 @@ class ChatCompletionMessageContentImagePart(BaseModel):
|
||||
modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
|
||||
|
||||
|
||||
class ChatCompletionMessageContentVideoPart(BaseModel):
    """Chat message content part that references a video.

    The ``type`` field is pinned to the literal ``"video_url"``, which is
    what distinguishes this part from the other content-part models when a
    request body is validated.
    """

    # Discriminating tag; only the exact string "video_url" is accepted.
    type: Literal["video_url"]
    # Payload holding the video's URL.
    video_url: ChatCompletionMessageContentVideoURL
|
||||
|
||||
|
||||
class ChatCompletionMessageContentAudioPart(BaseModel):
|
||||
type: Literal["audio_url"]
|
||||
audio_url: ChatCompletionMessageContentAudioURL
|
||||
@@ -285,6 +294,7 @@ class ChatCompletionMessageContentAudioPart(BaseModel):
|
||||
# Any single content part of a chat message: text, image, video, or audio.
# Member order is kept exactly as declared, because validators may try the
# union members in sequence.
ChatCompletionMessageContentPart = Union[
    ChatCompletionMessageContentTextPart,
    ChatCompletionMessageContentImagePart,
    ChatCompletionMessageContentVideoPart,
    ChatCompletionMessageContentAudioPart,
]
|
||||
|
||||
@@ -629,6 +639,7 @@ class MessageProcessingResult:
|
||||
prompt_ids: Union[str, List[int]]
|
||||
image_data: Optional[Any]
|
||||
audio_data: Optional[Any]
|
||||
video_data: Optional[Any]
|
||||
modalities: List[str]
|
||||
stop: List[str]
|
||||
tool_call_constraint: Optional[Any] = None
|
||||
|
||||
@@ -82,6 +82,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
adapted_request = GenerateReqInput(
|
||||
**prompt_kwargs,
|
||||
image_data=processed_messages.image_data,
|
||||
video_data=processed_messages.video_data,
|
||||
audio_data=processed_messages.audio_data,
|
||||
sampling_params=sampling_params,
|
||||
return_logprob=request.logprobs,
|
||||
@@ -143,6 +144,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
prompt_ids = []
|
||||
openai_compatible_messages = []
|
||||
image_data = []
|
||||
video_data = []
|
||||
audio_data = []
|
||||
modalities = []
|
||||
|
||||
@@ -158,6 +160,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
msg_dict,
|
||||
template_content_format,
|
||||
image_data,
|
||||
video_data,
|
||||
audio_data,
|
||||
modalities,
|
||||
)
|
||||
@@ -214,11 +217,13 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
stop = request.stop
|
||||
image_data = image_data if image_data else None
|
||||
audio_data = audio_data if audio_data else None
|
||||
video_data = video_data if video_data else None
|
||||
modalities = modalities if modalities else []
|
||||
return MessageProcessingResult(
|
||||
prompt=prompt,
|
||||
prompt_ids=prompt_ids,
|
||||
image_data=image_data,
|
||||
video_data=video_data,
|
||||
audio_data=audio_data,
|
||||
modalities=modalities,
|
||||
stop=stop,
|
||||
@@ -260,6 +265,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
prompt = conv.get_prompt()
|
||||
|
||||
image_data = conv.image_data if conv.image_data else None
|
||||
video_data = conv.video_data if conv.video_data else None
|
||||
audio_data = conv.audio_data if conv.audio_data else None
|
||||
modalities = conv.modalities if conv.modalities else []
|
||||
stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
|
||||
@@ -277,6 +283,7 @@ class OpenAIServingChat(OpenAIServingBase):
|
||||
prompt=prompt,
|
||||
prompt_ids=prompt_ids,
|
||||
image_data=image_data,
|
||||
video_data=video_data,
|
||||
audio_data=audio_data,
|
||||
modalities=modalities,
|
||||
stop=stop,
|
||||
|
||||
Reference in New Issue
Block a user