# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json import warnings from abc import ABC, abstractmethod from collections import Counter, defaultdict from collections.abc import Awaitable, Callable, Iterable from functools import cached_property, lru_cache, partial from itertools import accumulate from pathlib import Path from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast from openai.types.chat import ( ChatCompletionAssistantMessageParam, ChatCompletionContentPartImageParam, ChatCompletionContentPartInputAudioParam, ChatCompletionContentPartRefusalParam, ChatCompletionContentPartTextParam, ChatCompletionFunctionToolParam, ChatCompletionMessageToolCallParam, ChatCompletionToolMessageParam, ) from openai.types.chat import ( ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam, ) from openai.types.chat import ( ChatCompletionMessageParam as OpenAIChatCompletionMessageParam, ) from openai.types.chat.chat_completion_content_part_input_audio_param import InputAudio from openai.types.responses import ResponseInputImageParam from openai_harmony import Message as OpenAIHarmonyMessage from PIL import Image from pydantic import BaseModel, ConfigDict, TypeAdapter # pydantic needs the TypedDict from typing_extensions from typing_extensions import Required, TypedDict from vllm import envs from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.inputs import ( MultiModalBatchedField, MultiModalFlatField, MultiModalSharedField, VisionChunk, VisionChunkImage, VisionChunkVideo, ) from vllm.multimodal.media import MEDIA_CONNECTOR_REGISTRY, MediaConnector from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.utils import random_uuid from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import LazyLoader if TYPE_CHECKING: import torch import transformers else: transformers = LazyLoader("transformers", globals(), "transformers") torch = LazyLoader("torch", globals(), "torch") logger = init_logger(__name__) def __getattr__(name: str): if name == "resolve_hf_chat_template": from vllm.renderers.hf import resolve_chat_template warnings.warn( "`vllm.entrypoints.chat_utils.resolve_hf_chat_template` has been moved to " "`vllm.renderers.hf.resolve_chat_template`. " "The old name will be removed in v0.16.", DeprecationWarning, stacklevel=2, ) return resolve_chat_template raise AttributeError(f"module {__name__!r} has no attribute {name!r}") class ChatTemplateResolutionError(ValueError): """Raised when chat template resolution fails. This is a subclass of ValueError for backward compatibility with existing exception handlers. """ MODALITY_PLACEHOLDERS_MAP = { "image": "<##IMAGE##>", "audio": "<##AUDIO##>", "video": "<##VIDEO##>", } class AudioURL(TypedDict, total=False): url: Required[str] """ Either a URL of the audio or a data URL with base64 encoded audio data. """ class ChatCompletionContentPartAudioParam(TypedDict, total=False): audio_url: Required[AudioURL] type: Required[Literal["audio_url"]] """The type of the content part.""" class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): image_embeds: str | dict[str, str] | None """ The image embeddings. It can be either: - A single base64 string. - A dictionary where each value is a base64 string. """ type: Required[Literal["image_embeds"]] """The type of the content part.""" uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False): audio_embeds: str | dict[str, str] | None """ The audio embeddings. It can be either: - A single base64 string representing a serialized torch tensor. - A dictionary where each value is a base64 string. """ type: Required[Literal["audio_embeds"]] """The type of the content part.""" uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class VideoURL(TypedDict, total=False): url: Required[str] """ Either a URL of the video or a data URL with base64 encoded video data. """ class ChatCompletionContentPartVideoParam(TypedDict, total=False): video_url: Required[VideoURL] type: Required[Literal["video_url"]] """The type of the content part.""" class PILImage(BaseModel): """ A PIL.Image.Image object. """ image_pil: Image.Image model_config = ConfigDict(arbitrary_types_allowed=True) class CustomChatCompletionContentPILImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a PIL image. Example: { "image_pil": ImageAsset('cherry_blossom').pil_image } """ image_pil: PILImage | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. Example: { "image_url": "https://example.com/image.jpg" } """ image_url: str | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. Example: { "audio_url": "https://example.com/audio.mp3" } """ audio_url: str | None class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. Example: { "video_url": "https://example.com/video.mp4" } """ video_url: str | None uuid: str | None """ User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias. """ class CustomThinkCompletionContentParam(TypedDict, total=False): """A Think Completion Content Param that accepts a plain text and a boolean. Example: { "thinking": "I am thinking about the answer", "closed": True, "type": "thinking" } """ thinking: Required[str] """The thinking content.""" closed: bool """Whether the thinking is closed.""" type: Required[Literal["thinking"]] """The thinking type.""" ChatCompletionContentPartParam: TypeAlias = ( OpenAIChatCompletionContentPartParam | ChatCompletionContentPartAudioParam | ChatCompletionContentPartInputAudioParam | ChatCompletionContentPartVideoParam | ChatCompletionContentPartRefusalParam | CustomChatCompletionContentPILImageParam | CustomChatCompletionContentSimpleImageParam | ChatCompletionContentPartImageEmbedsParam | ChatCompletionContentPartAudioEmbedsParam | CustomChatCompletionContentSimpleAudioParam | CustomChatCompletionContentSimpleVideoParam | str | CustomThinkCompletionContentParam ) class CustomChatCompletionMessageParam(TypedDict, total=False): """Enables custom roles in the Chat Completion API.""" role: Required[str] """The role of the message's author.""" content: str | list[ChatCompletionContentPartParam] """The contents of the message.""" name: str """An optional name for the participant. Provides the model information to differentiate between participants of the same role. """ tool_call_id: str | None """Tool call that this message is responding to.""" tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" reasoning: str | None """The reasoning content for interleaved thinking.""" tools: list[ChatCompletionFunctionToolParam] | None """The tools for developer role.""" ChatCompletionMessageParam: TypeAlias = ( OpenAIChatCompletionMessageParam | CustomChatCompletionMessageParam | OpenAIHarmonyMessage ) # TODO: Make fields ReadOnly once mypy supports it class ConversationMessage(TypedDict, total=False): role: Required[str] """The role of the message's author.""" content: str | None | list[dict[str, str]] """The contents of the message""" tool_call_id: str | None """Tool call that this message is responding to.""" name: str | None """The name of the function to call""" tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" reasoning: str | None """The reasoning content for interleaved thinking.""" reasoning_content: str | None """Deprecated: The reasoning content for interleaved thinking.""" tools: list[ChatCompletionFunctionToolParam] | None """The tools for developer role.""" # Passed in by user ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] # After resolving "auto" ChatTemplateContentFormat = Literal["string", "openai"] ModalityStr = Literal[ "image", "audio", "video", "image_embeds", "audio_embeds", "vision_chunk" ] _T = TypeVar("_T") # Backward compatibility for single item input class _BatchedSingleItemField(MultiModalSharedField): pass def _detect_field( tensors: list[torch.Tensor], mm_processor: BaseMultiModalProcessor, ): first_item = tensors[0] hidden_size = mm_processor.info.ctx.model_config.get_inputs_embeds_size() if ( len(tensors) == 1 and first_item.ndim == 3 and first_item.shape[0] == 1 and first_item.shape[-1] == hidden_size ): logger.warning( "Batched multi-modal embedding inputs are deprecated for Chat API. " "Please pass a separate content part for each multi-modal item." ) return _BatchedSingleItemField(batch_size=1) first_shape = first_item.shape if all(t.shape == first_shape for t in tensors): return MultiModalBatchedField() size_per_item = [len(tensor) for tensor in tensors] slice_idxs = [0, *accumulate(size_per_item)] slices = [ (slice(slice_idxs[i], slice_idxs[i + 1]),) for i in range(len(size_per_item)) ] return MultiModalFlatField(slices=slices) def _merge_embeds( data_items: list[dict[str, "torch.Tensor"]], mm_processor: BaseMultiModalProcessor, ): if not data_items: return {} first_keys = set(data_items[0].keys()) if any(set(item.keys()) != first_keys for item in data_items[1:]): raise ValueError( "All dictionaries in the list of embeddings must have the same keys." ) fields = { key: _detect_field([item[key] for item in data_items], mm_processor) for key in first_keys } data_merged = { key: field._reduce_data([item[key] for item in data_items], pin_memory=False) for key, field in fields.items() } try: # TODO: Support per-request mm_processor_kwargs parsed_configs = mm_processor._get_mm_fields_config( transformers.BatchFeature(data_merged), {}, ) parsed_fields = {key: parsed_configs[key].field for key in first_keys} keys_to_update = [ key for key in first_keys if ( fields[key] != parsed_fields[key] and not isinstance(fields[key], _BatchedSingleItemField) ) ] for key in keys_to_update: data_merged[key] = parsed_fields[key]._reduce_data( [item[key] for item in data_items], pin_memory=False ) except Exception: logger.exception( "Error when parsing merged embeddings. " "Falling back to auto-detected fields." ) return data_merged def _get_embeds_data( modality: str, data_items: list[Any], mm_processor: BaseMultiModalProcessor, ): if len(data_items) == 0: return data_items if all(item is None for item in data_items): return data_items if is_list_of(data_items, torch.Tensor): embeds_key = f"{modality}_embeds" dict_items = [{embeds_key: item} for item in data_items] return _merge_embeds(dict_items, mm_processor)[embeds_key] if is_list_of(data_items, dict): return _merge_embeds(data_items, mm_processor) raise NotImplementedError(type(data_items)) class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number of multi-modal items in a given request does not exceed the configured maximum per prompt. """ def __init__(self, model_config: ModelConfig): super().__init__() self._model_config = model_config self._items_by_modality = defaultdict[str, list[_T]](list) # Track original modality for each vision_chunk item (image or video) self._modality_order = defaultdict[str, list[str]](list) @cached_property def use_unified_vision_chunk_modality(self) -> bool: """Check if model uses unified vision_chunk modality for images/videos.""" return getattr(self._model_config.hf_config, "use_unified_vision_chunk", False) @property def model_config(self) -> ModelConfig: return self._model_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls model_cls = get_model_cls(self.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @property def allowed_media_domains(self): return self._model_config.allowed_media_domains @property def mm_registry(self): return MULTIMODAL_REGISTRY @cached_property def mm_processor(self): return self.mm_registry.create_processor(self.model_config) def add(self, modality: ModalityStr, item: _T) -> str | None: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. An optional uuid can be added which serves as a unique identifier of the media. """ input_modality = modality.replace("_embeds", "") original_modality = modality use_vision_chunk = ( self.use_unified_vision_chunk_modality and original_modality in ["video", "image"] ) # If use_unified_vision_chunk_modality is enabled, # map image/video to vision_chunk if use_vision_chunk: # To avoid validation fail # because models with use_unified_vision_chunk_modality=True # will only accept vision_chunk modality. input_modality = "vision_chunk" num_items = len(self._items_by_modality[input_modality]) + 1 else: num_items = len(self._items_by_modality[original_modality]) + 1 mm_config = self.model_config.multimodal_config if ( mm_config is not None and mm_config.enable_mm_embeds and mm_config.get_limit_per_prompt(input_modality) == 0 and original_modality.endswith("_embeds") ): # Skip validation: embeddings bypass limit when enable_mm_embeds=True pass else: self.mm_processor.info.validate_num_items(input_modality, num_items) # Track original modality for vision_chunk items if use_vision_chunk: self._items_by_modality[input_modality].append(item) # type: ignore self._modality_order["vision_chunk"].append(original_modality) else: self._items_by_modality[original_modality].append(item) return self.model_cls.get_placeholder_str(modality, num_items) @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError def _resolve_vision_chunk_items( vision_chunk_items: list[tuple[object, str | None]], mm_processor: BaseMultiModalProcessor, vision_chunks_modality_order: list[str], ): # Process vision_chunk items - extract from (data, modality) tuples # and convert to VisionChunk types with proper UUID handling vision_chunks_uuids = [uuid for data, uuid in vision_chunk_items] assert len(vision_chunk_items) == len(vision_chunks_modality_order), ( f"vision_chunk items ({len(vision_chunk_items)}) and " f"modality_order ({len(vision_chunks_modality_order)}) must have same length" ) processed_chunks: list[VisionChunk] = [] video_idx = 0 for inner_modality, (data, uuid) in zip( vision_chunks_modality_order, vision_chunk_items ): if inner_modality == "image": # Cast data to proper type for image # Use .media (PIL.Image) directly to avoid redundant # bytes→PIL conversion in media_processor if hasattr(data, "media"): image_data = data.media # type: ignore[union-attr] processed_chunks.append( VisionChunkImage(type="image", image=image_data, uuid=uuid) ) else: processed_chunks.append(data) # type: ignore[arg-type] elif inner_modality == "video": # For video, we may need to split into chunks # if processor supports it # For now, just wrap as a video chunk placeholder if hasattr(mm_processor, "split_video_chunks") and data is not None: try: video_uuid = uuid or random_uuid() # video await result is (video_data, video_meta) tuple if isinstance(data, tuple) and len(data) >= 1: video_data = data[0] else: video_data = data video_chunks = mm_processor.split_video_chunks(video_data) for i, vc in enumerate(video_chunks): processed_chunks.append( VisionChunkVideo( type="video_chunk", video_chunk=vc["video_chunk"], uuid=f"{video_uuid}-{i}", video_idx=video_idx, prompt=vc["prompt"], ) ) video_idx += 1 except Exception as e: logger.warning("Failed to split video chunks: %s", e) processed_chunks.append(data) # type: ignore[arg-type] else: processed_chunks.append(data) # type: ignore[arg-type] return processed_chunks, vision_chunks_uuids def _resolve_items( items_by_modality: dict[str, list[tuple[object, str | None]]], mm_processor: BaseMultiModalProcessor, modality_order: dict[str, list[str]], ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]: if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in items_by_modality and "audio_embeds" in items_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") mm_data = {} mm_uuids = {} if "image_embeds" in items_by_modality: mm_data["image"] = _get_embeds_data( "image", [data for data, uuid in items_by_modality["image_embeds"]], mm_processor, ) mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image_embeds"]] if "image" in items_by_modality: mm_data["image"] = [data for data, uuid in items_by_modality["image"]] mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image"]] if "audio_embeds" in items_by_modality: mm_data["audio"] = _get_embeds_data( "audio", [data for data, uuid in items_by_modality["audio_embeds"]], mm_processor, ) mm_uuids["audio"] = [uuid for data, uuid in items_by_modality["audio_embeds"]] if "audio" in items_by_modality: mm_data["audio"] = [data for data, uuid in items_by_modality["audio"]] mm_uuids["audio"] = [uuid for data, uuid in items_by_modality["audio"]] if "video" in items_by_modality: mm_data["video"] = [data for data, uuid in items_by_modality["video"]] mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]] if "vision_chunk" in items_by_modality: # Process vision_chunk items - extract from (data, modality) tuples # and convert to VisionChunk types with proper UUID handling processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items( items_by_modality["vision_chunk"], mm_processor, modality_order.get("vision_chunk", []), ) mm_data["vision_chunk"] = processed_chunks mm_uuids["vision_chunk"] = vision_chunk_uuids return mm_data, mm_uuids class MultiModalItemTracker(BaseMultiModalItemTracker[tuple[object, str | None]]): def resolve_items( self, ) -> tuple[MultiModalDataDict | None, MultiModalUUIDDict | None]: if not self._items_by_modality: return None, None return _resolve_items( dict(self._items_by_modality), self.mm_processor, self._modality_order ) def create_parser(self) -> "BaseMultiModalContentParser": return MultiModalContentParser(self) class AsyncMultiModalItemTracker( BaseMultiModalItemTracker[Awaitable[tuple[object, str | None]]] ): async def resolve_items( self, ) -> tuple[MultiModalDataDict | None, MultiModalUUIDDict | None]: if not self._items_by_modality: return None, None resolved_items_by_modality = { modality: await asyncio.gather(*coros) for modality, coros in self._items_by_modality.items() } return _resolve_items( resolved_items_by_modality, self.mm_processor, self._modality_order ) def create_parser(self) -> "BaseMultiModalContentParser": return AsyncMultiModalContentParser(self) class BaseMultiModalContentParser(ABC): def __init__(self) -> None: super().__init__() # stores model placeholders list with corresponding # general MM placeholder: # { # "<##IMAGE##>": ["", "", ""], # "<##AUDIO##>": ["