Iluvatar-mrv100 SDK 4.3.0

This commit is contained in:
2025-09-15 14:58:11 +08:00
parent 9efe891f99
commit 8af8290b1d
1052 changed files with 294967 additions and 1 deletions

39
vllm/inputs/__init__.py Normal file
View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
from .data import (DecoderOnlyInputs, EncoderDecoderInputs,
ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
SingletonInputs, SingletonInputsAdapter, SingletonPrompt,
TextPrompt, TokenInputs, TokensPrompt,
build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
token_inputs, zip_enc_dec_prompts)
from .registry import (DummyData, InputContext, InputProcessingContext,
InputRegistry)
INPUT_REGISTRY = InputRegistry()
"""
The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
to dispatch data processing according to the target model.
"""

# Names re-exported as the public API of ``vllm.inputs``.
__all__ = [
    "TextPrompt",
    "TokensPrompt",
    "PromptType",
    "SingletonPrompt",
    "ExplicitEncoderDecoderPrompt",
    "TokenInputs",
    "token_inputs",
    "DecoderOnlyInputs",
    "EncoderDecoderInputs",
    "ProcessorInputs",
    "SingletonInputs",
    "SingletonInputsAdapter",
    "build_explicit_enc_dec_prompt",
    "to_enc_dec_tuple_list",
    "zip_enc_dec_prompts",
    "INPUT_REGISTRY",
    "DummyData",
    "InputContext",
    "InputProcessingContext",
    "InputRegistry",
]

405
vllm/inputs/data.py Normal file
View File

@@ -0,0 +1,405 @@
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Iterable
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
import torch
from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
if TYPE_CHECKING:
from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict)
from vllm.multimodal.inputs import MultiModalInputs
class TextPrompt(TypedDict):
    """Schema for a text prompt.

    The text is tokenized downstream before being passed to the model.
    """

    prompt: str
    """The input text to be tokenized before passing to the model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """
class TokensPrompt(TypedDict):
    """Schema for a tokenized prompt.

    The token IDs are used as-is, skipping tokenization.
    """

    prompt_token_ids: list[int]
    """A list of token IDs to pass to the model."""

    token_type_ids: NotRequired[list[int]]
    """A list of token type IDs to pass to the cross encoder model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """
SingletonPrompt = Union[str, TextPrompt, TokensPrompt]
"""
Set of possible schemas for a single prompt:
- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
Note that "singleton" is as opposed to a data structure
which encapsulates multiple prompts, i.e. of the sort
which may be utilized for encoder/decoder models when
the user desires to express both the encoder & decoder
prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
A prompt of type :class:`SingletonPrompt` may be employed
as (1) input to a decoder-only model, (2) input to
the encoder of an encoder/decoder model, in the scenario
where the decoder-prompt is not specified explicitly, or
(3) as a member of a larger data structure encapsulating
more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
"""
_T1_co = TypeVar("_T1_co",
bound=SingletonPrompt,
default=SingletonPrompt,
covariant=True)
_T2_co = TypeVar("_T2_co",
bound=SingletonPrompt,
default=SingletonPrompt,
covariant=True)
# TODO: Make fields ReadOnly once mypy supports it
class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
    """
    Represents an encoder/decoder model input prompt,
    comprising an explicit encoder prompt and a decoder prompt.

    The encoder and decoder prompts, respectively, may be formatted
    according to any of the :class:`SingletonPrompt` schemas,
    and are not required to have the same schema.

    Only the encoder prompt may have multi-modal data. mm_processor_kwargs
    should be at the top-level, and should not be set in the encoder/decoder
    prompts, since they are agnostic to the encoder/decoder.

    Note that an :class:`ExplicitEncoderDecoderPrompt` may not
    be used as an input to a decoder-only model,
    and that the :code:`encoder_prompt` and :code:`decoder_prompt`
    fields of this data structure themselves must be
    :class:`SingletonPrompt` instances.
    """

    # Prompt consumed by the encoder.
    encoder_prompt: _T1_co

    # Prompt consumed by the decoder; may be None (a default is derived
    # downstream during preprocessing).
    decoder_prompt: Optional[_T2_co]

    # Shared multi-modal processor kwargs for the whole prompt.
    mm_processor_kwargs: NotRequired[dict[str, Any]]
PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
"""
Set of possible schemas for an LLM input, including
both decoder-only and encoder/decoder input types:
- A text prompt (:class:`str` or :class:`TextPrompt`)
- A tokenized prompt (:class:`TokensPrompt`)
- A single data structure containing both an encoder and a decoder prompt
(:class:`ExplicitEncoderDecoderPrompt`)
"""
class TokenInputs(TypedDict):
    """Represents token-based inputs.

    This is the processed form produced by :func:`token_inputs`; optional
    keys are absent (rather than None) when not provided.
    """

    type: Literal["token"]
    """The type of inputs."""

    prompt_token_ids: list[int]
    """The token IDs of the prompt."""

    token_type_ids: NotRequired[list[int]]
    """The token type IDs of the prompt."""

    prompt: NotRequired[str]
    """
    The original prompt text corresponding to the token IDs, if available.
    """

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    multi_modal_inputs: NotRequired["MultiModalKwargs"]
    """
    Optional multi-modal inputs to pass to the model,
    if the model supports it.
    """

    multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
    """
    Placeholder ranges for the multi-modal data.
    """

    multi_modal_hashes: NotRequired[list[str]]
    """
    The hashes of the multi-modal data.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """
def token_inputs(
    prompt_token_ids: list[int],
    token_type_ids: Optional[list[int]] = None,
    prompt: Optional[str] = None,
    multi_modal_data: Optional["MultiModalDataDict"] = None,
    multi_modal_inputs: Optional["MultiModalKwargs"] = None,
    multi_modal_hashes: Optional[list[str]] = None,
    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> TokenInputs:
    """Construct :class:`TokenInputs` from optional values.

    Optional fields left as ``None`` are omitted entirely, so the
    corresponding ``NotRequired`` keys stay absent from the result.
    """
    inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)

    # Materialize only the optional fields that were actually supplied.
    optional_fields = (
        ("prompt", prompt),
        ("token_type_ids", token_type_ids),
        ("multi_modal_data", multi_modal_data),
        ("multi_modal_inputs", multi_modal_inputs),
        ("multi_modal_hashes", multi_modal_hashes),
        ("multi_modal_placeholders", multi_modal_placeholders),
        ("mm_processor_kwargs", mm_processor_kwargs),
    )
    for key, value in optional_fields:
        if value is not None:
            inputs[key] = value  # type: ignore[literal-required]

    return inputs
DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
"""
The inputs in :class:`~vllm.LLMEngine` before they are
passed to the model executor.

This specifies the data required for decoder-only models.
"""


class EncoderDecoderInputs(TypedDict):
    """
    The inputs in :class:`~vllm.LLMEngine` before they are
    passed to the model executor.

    This specifies the required data for encoder-decoder models.
    """

    encoder: Union[TokenInputs, "MultiModalInputs"]
    """The inputs for the encoder portion."""

    decoder: Union[TokenInputs, "MultiModalInputs"]
    """The inputs for the decoder portion."""


SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
"""
A processed :class:`SingletonPrompt` which can be passed to
:class:`vllm.sequence.Sequence`.
"""
@dataclass
class SingletonInputsAdapter:
    """
    Unified interface to access the components of :class:`SingletonInputs`,
    regardless of whether they are token-based or multimodal.
    """

    inputs: SingletonInputs

    @cached_property
    def prompt(self) -> Optional[str]:
        data = self.inputs
        if data["type"] in ("token", "multimodal"):
            return data.get("prompt")

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def prompt_token_ids(self) -> list[int]:
        data = self.inputs
        if data["type"] in ("token", "multimodal"):
            return data.get("prompt_token_ids", [])

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def token_type_ids(self) -> list[int]:
        data = self.inputs
        if data["type"] in ("token", "multimodal"):
            return data.get("token_type_ids", [])

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def prompt_embeds(self) -> Optional[torch.Tensor]:
        data = self.inputs
        # Neither input flavour carries prompt embeddings.
        if data["type"] in ("token", "multimodal"):
            return None

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def multi_modal_data(self) -> "MultiModalDataDict":
        data = self.inputs
        if data["type"] == "token":
            return data.get("multi_modal_data", {})

        if data["type"] == "multimodal":
            return data.get("mm_kwargs", {})

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]:
        data = self.inputs
        if data["type"] == "token":
            return data.get("multi_modal_inputs", {})

        if data["type"] == "multimodal":
            return data.get("mm_kwargs", {})

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def multi_modal_hashes(self) -> list[str]:
        data = self.inputs
        if data["type"] == "token":
            return data.get("multi_modal_hashes", [])

        if data["type"] == "multimodal":
            # only the case when we use MultiModalInputs
            return data.get("mm_hashes", [])  # type: ignore[return-value]

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
        data = self.inputs
        if data["type"] == "token":
            return data.get("multi_modal_placeholders", {})

        if data["type"] == "multimodal":
            return data.get("mm_placeholders", {})

        assert_never(data)  # type: ignore[arg-type]

    @cached_property
    def mm_processor_kwargs(self) -> dict[str, Any]:
        data = self.inputs
        if data["type"] == "token":
            return data.get("mm_processor_kwargs", {})

        if data["type"] == "multimodal":
            # Multimodal inputs have already been processed.
            return {}

        assert_never(data)  # type: ignore[arg-type]
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
"""
The inputs to :data:`vllm.inputs.InputProcessor`.
"""

# Invariant counterparts of _T1_co/_T2_co for use in function signatures.
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
_T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
def build_explicit_enc_dec_prompt(
    encoder_prompt: _T1,
    decoder_prompt: Optional[_T2],
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> ExplicitEncoderDecoderPrompt[_T1, _T2]:
    """Bundle an encoder and a decoder prompt into an explicit
    :class:`ExplicitEncoderDecoderPrompt`."""
    # An absent kwargs mapping normalizes to an empty dict.
    kwargs = {} if mm_processor_kwargs is None else mm_processor_kwargs
    return ExplicitEncoderDecoderPrompt(
        encoder_prompt=encoder_prompt,
        decoder_prompt=decoder_prompt,
        mm_processor_kwargs=kwargs)
def zip_enc_dec_prompts(
    enc_prompts: Iterable[_T1],
    dec_prompts: Iterable[Optional[_T2]],
    mm_processor_kwargs: Optional[Union[Iterable[dict[str, Any]],
                                        dict[str, Any]]] = None,
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
    """
    Zip encoder and decoder prompts together into a list of
    :class:`ExplicitEncoderDecoderPrompt` instances.

    ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
    dictionary will be used for every encoder/decoder prompt. If an iterable is
    provided, it will be zipped with the encoder/decoder prompts.
    """
    if mm_processor_kwargs is None:
        mm_processor_kwargs = cast(dict[str, Any], {})

    if isinstance(mm_processor_kwargs, dict):
        # One shared kwargs dict for every prompt pair.
        shared_kwargs = cast(dict[str, Any], mm_processor_kwargs)
        return [
            build_explicit_enc_dec_prompt(enc, dec, shared_kwargs)
            for enc, dec in zip(enc_prompts, dec_prompts)
        ]

    # Per-pair kwargs: zip them alongside the prompts.
    return [
        build_explicit_enc_dec_prompt(enc, dec, pair_kwargs)
        for enc, dec, pair_kwargs in zip(enc_prompts, dec_prompts,
                                         mm_processor_kwargs)
    ]
def to_enc_dec_tuple_list(
    enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]],
) -> list[tuple[_T1, Optional[_T2]]]:
    """Flatten explicit prompts into (encoder_prompt, decoder_prompt) pairs."""
    pairs: list[tuple[_T1, Optional[_T2]]] = []
    for enc_dec_prompt in enc_dec_prompts:
        pairs.append((enc_dec_prompt["encoder_prompt"],
                      enc_dec_prompt["decoder_prompt"]))
    return pairs

121
vllm/inputs/parse.py Normal file
View File

@@ -0,0 +1,121 @@
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Sequence
from typing import Literal, Optional, TypedDict, Union, cast, overload
from typing_extensions import TypeIs
from vllm.utils import is_list_of
from .data import (ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
SingletonInputs, SingletonPrompt, TextPrompt, TokensPrompt)
class ParsedText(TypedDict):
    """A single text prompt produced by :func:`parse_and_batch_prompt`."""
    content: str
    is_tokens: Literal[False]


class ParsedTokens(TypedDict):
    """A single tokenized prompt produced by :func:`parse_and_batch_prompt`."""
    content: list[int]
    is_tokens: Literal[True]
@overload
def parse_and_batch_prompt(
        prompt: Union[str, list[str]]) -> Sequence[ParsedText]:
    ...


@overload
def parse_and_batch_prompt(
        prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]:
    ...


def parse_and_batch_prompt(
    prompt: Union[str, list[str], list[int], list[list[int]]],
) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]:
    """Normalize any accepted prompt form into a batch of parsed entries."""
    # Case 1: a bare string is a single text prompt.
    if isinstance(prompt, str):
        return [ParsedText(content=prompt, is_tokens=False)]

    if isinstance(prompt, list):
        if not prompt:
            raise ValueError("please provide at least one prompt")

        # Case 2: a batch of text prompts.
        if is_list_of(prompt, str):
            texts = cast(list[str], prompt)
            return [
                ParsedText(content=text, is_tokens=False) for text in texts
            ]

        # Case 3: a single tokenized prompt.
        if is_list_of(prompt, int):
            tokens = cast(list[int], prompt)
            return [ParsedTokens(content=tokens, is_tokens=True)]

        # Case 4: a batch of tokenized prompts.
        if is_list_of(prompt, list):
            token_batch = cast(list[list[int]], prompt)
            if not token_batch[0]:
                raise ValueError("please provide at least one prompt")

            if is_list_of(token_batch[0], int):
                return [
                    ParsedTokens(content=elem, is_tokens=True)
                    for elem in token_batch
                ]

    raise TypeError("prompt must be a string, array of strings, "
                    "array of tokens, or array of token arrays")
class ParsedStrPrompt(TypedDict):
    """Result of :func:`parse_singleton_prompt` for a raw string."""
    type: Literal["str"]
    content: str


class ParsedTextPrompt(TypedDict):
    """Result of :func:`parse_singleton_prompt` for a :class:`TextPrompt`."""
    type: Literal["text"]
    content: TextPrompt


class ParsedTokensPrompt(TypedDict):
    """Result of :func:`parse_singleton_prompt` for a :class:`TokensPrompt`."""
    type: Literal["tokens"]
    content: TokensPrompt
def parse_singleton_prompt(
    prompt: SingletonPrompt,
) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
    """Classify a singleton prompt as a raw string, a text dict, or a
    tokens dict; anything else is a :class:`TypeError`."""
    if isinstance(prompt, str):
        return ParsedStrPrompt(type="str", content=prompt)

    if isinstance(prompt, dict):
        # "prompt_token_ids" is checked first, so a dict carrying both keys
        # is treated as a tokens prompt.
        if "prompt_token_ids" in prompt:
            return ParsedTokensPrompt(type="tokens",
                                      content=prompt)  # type: ignore
        if "prompt" in prompt:
            return ParsedTextPrompt(type="text", content=prompt)

    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")
def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]:
    """True when the prompt is a dict carrying pre-tokenized input."""
    if not isinstance(prompt, dict):
        return False
    return "prompt_token_ids" in prompt
def is_explicit_encoder_decoder_prompt(
        prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
    """True when the prompt explicitly separates encoder and decoder parts."""
    if isinstance(prompt, dict):
        return "encoder_prompt" in prompt
    return False
def split_enc_dec_inputs(
    inputs: ProcessorInputs,
) -> tuple[Optional[SingletonInputs], SingletonInputs]:
    """Split processed inputs into (encoder, decoder); the encoder part is
    None for decoder-only inputs."""
    is_enc_dec = "encoder" in inputs and "decoder" in inputs
    if not is_enc_dec:
        return None, inputs

    # NOTE: This passes pyright but not mypy
    return (
        inputs["encoder"],  # type: ignore[typeddict-item]
        inputs["decoder"],  # type: ignore[typeddict-item]
    )

783
vllm/inputs/preprocess.py Normal file
View File

@@ -0,0 +1,783 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
from collections.abc import Mapping
from typing import Optional, Union, cast
from typing_extensions import assert_never
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
MultiModalInputs)
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
PromptType, SingletonInputs, SingletonPrompt, token_inputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
logger = init_logger(__name__)
class InputPreprocessor:
    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[BaseTokenizerGroup],
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ) -> None:
        """
        Args:
            model_config: Configuration of the target model.
            tokenizer: Tokenizer group, or ``None`` when tokenization is
                skipped (``skip_tokenizer_init``).
            mm_registry: Registry used to create multi-modal processors;
                defaults to the global registry.
        """
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer
        self.mm_registry = mm_registry
def get_tokenizer_group(self) -> BaseTokenizerGroup:
if self.tokenizer is None:
raise ValueError("You cannot pass text prompts when "
"`skip_tokenizer_init` is True")
return self.tokenizer
def get_bos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for BOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
def get_eos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for EOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
def get_decoder_start_token_id(self) -> Optional[int]:
'''
Obtain the decoder start token id employed by an encoder/decoder
model. Returns None for non-encoder/decoder models or if the
model config is unavailable.
'''
if not self.model_config.is_encoder_decoder:
logger.warning_once(
"Using None for decoder start token id because "
"this is not an encoder/decoder model.")
return None
if (self.model_config is None or self.model_config.hf_config is None):
logger.warning_once(
"Using None for decoder start token id because "
"model config is not available.")
return None
dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None)
if dec_start_token_id is None:
logger.warning_once(
"Falling back on <BOS> for decoder start token "
"id because decoder start token id is not "
"available.")
dec_start_token_id = self.get_bos_token_id()
return dec_start_token_id
def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
'''
Specifically for encoder/decoder models:
generate a default decoder prompt for when
the user specifies only the encoder prompt.
Encoder/decoder models utilize the decoder
prompt in different ways; as new models are
added, it is intended that this function
will be extended to produce differing
default decoder prompts, depending on the
model variety.
Absent a special case, the default behavior
of this method is to mirror the behavior of
the HuggingFace (HF) GenerationMixin for a None
decoder prompt, which is to employ a logit processor
setting to force the first decoded token to be <BOS>.
Here, this behavior is approximated by having the
"default" decoder prompt be <BOS>.
However, it is possible that in the future
other models may have different or more
complex logic for the default decoder prompt.
This motivates having a special helper method
for default decoder prompts.
Returns:
* prompt_token_ids
'''
bos_token_id = self.get_bos_token_id()
assert bos_token_id is not None
return [bos_token_id]
def _prepare_decoder_input_ids_for_generation(
self,
decoder_input_ids: Optional[list[int]],
) -> list[int]:
"""
Prepares `decoder_input_ids` for generation with encoder-decoder models.
Based on
https://github.com/huggingface/transformers/blob/
4037a2b5b1278736e566aec12e169100275545ea/
src/transformers/generation/utils.py
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
Arguments:
* decoder_input_ids: input token ids to preprocess
Returns:
* Processed token list
"""
decoder_start_token_id = self.get_decoder_start_token_id()
assert decoder_start_token_id is not None
if decoder_input_ids is None:
# no decoder prompt input ->
# use decoder_start_token_id as decoder_input_ids
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
if (len(decoder_input_ids) == 0
or decoder_input_ids[0] != decoder_start_token_id):
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
return decoder_input_ids
def _apply_prompt_adapter(
self,
prompt_token_ids: list[int],
prompt_adapter_request: Optional[PromptAdapterRequest],
) -> list[int]:
if prompt_adapter_request:
prompt_token_ids = (
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+ prompt_token_ids)
return prompt_token_ids
def _tokenize_prompt(
self,
prompt: str,
lora_request: Optional[LoRARequest],
) -> list[int]:
"""
Apply the model's tokenizer to a text prompt, returning the
corresponding token IDs.
"""
tokenizer = self.get_tokenizer_group()
add_special_tokens = None
if self.model_config.hf_config.model_type == "whisper":
# For Whisper, special tokens should be provided by the user based
# on the task and language of their request. Also needed to avoid
# appending an EOS token to the prompt which disrupts generation.
add_special_tokens = False
if (self.model_config.encoder_config is not None
and self.model_config.encoder_config.get(
"do_lower_case", False)):
prompt = prompt.lower()
return tokenizer.encode(prompt=prompt,
lora_request=lora_request,
add_special_tokens=add_special_tokens)
async def _tokenize_prompt_async(
self,
prompt: str,
lora_request: Optional[LoRARequest],
) -> list[int]:
"""Async version of :meth:`_tokenize_prompt`."""
tokenizer = self.get_tokenizer_group()
add_special_tokens = None
if self.model_config.hf_config.model_type == "whisper":
# For Whisper, special tokens should be provided by the user based
# on the task and language of their request. Also needed to avoid
# appending an EOS token to the prompt which disrupts generation.
add_special_tokens = False
return await tokenizer.encode_async(
prompt=prompt,
lora_request=lora_request,
add_special_tokens=add_special_tokens)
def _can_process_multimodal(self) -> bool:
model_config = self.model_config
if not model_config.is_multimodal_model:
raise ValueError("Your model does not support multi-modal inputs")
# Interim measure so we can handle models that have yet to be
# updated to use the new multi-modal processor
can_process_multimodal = self.mm_registry.has_processor(model_config)
if not can_process_multimodal:
from vllm.model_executor.models.registry import _VLLM_MODELS
if not any(arch in _VLLM_MODELS
for arch in model_config.architectures):
logger.warning_once(
"Your model uses the legacy input pipeline, which will be "
"removed in an upcoming release. "
"Please upgrade to the new multi-modal processing pipeline "
"(https://docs.vllm.ai/en/latest/design/mm_processing.html)"
)
return can_process_multimodal
def _process_multimodal(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata.
"""
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# initialized without a tokenizer while using also multi-modal
# input.
if not self.tokenizer:
tokenizer = object() # Dummy
else:
tokenizer_group = self.get_tokenizer_group()
tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
mm_processor = self.mm_registry.create_processor(self.model_config,
tokenizer=tokenizer)
if mm_processor_kwargs is None:
mm_processor_kwargs = {}
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
return_mm_hashes)
async def _process_multimodal_async(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
mm_processor_kwargs: Optional[Mapping[str, object]],
lora_request: Optional[LoRARequest],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""Async version of :meth:`_process_multimodal`."""
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# initialized without a tokenizer while using also multi-modal
# input.
if not self.tokenizer:
tokenizer = object() # Dummy
else:
tokenizer_group = self.get_tokenizer_group()
tokenizer = await tokenizer_group.get_lora_tokenizer_async(
lora_request)
mm_processor = self.mm_registry.create_processor(self.model_config,
tokenizer=tokenizer)
if mm_processor_kwargs is None:
mm_processor_kwargs = {}
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
return_mm_hashes)
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Extract the singleton inputs from a prompt.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        # Raw string prompt: tokenize and wrap as token inputs.
        if parsed["type"] == "str":
            prompt_text = parsed["content"]
            prompt_token_ids = self._tokenize_prompt(
                prompt_text,
                lora_request=lora_request,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        # Pre-tokenized prompt: no tokenization needed; multi-modal data
        # (if any) is routed through the multi-modal processor instead.
        if parsed["type"] == "tokens":
            tokens_content = parsed["content"]

            prompt_token_ids = tokens_content["prompt_token_ids"]
            token_type_ids = tokens_content.get("token_type_ids")
            multi_modal_data = tokens_content.get("multi_modal_data")
            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")

            if multi_modal_data is not None and self._can_process_multimodal():
                return self._process_multimodal(
                    prompt_token_ids,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            return token_inputs(
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
                multi_modal_data=multi_modal_data,
                mm_processor_kwargs=mm_processor_kwargs,
            )

        # Text prompt dict: multi-modal data (if any) goes through the
        # processor; otherwise the text is tokenized here.
        if parsed["type"] == "text":
            text_content = parsed["content"]

            prompt_text = text_content["prompt"]
            multi_modal_data = text_content.get("multi_modal_data")
            mm_processor_kwargs = text_content.get("mm_processor_kwargs")

            if multi_modal_data is not None and self._can_process_multimodal():
                return self._process_multimodal(
                    prompt_text,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            prompt_token_ids = self._tokenize_prompt(
                prompt_text,
                lora_request=lora_request,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
                multi_modal_data=multi_modal_data,
                mm_processor_kwargs=mm_processor_kwargs,
            )

        assert_never(parsed)
async def _prompt_to_llm_inputs_async(
self,
prompt: SingletonPrompt,
lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs:
"""Async version of :meth:`_extract_prompt_components`."""
parsed = parse_singleton_prompt(prompt)
if parsed["type"] == "str":
prompt_text = parsed["content"]
prompt_token_ids = await self._tokenize_prompt_async(
prompt_text,
lora_request=lora_request,
)
return token_inputs(
prompt=prompt_text,
prompt_token_ids=prompt_token_ids,
)
if parsed["type"] == "tokens":
tokens_content = parsed["content"]
prompt_token_ids = tokens_content["prompt_token_ids"]
multi_modal_data = tokens_content.get("multi_modal_data")
mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
if multi_modal_data is not None and self._can_process_multimodal():
return await self._process_multimodal_async(
prompt_token_ids,
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
return token_inputs(
prompt_token_ids=prompt_token_ids,
multi_modal_data=multi_modal_data,
mm_processor_kwargs=mm_processor_kwargs,
)
if parsed["type"] == "text":
text_content = parsed["content"]
prompt_text = text_content["prompt"]
multi_modal_data = text_content.get("multi_modal_data")
mm_processor_kwargs = text_content.get("mm_processor_kwargs")
if multi_modal_data is not None and self._can_process_multimodal():
return await self._process_multimodal_async(
prompt_text,
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
)
prompt_token_ids = await self._tokenize_prompt_async(
prompt_text,
lora_request=lora_request,
)
return token_inputs(
prompt=prompt_text,
prompt_token_ids=prompt_token_ids,
multi_modal_data=multi_modal_data,
mm_processor_kwargs=mm_processor_kwargs,
)
assert_never(parsed)
def _build_enc_dec_llm_inputs(
self,
encoder_inputs: SingletonInputs,
decoder_inputs: Optional[SingletonInputs],
) -> EncoderDecoderInputs:
if (encoder_inputs["type"] == "token"
or encoder_inputs["type"] == "multimodal"):
pass
else:
assert_never(encoder_inputs) # type: ignore[arg-type]
if decoder_inputs is None:
if self.model_config.hf_config.model_type == "whisper":
# For Whisper models, the text prompt should go to the decoder.
# If no explicit encoder/decoder inputs, then copy the prompt
# from the encoder to the decoder. The encoder tokens are later
# overridden by the audio features.
dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
else:
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
None)
decoder_inputs = token_inputs(dec_token_ids)
elif (decoder_inputs["type"] == "token"
or decoder_inputs["type"] == "multimodal"):
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
decoder_inputs["prompt_token_ids"])
decoder_inputs["prompt_token_ids"] = dec_token_ids
if "multi_modal_data" in decoder_inputs:
raise ValueError("Multi-modal decoder inputs of encoder-"
"decoder models are not supported yet")
else:
assert_never(encoder_inputs) # type: ignore[arg-type]
return EncoderDecoderInputs(
encoder=encoder_inputs,
decoder=decoder_inputs,
)
def _separate_enc_dec_inputs_from_mm_processor_outputs(
self,
inputs: SingletonInputs,
decoder_inputs_to_override: Optional[SingletonInputs] = None,
) -> tuple[SingletonInputs, SingletonInputs]:
"""
For encoder/decoder models only:
Separate Encoder/Decoder inputs from a MultiModalEncDecInputs
"""
encoder_inputs: SingletonInputs
decoder_inputs: SingletonInputs
if inputs["type"] == "multimodal":
# Multimodal data inputs
assert ("encoder_prompt" in inputs
and "encoder_prompt_token_ids" in inputs)
inputs = cast(MultiModalEncDecInputs, inputs)
encoder_inputs = token_inputs(
prompt=inputs["encoder_prompt"],
prompt_token_ids=inputs["encoder_prompt_token_ids"],
)
if decoder_inputs_to_override is not None:
decoder_inputs = MultiModalInputs(
type="multimodal",
prompt=decoder_inputs_to_override.get("prompt", ""),
prompt_token_ids=decoder_inputs_to_override[
"prompt_token_ids"],
mm_kwargs=inputs["mm_kwargs"],
mm_hashes=inputs["mm_hashes"],
mm_placeholders=inputs["mm_placeholders"],
)
else:
decoder_inputs = MultiModalInputs(
type="multimodal",
prompt=inputs["prompt"],
prompt_token_ids=inputs["prompt_token_ids"],
mm_kwargs=inputs["mm_kwargs"],
mm_hashes=inputs["mm_hashes"],
mm_placeholders=inputs["mm_placeholders"],
)
elif inputs["type"] == "token":
# Text-only inputs
encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
decoder_inputs = decoder_inputs_to_override or inputs
else:
assert_never(inputs) # type: ignore[arg-type]
return encoder_inputs, decoder_inputs
def _process_encoder_decoder_prompt(
    self,
    prompt: PromptType,
) -> EncoderDecoderInputs:
    """
    For encoder/decoder models only:
    Process an input prompt into an :class:`EncoderDecoderInputs` instance.

    There are two types of input prompts: singleton prompts which carry
    only the encoder prompt, and explicit encoder/decoder prompts which
    carry both the encoder and the decoder prompts as member variables.

    This function handles the following scenarios:

    * Singleton encoder prompt: extract encoder prompt token ids & infer
      default decoder prompt token ids
    * Explicit encoder/decoder prompt: extract encoder and decoder prompt
      token ids

    Note that for explicit encoder/decoder prompts, each sub-prompt
    (encoder or decoder prompt) can have any possible singleton type;
    thus this method relies on helper functions to obtain token ids for
    the sub-prompts.

    Arguments:

    * prompt: an input prompt

    Returns:

    * :class:`EncoderDecoderInputs` instance
    """
    encoder_inputs: SingletonInputs
    decoder_inputs: Optional[SingletonInputs]

    if is_explicit_encoder_decoder_prompt(prompt):
        encoder_inputs = self._prompt_to_llm_inputs(
            prompt["encoder_prompt"])
        if (decoder_input := prompt["decoder_prompt"]) is None:
            # No explicit decoder prompt: let the downstream builder
            # infer the default decoder token ids.
            decoder_inputs = None
        else:
            decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model and (
                    self._can_process_multimodal()):
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        encoder_inputs, decoder_inputs))
    else:
        # Singleton prompt: it is treated as the encoder prompt.
        inputs = self._prompt_to_llm_inputs(prompt)
        if self.model_config.is_multimodal_model and (
                self._can_process_multimodal()):
            # Encoder-Decoder Multimodal model
            encoder_inputs, decoder_inputs = (
                self._separate_enc_dec_inputs_from_mm_processor_outputs(
                    inputs))
        else:
            encoder_inputs = inputs
            decoder_inputs = None

    return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
async def _process_encoder_decoder_prompt_async(
    self,
    prompt: PromptType,
) -> EncoderDecoderInputs:
    """Async version of :meth:`_process_encoder_decoder_prompt`.

    When both an encoder and a decoder prompt are supplied, they are
    tokenized concurrently via :func:`asyncio.gather`.
    """
    encoder_inputs: SingletonInputs
    decoder_inputs: Optional[SingletonInputs]

    if is_explicit_encoder_decoder_prompt(prompt):
        # Kick off encoder tokenization first so it can overlap with
        # decoder tokenization below.
        encoder_task = self._prompt_to_llm_inputs_async(
            prompt["encoder_prompt"])
        if (decoder_input := prompt["decoder_prompt"]) is None:
            encoder_inputs = await encoder_task
            decoder_inputs = None
        else:
            decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
            # Run both tokenizations concurrently.
            encoder_inputs, decoder_inputs = await asyncio.gather(
                encoder_task, decoder_task)
            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model and (
                    self._can_process_multimodal()):
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        encoder_inputs, decoder_inputs))
    else:
        # Singleton prompt: it is treated as the encoder prompt.
        inputs = await self._prompt_to_llm_inputs_async(prompt)
        if self.model_config.is_multimodal_model and (
                self._can_process_multimodal()):
            # Encoder-Decoder Multimodal model
            encoder_inputs, decoder_inputs = (
                self._separate_enc_dec_inputs_from_mm_processor_outputs(
                    inputs))
        else:
            encoder_inputs = inputs
            decoder_inputs = None

    return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
def _build_decoder_only_llm_inputs(
    self,
    prompt_inputs: DecoderOnlyInputs,
    prompt_adapter_request: Optional[PromptAdapterRequest],
) -> DecoderOnlyInputs:
    """
    Apply the (optional) prompt adapter to decoder-only inputs,
    rewriting ``prompt_token_ids`` in place and returning the inputs.
    """
    inputs_type = prompt_inputs["type"]
    if inputs_type != "token" and inputs_type != "multimodal":
        assert_never(prompt_inputs)  # type: ignore[arg-type]

    adapted_token_ids = self._apply_prompt_adapter(
        prompt_inputs["prompt_token_ids"],
        prompt_adapter_request=prompt_adapter_request,
    )
    prompt_inputs["prompt_token_ids"] = adapted_token_ids

    return prompt_inputs
def _process_decoder_only_prompt(
    self,
    prompt: SingletonPrompt,
    lora_request: Optional[LoRARequest] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
    """
    For decoder-only models:
    Turn a singleton input prompt into a :class:`DecoderOnlyInputs`
    instance, then apply the prompt adapter (if any).

    Arguments:

    * prompt: input prompt
    * lora_request
    * prompt_adapter_request
    * return_mm_hashes

    Returns:

    * :class:`DecoderOnlyInputs` instance
    """
    llm_inputs = self._prompt_to_llm_inputs(
        prompt,
        lora_request=lora_request,
        return_mm_hashes=return_mm_hashes,
    )

    return self._build_decoder_only_llm_inputs(
        llm_inputs,
        prompt_adapter_request=prompt_adapter_request,
    )
async def _process_decoder_only_prompt_async(
    self,
    prompt: SingletonPrompt,
    lora_request: Optional[LoRARequest] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
    """Async version of :meth:`_process_decoder_only_prompt`."""
    llm_inputs = await self._prompt_to_llm_inputs_async(
        prompt,
        lora_request=lora_request,
        return_mm_hashes=return_mm_hashes,
    )

    return self._build_decoder_only_llm_inputs(
        llm_inputs,
        prompt_adapter_request=prompt_adapter_request,
    )
def preprocess(
    self,
    prompt: PromptType,
    lora_request: Optional[LoRARequest] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    return_mm_hashes: bool = False,
) -> ProcessorInputs:
    """Preprocess the input prompt.

    Dispatches to the encoder-decoder or the decoder-only pipeline based
    on the model configuration.

    Raises:
        ValueError: If an explicit encoder-decoder prompt is passed to a
            decoder-only model.
    """
    if self.model_config.is_encoder_decoder:
        # BUGFIX: the message previously ended with a stray comma, which
        # made it a 2-tuple instead of one implicitly-concatenated string.
        assert not return_mm_hashes, (
            "Multimodal hashes for encoder-decoder models should not be "
            "returned until they are supported on vLLM V1.")
        # Encoder-decoder model requires special mapping of
        # input prompts to encoder & decoder
        return self._process_encoder_decoder_prompt(prompt)

    if is_explicit_encoder_decoder_prompt(prompt):
        raise ValueError("Cannot pass encoder-decoder prompt "
                         "to decoder-only models")

    # Decoder-only operation
    return self._process_decoder_only_prompt(
        prompt,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request,
        return_mm_hashes=return_mm_hashes,
    )
async def preprocess_async(
    self,
    prompt: PromptType,
    lora_request: Optional[LoRARequest] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    return_mm_hashes: bool = False,
) -> ProcessorInputs:
    """Async version of :meth:`preprocess`.

    Raises:
        ValueError: If an explicit encoder-decoder prompt is passed to a
            decoder-only model.
    """
    if self.model_config.is_encoder_decoder:
        # BUGFIX: the message previously ended with a stray comma, which
        # made it a 2-tuple instead of one implicitly-concatenated string.
        assert not return_mm_hashes, (
            "Multimodal hashes for encoder-decoder models should not be "
            "returned until they are supported on vLLM V1.")
        # Encoder-decoder model requires special mapping of
        # input prompts to encoder & decoder
        return await self._process_encoder_decoder_prompt_async(prompt)

    if is_explicit_encoder_decoder_prompt(prompt):
        raise ValueError("Cannot pass encoder-decoder prompt "
                         "to decoder-only models")

    # Decoder-only operation
    return await self._process_decoder_only_prompt_async(
        prompt,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request,
        return_mm_hashes=return_mm_hashes,
    )

487
vllm/inputs/registry.py Normal file
View File

@@ -0,0 +1,487 @@
# SPDX-License-Identifier: Apache-2.0
import functools
from collections import UserDict
from collections.abc import Mapping
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional,
Protocol, Union)
from torch import nn
from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
resolve_mm_processor_kwargs)
from .data import ProcessorInputs, SingletonInputs
from .parse import split_enc_dec_inputs
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
MultiModalRegistry)
from vllm.sequence import SequenceData
logger = init_logger(__name__)
_T = TypeVar("_T")
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
@dataclass(frozen=True)
class InputContext:
    """
    Contains information about the model which may be used to
    modify the inputs.
    """

    model_config: "ModelConfig"
    """The configuration of the model."""

    def get_hf_config(
        self,
        typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
        /,
    ) -> _C:
        """
        Get the HuggingFace configuration
        (:class:`transformers.PretrainedConfig`) of the model,
        additionally checking its type.

        Raises:
            TypeError: If the configuration is not of the specified type.
        """
        config = self.model_config.hf_config
        if isinstance(config, typ):
            return config

        raise TypeError("Invalid type of HuggingFace config. "
                        f"Expected type: {typ}, but "
                        f"found type: {type(config)}")

    def get_hf_image_processor_config(self) -> dict[str, Any]:
        """
        Get the HuggingFace image processor configuration of the model.
        """
        return self.model_config.hf_image_processor_config

    def get_mm_config(self):
        """
        Get the multimodal config of the model.

        Raises:
            RuntimeError: If the model is not a multimodal model.
        """
        multimodal_config = self.model_config.multimodal_config
        if multimodal_config is None:
            raise RuntimeError("Not a multimodal model")

        return multimodal_config

    def get_hf_processor(
        self,
        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
        /,
        **kwargs: object,
    ) -> _P:
        """
        Get the HuggingFace processor
        (:class:`transformers.ProcessorMixin`) of the model,
        additionally checking its type.

        Raises:
            TypeError: If the processor is not of the specified type.
        """
        # The shared cache keys on the model config and processor class.
        return cached_processor_from_config(
            self.model_config,
            processor_cls=typ,
            **kwargs,
        )

    def init_processor(
        self,
        typ: type[_T],
        /,
        **kwargs: object,
    ) -> _T:
        """
        Initialize a HuggingFace-like processor class, merging the
        keyword arguments with those in the model's configuration.
        """
        # Explicit kwargs take precedence over those from the config.
        init_kwargs = dict(self.model_config.mm_processor_kwargs or {})
        init_kwargs.update(kwargs)

        return typ(**init_kwargs)
@dataclass(frozen=True)
class InputProcessingContext(InputContext):
    """:class:`InputContext` extended with the model's tokenizer."""

    tokenizer: AnyTokenizer
    """The tokenizer used to tokenize the inputs."""

    def get_hf_processor(
        self,
        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
        /,
        **kwargs: object,
    ) -> _P:
        """Like :meth:`InputContext.get_hf_processor`, but always forwards
        this context's tokenizer to the processor."""
        return super().get_hf_processor(
            typ,
            tokenizer=self.tokenizer,
            **kwargs,
        )

    def call_hf_processor(
        self,
        hf_processor: ProcessorMixin,
        data: Mapping[str, object],
        kwargs: Optional[Mapping[str, object]] = None,
    ) -> BatchFeature:
        """
        Call :code:`hf_processor` on the prompt :code:`data`
        (text, image, audio...) with configurable options :code:`kwargs`.

        Raises:
            RuntimeError: If the processor call fails; the original
                exception is chained as the cause.
        """
        assert callable(hf_processor)

        # FIX: ``kwargs`` previously used a mutable default (``{}``, B006);
        # ``None`` is now the sentinel, with identical behavior for callers.
        if kwargs is None:
            kwargs = {}

        base_kwargs = self.model_config.mm_processor_kwargs
        if base_kwargs is None:
            base_kwargs = {}

        # Per-call kwargs override the config-level defaults, filtered to
        # what the processor actually accepts.
        merged_kwargs = resolve_mm_processor_kwargs(
            base_kwargs,
            kwargs,
            hf_processor,
            requires_kw_only=False,
            allow_var_kwargs=True,
        )

        try:
            return hf_processor(**data, **merged_kwargs, return_tensors="pt")
        except Exception as exc:
            msg = (f"Failed to apply {type(hf_processor).__name__} "
                   f"on data={data} with kwargs={merged_kwargs}")
            raise RuntimeError(msg) from exc
N = TypeVar("N", bound=type[nn.Module])
class DummyData(NamedTuple):
    """Dummy data used for profiling.

    Produced by dummy-data factories and consumed during memory profiling
    to estimate an upper bound on the model's memory usage.
    """

    # The dummy token sequence fed to the model.
    seq_data: "SequenceData"
    # Raw multi-modal data accompanying the dummy prompt, if any.
    multi_modal_data: Optional["MultiModalDataDict"] = None
    # Placeholder ranges for the multi-modal items, if any.
    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
class DummyDataFactory(Protocol):
    """Structural type of per-model dummy-data factories registered via
    :meth:`InputRegistry.register_dummy_data` (and the encoder variant)."""

    def __call__(
        self,
        ctx: InputContext,
        seq_len: int,
        mm_counts: Mapping[str, int],
        **mm_processor_kwargs: Any,
    ) -> DummyData:
        """
        Create dummy data to be inputted into the model.

        Note:
            :data:`InputProcessor` is not applied to the dummy data.

        The :code:`mm_processor_kwargs` are overrides provided at
        initialization time to values in the config whose values
        may affect the number of tokens per instance.
        """
        ...
class _MultiModalCounts(UserDict[str, int]):
    """
    Wraps `mm_counts` for a more informative error message
    when attempting to access a plugin that does not exist.
    """

    def __getitem__(self, key: str) -> int:
        try:
            return super().__getitem__(key)
        except KeyError as exc:
            available = set(self.keys())
            raise KeyError(
                f"There is no multi-modal plugin with the key: {key}. "
                f"Available keys: {available}") from exc
InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs]
"""Preprocess the inputs to the model."""
class InputRegistry:
    """
    A registry to dispatch data processing
    according to the target model.

    Holds three per-model-class registries: dummy (decoder) data factories,
    dummy encoder data factories, and input processors.
    """

    def __init__(self) -> None:
        # Decoder dummy-data factories used for memory profiling.
        self._dummy_factories_by_model_type = \
            ClassRegistry[nn.Module, DummyDataFactory]()
        # Encoder dummy-data factories used for memory profiling.
        self._dummy_encoder_factories_by_model_type = \
            ClassRegistry[nn.Module, DummyDataFactory]()
        # Input processors applied to each request before mapping.
        self._input_processors_by_model_type = \
            ClassRegistry[nn.Module, InputProcessor]()

    def _default_dummy_data_factory(
        self,
        ctx: InputContext,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> DummyData:
        """
        The default dummy data factory represents the longest possible text
        that can be inputted to the model.

        Note:
            :data:`InputProcessor` is not applied to the dummy data.
        """
        # Avoid circular import
        from vllm.sequence import SequenceData

        # ``seq_len`` zeros: a full-length all-zero token prompt.
        return DummyData(SequenceData.from_prompt_token_counts((0, seq_len)))

    def register_dummy_data(self, factory: DummyDataFactory):
        """
        Register a dummy data factory to a model class.

        During memory profiling, the provided function is invoked to create
        dummy data to be inputted into the model. The resulting memory usage
        should be an upper bound of what the model would use at inference time.
        """

        def wrapper(model_cls: N) -> N:
            # Re-registration is allowed but warned about; the newest
            # factory wins.
            if self._dummy_factories_by_model_type.contains(model_cls,
                                                            strict=True):
                logger.warning(
                    "Model class %s already has dummy data "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            self._dummy_factories_by_model_type[model_cls] = factory

            return model_cls

        return wrapper

    def _get_dummy_data_factory(self, model_cls: type[nn.Module]):
        # Fall back to the text-only default when no factory is registered.
        return self._dummy_factories_by_model_type \
            .get(model_cls, self._default_dummy_data_factory)

    def register_dummy_encoder_data(self, factory: DummyDataFactory):
        """
        Register a dummy encoder data factory to a model class

        This is similar to :meth:`~register_dummy_data`, but for encoder input.
        """

        def wrapper(model_cls: N) -> N:
            # Re-registration is allowed but warned about; the newest
            # factory wins.
            if self._dummy_encoder_factories_by_model_type.contains(
                    model_cls, strict=True):
                logger.warning(
                    "Model class %s already has dummy encoder data "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            self._dummy_encoder_factories_by_model_type[model_cls] = factory

            return model_cls

        return wrapper

    def _get_dummy_encoder_data_factory(self, model_cls: type[nn.Module]):
        # NOTE: the decoder default also serves as the encoder fallback.
        return self._dummy_encoder_factories_by_model_type \
            .get(model_cls, self._default_dummy_data_factory)

    def dummy_data_for_profiling(
        self,
        model_config: "ModelConfig",
        seq_len: int,
        mm_registry: "MultiModalRegistry",
        is_encoder_data: bool = False,
    ) -> DummyData:
        """
        Create dummy data for profiling the memory usage of a model.

        The model is identified by ``model_config``.

        Note:
            This should be called after
            :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture
        from vllm.multimodal import MultiModalKwargs
        from vllm.multimodal.profiling import MultiModalProfiler
        from vllm.sequence import SequenceData

        if mm_registry.has_processor(model_config):
            # Newer multimodal-processor path: profile via the processor.
            processor = mm_registry.create_processor(model_config,
                                                     disable_cache=True)
            profiler = MultiModalProfiler(processor)
            dummy_data_v1 = (profiler.get_encoder_dummy_data(seq_len)
                             if is_encoder_data else
                             profiler.get_decoder_dummy_data(seq_len))
            _seq_data = SequenceData.from_seqs(
                dummy_data_v1.prompt_token_ids)  # type: ignore[attr-defined]

            dummy_data = DummyData(
                seq_data=_seq_data,
                multi_modal_data=getattr(dummy_data_v1, "multi_modal_data",
                                         None),
                multi_modal_placeholders=getattr(dummy_data_v1,
                                                 "multi_modal_placeholders",
                                                 None),
            )
        else:
            # Legacy path: use the factory registered for the model class.
            model_cls, _ = get_model_architecture(model_config)
            if is_encoder_data:
                dummy_factory = self._get_dummy_encoder_data_factory(model_cls)
            else:
                dummy_factory = self._get_dummy_data_factory(model_cls)
            mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
            # Forward only the overrides that the factory accepts.
            mm_processor_kwargs = get_allowed_kwarg_only_overrides(
                dummy_factory,
                overrides=model_config.mm_processor_kwargs,
                requires_kw_only=False,
                allow_var_kwargs=True,
            )

            dummy_data = dummy_factory(InputContext(model_config), seq_len,
                                       _MultiModalCounts(mm_counts),
                                       **mm_processor_kwargs)

        # Having more tokens is over-conservative but otherwise fine
        # NOTE: despite the name, ``num_tokens`` is the token id sequence
        # itself, not a count.
        num_tokens = dummy_data.seq_data.prompt_token_ids
        if len(num_tokens) < seq_len:
            if is_encoder_data:
                # Encoder shortfall is tolerated with a warning only.
                logger.warning_once(
                    f"Expected at least {seq_len} dummy encoder tokens for "
                    f"profiling, but found {len(num_tokens)} tokens instead.")
            else:
                raise AssertionError(
                    f"Expected at least {seq_len} dummy tokens for profiling, "
                    f"but found {len(num_tokens)} tokens instead.")

        # Sanity-check per-modality item counts against the configured
        # limits.
        # NOTE(review): ``mm_counts`` is only bound in the legacy branch;
        # this block presumably never triggers on the processor path
        # (multi_modal_data is a MultiModalKwargs there) — confirm,
        # otherwise this is a latent NameError.
        if (dummy_data.multi_modal_data is not None and
                not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
            for k, v in dummy_data.multi_modal_data.items():
                num_items = len(v) if isinstance(v, list) else 1
                num_expected = mm_counts[k]
                assert num_items >= num_expected, (
                    f"Expected at least {num_expected} dummy '{k}' instances "
                    f"for profiling, but found {num_items} instances instead.")

        return dummy_data

    def _default_input_processor(
        self,
        ctx: InputContext,
        inputs: ProcessorInputs,
        **kwargs: object,
    ) -> ProcessorInputs:
        """The default input processor is a no-op."""
        return inputs

    def register_input_processor(self, processor: InputProcessor):
        """
        Register an input processor to a model class.

        The provided function is invoked on each input to the model. This
        happens before
        :meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`.
        """

        def wrapper(model_cls: N) -> N:
            # Re-registration is allowed but warned about; the newest
            # processor wins.
            if self._input_processors_by_model_type.contains(model_cls,
                                                             strict=True):
                logger.warning(
                    "Model class %s already has input processor "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            self._input_processors_by_model_type[model_cls] = processor

            return model_cls

        return wrapper

    def _get_model_input_processor(self, model_cls: type[nn.Module]):
        # Fall back to the no-op processor when none is registered.
        return self._input_processors_by_model_type \
            .get(model_cls, self._default_input_processor)

    def _ensure_mm_kwargs(
        self,
        inputs: SingletonInputs,
        mm_processor_kwargs: dict[str, Any],
    ):
        # Guarantee that the resolved processor kwargs are attached to the
        # inputs, regardless of what the model's processor did.
        if inputs["type"] == "token":
            # In case the input processor for that model fails to set it
            if "mm_processor_kwargs" not in inputs:
                inputs["mm_processor_kwargs"] = mm_processor_kwargs
        elif inputs["type"] == "multimodal":
            # Be more strict in V2
            assert "mm_kwargs" in inputs
        else:
            assert_never(inputs["type"])  # type: ignore[arg-type]

    def process_input(self, model_config: "ModelConfig",
                      inputs: ProcessorInputs) -> ProcessorInputs:
        """
        Apply an input processor to an instance of model inputs.

        The model is identified by ``model_config``.
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture

        model_cls, _ = get_model_architecture(model_config)
        processor = self._get_model_input_processor(model_cls)

        # Handle multimodal processor kwargs with priority:
        #     Inference kwargs -> Init kwargs -> {}
        # If it's empty, it'll fall back to the default kwarg values
        mm_processor_kwargs = resolve_mm_processor_kwargs(
            model_config.mm_processor_kwargs,
            inputs.get("mm_processor_kwargs", {}),  # type: ignore
            processor,
            requires_kw_only=False,
            allow_var_kwargs=True,
        )

        processed_inputs = processor(
            InputContext(model_config),
            inputs,
            **mm_processor_kwargs,
        )

        # Attach the resolved kwargs to both halves of an enc/dec result.
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
        if encoder_inputs is not None:
            self._ensure_mm_kwargs(encoder_inputs, mm_processor_kwargs)
        if decoder_inputs is not None:
            self._ensure_mm_kwargs(decoder_inputs, mm_processor_kwargs)

        return processed_inputs

    def create_input_processor(self, model_config: "ModelConfig"):
        """
        Create an input processor (see :meth:`_process_input`) for a
        specific model.
        """
        # Bind the model config so callers only pass the inputs.
        return functools.partial(self.process_input, model_config)