First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -0,0 +1,44 @@
+from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
+                   LLMInputs, PromptType, SingletonPrompt, TextPrompt,
+                   TokensPrompt, build_explicit_enc_dec_prompt,
+                   to_enc_dec_tuple_list, zip_enc_dec_prompts)
+from .registry import InputContext, InputRegistry
+
+INPUT_REGISTRY = InputRegistry()
+"""
+The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
+to dispatch data processing according to the target model.
+
+See also:
+    :ref:`input_processing_pipeline`
+"""
+
+# Names re-exported as the public API of this package. The deprecated alias
+# ``PromptInput`` is deliberately absent: it is resolved lazily by the
+# module-level ``__getattr__`` so that accessing it emits a warning.
+__all__ = [
+    "TextPrompt",
+    "TokensPrompt",
+    "PromptType",
+    "SingletonPrompt",
+    "ExplicitEncoderDecoderPrompt",
+    "LLMInputs",
+    "EncoderDecoderLLMInputs",
+    "build_explicit_enc_dec_prompt",
+    "to_enc_dec_tuple_list",
+    "zip_enc_dec_prompts",
+    "INPUT_REGISTRY",
+    "InputContext",
+    "InputRegistry",
+]
+
+
+def __getattr__(name: str):
+    if name == "PromptInput":
+        import warnings
+
+        msg = ("PromptInput has been renamed to PromptType. "
+               "The original name will be removed in an upcoming version.")
+
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+
+        return PromptType
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
--- a/vllm/inputs/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/inputs/__pycache__/__init__.cpython-310.pyc
--- a/vllm/inputs/__pycache__/data.cpython-310.pyc
+++ b/vllm/inputs/__pycache__/data.cpython-310.pyc
--- a/vllm/inputs/__pycache__/parse.cpython-310.pyc
+++ b/vllm/inputs/__pycache__/parse.cpython-310.pyc
--- a/vllm/inputs/__pycache__/preprocess.cpython-310.pyc
+++ b/vllm/inputs/__pycache__/preprocess.cpython-310.pyc
--- a/vllm/inputs/__pycache__/registry.cpython-310.pyc
+++ b/vllm/inputs/__pycache__/registry.cpython-310.pyc
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -0,0 +1,242 @@
+from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List,
+                    Optional, Tuple, Union)
+
+from typing_extensions import NotRequired, TypedDict, TypeVar
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalDataDict
+
+
+class TextPrompt(TypedDict):
+    """
+    Schema for a text prompt.
+
+    Only ``prompt`` is a required key; ``multi_modal_data`` and
+    ``mm_processor_kwargs`` are declared ``NotRequired`` and may be omitted.
+    """
+
+    prompt: str
+    """The input text to be tokenized before passing to the model."""
+
+    multi_modal_data: NotRequired["MultiModalDataDict"]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """
+
+    mm_processor_kwargs: NotRequired[Dict[str, Any]]
+    """
+    Optional multi-modal processor kwargs to be forwarded to the
+    multimodal input mapper & processor. Note that if multiple modalities
+    have registered mappers etc for the model being considered, we attempt
+    to pass the mm_processor_kwargs to each of them.
+    """
+
+
+class TokensPrompt(TypedDict):
+    """
+    Schema for a tokenized prompt.
+
+    Only ``prompt_token_ids`` is a required key; ``multi_modal_data`` and
+    ``mm_processor_kwargs`` are declared ``NotRequired`` and may be omitted.
+    """
+
+    prompt_token_ids: List[int]
+    """A list of token IDs to pass to the model."""
+
+    multi_modal_data: NotRequired["MultiModalDataDict"]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """
+
+    mm_processor_kwargs: NotRequired[Dict[str, Any]]
+    """
+    Optional multi-modal processor kwargs to be forwarded to the
+    multimodal input mapper & processor. Note that if multiple modalities
+    have registered mappers etc for the model being considered, we attempt
+    to pass the mm_processor_kwargs to each of them.
+    """
+
+
+SingletonPrompt = Union[str, TextPrompt, TokensPrompt]
+"""
+Set of possible schemas for a single LLM input:
+
+- A text prompt (:class:`str` or :class:`TextPrompt`)
+- A tokenized prompt (:class:`TokensPrompt`)
+
+Note that "singleton" is as opposed to a data structure
+which encapsulates multiple prompts, i.e. of the sort
+which may be utilized for encoder/decoder models when
+the user desires to express both the encoder & decoder
+prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
+
+A prompt of type :class:`SingletonPrompt` may be employed
+as (1) input to a decoder-only model, (2) input to
+the encoder of an encoder/decoder model, in the scenario
+where the decoder-prompt is not specified explicitly, or
+(3) as a member of a larger data structure encapsulating
+more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
+"""
+
+# Covariant type variables parameterizing the encoder (_T1_co) and decoder
+# (_T2_co) prompt slots of ExplicitEncoderDecoderPrompt. Both are bounded
+# by, and default to, SingletonPrompt.
+_T1_co = TypeVar("_T1_co",
+                 bound=SingletonPrompt,
+                 default=SingletonPrompt,
+                 covariant=True)
+_T2_co = TypeVar("_T2_co",
+                 bound=SingletonPrompt,
+                 default=SingletonPrompt,
+                 covariant=True)
+
+
+# TODO: Make fields ReadOnly once mypy supports it
+class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
+    """
+    Represents an encoder/decoder model input prompt,
+    comprising an explicit encoder prompt and a decoder prompt.
+
+    The encoder and decoder prompts, respectively, may be formatted
+    according to any of the :class:`SingletonPrompt` schemas,
+    and are not required to have the same schema.
+
+    Only the encoder prompt may have multi-modal data. mm_processor_kwargs
+    should be at the top-level, and should not be set in the encoder/decoder
+    prompts, since they are agnostic to the encoder/decoder.
+
+    Note that an :class:`ExplicitEncoderDecoderPrompt` may not
+    be used as an input to a decoder-only model,
+    and that the :code:`encoder_prompt` and :code:`decoder_prompt`
+    fields of this data structure themselves must be
+    :class:`SingletonPrompt` instances.
+    """
+
+    # Prompt fed to the encoder; required key.
+    encoder_prompt: _T1_co
+
+    # Prompt fed to the decoder; the key is required but its value may be
+    # None.
+    decoder_prompt: Optional[_T2_co]
+
+    # Top-level multi-modal processor kwargs; the key may be omitted.
+    mm_processor_kwargs: NotRequired[Dict[str, Any]]
+
+
+# NOTE: the former name of this alias, ``PromptInput``, is still resolvable
+# through this module's ``__getattr__`` hook, which emits a
+# DeprecationWarning.
+PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
+"""
+Set of possible schemas for an LLM input, including
+both decoder-only and encoder/decoder input types:
+
+- A text prompt (:class:`str` or :class:`TextPrompt`)
+- A tokenized prompt (:class:`TokensPrompt`)
+- A single data structure containing both an encoder and a decoder prompt
+  (:class:`ExplicitEncoderDecoderPrompt`)
+"""
+
+
+class LLMInputs(TypedDict):
+    """
+    The inputs in :class:`~vllm.LLMEngine` before they are
+    passed to the model executor.
+
+    This specifies the data required for decoder-only models.
+
+    Only ``prompt_token_ids`` is a required key; every other key is
+    ``NotRequired`` and, when present, may also hold ``None``.
+    """
+    prompt_token_ids: List[int]
+    """The token IDs of the prompt."""
+
+    prompt: NotRequired[Optional[str]]
+    """
+    The original prompt text corresponding to the token IDs, if available.
+    """
+
+    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """
+
+    mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]]
+    """
+    Optional multi-modal processor kwargs to be forwarded to the
+    multimodal input mapper & processor. Note that if multiple modalities
+    have registered mappers etc for the model being considered, we attempt
+    to pass the mm_processor_kwargs to each of them.
+    """
+
+
+class EncoderDecoderLLMInputs(LLMInputs):
+    """
+    The inputs in :class:`~vllm.LLMEngine` before they are
+    passed to the model executor.
+
+    This specifies the required data for encoder-decoder models.
+
+    The keys inherited from :class:`LLMInputs` are kept alongside the
+    ``encoder_*`` keys declared here; of the latter, only
+    ``encoder_prompt_token_ids`` is required.
+    """
+    encoder_prompt_token_ids: List[int]
+    """The token IDs of the encoder prompt."""
+
+    encoder_prompt: NotRequired[Optional[str]]
+    """
+    The original encoder prompt text corresponding to the token IDs, if
+    available.
+    """
+
+    encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
+    """
+    Optional multi-modal data to pass to the encoder model,
+    if the model supports it.
+    """
+
+
+# Non-covariant counterparts of _T1_co/_T2_co, used in the signatures of the
+# helper functions below (encoder prompt type, decoder prompt type).
+_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
+_T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
+
+
+def build_explicit_enc_dec_prompt(
+    encoder_prompt: _T1,
+    decoder_prompt: Optional[_T2],
+    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+) -> ExplicitEncoderDecoderPrompt[_T1, _T2]:
+    if mm_processor_kwargs is None:
+        mm_processor_kwargs = {}
+    return ExplicitEncoderDecoderPrompt(
+        encoder_prompt=encoder_prompt,
+        decoder_prompt=decoder_prompt,
+        mm_processor_kwargs=mm_processor_kwargs)
+
+
+def zip_enc_dec_prompts(
+    enc_prompts: Iterable[_T1],
+    dec_prompts: Iterable[Optional[_T2]],
+    mm_processor_kwargs: Optional[Union[Iterable[Dict[str, Any]],
+                                        Dict[str, Any]]] = None,
+) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
+    """
+    Zip encoder and decoder prompts together into a list of
+    :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs
+    may also be provided; if a dict is passed, the same dictionary will be
+    used for every encoder/decoder prompt. If an iterable is provided, it will
+    be zipped with the encoder/decoder prompts.
+    """
+    if mm_processor_kwargs is None:
+        mm_processor_kwargs = {}
+    if isinstance(mm_processor_kwargs, Dict):
+        return [
+            build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
+                                          mm_processor_kwargs)
+            for (encoder_prompt,
+                 decoder_prompt) in zip(enc_prompts, dec_prompts)
+        ]
+    return [
+        build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
+                                      mm_proc_kwargs)
+        for (encoder_prompt, decoder_prompt, mm_proc_kwargs
+             ) in zip(enc_prompts, dec_prompts, mm_processor_kwargs)
+    ]
+
+
+def to_enc_dec_tuple_list(
+    enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]],
+) -> List[Tuple[_T1, Optional[_T2]]]:
+    return [(enc_dec_prompt["encoder_prompt"],
+             enc_dec_prompt["decoder_prompt"])
+            for enc_dec_prompt in enc_dec_prompts]
+
+
+def __getattr__(name: str):
+    if name == "PromptInput":
+        import warnings
+
+        msg = ("PromptInput has been renamed to PromptType. "
+               "The original name will be removed in an upcoming version.")
+
+        warnings.warn(DeprecationWarning(msg), stacklevel=2)
+
+        return PromptType
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -0,0 +1,106 @@
+from typing import List, Literal, Sequence, TypedDict, Union, overload
+
+from typing_extensions import TypeIs
+
+from vllm.utils import is_list_of
+
+from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
+                   LLMInputs, PromptType, SingletonPrompt, TextPrompt,
+                   TokensPrompt)
+
+
+class ParsedText(TypedDict):
+    """One text prompt as returned by :func:`parse_and_batch_prompt`."""
+    content: str
+    is_tokens: Literal[False]
+
+
+class ParsedTokens(TypedDict):
+    """One token-ID prompt as returned by :func:`parse_and_batch_prompt`."""
+    content: List[int]
+    is_tokens: Literal[True]
+
+
+@overload
+def parse_and_batch_prompt(
+        prompt: Union[str, List[str]]) -> Sequence[ParsedText]:
+    ...
+
+
+@overload
+def parse_and_batch_prompt(
+        prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]:
+    ...
+
+
+def parse_and_batch_prompt(
+    prompt: Union[str, List[str], List[int], List[List[int]]],
+) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]:
+    if isinstance(prompt, str):
+        # case 1: a string
+        return [ParsedText(content=prompt, is_tokens=False)]
+
+    if isinstance(prompt, list):
+        if len(prompt) == 0:
+            raise ValueError("please provide at least one prompt")
+
+        if is_list_of(prompt, str):
+            # case 2: array of strings
+            return [
+                ParsedText(content=elem, is_tokens=False) for elem in prompt
+            ]
+        if is_list_of(prompt, int):
+            # case 3: array of tokens
+            return [ParsedTokens(content=prompt, is_tokens=True)]
+        if is_list_of(prompt, list):
+            if len(prompt[0]) == 0:
+                raise ValueError("please provide at least one prompt")
+
+            if is_list_of(prompt[0], int):
+                # case 4: array of token arrays
+                return [
+                    ParsedTokens(content=elem, is_tokens=True)
+                    for elem in prompt
+                ]
+
+    raise TypeError("prompt must be a string, array of strings, "
+                    "array of tokens, or array of token arrays")
+
+
+class ParsedStrPrompt(TypedDict):
+    """Result of :func:`parse_singleton_prompt` for a plain ``str``."""
+    type: Literal["str"]
+    content: str
+
+
+class ParsedTextPrompt(TypedDict):
+    """Result of :func:`parse_singleton_prompt` for a ``TextPrompt`` dict."""
+    type: Literal["text"]
+    content: TextPrompt
+
+
+class ParsedTokensPrompt(TypedDict):
+    """Result of :func:`parse_singleton_prompt` for a ``TokensPrompt`` dict."""
+    type: Literal["tokens"]
+    content: TokensPrompt
+
+
+def parse_singleton_prompt(
+    prompt: SingletonPrompt,
+) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
+    if isinstance(prompt, str):
+        return ParsedStrPrompt(type="str", content=prompt)
+    elif isinstance(prompt, dict):
+        if "prompt_token_ids" in prompt:
+            return ParsedTokensPrompt(type="tokens",
+                                      content=prompt)  # type: ignore
+        elif "prompt" in prompt:
+            return ParsedTextPrompt(type="text", content=prompt)
+
+    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")
+
+
+def is_explicit_encoder_decoder_prompt(
+        prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
+    return isinstance(prompt, dict) and "encoder_prompt" in prompt
+
+
+def is_valid_encoder_decoder_llm_inputs(
+    inputs: Union[LLMInputs, EncoderDecoderLLMInputs],
+) -> TypeIs[EncoderDecoderLLMInputs]:
+    """
+    Return True when ``inputs`` contains the ``encoder_prompt_token_ids``
+    key, the only required key that :class:`EncoderDecoderLLMInputs` adds.
+    """
+    return "encoder_prompt_token_ids" in inputs
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -0,0 +1,580 @@
+import asyncio
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from typing_extensions import assert_never
+
+from vllm.config import ModelConfig
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
+from vllm.utils import print_warning_once
+
+from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptType,
+                   SingletonPrompt)
+from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalDataDict
+
+logger = init_logger(__name__)
+
+# (prompt text, prompt token ids, multi-modal data, mm_processor_kwargs)
+# as produced by InputPreprocessor._extract_prompt_components.
+PromptComponents = Tuple[Optional[str], List[int],
+                         Optional["MultiModalDataDict"], Optional[Dict[str,
+                                                                       Any]]]
+# Same layout, but the token ids may be None: an all-None tuple is used when
+# no decoder prompt was supplied.
+DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
+                                Optional["MultiModalDataDict"],
+                                Optional[Dict[str, Any]]]
+
+
+class InputPreprocessor:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        tokenizer: Optional[BaseTokenizerGroup],
+    ) -> None:
+        """
+        Store the model configuration and the tokenizer group.
+
+        ``tokenizer`` may be None; text prompts then cannot be tokenized
+        (see :meth:`get_tokenizer_group`).
+        """
+        super().__init__()
+
+        self.model_config = model_config
+        self.tokenizer = tokenizer
+
+    def get_tokenizer_group(self) -> BaseTokenizerGroup:
+        if self.tokenizer is None:
+            raise ValueError("You cannot pass text prompts when "
+                             "`skip_tokenizer_init` is True")
+
+        return self.tokenizer
+
+    def get_bos_token_id(self,
+                         lora_request: Optional[LoRARequest] = None
+                         ) -> Optional[int]:
+        if self.tokenizer is None:
+            logger.warning("Using None for BOS token id because tokenizer "
+                           "is not initialized")
+            return None
+
+        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
+
+    def get_eos_token_id(self,
+                         lora_request: Optional[LoRARequest] = None
+                         ) -> Optional[int]:
+        if self.tokenizer is None:
+            logger.warning("Using None for EOS token id because tokenizer "
+                           "is not initialized")
+            return None
+
+        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
+
+    def get_decoder_start_token_id(self) -> Optional[int]:
+        '''
+        Obtain the decoder start token id employed by an encoder/decoder
+        model. Returns None for non-encoder/decoder models or if the
+        model config is unavailable.
+        '''
+
+        if not self.is_encoder_decoder_model():
+            print_warning_once("Using None for decoder start token id because "
+                               "this is not an encoder/decoder model.")
+            return None
+
+        if (self.model_config is None or self.model_config.hf_config is None):
+            print_warning_once("Using None for decoder start token id because "
+                               "model config is not available.")
+            return None
+
+        dec_start_token_id = getattr(self.model_config.hf_config,
+                                     'decoder_start_token_id', None)
+        if dec_start_token_id is None:
+            print_warning_once("Falling back on <BOS> for decoder start token "
+                               "id because decoder start token id is not "
+                               "available.")
+            dec_start_token_id = self.get_bos_token_id()
+
+        return dec_start_token_id
+
+    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
+        '''
+        Specifically for encoder/decoder models:
+        generate a default decoder prompt for when
+        the user specifies only the encoder prompt.
+
+        Encoder/decoder models utilize the decoder
+        prompt in different ways; as new models are
+        added, it is intended that this function
+        will be extended to produce differing
+        default decoder prompts, depending on the
+        model variety.
+
+        Absent a special case, the default behavior
+        of this method is to mirror the behavior of
+        the HuggingFace (HF) GenerationMixin for a None
+        decoder prompt, which is to employ a logit processor
+        setting to force the first decoded token to be <BOS>.
+        Here, this behavior is approximated by having the
+        "default" decoder prompt be <BOS>.
+
+        However, it is possible that in the future
+        other models may have different or more 
+        complex logic for the default decoder prompt.
+        This motivates having a special helper method
+        for default decoder prompts.
+
+        Returns:
+
+        * prompt_token_ids
+        '''
+
+        bos_token_id = self.get_bos_token_id()
+        assert bos_token_id is not None
+        return [bos_token_id]
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        decoder_input_ids: Optional[List[int]],
+        force_bos: bool = True,
+    ) -> List[int]:
+        """
+        Prepares `decoder_input_ids` for generation with encoder-decoder models.
+
+        Based on
+
+        https://github.com/huggingface/transformers/blob/
+        4037a2b5b1278736e566aec12e169100275545ea/
+        src/transformers/generation/utils.py
+
+        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
+
+        Arguments:
+
+        * decoder_input_ids: input token ids to preprocess
+
+        Returns:
+
+        * Processed token list
+        """
+
+        decoder_start_token_id = self.get_decoder_start_token_id()
+        assert decoder_start_token_id is not None
+
+        if decoder_input_ids is None:
+            # no decoder prompt input ->
+            # use decoder_start_token_id as decoder_input_ids
+            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
+
+        if force_bos and (len(decoder_input_ids) == 0
+                          or decoder_input_ids[0] != decoder_start_token_id):
+            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
+
+        return decoder_input_ids
+
+    def _apply_prompt_adapter(
+        self,
+        prompt_token_ids: List[int],
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> List[int]:
+        if prompt_adapter_request:
+            prompt_token_ids = (
+                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+                + prompt_token_ids)
+
+        return prompt_token_ids
+
+    def _tokenize_prompt(
+        self,
+        prompt: str,
+        request_id: str,
+        lora_request: Optional[LoRARequest],
+    ) -> List[int]:
+        """
+        Apply the model's tokenizer to a text prompt, returning the
+        corresponding token IDs.
+        """
+        tokenizer = self.get_tokenizer_group()
+
+        return tokenizer.encode(request_id=request_id,
+                                prompt=prompt,
+                                lora_request=lora_request)
+
+    async def _tokenize_prompt_async(
+        self,
+        prompt: str,
+        request_id: str,
+        lora_request: Optional[LoRARequest],
+    ) -> List[int]:
+        """Async version of :meth:`_tokenize_prompt`."""
+        tokenizer = self.get_tokenizer_group()
+
+        return await tokenizer.encode_async(request_id=request_id,
+                                            prompt=prompt,
+                                            lora_request=lora_request)
+
+    def _extract_prompt_components(
+        self,
+        prompt: SingletonPrompt,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> PromptComponents:
+        '''
+        Extract the components of any single encoder or decoder input prompt.
+
+        Arguments:
+
+        * request_id
+        * prompt: single encoder or decoder input prompt
+        * lora_request: this is only valid for decoder prompts
+
+        Returns:
+
+        * prompt
+        * prompt_token_ids
+        * multi_modal_data
+        * mm_processor_kwargs (request-level input processor/mapper overrides)
+        '''
+
+        parsed = parse_singleton_prompt(prompt)
+
+        if parsed["type"] == "str":
+            prompt_text = parsed["content"]
+            prompt_token_ids = self._tokenize_prompt(
+                prompt_text,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = None
+            mm_processor_kwargs = None
+        elif parsed["type"] == "tokens":
+            prompt_text = None
+            prompt_token_ids = parsed["content"]["prompt_token_ids"]
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
+        elif parsed["type"] == "text":
+            prompt_text = parsed["content"]["prompt"]
+            prompt_token_ids = self._tokenize_prompt(
+                prompt_text,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
+        else:
+            assert_never(parsed)
+
+        return (prompt_text, prompt_token_ids, multi_modal_data,
+                mm_processor_kwargs)
+
+    async def _extract_prompt_components_async(
+        self,
+        prompt: SingletonPrompt,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> PromptComponents:
+        """Async version of :meth:`_extract_prompt_components`."""
+        parsed = parse_singleton_prompt(prompt)
+
+        if parsed["type"] == "str":
+            prompt_text = parsed["content"]
+            prompt_token_ids = await self._tokenize_prompt_async(
+                prompt_text,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = None
+            mm_processor_kwargs = None
+        elif parsed["type"] == "tokens":
+            prompt_text = None
+            prompt_token_ids = parsed["content"]["prompt_token_ids"]
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
+        elif parsed["type"] == "text":
+            prompt_text = parsed["content"]["prompt"]
+            prompt_token_ids = await self._tokenize_prompt_async(
+                prompt_text,
+                request_id=request_id,
+                lora_request=lora_request,
+            )
+            multi_modal_data = parsed["content"].get("multi_modal_data")
+            mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs")
+        else:
+            assert_never(parsed)
+
+        return (prompt_text, prompt_token_ids, multi_modal_data,
+                mm_processor_kwargs)
+
+    def _build_enc_dec_llm_inputs(
+        self,
+        encoder_comps: PromptComponents,
+        decoder_comps: DecoderPromptComponents,
+        mm_processor_kwargs: Dict[str, Any],
+    ) -> EncoderDecoderLLMInputs:
+        encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
+        decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
+
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
+        if decoder_mm_data is not None:
+            raise ValueError(
+                "Multi-modality decoder inputs of encoder-decoder models are "
+                "not supported yet")
+
+        # For Multi-Modal models (e.g., mllama), the text input can be
+        # <|image|><|begin_of_text|>hello world. And we should not add
+        # another <|begin_of_text|> to the beginning.
+        decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation(
+            decoder_prompt_ids,
+            force_bos=(encoder_mm_data is None and decoder_mm_data is None)))
+
+        return EncoderDecoderLLMInputs(
+            prompt_token_ids=decoder_prompt_ids,
+            prompt=decoder_prompt,
+            multi_modal_data=decoder_mm_data,
+            mm_processor_kwargs=mm_processor_kwargs,
+            encoder_prompt_token_ids=encoder_prompt_ids,
+            encoder_prompt=encoder_prompt,
+            encoder_multi_modal_data=encoder_mm_data,
+        )
+
+    def _process_encoder_decoder_prompt(
+        self,
+        prompt: PromptType,
+        request_id: str,
+    ) -> EncoderDecoderLLMInputs:
+        '''
+        For encoder/decoder models only:
+        Process an input prompt into an
+        :class:`EncoderDecoderLLMInputs` instance.
+
+        There are two types of input prompts:
+        singleton prompts which carry only the
+        encoder prompt, and explicit encoder/decoder
+        prompts which carry both the encoder and the
+        decoder prompts as member variables.
+
+        This function handles the following scenarios:
+        * Singleton encoder prompt: extract encoder prompt
+          token ids & infer default decoder prompt token ids
+        * Explicit encoder/decoder prompt: extract encoder
+          and decoder prompt token ids
+
+        Note that for Explicit encoder/decoder prompts,
+        each sub-prompt (encoder or decoder prompt) can
+        have any possible singleton type; thus this
+        method relies on helper functions to obtain
+        token ids for the sub-prompts.
+        
+        Arguments:
+
+        * prompt: an input prompt
+        * request_id
+
+        Returns:
+
+        * :class:`EncoderDecoderLLMInputs` instance
+        '''
+
+        encoder_comps: PromptComponents
+        decoder_comps: DecoderPromptComponents
+
+        if is_explicit_encoder_decoder_prompt(prompt):
+            encoder_comps = self._extract_prompt_components(
+                prompt["encoder_prompt"],
+                request_id=request_id,
+            )
+
+            if (decoder_input := prompt["decoder_prompt"]) is None:
+                decoder_comps = None, None, None, None
+            else:
+                decoder_comps = self._extract_prompt_components(
+                    decoder_input,
+                    request_id=request_id,
+                )
+            # Handle this carefully in case it was directly initialized by user
+            mm_processor_kwargs = prompt.get("mm_processor_kwargs", {})
+        else:
+            encoder_comps = self._extract_prompt_components(
+                prompt,
+                request_id=request_id,
+            )
+            # If there are no decoder components, we assume the
+            # mm_processor_kwargs are in the encoder prompt
+            mm_processor_kwargs = encoder_comps[-1] if encoder_comps[
+                -1] is not None else {}
+            decoder_comps = None, None, None, None
+
+        return self._build_enc_dec_llm_inputs(
+            encoder_comps,
+            decoder_comps,
+            mm_processor_kwargs,
+        )
+
+    async def _process_encoder_decoder_prompt_async(
+        self,
+        prompt: PromptType,
+        request_id: str,
+    ) -> EncoderDecoderLLMInputs:
+        """Async version of :meth:`_process_encoder_decoder_prompt`."""
+        encoder_comps: PromptComponents
+        decoder_comps: DecoderPromptComponents
+
+        if is_explicit_encoder_decoder_prompt(prompt):
+            encoder_task = self._extract_prompt_components_async(
+                prompt["encoder_prompt"],
+                request_id=request_id,
+            )
+
+            if (decoder_input := prompt["decoder_prompt"]) is None:
+                encoder_comps = await encoder_task
+                decoder_comps = None, None, None, None
+            else:
+                decoder_task = self._extract_prompt_components_async(
+                    decoder_input,
+                    request_id=request_id,
+                )
+
+                encoder_comps, decoder_comps = await asyncio.gather(
+                    encoder_task, decoder_task)
+            mm_processor_kwargs = prompt["mm_processor_kwargs"]
+        else:
+            encoder_comps = await self._extract_prompt_components_async(
+                prompt,
+                request_id=request_id,
+            )
+            # If there are no decoder components, we assume the
+            # mm_processor_kwargs are in the encoder prompt
+            mm_processor_kwargs = encoder_comps[-1] if encoder_comps[
+                -1] is not None else {}
+            decoder_comps = None, None, None, None
+
+        return self._build_enc_dec_llm_inputs(
+            encoder_comps,
+            decoder_comps,
+            mm_processor_kwargs,
+        )
+
+    def _build_decoder_only_llm_inputs(
+        self,
+        prompt_comps: PromptComponents,
+        prompt_adapter_request: Optional[PromptAdapterRequest],
+    ) -> LLMInputs:
+        (prompt, prompt_token_ids, multi_modal_data,
+         mm_processor_kwargs) = prompt_comps
+
+        prompt_token_ids = self._apply_prompt_adapter(
+            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
+
+        return LLMInputs(prompt_token_ids=prompt_token_ids,
+                         prompt=prompt,
+                         multi_modal_data=multi_modal_data,
+                         mm_processor_kwargs=mm_processor_kwargs)
+
+    def _process_decoder_only_prompt(
+        self,
+        prompt: SingletonPrompt,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> LLMInputs:
+        '''
+        For decoder-only models:
+        Process an input prompt into an :class:`LLMInputs` instance.
+
+        Arguments:
+
+        * prompt: input prompt
+        * request_id
+        * lora_request
+        * prompt_adapter_request
+
+        Returns:
+
+        * :class:`LLMInputs` instance
+        '''
+
+        prompt_comps = self._extract_prompt_components(
+            prompt,
+            request_id=request_id,
+            lora_request=lora_request,
+        )
+
+        return self._build_decoder_only_llm_inputs(
+            prompt_comps,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    async def _process_decoder_only_prompt_async(
+        self,
+        prompt: SingletonPrompt,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> LLMInputs:
+        """Async version of :meth:`_process_decoder_only_prompt`."""
+        prompt_comps = await self._extract_prompt_components_async(
+            prompt,
+            request_id=request_id,
+            lora_request=lora_request,
+        )
+
+        return self._build_decoder_only_llm_inputs(
+            prompt_comps,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    def preprocess(
+        self,
+        prompt: PromptType,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
+        """Preprocess the input prompt."""
+        if self.is_encoder_decoder_model():
+            # Encoder-decoder model requires special mapping of
+            # input prompts to encoder & decoder
+            return self._process_encoder_decoder_prompt(
+                prompt,
+                request_id=request_id,
+            )
+
+        if is_explicit_encoder_decoder_prompt(prompt):
+            raise ValueError("Cannot pass encoder-decoder prompt "
+                             "to decoder-only models")
+
+        # Decoder-only operation
+        return self._process_decoder_only_prompt(
+            prompt,
+            request_id=request_id,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    async def preprocess_async(
+        self,
+        prompt: PromptType,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
+        """Async version of :meth:`preprocess`."""
+        if self.is_encoder_decoder_model():
+            # Encoder-decoder model requires special mapping of
+            # input prompts to encoder & decoder
+            return await self._process_encoder_decoder_prompt_async(
+                prompt,
+                request_id=request_id,
+            )
+
+        if is_explicit_encoder_decoder_prompt(prompt):
+            raise ValueError("Cannot pass encoder-decoder prompt "
+                             "to decoder-only models")
+
+        # Decoder-only operation
+        return await self._process_decoder_only_prompt_async(
+            prompt,
+            request_id=request_id,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+    def is_encoder_decoder_model(self):
+        """Return ``model_config.is_encoder_decoder_model`` unchanged."""
+        return self.model_config.is_encoder_decoder_model
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -0,0 +1,314 @@
+import functools
+from collections import UserDict
+from dataclasses import dataclass
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
+                    Protocol, Tuple, Type)
+
+from torch import nn
+from transformers import PretrainedConfig
+from typing_extensions import TypeVar
+
+from vllm.logger import init_logger
+from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once,
+                        resolve_mm_processor_kwargs)
+
+from .data import LLMInputs
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.multimodal import MultiModalDataDict, MultiModalRegistry
+    from vllm.sequence import SequenceData
+
+# Module-level logger, created through ``init_logger`` with this module's
+# name.
+logger = init_logger(__name__)
+
+# Type variable for the HuggingFace config class requested through
+# ``InputContext.get_hf_config``; it is bound to, and defaults to,
+# ``PretrainedConfig``.
+C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
+
+
+@dataclass(frozen=True)
+class InputContext:
+    """
+    Contains information about the model which may be used to
+    modify the inputs.
+    """
+
+    model_config: "ModelConfig"
+    """The configuration of the model."""
+
+    def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
+        """
+        Get the HuggingFace configuration
+        (:class:`transformers.PretrainedConfig`) of the model,
+        additionally checking its type.
+
+        Raises:
+            TypeError: If the model is not of the specified type.
+        """
+
+        hf_config = self.model_config.hf_config
+        if not isinstance(hf_config, hf_config_type):
+            raise TypeError("Invalid type of HuggingFace config. "
+                            f"Expected type: {hf_config_type}, but "
+                            f"found type: {type(hf_config)}")
+
+        return hf_config
+
+    def get_hf_image_processor_config(self) -> Dict[str, Any]:
+        """
+        Get the HuggingFace image processor configuration of the model.
+        """
+
+        return self.model_config.hf_image_processor_config
+
+
+# Type variable for the model class handed to the ``register_*`` decorators
+# of ``InputRegistry``, which return that class unchanged.
+N = TypeVar("N", bound=Type[nn.Module])
+
+
+class DummyDataFactory(Protocol):
+    """
+    Call signature of the factories that build dummy model inputs.
+
+    :class:`InputRegistry` stores callables of this shape per model class
+    and invokes them from :meth:`InputRegistry.dummy_data_for_profiling`.
+    """
+
+    def __call__(
+        self,
+        ctx: InputContext,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        **mm_processor_kwargs: Any,
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+        """
+        Create dummy data to be inputted into the model.
+
+        Args:
+            ctx: Context wrapping the model configuration.
+            seq_len: Number of tokens the dummy data is expected to reach.
+            mm_counts: Mapping from a multi-modal key to an item count.
+                :meth:`InputRegistry.dummy_data_for_profiling` passes the
+                per-prompt limits reported by the multi-modal registry.
+            **mm_processor_kwargs: Keyword overrides forwarded to the
+                factory (see the note below).
+
+        Returns:
+            A tuple of the dummy sequence data and the dummy multi-modal
+            data, where the latter may be ``None``.
+
+        Note:
+            :data:`InputProcessor` is not applied to the dummy data.
+
+            The :code:`mm_processor_kwargs` are overrides provided at
+            initialization time to values in the config whose values
+            may affect the number of tokens per instance.
+        """
+        ...
+
+
+class _MultiModalCounts(UserDict):
+    """
+    Wraps `mm_counts` for a more informative error message
+    when attempting to access a plugin that does not exist.
+    """
+
+    def __getitem__(self, key: str) -> int:
+        try:
+            return super().__getitem__(key)
+        except KeyError as exc:
+            msg = (f"There is no multi-modal plugin with the key: {key}. "
+                   f"Available keys: {set(self.keys())}")
+            raise KeyError(msg) from exc
+
+
+# Signature of an input processor: a callable that receives the
+# ``InputContext`` and the ``LLMInputs`` and returns ``LLMInputs``.
+# NOTE: ``InputRegistry.process_input`` additionally forwards the resolved
+# multi-modal processor kwargs as keyword arguments.
+InputProcessor = Callable[[InputContext, LLMInputs], LLMInputs]
+"""Preprocess the inputs to the model."""
+
+
+class InputRegistry:
+    """
+    A registry to dispatch data processing
+    according to the target model.
+    """
+
+    def __init__(self) -> None:
+        self._dummy_factories_by_model_type: Dict[Type[nn.Module],
+                                                  DummyDataFactory] = {}
+        self._dummy_encoder_factories_by_model_type: Dict[
+            Type[nn.Module], DummyDataFactory] = {}
+        self._input_processors_by_model_type: Dict[Type[nn.Module],
+                                                   InputProcessor] = {}
+
+    def _default_dummy_data_factory(
+        self,
+        ctx: InputContext,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+        """
+        The default dummy data factory represents the longest possible text
+        that can be inputted to the model.
+
+        Note:
+            :data:`InputProcessor` is not applied to the dummy data.
+        """
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        dummy_seq_data = SequenceData.from_token_counts((0, seq_len))
+        dummy_multi_modal_data = None
+
+        return dummy_seq_data, dummy_multi_modal_data
+
+    def register_dummy_data(self, factory: DummyDataFactory):
+        """
+        Register a dummy data factory to a model class.
+
+        During memory profiling, the provided function is invoked to create
+        dummy data to be inputted into the model. The resulting memory usage
+        should be an upper bound of what the model would use at inference time.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._dummy_factories_by_model_type:
+                logger.warning(
+                    "Model class %s already has dummy data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._dummy_factories_by_model_type[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def _get_dummy_data_factory(self, model_cls: Type[nn.Module]):
+        return self._dummy_factories_by_model_type \
+            .get(model_cls, self._default_dummy_data_factory)
+
+    def register_dummy_encoder_data(self, factory: DummyDataFactory):
+        """
+        Register a dummy encoder data factory to a model class
+
+        This is similar to :meth:`~register_dummy_data`, but for encoder input.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._dummy_encoder_factories_by_model_type:
+                logger.warning(
+                    "Model class %s already has dummy encoder data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._dummy_encoder_factories_by_model_type[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def _get_dummy_encoder_data_factory(self, model_cls: Type[nn.Module]):
+        return self._dummy_encoder_factories_by_model_type \
+            .get(model_cls, self._default_dummy_data_factory)
+
+    def dummy_data_for_profiling(
+        self,
+        model_config: "ModelConfig",
+        seq_len: int,
+        mm_registry: "MultiModalRegistry",
+        is_encoder_data: bool = False,
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
+        """
+        Create dummy data for profiling the memory usage of a model.
+
+        The model is identified by ``model_config``.
+
+        See also:
+            :ref:`enabling_multimodal_inputs`
+
+        Note:
+            This should be called after
+            :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+        if is_encoder_data:
+            dummy_factory = self._get_dummy_encoder_data_factory(model_cls)
+        else:
+            dummy_factory = self._get_dummy_data_factory(model_cls)
+        mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
+        mm_processor_kwargs = get_allowed_kwarg_only_overrides(
+            dummy_factory, overrides=model_config.mm_processor_kwargs)
+
+        seq_data, mm_data = dummy_factory(InputContext(model_config), seq_len,
+                                          _MultiModalCounts(mm_counts),
+                                          **mm_processor_kwargs)
+
+        # Having more tokens is over-conservative but otherwise fine
+        num_tokens = seq_data.prompt_token_ids
+        if len(num_tokens) < seq_len:
+            if is_encoder_data:
+                print_warning_once(
+                    f"Expected at least {seq_len} dummy encoder tokens for "
+                    f"profiling, but found {len(num_tokens)} tokens instead.")
+            else:
+                raise AssertionError(
+                    f"Expected at least {seq_len} dummy tokens for profiling, "
+                    f"but found {len(num_tokens)} tokens instead.")
+        if mm_data is not None:
+            for k, v in mm_data.items():
+                num_items = len(v) if isinstance(v, list) else 1
+                num_expected = mm_counts[k]
+                assert num_items >= num_expected, (
+                    f"Expected at least {num_expected} dummy '{k}' instances "
+                    f"for profiling, but found {num_items} instances instead.")
+
+        return seq_data, mm_data
+
+    def _default_input_processor(self, ctx: InputContext,
+                                 inputs: LLMInputs) -> LLMInputs:
+        """The default input processor is a no-op."""
+        return inputs
+
+    def register_input_processor(self, processor: InputProcessor):
+        """
+        Register an input processor to a model class.
+
+        The provided function is invoked on each input to the model. This
+        happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
+
+        See also:
+            :ref:`input_processing_pipeline`
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._input_processors_by_model_type:
+                logger.warning(
+                    "Model class %s already has input processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._input_processors_by_model_type[model_cls] = processor
+
+            return model_cls
+
+        return wrapper
+
+    def _get_model_input_processor(self, model_cls: Type[nn.Module]):
+        return self._input_processors_by_model_type \
+            .get(model_cls, self._default_input_processor)
+
+    def process_input(self, model_config: "ModelConfig",
+                      inputs: LLMInputs) -> LLMInputs:
+        """
+        Apply an input processor to an instance of model inputs.
+
+        The model is identified by ``model_config``.
+
+        See also:
+            :ref:`input_processing_pipeline`
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+        processor = self._get_model_input_processor(model_cls)
+
+        # Handle multimodal processor kwargs with priority:
+        #     Inference kwargs -> Init kwargs -> {}
+        # If it's empty, it'll fall back to the default kwarg values
+        mm_processor_kwargs = resolve_mm_processor_kwargs(
+            model_config.mm_processor_kwargs,
+            inputs.get("mm_processor_kwargs"),
+            processor,
+        )
+
+        return processor(InputContext(model_config), inputs,
+                         **mm_processor_kwargs)
+
+    def create_input_processor(self, model_config: "ModelConfig"):
+        """
+        Create an input processor (see :meth:`_process_input`) for a
+        specific model.
+        """
+        return functools.partial(self.process_input, model_config)