root
2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletion

vllm/inputs/__init__.py Normal file

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import (
DataPrompt,
DecoderOnlyInputs,
EmbedsInputs,
EmbedsPrompt,
EncoderDecoderInputs,
ExplicitEncoderDecoderPrompt,
ProcessorInputs,
PromptType,
SingletonInputs,
SingletonPrompt,
TextPrompt,
TokenInputs,
TokensPrompt,
embeds_inputs,
token_inputs,
)
__all__ = [
"DataPrompt",
"TextPrompt",
"TokensPrompt",
"PromptType",
"SingletonPrompt",
"ExplicitEncoderDecoderPrompt",
"TokenInputs",
"EmbedsInputs",
"EmbedsPrompt",
"token_inputs",
"embeds_inputs",
"DecoderOnlyInputs",
"EncoderDecoderInputs",
"ProcessorInputs",
"SingletonInputs",
]
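
A quick usage sketch for the surface re-exported above (illustrative only; assumes this commit's module layout):

from vllm.inputs import TextPrompt, TokensPrompt, token_inputs

text_prompt = TextPrompt(prompt="Hello, world!")
tokens_prompt = TokensPrompt(prompt_token_ids=[1, 2, 3])
inputs = token_inputs([1, 2, 3], prompt="Hello, world!")
assert inputs["type"] == "token"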

vllm/inputs/data.py Normal file

@@ -0,0 +1,411 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Literal, TypeAlias
import torch
from typing_extensions import NotRequired, TypedDict, assert_never
if TYPE_CHECKING:
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalEncDecInputs,
MultiModalInputs,
MultiModalUUIDDict,
)
else:
MultiModalDataDict = object
MultiModalEncDecInputs = object
MultiModalInputs = object
MultiModalUUIDDict = object
# Inputs to LLM API
class _PromptOptions(TypedDict):
"""
Additional options available to all
[`SingletonPrompt`][vllm.inputs.data.SingletonPrompt].
"""
multi_modal_data: NotRequired[MultiModalDataDict | None]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs: NotRequired[dict[str, Any] | None]
"""
    Optional multi-modal processor kwargs to be forwarded to the
    multi-modal input mapper & processor. Note that if multiple modalities
    have registered mappers, etc. for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
"""
multi_modal_uuids: NotRequired[MultiModalUUIDDict]
"""
Optional user-specified UUIDs for multimodal items, mapped by modality.
Lists must match the number of items per modality and may contain `None`.
For `None` entries, the hasher will compute IDs automatically; non-None
entries override the default hashes for caching, and MUST be unique per
multimodal item.
"""
cache_salt: NotRequired[str]
"""
Optional cache salt to be used for prefix caching.
"""
class TextPrompt(_PromptOptions):
"""Schema for a text prompt."""
prompt: str
"""The input text to be tokenized before passing to the model."""
class TokensPrompt(_PromptOptions):
"""Schema for a tokenized prompt."""
prompt_token_ids: list[int]
"""A list of token IDs to pass to the model."""
prompt: NotRequired[str]
"""The prompt text corresponding to the token IDs, if available."""
token_type_ids: NotRequired[list[int]]
"""A list of token type IDs to pass to the cross encoder model."""
class EmbedsPrompt(_PromptOptions):
"""Schema for a prompt provided via token embeddings."""
prompt_embeds: torch.Tensor
"""The embeddings of the prompt."""
prompt: NotRequired[str]
"""The prompt text corresponding to the token embeddings, if available."""
DecoderOnlyPrompt: TypeAlias = (
str | TextPrompt | list[int] | TokensPrompt | EmbedsPrompt
)
"""
Schema of a prompt for a decoder-only model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
For encoder-decoder models, passing a singleton prompt is shorthand for passing
`ExplicitEncoderDecoderPrompt(encoder_prompt=prompt, decoder_prompt=None)`.
"""
EncoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
"""
Schema of a prompt for the encoder part of an encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
"""
DecoderPrompt: TypeAlias = str | TextPrompt | list[int] | TokensPrompt
"""
Schema of a prompt for the decoder part of an encoder-decoder model:
- A text prompt (string or [`TextPrompt`][vllm.inputs.data.TextPrompt])
- A tokenized prompt (list of token IDs, or
[`TokensPrompt`][vllm.inputs.data.TokensPrompt])
Note:
Multi-modal inputs are not supported for decoder prompts.
"""
class ExplicitEncoderDecoderPrompt(TypedDict):
"""
Schema for a pair of encoder and decoder singleton prompts.
Note:
This schema is not valid for decoder-only models.
"""
encoder_prompt: EncoderPrompt
"""The prompt for the encoder part of the model."""
decoder_prompt: DecoderPrompt | None
"""
The prompt for the decoder part of the model.
Passing `None` will cause the prompt to be inferred automatically.
"""
EncoderDecoderPrompt: TypeAlias = EncoderPrompt | ExplicitEncoderDecoderPrompt
"""
Schema for a prompt for an encoder-decoder model.
You can pass a singleton encoder prompt, in which case the decoder prompt is
considered to be `None` (i.e., infer automatically).
"""
SingletonPrompt: TypeAlias = DecoderOnlyPrompt | EncoderPrompt | DecoderPrompt
"""
Schema for a single prompt. This is as opposed to a data structure
which encapsulates multiple prompts, such as
[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt].
"""
PromptType: TypeAlias = DecoderOnlyPrompt | EncoderDecoderPrompt
"""
Schema for any prompt, regardless of model type.
This is the input format accepted by most [`LLM`][vllm.entrypoints.llm.LLM] APIs.
"""
class DataPrompt(_PromptOptions):
"""
Represents generic inputs that are converted to
[`PromptType`][vllm.inputs.data.PromptType] by IO processor plugins.
"""
data: Any
"""The input data."""
data_format: str
"""The input data format."""
# Outputs of processor
class _InputOptions(TypedDict):
"""
Additional options available to all input types.
"""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
cache_salt: NotRequired[str]
"""Optional cache salt to be used for prefix caching."""
class TokenInputs(_InputOptions):
"""Represents token-based inputs."""
type: Literal["token"]
"""The type of inputs."""
prompt_token_ids: list[int]
"""The token IDs of the prompt."""
prompt: NotRequired[str]
"""The prompt text corresponding to the token IDs, if available."""
def token_inputs(
prompt_token_ids: list[int],
*,
prompt: str | None = None,
cache_salt: str | None = None,
) -> TokenInputs:
"""Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None:
inputs["prompt"] = prompt
if cache_salt is not None:
inputs["cache_salt"] = cache_salt
return inputs
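# Example (illustrative):
#
#     >>> token_inputs([1, 2, 3], prompt="abc", cache_salt="s")
#     {'type': 'token', 'prompt_token_ids': [1, 2, 3], 'prompt': 'abc',
#      'cache_salt': 's'}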
class EmbedsInputs(_InputOptions):
"""Represents embeddings-based inputs."""
type: Literal["embeds"]
"""The type of inputs."""
prompt_embeds: torch.Tensor
"""The embeddings of the prompt."""
prompt: NotRequired[str]
"""The prompt text corresponding to the token IDs, if available."""
def embeds_inputs(
prompt_embeds: torch.Tensor,
*,
prompt: str | None = None,
cache_salt: str | None = None,
) -> EmbedsInputs:
"""Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
values."""
inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)
if prompt is not None:
inputs["prompt"] = prompt
if cache_salt is not None:
inputs["cache_salt"] = cache_salt
return inputs
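# Example (illustrative): `embeds_inputs` mirrors `token_inputs`, but wraps a
# (seq_len, hidden_size) tensor of precomputed embeddings:
#
#     >>> embeds_inputs(torch.zeros(3, 8))["type"]
#     'embeds'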
DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for decoder-only models.
"""
EncoderInputs: TypeAlias = TokenInputs | MultiModalEncDecInputs
"""
A processed encoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
DecoderInputs: TypeAlias = TokenInputs | MultiModalInputs
"""
A processed decoder prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
class EncoderDecoderInputs(TypedDict):
"""
    A processed pair of encoder and decoder singleton prompts from
    [`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor]
for encoder-decoder models.
"""
type: Literal["enc_dec"]
encoder_prompt: EncoderInputs
"""The inputs for the encoder portion."""
decoder_prompt: DecoderInputs
"""The inputs for the decoder portion."""
arrival_time: NotRequired[float]
"""The time when the input was received (before rendering)."""
ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs
"""
A processed prompt from
[`InputPreprocessor`][vllm.inputs.preprocess.InputPreprocessor]
which can be passed to
[`InputProcessor`][vllm.v1.engine.input_processor.InputProcessor].
"""
SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs
"""The inputs for a single encoder/decoder prompt."""
def _validate_enc_inputs(inputs: SingletonInputs) -> EncoderInputs:
if inputs["type"] == "embeds":
raise ValueError(
"Embedding inputs are not supported for encoder-decoder models"
)
if inputs["type"] == "multimodal" and "encoder_prompt_token_ids" not in inputs:
raise RuntimeError(
"You should register an encoder-decoder multi-modal processor "
"for encoder-decoder models."
)
return inputs # type: ignore[return-value]
def _validate_dec_inputs(inputs: SingletonInputs) -> DecoderInputs:
if inputs["type"] == "embeds":
raise ValueError(
"Embedding inputs are not supported for encoder-decoder models"
)
return inputs
def _prepare_decoder_input_ids_for_generation(
decoder_input_ids: list[int],
decoder_start_token_id: int,
) -> list[int]:
"""
Prepare `decoder_input_ids` for generation with encoder-decoder models,
according to `GenerationMixin._prepare_decoder_input_ids_for_generation()`.
Source:
https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/generation/utils.py
"""
if len(decoder_input_ids) == 0 or decoder_input_ids[0] != decoder_start_token_id:
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
return decoder_input_ids
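# Worked example (illustrative), with decoder_start_token_id = 0:
#
#     []        -> [0]
#     [5, 6]    -> [0, 5, 6]
#     [0, 5, 6] -> [0, 5, 6]  (already starts with the start token)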
def build_enc_dec_inputs(
encoder_inputs: SingletonInputs,
decoder_inputs: SingletonInputs | None,
decoder_start_token_id: int,
) -> EncoderDecoderInputs:
enc_inputs = _validate_enc_inputs(encoder_inputs)
if decoder_inputs is None:
dec_inputs: DecoderInputs = enc_inputs
else:
dec_inputs = _validate_dec_inputs(decoder_inputs)
enc_inputs_new: EncoderInputs
dec_inputs_new: DecoderInputs
if enc_inputs["type"] == "multimodal":
from vllm.multimodal.inputs import mm_inputs
enc_inputs_new = token_inputs(
enc_inputs["encoder_prompt_token_ids"],
prompt=enc_inputs.get("encoder_prompt"),
)
dec_inputs_new = mm_inputs(
prompt_token_ids=dec_inputs["prompt_token_ids"],
prompt=dec_inputs.get("prompt"),
mm_kwargs=enc_inputs["mm_kwargs"],
mm_hashes=enc_inputs["mm_hashes"],
mm_placeholders=enc_inputs["mm_placeholders"],
)
elif enc_inputs["type"] == "token":
enc_inputs_new = token_inputs(prompt_token_ids=[])
dec_inputs_new = dec_inputs
else:
assert_never(enc_inputs)
dec_inputs_new["prompt_token_ids"] = _prepare_decoder_input_ids_for_generation(
dec_inputs_new["prompt_token_ids"],
decoder_start_token_id,
)
if cache_salt := enc_inputs.get("cache_salt"):
dec_inputs_new["cache_salt"] = cache_salt
return EncoderDecoderInputs(
type="enc_dec",
encoder_prompt=enc_inputs_new,
decoder_prompt=dec_inputs_new,
)
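
To make the control flow of `build_enc_dec_inputs` concrete, here is a minimal sketch on plain token inputs (illustrative; the token IDs and the start token ID are made up):

enc = token_inputs([5, 6, 7])
dec = token_inputs([42, 43])
out = build_enc_dec_inputs(enc, dec, decoder_start_token_id=0)
assert out["type"] == "enc_dec"
assert out["encoder_prompt"]["prompt_token_ids"] == []  # cleared for token inputs
assert out["decoder_prompt"]["prompt_token_ids"] == [0, 42, 43]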

vllm/inputs/parse.py Normal file

@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .data import ProcessorInputs, SingletonInputs
def split_enc_dec_inputs(
inputs: ProcessorInputs,
) -> tuple[SingletonInputs | None, SingletonInputs]:
    """Split processor inputs into their encoder and decoder parts;
    the encoder part is `None` for decoder-only inputs."""
if inputs["type"] == "enc_dec":
return inputs["encoder_prompt"], inputs["decoder_prompt"]
return None, inputs
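
Usage sketch (illustrative), pairing `split_enc_dec_inputs` with the constructors from `data.py`:

from vllm.inputs.data import build_enc_dec_inputs, token_inputs
from vllm.inputs.parse import split_enc_dec_inputs

enc_dec = build_enc_dec_inputs(
    token_inputs([5]), token_inputs([7]), decoder_start_token_id=0
)
enc, dec = split_enc_dec_inputs(enc_dec)  # encoder part, decoder part
none_enc, only = split_enc_dec_inputs(token_inputs([5]))  # decoder-only: none_enc is None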

vllm/inputs/preprocess.py Normal file

@@ -0,0 +1,320 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping
from typing import Any, overload
from typing_extensions import assert_never
from vllm.config import VllmConfig
from vllm.inputs.data import build_enc_dec_inputs
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalInputs,
MultiModalUUIDDict,
)
from vllm.renderers import BaseRenderer, renderer_from_config
from vllm.renderers.inputs import (
DecoderDictPrompt,
DecoderOnlyDictPrompt,
EncoderDecoderDictPrompt,
EncoderDictPrompt,
SingletonDictPrompt,
)
from vllm.renderers.inputs.preprocess import parse_dec_only_prompt, parse_enc_dec_prompt
from vllm.tokenizers import TokenizerLike
from .data import (
DecoderInputs,
DecoderOnlyInputs,
EmbedsInputs,
EmbedsPrompt,
EncoderDecoderInputs,
EncoderInputs,
ProcessorInputs,
PromptType,
SingletonInputs,
TextPrompt,
TokenInputs,
TokensPrompt,
token_inputs,
)
logger = init_logger(__name__)
class InputPreprocessor:
def __init__(
self,
vllm_config: VllmConfig,
renderer: BaseRenderer | None = None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
super().__init__()
self.model_config = vllm_config.model_config
self.renderer = renderer or renderer_from_config(vllm_config)
self.mm_registry = mm_registry
@property
def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer
def get_tokenizer(self) -> TokenizerLike:
return self.renderer.get_tokenizer()
def _tokenize_prompt(
self,
prompt: str,
tokenization_kwargs: dict[str, Any] | None = None,
) -> list[int]:
"""
Apply the model's tokenizer to a text prompt, returning the
corresponding token IDs.
"""
renderer = self.renderer
tok_params = renderer.default_cmpl_tok_params.with_kwargs(
**(tokenization_kwargs or {})
)
tok_prompt = renderer._tokenize_singleton_prompt(
TextPrompt(prompt=prompt),
tok_params,
)
return tok_prompt["prompt_token_ids"]
def _process_multimodal(
self,
prompt: str | list[int],
mm_data: MultiModalDataDict,
mm_processor_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
*,
mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata.
"""
return self.renderer._process_multimodal(
prompt,
mm_data,
mm_uuids=mm_uuids,
mm_processor_kwargs=mm_processor_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
def _process_embeds(
self,
parsed_content: EmbedsPrompt,
) -> EmbedsInputs:
return self.renderer._process_embeds(parsed_content)
def _truncate_inputs(
self, inputs: list[int], tokenization_kwargs: dict[str, Any] | None = None
) -> list[int]:
renderer = self.renderer
tok_params = renderer.default_cmpl_tok_params.with_kwargs(
**(tokenization_kwargs or {})
)
tok_prompt = renderer._tokenize_singleton_prompt(
TokensPrompt(prompt_token_ids=inputs),
tok_params,
)
return tok_prompt["prompt_token_ids"]
def _process_tokens(
self,
parsed_content: TokensPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs:
prompt_token_ids = self._truncate_inputs(
parsed_content["prompt_token_ids"], tokenization_kwargs
)
inputs: TokenInputs | MultiModalInputs
if multi_modal_data := parsed_content.get("multi_modal_data"):
inputs = self._process_multimodal(
prompt_token_ids,
multi_modal_data,
parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs,
mm_uuids=parsed_content.get("multi_modal_uuids"),
)
else:
inputs = token_inputs(prompt_token_ids)
if prompt_text := parsed_content.get("prompt"):
inputs["prompt"] = prompt_text
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
return inputs
def _process_text(
self,
parsed_content: TextPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> TokenInputs | MultiModalInputs:
prompt_text = parsed_content["prompt"]
inputs: TokenInputs | MultiModalInputs
if multi_modal_data := parsed_content.get("multi_modal_data"):
            inputs = self._process_multimodal(
                prompt_text,
                multi_modal_data,
                parsed_content.get("mm_processor_kwargs"),
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=parsed_content.get("multi_modal_uuids"),
            )
else:
prompt_token_ids = self._tokenize_prompt(
prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
inputs = token_inputs(prompt_token_ids)
inputs["prompt"] = prompt_text
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
return inputs
@overload
def _prompt_to_llm_inputs(
self,
prompt: EncoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderInputs: ...
@overload
def _prompt_to_llm_inputs( # type: ignore[misc]
self,
prompt: DecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderInputs: ...
@overload
def _prompt_to_llm_inputs( # type: ignore[misc]
self,
prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs: ...
def _prompt_to_llm_inputs(
self,
prompt: SingletonDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> SingletonInputs:
"""
Extract the singleton inputs from a prompt.
Arguments:
* prompt: single encoder or decoder input prompt
Returns:
* [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
"""
if "prompt_embeds" in prompt:
return self._process_embeds(prompt) # type: ignore[arg-type]
if "prompt_token_ids" in prompt:
return self._process_tokens(prompt) # type: ignore[arg-type]
if "prompt" in prompt:
return self._process_text(
prompt, # type: ignore[arg-type]
tokenization_kwargs=tokenization_kwargs,
)
assert_never(prompt) # type: ignore[arg-type]
def _process_encoder_decoder_prompt(
self,
prompt: EncoderDecoderDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> EncoderDecoderInputs:
"""
For encoder/decoder models only:
Process an input prompt into an
[`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance.
Arguments:
* prompt: an input prompt
Returns:
* [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
instance
"""
encoder_prompt = prompt["encoder_prompt"]
decoder_prompt = prompt["decoder_prompt"]
return build_enc_dec_inputs(
encoder_inputs=self._prompt_to_llm_inputs(
encoder_prompt,
tokenization_kwargs=tokenization_kwargs,
),
decoder_inputs=(
None
if decoder_prompt is None
else self._prompt_to_llm_inputs(
decoder_prompt,
tokenization_kwargs=tokenization_kwargs,
)
),
decoder_start_token_id=self.renderer.get_dec_start_token_id(),
)
def _process_decoder_only_prompt(
self,
prompt: DecoderOnlyDictPrompt,
tokenization_kwargs: dict[str, Any] | None = None,
) -> DecoderOnlyInputs:
"""
For decoder-only models:
Process an input prompt into a
[`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
Arguments:
* prompt: input prompt
Returns:
* [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
"""
return self._prompt_to_llm_inputs(
prompt,
tokenization_kwargs=tokenization_kwargs,
)
def preprocess(
self,
prompt: PromptType,
tokenization_kwargs: dict[str, Any] | None = None,
) -> ProcessorInputs:
"""Preprocess the input prompt."""
if self.model_config.is_encoder_decoder:
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder.
return self._process_encoder_decoder_prompt(
parse_enc_dec_prompt(prompt),
tokenization_kwargs,
)
return self._process_decoder_only_prompt(
parse_dec_only_prompt(prompt),
tokenization_kwargs=tokenization_kwargs,
)