Iluvatar-mrv100 SDK 4.3.0
This commit is contained in:
39
vllm/inputs/__init__.py
Normal file
39
vllm/inputs/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .data import (DecoderOnlyInputs, EncoderDecoderInputs,
|
||||
ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
|
||||
SingletonInputs, SingletonInputsAdapter, SingletonPrompt,
|
||||
TextPrompt, TokenInputs, TokensPrompt,
|
||||
build_explicit_enc_dec_prompt, to_enc_dec_tuple_list,
|
||||
token_inputs, zip_enc_dec_prompts)
|
||||
from .registry import (DummyData, InputContext, InputProcessingContext,
|
||||
InputRegistry)
|
||||
|
||||
INPUT_REGISTRY = InputRegistry()
|
||||
"""
|
||||
The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
|
||||
to dispatch data processing according to the target model.
|
||||
"""
|
||||
|
||||
__all__ = [
|
||||
"TextPrompt",
|
||||
"TokensPrompt",
|
||||
"PromptType",
|
||||
"SingletonPrompt",
|
||||
"ExplicitEncoderDecoderPrompt",
|
||||
"TokenInputs",
|
||||
"token_inputs",
|
||||
"DecoderOnlyInputs",
|
||||
"EncoderDecoderInputs",
|
||||
"ProcessorInputs",
|
||||
"SingletonInputs",
|
||||
"SingletonInputsAdapter",
|
||||
"build_explicit_enc_dec_prompt",
|
||||
"to_enc_dec_tuple_list",
|
||||
"zip_enc_dec_prompts",
|
||||
"INPUT_REGISTRY",
|
||||
"DummyData",
|
||||
"InputContext",
|
||||
"InputProcessingContext",
|
||||
"InputRegistry",
|
||||
]
|
||||
405
vllm/inputs/data.py
Normal file
405
vllm/inputs/data.py
Normal file
@@ -0,0 +1,405 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
|
||||
|
||||
import torch
|
||||
from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
|
||||
MultiModalPlaceholderDict)
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
|
||||
|
||||
class TextPrompt(TypedDict):
|
||||
"""Schema for a text prompt."""
|
||||
|
||||
prompt: str
|
||||
"""The input text to be tokenized before passing to the model."""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
"""
|
||||
|
||||
mm_processor_kwargs: NotRequired[dict[str, Any]]
|
||||
"""
|
||||
Optional multi-modal processor kwargs to be forwarded to the
|
||||
multimodal input mapper & processor. Note that if multiple modalities
|
||||
have registered mappers etc for the model being considered, we attempt
|
||||
to pass the mm_processor_kwargs to each of them.
|
||||
"""
|
||||
|
||||
|
||||
class TokensPrompt(TypedDict):
|
||||
"""Schema for a tokenized prompt."""
|
||||
|
||||
prompt_token_ids: list[int]
|
||||
"""A list of token IDs to pass to the model."""
|
||||
|
||||
token_type_ids: NotRequired[list[int]]
|
||||
"""A list of token type IDs to pass to the cross encoder model."""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
"""
|
||||
|
||||
mm_processor_kwargs: NotRequired[dict[str, Any]]
|
||||
"""
|
||||
Optional multi-modal processor kwargs to be forwarded to the
|
||||
multimodal input mapper & processor. Note that if multiple modalities
|
||||
have registered mappers etc for the model being considered, we attempt
|
||||
to pass the mm_processor_kwargs to each of them.
|
||||
"""
|
||||
|
||||
|
||||
SingletonPrompt = Union[str, TextPrompt, TokensPrompt]
|
||||
"""
|
||||
Set of possible schemas for a single prompt:
|
||||
|
||||
- A text prompt (:class:`str` or :class:`TextPrompt`)
|
||||
- A tokenized prompt (:class:`TokensPrompt`)
|
||||
|
||||
Note that "singleton" is as opposed to a data structure
|
||||
which encapsulates multiple prompts, i.e. of the sort
|
||||
which may be utilized for encoder/decoder models when
|
||||
the user desires to express both the encoder & decoder
|
||||
prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt`
|
||||
|
||||
A prompt of type :class:`SingletonPrompt` may be employed
|
||||
as (1) input to a decoder-only model, (2) input to
|
||||
the encoder of an encoder/decoder model, in the scenario
|
||||
where the decoder-prompt is not specified explicitly, or
|
||||
(3) as a member of a larger data structure encapsulating
|
||||
more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt`
|
||||
"""
|
||||
|
||||
_T1_co = TypeVar("_T1_co",
|
||||
bound=SingletonPrompt,
|
||||
default=SingletonPrompt,
|
||||
covariant=True)
|
||||
_T2_co = TypeVar("_T2_co",
|
||||
bound=SingletonPrompt,
|
||||
default=SingletonPrompt,
|
||||
covariant=True)
|
||||
|
||||
|
||||
# TODO: Make fields ReadOnly once mypy supports it
|
||||
class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
|
||||
"""
|
||||
Represents an encoder/decoder model input prompt,
|
||||
comprising an explicit encoder prompt and a decoder prompt.
|
||||
|
||||
The encoder and decoder prompts, respectively, may be formatted
|
||||
according to any of the :class:`SingletonPrompt` schemas,
|
||||
and are not required to have the same schema.
|
||||
|
||||
Only the encoder prompt may have multi-modal data. mm_processor_kwargs
|
||||
should be at the top-level, and should not be set in the encoder/decoder
|
||||
prompts, since they are agnostic to the encoder/decoder.
|
||||
|
||||
Note that an :class:`ExplicitEncoderDecoderPrompt` may not
|
||||
be used as an input to a decoder-only model,
|
||||
and that the :code:`encoder_prompt` and :code:`decoder_prompt`
|
||||
fields of this data structure themselves must be
|
||||
:class:`SingletonPrompt` instances.
|
||||
"""
|
||||
|
||||
encoder_prompt: _T1_co
|
||||
|
||||
decoder_prompt: Optional[_T2_co]
|
||||
|
||||
mm_processor_kwargs: NotRequired[dict[str, Any]]
|
||||
|
||||
|
||||
PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
|
||||
"""
|
||||
Set of possible schemas for an LLM input, including
|
||||
both decoder-only and encoder/decoder input types:
|
||||
|
||||
- A text prompt (:class:`str` or :class:`TextPrompt`)
|
||||
- A tokenized prompt (:class:`TokensPrompt`)
|
||||
- A single data structure containing both an encoder and a decoder prompt
|
||||
(:class:`ExplicitEncoderDecoderPrompt`)
|
||||
"""
|
||||
|
||||
|
||||
class TokenInputs(TypedDict):
|
||||
"""Represents token-based inputs."""
|
||||
|
||||
type: Literal["token"]
|
||||
"""The type of inputs."""
|
||||
|
||||
prompt_token_ids: list[int]
|
||||
"""The token IDs of the prompt."""
|
||||
|
||||
token_type_ids: NotRequired[list[int]]
|
||||
"""The token type IDs of the prompt."""
|
||||
|
||||
prompt: NotRequired[str]
|
||||
"""
|
||||
The original prompt text corresponding to the token IDs, if available.
|
||||
"""
|
||||
|
||||
multi_modal_data: NotRequired["MultiModalDataDict"]
|
||||
"""
|
||||
Optional multi-modal data to pass to the model,
|
||||
if the model supports it.
|
||||
"""
|
||||
|
||||
multi_modal_inputs: NotRequired["MultiModalKwargs"]
|
||||
"""
|
||||
Optional multi-modal inputs to pass to the model,
|
||||
if the model supports it.
|
||||
"""
|
||||
|
||||
multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
|
||||
"""
|
||||
Placeholder ranges for the multi-modal data.
|
||||
"""
|
||||
|
||||
multi_modal_hashes: NotRequired[list[str]]
|
||||
"""
|
||||
The hashes of the multi-modal data.
|
||||
"""
|
||||
|
||||
mm_processor_kwargs: NotRequired[dict[str, Any]]
|
||||
"""
|
||||
Optional multi-modal processor kwargs to be forwarded to the
|
||||
multimodal input mapper & processor. Note that if multiple modalities
|
||||
have registered mappers etc for the model being considered, we attempt
|
||||
to pass the mm_processor_kwargs to each of them.
|
||||
"""
|
||||
|
||||
|
||||
def token_inputs(
|
||||
prompt_token_ids: list[int],
|
||||
token_type_ids: Optional[list[int]] = None,
|
||||
prompt: Optional[str] = None,
|
||||
multi_modal_data: Optional["MultiModalDataDict"] = None,
|
||||
multi_modal_inputs: Optional["MultiModalKwargs"] = None,
|
||||
multi_modal_hashes: Optional[list[str]] = None,
|
||||
multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> TokenInputs:
|
||||
"""Construct :class:`TokenInputs` from optional values."""
|
||||
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
|
||||
|
||||
if prompt is not None:
|
||||
inputs["prompt"] = prompt
|
||||
if token_type_ids is not None:
|
||||
inputs["token_type_ids"] = token_type_ids
|
||||
if multi_modal_data is not None:
|
||||
inputs["multi_modal_data"] = multi_modal_data
|
||||
if multi_modal_inputs is not None:
|
||||
inputs["multi_modal_inputs"] = multi_modal_inputs
|
||||
if multi_modal_hashes is not None:
|
||||
inputs["multi_modal_hashes"] = multi_modal_hashes
|
||||
if multi_modal_placeholders is not None:
|
||||
inputs["multi_modal_placeholders"] = multi_modal_placeholders
|
||||
if mm_processor_kwargs is not None:
|
||||
inputs["mm_processor_kwargs"] = mm_processor_kwargs
|
||||
|
||||
return inputs
|
||||
|
||||
|
||||
DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
|
||||
"""
|
||||
The inputs in :class:`~vllm.LLMEngine` before they are
|
||||
passed to the model executor.
|
||||
This specifies the data required for decoder-only models.
|
||||
"""
|
||||
|
||||
|
||||
class EncoderDecoderInputs(TypedDict):
|
||||
"""
|
||||
The inputs in :class:`~vllm.LLMEngine` before they are
|
||||
passed to the model executor.
|
||||
|
||||
This specifies the required data for encoder-decoder models.
|
||||
"""
|
||||
encoder: Union[TokenInputs, "MultiModalInputs"]
|
||||
"""The inputs for the encoder portion."""
|
||||
|
||||
decoder: Union[TokenInputs, "MultiModalInputs"]
|
||||
"""The inputs for the decoder portion."""
|
||||
|
||||
|
||||
SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
|
||||
"""
|
||||
A processed :class:`SingletonPrompt` which can be passed to
|
||||
:class:`vllm.sequence.Sequence`.
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class SingletonInputsAdapter:
|
||||
"""
|
||||
Unified interface to access the components of :class:`SingletonInputs`.
|
||||
"""
|
||||
inputs: SingletonInputs
|
||||
|
||||
@cached_property
|
||||
def prompt(self) -> Optional[str]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token" or inputs["type"] == "multimodal":
|
||||
return inputs.get("prompt")
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def prompt_token_ids(self) -> list[int]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token" or inputs["type"] == "multimodal":
|
||||
return inputs.get("prompt_token_ids", [])
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def token_type_ids(self) -> list[int]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token" or inputs["type"] == "multimodal":
|
||||
return inputs.get("token_type_ids", [])
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def prompt_embeds(self) -> Optional[torch.Tensor]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token" or inputs["type"] == "multimodal":
|
||||
return None
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def multi_modal_data(self) -> "MultiModalDataDict":
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token":
|
||||
return inputs.get("multi_modal_data", {})
|
||||
|
||||
if inputs["type"] == "multimodal":
|
||||
return inputs.get("mm_kwargs", {})
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def multi_modal_inputs(self) -> Union[dict, "MultiModalKwargs"]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token":
|
||||
return inputs.get("multi_modal_inputs", {})
|
||||
|
||||
if inputs["type"] == "multimodal":
|
||||
return inputs.get("mm_kwargs", {})
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def multi_modal_hashes(self) -> list[str]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token":
|
||||
return inputs.get("multi_modal_hashes", [])
|
||||
|
||||
if inputs["type"] == "multimodal":
|
||||
# only the case when we use MultiModalInputs
|
||||
return inputs.get("mm_hashes", []) # type: ignore[return-value]
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token":
|
||||
return inputs.get("multi_modal_placeholders", {})
|
||||
|
||||
if inputs["type"] == "multimodal":
|
||||
return inputs.get("mm_placeholders", {})
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
@cached_property
|
||||
def mm_processor_kwargs(self) -> dict[str, Any]:
|
||||
inputs = self.inputs
|
||||
|
||||
if inputs["type"] == "token":
|
||||
return inputs.get("mm_processor_kwargs", {})
|
||||
|
||||
if inputs["type"] == "multimodal":
|
||||
return {}
|
||||
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
|
||||
|
||||
ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
|
||||
"""
|
||||
The inputs to :data:`vllm.inputs.InputProcessor`.
|
||||
"""
|
||||
|
||||
_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
|
||||
_T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
|
||||
|
||||
|
||||
def build_explicit_enc_dec_prompt(
|
||||
encoder_prompt: _T1,
|
||||
decoder_prompt: Optional[_T2],
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
) -> ExplicitEncoderDecoderPrompt[_T1, _T2]:
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = {}
|
||||
return ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=encoder_prompt,
|
||||
decoder_prompt=decoder_prompt,
|
||||
mm_processor_kwargs=mm_processor_kwargs)
|
||||
|
||||
|
||||
def zip_enc_dec_prompts(
|
||||
enc_prompts: Iterable[_T1],
|
||||
dec_prompts: Iterable[Optional[_T2]],
|
||||
mm_processor_kwargs: Optional[Union[Iterable[dict[str, Any]],
|
||||
dict[str, Any]]] = None,
|
||||
) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
|
||||
"""
|
||||
Zip encoder and decoder prompts together into a list of
|
||||
:class:`ExplicitEncoderDecoderPrompt` instances.
|
||||
|
||||
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
|
||||
dictionary will be used for every encoder/decoder prompt. If an iterable is
|
||||
provided, it will be zipped with the encoder/decoder prompts.
|
||||
"""
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = cast(dict[str, Any], {})
|
||||
if isinstance(mm_processor_kwargs, dict):
|
||||
return [
|
||||
build_explicit_enc_dec_prompt(
|
||||
encoder_prompt, decoder_prompt,
|
||||
cast(dict[str, Any], mm_processor_kwargs))
|
||||
for (encoder_prompt,
|
||||
decoder_prompt) in zip(enc_prompts, dec_prompts)
|
||||
]
|
||||
return [
|
||||
build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
|
||||
mm_proc_kwargs)
|
||||
for (encoder_prompt, decoder_prompt, mm_proc_kwargs
|
||||
) in zip(enc_prompts, dec_prompts, mm_processor_kwargs)
|
||||
]
|
||||
|
||||
|
||||
def to_enc_dec_tuple_list(
|
||||
enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]],
|
||||
) -> list[tuple[_T1, Optional[_T2]]]:
|
||||
return [(enc_dec_prompt["encoder_prompt"],
|
||||
enc_dec_prompt["decoder_prompt"])
|
||||
for enc_dec_prompt in enc_dec_prompts]
|
||||
121
vllm/inputs/parse.py
Normal file
121
vllm/inputs/parse.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections.abc import Sequence
|
||||
from typing import Literal, Optional, TypedDict, Union, cast, overload
|
||||
|
||||
from typing_extensions import TypeIs
|
||||
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .data import (ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType,
|
||||
SingletonInputs, SingletonPrompt, TextPrompt, TokensPrompt)
|
||||
|
||||
|
||||
class ParsedText(TypedDict):
|
||||
content: str
|
||||
is_tokens: Literal[False]
|
||||
|
||||
|
||||
class ParsedTokens(TypedDict):
|
||||
content: list[int]
|
||||
is_tokens: Literal[True]
|
||||
|
||||
|
||||
@overload
|
||||
def parse_and_batch_prompt(
|
||||
prompt: Union[str, list[str]]) -> Sequence[ParsedText]:
|
||||
...
|
||||
|
||||
|
||||
@overload
|
||||
def parse_and_batch_prompt(
|
||||
prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]:
|
||||
...
|
||||
|
||||
|
||||
def parse_and_batch_prompt(
|
||||
prompt: Union[str, list[str], list[int], list[list[int]]],
|
||||
) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]:
|
||||
if isinstance(prompt, str):
|
||||
# case 1: a string
|
||||
return [ParsedText(content=prompt, is_tokens=False)]
|
||||
|
||||
if isinstance(prompt, list):
|
||||
if len(prompt) == 0:
|
||||
raise ValueError("please provide at least one prompt")
|
||||
|
||||
if is_list_of(prompt, str):
|
||||
# case 2: array of strings
|
||||
prompt = cast(list[str], prompt)
|
||||
return [
|
||||
ParsedText(content=elem, is_tokens=False) for elem in prompt
|
||||
]
|
||||
if is_list_of(prompt, int):
|
||||
# case 3: array of tokens
|
||||
prompt = cast(list[int], prompt)
|
||||
return [ParsedTokens(content=prompt, is_tokens=True)]
|
||||
if is_list_of(prompt, list):
|
||||
prompt = cast(list[list[int]], prompt)
|
||||
if len(prompt[0]) == 0:
|
||||
raise ValueError("please provide at least one prompt")
|
||||
|
||||
if is_list_of(prompt[0], int):
|
||||
# case 4: array of token arrays
|
||||
return [
|
||||
ParsedTokens(content=elem, is_tokens=True)
|
||||
for elem in prompt
|
||||
]
|
||||
|
||||
raise TypeError("prompt must be a string, array of strings, "
|
||||
"array of tokens, or array of token arrays")
|
||||
|
||||
|
||||
class ParsedStrPrompt(TypedDict):
|
||||
type: Literal["str"]
|
||||
content: str
|
||||
|
||||
|
||||
class ParsedTextPrompt(TypedDict):
|
||||
type: Literal["text"]
|
||||
content: TextPrompt
|
||||
|
||||
|
||||
class ParsedTokensPrompt(TypedDict):
|
||||
type: Literal["tokens"]
|
||||
content: TokensPrompt
|
||||
|
||||
|
||||
def parse_singleton_prompt(
|
||||
prompt: SingletonPrompt,
|
||||
) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
|
||||
if isinstance(prompt, str):
|
||||
return ParsedStrPrompt(type="str", content=prompt)
|
||||
elif isinstance(prompt, dict):
|
||||
if "prompt_token_ids" in prompt:
|
||||
return ParsedTokensPrompt(type="tokens",
|
||||
content=prompt) # type: ignore
|
||||
elif "prompt" in prompt:
|
||||
return ParsedTextPrompt(type="text", content=prompt)
|
||||
|
||||
raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")
|
||||
|
||||
|
||||
def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]:
|
||||
return isinstance(prompt, dict) and "prompt_token_ids" in prompt
|
||||
|
||||
|
||||
def is_explicit_encoder_decoder_prompt(
|
||||
prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
|
||||
return isinstance(prompt, dict) and "encoder_prompt" in prompt
|
||||
|
||||
|
||||
def split_enc_dec_inputs(
|
||||
inputs: ProcessorInputs,
|
||||
) -> tuple[Optional[SingletonInputs], SingletonInputs]:
|
||||
if "encoder" in inputs and "decoder" in inputs:
|
||||
# NOTE: This passes pyright but not mypy
|
||||
return (
|
||||
inputs["encoder"], # type: ignore[typeddict-item]
|
||||
inputs["decoder"], # type: ignore[typeddict-item]
|
||||
)
|
||||
|
||||
return None, inputs
|
||||
783
vllm/inputs/preprocess.py
Normal file
783
vllm/inputs/preprocess.py
Normal file
@@ -0,0 +1,783 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import asyncio
|
||||
from collections.abc import Mapping
|
||||
from typing import Optional, Union, cast
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
||||
MultiModalInputs)
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||
|
||||
from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
|
||||
PromptType, SingletonInputs, SingletonPrompt, token_inputs)
|
||||
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class InputPreprocessor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_config: ModelConfig,
|
||||
tokenizer: Optional[BaseTokenizerGroup],
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.model_config = model_config
|
||||
self.tokenizer = tokenizer
|
||||
self.mm_registry = mm_registry
|
||||
|
||||
def get_tokenizer_group(self) -> BaseTokenizerGroup:
|
||||
if self.tokenizer is None:
|
||||
raise ValueError("You cannot pass text prompts when "
|
||||
"`skip_tokenizer_init` is True")
|
||||
|
||||
return self.tokenizer
|
||||
|
||||
def get_bos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for BOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
|
||||
|
||||
def get_eos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for EOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
|
||||
|
||||
def get_decoder_start_token_id(self) -> Optional[int]:
|
||||
'''
|
||||
Obtain the decoder start token id employed by an encoder/decoder
|
||||
model. Returns None for non-encoder/decoder models or if the
|
||||
model config is unavailable.
|
||||
'''
|
||||
|
||||
if not self.model_config.is_encoder_decoder:
|
||||
logger.warning_once(
|
||||
"Using None for decoder start token id because "
|
||||
"this is not an encoder/decoder model.")
|
||||
return None
|
||||
|
||||
if (self.model_config is None or self.model_config.hf_config is None):
|
||||
logger.warning_once(
|
||||
"Using None for decoder start token id because "
|
||||
"model config is not available.")
|
||||
return None
|
||||
|
||||
dec_start_token_id = getattr(self.model_config.hf_config,
|
||||
'decoder_start_token_id', None)
|
||||
if dec_start_token_id is None:
|
||||
logger.warning_once(
|
||||
"Falling back on <BOS> for decoder start token "
|
||||
"id because decoder start token id is not "
|
||||
"available.")
|
||||
dec_start_token_id = self.get_bos_token_id()
|
||||
|
||||
return dec_start_token_id
|
||||
|
||||
def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
|
||||
'''
|
||||
Specifically for encoder/decoder models:
|
||||
generate a default decoder prompt for when
|
||||
the user specifies only the encoder prompt.
|
||||
|
||||
Encoder/decoder models utilize the decoder
|
||||
prompt in different ways; as new models are
|
||||
added, it is intended that this function
|
||||
will be extended to produce differing
|
||||
default decoder prompts, depending on the
|
||||
model variety.
|
||||
|
||||
Absent a special case, the default behavior
|
||||
of this method is to mirror the behavior of
|
||||
the HuggingFace (HF) GenerationMixin for a None
|
||||
decoder prompt, which is to employ a logit processor
|
||||
setting to force the first decoded token to be <BOS>.
|
||||
Here, this behavior is approximated by having the
|
||||
"default" decoder prompt be <BOS>.
|
||||
|
||||
However, it is possible that in the future
|
||||
other models may have different or more
|
||||
complex logic for the default decoder prompt.
|
||||
This motivates having a special helper method
|
||||
for default decoder prompts.
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt_token_ids
|
||||
'''
|
||||
|
||||
bos_token_id = self.get_bos_token_id()
|
||||
assert bos_token_id is not None
|
||||
return [bos_token_id]
|
||||
|
||||
def _prepare_decoder_input_ids_for_generation(
|
||||
self,
|
||||
decoder_input_ids: Optional[list[int]],
|
||||
) -> list[int]:
|
||||
"""
|
||||
Prepares `decoder_input_ids` for generation with encoder-decoder models.
|
||||
|
||||
Based on
|
||||
|
||||
https://github.com/huggingface/transformers/blob/
|
||||
4037a2b5b1278736e566aec12e169100275545ea/
|
||||
src/transformers/generation/utils.py
|
||||
|
||||
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
|
||||
|
||||
Arguments:
|
||||
|
||||
* decoder_input_ids: input token ids to preprocess
|
||||
|
||||
Returns:
|
||||
|
||||
* Processed token list
|
||||
"""
|
||||
|
||||
decoder_start_token_id = self.get_decoder_start_token_id()
|
||||
assert decoder_start_token_id is not None
|
||||
|
||||
if decoder_input_ids is None:
|
||||
# no decoder prompt input ->
|
||||
# use decoder_start_token_id as decoder_input_ids
|
||||
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
|
||||
|
||||
if (len(decoder_input_ids) == 0
|
||||
or decoder_input_ids[0] != decoder_start_token_id):
|
||||
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
|
||||
|
||||
return decoder_input_ids
|
||||
|
||||
def _apply_prompt_adapter(
|
||||
self,
|
||||
prompt_token_ids: list[int],
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> list[int]:
|
||||
if prompt_adapter_request:
|
||||
prompt_token_ids = (
|
||||
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
|
||||
+ prompt_token_ids)
|
||||
|
||||
return prompt_token_ids
|
||||
|
||||
def _tokenize_prompt(
|
||||
self,
|
||||
prompt: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> list[int]:
|
||||
"""
|
||||
Apply the model's tokenizer to a text prompt, returning the
|
||||
corresponding token IDs.
|
||||
"""
|
||||
tokenizer = self.get_tokenizer_group()
|
||||
add_special_tokens = None
|
||||
if self.model_config.hf_config.model_type == "whisper":
|
||||
# For Whisper, special tokens should be provided by the user based
|
||||
# on the task and language of their request. Also needed to avoid
|
||||
# appending an EOS token to the prompt which disrupts generation.
|
||||
add_special_tokens = False
|
||||
|
||||
if (self.model_config.encoder_config is not None
|
||||
and self.model_config.encoder_config.get(
|
||||
"do_lower_case", False)):
|
||||
prompt = prompt.lower()
|
||||
|
||||
return tokenizer.encode(prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
|
||||
async def _tokenize_prompt_async(
|
||||
self,
|
||||
prompt: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> list[int]:
|
||||
"""Async version of :meth:`_tokenize_prompt`."""
|
||||
tokenizer = self.get_tokenizer_group()
|
||||
add_special_tokens = None
|
||||
if self.model_config.hf_config.model_type == "whisper":
|
||||
# For Whisper, special tokens should be provided by the user based
|
||||
# on the task and language of their request. Also needed to avoid
|
||||
# appending an EOS token to the prompt which disrupts generation.
|
||||
add_special_tokens = False
|
||||
return await tokenizer.encode_async(
|
||||
prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
|
||||
def _can_process_multimodal(self) -> bool:
|
||||
model_config = self.model_config
|
||||
|
||||
if not model_config.is_multimodal_model:
|
||||
raise ValueError("Your model does not support multi-modal inputs")
|
||||
|
||||
# Interim measure so we can handle models that have yet to be
|
||||
# updated to use the new multi-modal processor
|
||||
can_process_multimodal = self.mm_registry.has_processor(model_config)
|
||||
if not can_process_multimodal:
|
||||
from vllm.model_executor.models.registry import _VLLM_MODELS
|
||||
if not any(arch in _VLLM_MODELS
|
||||
for arch in model_config.architectures):
|
||||
logger.warning_once(
|
||||
"Your model uses the legacy input pipeline, which will be "
|
||||
"removed in an upcoming release. "
|
||||
"Please upgrade to the new multi-modal processing pipeline "
|
||||
"(https://docs.vllm.ai/en/latest/design/mm_processing.html)"
|
||||
)
|
||||
|
||||
return can_process_multimodal
|
||||
|
||||
def _process_multimodal(
|
||||
self,
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||
lora_request: Optional[LoRARequest],
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""
|
||||
Apply the model's multi-modal processor to a multi-modal prompt,
|
||||
returning the corresponding token IDs and metadata.
|
||||
"""
|
||||
# At the moment on model (PrithviGeoSpatialMAE) requires to be
|
||||
# initialized without a tokenizer while using also multi-modal
|
||||
# input.
|
||||
if not self.tokenizer:
|
||||
tokenizer = object() # Dummy
|
||||
else:
|
||||
tokenizer_group = self.get_tokenizer_group()
|
||||
tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
|
||||
|
||||
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||
tokenizer=tokenizer)
|
||||
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = {}
|
||||
|
||||
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
|
||||
return_mm_hashes)
|
||||
|
||||
async def _process_multimodal_async(
|
||||
self,
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||
lora_request: Optional[LoRARequest],
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""Async version of :meth:`_process_multimodal`."""
|
||||
# At the moment on model (PrithviGeoSpatialMAE) requires to be
|
||||
# initialized without a tokenizer while using also multi-modal
|
||||
# input.
|
||||
if not self.tokenizer:
|
||||
tokenizer = object() # Dummy
|
||||
else:
|
||||
tokenizer_group = self.get_tokenizer_group()
|
||||
tokenizer = await tokenizer_group.get_lora_tokenizer_async(
|
||||
lora_request)
|
||||
|
||||
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||
tokenizer=tokenizer)
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = {}
|
||||
|
||||
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
|
||||
return_mm_hashes)
|
||||
|
||||
def _prompt_to_llm_inputs(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
"""
|
||||
Extract the singleton inputs from a prompt.
|
||||
|
||||
Arguments:
|
||||
|
||||
* prompt: single encoder or decoder input prompt
|
||||
* lora_request: this is only valid for decoder prompts
|
||||
* return_mm_hashes: whether to return multimodal hashes
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`SingletonInputs` instance
|
||||
"""
|
||||
parsed = parse_singleton_prompt(prompt)
|
||||
|
||||
if parsed["type"] == "str":
|
||||
prompt_text = parsed["content"]
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt_text,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt=prompt_text,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
)
|
||||
|
||||
if parsed["type"] == "tokens":
|
||||
tokens_content = parsed["content"]
|
||||
|
||||
prompt_token_ids = tokens_content["prompt_token_ids"]
|
||||
token_type_ids = tokens_content.get("token_type_ids")
|
||||
multi_modal_data = tokens_content.get("multi_modal_data")
|
||||
mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
|
||||
|
||||
if multi_modal_data is not None and self._can_process_multimodal():
|
||||
return self._process_multimodal(
|
||||
prompt_token_ids,
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
if parsed["type"] == "text":
|
||||
text_content = parsed["content"]
|
||||
|
||||
prompt_text = text_content["prompt"]
|
||||
multi_modal_data = text_content.get("multi_modal_data")
|
||||
mm_processor_kwargs = text_content.get("mm_processor_kwargs")
|
||||
|
||||
if multi_modal_data is not None and self._can_process_multimodal():
|
||||
return self._process_multimodal(
|
||||
prompt_text,
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt_text,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt=prompt_text,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
assert_never(parsed)
|
||||
|
||||
async def _prompt_to_llm_inputs_async(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
"""Async version of :meth:`_extract_prompt_components`."""
|
||||
parsed = parse_singleton_prompt(prompt)
|
||||
|
||||
if parsed["type"] == "str":
|
||||
prompt_text = parsed["content"]
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt_text,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt=prompt_text,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
)
|
||||
|
||||
if parsed["type"] == "tokens":
|
||||
tokens_content = parsed["content"]
|
||||
|
||||
prompt_token_ids = tokens_content["prompt_token_ids"]
|
||||
multi_modal_data = tokens_content.get("multi_modal_data")
|
||||
mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")
|
||||
|
||||
if multi_modal_data is not None and self._can_process_multimodal():
|
||||
return await self._process_multimodal_async(
|
||||
prompt_token_ids,
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
if parsed["type"] == "text":
|
||||
text_content = parsed["content"]
|
||||
|
||||
prompt_text = text_content["prompt"]
|
||||
multi_modal_data = text_content.get("multi_modal_data")
|
||||
mm_processor_kwargs = text_content.get("mm_processor_kwargs")
|
||||
|
||||
if multi_modal_data is not None and self._can_process_multimodal():
|
||||
return await self._process_multimodal_async(
|
||||
prompt_text,
|
||||
multi_modal_data,
|
||||
mm_processor_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt_text,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return token_inputs(
|
||||
prompt=prompt_text,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
assert_never(parsed)
|
||||
|
||||
def _build_enc_dec_llm_inputs(
|
||||
self,
|
||||
encoder_inputs: SingletonInputs,
|
||||
decoder_inputs: Optional[SingletonInputs],
|
||||
) -> EncoderDecoderInputs:
|
||||
if (encoder_inputs["type"] == "token"
|
||||
or encoder_inputs["type"] == "multimodal"):
|
||||
pass
|
||||
else:
|
||||
assert_never(encoder_inputs) # type: ignore[arg-type]
|
||||
|
||||
if decoder_inputs is None:
|
||||
if self.model_config.hf_config.model_type == "whisper":
|
||||
# For Whisper models, the text prompt should go to the decoder.
|
||||
# If no explicit encoder/decoder inputs, then copy the prompt
|
||||
# from the encoder to the decoder. The encoder tokens are later
|
||||
# overridden by the audio features.
|
||||
dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
|
||||
else:
|
||||
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
|
||||
None)
|
||||
decoder_inputs = token_inputs(dec_token_ids)
|
||||
elif (decoder_inputs["type"] == "token"
|
||||
or decoder_inputs["type"] == "multimodal"):
|
||||
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
|
||||
decoder_inputs["prompt_token_ids"])
|
||||
decoder_inputs["prompt_token_ids"] = dec_token_ids
|
||||
|
||||
if "multi_modal_data" in decoder_inputs:
|
||||
raise ValueError("Multi-modal decoder inputs of encoder-"
|
||||
"decoder models are not supported yet")
|
||||
else:
|
||||
assert_never(encoder_inputs) # type: ignore[arg-type]
|
||||
|
||||
return EncoderDecoderInputs(
|
||||
encoder=encoder_inputs,
|
||||
decoder=decoder_inputs,
|
||||
)
|
||||
|
||||
def _separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
self,
|
||||
inputs: SingletonInputs,
|
||||
decoder_inputs_to_override: Optional[SingletonInputs] = None,
|
||||
) -> tuple[SingletonInputs, SingletonInputs]:
|
||||
"""
|
||||
For encoder/decoder models only:
|
||||
Separate Encoder/Decoder inputs from a MultiModalEncDecInputs
|
||||
"""
|
||||
encoder_inputs: SingletonInputs
|
||||
decoder_inputs: SingletonInputs
|
||||
if inputs["type"] == "multimodal":
|
||||
# Multimodal data inputs
|
||||
assert ("encoder_prompt" in inputs
|
||||
and "encoder_prompt_token_ids" in inputs)
|
||||
inputs = cast(MultiModalEncDecInputs, inputs)
|
||||
encoder_inputs = token_inputs(
|
||||
prompt=inputs["encoder_prompt"],
|
||||
prompt_token_ids=inputs["encoder_prompt_token_ids"],
|
||||
)
|
||||
if decoder_inputs_to_override is not None:
|
||||
decoder_inputs = MultiModalInputs(
|
||||
type="multimodal",
|
||||
prompt=decoder_inputs_to_override.get("prompt", ""),
|
||||
prompt_token_ids=decoder_inputs_to_override[
|
||||
"prompt_token_ids"],
|
||||
mm_kwargs=inputs["mm_kwargs"],
|
||||
mm_hashes=inputs["mm_hashes"],
|
||||
mm_placeholders=inputs["mm_placeholders"],
|
||||
)
|
||||
else:
|
||||
decoder_inputs = MultiModalInputs(
|
||||
type="multimodal",
|
||||
prompt=inputs["prompt"],
|
||||
prompt_token_ids=inputs["prompt_token_ids"],
|
||||
mm_kwargs=inputs["mm_kwargs"],
|
||||
mm_hashes=inputs["mm_hashes"],
|
||||
mm_placeholders=inputs["mm_placeholders"],
|
||||
)
|
||||
elif inputs["type"] == "token":
|
||||
# Text-only inputs
|
||||
encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
|
||||
decoder_inputs = decoder_inputs_to_override or inputs
|
||||
else:
|
||||
assert_never(inputs) # type: ignore[arg-type]
|
||||
return encoder_inputs, decoder_inputs
|
||||
|
||||
def _process_encoder_decoder_prompt(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
) -> EncoderDecoderInputs:
|
||||
"""
|
||||
For encoder/decoder models only:
|
||||
Process an input prompt into an :class:`EncoderDecoderInputs` instance.
|
||||
|
||||
There are two types of input prompts:
|
||||
singleton prompts which carry only the
|
||||
encoder prompt, and explicit encoder/decoder
|
||||
prompts which carry both the encoder and the
|
||||
decoder prompts as member variables.
|
||||
|
||||
This function handles the following scenarios:
|
||||
* Singleton encoder prompt: extract encoder prompt
|
||||
token ids & infer default decoder prompt token ids
|
||||
* Explicit encoder/decoder prompt: extract encoder
|
||||
and decoder prompt token ids
|
||||
|
||||
Note that for Explicit encoder/decoder prompts,
|
||||
each sub-prompt (encoder or decoder prompt) can
|
||||
have any possible singleton type; thus this
|
||||
method relies on helper functions to obtain
|
||||
token ids for the sub-prompts.
|
||||
|
||||
Arguments:
|
||||
|
||||
* prompt: an input prompt
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`EncoderDecoderInputs` instance
|
||||
"""
|
||||
encoder_inputs: SingletonInputs
|
||||
decoder_inputs: Optional[SingletonInputs]
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
encoder_inputs = self._prompt_to_llm_inputs(
|
||||
prompt["encoder_prompt"])
|
||||
if (decoder_input := prompt["decoder_prompt"]) is None:
|
||||
decoder_inputs = None
|
||||
else:
|
||||
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
|
||||
# For multimodal model, override decoder prompt from processor
|
||||
# with explicit decoder prompt.
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
encoder_inputs, decoder_inputs = (
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
encoder_inputs, decoder_inputs))
|
||||
else:
|
||||
inputs = self._prompt_to_llm_inputs(prompt)
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
# Encoder-Decoder Multimodal model
|
||||
encoder_inputs, decoder_inputs = (
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
inputs))
|
||||
else:
|
||||
encoder_inputs = inputs
|
||||
|
||||
decoder_inputs = None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
|
||||
|
||||
async def _process_encoder_decoder_prompt_async(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
) -> EncoderDecoderInputs:
|
||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
||||
encoder_inputs: SingletonInputs
|
||||
decoder_inputs: Optional[SingletonInputs]
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
encoder_task = self._prompt_to_llm_inputs_async(
|
||||
prompt["encoder_prompt"])
|
||||
|
||||
if (decoder_input := prompt["decoder_prompt"]) is None:
|
||||
encoder_inputs = await encoder_task
|
||||
decoder_inputs = None
|
||||
else:
|
||||
decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
|
||||
|
||||
encoder_inputs, decoder_inputs = await asyncio.gather(
|
||||
encoder_task, decoder_task)
|
||||
|
||||
# For multimodal model, override decoder prompt from processor
|
||||
# with explicit decoder prompt.
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
encoder_inputs, decoder_inputs = (
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
encoder_inputs, decoder_inputs))
|
||||
else:
|
||||
inputs = await self._prompt_to_llm_inputs_async(prompt)
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
# Encoder-Decoder Multimodal model
|
||||
encoder_inputs, decoder_inputs = (
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
inputs))
|
||||
else:
|
||||
encoder_inputs = inputs
|
||||
|
||||
decoder_inputs = None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
|
||||
|
||||
def _build_decoder_only_llm_inputs(
|
||||
self,
|
||||
prompt_inputs: DecoderOnlyInputs,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> DecoderOnlyInputs:
|
||||
if (prompt_inputs["type"] == "token"
|
||||
or prompt_inputs["type"] == "multimodal"):
|
||||
prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
|
||||
prompt_inputs["prompt_token_ids"],
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
else:
|
||||
assert_never(prompt_inputs) # type: ignore[arg-type]
|
||||
|
||||
return prompt_inputs
|
||||
|
||||
def _process_decoder_only_prompt(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> DecoderOnlyInputs:
|
||||
"""
|
||||
For decoder-only models:
|
||||
Process an input prompt into an :class:`DecoderOnlyInputs` instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
* prompt: input prompt
|
||||
* lora_request
|
||||
* prompt_adapter_request
|
||||
* return_mm_hashes
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`DecoderOnlyInputs` instance
|
||||
"""
|
||||
|
||||
prompt_comps = self._prompt_to_llm_inputs(
|
||||
prompt,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
async def _process_decoder_only_prompt_async(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> DecoderOnlyInputs:
|
||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
||||
prompt_comps = await self._prompt_to_llm_inputs_async(
|
||||
prompt,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> ProcessorInputs:
|
||||
"""Preprocess the input prompt."""
|
||||
if self.model_config.is_encoder_decoder:
|
||||
assert not return_mm_hashes, (
|
||||
"Multimodal hashes for encoder-decoder models should not be ",
|
||||
"returned until they are supported on vLLM V1.")
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
return self._process_encoder_decoder_prompt(prompt)
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
return self._process_decoder_only_prompt(
|
||||
prompt,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
async def preprocess_async(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> ProcessorInputs:
|
||||
"""Async version of :meth:`preprocess`."""
|
||||
if self.model_config.is_encoder_decoder:
|
||||
assert not return_mm_hashes, (
|
||||
"Multimodal hashes for encoder-decoder models should not be ",
|
||||
"returned until they are supported on vLLM V1.")
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
return await self._process_encoder_decoder_prompt_async(prompt)
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
return await self._process_decoder_only_prompt_async(
|
||||
prompt,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
487
vllm/inputs/registry.py
Normal file
487
vllm/inputs/registry.py
Normal file
@@ -0,0 +1,487 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import functools
|
||||
from collections import UserDict
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from typing import (TYPE_CHECKING, Any, Callable, NamedTuple, Optional,
|
||||
Protocol, Union)
|
||||
|
||||
from torch import nn
|
||||
from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
|
||||
from typing_extensions import TypeVar, assert_never
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
|
||||
resolve_mm_processor_kwargs)
|
||||
|
||||
from .data import ProcessorInputs, SingletonInputs
|
||||
from .parse import split_enc_dec_inputs
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
|
||||
MultiModalRegistry)
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_T = TypeVar("_T")
|
||||
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputContext:
|
||||
"""
|
||||
Contains information about the model which may be used to
|
||||
modify the inputs.
|
||||
"""
|
||||
|
||||
model_config: "ModelConfig"
|
||||
"""The configuration of the model."""
|
||||
|
||||
def get_hf_config(
|
||||
self,
|
||||
typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
|
||||
/,
|
||||
) -> _C:
|
||||
"""
|
||||
Get the HuggingFace configuration
|
||||
(:class:`transformers.PretrainedConfig`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
TypeError: If the configuration is not of the specified type.
|
||||
"""
|
||||
hf_config = self.model_config.hf_config
|
||||
if not isinstance(hf_config, typ):
|
||||
raise TypeError("Invalid type of HuggingFace config. "
|
||||
f"Expected type: {typ}, but "
|
||||
f"found type: {type(hf_config)}")
|
||||
|
||||
return hf_config
|
||||
|
||||
def get_hf_image_processor_config(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get the HuggingFace image processor configuration of the model.
|
||||
"""
|
||||
return self.model_config.hf_image_processor_config
|
||||
|
||||
def get_mm_config(self):
|
||||
"""
|
||||
Get the multimodal config of the model.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the model is not a multimodal model.
|
||||
"""
|
||||
mm_config = self.model_config.multimodal_config
|
||||
if mm_config is None:
|
||||
raise RuntimeError("Not a multimodal model")
|
||||
|
||||
return mm_config
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _P:
|
||||
"""
|
||||
Get the HuggingFace processor
|
||||
(:class:`transformers.ProcessorMixin`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
TypeError: If the processor is not of the specified type.
|
||||
"""
|
||||
return cached_processor_from_config(
|
||||
self.model_config,
|
||||
processor_cls=typ,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def init_processor(
|
||||
self,
|
||||
typ: type[_T],
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _T:
|
||||
"""
|
||||
Initialize a HuggingFace-like processor class, merging the
|
||||
keyword arguments with those in the model's configuration.
|
||||
"""
|
||||
base_kwargs = self.model_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
merged_kwargs = {**base_kwargs, **kwargs}
|
||||
|
||||
return typ(**merged_kwargs)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputProcessingContext(InputContext):
|
||||
tokenizer: AnyTokenizer
|
||||
"""The tokenizer used to tokenize the inputs."""
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _P:
|
||||
return super().get_hf_processor(
|
||||
typ,
|
||||
tokenizer=self.tokenizer,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def call_hf_processor(
|
||||
self,
|
||||
hf_processor: ProcessorMixin,
|
||||
data: Mapping[str, object],
|
||||
kwargs: Mapping[str, object] = {},
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Call :code:`hf_processor` on the prompt :code:`data`
|
||||
(text, image, audio...) with configurable options :code:`kwargs`.
|
||||
"""
|
||||
assert callable(hf_processor)
|
||||
|
||||
base_kwargs = self.model_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
merged_kwargs = resolve_mm_processor_kwargs(
|
||||
base_kwargs,
|
||||
kwargs,
|
||||
hf_processor,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
try:
|
||||
return hf_processor(**data, **merged_kwargs, return_tensors="pt")
|
||||
except Exception as exc:
|
||||
msg = (f"Failed to apply {type(hf_processor).__name__} "
|
||||
f"on data={data} with kwargs={merged_kwargs}")
|
||||
|
||||
raise RuntimeError(msg) from exc
|
||||
|
||||
|
||||
N = TypeVar("N", bound=type[nn.Module])
|
||||
|
||||
|
||||
class DummyData(NamedTuple):
|
||||
"""Dummy data used for profiling."""
|
||||
|
||||
seq_data: "SequenceData"
|
||||
multi_modal_data: Optional["MultiModalDataDict"] = None
|
||||
multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
|
||||
|
||||
|
||||
class DummyDataFactory(Protocol):
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
ctx: InputContext,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
**mm_processor_kwargs: Any,
|
||||
) -> DummyData:
|
||||
"""
|
||||
Create dummy data to be inputted into the model.
|
||||
|
||||
Note:
|
||||
:data:`InputProcessor` is not applied to the dummy data.
|
||||
|
||||
The :code:`mm_processor_kwargs` are overrides provided at
|
||||
initialization time to values in the config whose values
|
||||
may affect the number of tokens per instance.
|
||||
"""
|
||||
...
|
||||
|
||||
|
||||
class _MultiModalCounts(UserDict[str, int]):
|
||||
"""
|
||||
Wraps `mm_counts` for a more informative error message
|
||||
when attempting to access a plugin that does not exist.
|
||||
"""
|
||||
|
||||
def __getitem__(self, key: str) -> int:
|
||||
try:
|
||||
return super().__getitem__(key)
|
||||
except KeyError as exc:
|
||||
msg = (f"There is no multi-modal plugin with the key: {key}. "
|
||||
f"Available keys: {set(self.keys())}")
|
||||
raise KeyError(msg) from exc
|
||||
|
||||
|
||||
InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs]
|
||||
"""Preprocess the inputs to the model."""
|
||||
|
||||
|
||||
class InputRegistry:
|
||||
"""
|
||||
A registry to dispatch data processing
|
||||
according to the target model.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._dummy_factories_by_model_type = \
|
||||
ClassRegistry[nn.Module, DummyDataFactory]()
|
||||
self._dummy_encoder_factories_by_model_type = \
|
||||
ClassRegistry[nn.Module, DummyDataFactory]()
|
||||
self._input_processors_by_model_type = \
|
||||
ClassRegistry[nn.Module, InputProcessor]()
|
||||
|
||||
def _default_dummy_data_factory(
|
||||
self,
|
||||
ctx: InputContext,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> DummyData:
|
||||
"""
|
||||
The default dummy data factory represents the longest possible text
|
||||
that can be inputted to the model.
|
||||
|
||||
Note:
|
||||
:data:`InputProcessor` is not applied to the dummy data.
|
||||
"""
|
||||
# Avoid circular import
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
return DummyData(SequenceData.from_prompt_token_counts((0, seq_len)))
|
||||
|
||||
def register_dummy_data(self, factory: DummyDataFactory):
|
||||
"""
|
||||
Register a dummy data factory to a model class.
|
||||
|
||||
During memory profiling, the provided function is invoked to create
|
||||
dummy data to be inputted into the model. The resulting memory usage
|
||||
should be an upper bound of what the model would use at inference time.
|
||||
"""
|
||||
|
||||
def wrapper(model_cls: N) -> N:
|
||||
if self._dummy_factories_by_model_type.contains(model_cls,
|
||||
strict=True):
|
||||
logger.warning(
|
||||
"Model class %s already has dummy data "
|
||||
"registered to %s. It is overwritten by the new one.",
|
||||
model_cls, self)
|
||||
|
||||
self._dummy_factories_by_model_type[model_cls] = factory
|
||||
|
||||
return model_cls
|
||||
|
||||
return wrapper
|
||||
|
||||
def _get_dummy_data_factory(self, model_cls: type[nn.Module]):
|
||||
return self._dummy_factories_by_model_type \
|
||||
.get(model_cls, self._default_dummy_data_factory)
|
||||
|
||||
def register_dummy_encoder_data(self, factory: DummyDataFactory):
|
||||
"""
|
||||
Register a dummy encoder data factory to a model class
|
||||
|
||||
This is similar to :meth:`~register_dummy_data`, but for encoder input.
|
||||
"""
|
||||
|
||||
def wrapper(model_cls: N) -> N:
|
||||
if self._dummy_encoder_factories_by_model_type.contains(
|
||||
model_cls, strict=True):
|
||||
logger.warning(
|
||||
"Model class %s already has dummy encoder data "
|
||||
"registered to %s. It is overwritten by the new one.",
|
||||
model_cls, self)
|
||||
|
||||
self._dummy_encoder_factories_by_model_type[model_cls] = factory
|
||||
|
||||
return model_cls
|
||||
|
||||
return wrapper
|
||||
|
||||
def _get_dummy_encoder_data_factory(self, model_cls: type[nn.Module]):
|
||||
return self._dummy_encoder_factories_by_model_type \
|
||||
.get(model_cls, self._default_dummy_data_factory)
|
||||
|
||||
def dummy_data_for_profiling(
|
||||
self,
|
||||
model_config: "ModelConfig",
|
||||
seq_len: int,
|
||||
mm_registry: "MultiModalRegistry",
|
||||
is_encoder_data: bool = False,
|
||||
) -> DummyData:
|
||||
"""
|
||||
Create dummy data for profiling the memory usage of a model.
|
||||
|
||||
The model is identified by ``model_config``.
|
||||
|
||||
Note:
|
||||
This should be called after
|
||||
:meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
|
||||
"""
|
||||
# Avoid circular import
|
||||
from vllm.model_executor.model_loader import get_model_architecture
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
if mm_registry.has_processor(model_config):
|
||||
processor = mm_registry.create_processor(model_config,
|
||||
disable_cache=True)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
dummy_data_v1 = (profiler.get_encoder_dummy_data(seq_len)
|
||||
if is_encoder_data else
|
||||
profiler.get_decoder_dummy_data(seq_len))
|
||||
_seq_data = SequenceData.from_seqs(
|
||||
dummy_data_v1.prompt_token_ids) # type: ignore[attr-defined]
|
||||
|
||||
dummy_data = DummyData(
|
||||
seq_data=_seq_data,
|
||||
multi_modal_data=getattr(dummy_data_v1, "multi_modal_data",
|
||||
None),
|
||||
multi_modal_placeholders=getattr(dummy_data_v1,
|
||||
"multi_modal_placeholders",
|
||||
None),
|
||||
)
|
||||
else:
|
||||
model_cls, _ = get_model_architecture(model_config)
|
||||
if is_encoder_data:
|
||||
dummy_factory = self._get_dummy_encoder_data_factory(model_cls)
|
||||
else:
|
||||
dummy_factory = self._get_dummy_data_factory(model_cls)
|
||||
mm_counts = mm_registry.get_mm_limits_per_prompt(model_config)
|
||||
mm_processor_kwargs = get_allowed_kwarg_only_overrides(
|
||||
dummy_factory,
|
||||
overrides=model_config.mm_processor_kwargs,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
dummy_data = dummy_factory(InputContext(model_config), seq_len,
|
||||
_MultiModalCounts(mm_counts),
|
||||
**mm_processor_kwargs)
|
||||
|
||||
# Having more tokens is over-conservative but otherwise fine
|
||||
num_tokens = dummy_data.seq_data.prompt_token_ids
|
||||
if len(num_tokens) < seq_len:
|
||||
if is_encoder_data:
|
||||
logger.warning_once(
|
||||
f"Expected at least {seq_len} dummy encoder tokens for "
|
||||
f"profiling, but found {len(num_tokens)} tokens instead.")
|
||||
else:
|
||||
raise AssertionError(
|
||||
f"Expected at least {seq_len} dummy tokens for profiling, "
|
||||
f"but found {len(num_tokens)} tokens instead.")
|
||||
|
||||
if (dummy_data.multi_modal_data is not None and
|
||||
not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
|
||||
for k, v in dummy_data.multi_modal_data.items():
|
||||
num_items = len(v) if isinstance(v, list) else 1
|
||||
num_expected = mm_counts[k]
|
||||
assert num_items >= num_expected, (
|
||||
f"Expected at least {num_expected} dummy '{k}' instances "
|
||||
f"for profiling, but found {num_items} instances instead.")
|
||||
|
||||
return dummy_data
|
||||
|
||||
def _default_input_processor(
|
||||
self,
|
||||
ctx: InputContext,
|
||||
inputs: ProcessorInputs,
|
||||
**kwargs: object,
|
||||
) -> ProcessorInputs:
|
||||
"""The default input processor is a no-op."""
|
||||
return inputs
|
||||
|
||||
def register_input_processor(self, processor: InputProcessor):
|
||||
"""
|
||||
Register an input processor to a model class.
|
||||
|
||||
The provided function is invoked on each input to the model. This
|
||||
happens before
|
||||
:meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`.
|
||||
"""
|
||||
|
||||
def wrapper(model_cls: N) -> N:
|
||||
if self._input_processors_by_model_type.contains(model_cls,
|
||||
strict=True):
|
||||
logger.warning(
|
||||
"Model class %s already has input processor "
|
||||
"registered to %s. It is overwritten by the new one.",
|
||||
model_cls, self)
|
||||
|
||||
self._input_processors_by_model_type[model_cls] = processor
|
||||
|
||||
return model_cls
|
||||
|
||||
return wrapper
|
||||
|
||||
def _get_model_input_processor(self, model_cls: type[nn.Module]):
|
||||
return self._input_processors_by_model_type \
|
||||
.get(model_cls, self._default_input_processor)
|
||||
|
||||
def _ensure_mm_kwargs(
|
||||
self,
|
||||
inputs: SingletonInputs,
|
||||
mm_processor_kwargs: dict[str, Any],
|
||||
):
|
||||
if inputs["type"] == "token":
|
||||
# In case the input processor for that model fails to set it
|
||||
if "mm_processor_kwargs" not in inputs:
|
||||
inputs["mm_processor_kwargs"] = mm_processor_kwargs
|
||||
elif inputs["type"] == "multimodal":
|
||||
# Be more strict in V2
|
||||
assert "mm_kwargs" in inputs
|
||||
else:
|
||||
assert_never(inputs["type"]) # type: ignore[arg-type]
|
||||
|
||||
def process_input(self, model_config: "ModelConfig",
|
||||
inputs: ProcessorInputs) -> ProcessorInputs:
|
||||
"""
|
||||
Apply an input processor to an instance of model inputs.
|
||||
|
||||
The model is identified by ``model_config``.
|
||||
"""
|
||||
# Avoid circular import
|
||||
from vllm.model_executor.model_loader import get_model_architecture
|
||||
|
||||
model_cls, _ = get_model_architecture(model_config)
|
||||
processor = self._get_model_input_processor(model_cls)
|
||||
|
||||
# Handle multimodal processor kwargs with priority:
|
||||
# Inference kwargs -> Init kwargs -> {}
|
||||
# If it's empty, it'll fall back to the default kwarg values
|
||||
mm_processor_kwargs = resolve_mm_processor_kwargs(
|
||||
model_config.mm_processor_kwargs,
|
||||
inputs.get("mm_processor_kwargs", {}), # type: ignore
|
||||
processor,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
processed_inputs = processor(
|
||||
InputContext(model_config),
|
||||
inputs,
|
||||
**mm_processor_kwargs,
|
||||
)
|
||||
|
||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
||||
if encoder_inputs is not None:
|
||||
self._ensure_mm_kwargs(encoder_inputs, mm_processor_kwargs)
|
||||
if decoder_inputs is not None:
|
||||
self._ensure_mm_kwargs(decoder_inputs, mm_processor_kwargs)
|
||||
|
||||
return processed_inputs
|
||||
|
||||
def create_input_processor(self, model_config: "ModelConfig"):
|
||||
"""
|
||||
Create an input processor (see :meth:`_process_input`) for a
|
||||
specific model.
|
||||
"""
|
||||
return functools.partial(self.process_input, model_config)
|
||||
Reference in New Issue
Block a user