First commit
This commit is contained in:
24
vllm/multimodal/__init__.py
Normal file
24
vllm/multimodal/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
|
||||
MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
|
||||
NestedTensors)
|
||||
from .registry import MultiModalRegistry
|
||||
|
||||
MULTIMODAL_REGISTRY = MultiModalRegistry()
"""
Process-wide :class:`~MultiModalRegistry` instance.

Model runners use it to dispatch data processing according to the modality
of the data and the target model.

See also:
    :ref:`input_processing_pipeline`
"""

# Names re-exported as the public API of this package.
__all__ = [
    "BatchedTensorInputs",
    "MultiModalDataBuiltins",
    "MultiModalDataDict",
    "MultiModalInputs",
    "MultiModalPlugin",
    "NestedTensors",
    "MULTIMODAL_REGISTRY",
    "MultiModalRegistry",
]
|
||||
BIN
vllm/multimodal/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/audio.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/audio.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/base.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/base.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/image.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/image.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/registry.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/registry.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/utils.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/utils.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/multimodal/__pycache__/video.cpython-310.pyc
Normal file
BIN
vllm/multimodal/__pycache__/video.cpython-310.pyc
Normal file
Binary file not shown.
17
vllm/multimodal/audio.py
Normal file
17
vllm/multimodal/audio.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from vllm.inputs.registry import InputContext
|
||||
from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin
|
||||
|
||||
|
||||
class AudioPlugin(MultiModalPlugin):
    """Plugin for audio data."""

    def get_data_key(self) -> str:
        """Return the modality key handled by this plugin."""
        return "audio"

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: object,
        **mm_processor_kwargs,
    ) -> MultiModalInputs:
        """Always raise: audio has no default input mapper."""
        raise NotImplementedError("There is no default audio input mapper")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Always raise: audio has no default token budget."""
        message = "There is no default maximum multimodal tokens"
        raise NotImplementedError(message)
|
||||
368
vllm/multimodal/base.py
Normal file
368
vllm/multimodal/base.py
Normal file
@@ -0,0 +1,368 @@
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import UserDict, defaultdict
|
||||
from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type,
|
||||
TypedDict, TypeVar, Union, cast, final)
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.types
|
||||
from PIL import Image
|
||||
from torch import nn
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of,
|
||||
json_map_leaves, resolve_mm_processor_kwargs)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
NestedTensors = Union[
    List["NestedTensors"],
    List[torch.Tensor],
    torch.Tensor,
]
"""
A tensor, or an arbitrarily nested list of tensors.

A list is used instead of a tensor if the dimensions of each element do
not match.
"""

BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors]
"""
A dictionary containing nested tensors which have been batched via
:meth:`MultiModalInputs.batch`.
"""
|
||||
|
||||
# Pick the base class for MultiModalInputs: a typed UserDict where the
# interpreter allows subscripting it, a plain one otherwise.
if sys.version_info >= (3, 9):

    class _MultiModalInputsBase(UserDict[str, NestedTensors]):
        pass

else:
    # UserDict cannot be subscripted
    class _MultiModalInputsBase(UserDict):
        pass
|
||||
|
||||
|
||||
class MultiModalInputs(_MultiModalInputsBase):
    """
    A dictionary that represents the keyword arguments to
    :meth:`~torch.nn.Module.forward`.
    """

    @staticmethod
    def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
        """
        Recursively stacks lists of tensors when they all have the same shape.

        Tensors are returned unchanged; NumPy arrays and Python numbers are
        converted to tensors. Any other value is iterated and each element
        is processed recursively. The processed elements are stacked into a
        single tensor only if all of them are tensors with identical shapes;
        otherwise they are returned as a list.
        """
        if isinstance(nested_tensors, torch.Tensor):
            return nested_tensors

        if isinstance(nested_tensors, np.ndarray):
            return torch.from_numpy(nested_tensors)

        if isinstance(nested_tensors, (int, float)):
            return torch.tensor(nested_tensors)

        stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]

        if not stacked:
            # torch.stack() rejects an empty sequence, so an empty list is
            # passed through unchanged instead of raising.
            return stacked

        if not is_list_of(stacked, torch.Tensor, check="all"):
            # Only tensors (not lists) can be stacked.
            return stacked

        tensors_ = cast(List[torch.Tensor], stacked)
        if any(t.shape != tensors_[0].shape for t in tensors_):
            # The tensors have incompatible shapes and can't be stacked.
            return tensors_

        return torch.stack(tensors_)

    @staticmethod
    def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs:
        """
        Batch multiple inputs together into a dictionary.

        The resulting dictionary contains every key that appears in any of
        the inputs. If the corresponding value from each input is a tensor
        and they all share the same shape, the output value is a single
        batched tensor; otherwise, the output value is a list containing the
        original value from each input.

        An empty ``inputs_list`` yields an empty dictionary.
        """
        if not inputs_list:
            return {}

        item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)

        for inputs in inputs_list:
            # For models that support multiple modalities (e.g. Qwen2-VL),
            # different modalities will return different data keys,
            # so batch() should skip the same key check.
            for k, v in inputs.items():
                item_lists[k].append(v)

        return {
            k: MultiModalInputs._try_stack(item_list)
            for k, item_list in item_lists.items()
        }

    @staticmethod
    def as_kwargs(
        batched_inputs: BatchedTensorInputs,
        *,
        device: torch.types.Device,
    ) -> BatchedTensorInputs:
        """
        Move every leaf of ``batched_inputs`` to ``device``.

        Each leaf is transferred with ``.to(device, non_blocking=True)``;
        the nesting of the input is handled by ``json_map_leaves``.
        """
        json_inputs = cast(JSONTree[torch.Tensor], batched_inputs)

        json_mapped = json_map_leaves(
            lambda x: x.to(device, non_blocking=True),
            json_inputs,
        )

        return cast(BatchedTensorInputs, json_mapped)
|
||||
|
||||
|
||||
# Type of a single data instance for some modality.
_T = TypeVar("_T")

MultiModalData: TypeAlias = Union[_T, List[_T]]
"""
One data instance, or a list of data instances, of the same modality.

The number of data instances allowed per modality is restricted by
`--limit-mm-per-prompt`.
"""
|
||||
|
||||
|
||||
@final
class MultiModalDataBuiltins(TypedDict, total=False):
    """Modality types that are predefined by vLLM."""

    image: MultiModalData[Image.Image]
    """The input image(s)."""

    audio: MultiModalData[
        Tuple[np.ndarray, Union[int, float]]
    ]
    """The input audio item(s) and corresponding sampling rate(s)."""
|
||||
|
||||
|
||||
MultiModalDataDict = Union[
    MultiModalDataBuiltins,
    Mapping[str, MultiModalData[object]],
]
"""
A dictionary containing an item for each modality type to input.

Note:
    This dictionary also accepts modality keys defined outside
    :class:`MultiModalDataBuiltins` as long as a customized plugin is
    registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
    Read more on that :ref:`here <adding_multimodal_plugin>`.
"""

MultiModalInputMapper = Callable[
    [InputContext, MultiModalData[object]],
    MultiModalInputs,
]
"""
Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
and processors in HuggingFace Transformers.

If the data is not supported, throw :exc:`TypeError`.
"""

MultiModalTokensCalc = Union[int, Callable[[InputContext], int]]
"""
Calculate the maximum number of multimodal tokens input to the language
model. This does not include tokens that correspond to the input text.
"""

# Type of a model class passed through the registration decorators.
N = TypeVar("N", bound=Type[nn.Module])
|
||||
|
||||
|
||||
class MultiModalPlugin(ABC):
    """
    Base class that defines data processing logic for a specific modality.

    A registry pattern is used to dispatch data processing according to the
    model being used, since different models may process the same data
    differently. This per-modality registry is in turn used by
    :class:`~MultiModalRegistry`, which acts at a higher level
    (i.e., the modality of the data).

    See also:
        :ref:`adding_multimodal_plugin`
    """

    def __init__(self) -> None:
        # Both tables are keyed by model class.
        self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {}
        self._max_mm_tokens: Dict[Type[nn.Module], MultiModalTokensCalc] = {}

    @abstractmethod
    def get_data_key(self) -> str:
        """
        Get the data key corresponding to the modality.
        """
        raise NotImplementedError

    @abstractmethod
    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
        **mm_processor_kwargs,
    ) -> MultiModalInputs:
        """
        Return a dictionary to be passed as keyword arguments to
        :meth:`~torch.nn.Module.forward`. This is similar in concept to
        tokenizers and processors in HuggingFace Transformers.

        If the data is not supported, throw :exc:`TypeError`.
        """
        raise NotImplementedError

    def register_input_mapper(
        self,
        mapper: Optional[MultiModalInputMapper] = None,
    ):
        """
        Register an input mapper to a model class.

        The returned callable is meant to be applied to the model class; it
        records the mapper and hands the class back unchanged.

        When the model receives input data that matches the modality served
        by this plugin (see :meth:`get_data_key`), the provided function is
        invoked to transform the data into a dictionary of model inputs.

        If `None` is provided, then the default input mapper is used instead.

        See also:
            - :ref:`input_processing_pipeline`
            - :ref:`enabling_multimodal_inputs`
        """

        def wrapper(model_cls: N) -> N:
            registered = self._input_mappers
            if model_cls in registered:
                logger.warning(
                    "Model class %s already has an input mapper "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            registered[model_cls] = mapper or self._default_input_mapper
            return model_cls

        return wrapper

    def map_input(self, model_config: ModelConfig,
                  data: MultiModalData[object],
                  mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs:
        """
        Transform the data into a dictionary of model inputs using the
        input mapper registered for that model.

        The model is identified by ``model_config``.

        Raises:
            KeyError: If no input mapper is registered for the model class.
            TypeError: If the data type is not supported.

        See also:
            - :ref:`input_processing_pipeline`
            - :ref:`enabling_multimodal_inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture

        model_cls, _ = get_model_architecture(model_config)

        input_mapper = self._input_mappers.get(model_cls)
        if input_mapper is None:
            raise KeyError(f"No input mapper in {self} is registered for "
                           f"model class {model_cls.__name__}.")

        # In the case of the default mapper, we have to get resource
        # processor through its HuggingFace autoclass; since this goes
        # through **kwargs, we can't inspect it the same way, so we allow
        # drop mm_processor_kwargs based on signature inspection
        # if we're using the default mapper.
        #
        # This should be safe in general due to the sanitation, since the
        # transformers resource should filter unused kwargs anyway.
        is_default = input_mapper == self._default_input_mapper
        resolved_kwargs = resolve_mm_processor_kwargs(
            model_config.mm_processor_kwargs,
            mm_processor_kwargs,
            callable=input_mapper,
            allow_var_kwargs=is_default,
        )

        ctx = InputContext(model_config)
        return input_mapper(ctx, data, **resolved_kwargs)

    @abstractmethod
    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """
        Calculate the maximum number of tokens, corresponding to a single
        instance of multimodal data, that are passed to the language model.
        """
        raise NotImplementedError

    def _validate_max_multimodal_tokens(self, max_mm_tokens: int):
        """Raise :exc:`ValueError` unless ``max_mm_tokens`` is at least 1."""
        if max_mm_tokens >= 1:
            return

        raise ValueError("You should set the number of tokens to a "
                         f"positive integer. Found: {max_mm_tokens}")

    def register_max_multimodal_tokens(
        self,
        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
    ):
        """
        Register the maximum number of tokens, corresponding to a single
        instance of multimodal data, that are passed to the language model
        for a model class.

        An integer value is validated when the returned callable is applied
        to the model class. If `None` is provided, then the default
        calculation is used instead.

        See also:
            :ref:`enabling_multimodal_inputs`
        """

        def wrapper(model_cls: N) -> N:
            registered = self._max_mm_tokens
            if model_cls in registered:
                logger.warning(
                    "Model class %s already calculates maximum number of "
                    "tokens in %s. It is overwritten by the new one.",
                    model_cls, self)

            if isinstance(max_mm_tokens, int):
                self._validate_max_multimodal_tokens(max_mm_tokens)

            registered[model_cls] = (max_mm_tokens
                                     or self._default_max_multimodal_tokens)
            return model_cls

        return wrapper

    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
        """
        Get the maximum number of multi-modal tokens
        for profiling the memory usage of a model.

        If this registry is not applicable to the model, `0` is returned.

        The model is identified by ``model_config``.

        Raises:
            KeyError: If the model class has an input mapper but no token
                count was registered for it.
            ValueError: If the resulting token count is below 1.

        See also:
            :ref:`enabling_multimodal_inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture

        model_cls, _ = get_model_architecture(model_config)

        # A model without an input mapper does not use this modality.
        if model_cls not in self._input_mappers:
            return 0

        token_calc = self._max_mm_tokens.get(model_cls)
        if token_calc is None:
            raise KeyError("No maximum number of multi-modal tokens is given "
                           f"for model class {model_cls.__name__} in {self}.")

        if callable(token_calc):
            overrides = get_allowed_kwarg_only_overrides(
                token_calc, overrides=model_config.mm_processor_kwargs)
            token_count = token_calc(InputContext(model_config), **overrides)
        else:
            token_count = token_calc

        self._validate_max_multimodal_tokens(token_count)
        return token_count
|
||||
88
vllm/multimodal/image.py
Normal file
88
vllm/multimodal/image.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers.image_processing_base import BatchFeature
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs.registry import InputContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import get_image_processor
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .base import MultiModalData, MultiModalInputs, MultiModalPlugin
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Memoized wrapper around get_image_processor: repeated calls with the same
# (hashable) arguments reuse the cached result instead of calling it again.
cached_get_image_processor = lru_cache(get_image_processor)
|
||||
|
||||
|
||||
class ImagePlugin(MultiModalPlugin):
    """Plugin for image data."""

    def get_data_key(self) -> str:
        """Return the modality key handled by this plugin."""
        return "image"

    def _get_hf_image_processor(
        self,
        model_config: ModelConfig,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Look up the (cached) image processor for the model.

        ``mm_processor_kwargs`` are forwarded to the lookup as extra keyword
        arguments; ``None`` means no extra arguments.
        """
        extra_kwargs = {} if mm_processor_kwargs is None \
            else mm_processor_kwargs
        return cached_get_image_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **extra_kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
        **mm_processor_kwargs,
    ) -> MultiModalInputs:
        """
        Convert image data into model inputs.

        Three kinds of data are accepted: a ``BatchFeature`` (already
        processed), a PIL image or list of PIL images (run through the
        image processor), and a tensor or list of tensors (passed through
        as ``image_embeds``).

        Raises:
            RuntimeError: If no image processor is available for PIL input.
            TypeError: If the data is of none of the accepted kinds.
        """
        model_config = ctx.model_config

        # Processed by input processor
        if isinstance(data, BatchFeature):
            return MultiModalInputs(data.data)

        is_pil_input = isinstance(data, Image.Image) \
            or is_list_of(data, Image.Image)

        if not is_pil_input:
            # Image embedding
            if isinstance(data, torch.Tensor) \
                    or is_list_of(data, torch.Tensor):
                return MultiModalInputs({"image_embeds": data})

            raise TypeError(f"Invalid image type: {type(data)}")

        # PIL image
        processor = self._get_hf_image_processor(
            model_config,
            mm_processor_kwargs,
        )
        if processor is None:
            raise RuntimeError("No HuggingFace processor is available "
                               "to process the image object")

        try:
            # NOTE: It may make sense to forward the mm_processor_kwargs
            # here too. For now, to keep it simple, we only allow it be
            # used for the initialization call though, just in case the
            # signatures of the preprocessor initializer don't match
            # preprocess()
            features = processor.preprocess(data, return_tensors="pt")
            batch_data = features.data
        except Exception:
            logger.error(
                "Failed to process image (%s) with the default mapper. "
                "This is most likely an edge-case with this model's image "
                "processor in transformers (type: %s), and not vLLM.",
                data,
                type(processor).__name__)
            raise

        return MultiModalInputs(batch_data)

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the fixed default token budget for one image."""
        return 3000
|
||||
243
vllm/multimodal/registry.py
Normal file
243
vllm/multimodal/registry.py
Normal file
@@ -0,0 +1,243 @@
|
||||
import functools
|
||||
from collections import UserDict
|
||||
from typing import Any, Dict, Mapping, Optional, Sequence
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .audio import AudioPlugin
|
||||
from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
|
||||
MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
|
||||
from .image import ImagePlugin
|
||||
from .video import VideoPlugin
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class _MultiModalLimits(UserDict):
    """
    Wraps `_limits_by_model` for a more informative error message
    when attempting to access a model that does not exist.
    """

    def __getitem__(self, key: ModelConfig) -> Dict[str, int]:
        """
        Return the limits stored for ``key``.

        A missing key raises a :exc:`KeyError` whose message names the model
        and points at `init_mm_limits_per_prompt`; the original error is
        kept as its cause.
        """
        try:
            limits = super().__getitem__(key)
        except KeyError as exc:
            raise KeyError(
                f"Cannot find `mm_limits` for model={key.model}. Did you "
                "forget to call `init_mm_limits_per_prompt`?") from exc

        return limits
|
||||
|
||||
|
||||
class MultiModalRegistry:
    """
    A registry that dispatches data processing to the
    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
    """

    DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin())

    def __init__(
            self,
            *,
            plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None:
        """
        Args:
            plugins: The plugins to register initially, indexed by the key
                each one returns from ``get_data_key()``.
        """
        self._plugins = {p.get_data_key(): p for p in plugins}

        # This is used for non-multimodal models. It is kept in sync with
        # `self._plugins` (see `register_plugin`) and copied per model.
        self._disabled_limits_per_plugin = {k: 0 for k in self._plugins}

        self._limits_by_model = _MultiModalLimits()

    def register_plugin(self, plugin: MultiModalPlugin) -> None:
        """
        Register a multi-modal plugin so it can be recognized by vLLM.

        A plugin already registered under the same data key is replaced,
        with a warning.

        See also:
            :ref:`adding_multimodal_plugin`
        """
        data_type_key = plugin.get_data_key()

        if data_type_key in self._plugins:
            logger.warning(
                "A plugin is already registered for data type %s, "
                "and will be overwritten by the new plugin %s.", data_type_key,
                plugin)

        self._plugins[data_type_key] = plugin

        # Keep the "disabled" limits aligned with the registered plugins, so
        # that non-multimodal models initialized from now on have an entry
        # for every plugin that `get_max_multimodal_tokens` iterates over.
        self._disabled_limits_per_plugin.setdefault(data_type_key, 0)

    def _get_plugin(self, data_type_key: str):
        """
        Return the plugin registered for ``data_type_key``.

        Raises:
            NotImplementedError: If no plugin is registered for the key.
        """
        plugin = self._plugins.get(data_type_key)
        if plugin is not None:
            return plugin

        msg = f"Unknown multi-modal data type: {data_type_key}"
        raise NotImplementedError(msg)

    def register_input_mapper(
        self,
        data_type_key: str,
        mapper: Optional[MultiModalInputMapper] = None,
    ):
        """
        Register an input mapper for a specific modality to a model class.

        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
        """
        return self._get_plugin(data_type_key).register_input_mapper(mapper)

    def register_image_input_mapper(
        self,
        mapper: Optional[MultiModalInputMapper] = None,
    ):
        """
        Register an input mapper for image data to a model class.

        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
        """
        return self.register_input_mapper("image", mapper)

    def map_input(
        self,
        model_config: ModelConfig,
        data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ) -> MultiModalInputs:
        """
        Apply an input mapper to the data passed to the model.

        The data belonging to each modality is passed to the corresponding
        plugin which in turn converts the data into keyword arguments
        via the input mapper registered for that model.

        See :meth:`MultiModalPlugin.map_input` for more details.

        Raises:
            ValueError: If a modality has more items than its per-prompt
                limit, or if two input mappers produce the same keyword
                argument.

        Note:
            This should be called after :meth:`init_mm_limits_per_prompt`.
        """
        merged_dict: Dict[str, NestedTensors] = {}

        for data_key, data_value in data.items():
            plugin = self._get_plugin(data_key)

            # A non-list value counts as a single item.
            num_items = len(data_value) if isinstance(data_value, list) else 1
            max_items = self._limits_by_model[model_config][data_key]
            if num_items > max_items:
                raise ValueError(
                    f"You set {data_key}={max_items} (or defaulted to 1) in "
                    f"`--limit-mm-per-prompt`, but found {num_items} items "
                    "in the same prompt.")

            input_dict = plugin.map_input(model_config, data_value,
                                          mm_processor_kwargs)
            for input_key, input_tensor in input_dict.items():
                if input_key in merged_dict:
                    raise ValueError(f"The input mappers (keys={set(data)}) "
                                     f"resulted in a conflicting keyword "
                                     f"argument to `forward()`: {input_key}")

                merged_dict[input_key] = input_tensor

        return MultiModalInputs(merged_dict)

    def create_input_mapper(self, model_config: ModelConfig):
        """
        Create an input mapper (see :meth:`map_input`) for a specific model.
        """
        # NOTE - we currently make the assumption that if a model has multiple
        # supported modalities, they take the same kwargs. For the default,
        # this could be an issue in the future if it falls back to two HF
        # resources and we can't inspect the signature easily since it's
        # getting initialized through the autoclass.
        #
        # If this is a problem in the future, we should revisit it, but since
        # it potentially introduces a lot of complexity for a currently
        # uncommon case, we do not for simplicity of both use & implementation
        return functools.partial(self.map_input, model_config)

    def register_max_multimodal_tokens(
        self,
        data_type_key: str,
        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
    ):
        """
        Register the maximum number of tokens, corresponding to a single
        instance of multimodal data belonging to a specific modality, that are
        passed to the language model for a model class.
        """
        return self._get_plugin(data_type_key) \
            .register_max_multimodal_tokens(max_mm_tokens)

    def register_max_image_tokens(
        self,
        max_mm_tokens: Optional[MultiModalTokensCalc] = None,
    ):
        """
        Register the maximum number of image tokens, corresponding to a single
        image, that are passed to the language model for a model class.
        """
        return self.register_max_multimodal_tokens("image", max_mm_tokens)

    def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
        """
        Get the maximum number of multi-modal tokens
        for profiling the memory usage of a model.

        The result is the sum, over all registered plugins, of the
        per-prompt limit for the plugin multiplied by the plugin's maximum
        number of tokens for the model.

        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.

        Note:
            This should be called after :meth:`init_mm_limits_per_prompt`.
        """
        limits_per_plugin = self._limits_by_model[model_config]

        return sum((limits_per_plugin[key] *
                    plugin.get_max_multimodal_tokens(model_config))
                   for key, plugin in self._plugins.items())

    def init_mm_limits_per_prompt(
        self,
        model_config: ModelConfig,
    ) -> None:
        """
        Initialize the maximum number of multi-modal input instances for each
        modality that are allowed per prompt for a model class.

        Models without a multimodal config get a limit of 0 for every
        registered plugin. Otherwise each plugin gets the configured limit,
        defaulting to 1.
        """
        if model_config in self._limits_by_model:
            logger.warning(
                "`mm_limits` has already been set for model=%s, and will "
                "be overwritten by the new values.", model_config.model)

        multimodal_config = model_config.multimodal_config
        if multimodal_config is None:
            # Store a copy so that models do not share (and cannot mutate)
            # one another's limits.
            limits_per_plugin = dict(self._disabled_limits_per_plugin)
        else:
            config_limits_per_plugin = multimodal_config.limit_per_prompt

            extra_keys = config_limits_per_plugin.keys() - self._plugins.keys()
            if extra_keys:
                logger.warning(
                    "Detected extra keys in `--limit-mm-per-prompt` which "
                    "are not registered as multi-modal plugins: %s. "
                    "They will be ignored.", extra_keys)

            # NOTE: Currently the default is set to 1 for each plugin
            # TODO: Automatically determine the limits based on budget
            # once more models support multi-image inputs
            limits_per_plugin = {
                key: config_limits_per_plugin.get(key, 1)
                for key in self._plugins
            }

        self._limits_by_model[model_config] = limits_per_plugin

    def get_mm_limits_per_prompt(
        self,
        model_config: ModelConfig,
    ) -> Mapping[str, int]:
        """
        Get the maximum number of multi-modal input instances for each modality
        that are allowed per prompt for a model class.

        Note:
            This should be called after :meth:`init_mm_limits_per_prompt`.
        """
        return self._limits_by_model[model_config]
|
||||
323
vllm/multimodal/utils.py
Normal file
323
vllm/multimodal/utils.py
Normal file
@@ -0,0 +1,323 @@
|
||||
import base64
|
||||
from functools import lru_cache
|
||||
from io import BytesIO
|
||||
from typing import Any, List, Optional, Tuple, TypeVar, Union
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from PIL import Image
|
||||
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal.base import MultiModalDataDict
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Memoized wrapper around get_tokenizer: repeated calls with the same
# (hashable) arguments reuse the cached result instead of calling it again.
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
|
||||
def _load_image_from_bytes(b: bytes):
    """Open the encoded image in ``b`` and load its data before returning."""
    stream = BytesIO(b)
    decoded = Image.open(stream)
    decoded.load()
    return decoded
|
||||
|
||||
|
||||
def _load_image_from_data_url(image_url: str):
    """Load an image from a data URL whose payload follows the first comma."""
    # Only split once and assume the second part is the base64 encoded image
    _header, payload = image_url.split(",", 1)
    return load_image_from_base64(payload)
|
||||
|
||||
|
||||
def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image:
    """
    Load a PIL image from a HTTP or base64 data URL.

    By default, the image is converted into RGB format.

    Raises:
        ValueError: If the URL starts with neither 'http' nor 'data:image'.
    """
    if image_url.startswith('http'):
        raw_bytes = global_http_connection.get_bytes(
            image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT)
        return _load_image_from_bytes(raw_bytes).convert(image_mode)

    if image_url.startswith('data:image'):
        return _load_image_from_data_url(image_url).convert(image_mode)

    raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
                     "with either 'data:image' or 'http'.")
|
||||
|
||||
|
||||
async def async_fetch_image(image_url: str,
                            *,
                            image_mode: str = "RGB") -> Image.Image:
    """
    Asynchronously load a PIL image from a HTTP or base64 data URL.

    By default, the image is converted into RGB format.

    Raises:
        ValueError: If the URL starts with neither 'http' nor 'data:image'.
    """
    if image_url.startswith('http'):
        raw_bytes = await global_http_connection.async_get_bytes(
            image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT)
        return _load_image_from_bytes(raw_bytes).convert(image_mode)

    if image_url.startswith('data:image'):
        return _load_image_from_data_url(image_url).convert(image_mode)

    raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
                     "with either 'data:image' or 'http'.")
|
||||
|
||||
|
||||
def try_import_audio_packages() -> Tuple[Any, Any]:
    """
    Import the optional audio dependencies on demand.

    Returns:
        The ``librosa`` and ``soundfile`` modules, in that order.

    Raises:
        ImportError: If either package is missing; the original import
            error is suppressed in favour of an installation hint.
    """
    try:
        import librosa
        import soundfile
    except ImportError:
        hint = "Please install vllm[audio] for audio support."
        raise ImportError(hint) from None

    return librosa, soundfile
|
||||
|
||||
|
||||
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
    """
    Load audio from a HTTP or base64 data URL.

    The result of ``librosa.load(..., sr=None)`` on the fetched bytes is
    returned as-is.

    Raises:
        ValueError: If the URL starts with neither 'http' nor 'data:audio'.
    """
    librosa, _ = try_import_audio_packages()

    is_remote = audio_url.startswith("http")
    if not is_remote and not audio_url.startswith("data:audio"):
        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
                         "with either 'data:audio' or 'http'.")

    if is_remote:
        payload = global_http_connection.get_bytes(
            audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
    else:
        # Everything after the first comma is the base64 payload.
        _header, encoded = audio_url.split(",", 1)
        payload = base64.b64decode(encoded)

    return librosa.load(BytesIO(payload), sr=None)
|
||||
|
||||
|
||||
async def async_fetch_audio(
        audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
    """
    Asynchronously fetch audio from a HTTP or base64 ``data:audio`` URL.

    Returns:
        Whatever ``librosa.load`` returns for the fetched bytes when called
        with ``sr=None``.

    Raises:
        ImportError: If the audio packages are not installed.
        ValueError: If ``audio_url`` starts with neither ``http`` nor
            ``data:audio``.
    """
    librosa, _ = try_import_audio_packages()

    is_http = audio_url.startswith("http")
    if not is_http and not audio_url.startswith("data:audio"):
        raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start "
                         "with either 'data:audio' or 'http'.")

    if is_http:
        payload = await global_http_connection.async_get_bytes(
            audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
    else:
        # Everything after the first comma is the base64 payload.
        _header, encoded = audio_url.split(",", 1)
        payload = base64.b64decode(encoded)

    return librosa.load(BytesIO(payload), sr=None)
|
||||
|
||||
|
||||
def get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
    """Fetch audio from ``audio_url`` and wrap it under the ``"audio"`` key."""
    waveform, sample_rate = fetch_audio(audio_url)
    parsed: MultiModalDataDict = {"audio": (waveform, sample_rate)}
    return parsed
|
||||
|
||||
|
||||
def get_and_parse_image(image_url: str) -> MultiModalDataDict:
    """Fetch an image from ``image_url`` and wrap it under the ``"image"`` key."""
    parsed: MultiModalDataDict = {"image": fetch_image(image_url)}
    return parsed
|
||||
|
||||
|
||||
async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict:
    """Asynchronously fetch audio and wrap it under the ``"audio"`` key."""
    waveform, sample_rate = await async_fetch_audio(audio_url)
    parsed: MultiModalDataDict = {"audio": (waveform, sample_rate)}
    return parsed
|
||||
|
||||
|
||||
async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
    """Asynchronously fetch an image and wrap it under the ``"image"`` key."""
    fetched = await async_fetch_image(image_url)
    parsed: MultiModalDataDict = {"image": fetched}
    return parsed
|
||||
|
||||
|
||||
def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
) -> str:
    """
    Encode audio as base64.

    The samples are written as a WAV file into an in-memory buffer with
    ``soundfile.write``; the buffer contents are then base64-encoded and
    returned as a UTF-8 string.
    """
    _, soundfile = try_import_audio_packages()

    wav_buffer = BytesIO()
    soundfile.write(wav_buffer, audio, sampling_rate, format="WAV")
    wav_bytes = wav_buffer.getvalue()

    encoded = base64.b64encode(wav_bytes)
    return encoded.decode('utf-8')
|
||||
|
||||
|
||||
def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "JPEG",
) -> str:
    """
    Encode a pillow image to base64 format.

    The image is first converted to ``image_mode`` (RGB by default), saved
    into an in-memory buffer using ``format`` (JPEG by default), and the
    resulting bytes are returned base64-encoded as a UTF-8 string.
    """
    converted = image.convert(image_mode)

    sink = BytesIO()
    converted.save(sink, format)
    raw_bytes = sink.getvalue()

    return base64.b64encode(raw_bytes).decode('utf-8')
|
||||
|
||||
|
||||
def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
    """Decode a base64 payload and load the resulting bytes as an image."""
    decoded_bytes = base64.b64decode(image)
    return _load_image_from_bytes(decoded_bytes)
|
||||
|
||||
|
||||
def rescale_image_size(image: Image.Image,
                       size_factor: float,
                       transpose: int = -1) -> Image.Image:
    """
    Rescale the dimensions of an image by a constant factor.

    Width and height are each multiplied by ``size_factor`` and truncated
    to ``int``. When ``transpose`` is non-negative, the resized image is
    additionally transposed using ``Image.Transpose(transpose)``.
    """
    target_size = (int(image.width * size_factor),
                   int(image.height * size_factor))
    resized = image.resize(target_size)

    # A negative value (the default) means "no transpose requested".
    if transpose < 0:
        return resized

    return resized.transpose(Image.Transpose(transpose))
|
||||
|
||||
|
||||
def try_import_video_packages() -> Any:
    """
    Import the optional video dependency on demand.

    Returns:
        The ``cv2`` module.

    Raises:
        ImportError: If ``cv2`` cannot be imported.
    """
    try:
        import cv2
    except ImportError:
        # ``from None`` drops the chained import failure so only the
        # installation hint is shown.
        raise ImportError(
            "Please install vllm[video] for video support.") from None
    else:
        return cv2
|
||||
|
||||
|
||||
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
    """
    Resize every frame of a video to the given size.

    Args:
        frames: 4-D array; the first axis indexes frames and the last axis
            is the channel count (the two middle axes are replaced).
        size: Target ``(height, width)``.

    Returns:
        A new array of shape ``(num_frames, height, width, channels)`` with
        the same dtype as ``frames``.
    """
    cv2 = try_import_video_packages()

    frame_count, _, _, channel_count = frames.shape
    target_height, target_width = size

    output = np.empty(
        (frame_count, target_height, target_width, channel_count),
        dtype=frames.dtype)

    # cv2.resize is given the target as (width, height).
    cv2_size = (target_width, target_height)
    for index, frame in enumerate(frames):
        output[index] = cv2.resize(frame, cv2_size)

    return output
|
||||
|
||||
|
||||
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """
    Rescale the height and width of every frame by a constant factor.

    The height and width are read from axes 1 and 2 of ``frames``, each
    multiplied by ``size_factor`` and truncated to ``int``, and the result
    is passed to ``resize_video`` as ``(height, width)``.
    """
    _, height, width, _ = frames.shape
    target_size = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, target_size)
|
||||
|
||||
|
||||
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """
    Pick ``num_frames`` evenly spaced frames along the first axis.

    Args:
        frames: Array whose first axis indexes frames.
        num_frames: Number of frames to keep; ``-1`` returns ``frames``
            unchanged.

    Returns:
        ``frames`` itself when ``num_frames == -1``; otherwise the frames
        at the integer indices produced by ``np.linspace`` from the first
        to the last frame.
    """
    # Sentinel: keep every frame.
    if num_frames == -1:
        return frames

    last_index = frames.shape[0] - 1
    picked = np.linspace(0, last_index, num_frames, dtype=int)
    return frames[picked, ...]
|
||||
|
||||
|
||||
# Utilities for input processors
|
||||
_T = TypeVar("_T", str, int)


def repeat_and_pad_token(
    token: _T,
    *,
    repeat_count: int = 1,
    pad_token_left: Optional[_T] = None,
    pad_token_right: Optional[_T] = None,
) -> List[_T]:
    """
    Repeat ``token`` and optionally surround the run with padding tokens.

    Args:
        token: The token (string or ID) to repeat.
        repeat_count: How many copies of ``token`` to emit.
        pad_token_left: If not ``None``, placed before the repeated run.
        pad_token_right: If not ``None``, placed after the repeated run.

    Returns:
        A new list: ``[pad_left?] + [token] * repeat_count + [pad_right?]``.
    """
    prefix = [] if pad_token_left is None else [pad_token_left]
    suffix = [] if pad_token_right is None else [pad_token_right]
    return prefix + [token] * repeat_count + suffix
|
||||
|
||||
|
||||
def repeat_and_pad_placeholder_tokens(
    tokenizer: AnyTokenizer,
    prompt: Optional[str],
    prompt_token_ids: List[int],
    *,
    placeholder_token_id: int,
    repeat_count: Union[int, List[int]],
    pad_token_left: Optional[int] = None,
    pad_token_right: Optional[int] = None,
) -> Tuple[Optional[str], List[int]]:
    """
    Expand multi-modal placeholders in a prompt and in its token IDs.

    The i-th occurrence of the placeholder is replaced by
    ``repeat_count[i]`` copies of itself, optionally wrapped in
    ``pad_token_left`` / ``pad_token_right``. The expansion is applied to
    the text ``prompt`` (when given) and to ``prompt_token_ids``.
    Occurrences beyond ``len(repeat_count)`` are left untouched.

    Args:
        tokenizer: Used to decode the placeholder and padding token IDs
            into their string form for the text prompt.
        prompt: The text prompt, or ``None`` to only process token IDs.
        prompt_token_ids: The tokenized prompt.
        placeholder_token_id: Token ID of the placeholder to expand.
        repeat_count: Number of repetitions per placeholder occurrence; a
            single ``int`` is treated as a one-element list.
        pad_token_left: Optional token ID inserted before each expansion.
        pad_token_right: Optional token ID inserted after each expansion.

    Returns:
        ``(new_prompt, new_token_ids)``; ``new_prompt`` is ``None`` when
        ``prompt`` is ``None``.
    """
    if isinstance(repeat_count, int):
        repeat_count = [repeat_count]

    if prompt is None:
        new_prompt = None
    else:
        placeholder_token_str = tokenizer.decode(placeholder_token_id)
        pad_token_str_left = (None if pad_token_left is None else
                              tokenizer.decode(pad_token_left))
        pad_token_str_right = (None if pad_token_right is None else
                               tokenizer.decode(pad_token_right))

        placeholder_token_count = prompt.count(placeholder_token_str)
        # This is an arbitrary number to distinguish between the two cases
        if placeholder_token_count > 16:
            logger.warning(
                "Please follow the prompt format that is "
                "documented on HuggingFace which does not involve "
                "repeating %s tokens.", placeholder_token_str)
        if placeholder_token_count < len(repeat_count):
            logger.warning(
                "The number of multi-modal placeholder tokens in the prompt "
                "is less than the number of multi-modal inputs. Extra "
                "placeholder tokens will be treated as plain text")
            # The truncated list is also used for the token IDs below.
            repeat_count = repeat_count[:placeholder_token_count]

        # maxsplit guarantees exactly len(repeat_count) + 1 parts here.
        prompt_parts = prompt.split(placeholder_token_str,
                                    maxsplit=len(repeat_count))
        pieces: List[str] = []
        for i, repeat_count_item in enumerate(repeat_count):
            replacement_str = "".join(
                repeat_and_pad_token(
                    placeholder_token_str,
                    repeat_count=repeat_count_item,
                    pad_token_left=pad_token_str_left,
                    pad_token_right=pad_token_str_right,
                ))
            # The image tokens are removed to be consistent with HuggingFace
            pieces.append(prompt_parts[i])
            pieces.append(replacement_str)
        pieces.append(prompt_parts[-1])
        new_prompt = "".join(pieces)

    new_token_ids: List[int] = []
    placeholder_token_idx = 0
    for i, token in enumerate(prompt_token_ids):
        # The index guard only matters when ``repeat_count`` is empty: the
        # placeholder is then copied through as a plain token instead of
        # raising IndexError on ``repeat_count[0]``.
        if (token == placeholder_token_id
                and placeholder_token_idx < len(repeat_count)):
            replacement_ids = repeat_and_pad_token(
                placeholder_token_id,
                repeat_count=repeat_count[placeholder_token_idx],
                pad_token_left=pad_token_left,
                pad_token_right=pad_token_right,
            )
            new_token_ids.extend(replacement_ids)
            placeholder_token_idx += 1

            # No need to further scan the list since we replaced all tokens
            if placeholder_token_idx >= len(repeat_count):
                new_token_ids.extend(prompt_token_ids[i + 1:])
                break
        else:
            new_token_ids.append(token)

    return new_prompt, new_token_ids
|
||||
86
vllm/multimodal/video.py
Normal file
86
vllm/multimodal/video.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs.registry import InputContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import get_video_processor
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .base import MultiModalData, MultiModalInputs
|
||||
from .image import ImagePlugin
|
||||
|
||||
logger = init_logger(__name__)

# Wrap the processor/tokenizer getters in functools.lru_cache (default
# settings) so repeated calls with the same arguments reuse the cached result.
cached_get_video_processor = lru_cache(get_video_processor)
cached_get_tokenizer = lru_cache(get_tokenizer)

# Type alias for video inputs: a single array or a list of arrays.
VideoInput = Union[
    "np.ndarray",  # single video input
    List["np.ndarray"],
    # TODO: support more types
    # List[Image.Image], List[List[Image.Image]],
    # "torch.Tensor",
    # List["torch.Tensor"],
    # List[List["np.ndarray"]],
    # List[List["torch.Tensor"]],
]
|
||||
|
||||
|
||||
class VideoPlugin(ImagePlugin):
    """Plugin for video data."""

    def get_data_key(self) -> str:
        """Return the data key handled by this plugin (``"video"``)."""
        return "video"

    def _get_hf_video_processor(
        self,
        model_config: ModelConfig,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Look up the (cached) video processor for a model.

        Args:
            model_config: Supplies ``model`` and ``trust_remote_code``.
            mm_processor_kwargs: Extra keyword arguments forwarded to the
                cached getter; ``None`` is treated as no extra arguments.
        """
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}
        return cached_get_video_processor(
            model_config.model,
            trust_remote_code=model_config.trust_remote_code,
            **mm_processor_kwargs)

    def _default_input_mapper(
        self,
        ctx: InputContext,
        data: MultiModalData[object],
        **mm_processor_kwargs,
    ) -> MultiModalInputs:
        """
        Map a single video array to model inputs via the video processor.

        Args:
            ctx: Input context; its ``model_config`` selects the processor.
            data: The video data. Only a single ``np.ndarray`` is
                supported.
            **mm_processor_kwargs: Forwarded to the processor lookup.

        Returns:
            ``MultiModalInputs`` built from the ``.data`` attribute of the
            processor's output.

        Raises:
            RuntimeError: If no video processor is available.
            NotImplementedError: If ``data`` is a list of arrays.
            TypeError: If ``data`` is of any other type.
        """
        model_config = ctx.model_config

        # single video input as np.ndarray
        if isinstance(data, np.ndarray):
            video_processor = self._get_hf_video_processor(
                model_config,
                mm_processor_kwargs,
            )
            if video_processor is None:
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the video object")
            try:
                # NOTE: Similar to image; it may be a good idea to filter and
                # pass mm_processor_kwargs here too, but for now we don't to
                # avoid extra complexity if the initializer and preprocess
                # signatures of the processor don't align
                batch_data = video_processor(data, return_tensors="pt").data
            except Exception:
                # Log for context, then let the original exception propagate.
                logger.error("Failed to process video (%s)", data)
                raise

            return MultiModalInputs(batch_data)
        elif is_list_of(data, np.ndarray):
            raise NotImplementedError(
                "Multi video for a prompt is not supported yet")

        raise TypeError(f"Invalid video type: {type(data)}")

    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        """Return the default maximum, a fixed 4096 regardless of ``ctx``."""
        return 4096
|
||||
Reference in New Issue
Block a user