[gpt-oss] Add gpt-oss bf16 support
This commit is contained in:
237
vllm/inputs/registry.py
Normal file
237
vllm/inputs/registry.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
|
||||
|
||||
import torch
|
||||
from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
from vllm.jsontree import JSONTree, json_map_leaves
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import resolve_mm_processor_kwargs
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
|
||||
MultiModalRegistry)
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
_T = TypeVar("_T")
|
||||
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputContext:
|
||||
"""
|
||||
Contains information about the model which may be used to
|
||||
modify the inputs.
|
||||
"""
|
||||
|
||||
model_config: "ModelConfig"
|
||||
"""The configuration of the model."""
|
||||
|
||||
def get_hf_config(
|
||||
self,
|
||||
typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
|
||||
/,
|
||||
) -> _C:
|
||||
"""
|
||||
Get the HuggingFace configuration
|
||||
(`transformers.PretrainedConfig`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
TypeError: If the configuration is not of the specified type.
|
||||
"""
|
||||
hf_config = self.model_config.hf_config
|
||||
if not isinstance(hf_config, typ):
|
||||
raise TypeError("Invalid type of HuggingFace config. "
|
||||
f"Expected type: {typ}, but "
|
||||
f"found type: {type(hf_config)}")
|
||||
|
||||
return hf_config
|
||||
|
||||
def get_hf_image_processor_config(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get the HuggingFace image processor configuration of the model.
|
||||
"""
|
||||
return self.model_config.hf_image_processor_config
|
||||
|
||||
def get_mm_config(self):
|
||||
"""
|
||||
Get the multimodal config of the model.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the model is not a multimodal model.
|
||||
"""
|
||||
mm_config = self.model_config.multimodal_config
|
||||
if mm_config is None:
|
||||
raise RuntimeError("Not a multimodal model")
|
||||
|
||||
return mm_config
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _P:
|
||||
"""
|
||||
Get the HuggingFace processor
|
||||
(`transformers.ProcessorMixin`) of the model,
|
||||
additionally checking its type.
|
||||
|
||||
Raises:
|
||||
TypeError: If the processor is not of the specified type.
|
||||
"""
|
||||
return cached_processor_from_config(
|
||||
self.model_config,
|
||||
processor_cls=typ,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def init_processor(
|
||||
self,
|
||||
typ: type[_T],
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _T:
|
||||
"""
|
||||
Initialize a HuggingFace-like processor class, merging the
|
||||
keyword arguments with those in the model's configuration.
|
||||
"""
|
||||
mm_config = self.model_config.get_multimodal_config()
|
||||
base_kwargs = mm_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
merged_kwargs = {**base_kwargs, **kwargs}
|
||||
|
||||
return typ(**merged_kwargs)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputProcessingContext(InputContext):
|
||||
tokenizer: AnyTokenizer
|
||||
"""The tokenizer used to tokenize the inputs."""
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _P:
|
||||
return super().get_hf_processor(
|
||||
typ,
|
||||
tokenizer=self.tokenizer,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def call_hf_processor(
|
||||
self,
|
||||
hf_processor: ProcessorMixin,
|
||||
data: Mapping[str, object],
|
||||
kwargs: Mapping[str, object] = {},
|
||||
) -> Union[BatchFeature, JSONTree]:
|
||||
"""
|
||||
Call `hf_processor` on the prompt `data`
|
||||
(text, image, audio...) with configurable options `kwargs`.
|
||||
"""
|
||||
assert callable(hf_processor)
|
||||
|
||||
mm_config = self.model_config.get_multimodal_config()
|
||||
base_kwargs = mm_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
merged_kwargs = resolve_mm_processor_kwargs(
|
||||
base_kwargs,
|
||||
kwargs,
|
||||
hf_processor,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
def maybe_cast_dtype(x):
|
||||
# This mimics the behavior of transformers.BatchFeature
|
||||
if isinstance(x, torch.Tensor) and x.is_floating_point():
|
||||
return x.to(dtype=self.model_config.dtype)
|
||||
return x
|
||||
|
||||
try:
|
||||
output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
|
||||
# this emulates output.to(dtype=self.model_config.dtype)
|
||||
cast_output = json_map_leaves(maybe_cast_dtype, output)
|
||||
if isinstance(output, BatchFeature):
|
||||
return BatchFeature(cast_output)
|
||||
|
||||
logger.warning_once(
|
||||
f"{type(hf_processor).__name__} did not return `BatchFeature`. "
|
||||
"Make sure to match the behaviour of `ProcessorMixin` when "
|
||||
"implementing custom processors.")
|
||||
return cast_output
|
||||
|
||||
except Exception as exc:
|
||||
msg = (f"Failed to apply {type(hf_processor).__name__} "
|
||||
f"on data={data} with kwargs={merged_kwargs}")
|
||||
|
||||
raise ValueError(msg) from exc
|
||||
|
||||
|
||||
class DummyData(NamedTuple):
|
||||
"""
|
||||
Dummy data used for profiling.
|
||||
|
||||
Note: This is only used in V0.
|
||||
"""
|
||||
|
||||
seq_data: "SequenceData"
|
||||
multi_modal_data: Optional["MultiModalDataDict"] = None
|
||||
multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
|
||||
|
||||
|
||||
class InputRegistry:
|
||||
"""
|
||||
Note: This is only used in V0.
|
||||
"""
|
||||
|
||||
def dummy_data_for_profiling(
|
||||
self,
|
||||
model_config: "ModelConfig",
|
||||
seq_len: int,
|
||||
mm_registry: "MultiModalRegistry",
|
||||
is_encoder_data: bool = False,
|
||||
) -> DummyData:
|
||||
"""
|
||||
Create dummy data for profiling the memory usage of a model.
|
||||
|
||||
The model is identified by ``model_config``.
|
||||
"""
|
||||
# Avoid circular import
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
if not model_config.is_multimodal_model:
|
||||
seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
|
||||
return DummyData(seq_data=seq_data)
|
||||
|
||||
# Encoder dummy data does not contain multi-modal data
|
||||
if is_encoder_data:
|
||||
enc_data = mm_registry.get_encoder_dummy_data(
|
||||
model_config, seq_len)
|
||||
seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids)
|
||||
return DummyData(seq_data=seq_data)
|
||||
|
||||
dec_data = mm_registry.get_decoder_dummy_data(model_config, seq_len)
|
||||
|
||||
return DummyData(
|
||||
seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids),
|
||||
multi_modal_data=dec_data.multi_modal_data,
|
||||
multi_modal_placeholders=dec_data.multi_modal_placeholders,
|
||||
)
|
||||
Reference in New Issue
Block a user