# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Radio vision model configuration"""

from transformers.configuration_utils import PretrainedConfig
|
|
from transformers.utils import logging
|
|
|
|
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)
|
|
|
|
# Architecture dimensions for the supported ViT variants, keyed by model name.
# Each value is ordered as
#   (hidden_size, num_hidden_layers, num_attention_heads, intermediate_size),
# which is the order in which RadioConfig.__init__ unpacks it.
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
    "vit_small_patch16_224": (384, 12, 6, 1536),
    "vit_base_patch16_224": (768, 12, 12, 3072),
    "vit_large_patch16_224": (1024, 24, 16, 4096),
    "vit_huge_patch16_224": (1280, 32, 16, 5120),
}
|
|
|
|
# Per-channel image normalization statistics; RadioConfig uses them as the
# default values of its `norm_mean` and `norm_std` arguments.
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
|
|
|
|
|
|
class RadioConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a Radio
    vision model. It is used to instantiate a Radio model according to the
    specified arguments, defining the model architecture.

    Args:
        model_name: Name of the vision transformer model
            (e.g., "vit_base_patch16_224"). Used to determine architecture
            dimensions from `VIT_TIMM_DIM_BY_NAME`.
        image_size: The size (resolution) of each image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        qk_normalization: Whether to apply normalization to queries and keys.
        norm_type: The normalization type to use.
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
        max_img_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073).
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
        reg_tokens: Number of register tokens to use.
        **kwargs: Forwarded unchanged to `PretrainedConfig.__init__`.

    Raises:
        KeyError: If `model_name` is not a key of `VIT_TIMM_DIM_BY_NAME`.
    """

    model_type = "radio"

    # NOTE(review): `model_name` has no default value. Any base-class helper
    # that instantiates the config class without arguments would fail for this
    # class -- confirm whether that matters for how this config is used.
    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
        reg_tokens: int | None = None,
        **kwargs,
    ):
        self.model_name = model_name

        # Resolve the architecture dimensions from the model name. The
        # exception type stays KeyError (as with the plain dict lookup), but
        # the message now names the offending value and the valid choices.
        try:
            dims = VIT_TIMM_DIM_BY_NAME[model_name]
        except KeyError:
            supported = ", ".join(sorted(VIT_TIMM_DIM_BY_NAME))
            raise KeyError(
                f"Unsupported Radio model_name {model_name!r}; "
                f"expected one of: {supported}"
            ) from None
        (
            self.hidden_size,
            self.num_hidden_layers,
            self.num_attention_heads,
            self.intermediate_size,
        ) = dims

        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size

        # Tuples and lists are copied into a fresh list, so the shared
        # module-level default tuples are never stored on the instance; any
        # other value is stored as given.
        self.norm_mean = (
            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
        )
        self.norm_std = (
            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
        )
        self.reg_tokens = reg_tokens

        # Attributes are set first; remaining kwargs go to the base class.
        super().__init__(**kwargs)
|