Sync from v0.13
This commit is contained in:
@@ -1,16 +1,102 @@
|
||||
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
|
||||
from vllm.transformers_utils.configs.dbrx import DbrxConfig
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.jais import JAISConfig
|
||||
from vllm.transformers_utils.configs.mpt import MPTConfig
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Model configs may be defined in this directory for the following reasons:
|
||||
|
||||
- There is no configuration file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing config to support vLLM.
|
||||
- The HF model_type isn't recognized by the Transformers library but can
|
||||
be mapped to an existing Transformers config, such as
|
||||
deepseek-ai/DeepSeek-V3.2-Exp.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
|
||||
_CLASS_TO_MODULE: dict[str, str] = {
|
||||
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
|
||||
"BagelConfig": "vllm.transformers_utils.configs.bagel",
|
||||
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
|
||||
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
|
||||
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
|
||||
"EAGLEConfig": "vllm.transformers_utils.configs.eagle",
|
||||
"FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
|
||||
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
"RWConfig": "vllm.transformers_utils.configs.falcon",
|
||||
"JAISConfig": "vllm.transformers_utils.configs.jais",
|
||||
"Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
|
||||
"MedusaConfig": "vllm.transformers_utils.configs.medusa",
|
||||
"MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
|
||||
"MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
|
||||
"MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
|
||||
"KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
|
||||
"KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
|
||||
"NemotronConfig": "vllm.transformers_utils.configs.nemotron",
|
||||
"NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
|
||||
"Olmo3Config": "vllm.transformers_utils.configs.olmo3",
|
||||
"OvisConfig": "vllm.transformers_utils.configs.ovis",
|
||||
"RadioConfig": "vllm.transformers_utils.configs.radio",
|
||||
"SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
|
||||
"UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
|
||||
"Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
|
||||
"Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
|
||||
"Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
|
||||
"Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
|
||||
"Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
|
||||
# Special case: DeepseekV3Config is from HuggingFace Transformers
|
||||
"DeepseekV3Config": "transformers",
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
"AfmoeConfig",
|
||||
"BagelConfig",
|
||||
"ChatGLMConfig",
|
||||
"DbrxConfig",
|
||||
"MPTConfig",
|
||||
"DeepseekVLV2Config",
|
||||
"DeepseekV3Config",
|
||||
"DotsOCRConfig",
|
||||
"EAGLEConfig",
|
||||
"FlexOlmoConfig",
|
||||
"HunYuanVLConfig",
|
||||
"HunYuanVLTextConfig",
|
||||
"HunYuanVLVisionConfig",
|
||||
"RWConfig",
|
||||
"JAISConfig",
|
||||
"Lfm2MoeConfig",
|
||||
"MedusaConfig",
|
||||
"MiDashengLMConfig",
|
||||
"MLPSpeculatorConfig",
|
||||
"MoonViTConfig",
|
||||
"KimiLinearConfig",
|
||||
"KimiVLConfig",
|
||||
"NemotronConfig",
|
||||
"NemotronHConfig",
|
||||
"Olmo3Config",
|
||||
"OvisConfig",
|
||||
"RadioConfig",
|
||||
"SpeculatorsConfig",
|
||||
"UltravoxConfig",
|
||||
"Step3VLConfig",
|
||||
"Step3VisionEncoderConfig",
|
||||
"Step3TextConfig",
|
||||
"Qwen3NextConfig",
|
||||
"Tarsier2Config",
|
||||
]
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
if name in _CLASS_TO_MODULE:
|
||||
module_name = _CLASS_TO_MODULE[name]
|
||||
module = importlib.import_module(module_name)
|
||||
return getattr(module, name)
|
||||
|
||||
raise AttributeError(f"module 'configs' has no attribute '{name}'")
|
||||
|
||||
|
||||
def __dir__():
|
||||
return sorted(list(__all__))
|
||||
|
||||
87
vllm/transformers_utils/configs/afmoe.py
Normal file
87
vllm/transformers_utils/configs/afmoe.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class AfmoeConfig(PretrainedConfig):
|
||||
model_type = "afmoe"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 200_192,
|
||||
hidden_size: int = 2048,
|
||||
intermediate_size: int = 6144,
|
||||
moe_intermediate_size: int = 1408,
|
||||
num_hidden_layers: int = 32,
|
||||
num_dense_layers: int = 1,
|
||||
num_attention_heads: int = 16,
|
||||
num_key_value_heads: int | None = None,
|
||||
head_dim: int = 128,
|
||||
hidden_act: str = "silu",
|
||||
max_position_embeddings: int = 131072,
|
||||
initializer_range: float = 0.02,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
use_cache: bool = True,
|
||||
tie_word_embeddings: bool = False,
|
||||
rope_parameters: dict | None = None,
|
||||
rope_scaling: dict | None = None,
|
||||
num_experts: int = 64,
|
||||
num_experts_per_tok: int = 6,
|
||||
num_shared_experts: int = 2,
|
||||
num_expert_groups: int = 1,
|
||||
num_limited_groups: int = 1,
|
||||
score_func: str = "sigmoid",
|
||||
route_norm: bool = True,
|
||||
route_scale: float = 1.0,
|
||||
global_attn_every_n_layers: int = 4,
|
||||
sliding_window: int = 2048,
|
||||
layer_types: list[str] | None = None,
|
||||
attention_dropout: float = 0.0,
|
||||
mup_enabled: bool = False,
|
||||
n_group: int = 1,
|
||||
topk_group: int = 1,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_dense_layers = num_dense_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads or num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
self.hidden_act = hidden_act
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
rope_theta = kwargs.pop("rope_theta", 10000.0)
|
||||
if rope_parameters is None:
|
||||
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
|
||||
self.rope_parameters = rope_parameters
|
||||
self.rope_scaling = rope_scaling
|
||||
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_experts = num_experts
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_shared_experts = num_shared_experts
|
||||
self.num_expert_groups = num_expert_groups
|
||||
self.num_limited_groups = num_limited_groups
|
||||
self.score_func = score_func
|
||||
self.route_norm = route_norm
|
||||
self.route_scale = route_scale
|
||||
|
||||
self.global_attn_every_n_layers = global_attn_every_n_layers
|
||||
self.sliding_window = sliding_window
|
||||
self.layer_types = layer_types
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.mup_enabled = mup_enabled
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
|
||||
|
||||
__all__ = ["AfmoeConfig"]
|
||||
216
vllm/transformers_utils/configs/arctic.py
Normal file
216
vllm/transformers_utils/configs/arctic.py
Normal file
@@ -0,0 +1,216 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||
"""Arctic model configuration"""
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticLoRAConfig:
|
||||
lora_r: int = 64
|
||||
lora_alpha: float = 16
|
||||
shard_base_weights: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticQuantizationConfig:
|
||||
q_bits: int = 8
|
||||
rounding: str = "nearest"
|
||||
mantissa_bits: int = 3
|
||||
group_size: int = 128
|
||||
|
||||
|
||||
class ArcticConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
|
||||
Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
|
||||
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`ArcticModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
|
||||
The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
|
||||
allows sequence of up to 4096*32 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_parameters (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_theta` (`float`): The base period of the RoPE embeddings.
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
sliding_window (`int`, *optional*):
|
||||
Sliding window attention window size. If not specified, will default to `4096`.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 2):
|
||||
The number of experts to root per-token, can be also interpreted as the `top-p` routing
|
||||
parameter
|
||||
num_local_experts (`int`, *optional*, defaults to 8):
|
||||
Number of experts per Sparse MLP layer.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
```python
|
||||
>>> from transformers import ArcticModel, ArcticConfig
|
||||
|
||||
>>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
|
||||
>>> configuration = ArcticConfig()
|
||||
|
||||
>>> # Initializing a model from the Arctic 7B style configuration
|
||||
>>> model = ArcticModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "arctic"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=14336,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=False,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
sliding_window=None,
|
||||
attention_dropout=0.0,
|
||||
num_experts_per_tok=1,
|
||||
num_local_experts=8,
|
||||
router_aux_loss_coef=0.001,
|
||||
moe_layer_frequency=2,
|
||||
parallel_attn_mlp_res=False,
|
||||
moe_train_capacity_factor=1,
|
||||
moe_eval_capacity_factor=1,
|
||||
enable_expert_tensor_parallelism=False,
|
||||
moe_min_capacity=0,
|
||||
moe_token_dropping=True,
|
||||
quantization=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.sliding_window = sliding_window
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
rope_theta = kwargs.pop("rope_theta", 1e6)
|
||||
if rope_parameters is None:
|
||||
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
|
||||
self.rope_parameters = rope_parameters
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_local_experts = num_local_experts
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.moe_layer_frequency = moe_layer_frequency
|
||||
self.moe_train_capacity_factor = moe_train_capacity_factor
|
||||
self.moe_eval_capacity_factor = moe_eval_capacity_factor
|
||||
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
|
||||
self.moe_min_capacity = moe_min_capacity
|
||||
self.moe_token_dropping = moe_token_dropping
|
||||
self.parallel_attn_mlp_res = parallel_attn_mlp_res
|
||||
if isinstance(quantization, dict):
|
||||
self.quantization = ArcticQuantizationConfig(**quantization)
|
||||
else:
|
||||
self.quantization = quantization
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
|
||||
result = super().from_dict(config_dict, **kwargs)
|
||||
config = result[0] if isinstance(result, tuple) else result
|
||||
if isinstance(config.quantization, dict):
|
||||
config.quantization = ArcticQuantizationConfig(**config.quantization)
|
||||
return result
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
ret = super().to_dict()
|
||||
if isinstance(ret["quantization"], ArcticQuantizationConfig):
|
||||
ret["quantization"] = asdict(ret["quantization"])
|
||||
return ret
|
||||
53
vllm/transformers_utils/configs/bagel.py
Normal file
53
vllm/transformers_utils/configs/bagel.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from transformers import PretrainedConfig, SiglipVisionConfig
|
||||
from transformers.models.qwen2 import Qwen2Config
|
||||
|
||||
|
||||
class BagelConfig(PretrainedConfig):
|
||||
"""Configuration class for BAGEL model."""
|
||||
|
||||
model_type = "bagel"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
visual_gen: bool = True,
|
||||
visual_und: bool = True,
|
||||
llm_config: dict | Qwen2Config | None = None,
|
||||
vit_config: dict | SiglipVisionConfig | None = None,
|
||||
vae_config: dict | None = None,
|
||||
latent_patch_size: int = 2,
|
||||
max_latent_size: int = 32,
|
||||
vit_max_num_patch_per_side: int = 70,
|
||||
connector_act: str = "gelu_pytorch_tanh",
|
||||
interpolate_pos: bool = False,
|
||||
timestep_shift: float = 1.0,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.visual_gen = visual_gen
|
||||
self.visual_und = visual_und
|
||||
|
||||
# Convert dict configs to proper config objects
|
||||
if isinstance(llm_config, dict):
|
||||
self.llm_config = Qwen2Config(**llm_config)
|
||||
else:
|
||||
self.llm_config = llm_config or Qwen2Config()
|
||||
|
||||
if isinstance(vit_config, dict):
|
||||
self.vit_config = SiglipVisionConfig(**vit_config)
|
||||
else:
|
||||
self.vit_config = vit_config or SiglipVisionConfig()
|
||||
|
||||
self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}
|
||||
self.latent_patch_size = latent_patch_size
|
||||
self.max_latent_size = max_latent_size
|
||||
self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
|
||||
self.connector_act = connector_act
|
||||
self.interpolate_pos = interpolate_pos
|
||||
self.timestep_shift = timestep_shift
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
"""Return the hidden size of the language model."""
|
||||
return self.llm_config.hidden_size
|
||||
@@ -1,6 +1,8 @@
|
||||
# coding=utf-8
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
# https://github.com/zai-org/ChatGLM2-6B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
@@ -11,33 +13,35 @@ class ChatGLMConfig(PretrainedConfig):
|
||||
"n_head_kv": "multi_query_group_num",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
num_layers=28,
|
||||
padded_vocab_size=65024,
|
||||
hidden_size=4096,
|
||||
ffn_hidden_size=13696,
|
||||
kv_channels=128,
|
||||
num_attention_heads=32,
|
||||
seq_length=2048,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layernorm_epsilon=1e-5,
|
||||
rmsnorm=True,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
post_layer_norm=True,
|
||||
add_bias_linear=False,
|
||||
add_qkv_bias=False,
|
||||
interleaved_qkv=False,
|
||||
bias_dropout_fusion=True,
|
||||
multi_query_attention=False,
|
||||
multi_query_group_num=1,
|
||||
apply_query_key_layer_scaling=True,
|
||||
attention_softmax_in_fp32=True,
|
||||
fp32_residual_connection=False,
|
||||
quantization_bit=0,
|
||||
pre_seq_len=None,
|
||||
prefix_projection=False,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
num_layers=28,
|
||||
padded_vocab_size=65024,
|
||||
hidden_size=4096,
|
||||
ffn_hidden_size=13696,
|
||||
kv_channels=128,
|
||||
num_attention_heads=32,
|
||||
seq_length=2048,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layernorm_epsilon=1e-5,
|
||||
rmsnorm=True,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
post_layer_norm=True,
|
||||
add_bias_linear=False,
|
||||
add_qkv_bias=False,
|
||||
interleaved_qkv=False,
|
||||
bias_dropout_fusion=True,
|
||||
multi_query_attention=False,
|
||||
multi_query_group_num=1,
|
||||
apply_query_key_layer_scaling=True,
|
||||
attention_softmax_in_fp32=True,
|
||||
fp32_residual_connection=False,
|
||||
quantization_bit=0,
|
||||
pre_seq_len=None,
|
||||
prefix_projection=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.num_layers = num_layers
|
||||
self.vocab_size = padded_vocab_size
|
||||
self.padded_vocab_size = padded_vocab_size
|
||||
@@ -46,12 +50,15 @@ class ChatGLMConfig(PretrainedConfig):
|
||||
self.kv_channels = kv_channels
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.seq_length = seq_length
|
||||
# It is to be compatible with long lora.
|
||||
self.max_position_embeddings = seq_length
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layernorm_epsilon = layernorm_epsilon
|
||||
self.rmsnorm = rmsnorm
|
||||
self.apply_residual_connection_post_layernorm = (
|
||||
apply_residual_connection_post_layernorm)
|
||||
apply_residual_connection_post_layernorm
|
||||
)
|
||||
self.post_layer_norm = post_layer_norm
|
||||
self.add_bias_linear = add_bias_linear
|
||||
self.add_qkv_bias = add_qkv_bias
|
||||
|
||||
@@ -1,278 +0,0 @@
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
|
||||
"""Dbrx configuration."""
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
|
||||
|
||||
|
||||
class DbrxAttentionConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx Attention.
|
||||
|
||||
[`DbrxAttention`] class. It is used to instantiate attention layers
|
||||
according to the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the attention layers.
|
||||
clip_qkv (`float`, *optional*, defaults to None):
|
||||
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
||||
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
|
||||
rope_theta (float): The base frequency for rope.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attn_pdrop: float = 0,
|
||||
clip_qkv: Optional[float] = None,
|
||||
kv_n_heads: int = 1,
|
||||
rope_theta: float = 10000.0,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.clip_qkv = clip_qkv
|
||||
self.kv_n_heads = kv_n_heads
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["attn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all configurations of "
|
||||
"models and can yield errors.",
|
||||
config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxFFNConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx FFN.
|
||||
|
||||
[`DbrxFFN`] class. It is used to instantiate feedforward layers according to
|
||||
the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
|
||||
The dict should have a key 'name' with the value being the name of
|
||||
the activation function along with any additional keyword arguments.
|
||||
ffn_hidden_size (int, optional): The hidden size of the feedforward network.
|
||||
moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
|
||||
moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
|
||||
moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
|
||||
moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
|
||||
moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
|
||||
uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
|
||||
This should only be used for benchmarking purposes.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ffn_act_fn: Optional[dict] = None,
|
||||
ffn_hidden_size: int = 3584,
|
||||
moe_num_experts: int = 4,
|
||||
moe_top_k: int = 1,
|
||||
moe_jitter_eps: Optional[float] = None,
|
||||
moe_loss_weight: float = 0.01,
|
||||
moe_normalize_expert_weights: Optional[float] = 1,
|
||||
uniform_expert_assignment: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__()
|
||||
if ffn_act_fn is None:
|
||||
ffn_act_fn = {"name": "silu"}
|
||||
self.ffn_act_fn = ffn_act_fn
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.moe_num_experts = moe_num_experts
|
||||
self.moe_top_k = moe_top_k
|
||||
self.moe_jitter_eps = moe_jitter_eps
|
||||
self.moe_loss_weight = moe_loss_weight
|
||||
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
||||
self.uniform_expert_assignment = uniform_expert_assignment
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["ffn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all "
|
||||
"configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx.
|
||||
|
||||
[`DbrxModel`]. It is used to instantiate a Dbrx model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
d_model (`int`, *optional*, defaults to 6144):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
n_layers (`int`, *optional*, defaults to 40):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
max_seq_len (`int`, *optional*, defaults to 32768):
|
||||
The maximum sequence length of the model.
|
||||
vocab_size (`int`, *optional*, defaults to 100352):
|
||||
Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
|
||||
the `inputs_ids` passed when calling [`DbrxModel`].
|
||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability applied to the attention output before combining with residual.
|
||||
emb_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the embedding layer.
|
||||
attn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's attention module.
|
||||
ffn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's FFN module.
|
||||
use_cache (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the router logits should be returned by the model. Enabling this will also
|
||||
allow the model to output the auxiliary loss. See [here]() for more details
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
|
||||
Example:
|
||||
```python
|
||||
>>> from transformers import DbrxConfig, DbrxModel
|
||||
|
||||
>>> # Initializing a Dbrx configuration
|
||||
>>> configuration = DbrxConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the configuration
|
||||
>>> model = DbrxModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
"""
|
||||
|
||||
model_type = "dbrx"
|
||||
attribute_map = {
|
||||
"num_attention_heads": "n_heads",
|
||||
"hidden_size": "d_model",
|
||||
"num_hidden_layers": "n_layers",
|
||||
"max_position_embeddings": "max_seq_len",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 32000,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
attn_config: Optional[DbrxAttentionConfig] = None,
|
||||
ffn_config: Optional[DbrxFFNConfig] = None,
|
||||
use_cache: bool = True,
|
||||
initializer_range: float = 0.02,
|
||||
output_router_logits: bool = False,
|
||||
router_aux_loss_coef: float = 0.05,
|
||||
**kwargs: Any,
|
||||
):
|
||||
if attn_config is None:
|
||||
self.attn_config = DbrxAttentionConfig()
|
||||
elif isinstance(attn_config, dict):
|
||||
self.attn_config = DbrxAttentionConfig(**attn_config)
|
||||
else:
|
||||
self.attn_config = attn_config
|
||||
|
||||
if ffn_config is None:
|
||||
self.ffn_config = DbrxFFNConfig()
|
||||
elif isinstance(ffn_config, dict):
|
||||
self.ffn_config = DbrxFFNConfig(**ffn_config)
|
||||
else:
|
||||
self.ffn_config = ffn_config
|
||||
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.use_cache = use_cache
|
||||
self.initializer_range = initializer_range
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
|
||||
tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
|
||||
if tie_word_embeddings:
|
||||
raise ValueError(
|
||||
"tie_word_embeddings is not supported for Dbrx models."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
126
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
126
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
|
||||
|
||||
from transformers import DeepseekV2Config, PretrainedConfig
|
||||
|
||||
|
||||
class VisionEncoderConfig(PretrainedConfig):
|
||||
model_type: str = "vision"
|
||||
|
||||
model_name: str = "vit_so400m_patch14_siglip_384.webli"
|
||||
image_size: int = 384
|
||||
patch_size: int = 16
|
||||
width: int = 1024
|
||||
layers: int = 24
|
||||
heads: int = 16
|
||||
mlp_ratio: int = 4
|
||||
global_pool: str = "map"
|
||||
ignore_head: bool = True
|
||||
class_token: bool = False
|
||||
num_classes: int = 0
|
||||
use_checkpoint: bool = False
|
||||
weight_init: str = "skip"
|
||||
deterministic: bool = False
|
||||
num_recomputing_layers: int = 0
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = "vit_so400m_patch14_siglip_384.webli",
|
||||
image_size: int = 384,
|
||||
patch_size: int = 16,
|
||||
width: int = 1024,
|
||||
layers: int = 24,
|
||||
heads: int = 16,
|
||||
mlp_ratio: int = 4,
|
||||
global_pool: str = "map",
|
||||
ignore_head: bool = True,
|
||||
class_token: bool = False,
|
||||
num_classes: int = 0,
|
||||
use_checkpoint: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.width = width
|
||||
self.layers = layers
|
||||
self.heads = heads
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.global_pool = global_pool
|
||||
self.ignore_head = ignore_head
|
||||
self.class_token = class_token
|
||||
self.num_classes = num_classes
|
||||
self.use_checkpoint = use_checkpoint
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MlpProjectorConfig(PretrainedConfig):
|
||||
model_type = "mlp_projector"
|
||||
projector_type: str = "downsample_mlp_gelu"
|
||||
input_dim: int = 1152
|
||||
n_embed: int = 2048
|
||||
depth: int = 2
|
||||
mlp_ratio: int = 1
|
||||
downsample_ratio: int = 2
|
||||
token_pooling: bool = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
projector_type: str = "downsample_mlp_gelu",
|
||||
input_dim: int = 1152,
|
||||
n_embed: int = 2048,
|
||||
depth: int = 2,
|
||||
mlp_ratio: int = 1,
|
||||
downsample_ratio: int = 2,
|
||||
**kwargs,
|
||||
):
|
||||
self.projector_type = projector_type
|
||||
self.input_dim = input_dim
|
||||
self.n_embed = n_embed
|
||||
self.depth = depth
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.downsample_ratio = downsample_ratio
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class DeepseekVLV2Config(PretrainedConfig):
|
||||
model_type = "deepseek_vl_v2"
|
||||
vision_config: VisionEncoderConfig
|
||||
projector_config: MlpProjectorConfig
|
||||
|
||||
tile_tag: str = "2D"
|
||||
global_view_pos: str = "head"
|
||||
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tile_tag: str = "tile_tag",
|
||||
global_view_pos: str = "head",
|
||||
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
vision_config = kwargs.get("vision_config", {})
|
||||
self.vision_config = VisionEncoderConfig(**vision_config)
|
||||
|
||||
projector_config = kwargs.get("projector_config", {})
|
||||
self.projector_config = MlpProjectorConfig(**projector_config)
|
||||
|
||||
language_config = kwargs.get("language_config", {})
|
||||
self.text_config = DeepseekV2Config(**language_config)
|
||||
|
||||
self.tile_tag = tile_tag
|
||||
self.global_view_pos = global_view_pos
|
||||
self.candidate_resolutions = candidate_resolutions
|
||||
self.vocab_size = self.text_config.vocab_size
|
||||
|
||||
# update model_type for OCR model
|
||||
if "DeepseekOCRForCausalLM" in (
|
||||
self.architectures or kwargs.get("architectures", [])
|
||||
):
|
||||
self.model_type = "deepseek_ocr"
|
||||
71
vllm/transformers_utils/configs/dotsocr.py
Normal file
71
vllm/transformers_utils/configs/dotsocr.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.models.qwen2 import Qwen2Config
|
||||
|
||||
|
||||
class DotsVisionConfig(PretrainedConfig):
|
||||
model_type: str = "dots_vit"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dim: int = 1536, # vision encoder embed size
|
||||
hidden_size: int = 1536, # after merger hidden size
|
||||
intermediate_size: int = 4224,
|
||||
num_hidden_layers: int = 42,
|
||||
num_attention_heads: int = 12,
|
||||
num_channels: int = 3,
|
||||
patch_size: int = 14,
|
||||
spatial_merge_size: int = 2,
|
||||
temporal_patch_size: int = 1,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
use_bias: bool = False,
|
||||
attn_implementation="flash_attention_2",
|
||||
initializer_range=0.02,
|
||||
init_merger_std=0.02,
|
||||
is_causal=False, # ve causal forward
|
||||
post_norm=True,
|
||||
gradient_checkpointing=False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.embed_dim = embed_dim
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_bias = use_bias
|
||||
self.attn_implementation = attn_implementation
|
||||
self.initializer_range = initializer_range
|
||||
self.init_merger_std = init_merger_std
|
||||
self.is_causal = is_causal
|
||||
self.post_norm = post_norm
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
|
||||
class DotsOCRConfig(Qwen2Config):
|
||||
model_type = "dots_ocr"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_token_id=151665,
|
||||
video_token_id=151656,
|
||||
vision_config: dict | None = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.image_token_id = image_token_id
|
||||
self.video_token_id = video_token_id
|
||||
self.vision_config = DotsVisionConfig(**(vision_config or {}))
|
||||
|
||||
def save_pretrained(self, save_directory, **kwargs):
|
||||
self._auto_class = None
|
||||
super().save_pretrained(save_directory, **kwargs)
|
||||
90
vllm/transformers_utils/configs/eagle.py
Normal file
90
vllm/transformers_utils/configs/eagle.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
|
||||
|
||||
|
||||
class EAGLEConfig(PretrainedConfig):
|
||||
model_type = "eagle"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: PretrainedConfig | dict | None = None,
|
||||
truncated_vocab_size: int | None = None,
|
||||
method: str | None = "eagle",
|
||||
**kwargs,
|
||||
):
|
||||
model_config: PretrainedConfig | DeepseekV2Config | None
|
||||
if isinstance(model, dict):
|
||||
model_config = AutoConfig.for_model(**model)
|
||||
else:
|
||||
model_config = model
|
||||
|
||||
for k, v in kwargs.items():
|
||||
if k != "architectures" and k != "model_type" and hasattr(model_config, k):
|
||||
setattr(model_config, k, v)
|
||||
|
||||
self.model = model_config
|
||||
|
||||
if self.model is None:
|
||||
self.truncated_vocab_size = None
|
||||
else:
|
||||
self.truncated_vocab_size = (
|
||||
self.model.vocab_size
|
||||
if truncated_vocab_size is None
|
||||
else truncated_vocab_size
|
||||
)
|
||||
|
||||
# Eagle model name should follow naming convention of
|
||||
# LlamaForCausalLM -> EagleLlamaForCausalLM
|
||||
# LlamaForCausalLM -> Eagle3LlamaForCausalLM
|
||||
# LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
|
||||
if method == "eagle":
|
||||
assert self.model is not None, (
|
||||
"model should not be None when method is eagle"
|
||||
)
|
||||
kwargs["architectures"] = [
|
||||
f"Eagle{arch}" if not arch.startswith("Eagle") else arch
|
||||
for arch in self.model.architectures
|
||||
]
|
||||
|
||||
elif method == "eagle3":
|
||||
assert self.model is not None, (
|
||||
"model should not be None when method is eagle3"
|
||||
)
|
||||
kwargs["architectures"] = [
|
||||
arch
|
||||
if arch.startswith("Eagle3") or arch.endswith("Eagle3")
|
||||
else f"Eagle3{arch}"
|
||||
for arch in self.model.architectures
|
||||
]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid method {method}. Supported methods are eagle and eagle3."
|
||||
)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if self.model is not None:
|
||||
for k, v in self.model.to_dict().items():
|
||||
if k not in kwargs:
|
||||
setattr(self, k, v)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: str | os.PathLike,
|
||||
**kwargs,
|
||||
) -> "EAGLEConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
def to_json_string(self, use_diff: bool = True) -> str:
|
||||
# we override use_diff to False as initializing
|
||||
# EAGLEConfig with default arguments is not supported
|
||||
del use_diff
|
||||
return super().to_json_string(use_diff=False)
|
||||
@@ -1,3 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
@@ -16,6 +19,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Falcon configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
@@ -74,9 +78,7 @@ class RWConfig(PretrainedConfig):
|
||||
# Hack for falcon-40b
|
||||
self.new_decoder_architecture = True
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs)
|
||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
|
||||
@property
|
||||
def head_dim(self):
|
||||
|
||||
82
vllm/transformers_utils/configs/flex_olmo.py
Normal file
82
vllm/transformers_utils/configs/flex_olmo.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class FlexOlmoConfig(PretrainedConfig):
|
||||
model_type = "flex_olmo"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=100352,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-06,
|
||||
use_cache=True,
|
||||
pad_token_id=100277,
|
||||
bos_token_id=None,
|
||||
eos_token_id=100257,
|
||||
tie_word_embeddings=False,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
num_experts_per_tok=5,
|
||||
num_experts=7,
|
||||
output_router_logits=False,
|
||||
router_aux_loss_coef=0.01,
|
||||
norm_topk_prob=False,
|
||||
**kwargs,
|
||||
):
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["FlexOlmoForCausalLM"]
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 500000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
self.rope_parameters = rope_parameters
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_experts = num_experts
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, move it to 'rope_type'.
|
||||
if self.rope_parameters is not None and "type" in self.rope_parameters:
|
||||
self.rope_parameters["rope_type"] = self.rope_parameters["type"]
|
||||
322
vllm/transformers_utils/configs/hunyuan_vl.py
Normal file
322
vllm/transformers_utils/configs/hunyuan_vl.py
Normal file
@@ -0,0 +1,322 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/configuration_hunyuan_vl.py
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class HunYuanVLVisionConfig(PretrainedConfig):
|
||||
model_type = "hunyuan_vl"
|
||||
base_config_key = "vision_config"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_act="gelu",
|
||||
hidden_size=1152,
|
||||
intermediate_size=4304,
|
||||
interpolate_mode="bilinear",
|
||||
rms_norm_eps=1e-05,
|
||||
learnable_mlp_pooling_size=0,
|
||||
num_attention_heads=16,
|
||||
num_key_value_heads=None,
|
||||
num_channels=3,
|
||||
num_hidden_layers=27,
|
||||
out_hidden_size=4096,
|
||||
patch_size=16,
|
||||
remove_prenorm=True,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=1,
|
||||
resize_resolution=2048,
|
||||
img_max_token_num=4096,
|
||||
max_image_size=2048,
|
||||
video_max_image_size=768,
|
||||
video_min_image_size=256,
|
||||
min_image_size=512,
|
||||
anyres_vit_max_image_size=2048,
|
||||
max_vit_seq_len=16384,
|
||||
text_hidden_size=3072,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.interpolate_mode = interpolate_mode
|
||||
self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
if not num_key_value_heads:
|
||||
self.num_key_value_heads = num_attention_heads
|
||||
else:
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.num_channels = num_channels
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.out_hidden_size = out_hidden_size
|
||||
self.patch_size = patch_size
|
||||
self.remove_prenorm = remove_prenorm
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.resize_resolution = resize_resolution
|
||||
self.img_max_token_num = img_max_token_num
|
||||
self.max_image_size = max_image_size
|
||||
self.min_image_size = min_image_size
|
||||
self.video_max_image_size = video_max_image_size
|
||||
self.video_min_image_size = video_min_image_size
|
||||
self.anyres_vit_max_image_size = anyres_vit_max_image_size
|
||||
self.max_vit_seq_len = max_vit_seq_len
|
||||
self.text_hidden_size = text_hidden_size
|
||||
|
||||
|
||||
class HunYuanVLTextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate an
|
||||
HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the HunYuan-7B.
|
||||
Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 290943):
|
||||
Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`HunYuanVLTextConfig`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 11008):
|
||||
Dimension of the MLP representations or shared MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
eod_token_id (int, *optional*, defaults to 3):
|
||||
Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
|
||||
Example: In multi-document processing, this token helps the model distinguish between separate documents.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
|
||||
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
|
||||
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
|
||||
these scaling strategies behave:
|
||||
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
|
||||
experimental feature, subject to breaking API changes in future versions.
|
||||
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
head_dim (`int`, *optional*, defaults to 128):
|
||||
The attention head dimension.
|
||||
""" # noqa: E501
|
||||
|
||||
model_type = "hunyuan_vl_text"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=290943,
|
||||
hidden_size=4096,
|
||||
intermediate_size: int = 11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
eod_token_id=3,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
head_dim=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
# self._rope_scaling_validation() # TODO: Need validation?
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with with two fields, `type` and "
|
||||
f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
rope_scaling_alpha = self.rope_scaling.get("alpha", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
|
||||
f"got {rope_scaling_type}"
|
||||
)
|
||||
if rope_scaling_factor is None and rope_scaling_alpha is None:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor or alpha field must be have one, "
|
||||
"got both of none"
|
||||
)
|
||||
if rope_scaling_factor is not None and (
|
||||
not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
|
||||
):
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor field must be a float > 1.0, "
|
||||
f"got {rope_scaling_factor}"
|
||||
)
|
||||
if rope_scaling_alpha is not None and (
|
||||
not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
|
||||
):
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s alpha field must be a float > 1.0, "
|
||||
f"got {rope_scaling_alpha}"
|
||||
)
|
||||
|
||||
|
||||
class HunYuanVLConfig(PretrainedConfig):
|
||||
model_type = "hunyuan_vl"
|
||||
sub_configs = {
|
||||
"vision_config": HunYuanVLVisionConfig,
|
||||
"text_config": HunYuanVLTextConfig,
|
||||
}
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text_config=None,
|
||||
vision_config=None,
|
||||
im_start_id=120118,
|
||||
im_end_id=120119,
|
||||
image_token_id=120120,
|
||||
im_newline_id=120121,
|
||||
video_start_id=120122,
|
||||
video_end_id=120123,
|
||||
**kwargs,
|
||||
):
|
||||
# We need to init super() here so that it does not reset values
|
||||
# that are in text config to the BaseClass defaults. The Base
|
||||
# config has many text related defaults and not all defaults are
|
||||
# same as for `HunYuanVLTextConfig`.
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if isinstance(vision_config, dict):
|
||||
self.vision_config = self.sub_configs["vision_config"](**vision_config)
|
||||
elif vision_config is None:
|
||||
self.vision_config = self.sub_configs["vision_config"]()
|
||||
|
||||
if isinstance(text_config, dict):
|
||||
self.text_config = self.sub_configs["text_config"](**text_config)
|
||||
elif text_config is None:
|
||||
# For BC use all kwargs to init `TextConfig`
|
||||
self.text_config = self.sub_configs["text_config"](**kwargs)
|
||||
|
||||
self.image_token_id = image_token_id
|
||||
self.im_start_id = im_start_id
|
||||
self.im_end_id = im_end_id
|
||||
self.im_newline_id = im_newline_id
|
||||
self.video_start_id = video_start_id
|
||||
self.video_end_id = video_end_id
|
||||
|
||||
self.vision_config.text_hidden_size = self.text_config.hidden_size
|
||||
|
||||
# Attention implementation to use. It sets it recursively on sub-configs
|
||||
# so we call it again in the end.
|
||||
self._attn_implementation = kwargs.pop("attn_implementation", None)
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
if (
|
||||
(text_config := super().__getattribute__("__dict__").get("text_config"))
|
||||
is not None
|
||||
and key not in ["dtype", "_attn_implementation_internal"]
|
||||
and key in text_config.__dict__
|
||||
):
|
||||
setattr(text_config, key, value)
|
||||
else:
|
||||
super().__setattr__(key, value)
|
||||
|
||||
def __getattribute__(self, key):
|
||||
if "text_config" in super().__getattribute__("__dict__") and key not in [
|
||||
"_name_or_path",
|
||||
"model_type",
|
||||
"dtype",
|
||||
"_attn_implementation_internal",
|
||||
]:
|
||||
text_config = super().__getattribute__("text_config")
|
||||
if key in text_config.__dict__:
|
||||
return getattr(text_config, key)
|
||||
|
||||
return super().__getattribute__(key)
|
||||
@@ -1,6 +1,7 @@
|
||||
# coding=utf-8
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2023 Cerebras Systems.
|
||||
#
|
||||
@@ -73,10 +74,9 @@ class JAISConfig(PretrainedConfig):
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models).
|
||||
scale_attn_by_inverse_layer_idx (`bool`, *optional*,
|
||||
defaults to `False`):
|
||||
Whether to additionally scale attention weights by
|
||||
`1 / layer_idx + 1`.
|
||||
scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
|
||||
Whether to additionally scale attention weights
|
||||
by `1 / layer_idx + 1`.
|
||||
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
|
||||
Whether to scale keys (K) prior to computing attention
|
||||
(dot-product)
|
||||
@@ -98,7 +98,7 @@ class JAISConfig(PretrainedConfig):
|
||||
Scale attention weights by dividing by hidden_size instead of
|
||||
sqrt(hidden_size). Need to set scale_attn_weights to `True` as
|
||||
well.
|
||||
alibi_scaling (`Dict`, *optional*):
|
||||
alibi_scaling (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for ALiBi
|
||||
embeddings. Currently only supports linear
|
||||
scaling strategy. Can specify either the scaling `factor` (must be
|
||||
@@ -108,7 +108,7 @@ class JAISConfig(PretrainedConfig):
|
||||
formats are `{"type": strategy name, "factor": scaling factor}` or
|
||||
`{"type": strategy name,
|
||||
"train_seq_len": training sequence length}`.
|
||||
architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
|
||||
architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
|
||||
architecture names for Jais.
|
||||
|
||||
Example:
|
||||
@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
|
||||
if self.alibi_scaling is None:
|
||||
return
|
||||
|
||||
if (not isinstance(self.alibi_scaling, dict)
|
||||
or len(self.alibi_scaling) != 2):
|
||||
if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`alibi_scaling` must be a dictionary with two fields,"
|
||||
"`alibi_scaling` must be a dictionary with two fields, "
|
||||
"`type` and `factor` or `type` and `train_seq_len`, "
|
||||
f"got {self.alibi_scaling}")
|
||||
f"got {self.alibi_scaling}"
|
||||
)
|
||||
alibi_scaling_type = self.alibi_scaling.get("type", None)
|
||||
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
|
||||
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
|
||||
if alibi_scaling_type is None or alibi_scaling_type != "linear":
|
||||
raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
|
||||
f"got {alibi_scaling_type}")
|
||||
if (alibi_scaling_factor is not None
|
||||
and not isinstance(alibi_scaling_factor, float)
|
||||
or (alibi_scaling_factor is not None
|
||||
and alibi_scaling_factor <= 1.0)):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s factor field must be a float > 1.0,"
|
||||
f"got {alibi_scaling_factor}")
|
||||
if (alibi_dynamic_scaling is not None
|
||||
and not isinstance(alibi_dynamic_scaling, int)
|
||||
or (alibi_dynamic_scaling is not None
|
||||
and alibi_dynamic_scaling <= 1)):
|
||||
f"`alibi_scaling`'s type field must be 'linear', "
|
||||
f"got {alibi_scaling_type}"
|
||||
)
|
||||
if (
|
||||
alibi_scaling_factor is not None
|
||||
and not isinstance(alibi_scaling_factor, float)
|
||||
or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s `train_seq_len` field must be an"
|
||||
f"integer > 1, got {alibi_dynamic_scaling}")
|
||||
f"`alibi_scaling`'s factor field must be a float > 1.0, "
|
||||
f"got {alibi_scaling_factor}"
|
||||
)
|
||||
if (
|
||||
alibi_dynamic_scaling is not None
|
||||
and not isinstance(alibi_dynamic_scaling, int)
|
||||
or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
|
||||
):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s `train_seq_len` field must be an "
|
||||
f"integer > 1, got {alibi_dynamic_scaling}"
|
||||
)
|
||||
|
||||
148
vllm/transformers_utils/configs/kimi_linear.py
Normal file
148
vllm/transformers_utils/configs/kimi_linear.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class KimiLinearConfig(PretrainedConfig):
|
||||
model_type = "kimi_linear"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_type="kimi_linear",
|
||||
vocab_size=163840,
|
||||
hidden_size=4096,
|
||||
head_dim=None,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
rope_parameters=None,
|
||||
tie_word_embeddings=False,
|
||||
moe_intermediate_size: int | None = None,
|
||||
moe_renormalize: bool = True,
|
||||
moe_router_activation_func: str = "sigmoid",
|
||||
num_experts: int | None = None,
|
||||
num_experts_per_token: int | None = None,
|
||||
num_shared_experts: int = 0,
|
||||
routed_scaling_factor: float = 1.0,
|
||||
first_k_dense_replace: int = 0,
|
||||
moe_layer_freq: int = 1,
|
||||
use_grouped_topk: bool = True,
|
||||
num_expert_group: int = 1,
|
||||
topk_group: int = 1,
|
||||
q_lora_rank: int | None = None,
|
||||
kv_lora_rank: int | None = None,
|
||||
qk_nope_head_dim: int | None = None,
|
||||
qk_rope_head_dim: int | None = None,
|
||||
v_head_dim: int | None = None,
|
||||
mla_use_nope: bool | None = False,
|
||||
num_nextn_predict_layers: int = 0,
|
||||
linear_attn_config: dict | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.model_type = model_type
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.head_dim = (
|
||||
head_dim if head_dim is not None else hidden_size // num_attention_heads
|
||||
)
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 10000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
self.rope_parameters = rope_parameters
|
||||
|
||||
self.q_lora_rank = q_lora_rank
|
||||
self.kv_lora_rank = kv_lora_rank
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.qk_rope_head_dim = qk_rope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.mla_use_nope = mla_use_nope
|
||||
# moe config
|
||||
self.num_experts = num_experts
|
||||
self.num_experts_per_token = num_experts_per_token
|
||||
self.moe_renormalize = moe_renormalize
|
||||
self.num_shared_experts = num_shared_experts
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.moe_router_activation_func = moe_router_activation_func
|
||||
assert self.moe_router_activation_func in ("softmax", "sigmoid")
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.first_k_dense_replace = first_k_dense_replace
|
||||
self.moe_layer_freq = moe_layer_freq
|
||||
self.use_grouped_topk = use_grouped_topk
|
||||
self.num_expert_group = num_expert_group
|
||||
self.topk_group = topk_group
|
||||
self.num_nextn_predict_layers = num_nextn_predict_layers
|
||||
|
||||
if linear_attn_config is not None:
|
||||
assert linear_attn_config["kda_layers"] is not None
|
||||
assert linear_attn_config["full_attn_layers"] is not None
|
||||
self.linear_attn_config = linear_attn_config
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def is_mla(self):
|
||||
return (
|
||||
self.q_lora_rank is not None
|
||||
or self.kv_lora_rank is not None
|
||||
or self.qk_nope_head_dim is not None
|
||||
or self.qk_rope_head_dim is not None
|
||||
or self.v_head_dim is not None
|
||||
or self.mla_use_nope is True
|
||||
)
|
||||
|
||||
@property
|
||||
def is_moe(self):
|
||||
return self.num_experts is not None
|
||||
|
||||
@property
|
||||
def is_linear_attn(self) -> bool:
|
||||
return not (
|
||||
self.linear_attn_config is None
|
||||
or (
|
||||
isinstance(self.linear_attn_config, dict)
|
||||
and self.linear_attn_config["kda_layers"] is not None
|
||||
and len(self.linear_attn_config["kda_layers"]) == 0
|
||||
)
|
||||
)
|
||||
|
||||
def is_kda_layer(self, layer_idx: int):
|
||||
return (
|
||||
self.linear_attn_config is not None
|
||||
and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
|
||||
)
|
||||
38
vllm/transformers_utils/configs/kimi_vl.py
Normal file
38
vllm/transformers_utils/configs/kimi_vl.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
|
||||
from transformers import DeepseekV2Config
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
|
||||
|
||||
class KimiVLConfig(PretrainedConfig):
|
||||
model_type = "kimi_vl"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_config: dict | MoonViTConfig | None = None,
|
||||
text_config: dict | DeepseekV2Config | None = None,
|
||||
ignore_index: int = -100,
|
||||
media_placeholder_token_id: int = 163605,
|
||||
pad_token_id: int = 0,
|
||||
**kwargs,
|
||||
):
|
||||
if vision_config is None:
|
||||
vision_config = MoonViTConfig()
|
||||
elif isinstance(vision_config, dict):
|
||||
vision_config = MoonViTConfig(**vision_config)
|
||||
self.vision_config = vision_config
|
||||
|
||||
if text_config is None:
|
||||
text_config = DeepseekV2Config()
|
||||
elif isinstance(text_config, dict):
|
||||
text_config = DeepseekV2Config(**text_config)
|
||||
self.text_config = text_config
|
||||
|
||||
self.ignore_index = ignore_index
|
||||
self.media_placeholder_token_id = media_placeholder_token_id
|
||||
|
||||
super().__init__(pad_token_id=pad_token_id, **kwargs)
|
||||
163
vllm/transformers_utils/configs/lfm2_moe.py
Normal file
163
vllm/transformers_utils/configs/lfm2_moe.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Lfm2MoeConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the LFM2-8B-A1B model.
|
||||
e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 65536):
|
||||
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`Lfm2Model`]
|
||||
hidden_size (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 7168):
|
||||
Dimension of the MLP representations.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 1792):
|
||||
Intermediate size of the routed expert.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
||||
Whether to tie weight embeddings
|
||||
rope_parameters (`dict`, *optional*):
|
||||
The parameters of the RoPE embeddings.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 128000):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details, check out [this
|
||||
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the conv layers.
|
||||
conv_L_cache (`int`, *optional*, defaults to 3):
|
||||
L_cache dim in the conv layers.
|
||||
num_dense_layers (`int`, *optional*, defaults to 2):
|
||||
Number of dense Lfm2MoeMLP layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 4):
|
||||
Number of selected experts.
|
||||
num_experts (`int`, *optional*, defaults to 32):
|
||||
Number of routed experts.
|
||||
use_expert_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use the expert bias on the routing weights.
|
||||
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
|
||||
Scaling factor for routed experts in MoE models.
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the topk probabilities.
|
||||
layer_types (`Optional`, *optional*):
|
||||
Type of each layers.
|
||||
|
||||
```python
|
||||
>>> from transformers import Lfm2MoeModel, Lfm2MoeConfig
|
||||
|
||||
>>> # Initializing a LFM2 Moe model
|
||||
>>> configuration = Lfm2MoeConfig()
|
||||
|
||||
>>> # Initializing a model from the LFM2-8B-A1B style configuration
|
||||
>>> model = Lfm2MoeModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```""" # noqa: E501
|
||||
|
||||
model_type = "lfm2_moe"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 65536,
|
||||
hidden_size: int = 2048,
|
||||
intermediate_size: int = 7168,
|
||||
moe_intermediate_size: int = 1792,
|
||||
num_hidden_layers: int = 32,
|
||||
pad_token_id: int = 0,
|
||||
bos_token_id: int = 1,
|
||||
eos_token_id: int = 2,
|
||||
tie_word_embeddings: bool = True,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
max_position_embeddings: int = 128_000,
|
||||
use_cache: bool = True,
|
||||
norm_eps: float = 0.00001,
|
||||
num_attention_heads: int = 32,
|
||||
num_key_value_heads: int = 8,
|
||||
conv_bias: bool = False,
|
||||
conv_L_cache: int = 3,
|
||||
num_dense_layers: int = 2,
|
||||
num_experts_per_tok: int = 4,
|
||||
num_experts: int = 32,
|
||||
use_expert_bias: bool = True,
|
||||
routed_scaling_factor: float = 1.0,
|
||||
norm_topk_prob: bool = True,
|
||||
layer_types: list[str] | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
rope_theta = kwargs.pop("rope_theta", 1000000.0)
|
||||
if rope_parameters is None:
|
||||
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
|
||||
self.rope_parameters = rope_parameters
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.use_cache = use_cache
|
||||
self.norm_eps = norm_eps
|
||||
|
||||
# attn operator config
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
|
||||
# custom operator config
|
||||
self.conv_bias = conv_bias
|
||||
self.conv_L_cache = conv_L_cache
|
||||
|
||||
# moe config
|
||||
self.num_dense_layers = num_dense_layers
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_experts = num_experts
|
||||
self.use_expert_bias = use_expert_bias
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.layer_types = layer_types
|
||||
|
||||
tie_word_embeddings = kwargs.get(
|
||||
"tie_embedding", tie_word_embeddings
|
||||
) # to fit original config keys
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Lfm2MoeConfig"]
|
||||
65
vllm/transformers_utils/configs/medusa.py
Normal file
65
vllm/transformers_utils/configs/medusa.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MedusaConfig(PretrainedConfig):
|
||||
model_type = "medusa"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 4096,
|
||||
vocab_size: int = 32001,
|
||||
num_heads: int = 5,
|
||||
num_hidden_layers: int = 1,
|
||||
max_paths: int = 64,
|
||||
topk: int = 10,
|
||||
truncated_vocab_size: int | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.hidden_size = hidden_size
|
||||
self.vocab_size = vocab_size
|
||||
self.num_heads = num_heads
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_paths = max_paths
|
||||
self.topk = topk
|
||||
self.max_seq_len = int(2**20)
|
||||
self.truncated_vocab_size = (
|
||||
vocab_size if truncated_vocab_size is None else truncated_vocab_size
|
||||
)
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["MedusaModel"]
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: str | os.PathLike,
|
||||
**kwargs,
|
||||
) -> "MedusaConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
for k in list(config_dict.keys()):
|
||||
if "num" in k:
|
||||
if "heads" in k:
|
||||
config_dict["num_heads"] = config_dict.pop(k)
|
||||
elif "layers" in k:
|
||||
config_dict["num_hidden_layers"] = config_dict.pop(k)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return 0
|
||||
|
||||
@property
|
||||
def num_lookahead_tokens(self):
|
||||
return self.num_heads
|
||||
|
||||
@num_lookahead_tokens.setter
|
||||
def num_lookahead_tokens(self, num_lookahead_tokens: int):
|
||||
self.num_heads = num_lookahead_tokens
|
||||
103
vllm/transformers_utils/configs/midashenglm.py
Normal file
103
vllm/transformers_utils/configs/midashenglm.py
Normal file
@@ -0,0 +1,103 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
|
||||
# Copyright 2024 The Qwen team.
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
|
||||
Qwen2_5OmniTextConfig,
|
||||
)
|
||||
|
||||
|
||||
class DashengConfig(PretrainedConfig):
|
||||
model_type = "midashenglm_dasheng_encoder"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dim: int = 768,
|
||||
outputdim: int = 527,
|
||||
patch_size: int | tuple[int, int] = 16,
|
||||
patch_stride: int | tuple[int, int] = 16,
|
||||
input_channels: int = 1,
|
||||
target_length: int = 1012,
|
||||
depth: int = 12,
|
||||
num_heads: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
qkv_bias: bool = True,
|
||||
init_values: float | None = None,
|
||||
drop_rate: float = 0.0,
|
||||
attn_drop_rate: float = 0.0,
|
||||
f_min: float = 0.0,
|
||||
f_max: float = 8000.0,
|
||||
center: bool = True,
|
||||
win_length: int = 512,
|
||||
hop_length: int = 160,
|
||||
sample_rate: int = 16000,
|
||||
n_fft: int = 512,
|
||||
n_mels: int = 64,
|
||||
**kwargs,
|
||||
):
|
||||
self.embed_dim = embed_dim
|
||||
self.outputdim = outputdim
|
||||
self.patch_size = patch_size
|
||||
self.patch_stride = patch_stride
|
||||
self.input_channels = input_channels
|
||||
self.target_length = target_length
|
||||
self.depth = depth
|
||||
self.num_heads = num_heads
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.qkv_bias = qkv_bias
|
||||
self.init_values = init_values
|
||||
self.drop_rate = drop_rate
|
||||
self.attn_drop_rate = attn_drop_rate
|
||||
self.f_min = f_min
|
||||
self.f_max = f_max
|
||||
self.center = center
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.sample_rate = sample_rate
|
||||
self.n_fft = n_fft
|
||||
self.n_mels = n_mels
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MiDashengLMConfig(PretrainedConfig):
|
||||
model_type = "midashenglm"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_encoder_config: dict | None = None,
|
||||
subsample_factor: int = 5,
|
||||
text_config: dict | None = None,
|
||||
audio_token_id: int | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
|
||||
self.subsample_factor = subsample_factor
|
||||
self.text_config = (
|
||||
Qwen2_5OmniTextConfig(**text_config)
|
||||
if text_config
|
||||
else Qwen2_5OmniTextConfig()
|
||||
)
|
||||
self.text_config.rope_parameters = None # uses_mrope is false
|
||||
self.audio_token_id = audio_token_id
|
||||
super().__init__(**kwargs)
|
||||
235
vllm/transformers_utils/configs/mistral.py
Normal file
235
vllm/transformers_utils/configs/mistral.py
Normal file
@@ -0,0 +1,235 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers import PretrainedConfig, WhisperConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def adapt_config_dict(
|
||||
config_dict: dict[str, Any],
|
||||
defaults: dict[str, Any],
|
||||
) -> PretrainedConfig:
|
||||
config_dict = _remap_general_mistral_args(config_dict)
|
||||
|
||||
if bool(config_dict.get("quantization")):
|
||||
config_dict = _remap_mistral_quantization_args(config_dict)
|
||||
|
||||
is_moe = bool(config_dict.get("moe"))
|
||||
is_mistral_large_3 = (
|
||||
is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0
|
||||
)
|
||||
if config_dict.get("model_type") == "mamba":
|
||||
config_dict["architectures"] = ["Mamba2ForCausalLM"]
|
||||
elif is_moe and is_mistral_large_3:
|
||||
config_dict = _remap_moe_args(config_dict)
|
||||
config_dict["model_type"] = "deepseek_v3"
|
||||
config_dict["architectures"] = ["MistralLarge3ForCausalLM"]
|
||||
|
||||
assert "llama_4_scaling" in config_dict, (
|
||||
"MistralLarge3 expect llama4 scaling config."
|
||||
)
|
||||
llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
|
||||
assert all(
|
||||
[
|
||||
key in config_dict["llama_4_scaling"]
|
||||
for key in llama_4_scaling_config_keys
|
||||
]
|
||||
), (
|
||||
"llama_4_scaling config should define the keys: "
|
||||
f"{','.join(llama_4_scaling_config_keys)}"
|
||||
)
|
||||
elif is_moe:
|
||||
config_dict["architectures"] = ["MixtralForCausalLM"]
|
||||
else:
|
||||
config_dict["architectures"] = ["MistralForCausalLM"]
|
||||
|
||||
if bool(config_dict.get("yarn")):
|
||||
config_dict = _remap_mistral_yarn_args(config_dict)
|
||||
|
||||
if bool(config_dict.get("llama_4_scaling")):
|
||||
llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
|
||||
assert all(
|
||||
[
|
||||
key in config_dict["llama_4_scaling"]
|
||||
for key in llama_4_scaling_config_keys
|
||||
]
|
||||
), (
|
||||
"llama_4_scaling config should define the keys: "
|
||||
f"{','.join(llama_4_scaling_config_keys)}"
|
||||
)
|
||||
|
||||
is_vision = (config_dict.get("multimodal") or {}).get(
|
||||
"vision_encoder_args"
|
||||
) or config_dict.get("vision_encoder")
|
||||
is_audio = bool(
|
||||
((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
|
||||
"encoder_args"
|
||||
)
|
||||
)
|
||||
|
||||
assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
|
||||
|
||||
if is_vision:
|
||||
config_dict = _remap_mistral_vision_args(config_dict)
|
||||
if is_audio:
|
||||
config_dict = _remap_mistral_audio_args(config_dict)
|
||||
|
||||
for k, v in defaults.items():
|
||||
config_dict.setdefault(k, v)
|
||||
|
||||
config = PretrainedConfig.from_dict(config_dict)
|
||||
|
||||
logger.debug("Initialized config %s", config)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_vision_args(config: dict) -> dict:
|
||||
if config.get("multimodal"):
|
||||
vision_config = config.pop("multimodal")
|
||||
else:
|
||||
vision_config = config.pop("vision_encoder")
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type": "pixtral",
|
||||
"architectures": ["PixtralForConditionalGeneration"],
|
||||
"text_config": PretrainedConfig.from_dict(config),
|
||||
"vision_config": PretrainedConfig.from_dict(vision_config),
|
||||
}
|
||||
if quant_config:
|
||||
config["quantization_config"] = quant_config
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_yarn_args(config: dict) -> dict:
|
||||
yarn_config_map = {
|
||||
"factor": "factor",
|
||||
"original_max_position_embeddings": "original_max_position_embeddings",
|
||||
"beta": "beta_fast",
|
||||
"alpha": "beta_slow",
|
||||
"apply_scale": "apply_yarn_scaling",
|
||||
}
|
||||
yarn_config = config.get("yarn") or {}
|
||||
config["rope_parameters"] = {
|
||||
"rope_type": "yarn",
|
||||
"mscale_all_dim": 1,
|
||||
}
|
||||
|
||||
if rope_theta := config.pop("rope_theta", None):
|
||||
config["rope_parameters"]["rope_theta"] = rope_theta
|
||||
|
||||
for old_name, new_name in yarn_config_map.items():
|
||||
if old_name in yarn_config:
|
||||
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
|
||||
|
||||
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_general_mistral_args(config: dict) -> dict:
|
||||
# Mistral key -> HF key
|
||||
config_mapping = {
|
||||
"dim": "hidden_size",
|
||||
"norm_eps": "rms_norm_eps",
|
||||
"n_kv_heads": "num_key_value_heads",
|
||||
"n_layers": "num_hidden_layers",
|
||||
"n_heads": "num_attention_heads",
|
||||
"hidden_dim": "intermediate_size",
|
||||
}
|
||||
# HF key -> (Mistral key, default value)
|
||||
top_level_mapping_with_default = {
|
||||
"model_type": ("model_type", "transformer"),
|
||||
"hidden_act": ("activation", "silu"),
|
||||
"tie_word_embeddings": ("tied_embeddings", False),
|
||||
"max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
|
||||
"max_position_embeddings": ("max_position_embeddings", 128_000),
|
||||
}
|
||||
|
||||
for key, new_key in config_mapping.items():
|
||||
if key in config:
|
||||
config[new_key] = config.pop(key)
|
||||
|
||||
for new_key, (key, default_value) in top_level_mapping_with_default.items():
|
||||
config[new_key] = config.pop(key, default_value)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_quantization_args(config: dict) -> dict:
|
||||
if config.get("quantization"):
|
||||
quantization = config.pop("quantization", {})
|
||||
if quantization.get("qformat_weight") == "fp8_e4m3":
|
||||
qscheme_act = quantization.get("qscheme_act")
|
||||
assert qscheme_act in ("NO_SCALES", "TENSOR", None), (
|
||||
"Only NO_SCALES and TENSOR (default) are supported for qscheme_act"
|
||||
)
|
||||
is_dynamic = qscheme_act == "NO_SCALES"
|
||||
config["quantization_config"] = {
|
||||
"quant_method": "fp8",
|
||||
"activation_scheme": "dynamic" if is_dynamic else "static",
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Found unknown quantization='{quantization}' in config")
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
whisper_args = config["multimodal"].pop("whisper_model_args")
|
||||
encoder_args = whisper_args["encoder_args"]
|
||||
downsample_args = whisper_args["downsample_args"]
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type": "whixtral",
|
||||
"architectures": ["VoxtralForConditionalGeneration"],
|
||||
"text_config": PretrainedConfig.from_dict(config),
|
||||
"audio_config": WhisperConfig(
|
||||
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
||||
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
||||
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
||||
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
|
||||
downsample_factor=downsample_args["downsample_factor"],
|
||||
d_model=encoder_args["dim"],
|
||||
encoder_layers=encoder_args["n_layers"],
|
||||
encoder_ffn_dim=encoder_args["hidden_dim"],
|
||||
encoder_attention_heads=encoder_args["n_heads"],
|
||||
vocab_size=encoder_args["vocab_size"],
|
||||
max_source_positions=encoder_args["max_source_positions"],
|
||||
is_encoder_decoder=False, # Override WhisperConfig default
|
||||
),
|
||||
}
|
||||
if quant_config:
|
||||
config["quantization_config"] = quant_config
|
||||
return config
|
||||
|
||||
|
||||
def _remap_moe_args(config: dict) -> dict:
|
||||
moe_config_map = {
|
||||
"route_every_n": "moe_layer_freq",
|
||||
"first_k_dense_replace": "first_k_dense_replace",
|
||||
"num_experts_per_tok": "num_experts_per_tok",
|
||||
"num_experts": "n_routed_experts",
|
||||
"expert_hidden_dim": "moe_intermediate_size",
|
||||
"routed_scale": "routed_scaling_factor",
|
||||
"num_shared_experts": "n_shared_experts",
|
||||
"num_expert_groups": "n_group",
|
||||
"num_expert_groups_per_tok": "topk_group",
|
||||
}
|
||||
moe_config = config.get("moe", {})
|
||||
for old_name, new_name in moe_config_map.items():
|
||||
if old_name in moe_config:
|
||||
value = moe_config.pop(old_name)
|
||||
config[new_name] = value
|
||||
|
||||
config["topk_method"] = None
|
||||
config["norm_topk_prob"] = True
|
||||
config["scoring_func"] = "softmax"
|
||||
|
||||
return config
|
||||
69
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
69
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MLPSpeculatorConfig(PretrainedConfig):
|
||||
model_type = "mlp_speculator"
|
||||
|
||||
attribute_map = {
|
||||
"hidden_size": "emb_dim",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 32000,
|
||||
emb_dim: int = 4096,
|
||||
inner_dim: int = 0,
|
||||
n_predict: int = 3,
|
||||
top_k_tokens_per_head: list[int] | None = None,
|
||||
n_candidates: int = 5,
|
||||
tie_weights: bool = False,
|
||||
scale_input: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize an MLPSpeculatorConfig
|
||||
|
||||
Args:
|
||||
vocab_size: int
|
||||
the model vocab size
|
||||
emb_dim: int
|
||||
the model embedding dimension
|
||||
inner_dim: int
|
||||
the inner dimension of the model. If 0, will be the emb_dim.
|
||||
n_predict: int
|
||||
the number of lookaheads for the speculator
|
||||
top_k_tokens_per_head: list[int]
|
||||
Number of tokens to consider from each head when forming the
|
||||
candidate tree.
|
||||
For each candidate branch in the tree, head n produces topk[n]
|
||||
additional sub-branches.
|
||||
NOTE: This parameter is currently unused.
|
||||
n_candidates: int
|
||||
number of child candidates to create per sequence
|
||||
tie_weights: bool
|
||||
If true, use a single set of weights for every model
|
||||
head/stage after the first. The initial projection
|
||||
from the base model may have a different size, so that
|
||||
stays separate.
|
||||
scale_input: bool
|
||||
if True, will scale the initial hidden states from
|
||||
the base model.
|
||||
"""
|
||||
if top_k_tokens_per_head is None:
|
||||
top_k_tokens_per_head = [5, 4, 3]
|
||||
assert len(top_k_tokens_per_head) == n_predict
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.inner_dim = inner_dim
|
||||
self.n_predict = n_predict
|
||||
self.top_k_tokens_per_head = top_k_tokens_per_head
|
||||
self.n_candidates = n_candidates
|
||||
self.num_lookahead_tokens = n_predict
|
||||
self.tie_weights = tie_weights
|
||||
self.scale_input = scale_input
|
||||
|
||||
super().__init__(**kwargs)
|
||||
33
vllm/transformers_utils/configs/moonvit.py
Normal file
33
vllm/transformers_utils/configs/moonvit.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class MoonViTConfig(PretrainedConfig):
|
||||
model_type = "moonvit"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
init_pos_emb_height: int = 64,
|
||||
init_pos_emb_width: int = 64,
|
||||
num_attention_heads: int = 16,
|
||||
num_hidden_layers: int = 27,
|
||||
hidden_size: int = 1152,
|
||||
intermediate_size: int = 4304,
|
||||
merge_kernel_size: tuple[int, int] = (2, 2),
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.patch_size = patch_size
|
||||
# Positional embedding config
|
||||
self.init_pos_emb_height = init_pos_emb_height
|
||||
self.init_pos_emb_width = init_pos_emb_width
|
||||
# Transformer config
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
# Patch merger config
|
||||
self.merge_kernel_size = merge_kernel_size
|
||||
@@ -1,178 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
|
||||
"""A HuggingFace-style model configuration."""
|
||||
import warnings
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
attn_config_defaults: Dict = {
|
||||
'attn_type': 'multihead_attention',
|
||||
'attn_pdrop': 0.0,
|
||||
'attn_impl': 'triton',
|
||||
'qk_ln': False,
|
||||
'clip_qkv': None,
|
||||
'softmax_scale': None,
|
||||
'prefix_lm': False,
|
||||
'attn_uses_sequence_id': False,
|
||||
'alibi': False,
|
||||
'alibi_bias_max': 8
|
||||
}
|
||||
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
|
||||
init_config_defaults: Dict = {
|
||||
'name': 'kaiming_normal_',
|
||||
'fan_mode': 'fan_in',
|
||||
'init_nonlinearity': 'relu',
|
||||
'init_div_is_residual': True,
|
||||
'emb_init_std': None,
|
||||
'emb_init_uniform_lim': None,
|
||||
'init_std': None,
|
||||
'init_gain': 0.0
|
||||
}
|
||||
|
||||
|
||||
class MPTConfig(PretrainedConfig):
|
||||
model_type = 'mpt'
|
||||
attribute_map = {
|
||||
'num_attention_heads': 'n_heads',
|
||||
'hidden_size': 'd_model',
|
||||
'num_hidden_layers': 'n_layers',
|
||||
}
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
expansion_ratio: int = 4,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 50368,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
learned_pos_emb: bool = True,
|
||||
attn_config: Dict = attn_config_defaults,
|
||||
ffn_config: Dict = ffn_config_defaults,
|
||||
init_device: str = 'cpu',
|
||||
logit_scale: Optional[Union[float, str]] = None,
|
||||
no_bias: bool = False,
|
||||
embedding_fraction: float = 1.0,
|
||||
norm_type: str = 'low_precision_layernorm',
|
||||
use_cache: bool = False,
|
||||
init_config: Dict = init_config_defaults,
|
||||
fc_type: str = 'torch',
|
||||
verbose: Optional[int] = None,
|
||||
**kwargs: Any):
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.expansion_ratio = expansion_ratio
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.learned_pos_emb = learned_pos_emb
|
||||
self.attn_config = attn_config
|
||||
self.ffn_config = ffn_config
|
||||
self.init_device = init_device
|
||||
self.logit_scale = logit_scale
|
||||
self.no_bias = no_bias
|
||||
self.embedding_fraction = embedding_fraction
|
||||
self.norm_type = norm_type
|
||||
self.use_cache = use_cache
|
||||
self.init_config = init_config
|
||||
self.fc_type = fc_type
|
||||
if verbose is not None:
|
||||
warnings.warn(DeprecationWarning(
|
||||
'verbose argument for MPTConfig is now ignored and '
|
||||
'will be removed. Use python_log_level instead.'),
|
||||
stacklevel=2)
|
||||
if 'name' in kwargs:
|
||||
del kwargs['name']
|
||||
if 'loss_fn' in kwargs:
|
||||
del kwargs['loss_fn']
|
||||
if self.attn_config.get('alibi', False):
|
||||
self.learned_pos_emb = False
|
||||
warnings.warn(
|
||||
f'alibi is turned on, setting `learned_pos_emb` '
|
||||
f'to {self.learned_pos_emb}`',
|
||||
stacklevel=2)
|
||||
super().__init__(**kwargs)
|
||||
self._validate_config()
|
||||
|
||||
def _set_config_defaults(
|
||||
self, config: Dict[str, Any],
|
||||
config_defaults: Dict[str, Any]) -> Dict[str, Any]:
|
||||
for (k, v) in config_defaults.items():
|
||||
if k not in config:
|
||||
config[k] = v
|
||||
return config
|
||||
|
||||
def _validate_config(self) -> None:
|
||||
self.attn_config = self._set_config_defaults(self.attn_config,
|
||||
attn_config_defaults)
|
||||
self.ffn_config = self._set_config_defaults(self.ffn_config,
|
||||
ffn_config_defaults)
|
||||
self.init_config = self._set_config_defaults(self.init_config,
|
||||
init_config_defaults)
|
||||
if self.d_model % self.n_heads != 0:
|
||||
raise ValueError('d_model must be divisible by n_heads')
|
||||
if any((
|
||||
prob < 0 or prob > 1 for prob in
|
||||
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
|
||||
)):
|
||||
raise ValueError(
|
||||
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
|
||||
"probabilities and must be between 0 and 1")
|
||||
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
|
||||
raise ValueError(
|
||||
f"Unknown attn_impl={self.attn_config['attn_impl']}")
|
||||
if self.attn_config['prefix_lm'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'prefix_lm only implemented with torch and triton attention.')
|
||||
if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
|
||||
'torch', 'triton'
|
||||
]:
|
||||
raise NotImplementedError(
|
||||
'alibi only implemented with torch and triton attention.')
|
||||
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'attn_uses_sequence_id only implemented with torch '
|
||||
'and triton attention.')
|
||||
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
|
||||
raise ValueError(
|
||||
'model.embedding_fraction must be between 0 (exclusive) '
|
||||
'and 1 (inclusive)!')
|
||||
if isinstance(self.logit_scale,
|
||||
str) and self.logit_scale != 'inv_sqrt_d_model':
|
||||
raise ValueError(
|
||||
f"self.logit_scale={self.logit_scale!r} is not recognized as "
|
||||
"an option; use numeric value or 'inv_sqrt_d_model'.")
|
||||
if self.init_config.get('name', None) is None:
|
||||
raise ValueError(
|
||||
f"self.init_config={self.init_config!r} 'name' needs to be set."
|
||||
)
|
||||
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
||||
warnings.warn(
|
||||
'Positional information not being provided to the model.',
|
||||
stacklevel=2)
|
||||
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
import transformer_engine.pytorch as te
|
||||
del te
|
||||
except Exception as exc:
|
||||
raise ImportError(
|
||||
'TransformerEngine import fail. `fc_type: te` requires '
|
||||
'TransformerEngine be installed. '
|
||||
'The required version of transformer_engine also requires '
|
||||
'FlashAttention v1.0.6 is installed:\n'
|
||||
'pip install flash-attn==1.0.6 --no-build-isolation \n'
|
||||
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
|
||||
) from exc
|
||||
if self.ffn_config['ffn_type'] == 'mptmlp':
|
||||
self.ffn_config['fc_type'] = self.fc_type
|
||||
elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
self.ffn_config['bias'] = not self.no_bias
|
||||
220
vllm/transformers_utils/configs/nemotron.py
Normal file
220
vllm/transformers_utils/configs/nemotron.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Nemotron model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronModel`]. It is used to instantiate a Nemotron model
|
||||
according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar
|
||||
configuration to that of the Nemotron-8B.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be
|
||||
used to control the model outputs. Read the documentation from
|
||||
[`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 256000):
|
||||
Vocabulary size of the Nemotron model. Defines the number of
|
||||
different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`NemotronModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 6144):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 24576):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer decoder.
|
||||
head_dim (`int`, *optional*):
|
||||
Projection weights dimension in multi-head attention. Set to
|
||||
hidden_size // num_attention_heads if None
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention
|
||||
(MQA) otherwise GQA is used. When converting a multi-head
|
||||
checkpoint to a GQA checkpoint, each group key and value
|
||||
head should be constructed by meanpooling all the original
|
||||
heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
|
||||
is not specified, will default to `num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
The non-linear activation function (function or string) in the
|
||||
decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.0134):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 2):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 3):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_parameters (`dict`, *optional*):
|
||||
The parameters of the RoPE embeddings. Expected contents:
|
||||
`rope_theta` (`float`): The base period of the RoPE embeddings.
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear',
|
||||
'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
|
||||
original RoPE implementation.
|
||||
`partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
|
||||
Percentage of the query and keys which will have rotary embedding.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output
|
||||
projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj and down_proj layers in the MLP
|
||||
layers.
|
||||
|
||||
```python
|
||||
>>> from transformers import NemotronModel, NemotronConfig
|
||||
>>> # Initializing a Nemotron nemotron-15b style configuration
|
||||
>>> configuration = NemotronConfig()
|
||||
>>> # Initializing a model from the nemotron-15b style configuration
|
||||
>>> model = NemotronModel(configuration)
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "nemotron"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=256000,
|
||||
hidden_size=6144,
|
||||
intermediate_size=24576,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=48,
|
||||
head_dim=None,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="relu2",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.0134,
|
||||
norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=2,
|
||||
eos_token_id=3,
|
||||
tie_word_embeddings=False,
|
||||
rope_parameters=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
head_dim = head_dim or kwargs.get("kv_channels")
|
||||
self.head_dim = (
|
||||
head_dim if head_dim is not None else (hidden_size // num_attention_heads)
|
||||
)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.norm_eps = norm_eps
|
||||
self.use_cache = use_cache
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 10000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
# for backward compatibility
|
||||
partial_rotary_factor = (
|
||||
kwargs.get("rope_percent")
|
||||
or kwargs.get("rope_percentage")
|
||||
or kwargs.get("partial_rotary_factor")
|
||||
or 0.5
|
||||
)
|
||||
if "partial_rotary_factor" not in rope_parameters:
|
||||
rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||
self.rope_parameters = rope_parameters
|
||||
self._rope_parameters_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_parameters_validation(self):
|
||||
"""
|
||||
Validate the `rope_parameters` configuration.
|
||||
"""
|
||||
if self.rope_parameters is None:
|
||||
return
|
||||
|
||||
rope_type: str | None = self.rope_parameters.get("rope_type", None)
|
||||
factor: float | None = self.rope_parameters.get("factor", None)
|
||||
|
||||
if rope_type not in {"default", "linear", "dynamic"}:
|
||||
raise ValueError(
|
||||
"`rope_type` must be one of ['default', 'linear', 'dynamic'], "
|
||||
f"got {rope_type}"
|
||||
)
|
||||
if rope_type != "default":
|
||||
if factor is None:
|
||||
raise ValueError(
|
||||
"If `rope_type` is not 'default', `rope_parameters` "
|
||||
"must include a `factor` field. Got `None`."
|
||||
)
|
||||
if not isinstance(factor, float) or factor <= 1.0:
|
||||
raise ValueError(
|
||||
"`rope_parameters`'s factor field must be a float > 1, got "
|
||||
f"{factor}"
|
||||
)
|
||||
284
vllm/transformers_utils/configs/nemotron_h.py
Normal file
284
vllm/transformers_utils/configs/nemotron_h.py
Normal file
@@ -0,0 +1,284 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""NemotronH model configuration"""
|
||||
|
||||
import regex as re
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronHConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronHModel`]. It is used to instantiate a NemotronH model according
|
||||
to the specified arguments, defining the model architecture. Instantiating
|
||||
a configuration with the defaults will yield a similar configuration to
|
||||
that of the NemotronH-v0.1 model.
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 131072):
|
||||
Vocabulary size of the NemotronH model. Defines the number of
|
||||
different tokens that can be represented by the `inputs_ids`
|
||||
passed when calling [`NemotronHModel`]
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be
|
||||
tied. Note that this is only relevant if the model has an output
|
||||
word embedding layer.
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 21504):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 52):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
hybrid_override_pattern (`str`, *optional*, defaults to
|
||||
`"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
|
||||
The pattern of the hybrid model. The pattern is a string of
|
||||
characters where each character represents
|
||||
M: Mamba2, *: Attention, -: MLP
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer encoder.
|
||||
attention_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each attention head.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if `num_key_value_heads=1` the model
|
||||
will use Multi Query Attention (MQA) otherwise GQA is used.
|
||||
mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
|
||||
The non-linear activation function in the MLP layers.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in attention layers.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in MLP layers.
|
||||
use_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the model.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
residual_in_fp32 (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not residuals should be in `float32`. If set to `False`
|
||||
residuals will keep the same `dtype` as the rest of the model.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
|
||||
Number of prompt logits to calculate during generation. If `None`,
|
||||
all logits will be calculated. If an integer value, only last
|
||||
`num_logits_to_keep` logits will be calculated.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
sliding_window (`int`, *optional*, defaults to None):
|
||||
Sliding window attention window size.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
hidden_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the hidden states.
|
||||
use_mamba_kernels (`bool`, *optional*, defaults to `True`):
|
||||
Flag indicating whether or not to use the fast mamba kernels.
|
||||
These are available only if `mamba-ssm` and `causal-conv1d`
|
||||
are installed, and the mamba modules are running on a CUDA device.
|
||||
ssm_state_size (`int`, *optional*, defaults to 128):
|
||||
The dimension of the mamba state space latents.
|
||||
mamba_num_heads (`int`, *optional*, defaults to 128):
|
||||
Number of heads in Mamba layers.
|
||||
mamba_n_groups (`int`, *optional*, defaults to 8):
|
||||
Number of groups in Mamba layers.
|
||||
mamba_head_dim (`int`, *optional*, defaults to 64):
|
||||
Dimension of each Mamba head.
|
||||
mamba_d_conv (`int`, *optional*, defaults to 4):
|
||||
The size of the mamba convolution kernel.
|
||||
mamba_expand (`int`, *optional*, defaults to 2):
|
||||
Expanding factor used to determine the mamba intermediate size.
|
||||
mamba_hidden_act (`str`, *optional*, defaults to "silu"):
|
||||
The non-linear activation function in the Mamba layers.
|
||||
mamba_dt_min (`float`, *optional*, defaults to 0.001):
|
||||
Minimum value for the time step in Mamba.
|
||||
mamba_dt_max (`float`, *optional*, defaults to 0.1):
|
||||
Maximum value for the time step in Mamba.
|
||||
mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
|
||||
Limits for the time step in Mamba.
|
||||
mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
|
||||
Floor value for time step initialization in Mamba.
|
||||
mamba_conv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use bias in the convolution layer of the mamba mixer
|
||||
block.
|
||||
mamba_proj_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the input and output projections of the
|
||||
mamba mixer block.
|
||||
mamba_chunk_size (`int`, *optional*, defaults to 256):
|
||||
Size of chunks for Mamba processing.
|
||||
rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the pre-normalization residual connections.
|
||||
"""
|
||||
|
||||
model_type = "nemotron_h"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=131072,
|
||||
tie_word_embeddings=False,
|
||||
hidden_size=4096,
|
||||
intermediate_size=21504,
|
||||
num_hidden_layers=52,
|
||||
hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
|
||||
num_attention_heads=32,
|
||||
head_dim=128,
|
||||
num_key_value_heads=8, # nemo: num_query_groups
|
||||
mlp_hidden_act="relu2",
|
||||
attention_bias=False,
|
||||
mlp_bias=False,
|
||||
use_bias=False,
|
||||
initializer_range=0.02, # nemo: init_method_std
|
||||
layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
|
||||
residual_in_fp32=False, # Megatron Core default value
|
||||
use_cache=True,
|
||||
num_logits_to_keep=1,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
sliding_window=None,
|
||||
max_position_embeddings=4096,
|
||||
attention_dropout=0.0,
|
||||
hidden_dropout=0.0, # * ADDED
|
||||
use_mamba_kernels=True,
|
||||
ssm_state_size=128, # mamba_state_size
|
||||
mamba_num_heads=128,
|
||||
mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads
|
||||
mamba_head_dim=64,
|
||||
mamba_d_conv=4,
|
||||
mamba_expand=2,
|
||||
mamba_hidden_act="silu",
|
||||
mamba_dt_min=0.001,
|
||||
mamba_dt_max=0.1,
|
||||
mamba_dt_limit=(0.0, float("inf")),
|
||||
mamba_dt_init_floor=1e-4,
|
||||
mamba_conv_bias=True,
|
||||
mamba_proj_bias=False,
|
||||
mamba_chunk_size=256,
|
||||
rescale_prenorm_residual=True,
|
||||
n_routed_experts=8,
|
||||
n_shared_experts=1,
|
||||
moe_intermediate_size=7688,
|
||||
moe_shared_expert_intermediate_size=7688,
|
||||
moe_latent_size=None,
|
||||
num_experts_per_tok=2,
|
||||
routed_scaling_factor=1.0,
|
||||
n_group=1,
|
||||
topk_group=1,
|
||||
norm_topk_prob=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.tie_word_embeddings = tie_word_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.hybrid_override_pattern = hybrid_override_pattern
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
self.sliding_window = sliding_window
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.attention_dropout = attention_dropout
|
||||
self.hidden_dropout = hidden_dropout
|
||||
|
||||
# Validate hybrid_override_pattern
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
|
||||
"hybrid_override_pattern must have same length as num_hidden_layers"
|
||||
)
|
||||
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
|
||||
"hybrid_override_pattern must only contain characters 'M', '*', or '-'"
|
||||
)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.mlp_hidden_act = mlp_hidden_act
|
||||
self.attention_bias = attention_bias
|
||||
self.mlp_bias = mlp_bias
|
||||
self.use_bias = use_bias
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.residual_in_fp32 = residual_in_fp32
|
||||
|
||||
self.use_cache = use_cache
|
||||
self.num_logits_to_keep = num_logits_to_keep
|
||||
|
||||
self.use_mamba_kernels = use_mamba_kernels
|
||||
self.n_groups = mamba_n_groups
|
||||
self.mamba_head_dim = mamba_head_dim
|
||||
self.ssm_state_size = ssm_state_size
|
||||
self.mamba_num_heads = mamba_num_heads
|
||||
self.conv_kernel = mamba_d_conv
|
||||
self.expand = mamba_expand
|
||||
self.mamba_hidden_act = mamba_hidden_act
|
||||
self.time_step_min = mamba_dt_min
|
||||
self.time_step_max = mamba_dt_max
|
||||
self.time_step_limit = mamba_dt_limit
|
||||
self.time_step_floor = mamba_dt_init_floor
|
||||
self.use_conv_bias = mamba_conv_bias
|
||||
self.mamba_proj_bias = mamba_proj_bias
|
||||
self.chunk_size = mamba_chunk_size
|
||||
self.rescale_prenorm_residual = rescale_prenorm_residual
|
||||
self.n_routed_experts = n_routed_experts
|
||||
self.n_shared_experts = n_shared_experts
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size # noqa: E501
|
||||
self.moe_latent_size = moe_latent_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def layers_block_type(self):
|
||||
return [
|
||||
"mamba"
|
||||
if self.hybrid_override_pattern[i] == "M"
|
||||
else "attention"
|
||||
if self.hybrid_override_pattern[i] == "*"
|
||||
else "mlp"
|
||||
if self.hybrid_override_pattern[i] == "-"
|
||||
else "moe"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
83
vllm/transformers_utils/configs/olmo3.py
Normal file
83
vllm/transformers_utils/configs/olmo3.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Olmo3Config(PretrainedConfig):
|
||||
model_type = "olmo3"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50304,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=None,
|
||||
eos_token_id=50279,
|
||||
tie_word_embeddings=False,
|
||||
rope_parameters=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
rms_norm_eps=1e-5,
|
||||
sliding_window=4096,
|
||||
layer_types=None,
|
||||
**kwargs,
|
||||
):
|
||||
# This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
|
||||
# in vLLM.
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["Olmo2ForCausalLM"]
|
||||
elif "Olmo3ForCausalLM" in kwargs["architectures"]:
|
||||
kwargs["architectures"].remove("Olmo3ForCausalLM")
|
||||
kwargs["architectures"].append("Olmo2ForCausalLM")
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 10000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
self.rope_parameters = rope_parameters
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.sliding_window = sliding_window
|
||||
self.layer_types = layer_types
|
||||
if self.layer_types is None:
|
||||
self.layer_types = [
|
||||
"sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
182
vllm/transformers_utils/configs/ovis.py
Normal file
182
vllm/transformers_utils/configs/ovis.py
Normal file
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# ruff: noqa: E501
|
||||
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||
# Ovis Config with AimV2 config registration removed for Transformers compatibility
|
||||
from typing import Any
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
|
||||
class AIMv2Config(PretrainedConfig):
|
||||
"""This is the configuration class to store the configuration of an [`AIMv2Model`].
|
||||
Instantiating a configuration with the defaults will yield a similar configuration
|
||||
to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
|
||||
Args:
|
||||
hidden_size: Dimension of the hidden representations.
|
||||
intermediate_size: Dimension of the SwiGLU representations.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer.
|
||||
num_attention_heads: Number of attention heads for each attention layer
|
||||
in the Transformer.
|
||||
num_channels: Number of input channels.
|
||||
image_size: Image size.
|
||||
patch_size: Patch size.
|
||||
rms_norm_eps: Epsilon value used for the RMS normalization layer.
|
||||
attention_dropout: Dropout ratio for attention probabilities.
|
||||
projection_dropout: Dropout ratio for the projection layer after the attention.
|
||||
qkv_bias: Whether to add a bias to the queries, keys and values.
|
||||
use_bias: Whether to add a bias in the feed-forward and projection layers.
|
||||
kwargs: Keyword arguments for the [`PretrainedConfig`].
|
||||
"""
|
||||
|
||||
model_type: str = "aimv2"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 1024,
|
||||
intermediate_size: int = 2816,
|
||||
num_hidden_layers: int = 24,
|
||||
num_attention_heads: int = 8,
|
||||
num_channels: int = 3,
|
||||
image_size: int = 224,
|
||||
patch_size: int = 14,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
attention_dropout: float = 0.0,
|
||||
projection_dropout: float = 0.0,
|
||||
qkv_bias: bool = False,
|
||||
use_bias: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.attention_dropout = attention_dropout
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.projection_dropout = projection_dropout
|
||||
self.qkv_bias = qkv_bias
|
||||
self.use_bias = use_bias
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Visual Tokenizer Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class BaseVisualTokenizerConfig(PretrainedConfig):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=16384,
|
||||
tokenize_function="softmax",
|
||||
tau=1.0,
|
||||
depths=None,
|
||||
drop_cls_token=False,
|
||||
backbone_config: PretrainedConfig | dict | None = None,
|
||||
hidden_stride: int = 1,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.tokenize_function = tokenize_function
|
||||
self.tau = tau
|
||||
if isinstance(depths, str):
|
||||
depths = [int(x) for x in depths.split("|")]
|
||||
self.depths = depths
|
||||
self.backbone_kwargs = dict[str, Any]()
|
||||
self.drop_cls_token = drop_cls_token
|
||||
if backbone_config is not None:
|
||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
|
||||
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
|
||||
)
|
||||
if not isinstance(backbone_config, PretrainedConfig):
|
||||
model_type = backbone_config["model_type"]
|
||||
if model_type != "aimv2":
|
||||
backbone_config.pop("model_type")
|
||||
backbone_config = AutoConfig.for_model(
|
||||
model_type, **backbone_config
|
||||
)
|
||||
else:
|
||||
backbone_config = AIMv2Config(**backbone_config)
|
||||
self.backbone_config = backbone_config
|
||||
self.hidden_stride = hidden_stride
|
||||
|
||||
|
||||
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "aimv2_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||
|
||||
|
||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "siglip_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
|
||||
|
||||
|
||||
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
||||
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Ovis Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class OvisConfig(PretrainedConfig):
|
||||
model_type = "ovis"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
llm_config: PretrainedConfig | dict | None = None,
|
||||
visual_tokenizer_config: PretrainedConfig | dict | None = None,
|
||||
multimodal_max_length=8192,
|
||||
hidden_size=None,
|
||||
conversation_formatter_class=None,
|
||||
llm_attn_implementation=None,
|
||||
disable_tie_weight=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if llm_config is not None:
|
||||
assert isinstance(llm_config, (PretrainedConfig, dict)), (
|
||||
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
|
||||
)
|
||||
if not isinstance(llm_config, PretrainedConfig):
|
||||
model_type = llm_config["model_type"]
|
||||
llm_config.pop("model_type")
|
||||
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
||||
|
||||
# map llm_config to text_config
|
||||
self.text_config = llm_config
|
||||
if visual_tokenizer_config is not None:
|
||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
|
||||
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
|
||||
)
|
||||
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
||||
model_type = visual_tokenizer_config["model_type"]
|
||||
visual_tokenizer_config.pop("model_type")
|
||||
visual_tokenizer_config = AutoConfig.for_model(
|
||||
model_type, **visual_tokenizer_config
|
||||
)
|
||||
|
||||
self.visual_tokenizer_config = visual_tokenizer_config
|
||||
self.multimodal_max_length = multimodal_max_length
|
||||
self.hidden_size = hidden_size
|
||||
self.conversation_formatter_class = conversation_formatter_class
|
||||
self.llm_attn_implementation = llm_attn_implementation
|
||||
self.disable_tie_weight = disable_tie_weight
|
||||
277
vllm/transformers_utils/configs/qwen3_next.py
Normal file
277
vllm/transformers_utils/configs/qwen3_next.py
Normal file
@@ -0,0 +1,277 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Qwen3-Next model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Qwen3NextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
|
||||
Qwen3-Next model according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 151936):
|
||||
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids`.
|
||||
hidden_size (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 5632):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 48):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 2):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||
hidden_act (`str`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 32768):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_parameters (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_theta` (`float`): The base period of the RoPE embeddings.
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
`short_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`long_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`low_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
|
||||
`high_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
||||
`partial_rotary_factor` (`float`, *optional*, defaults to 0.25):
|
||||
Percentage of the query and keys which will have rotary embedding.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
head_dim (`int`, *optional*, defaults to 256):
|
||||
Projection weights dimension in multi-head attention.
|
||||
linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
|
||||
Kernel size of the convolution used in linear attention layers.
|
||||
linear_key_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each key head in linear attention.
|
||||
linear_value_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each value head in linear attention.
|
||||
linear_num_key_heads (`int`, *optional*, defaults to 16):
|
||||
Number of key heads used in linear attention layers.
|
||||
linear_num_value_heads (`int`, *optional*, defaults to 32):
|
||||
Number of value heads used in linear attention layers.
|
||||
decoder_sparse_step (`int`, *optional*, defaults to 1):
|
||||
The frequency of the MoE layer.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 512):
|
||||
Intermediate size of the routed expert.
|
||||
shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
|
||||
Intermediate size of the shared expert.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 10):
|
||||
Number of selected experts.
|
||||
num_experts (`int`, *optional*, defaults to 512):
|
||||
Number of routed experts.
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the topk probabilities.
|
||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the router logits should be returned by the model. Enabling this will also
|
||||
allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
|
||||
Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
|
||||
The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
|
||||
If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
|
||||
layer_types (`list[str]`, *optional*):
|
||||
Types of each layer (attention or linear).
|
||||
|
||||
```python
|
||||
>>> from transformers import Qwen3NextModel, Qwen3NextConfig
|
||||
|
||||
>>> # Initializing a Qwen3Next style configuration
|
||||
>>> configuration = Qwen3NextConfig()
|
||||
|
||||
>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
|
||||
>>> model = Qwen3NextModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
""" # noqa: E501
|
||||
|
||||
model_type = "qwen3_next"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.experts.*.gate_proj": "colwise",
|
||||
"layers.*.mlp.experts.*.up_proj": "colwise",
|
||||
"layers.*.mlp.experts.*.down_proj": "rowwise",
|
||||
"layers.*.mlp.shared_experts.gate_proj": "colwise",
|
||||
"layers.*.mlp.shared_experts.up_proj": "colwise",
|
||||
"layers.*.mlp.shared_experts.down_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=151936,
|
||||
hidden_size=2048,
|
||||
intermediate_size=5632,
|
||||
num_hidden_layers=48,
|
||||
num_attention_heads=16,
|
||||
num_key_value_heads=2,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=32768,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
tie_word_embeddings=False,
|
||||
rope_parameters=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
head_dim=256,
|
||||
linear_conv_kernel_dim=4,
|
||||
linear_key_head_dim=128,
|
||||
linear_value_head_dim=128,
|
||||
linear_num_key_heads=16,
|
||||
linear_num_value_heads=32,
|
||||
decoder_sparse_step=1,
|
||||
moe_intermediate_size=512,
|
||||
shared_expert_intermediate_size=512,
|
||||
num_experts_per_tok=10,
|
||||
num_experts=512,
|
||||
norm_topk_prob=True,
|
||||
output_router_logits=False,
|
||||
router_aux_loss_coef=0.001,
|
||||
mlp_only_layers=None,
|
||||
layer_types=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mlp_only_layers is None:
|
||||
mlp_only_layers = []
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 10000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
|
||||
if "partial_rotary_factor" not in rope_parameters:
|
||||
rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||
self.rope_parameters = rope_parameters
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.head_dim = head_dim
|
||||
|
||||
self.layer_types = layer_types
|
||||
if self.layer_types is None:
|
||||
self.layer_types = [
|
||||
"linear_attention" if bool((i + 1) % 4) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
# linear attention part
|
||||
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
||||
self.linear_key_head_dim = linear_key_head_dim
|
||||
self.linear_value_head_dim = linear_value_head_dim
|
||||
self.linear_num_key_heads = linear_num_key_heads
|
||||
self.linear_num_value_heads = linear_num_value_heads
|
||||
|
||||
# MoE arguments
|
||||
self.decoder_sparse_step = decoder_sparse_step
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.shared_expert_intermediate_size = shared_expert_intermediate_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_experts = num_experts
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.mlp_only_layers = mlp_only_layers
|
||||
|
||||
|
||||
__all__ = ["Qwen3NextConfig"]
|
||||
89
vllm/transformers_utils/configs/radio.py
Normal file
89
vllm/transformers_utils/configs/radio.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Radio vision model configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
|
||||
"vit_small_patch16_224": (384, 12, 6, 1536),
|
||||
"vit_base_patch16_224": (768, 12, 12, 3072),
|
||||
"vit_large_patch16_224": (1024, 24, 16, 4096),
|
||||
"vit_huge_patch16_224": (1280, 32, 16, 5120),
|
||||
}
|
||||
|
||||
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
|
||||
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
|
||||
|
||||
|
||||
class RadioConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a Radio
|
||||
vision model. It is used to instantiate a Radio model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Args:
|
||||
model_name: Name of the vision transformer model
|
||||
(e.g., "vit_base_patch16_224"). Used to determine architecture
|
||||
dimensions from `VIT_TIMM_DIM_BY_NAME`.
|
||||
image_size: The size (resolution) of each image.
|
||||
patch_size: The size (resolution) of each patch.
|
||||
qkv_bias: Whether to add a bias to the queries, keys and values.
|
||||
qk_normalization: Whether to apply normalization to queries and keys.
|
||||
norm_type: The normalization type to use.
|
||||
layer_norm_eps: The epsilon used by the layer normalization layers.
|
||||
initializer_factor: A factor for initializing all weight matrices.
|
||||
hidden_act: The non-linear activation function in the encoder.
|
||||
max_img_size: Maximum image size for position embeddings.
|
||||
norm_mean: Mean values for image normalization (RGB channels).
|
||||
Defaults to (0.48145466, 0.4578275, 0.40821073)).
|
||||
norm_std: Standard deviation values for image normalization
|
||||
(RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
|
||||
reg_tokens: Number of register tokens to use.
|
||||
"""
|
||||
|
||||
model_type = "radio"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
image_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
qkv_bias: bool = True,
|
||||
qk_normalization: bool = False,
|
||||
norm_type: str = "layer_norm",
|
||||
layer_norm_eps: float = 1e-6,
|
||||
initializer_factor: float = 1.0,
|
||||
hidden_act: str = "gelu",
|
||||
max_img_size: int = 2048,
|
||||
norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
|
||||
norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
|
||||
reg_tokens: int | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.model_name = model_name
|
||||
(
|
||||
self.hidden_size,
|
||||
self.num_hidden_layers,
|
||||
self.num_attention_heads,
|
||||
self.intermediate_size,
|
||||
) = VIT_TIMM_DIM_BY_NAME[model_name]
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.qkv_bias = qkv_bias
|
||||
self.qk_normalization = qk_normalization
|
||||
self.norm_type = norm_type
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.initializer_factor = initializer_factor
|
||||
self.hidden_act = hidden_act
|
||||
self.max_img_size = max_img_size
|
||||
self.norm_mean = (
|
||||
list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
|
||||
)
|
||||
self.norm_std = (
|
||||
list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
|
||||
)
|
||||
self.reg_tokens = reg_tokens
|
||||
super().__init__(**kwargs)
|
||||
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
38
vllm/transformers_utils/configs/speculators/algos.py
Normal file
38
vllm/transformers_utils/configs/speculators/algos.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
SUPPORTED_SPECULATORS_TYPES = {}
|
||||
|
||||
|
||||
def register_speculator(name):
|
||||
def decorator(fn):
|
||||
SUPPORTED_SPECULATORS_TYPES[name] = fn
|
||||
return fn
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@register_speculator("eagle3")
|
||||
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
|
||||
"""
|
||||
Apply Eagle-3 specific configuration transformations.
|
||||
|
||||
Eagle-3 specific fields:
|
||||
- draft_vocab_size: Size of the draft model's vocabulary
|
||||
- target_hidden_size: Hidden size of the target model
|
||||
- norm_before_residual: Whether to apply norm before residual connection
|
||||
- eagle_aux_hidden_state_layer_ids: List of layer indices from the base
|
||||
model to use as auxiliary inputs for the Eagle3 drafter. These layers
|
||||
provide intermediate hidden states that help the drafter make better
|
||||
predictions. This is the standard field used in Eagle3 checkpoints.
|
||||
"""
|
||||
|
||||
vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
|
||||
if config_dict.get("target_hidden_size") is not None:
|
||||
vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
|
||||
vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
|
||||
vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
|
||||
if config_dict.get("eagle_aux_hidden_state_layer_ids"):
|
||||
vllm_config["eagle_aux_hidden_state_layer_ids"] = config_dict[
|
||||
"eagle_aux_hidden_state_layer_ids"
|
||||
]
|
||||
114
vllm/transformers_utils/configs/speculators/base.py
Normal file
114
vllm/transformers_utils/configs/speculators/base.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.speculators.algos import (
|
||||
SUPPORTED_SPECULATORS_TYPES,
|
||||
)
|
||||
|
||||
__all__ = ["SpeculatorsConfig"]
|
||||
|
||||
|
||||
class SpeculatorsConfig(PretrainedConfig):
|
||||
model_type = "speculators"
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: str | os.PathLike,
|
||||
**kwargs,
|
||||
) -> "SpeculatorsConfig":
|
||||
"""Load speculators Eagle config and convert to vLLM format."""
|
||||
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
vllm_config = cls.extract_vllm_speculative_config(config_dict)
|
||||
return cls(**vllm_config)
|
||||
|
||||
@classmethod
|
||||
def extract_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
speculators_model_type = config_dict.get("speculators_model_type")
|
||||
if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
|
||||
raise ValueError(
|
||||
f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
|
||||
"Please ensure you're loading a speculators-format model."
|
||||
)
|
||||
|
||||
# validate fields
|
||||
# TODO: @dsikka - use speculators pydantic model to validate
|
||||
cls.validate_speculators_config(config_dict=config_dict)
|
||||
# Convert from speculators config -> format that can be ingested by vLLM
|
||||
vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
|
||||
# Apply anything specific to the supported algorithm
|
||||
algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
|
||||
algo_updater(config_dict=config_dict, vllm_config=vllm_config)
|
||||
return vllm_config
|
||||
|
||||
@classmethod
|
||||
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
|
||||
try:
|
||||
spec_config = config_dict["speculators_config"]
|
||||
methods = spec_config["proposal_methods"]
|
||||
first_method = methods[0]
|
||||
_ = first_method["speculative_tokens"]
|
||||
_ = spec_config["verifier"]["name_or_path"]
|
||||
_ = config_dict["speculators_model_type"]
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
raise ValueError("Invalid speculators config structure") from e
|
||||
|
||||
if "transformer_layer_config" not in config_dict:
|
||||
raise ValueError("Must provide transformer_layer_config")
|
||||
|
||||
if not isinstance(config_dict["transformer_layer_config"], dict):
|
||||
raise TypeError(
|
||||
"'transformer_layer_config' must be a dictionary if provided"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def build_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Build vLLM-compatible speculative configuration from speculators format.
|
||||
|
||||
This method extracts and transforms speculative configuration from the
|
||||
speculators format into the structure expected by vLLM.
|
||||
|
||||
Args:
|
||||
config_dict: Configuration dictionary in speculators format
|
||||
|
||||
Returns:
|
||||
Dictionary with vLLM-compatible speculative configuration
|
||||
"""
|
||||
# Extract speculators configuration
|
||||
spec_config = config_dict["speculators_config"]
|
||||
|
||||
# Currently we only support one proposal method
|
||||
proposal_methods = spec_config.get("proposal_methods")
|
||||
if not proposal_methods:
|
||||
raise ValueError("No proposal methods found in speculators config")
|
||||
|
||||
first_method = proposal_methods[0]
|
||||
num_speculative_tokens = first_method.get("speculative_tokens")
|
||||
|
||||
if num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
|
||||
)
|
||||
|
||||
# Build base vLLM speculative configuration
|
||||
vllm_config = {
|
||||
"method": config_dict.get("speculators_model_type"),
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"target_model": spec_config.get("verifier")["name_or_path"],
|
||||
}
|
||||
|
||||
# Merge transformer layer configuration if present
|
||||
transformer_config = config_dict.get("transformer_layer_config", {})
|
||||
vllm_config.update(transformer_config)
|
||||
|
||||
return vllm_config
|
||||
178
vllm/transformers_utils/configs/step3_vl.py
Normal file
178
vllm/transformers_utils/configs/step3_vl.py
Normal file
@@ -0,0 +1,178 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Step3VisionEncoderConfig(PretrainedConfig):
|
||||
model_type = "step3_vision_encoder"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1792,
|
||||
intermediate_size=3072,
|
||||
output_hidden_size=4096,
|
||||
num_hidden_layers=63,
|
||||
num_attention_heads=16,
|
||||
num_channels=3,
|
||||
image_size=728,
|
||||
patch_size=14,
|
||||
hidden_act="quick_gelu",
|
||||
layer_norm_eps=1e-5,
|
||||
**kwargs,
|
||||
):
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.output_hidden_size = output_hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3TextConfig(PretrainedConfig):
|
||||
model_type = "step3_text"
|
||||
architectures = ["Step3TextForCausalLM"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 7168,
|
||||
intermediate_size: int = 18432,
|
||||
num_attention_heads: int = 64,
|
||||
num_attention_groups: int = 1,
|
||||
num_hidden_layers: int = 61,
|
||||
max_seq_len: int = 65536,
|
||||
vocab_size: int = 128815,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
moe_intermediate_size: int = 5120,
|
||||
moe_num_experts: int = 48,
|
||||
moe_top_k: int = 3,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
max_position_embedding: int = 65536,
|
||||
share_expert_dim: int = 5120,
|
||||
share_q_dim: int = 2048,
|
||||
head_dim: int = 256,
|
||||
norm_expert_weight: bool = False,
|
||||
moe_layers_enum: tuple[int, ...] = (
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
32,
|
||||
33,
|
||||
34,
|
||||
35,
|
||||
36,
|
||||
37,
|
||||
38,
|
||||
39,
|
||||
40,
|
||||
41,
|
||||
42,
|
||||
43,
|
||||
44,
|
||||
45,
|
||||
46,
|
||||
47,
|
||||
48,
|
||||
49,
|
||||
50,
|
||||
51,
|
||||
52,
|
||||
53,
|
||||
54,
|
||||
55,
|
||||
56,
|
||||
57,
|
||||
58,
|
||||
59,
|
||||
),
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_attention_groups = num_attention_groups
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.moe_num_experts = moe_num_experts
|
||||
self.moe_top_k = moe_top_k
|
||||
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
|
||||
rope_scaling = kwargs.pop("rope_scaling", None)
|
||||
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
|
||||
rope_theta = kwargs.pop("rope_theta", 500000.0)
|
||||
if "rope_theta" not in rope_parameters:
|
||||
rope_parameters["rope_theta"] = rope_theta
|
||||
self.rope_parameters = rope_parameters
|
||||
self.max_position_embedding = max_position_embedding
|
||||
self.share_expert_dim = share_expert_dim
|
||||
self.share_q_dim = share_q_dim
|
||||
self.head_dim = head_dim
|
||||
self.norm_expert_weight = norm_expert_weight
|
||||
self.moe_layers_enum = moe_layers_enum
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3VLConfig(PretrainedConfig):
|
||||
model_type = "step3_vl"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_config: dict | Step3VisionEncoderConfig | None = None,
|
||||
text_config: dict | Step3TextConfig | None = None,
|
||||
understand_projector_stride: int = 1,
|
||||
projector_bias: bool = True,
|
||||
image_token_id: int = 128001,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
if vision_config is None:
|
||||
vision_config = Step3VisionEncoderConfig()
|
||||
elif isinstance(vision_config, dict):
|
||||
vision_config = Step3VisionEncoderConfig(**vision_config)
|
||||
self.vision_config = vision_config
|
||||
|
||||
if text_config is None:
|
||||
text_config = Step3TextConfig()
|
||||
elif isinstance(text_config, dict):
|
||||
text_config = Step3TextConfig(**text_config)
|
||||
self.text_config = text_config
|
||||
|
||||
self.understand_projector_stride = understand_projector_stride
|
||||
self.projector_bias = projector_bias
|
||||
self.hidden_size = text_config.hidden_size
|
||||
self.image_token_id = image_token_id
|
||||
|
||||
super().__init__(**kwargs)
|
||||
24
vllm/transformers_utils/configs/tarsier2.py
Normal file
24
vllm/transformers_utils/configs/tarsier2.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from transformers import Qwen2VLConfig
|
||||
|
||||
|
||||
class Tarsier2Config(Qwen2VLConfig):
|
||||
"""
|
||||
Tarsier2's config.json is written such that AutoConfig.from_pretrained will create
|
||||
a deeply nested config consisting of:
|
||||
|
||||
- LlavaConfig
|
||||
- Qwen2VLConfig
|
||||
- Qwen2VLTextConfig
|
||||
- Qwen2VLVisionConfig
|
||||
- Qwen2VLConfig
|
||||
- Qwen2VLTextConfig
|
||||
- Qwen2VLVisionConfig
|
||||
|
||||
When it should really just be a single Qwen2VLConfig.
|
||||
|
||||
This class is a hack to stop AutoConfig from creating the nested config structure.
|
||||
"""
|
||||
|
||||
model_type = "tarsier2"
|
||||
120
vllm/transformers_utils/configs/ultravox.py
Normal file
120
vllm/transformers_utils/configs/ultravox.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
||||
from typing import Any
|
||||
|
||||
import transformers
|
||||
|
||||
|
||||
class UltravoxConfig(transformers.PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`UltravoxForConditionalGeneration`]. It is used to instantiate an
|
||||
Ultravox model according to the specified arguments, defining the model
|
||||
architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to
|
||||
control the model outputs. Read the documentation from [`PretrainedConfig`]
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
audio_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
Custom audio config or dict.
|
||||
text_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
The config object of the text backbone.
|
||||
audio_model_id (`str`, *optional*):
|
||||
The model ID of the audio backbone.
|
||||
text_model_id (`str`, *optional*):
|
||||
The model ID of the text backbone.
|
||||
ignore_index (`int`, *optional*, defaults to -100):
|
||||
The ignore index for the loss function.
|
||||
audio_token_index (`int`, *optional*, defaults to 32000):
|
||||
The audio token index to encode the audio prompt.
|
||||
stack_factor (`int`, *optional*, defaults to 8):
|
||||
Audio downsampling factor for the multimodal projector.
|
||||
norm_init (`float`, *optional*, defaults to 0.4):
|
||||
The initialization value for the layer normalization.
|
||||
projector_act (`str`, *optional*, defaults to `"swiglu"`):
|
||||
The activation function used by the multimodal projector.
|
||||
projector_ln_mid (`bool`, *optional*, defaults to `False`):
|
||||
Whether to apply layer normalization at the middle of the
|
||||
projector or at the end. Versions v0.4.1 and below
|
||||
use `False`, but v0.5 and above use `True`.
|
||||
"""
|
||||
|
||||
wrapped_model_config: transformers.PretrainedConfig
|
||||
model_type = "ultravox"
|
||||
audio_token = "<|audio|>"
|
||||
is_composition = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_config: dict[str, Any] | None = None,
|
||||
text_config: dict[str, Any] | None = None,
|
||||
audio_model_id: str | None = None,
|
||||
text_model_id: str | None = None,
|
||||
ignore_index: int = -100,
|
||||
audio_token_index: int = 32000,
|
||||
hidden_size: int = 4096,
|
||||
stack_factor: int = 8,
|
||||
norm_init: float = 0.4,
|
||||
projector_act: str = "swiglu",
|
||||
projector_ln_mid: bool = False,
|
||||
num_projector_layers: int = 0,
|
||||
**kwargs,
|
||||
):
|
||||
self.ignore_index = ignore_index
|
||||
self.audio_token_index = audio_token_index
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.stack_factor = stack_factor
|
||||
self.norm_init = norm_init
|
||||
self.projector_act = projector_act
|
||||
self.projector_ln_mid = projector_ln_mid
|
||||
self.num_projector_layers = num_projector_layers
|
||||
|
||||
# N.B. May set the wrapped_model_config below.
|
||||
self.text_model_id = text_model_id
|
||||
if text_model_id is None:
|
||||
text_config = text_config or {}
|
||||
self.wrapped_model_config = transformers.CONFIG_MAPPING[
|
||||
text_config.get("model_type", "llama")
|
||||
](**text_config)
|
||||
|
||||
# N.B. May set the audio_config below.
|
||||
self.audio_model_id = audio_model_id
|
||||
if audio_model_id is None:
|
||||
self.audio_model_id = None
|
||||
audio_config = audio_config or {}
|
||||
self.audio_config = transformers.CONFIG_MAPPING[
|
||||
audio_config.get("model_type", "whisper")
|
||||
](**audio_config)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
# Since --hf-overrides are applied _after_ the UltravoxConfig is
|
||||
# instantiated, load the configs implicitly when assigning text_model_id
|
||||
# or audio_model_id. This allows:
|
||||
#
|
||||
# --hf-overrides.text_model_id=<quantized variant>
|
||||
#
|
||||
# to behave as intended.
|
||||
if key == "text_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.wrapped_model_config = get_config(value, trust_remote_code=False)
|
||||
elif key == "audio_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.audio_config = get_config(value, trust_remote_code=False)
|
||||
|
||||
return super().__setattr__(key, value)
|
||||
|
||||
@property
|
||||
def text_config(self) -> transformers.PretrainedConfig:
|
||||
# When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
|
||||
# the full model, but the text config is the text config of the inner
|
||||
# model.
|
||||
return self.wrapped_model_config.get_text_config()
|
||||
Reference in New Issue
Block a user