commit 8082d5f4b2 (parent 809cecae09)
root, 2026-04-09 11:19:36 +08:00
2579 changed files with 3675 additions and 0 deletions

View File

@@ -1,70 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Model configs may be defined in this directory for the following reasons:
- There is no configuration file for the model on the HF Hub or in the Transformers library.
- The existing config needs to be overridden to support vLLM.
"""
from vllm.transformers_utils.configs.afmoe import AfmoeConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
from vllm.transformers_utils.configs.medusa import MedusaConfig
from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
from vllm.transformers_utils.configs.nemotron import NemotronConfig
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from vllm.transformers_utils.configs.olmo3 import Olmo3Config
from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (
Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig,
)
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__ = [
"AfmoeConfig",
"ChatGLMConfig",
"DeepseekVLV2Config",
"DotsOCRConfig",
"EAGLEConfig",
"FlexOlmoConfig",
"RWConfig",
"JAISConfig",
"Lfm2MoeConfig",
"MedusaConfig",
"MiDashengLMConfig",
"MLPSpeculatorConfig",
"MoonViTConfig",
"KimiLinearConfig",
"KimiVLConfig",
"NemotronConfig",
"NemotronHConfig",
"Olmo3Config",
"OvisConfig",
"RadioConfig",
"SpeculatorsConfig",
"UltravoxConfig",
"Step3VLConfig",
"Step3VisionEncoderConfig",
"Step3TextConfig",
"Qwen3NextConfig",
]
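Usage sketch (illustrative values, not part of the file): every class exported here subclasses `PretrainedConfig`, so the standard Transformers construction and dict round trip applies.

from vllm.transformers_utils.configs import MedusaConfig

config = MedusaConfig(hidden_size=2048, num_heads=4)
assert config.to_dict()["hidden_size"] == 2048
restored = MedusaConfig.from_dict(config.to_dict())
assert restored.num_heads == 4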

View File

@@ -1,84 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class AfmoeConfig(PretrainedConfig):
model_type = "afmoe"
def __init__(
self,
vocab_size: int = 200_192,
hidden_size: int = 2048,
intermediate_size: int = 6144,
moe_intermediate_size: int = 1408,
num_hidden_layers: int = 32,
num_dense_layers: int = 1,
num_attention_heads: int = 16,
num_key_value_heads: int | None = None,
head_dim: int = 128,
hidden_act: str = "silu",
max_position_embeddings: int = 131072,
initializer_range: float = 0.02,
rms_norm_eps: float = 1e-5,
use_cache: bool = True,
tie_word_embeddings: bool = False,
rope_theta: float = 10000.0,
rope_scaling: dict | None = None,
num_experts: int = 64,
num_experts_per_tok: int = 6,
num_shared_experts: int = 2,
num_expert_groups: int = 1,
num_limited_groups: int = 1,
score_func: str = "sigmoid",
route_norm: bool = True,
route_scale: float = 1.0,
global_attn_every_n_layers: int = 4,
sliding_window: int = 2048,
layer_types: list[str] | None = None,
attention_dropout: float = 0.0,
mup_enabled: bool = False,
n_group: int = 1,
topk_group: int = 1,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_dense_layers = num_dense_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads or num_attention_heads
self.head_dim = head_dim
self.hidden_act = hidden_act
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.moe_intermediate_size = moe_intermediate_size
self.num_experts = num_experts
self.num_experts_per_tok = num_experts_per_tok
self.num_shared_experts = num_shared_experts
self.num_expert_groups = num_expert_groups
self.num_limited_groups = num_limited_groups
self.score_func = score_func
self.route_norm = route_norm
self.route_scale = route_scale
self.global_attn_every_n_layers = global_attn_every_n_layers
self.sliding_window = sliding_window
self.layer_types = layer_types
self.attention_dropout = attention_dropout
self.mup_enabled = mup_enabled
self.n_group = n_group
self.topk_group = topk_group
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
__all__ = ["AfmoeConfig"]

View File

@@ -1,206 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
"""Arctic model configuration"""
from dataclasses import asdict, dataclass
from typing import Any
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
}
@dataclass
class ArcticLoRAConfig:
lora_r: int = 64
lora_alpha: float = 16
shard_base_weights: bool = False
@dataclass
class ArcticQuantizationConfig:
q_bits: int = 8
rounding: str = "nearest"
mantissa_bits: int = 3
group_size: int = 128
class ArcticConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`ArcticModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 14336):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
            allows sequences of up to 4096*32 tokens.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 1):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, no sliding window is applied.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            The number of experts to route per token; can also be interpreted as the `top-k` routing
            parameter.
num_local_experts (`int`, *optional*, defaults to 8):
Number of experts per Sparse MLP layer.
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
The aux loss factor for the total loss.
```python
>>> from transformers import ArcticModel, ArcticConfig
>>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
>>> configuration = ArcticConfig()
>>> # Initializing a model from the Arctic 7B style configuration
>>> model = ArcticModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "arctic"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=14336,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=1e6,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=1,
num_local_experts=8,
router_aux_loss_coef=0.001,
moe_layer_frequency=2,
parallel_attn_mlp_res=False,
moe_train_capacity_factor=1,
moe_eval_capacity_factor=1,
enable_expert_tensor_parallelism=False,
moe_min_capacity=0,
moe_token_dropping=True,
quantization=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.sliding_window = sliding_window
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
self.num_local_experts = num_local_experts
self.router_aux_loss_coef = router_aux_loss_coef
self.moe_layer_frequency = moe_layer_frequency
self.moe_train_capacity_factor = moe_train_capacity_factor
self.moe_eval_capacity_factor = moe_eval_capacity_factor
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
self.moe_min_capacity = moe_min_capacity
self.moe_token_dropping = moe_token_dropping
self.parallel_attn_mlp_res = parallel_attn_mlp_res
if isinstance(quantization, dict):
self.quantization = ArcticQuantizationConfig(**quantization)
else:
self.quantization = quantization
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@classmethod
def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
result = super().from_dict(config_dict, **kwargs)
config = result[0] if isinstance(result, tuple) else result
if isinstance(config.quantization, dict):
config.quantization = ArcticQuantizationConfig(**config.quantization)
return result
def to_dict(self) -> dict[str, Any]:
ret = super().to_dict()
if isinstance(ret["quantization"], ArcticQuantizationConfig):
ret["quantization"] = asdict(ret["quantization"])
return ret
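A sketch of the quantization round trip (illustrative; the module path in the import is assumed): a plain dict passed as `quantization` is promoted to an `ArcticQuantizationConfig` dataclass on load, and `to_dict()` flattens it back for serialization.

from vllm.transformers_utils.configs.arctic import (  # module path assumed
    ArcticConfig,
    ArcticQuantizationConfig,
)

cfg = ArcticConfig(
    quantization={"q_bits": 4, "rounding": "nearest", "mantissa_bits": 3, "group_size": 64}
)
assert isinstance(cfg.quantization, ArcticQuantizationConfig)
exported = cfg.to_dict()
assert exported["quantization"]["q_bits"] == 4  # dataclass flattened back to a dict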

View File

@@ -1,75 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/zai-org/ChatGLM2-6B
from transformers import PretrainedConfig
class ChatGLMConfig(PretrainedConfig):
model_type = "chatglm"
attribute_map = {
"num_hidden_layers": "num_layers",
"n_head_kv": "multi_query_group_num",
}
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs,
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.kv_channels = kv_channels
self.num_attention_heads = num_attention_heads
self.seq_length = seq_length
# It is to be compatible with long lora.
self.max_position_embeddings = seq_length
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = (
apply_residual_connection_post_layernorm
)
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias
self.bias_dropout_fusion = bias_dropout_fusion
self.multi_query_attention = multi_query_attention
self.multi_query_group_num = multi_query_group_num
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.fp32_residual_connection = fp32_residual_connection
self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
self.interleaved_qkv = interleaved_qkv
super().__init__(**kwargs)
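A sketch of the `attribute_map` aliasing (illustrative, not part of the file): HF-style attribute names resolve to the ChatGLM-native fields, which is how the rest of vLLM can read `num_hidden_layers` from this config.

from vllm.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=2, multi_query_group_num=4)
assert cfg.num_hidden_layers == 2  # aliased to num_layers
assert cfg.n_head_kv == 4  # aliased to multi_query_group_num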

View File

@@ -1,126 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
from transformers import DeepseekV2Config, PretrainedConfig
class VisionEncoderConfig(PretrainedConfig):
model_type: str = "vision"
model_name: str = "vit_so400m_patch14_siglip_384.webli"
image_size: int = 384
patch_size: int = 16
width: int = 1024
layers: int = 24
heads: int = 16
mlp_ratio: int = 4
global_pool: str = "map"
ignore_head: bool = True
class_token: bool = False
num_classes: int = 0
use_checkpoint: bool = False
weight_init: str = "skip"
deterministic: bool = False
num_recomputing_layers: int = 0
def __init__(
self,
model_name: str = "vit_so400m_patch14_siglip_384.webli",
image_size: int = 384,
patch_size: int = 16,
width: int = 1024,
layers: int = 24,
heads: int = 16,
mlp_ratio: int = 4,
global_pool: str = "map",
ignore_head: bool = True,
class_token: bool = False,
num_classes: int = 0,
use_checkpoint: bool = False,
**kwargs,
):
self.model_name = model_name
self.image_size = image_size
self.patch_size = patch_size
self.width = width
self.layers = layers
self.heads = heads
self.mlp_ratio = mlp_ratio
self.global_pool = global_pool
self.ignore_head = ignore_head
self.class_token = class_token
self.num_classes = num_classes
self.use_checkpoint = use_checkpoint
super().__init__(**kwargs)
class MlpProjectorConfig(PretrainedConfig):
model_type = "mlp_projector"
projector_type: str = "downsample_mlp_gelu"
input_dim: int = 1152
n_embed: int = 2048
depth: int = 2
mlp_ratio: int = 1
downsample_ratio: int = 2
token_pooling: bool = False
def __init__(
self,
projector_type: str = "downsample_mlp_gelu",
input_dim: int = 1152,
n_embed: int = 2048,
depth: int = 2,
mlp_ratio: int = 1,
downsample_ratio: int = 2,
**kwargs,
):
self.projector_type = projector_type
self.input_dim = input_dim
self.n_embed = n_embed
self.depth = depth
self.mlp_ratio = mlp_ratio
self.downsample_ratio = downsample_ratio
super().__init__(**kwargs)
class DeepseekVLV2Config(PretrainedConfig):
model_type = "deepseek_vl_v2"
vision_config: VisionEncoderConfig
projector_config: MlpProjectorConfig
tile_tag: str = "2D"
global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int], ...] = ((384, 384),)
def __init__(
self,
tile_tag: str = "tile_tag",
global_view_pos: str = "head",
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
**kwargs,
):
super().__init__(**kwargs)
vision_config = kwargs.get("vision_config", {})
self.vision_config = VisionEncoderConfig(**vision_config)
projector_config = kwargs.get("projector_config", {})
self.projector_config = MlpProjectorConfig(**projector_config)
language_config = kwargs.get("language_config", {})
self.text_config = DeepseekV2Config(**language_config)
self.tile_tag = tile_tag
self.global_view_pos = global_view_pos
self.candidate_resolutions = candidate_resolutions
self.vocab_size = self.text_config.vocab_size
# update model_type for OCR model
if "DeepseekOCRForCausalLM" in (
self.architectures or kwargs.get("architectures", [])
):
self.model_type = "deepseek_ocr"

View File

@@ -1,71 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
class DotsVisionConfig(PretrainedConfig):
model_type: str = "dots_vit"
def __init__(
self,
embed_dim: int = 1536, # vision encoder embed size
hidden_size: int = 1536, # after merger hidden size
intermediate_size: int = 4224,
num_hidden_layers: int = 42,
num_attention_heads: int = 12,
num_channels: int = 3,
patch_size: int = 14,
spatial_merge_size: int = 2,
temporal_patch_size: int = 1,
rms_norm_eps: float = 1e-5,
use_bias: bool = False,
attn_implementation="flash_attention_2",
initializer_range=0.02,
init_merger_std=0.02,
        is_causal=False,  # whether the vision encoder uses a causal forward pass
post_norm=True,
gradient_checkpointing=False,
**kwargs: Any,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.temporal_patch_size = temporal_patch_size
self.rms_norm_eps = rms_norm_eps
self.use_bias = use_bias
self.attn_implementation = attn_implementation
self.initializer_range = initializer_range
self.init_merger_std = init_merger_std
self.is_causal = is_causal
self.post_norm = post_norm
self.gradient_checkpointing = gradient_checkpointing
class DotsOCRConfig(Qwen2Config):
model_type = "dots_ocr"
def __init__(
self,
image_token_id=151665,
video_token_id=151656,
vision_config: dict | None = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.vision_config = DotsVisionConfig(**(vision_config or {}))
def save_pretrained(self, save_directory, **kwargs):
self._auto_class = None
super().save_pretrained(save_directory, **kwargs)

View File

@@ -1,84 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
class EAGLEConfig(PretrainedConfig):
model_type = "eagle"
def __init__(
self,
model: PretrainedConfig | dict | None = None,
truncated_vocab_size: int | None = None,
method: str | None = "eagle",
**kwargs,
):
model_config: PretrainedConfig | DeepseekV2Config | None
if isinstance(model, dict):
model_config = AutoConfig.for_model(**model)
else:
model_config = model
for k, v in kwargs.items():
if k != "architectures" and k != "model_type" and hasattr(model_config, k):
setattr(model_config, k, v)
self.model = model_config
if self.model is None:
self.truncated_vocab_size = None
else:
self.truncated_vocab_size = (
self.model.vocab_size
if truncated_vocab_size is None
else truncated_vocab_size
)
# Eagle model name should follow naming convention of
# LlamaForCausalLM -> EagleLlamaForCausalLM
# LlamaForCausalLM -> Eagle3LlamaForCausalLM
# LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
if method == "eagle":
assert self.model is not None, (
"model should not be None when method is eagle"
)
kwargs["architectures"] = [
f"Eagle{arch}" if not arch.startswith("Eagle") else arch
for arch in self.model.architectures
]
elif method == "eagle3":
assert self.model is not None, (
"model should not be None when method is eagle3"
)
kwargs["architectures"] = [
arch
if arch.startswith("Eagle3") or arch.endswith("Eagle3")
else f"Eagle3{arch}"
for arch in self.model.architectures
]
else:
raise ValueError(
f"Invalid method {method}. Supported methods are eagle and eagle3."
)
super().__init__(**kwargs)
if self.model is not None:
for k, v in self.model.to_dict().items():
if k not in kwargs:
setattr(self, k, v)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str | os.PathLike,
**kwargs,
) -> "EAGLEConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs
)
return cls.from_dict(config_dict, **kwargs)
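A sketch of the architecture-renaming convention (illustrative; the target model dict is hypothetical): the draft config derives its `architectures` from the wrapped model, prefixing `Eagle` or `Eagle3` per the naming rules in the comment above.

from vllm.transformers_utils.configs import EAGLEConfig

cfg = EAGLEConfig(
    model={"model_type": "llama", "architectures": ["LlamaForCausalLM"]},
    method="eagle3",
)
assert cfg.architectures == ["Eagle3LlamaForCausalLM"]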

View File

@@ -1,89 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig
class RWConfig(PretrainedConfig):
model_type = "falcon"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_hidden_layers": "n_layer",
"num_attention_heads": "n_head",
"num_kv_heads": "n_head_kv",
}
def __init__(
self,
vocab_size=250880,
hidden_size=64,
n_layer=2,
n_head=8,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
use_cache=True,
bos_token_id=1,
eos_token_id=2,
hidden_dropout=0.0,
attention_dropout=0.0,
multi_query=True,
n_head_kv=None,
alibi=False,
bias=False,
parallel_attn=False,
new_decoder_architecture=False,
**kwargs,
) -> None:
self.vocab_size = vocab_size
# Backward compatibility with n_embed kwarg
n_embed = kwargs.pop("n_embed", None)
self.hidden_size = hidden_size if n_embed is None else n_embed
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.use_cache = use_cache
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.multi_query = multi_query
self.n_head_kv = 1 if n_head_kv is None else n_head_kv
self.alibi = alibi
self.bias = bias
self.parallel_attn = parallel_attn
self.new_decoder_architecture = new_decoder_architecture
if self.hidden_size == 8192:
# Hack for falcon-40b
self.new_decoder_architecture = True
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def head_dim(self):
return self.hidden_size // self.n_head
@property
def rotary(self):
return not self.alibi
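A quick sketch of the derived properties (illustrative; the dimensions roughly follow falcon-7b): `head_dim` is computed rather than stored, and rotary embeddings are implied by the absence of ALiBi.

from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=4544, n_head=71, alibi=False)
assert cfg.head_dim == 64
assert cfg.rotary  # no ALiBi, so RoPE is used
assert not cfg.new_decoder_architecture  # only forced on for hidden_size == 8192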

View File

@@ -1,77 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class FlexOlmoConfig(PretrainedConfig):
model_type = "flex_olmo"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=100352,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
rms_norm_eps=1e-06,
use_cache=True,
pad_token_id=100277,
bos_token_id=None,
eos_token_id=100257,
tie_word_embeddings=False,
rope_theta=500000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
num_experts_per_tok=5,
num_experts=7,
output_router_logits=False,
router_aux_loss_coef=0.01,
norm_topk_prob=False,
**kwargs,
):
if "architectures" not in kwargs:
kwargs["architectures"] = ["FlexOlmoForCausalLM"]
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
self.num_experts = num_experts
self.output_router_logits = output_router_logits
self.router_aux_loss_coef = router_aux_loss_coef
self.norm_topk_prob = norm_topk_prob
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]

View File

@@ -1,243 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright 2023 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""JAIS configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class JAISConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a
[`JAISModel`]. It is used to instantiate a JAIS model according to the
specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used
to control the model outputs. Read the documentation from
[`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50257):
Vocabulary size of the JAIS model. Defines the number of different
tokens that can be represented by the
`inputs_ids` passed when calling [`JAISModel`].
n_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used
with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
n_embd (`int`, *optional*, defaults to 768):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the
Transformer encoder.
n_inner (`int`, *optional*, defaults to None):
Dimensionality of the inner feed-forward layers. `None` will set
it to 4 times n_embd
activation_function (`str`, *optional*, defaults to `"gelu"`):
Activation function, to be selected in the list
`["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in
the embeddings, encoder, and pooler.
embd_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights
            by `1 / (layer_idx + 1)`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention
(dot-product)
and upcast attention dot-product/softmax to float() when training
with mixed precision.
position_embedding_type (`str`, *optional*, defaults to `"learned"`):
Positional embedding can be either `"alibi"` or `"learned"`.
mup_width_scale (`float`, *optional*, defaults to 1.0):
muP parameter to scale learning rate and initializers. Calculated
as (`d_model,0 / d_model`), where
`d_model` is the model's width and `d_model,0` is the proxy
model's width.
mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
muP parameter to scale token and position embeddings.
mup_output_alpha (`float`, *optional*, defaults to 1.0):
muP parameter to scale output logits
(`output_logits_scale = mup_output_alpha * mup_width_scale`).
mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
Scale attention weights by dividing by hidden_size instead of
sqrt(hidden_size). Need to set scale_attn_weights to `True` as
well.
alibi_scaling (`dict`, *optional*):
Dictionary containing the scaling configuration for ALiBi
embeddings. Currently only supports linear
scaling strategy. Can specify either the scaling `factor` (must be
a float greater than 1) for fixed scaling
or `train_seq_len` for dynamic scaling on input samples with
sequence length > `train_seq_len`. The expected
formats are `{"type": strategy name, "factor": scaling factor}` or
`{"type": strategy name,
"train_seq_len": training sequence length}`.
architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
architecture names for Jais.
Example:
```python
>>> from transformers import JAISConfig, JAISModel
>>> # Initializing a JAIS configuration
>>> configuration = JAISConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = JAISModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "jais"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
scale_attn_by_inverse_layer_idx=False,
reorder_and_upcast_attn=False,
position_embedding_type="learned",
mup_width_scale=1.0,
mup_embeddings_scale=1.0,
mup_output_alpha=1.0,
mup_scale_qk_dot_by_d=False,
alibi_scaling=None,
architectures=None,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
self.reorder_and_upcast_attn = reorder_and_upcast_attn
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.position_embedding_type = position_embedding_type
self.mup_width_scale = mup_width_scale
self.mup_embeddings_scale = mup_embeddings_scale
self.mup_output_alpha = mup_output_alpha
self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
self.alibi_scaling = alibi_scaling
self._alibi_scaling_validation()
if architectures is None:
architectures = ["JAISLMHeadModel"]
super().__init__(
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
architectures=architectures,
**kwargs,
)
def _alibi_scaling_validation(self):
"""
Validate the `alibi_scaling` configuration.
"""
if self.alibi_scaling is None:
return
if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
raise ValueError(
"`alibi_scaling` must be a dictionary with two fields, "
"`type` and `factor` or `type` and `train_seq_len`, "
f"got {self.alibi_scaling}"
)
alibi_scaling_type = self.alibi_scaling.get("type", None)
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
if alibi_scaling_type is None or alibi_scaling_type != "linear":
raise ValueError(
f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}"
)
        if alibi_scaling_factor is not None and (
            not isinstance(alibi_scaling_factor, float) or alibi_scaling_factor <= 1.0
        ):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}"
            )
        if alibi_dynamic_scaling is not None and (
            not isinstance(alibi_dynamic_scaling, int) or alibi_dynamic_scaling <= 1
        ):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}"
            )
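A sketch of the validation contract (illustrative, not part of the file): only a two-field linear `alibi_scaling` dict is accepted, with either a float `factor` > 1.0 or an integer `train_seq_len` > 1.

from vllm.transformers_utils.configs import JAISConfig

cfg = JAISConfig(alibi_scaling={"type": "linear", "factor": 2.0})  # accepted
assert cfg.alibi_scaling["factor"] == 2.0
# Both of these would raise ValueError:
#   JAISConfig(alibi_scaling={"type": "dynamic", "factor": 2.0})
#   JAISConfig(alibi_scaling={"type": "linear", "factor": 0.5})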

View File

@@ -1,144 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
class KimiLinearConfig(PretrainedConfig):
model_type = "kimi_linear"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
model_type="kimi_linear",
vocab_size=163840,
hidden_size=4096,
head_dim=None,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
rope_theta=10000.0,
rope_scaling=None,
tie_word_embeddings=False,
moe_intermediate_size: int | None = None,
moe_renormalize: bool = True,
moe_router_activation_func: str = "sigmoid",
num_experts: int | None = None,
num_experts_per_token: int | None = None,
num_shared_experts: int = 0,
routed_scaling_factor: float = 1.0,
first_k_dense_replace: int = 0,
moe_layer_freq: int = 1,
use_grouped_topk: bool = True,
num_expert_group: int = 1,
topk_group: int = 1,
q_lora_rank: int | None = None,
kv_lora_rank: int | None = None,
qk_nope_head_dim: int | None = None,
qk_rope_head_dim: int | None = None,
v_head_dim: int | None = None,
mla_use_nope: bool | None = False,
num_nextn_predict_layers: int = 0,
linear_attn_config: dict | None = None,
**kwargs,
):
self.model_type = model_type
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.head_dim = (
head_dim if head_dim is not None else hidden_size // num_attention_heads
)
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
self.qk_nope_head_dim = qk_nope_head_dim
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.mla_use_nope = mla_use_nope
# moe config
self.num_experts = num_experts
self.num_experts_per_token = num_experts_per_token
self.moe_renormalize = moe_renormalize
self.num_shared_experts = num_shared_experts
self.routed_scaling_factor = routed_scaling_factor
self.moe_router_activation_func = moe_router_activation_func
assert self.moe_router_activation_func in ("softmax", "sigmoid")
self.moe_intermediate_size = moe_intermediate_size
self.first_k_dense_replace = first_k_dense_replace
self.moe_layer_freq = moe_layer_freq
self.use_grouped_topk = use_grouped_topk
self.num_expert_group = num_expert_group
self.topk_group = topk_group
self.num_nextn_predict_layers = num_nextn_predict_layers
if linear_attn_config is not None:
assert linear_attn_config["kda_layers"] is not None
assert linear_attn_config["full_attn_layers"] is not None
self.linear_attn_config = linear_attn_config
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@property
def is_mla(self):
return (
self.q_lora_rank is not None
or self.kv_lora_rank is not None
or self.qk_nope_head_dim is not None
or self.qk_rope_head_dim is not None
or self.v_head_dim is not None
or self.mla_use_nope is True
)
@property
def is_moe(self):
return self.num_experts is not None
@property
def is_linear_attn(self) -> bool:
return not (
self.linear_attn_config is None
or (
isinstance(self.linear_attn_config, dict)
and self.linear_attn_config["kda_layers"] is not None
and len(self.linear_attn_config["kda_layers"]) == 0
)
)
def is_kda_layer(self, layer_idx: int):
return (
self.linear_attn_config is not None
and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
)
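A sketch of the layer-dispatch helpers (illustrative; the layer lists are hypothetical): `is_mla` is inferred from any MLA-specific field being set, while `is_kda_layer` treats `kda_layers` as 1-indexed.

from vllm.transformers_utils.configs import KimiLinearConfig

cfg = KimiLinearConfig(
    kv_lora_rank=512,
    linear_attn_config={"kda_layers": [1, 2, 3], "full_attn_layers": [4]},
)
assert cfg.is_mla and cfg.is_linear_attn and not cfg.is_moe
assert cfg.is_kda_layer(0)  # layer index 0 -> entry 1 in kda_layers
assert not cfg.is_kda_layer(3)  # layer index 3 -> entry 4, a full-attention layer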

View File

@@ -1,38 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers import DeepseekV2Config
from transformers.configuration_utils import PretrainedConfig
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
class KimiVLConfig(PretrainedConfig):
model_type = "kimi_vl"
def __init__(
self,
vision_config: dict | MoonViTConfig | None = None,
text_config: dict | DeepseekV2Config | None = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
**kwargs,
):
if vision_config is None:
vision_config = MoonViTConfig()
elif isinstance(vision_config, dict):
vision_config = MoonViTConfig(**vision_config)
self.vision_config = vision_config
if text_config is None:
text_config = DeepseekV2Config()
elif isinstance(text_config, dict):
text_config = DeepseekV2Config(**text_config)
self.text_config = text_config
self.ignore_index = ignore_index
self.media_placeholder_token_id = media_placeholder_token_id
super().__init__(pad_token_id=pad_token_id, **kwargs)

View File

@@ -1,159 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class Lfm2MoeConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LFM2-8B-A1B model.
e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the Lfm2Moe model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Lfm2MoeModel`]
hidden_size (`int`, *optional*, defaults to 2048):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 7168):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 1792):
Intermediate size of the routed expert.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
max_position_embeddings (`int`, *optional*, defaults to 128000):
The maximum sequence length that this model might ever be used with.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 8):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
conv_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in the conv layers.
conv_L_cache (`int`, *optional*, defaults to 3):
L_cache dim in the conv layers.
        num_dense_layers (`int`, *optional*, defaults to 2):
            Number of dense Lfm2MoeMLP layers at the start of the model (embed -> dense -> ... -> dense -> moe -> ... -> moe -> lm_head).
num_experts_per_tok (`int`, *optional*, defaults to 4):
Number of selected experts.
num_experts (`int`, *optional*, defaults to 32):
Number of routed experts.
use_expert_bias (`bool`, *optional*, defaults to `True`):
Whether to use the expert bias on the routing weights.
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
Scaling factor for routed experts in MoE models.
norm_topk_prob (`bool`, *optional*, defaults to `True`):
Whether to normalize the topk probabilities.
        layer_types (`list[str]`, *optional*):
            Type of each layer.
```python
>>> from transformers import Lfm2MoeModel, Lfm2MoeConfig
>>> # Initializing a LFM2 Moe model
>>> configuration = Lfm2MoeConfig()
>>> # Initializing a model from the LFM2-8B-A1B style configuration
>>> model = Lfm2MoeModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```""" # noqa: E501
model_type = "lfm2_moe"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size: int = 65536,
hidden_size: int = 2048,
intermediate_size: int = 7168,
moe_intermediate_size: int = 1792,
num_hidden_layers: int = 32,
pad_token_id: int = 0,
bos_token_id: int = 1,
eos_token_id: int = 2,
tie_word_embeddings: bool = True,
rope_theta: float = 1000000.0,
max_position_embeddings: int = 128_000,
use_cache: bool = True,
norm_eps: float = 0.00001,
num_attention_heads: int = 32,
num_key_value_heads: int = 8,
conv_bias: bool = False,
conv_L_cache: int = 3,
num_dense_layers: int = 2,
num_experts_per_tok: int = 4,
num_experts: int = 32,
use_expert_bias: bool = True,
routed_scaling_factor: float = 1.0,
norm_topk_prob: bool = True,
layer_types: list[str] | None = None,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_cache = use_cache
self.norm_eps = norm_eps
# attn operator config
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
# custom operator config
self.conv_bias = conv_bias
self.conv_L_cache = conv_L_cache
# moe config
self.num_dense_layers = num_dense_layers
self.moe_intermediate_size = moe_intermediate_size
self.num_experts_per_tok = num_experts_per_tok
self.num_experts = num_experts
self.use_expert_bias = use_expert_bias
self.routed_scaling_factor = routed_scaling_factor
self.norm_topk_prob = norm_topk_prob
self.layer_types = layer_types
tie_word_embeddings = kwargs.get(
"tie_embedding", tie_word_embeddings
) # to fit original config keys
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["Lfm2MoeConfig"]

View File

@@ -1,65 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import PretrainedConfig
class MedusaConfig(PretrainedConfig):
model_type = "medusa"
def __init__(
self,
hidden_size: int = 4096,
vocab_size: int = 32001,
num_heads: int = 5,
num_hidden_layers: int = 1,
max_paths: int = 64,
topk: int = 10,
truncated_vocab_size: int | None = None,
**kwargs,
):
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.num_heads = num_heads
self.num_hidden_layers = num_hidden_layers
self.max_paths = max_paths
self.topk = topk
self.max_seq_len = int(2**20)
self.truncated_vocab_size = (
vocab_size if truncated_vocab_size is None else truncated_vocab_size
)
if "architectures" not in kwargs:
kwargs["architectures"] = ["MedusaModel"]
super().__init__(**kwargs)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str | os.PathLike,
**kwargs,
) -> "MedusaConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs
)
for k in list(config_dict.keys()):
if "num" in k:
if "heads" in k:
config_dict["num_heads"] = config_dict.pop(k)
elif "layers" in k:
config_dict["num_hidden_layers"] = config_dict.pop(k)
return cls.from_dict(config_dict, **kwargs)
@property
def num_attention_heads(self):
return 0
@property
def num_lookahead_tokens(self):
return self.num_heads
@num_lookahead_tokens.setter
def num_lookahead_tokens(self, num_lookahead_tokens: int):
self.num_heads = num_lookahead_tokens
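A sketch of the speculative-decoding accessors (illustrative, not part of the file): `num_lookahead_tokens` is a read/write alias for the number of Medusa heads.

from vllm.transformers_utils.configs import MedusaConfig

cfg = MedusaConfig(num_heads=4)
assert cfg.num_lookahead_tokens == 4
cfg.num_lookahead_tokens = 6  # the setter writes through to num_heads
assert cfg.num_heads == 6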

View File

@@ -1,103 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import PretrainedConfig
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
Qwen2_5OmniTextConfig,
)
class DashengConfig(PretrainedConfig):
model_type = "midashenglm_dasheng_encoder"
def __init__(
self,
embed_dim: int = 768,
outputdim: int = 527,
patch_size: int | tuple[int, int] = 16,
patch_stride: int | tuple[int, int] = 16,
input_channels: int = 1,
target_length: int = 1012,
depth: int = 12,
num_heads: int = 12,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
init_values: float | None = None,
drop_rate: float = 0.0,
attn_drop_rate: float = 0.0,
f_min: float = 0.0,
f_max: float = 8000.0,
center: bool = True,
win_length: int = 512,
hop_length: int = 160,
sample_rate: int = 16000,
n_fft: int = 512,
n_mels: int = 64,
**kwargs,
):
self.embed_dim = embed_dim
self.outputdim = outputdim
self.patch_size = patch_size
self.patch_stride = patch_stride
self.input_channels = input_channels
self.target_length = target_length
self.depth = depth
self.num_heads = num_heads
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.init_values = init_values
self.drop_rate = drop_rate
self.attn_drop_rate = attn_drop_rate
self.f_min = f_min
self.f_max = f_max
self.center = center
self.win_length = win_length
self.hop_length = hop_length
self.sample_rate = sample_rate
self.n_fft = n_fft
self.n_mels = n_mels
super().__init__(**kwargs)
class MiDashengLMConfig(PretrainedConfig):
model_type = "midashenglm"
def __init__(
self,
audio_encoder_config: dict | None = None,
subsample_factor: int = 5,
text_config: dict | None = None,
audio_token_id: int | None = None,
**kwargs,
):
self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
self.subsample_factor = subsample_factor
self.text_config = (
Qwen2_5OmniTextConfig(**text_config)
if text_config
else Qwen2_5OmniTextConfig()
)
self.text_config.rope_scaling = None # uses_mrope is false
self.audio_token_id = audio_token_id
super().__init__(**kwargs)

View File

@@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers import PretrainedConfig, WhisperConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
def adapt_config_dict(config_dict: dict[str, Any], **kwargs) -> PretrainedConfig:
config_dict.update(kwargs)
config_dict = _remap_general_mistral_args(config_dict)
if bool(config_dict.get("quantization")):
config_dict = _remap_mistral_quantization_args(config_dict)
if bool(config_dict.get("moe")):
config_dict["architectures"] = ["MixtralForCausalLM"]
else:
config_dict["architectures"] = ["MistralForCausalLM"]
if bool(config_dict.get("yarn")):
config_dict = _remap_mistral_yarn_args(config_dict)
if bool(config_dict.get("llama_4_scaling")):
llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
assert all(
[
key in config_dict["llama_4_scaling"]
for key in llama_4_scaling_config_keys
]
), (
"llama_4_scaling config should define the keys: "
f"{','.join(llama_4_scaling_config_keys)}"
)
is_vision = (config_dict.get("multimodal") or {}).get(
"vision_encoder_args"
) or config_dict.get("vision_encoder")
is_audio = bool(
((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
"encoder_args"
)
)
assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
if is_vision:
config_dict = _remap_mistral_vision_args(config_dict)
if is_audio:
config_dict = _remap_mistral_audio_args(config_dict)
config = PretrainedConfig.from_dict(config_dict)
logger.debug("Initialized config %s", config)
return config
def _remap_mistral_vision_args(config: dict) -> dict:
if config.get("multimodal"):
vision_config = config.pop("multimodal")
else:
vision_config = config.pop("vision_encoder")
quant_config = config.get("quantization_config")
config = {
"model_type": "pixtral",
"architectures": ["PixtralForConditionalGeneration"],
"text_config": PretrainedConfig.from_dict(config),
"vision_config": PretrainedConfig.from_dict(vision_config),
}
if quant_config:
config["quantization_config"] = quant_config
return config
def _remap_mistral_yarn_args(config: dict) -> dict:
yarn_config_map = {
"factor": "factor",
"original_max_position_embeddings": "original_max_position_embeddings",
"beta": "beta_fast",
"alpha": "beta_slow",
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_scaling"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_scaling"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
return config
def _remap_general_mistral_args(config: dict) -> dict:
# Mistral key -> HF key
config_mapping = {
"dim": "hidden_size",
"norm_eps": "rms_norm_eps",
"n_kv_heads": "num_key_value_heads",
"n_layers": "num_hidden_layers",
"n_heads": "num_attention_heads",
"hidden_dim": "intermediate_size",
}
# HF key -> (Mistral key, default value)
top_level_mapping_with_default = {
"model_type": ("model_type", "transformer"),
"hidden_act": ("activation", "silu"),
"tie_word_embeddings": ("tied_embeddings", False),
"max_seq_len": ("max_seq_len", 128_000),
"max_position_embeddings": ("max_position_embeddings", 128_000),
}
for key, new_key in config_mapping.items():
if key in config:
config[new_key] = config.pop(key)
for new_key, (key, default_value) in top_level_mapping_with_default.items():
config[new_key] = config.pop(key, default_value)
return config
def _remap_mistral_quantization_args(config: dict) -> dict:
quantization = config.get("quantization", {})
if quantization.get("qformat_weight") == "fp8_e4m3":
# This maps to the FP8 static per-tensor quantization scheme
quantization_config = {"quant_method": "fp8", "activation_scheme": "static"}
elif quantization.get("quant_method") == "compressed-tensors":
# Pass through the quantization config to compressed-tensors
quantization_config = quantization
else:
raise ValueError(f"Found unknown quantization='{quantization}' in config")
config["quantization_config"] = quantization_config
return config
def _remap_mistral_audio_args(config: dict) -> dict:
whisper_args = config["multimodal"].pop("whisper_model_args")
encoder_args = whisper_args["encoder_args"]
downsample_args = whisper_args["downsample_args"]
quant_config = config.get("quantization_config")
config = {
"model_type": "whixtral",
"architectures": ["VoxtralForConditionalGeneration"],
"text_config": PretrainedConfig.from_dict(config),
"audio_config": WhisperConfig(
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
window_size=encoder_args["audio_encoding_args"]["window_size"],
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
downsample_factor=downsample_args["downsample_factor"],
d_model=encoder_args["dim"],
encoder_layers=encoder_args["n_layers"],
encoder_ffn_dim=encoder_args["hidden_dim"],
encoder_attention_heads=encoder_args["n_heads"],
vocab_size=encoder_args["vocab_size"],
max_source_positions=encoder_args["max_source_positions"],
is_encoder_decoder=False, # Override WhisperConfig default
),
}
if quant_config:
config["quantization_config"] = quant_config
return config

View File

@@ -1,69 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig
class MLPSpeculatorConfig(PretrainedConfig):
model_type = "mlp_speculator"
attribute_map = {
"hidden_size": "emb_dim",
}
def __init__(
self,
vocab_size: int = 32000,
emb_dim: int = 4096,
inner_dim: int = 0,
n_predict: int = 3,
top_k_tokens_per_head: list[int] | None = None,
n_candidates: int = 5,
tie_weights: bool = False,
scale_input: bool = False,
**kwargs,
):
"""
Initialize an MLPSpeculatorConfig
Args:
vocab_size: int
the model vocab size
emb_dim: int
the model embedding dimension
inner_dim: int
the inner dimension of the model. If 0, will be the emb_dim.
n_predict: int
the number of lookaheads for the speculator
top_k_tokens_per_head: list[int]
Number of tokens to consider from each head when forming the
candidate tree.
For each candidate branch in the tree, head n produces topk[n]
additional sub-branches.
NOTE: This parameter is currently unused.
n_candidates: int
number of child candidates to create per sequence
tie_weights: bool
If true, use a single set of weights for every model
head/stage after the first. The initial projection
from the base model may have a different size, so that
stays separate.
scale_input: bool
if True, will scale the initial hidden states from
the base model.
"""
if top_k_tokens_per_head is None:
top_k_tokens_per_head = [5, 4, 3]
assert len(top_k_tokens_per_head) == n_predict
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.inner_dim = inner_dim
self.n_predict = n_predict
self.top_k_tokens_per_head = top_k_tokens_per_head
self.n_candidates = n_candidates
self.num_lookahead_tokens = n_predict
self.tie_weights = tie_weights
self.scale_input = scale_input
super().__init__(**kwargs)
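# Usage sketch (made-up sizes): `top_k_tokens_per_head` must supply one entry
# per predicted token, i.e. its length must equal `n_predict`:
#
#     cfg = MLPSpeculatorConfig(n_predict=2, top_k_tokens_per_head=[5, 3])
#     # cfg.num_lookahead_tokens == 2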

View File

@@ -1,33 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers.configuration_utils import PretrainedConfig
class MoonViTConfig(PretrainedConfig):
model_type = "moonvit"
def __init__(
self,
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: tuple[int, int] = (2, 2),
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
# Positional embedding config
self.init_pos_emb_height = init_pos_emb_height
self.init_pos_emb_width = init_pos_emb_width
# Transformer config
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
# Patch merger config
self.merge_kernel_size = merge_kernel_size

View File

@@ -1,212 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Nemotron model configuration"""
from transformers import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class NemotronConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a
[`NemotronModel`]. It is used to instantiate a Nemotron model
according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Nemotron-8B.
Configuration objects inherit from [`PretrainedConfig`] and can be
used to control the model outputs. Read the documentation from
[`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 256000):
Vocabulary size of the Nemotron model. Defines the number of
different tokens that can be represented by the
`inputs_ids` passed when calling [`NemotronModel`]
hidden_size (`int`, *optional*, defaults to 6144):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 24576):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 48):
Number of attention heads for each attention layer in the
Transformer decoder.
head_dim (`int`, *optional*):
Projection weights dimension in multi-head attention. Set to
hidden_size // num_attention_heads if None
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use
Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention
(MQA) otherwise GQA is used. When converting a multi-head
checkpoint to a GQA checkpoint, each group key and value
head should be constructed by meanpooling all the original
heads within that group. For more details check out
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
is not specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
The non-linear activation function (function or string) in the
decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used
with.
initializer_range (`float`, *optional*, defaults to 0.0134):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models). Only relevant if
`config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 2):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 3):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output
projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj and down_proj layers in the MLP
layers.
```python
>>> from transformers import NemotronModel, NemotronConfig
>>> # Initializing a Nemotron-8B style configuration
>>> configuration = NemotronConfig()
>>> # Initializing a model from the Nemotron-8B style configuration
>>> model = NemotronModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "nemotron"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=256000,
hidden_size=6144,
intermediate_size=24576,
num_hidden_layers=32,
num_attention_heads=48,
head_dim=None,
num_key_value_heads=None,
hidden_act="relu2",
max_position_embeddings=4096,
initializer_range=0.0134,
norm_eps=1e-5,
use_cache=True,
pad_token_id=None,
bos_token_id=2,
eos_token_id=3,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
partial_rotary_factor=0.5,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
head_dim = head_dim or kwargs.get("kv_channels")
self.head_dim = (
head_dim if head_dim is not None else (hidden_size // num_attention_heads)
)
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.norm_eps = norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# for backward compatibility
partial_rotary_factor = (
kwargs.get("rope_percent")
or kwargs.get("rope_percentage")
or partial_rotary_factor
)
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, "
f"`type` and `factor`, got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
"`rope_scaling`'s type field must be one of ['linear', "
f"'dynamic'], got {rope_scaling_type}"
)
if (
rope_scaling_factor is None
or not isinstance(rope_scaling_factor, float)
or rope_scaling_factor <= 1.0
):
raise ValueError(
"`rope_scaling`'s factor field must be a float > 1, got "
f"{rope_scaling_factor}"
)
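# Validation sketch (illustrative values): only "linear" or "dynamic" scaling
# with a float factor > 1 passes `_rope_scaling_validation`:
#
#     NemotronConfig(rope_scaling={"type": "linear", "factor": 2.0})  # ok
#     NemotronConfig(rope_scaling={"type": "yarn", "factor": 2.0})  # ValueError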

View File

@@ -1,282 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NemotronH model configuration"""
import regex as re
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class NemotronHConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a
[`NemotronHModel`]. It is used to instantiate a NemotronH model according
to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a similar configuration to
that of the NemotronH-v0.1 model.
Args:
vocab_size (`int`, *optional*, defaults to 131072):
Vocabulary size of the NemotronH model. Defines the number of
different tokens that can be represented by the `inputs_ids`
passed when calling [`NemotronHModel`]
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be
tied. Note that this is only relevant if the model has an output
word embedding layer.
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 21504):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 52):
Number of hidden layers in the Transformer encoder.
hybrid_override_pattern (`str`, *optional*, defaults to
`"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
The pattern of the hybrid model. The pattern is a string of
characters where each character represents
M: Mamba2, *: Attention, -: MLP
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer encoder.
attention_head_dim (`int`, *optional*, defaults to 128):
Dimension of each attention head.
num_key_value_heads (`int`, *optional*, defaults to 8):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use
Multi Head Attention (MHA), if `num_key_value_heads=1` the model
will use Multi Query Attention (MQA) otherwise GQA is used.
mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
The non-linear activation function in the MLP layers.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in attention layers.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in MLP layers.
use_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in the model.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
residual_in_fp32 (`bool`, *optional*, defaults to `False`):
Whether or not residuals should be in `float32`. If set to `False`
residuals will keep the same `dtype` as the rest of the model.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models). Only relevant if
`config.is_decoder=True`.
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
Number of prompt logits to calculate during generation. If `None`,
all logits will be calculated. If an integer value, only the last
`num_logits_to_keep` logits will be calculated.
pad_token_id (`int`, *optional*, defaults to 0):
The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 1):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
sliding_window (`int`, *optional*, defaults to None):
Sliding window attention window size.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used
with.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
hidden_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the hidden states.
use_mamba_kernels (`bool`, *optional*, defaults to `True`):
Flag indicating whether or not to use the fast mamba kernels.
These are available only if `mamba-ssm` and `causal-conv1d`
are installed, and the mamba modules are running on a CUDA device.
ssm_state_size (`int`, *optional*, defaults to 128):
The dimension of the mamba state space latents.
mamba_num_heads (`int`, *optional*, defaults to 128):
Number of heads in Mamba layers.
mamba_n_groups (`int`, *optional*, defaults to 8):
Number of groups in Mamba layers.
mamba_head_dim (`int`, *optional*, defaults to 64):
Dimension of each Mamba head.
mamba_d_conv (`int`, *optional*, defaults to 4):
The size of the mamba convolution kernel.
mamba_expand (`int`, *optional*, defaults to 2):
Expanding factor used to determine the mamba intermediate size.
mamba_hidden_act (`str`, *optional*, defaults to "silu"):
The non-linear activation function in the Mamba layers.
mamba_dt_min (`float`, *optional*, defaults to 0.001):
Minimum value for the time step in Mamba.
mamba_dt_max (`float`, *optional*, defaults to 0.1):
Maximum value for the time step in Mamba.
mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
Limits for the time step in Mamba.
mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
Floor value for time step initialization in Mamba.
mamba_conv_bias (`bool`, *optional*, defaults to `True`):
Whether to use bias in the convolution layer of the mamba mixer
block.
mamba_proj_bias (`bool`, *optional*, defaults to `False`):
Whether to use bias in the input and output projections of the
mamba mixer block.
mamba_chunk_size (`int`, *optional*, defaults to 256):
Size of chunks for Mamba processing.
rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
Whether to rescale the pre-normalization residual connections.
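n_routed_experts (`int`, *optional*, defaults to 8):
Number of routed experts in the MoE layers.
n_shared_experts (`int`, *optional*, defaults to 1):
Number of shared experts in the MoE layers.
moe_intermediate_size (`int`, *optional*, defaults to 7688):
Intermediate size of each routed expert.
moe_shared_expert_intermediate_size (`int`, *optional*, defaults to 7688):
Intermediate size of the shared experts.
num_experts_per_tok (`int`, *optional*, defaults to 2):
Number of routed experts selected per token.
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
Scaling factor applied to the routed expert outputs.
n_group (`int`, *optional*, defaults to 1):
Number of expert groups for group-limited routing.
topk_group (`int`, *optional*, defaults to 1):
Number of groups selected per token in group-limited routing.
norm_topk_prob (`bool`, *optional*, defaults to `True`):
Whether to normalize the top-k routing probabilities.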
"""
model_type = "nemotron_h"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=131072,
tie_word_embeddings=False,
hidden_size=4096,
intermediate_size=21504,
num_hidden_layers=52,
hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
num_attention_heads=32,
head_dim=128,
num_key_value_heads=8, # nemo: num_query_groups
mlp_hidden_act="relu2",
attention_bias=False,
mlp_bias=False,
use_bias=False,
initializer_range=0.02, # nemo: init_method_std
layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
residual_in_fp32=False, # Megatron Core default value
use_cache=True,
num_logits_to_keep=1,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
sliding_window=None,
max_position_embeddings=4096,
attention_dropout=0.0,
hidden_dropout=0.0, # * ADDED
use_mamba_kernels=True,
ssm_state_size=128, # mamba_state_size
mamba_num_heads=128,
mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads
mamba_head_dim=64,
mamba_d_conv=4,
mamba_expand=2,
mamba_hidden_act="silu",
mamba_dt_min=0.001,
mamba_dt_max=0.1,
mamba_dt_limit=(0.0, float("inf")),
mamba_dt_init_floor=1e-4,
mamba_conv_bias=True,
mamba_proj_bias=False,
mamba_chunk_size=256,
rescale_prenorm_residual=True,
n_routed_experts=8,
n_shared_experts=1,
moe_intermediate_size=7688,
moe_shared_expert_intermediate_size=7688,
num_experts_per_tok=2,
routed_scaling_factor=1.0,
n_group=1,
topk_group=1,
norm_topk_prob=True,
**kwargs,
):
self.vocab_size = vocab_size
self.tie_word_embeddings = tie_word_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.hybrid_override_pattern = hybrid_override_pattern
self.num_attention_heads = num_attention_heads
self.head_dim = head_dim
self.sliding_window = sliding_window
self.max_position_embeddings = max_position_embeddings
self.attention_dropout = attention_dropout
self.hidden_dropout = hidden_dropout
# Validate hybrid_override_pattern
# M: Mamba2, *: Attention, -: MLP
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
"hybrid_override_pattern must have same length as num_hidden_layers"
)
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
"hybrid_override_pattern must only contain characters 'M', '*', or '-'"
)
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.mlp_hidden_act = mlp_hidden_act
self.attention_bias = attention_bias
self.mlp_bias = mlp_bias
self.use_bias = use_bias
self.initializer_range = initializer_range
self.layer_norm_epsilon = layer_norm_epsilon
self.residual_in_fp32 = residual_in_fp32
self.use_cache = use_cache
self.num_logits_to_keep = num_logits_to_keep
self.use_mamba_kernels = use_mamba_kernels
self.n_groups = mamba_n_groups
self.mamba_head_dim = mamba_head_dim
self.ssm_state_size = ssm_state_size
self.mamba_num_heads = mamba_num_heads
self.conv_kernel = mamba_d_conv
self.expand = mamba_expand
self.mamba_hidden_act = mamba_hidden_act
self.time_step_min = mamba_dt_min
self.time_step_max = mamba_dt_max
self.time_step_limit = mamba_dt_limit
self.time_step_floor = mamba_dt_init_floor
self.use_conv_bias = mamba_conv_bias
self.mamba_proj_bias = mamba_proj_bias
self.chunk_size = mamba_chunk_size
self.rescale_prenorm_residual = rescale_prenorm_residual
self.n_routed_experts = n_routed_experts
self.n_shared_experts = n_shared_experts
self.moe_intermediate_size = moe_intermediate_size
self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size # noqa: E501
self.num_experts_per_tok = num_experts_per_tok
self.routed_scaling_factor = routed_scaling_factor
self.n_group = n_group
self.topk_group = topk_group
self.norm_topk_prob = norm_topk_prob
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@property
def layers_block_type(self):
return [
"mamba"
if self.hybrid_override_pattern[i] == "M"
else "attention"
if self.hybrid_override_pattern[i] == "*"
else "mlp"
if self.hybrid_override_pattern[i] == "-"
else "moe"
for i in range(self.num_hidden_layers)
]
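# Illustrative sketch: `layers_block_type` expands the override pattern
# character by character, e.g. for a hypothetical 4-layer pattern:
#
#     cfg = NemotronHConfig(num_hidden_layers=4, hybrid_override_pattern="M-M*")
#     # cfg.layers_block_type == ["mamba", "mlp", "mamba", "attention"]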

View File

@@ -1,79 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class Olmo3Config(PretrainedConfig):
model_type = "olmo3"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50304,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
use_cache=True,
pad_token_id=1,
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
sliding_window=4096,
layer_types=None,
**kwargs,
):
# This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
# in vLLM.
if "architectures" not in kwargs:
kwargs["architectures"] = ["Olmo2ForCausalLM"]
elif "Olmo3ForCausalLM" in kwargs["architectures"]:
kwargs["architectures"].remove("Olmo3ForCausalLM")
kwargs["architectures"].append("Olmo2ForCausalLM")
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
self.sliding_window = sliding_window
self.layer_types = layer_types
if self.layer_types is None:
self.layer_types = [
"sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
for i in range(self.num_hidden_layers)
]
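# Layer-type sketch: with the default `layer_types=None`, every fourth layer
# is full attention and the rest use sliding-window attention, e.g. for a
# hypothetical 8-layer model:
#
#     cfg = Olmo3Config(num_hidden_layers=8)
#     # cfg.layer_types == (["sliding_attention"] * 3 + ["full_attention"]) * 2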

View File

@@ -1,182 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
# Ovis Config with AimV2 config registration removed for Transformers compatibility
from typing import Any
from transformers import AutoConfig, PretrainedConfig
class AIMv2Config(PretrainedConfig):
"""This is the configuration class to store the configuration of an [`AIMv2Model`].
Instantiating a configuration with the defaults will yield a similar configuration
to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
Args:
hidden_size: Dimension of the hidden representations.
intermediate_size: Dimension of the SwiGLU representations.
num_hidden_layers: Number of hidden layers in the Transformer.
num_attention_heads: Number of attention heads for each attention layer
in the Transformer.
num_channels: Number of input channels.
image_size: Image size.
patch_size: Patch size.
rms_norm_eps: Epsilon value used for the RMS normalization layer.
attention_dropout: Dropout ratio for attention probabilities.
projection_dropout: Dropout ratio for the projection layer after the attention.
qkv_bias: Whether to add a bias to the queries, keys and values.
use_bias: Whether to add a bias in the feed-forward and projection layers.
kwargs: Keyword arguments for the [`PretrainedConfig`].
"""
model_type: str = "aimv2"
def __init__(
self,
hidden_size: int = 1024,
intermediate_size: int = 2816,
num_hidden_layers: int = 24,
num_attention_heads: int = 8,
num_channels: int = 3,
image_size: int = 224,
patch_size: int = 14,
rms_norm_eps: float = 1e-5,
attention_dropout: float = 0.0,
projection_dropout: float = 0.0,
qkv_bias: bool = False,
use_bias: bool = False,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
self.projection_dropout = projection_dropout
self.qkv_bias = qkv_bias
self.use_bias = use_bias
# ----------------------------------------------------------------------
# Visual Tokenizer Configuration
# ----------------------------------------------------------------------
class BaseVisualTokenizerConfig(PretrainedConfig):
def __init__(
self,
vocab_size=16384,
tokenize_function="softmax",
tau=1.0,
depths=None,
drop_cls_token=False,
backbone_config: PretrainedConfig | dict | None = None,
hidden_stride: int = 1,
**kwargs,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.tokenize_function = tokenize_function
self.tau = tau
if isinstance(depths, str):
depths = [int(x) for x in depths.split("|")]
self.depths = depths
self.backbone_kwargs = dict[str, Any]()
self.drop_cls_token = drop_cls_token
if backbone_config is not None:
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
)
if not isinstance(backbone_config, PretrainedConfig):
model_type = backbone_config["model_type"]
if model_type != "aimv2":
backbone_config.pop("model_type")
backbone_config = AutoConfig.for_model(
model_type, **backbone_config
)
else:
backbone_config = AIMv2Config(**backbone_config)
self.backbone_config = backbone_config
self.hidden_stride = hidden_stride
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
model_type = "aimv2_visual_tokenizer"
def __init__(self, **kwargs):
super().__init__(**kwargs)
if self.drop_cls_token:
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
model_type = "siglip_visual_tokenizer"
def __init__(self, **kwargs):
super().__init__(**kwargs)
if self.drop_cls_token:
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
# ----------------------------------------------------------------------
# Ovis Configuration
# ----------------------------------------------------------------------
class OvisConfig(PretrainedConfig):
model_type = "ovis"
def __init__(
self,
llm_config: PretrainedConfig | dict | None = None,
visual_tokenizer_config: PretrainedConfig | dict | None = None,
multimodal_max_length=8192,
hidden_size=None,
conversation_formatter_class=None,
llm_attn_implementation=None,
disable_tie_weight=False,
**kwargs,
):
super().__init__(**kwargs)
if llm_config is not None:
assert isinstance(llm_config, (PretrainedConfig, dict)), (
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
)
if not isinstance(llm_config, PretrainedConfig):
model_type = llm_config["model_type"]
llm_config.pop("model_type")
llm_config = AutoConfig.for_model(model_type, **llm_config)
# map llm_config to text_config
self.text_config = llm_config
if visual_tokenizer_config is not None:
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
)
if not isinstance(visual_tokenizer_config, PretrainedConfig):
model_type = visual_tokenizer_config["model_type"]
visual_tokenizer_config.pop("model_type")
visual_tokenizer_config = AutoConfig.for_model(
model_type, **visual_tokenizer_config
)
self.visual_tokenizer_config = visual_tokenizer_config
self.multimodal_max_length = multimodal_max_length
self.hidden_size = hidden_size
self.conversation_formatter_class = conversation_formatter_class
self.llm_attn_implementation = llm_attn_implementation
self.disable_tie_weight = disable_tie_weight

View File

@@ -1,274 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3-Next model configuration"""
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
logger = logging.get_logger(__name__)
class Qwen3NextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
Qwen3-Next model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of
Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 151936):
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
`inputs_ids`.
hidden_size (`int`, *optional*, defaults to 2048):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 5632):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 2):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
hidden_act (`str`, *optional*, defaults to `"silu"`):
The non-linear activation function in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 32768):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
partial_rotary_factor (`float`, *optional*, defaults to 0.25):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
head_dim (`int`, *optional*, defaults to 256):
Projection weights dimension in multi-head attention.
linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
Kernel size of the convolution used in linear attention layers.
linear_key_head_dim (`int`, *optional*, defaults to 128):
Dimension of each key head in linear attention.
linear_value_head_dim (`int`, *optional*, defaults to 128):
Dimension of each value head in linear attention.
linear_num_key_heads (`int`, *optional*, defaults to 16):
Number of key heads used in linear attention layers.
linear_num_value_heads (`int`, *optional*, defaults to 32):
Number of value heads used in linear attention layers.
decoder_sparse_step (`int`, *optional*, defaults to 1):
The frequency of the MoE layer.
moe_intermediate_size (`int`, *optional*, defaults to 512):
Intermediate size of the routed expert.
shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
Intermediate size of the shared expert.
num_experts_per_tok (`int`, *optional*, defaults to 10):
Number of selected experts.
num_experts (`int`, *optional*, defaults to 512):
Number of routed experts.
norm_topk_prob (`bool`, *optional*, defaults to `True`):
Whether to normalize the topk probabilities.
output_router_logits (`bool`, *optional*, defaults to `False`):
Whether or not the router logits should be returned by the model. Enabling this will also
allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
The aux loss factor for the total loss.
mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock.
The list contains layer indices from 0 to num_layers-1.
If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
layer_types (`list[str]`, *optional*):
Types of each layer (attention or linear).
```python
>>> from transformers import Qwen3NextModel, Qwen3NextConfig
>>> # Initializing a Qwen3Next style configuration
>>> configuration = Qwen3NextConfig()
>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
>>> model = Qwen3NextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
""" # noqa: E501
model_type = "qwen3_next"
keys_to_ignore_at_inference = ["past_key_values"]
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.experts.*.gate_proj": "colwise",
"layers.*.mlp.experts.*.up_proj": "colwise",
"layers.*.mlp.experts.*.down_proj": "rowwise",
"layers.*.mlp.shared_experts.gate_proj": "colwise",
"layers.*.mlp.shared_experts.up_proj": "colwise",
"layers.*.mlp.shared_experts.down_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
vocab_size=151936,
hidden_size=2048,
intermediate_size=5632,
num_hidden_layers=48,
num_attention_heads=16,
num_key_value_heads=2,
hidden_act="silu",
max_position_embeddings=32768,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
partial_rotary_factor=0.25,
attention_bias=False,
attention_dropout=0.0,
head_dim=256,
linear_conv_kernel_dim=4,
linear_key_head_dim=128,
linear_value_head_dim=128,
linear_num_key_heads=16,
linear_num_value_heads=32,
decoder_sparse_step=1,
moe_intermediate_size=512,
shared_expert_intermediate_size=512,
num_experts_per_tok=10,
num_experts=512,
norm_topk_prob=True,
output_router_logits=False,
router_aux_loss_coef=0.001,
mlp_only_layers=None,
layer_types=None,
**kwargs,
):
if mlp_only_layers is None:
mlp_only_layers = []
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.partial_rotary_factor = partial_rotary_factor
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.head_dim = head_dim
rope_config_validation(self)
self.layer_types = layer_types
if self.layer_types is None:
self.layer_types = [
"linear_attention" if bool((i + 1) % 4) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
# linear attention part
self.linear_conv_kernel_dim = linear_conv_kernel_dim
self.linear_key_head_dim = linear_key_head_dim
self.linear_value_head_dim = linear_value_head_dim
self.linear_num_key_heads = linear_num_key_heads
self.linear_num_value_heads = linear_num_value_heads
# MoE arguments
self.decoder_sparse_step = decoder_sparse_step
self.moe_intermediate_size = moe_intermediate_size
self.shared_expert_intermediate_size = shared_expert_intermediate_size
self.num_experts_per_tok = num_experts_per_tok
self.num_experts = num_experts
self.norm_topk_prob = norm_topk_prob
self.output_router_logits = output_router_logits
self.router_aux_loss_coef = router_aux_loss_coef
self.mlp_only_layers = mlp_only_layers
__all__ = ["Qwen3NextConfig"]
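# Layer-type sketch: by default every fourth layer is full attention and the
# rest are linear attention, e.g. for a hypothetical 8-layer model:
#
#     cfg = Qwen3NextConfig(num_hidden_layers=8)
#     # cfg.layer_types == (["linear_attention"] * 3 + ["full_attention"]) * 2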

View File

@@ -1,89 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Radio vision model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
"vit_small_patch16_224": (384, 12, 6, 1536),
"vit_base_patch16_224": (768, 12, 12, 3072),
"vit_large_patch16_224": (1024, 24, 16, 4096),
"vit_huge_patch16_224": (1280, 32, 16, 5120),
}
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
class RadioConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a Radio
vision model. It is used to instantiate a Radio model according to the
specified arguments, defining the model architecture.
Args:
model_name: Name of the vision transformer model
(e.g., "vit_base_patch16_224"). Used to determine architecture
dimensions from `VIT_TIMM_DIM_BY_NAME`.
image_size: The size (resolution) of each image.
patch_size: The size (resolution) of each patch.
qkv_bias: Whether to add a bias to the queries, keys and values.
qk_normalization: Whether to apply normalization to queries and keys.
norm_type: The normalization type to use.
layer_norm_eps: The epsilon used by the layer normalization layers.
initializer_factor: A factor for initializing all weight matrices.
hidden_act: The non-linear activation function in the encoder.
max_img_size: Maximum image size for position embeddings.
norm_mean: Mean values for image normalization (RGB channels).
Defaults to (0.48145466, 0.4578275, 0.40821073).
norm_std: Standard deviation values for image normalization
(RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
reg_tokens: Number of register tokens to use.
"""
model_type = "radio"
def __init__(
self,
model_name: str,
image_size: int = 224,
patch_size: int = 16,
qkv_bias: bool = True,
qk_normalization: bool = False,
norm_type: str = "layer_norm",
layer_norm_eps: float = 1e-6,
initializer_factor: float = 1.0,
hidden_act: str = "gelu",
max_img_size: int = 2048,
norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
reg_tokens: int | None = None,
**kwargs,
):
self.model_name = model_name
(
self.hidden_size,
self.num_hidden_layers,
self.num_attention_heads,
self.intermediate_size,
) = VIT_TIMM_DIM_BY_NAME[model_name]
self.image_size = image_size
self.patch_size = patch_size
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.norm_type = norm_type
self.layer_norm_eps = layer_norm_eps
self.initializer_factor = initializer_factor
self.hidden_act = hidden_act
self.max_img_size = max_img_size
self.norm_mean = (
list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
)
self.norm_std = (
list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
)
self.reg_tokens = reg_tokens
super().__init__(**kwargs)
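# Usage sketch: the backbone dimensions are looked up from the timm model
# name, so only `model_name` is required:
#
#     cfg = RadioConfig(model_name="vit_base_patch16_224")
#     # (cfg.hidden_size, cfg.num_hidden_layers,
#     #  cfg.num_attention_heads, cfg.intermediate_size) == (768, 12, 12, 3072)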

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

View File

@@ -1,38 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
SUPPORTED_SPECULATORS_TYPES = {}
def register_speculator(name):
def decorator(fn):
SUPPORTED_SPECULATORS_TYPES[name] = fn
return fn
return decorator
@register_speculator("eagle3")
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
"""
Apply Eagle-3 specific configuration transformations.
Eagle-3 specific fields:
- draft_vocab_size: Size of the draft model's vocabulary
- target_hidden_size: Hidden size of the target model
- norm_before_residual: Whether to apply norm before residual connection
- eagle_aux_hidden_state_layer_ids: List of layer indices from the base
model to use as auxiliary inputs for the Eagle3 drafter. These layers
provide intermediate hidden states that help the drafter make better
predictions. This is the standard field used in Eagle3 checkpoints.
"""
vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
if config_dict.get("target_hidden_size") is not None:
vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
if config_dict.get("eagle_aux_hidden_state_layer_ids"):
vllm_config["eagle_aux_hidden_state_layer_ids"] = config_dict[
"eagle_aux_hidden_state_layer_ids"
]
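# Illustrative sketch (made-up values): the updater mutates the vLLM config
# dict in place:
#
#     vllm_cfg = {}
#     update_eagle3({"draft_vocab_size": 32000}, vllm_cfg)
#     # vllm_cfg == {"draft_vocab_size": 32000,
#     #              "norm_before_residual": True,
#     #              "architectures": ["Eagle3LlamaForCausalLM"]}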

View File

@@ -1,114 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Any
from transformers import PretrainedConfig
from vllm.transformers_utils.configs.speculators.algos import (
SUPPORTED_SPECULATORS_TYPES,
)
__all__ = ["SpeculatorsConfig"]
class SpeculatorsConfig(PretrainedConfig):
model_type = "speculators"
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str | os.PathLike,
**kwargs,
) -> "SpeculatorsConfig":
"""Load speculators Eagle config and convert to vLLM format."""
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
vllm_config = cls.extract_vllm_speculative_config(config_dict)
return cls(**vllm_config)
@classmethod
def extract_vllm_speculative_config(
cls, config_dict: dict[str, Any]
) -> dict[str, Any]:
speculators_model_type = config_dict.get("speculators_model_type")
if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
raise ValueError(
f"Unsupported speculators_model_type {speculators_model_type!r}; "
f"expected one of: {list(SUPPORTED_SPECULATORS_TYPES)}. "
"Please ensure you're loading a speculators-format model."
)
# validate fields
# TODO: @dsikka - use speculators pydantic model to validate
cls.validate_speculators_config(config_dict=config_dict)
# Convert from speculators config -> format that can be ingested by vLLM
vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
# Apply anything specific to the supported algorithm
algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
algo_updater(config_dict=config_dict, vllm_config=vllm_config)
return vllm_config
@classmethod
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
try:
spec_config = config_dict["speculators_config"]
methods = spec_config["proposal_methods"]
first_method = methods[0]
_ = first_method["speculative_tokens"]
_ = spec_config["verifier"]["name_or_path"]
_ = config_dict["speculators_model_type"]
except (KeyError, IndexError, TypeError) as e:
raise ValueError("Invalid speculators config structure") from e
if "transformer_layer_config" not in config_dict:
raise ValueError("Must provide transformer_layer_config")
if not isinstance(config_dict["transformer_layer_config"], dict):
raise TypeError(
"'transformer_layer_config' must be a dictionary if provided"
)
@classmethod
def build_vllm_speculative_config(
cls, config_dict: dict[str, Any]
) -> dict[str, Any]:
"""
Build vLLM-compatible speculative configuration from speculators format.
This method extracts and transforms speculative configuration from the
speculators format into the structure expected by vLLM.
Args:
config_dict: Configuration dictionary in speculators format
Returns:
Dictionary with vLLM-compatible speculative configuration
"""
# Extract speculators configuration
spec_config = config_dict["speculators_config"]
# Currently we only support one proposal method
proposal_methods = spec_config.get("proposal_methods")
if not proposal_methods:
raise ValueError("No proposal methods found in speculators config")
first_method = proposal_methods[0]
num_speculative_tokens = first_method.get("speculative_tokens")
if num_speculative_tokens is None:
raise ValueError(
f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
)
# Build base vLLM speculative configuration
vllm_config = {
"method": config_dict.get("speculators_model_type"),
"num_speculative_tokens": num_speculative_tokens,
"target_model": spec_config.get("verifier")["name_or_path"],
}
# Merge transformer layer configuration if present
transformer_config = config_dict.get("transformer_layer_config", {})
vllm_config.update(transformer_config)
return vllm_config
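# Illustrative sketch (minimal made-up speculators config): the nested
# speculators layout is flattened into vLLM's speculative config:
#
#     cfg = {
#         "speculators_model_type": "eagle3",
#         "speculators_config": {
#             "proposal_methods": [{"speculative_tokens": 3}],
#             "verifier": {"name_or_path": "some/verifier"},
#         },
#         "transformer_layer_config": {"hidden_size": 4096},
#     }
#     out = SpeculatorsConfig.build_vllm_speculative_config(cfg)
#     # out == {"method": "eagle3", "num_speculative_tokens": 3,
#     #         "target_model": "some/verifier", "hidden_size": 4096}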

View File

@@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class Step3VisionEncoderConfig(PretrainedConfig):
model_type = "step3_vision_encoder"
def __init__(
self,
hidden_size=1792,
intermediate_size=3072,
output_hidden_size=4096,
num_hidden_layers=63,
num_attention_heads=16,
num_channels=3,
image_size=728,
patch_size=14,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
**kwargs,
):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.output_hidden_size = output_hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
super().__init__(**kwargs)
class Step3TextConfig(PretrainedConfig):
model_type = "step3_text"
architectures = ["Step3TextForCausalLM"]
def __init__(
self,
hidden_size: int = 7168,
intermediate_size: int = 18432,
num_attention_heads: int = 64,
num_attention_groups: int = 1,
num_hidden_layers: int = 61,
max_seq_len: int = 65536,
vocab_size: int = 128815,
rms_norm_eps: float = 1e-5,
moe_intermediate_size: int = 5120,
moe_num_experts: int = 48,
moe_top_k: int = 3,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
max_position_embedding: int = 65536,
share_expert_dim: int = 5120,
share_q_dim: int = 2048,
head_dim: int = 256,
norm_expert_weight: bool = False,
moe_layers_enum: tuple[int, ...] = tuple(range(4, 60)),  # MoE layers 4..59
**kwargs,
) -> None:
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.num_attention_groups = num_attention_groups
self.num_hidden_layers = num_hidden_layers
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.rms_norm_eps = rms_norm_eps
self.moe_intermediate_size = moe_intermediate_size
self.moe_num_experts = moe_num_experts
self.moe_top_k = moe_top_k
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.max_position_embedding = max_position_embedding
self.share_expert_dim = share_expert_dim
self.share_q_dim = share_q_dim
self.head_dim = head_dim
self.norm_expert_weight = norm_expert_weight
self.moe_layers_enum = moe_layers_enum
super().__init__(**kwargs)
class Step3VLConfig(PretrainedConfig):
model_type = "step3_vl"
def __init__(
self,
vision_config: dict | Step3VisionEncoderConfig | None = None,
text_config: dict | Step3TextConfig | None = None,
understand_projector_stride: int = 1,
projector_bias: bool = True,
image_token_id: int = 128001,
**kwargs,
) -> None:
if vision_config is None:
vision_config = Step3VisionEncoderConfig()
elif isinstance(vision_config, dict):
vision_config = Step3VisionEncoderConfig(**vision_config)
self.vision_config = vision_config
if text_config is None:
text_config = Step3TextConfig()
elif isinstance(text_config, dict):
text_config = Step3TextConfig(**text_config)
self.text_config = text_config
self.understand_projector_stride = understand_projector_stride
self.projector_bias = projector_bias
self.hidden_size = text_config.hidden_size
self.image_token_id = image_token_id
super().__init__(**kwargs)

View File

@@ -1,118 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
from typing import Any
import transformers
class UltravoxConfig(transformers.PretrainedConfig):
r"""
This is the configuration class to store the configuration of a
[`UltravoxForConditionalGeneration`]. It is used to instantiate an
Ultravox model according to the specified arguments, defining the model
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to
control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
audio_config (`Union[AutoConfig, dict]`, *optional*):
Custom audio config or dict.
text_config (`Union[AutoConfig, dict]`, *optional*):
The config object of the text backbone.
audio_model_id (`str`, *optional*):
The model ID of the audio backbone.
text_model_id (`str`, *optional*):
The model ID of the text backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
audio_token_index (`int`, *optional*, defaults to 32000):
The audio token index to encode the audio prompt.
stack_factor (`int`, *optional*, defaults to 8):
Audio downsampling factor for the multimodal projector.
norm_init (`float`, *optional*, defaults to 0.4):
The initialization value for the layer normalization.
projector_act (`str`, *optional*, defaults to `"swiglu"`):
The activation function used by the multimodal projector.
projector_ln_mid (`bool`, *optional*, defaults to `False`):
Whether to apply layer normalization at the middle of the
projector or at the end. Versions v0.4.1 and below
use `False`, but v0.5 and above use `True`.
"""
wrapped_model_config: transformers.PretrainedConfig
model_type = "ultravox"
audio_token = "<|audio|>"
is_composition = False
def __init__(
self,
audio_config: dict[str, Any] | None = None,
text_config: dict[str, Any] | None = None,
audio_model_id: str | None = None,
text_model_id: str | None = None,
ignore_index: int = -100,
audio_token_index: int = 32000,
hidden_size: int = 4096,
stack_factor: int = 8,
norm_init: float = 0.4,
projector_act: str = "swiglu",
projector_ln_mid: bool = False,
**kwargs,
):
self.ignore_index = ignore_index
self.audio_token_index = audio_token_index
self.hidden_size = hidden_size
self.stack_factor = stack_factor
self.norm_init = norm_init
self.projector_act = projector_act
self.projector_ln_mid = projector_ln_mid
# N.B. May set the wrapped_model_config below.
self.text_model_id = text_model_id
if text_model_id is None:
text_config = text_config or {}
self.wrapped_model_config = transformers.CONFIG_MAPPING[
text_config.get("model_type", "llama")
](**text_config)
# N.B. May set the audio_config below.
self.audio_model_id = audio_model_id
if audio_model_id is None:
audio_config = audio_config or {}
self.audio_config = transformers.CONFIG_MAPPING[
audio_config.get("model_type", "whisper")
](**audio_config)
super().__init__(**kwargs)
def __setattr__(self, key, value):
# Since --hf-overrides are applied _after_ the UltravoxConfig is
# instantiated, load the configs implicitly when assigning text_model_id
# or audio_model_id. This allows:
#
# --hf-overrides.text_model_id=<quantized variant>
#
# to behave as intended.
if key == "text_model_id" and value is not None:
from vllm.transformers_utils.config import get_config
self.wrapped_model_config = get_config(value, trust_remote_code=False)
elif key == "audio_model_id" and value is not None:
from vllm.transformers_utils.config import get_config
self.audio_config = get_config(value, trust_remote_code=False)
return super().__setattr__(key, value)
@property
def text_config(self) -> transformers.PretrainedConfig:
# When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
# the full model, but the text config is the text config of the inner
# model.
return self.wrapped_model_config.get_text_config()
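# Override sketch: because of `__setattr__` above, assigning a model id after
# construction reloads the wrapped config, so an hf-override such as
#
#     config.text_model_id = "some-org/quantized-llama"  # hypothetical repo
#
# transparently replaces `wrapped_model_config` (and therefore `text_config`).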