Add minimal vLLM 0.16.1 build repo for BI-V150

2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions
--- a/vllm/transformers_utils/configs/init.py
+++ b/vllm/transformers_utils/configs/init.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Model configs may be defined in this directory for the following reasons:
+
+- There is no configuration file defined by HF Hub or Transformers library.
+- There is a need to override the existing config to support vLLM.
+- The HF model_type isn't recognized by the Transformers library but can
+  be mapped to an existing Transformers config, such as
+  deepseek-ai/DeepSeek-V3.2-Exp.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+_CLASS_TO_MODULE: dict[str, str] = {
+    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
+    "BagelConfig": "vllm.transformers_utils.configs.bagel",
+    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
+    "ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
+    "ColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "OpsColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "Qwen3VLNemotronEmbedConfig": "vllm.transformers_utils.configs.colqwen3",
+    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
+    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
+    "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
+    "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
+    "FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
+    "FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
+    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "IsaacConfig": "vllm.transformers_utils.configs.isaac",
+    # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+    # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+    # `FalconConfig` class from the official HuggingFace transformers library.
+    "RWConfig": "vllm.transformers_utils.configs.falcon",
+    "JAISConfig": "vllm.transformers_utils.configs.jais",
+    "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
+    "MedusaConfig": "vllm.transformers_utils.configs.medusa",
+    "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
+    "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
+    "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
+    "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
+    "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
+    "KimiK25Config": "vllm.transformers_utils.configs.kimi_k25",
+    "NemotronConfig": "vllm.transformers_utils.configs.nemotron",
+    "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
+    "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
+    "OvisConfig": "vllm.transformers_utils.configs.ovis",
+    "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
+    "RadioConfig": "vllm.transformers_utils.configs.radio",
+    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
+    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
+    "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3p5Config": "vllm.transformers_utils.configs.step3p5",
+    "Qwen3ASRConfig": "vllm.transformers_utils.configs.qwen3_asr",
+    "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
+    "Qwen3_5Config": "vllm.transformers_utils.configs.qwen3_5",
+    "Qwen3_5TextConfig": "vllm.transformers_utils.configs.qwen3_5",
+    "Qwen3_5MoeConfig": "vllm.transformers_utils.configs.qwen3_5_moe",
+    "Qwen3_5MoeTextConfig": "vllm.transformers_utils.configs.qwen3_5_moe",
+    "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
+    # Special case: DeepseekV3Config is from HuggingFace Transformers
+    "DeepseekV3Config": "transformers",
+}
+
+__all__ = [
+    "AfmoeConfig",
+    "BagelConfig",
+    "ChatGLMConfig",
+    "ColModernVBertConfig",
+    "ColQwen3Config",
+    "OpsColQwen3Config",
+    "Qwen3VLNemotronEmbedConfig",
+    "DeepseekVLV2Config",
+    "DeepseekV3Config",
+    "DotsOCRConfig",
+    "EAGLEConfig",
+    "FlexOlmoConfig",
+    "FunAudioChatConfig",
+    "FunAudioChatAudioEncoderConfig",
+    "HunYuanVLConfig",
+    "HunYuanVLTextConfig",
+    "HunYuanVLVisionConfig",
+    "IsaacConfig",
+    "RWConfig",
+    "JAISConfig",
+    "Lfm2MoeConfig",
+    "MedusaConfig",
+    "MiDashengLMConfig",
+    "MLPSpeculatorConfig",
+    "MoonViTConfig",
+    "KimiLinearConfig",
+    "KimiVLConfig",
+    "KimiK25Config",
+    "NemotronConfig",
+    "NemotronHConfig",
+    "Olmo3Config",
+    "OvisConfig",
+    "PixelShuffleSiglip2VisionConfig",
+    "RadioConfig",
+    "SpeculatorsConfig",
+    "UltravoxConfig",
+    "Step3VLConfig",
+    "Step3VisionEncoderConfig",
+    "Step3TextConfig",
+    "Step3p5Config",
+    "Qwen3ASRConfig",
+    "Qwen3NextConfig",
+    "Qwen3_5Config",
+    "Qwen3_5TextConfig",
+    "Qwen3_5MoeConfig",
+    "Qwen3_5MoeTextConfig",
+    "Tarsier2Config",
+]
+
+
+def __getattr__(name: str):
+    if name in _CLASS_TO_MODULE:
+        module_name = _CLASS_TO_MODULE[name]
+        module = importlib.import_module(module_name)
+        return getattr(module, name)
+
+    raise AttributeError(f"module 'configs' has no attribute '{name}'")
+
+
+def __dir__():
+    return sorted(list(__all__))
--- a/vllm/transformers_utils/configs/afmoe.py
+++ b/vllm/transformers_utils/configs/afmoe.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class AfmoeConfig(PretrainedConfig):
+    model_type = "afmoe"
+
+    def __init__(
+        self,
+        vocab_size: int = 200_192,
+        hidden_size: int = 2048,
+        intermediate_size: int = 6144,
+        moe_intermediate_size: int = 1408,
+        num_hidden_layers: int = 32,
+        num_dense_layers: int = 1,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int | None = None,
+        head_dim: int = 128,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-5,
+        use_cache: bool = True,
+        tie_word_embeddings: bool = False,
+        rope_parameters: dict | None = None,
+        rope_scaling: dict | None = None,
+        num_experts: int = 64,
+        num_experts_per_tok: int = 6,
+        num_shared_experts: int = 2,
+        num_expert_groups: int = 1,
+        num_limited_groups: int = 1,
+        score_func: str = "sigmoid",
+        route_norm: bool = True,
+        route_scale: float = 1.0,
+        global_attn_every_n_layers: int = 4,
+        sliding_window: int = 2048,
+        layer_types: list[str] | None = None,
+        attention_dropout: float = 0.0,
+        mup_enabled: bool = False,
+        n_group: int = 1,
+        topk_group: int = 1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_dense_layers = num_dense_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads or num_attention_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        rope_theta = kwargs.pop("rope_theta", 10000.0)
+        if rope_parameters is None:
+            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+        self.rope_parameters = rope_parameters
+        self.rope_scaling = rope_scaling
+
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.num_expert_groups = num_expert_groups
+        self.num_limited_groups = num_limited_groups
+        self.score_func = score_func
+        self.route_norm = route_norm
+        self.route_scale = route_scale
+
+        self.global_attn_every_n_layers = global_attn_every_n_layers
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        self.attention_dropout = attention_dropout
+
+        self.mup_enabled = mup_enabled
+        self.n_group = n_group
+        self.topk_group = topk_group
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["AfmoeConfig"]
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
+"""Arctic model configuration"""
+
+from dataclasses import asdict, dataclass
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
+}
+
+
+@dataclass
+class ArcticLoRAConfig:
+    lora_r: int = 64
+    lora_alpha: float = 16
+    shard_base_weights: bool = False
+
+
+@dataclass
+class ArcticQuantizationConfig:
+    q_bits: int = 8
+    rounding: str = "nearest"
+    mantissa_bits: int = 3
+    group_size: int = 128
+
+
+class ArcticConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
+    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ArcticModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
+            allows sequence of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_parameters (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_theta` (`float`): The base period of the RoPE embeddings.
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to root per-token, can be also interpreted as the `top-p` routing
+            parameter
+        num_local_experts (`int`, *optional*, defaults to 8):
+            Number of experts per Sparse MLP layer.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import ArcticModel, ArcticConfig
+
+    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
+    >>> configuration = ArcticConfig()
+
+    >>> # Initializing a model from the Arctic 7B style configuration
+    >>> model = ArcticModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "arctic"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_parameters: dict[str, Any] | None = None,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=1,
+        num_local_experts=8,
+        router_aux_loss_coef=0.001,
+        moe_layer_frequency=2,
+        parallel_attn_mlp_res=False,
+        moe_train_capacity_factor=1,
+        moe_eval_capacity_factor=1,
+        enable_expert_tensor_parallelism=False,
+        moe_min_capacity=0,
+        moe_token_dropping=True,
+        quantization=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        rope_theta = kwargs.pop("rope_theta", 1e6)
+        if rope_parameters is None:
+            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+        self.rope_parameters = rope_parameters
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.moe_layer_frequency = moe_layer_frequency
+        self.moe_train_capacity_factor = moe_train_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_token_dropping = moe_token_dropping
+        self.parallel_attn_mlp_res = parallel_attn_mlp_res
+        if isinstance(quantization, dict):
+            self.quantization = ArcticQuantizationConfig(**quantization)
+        else:
+            self.quantization = quantization
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
+        result = super().from_dict(config_dict, **kwargs)
+        config = result[0] if isinstance(result, tuple) else result
+        if isinstance(config.quantization, dict):
+            config.quantization = ArcticQuantizationConfig(**config.quantization)
+        return result
+
+    def to_dict(self) -> dict[str, Any]:
+        ret = super().to_dict()
+        if isinstance(ret["quantization"], ArcticQuantizationConfig):
+            ret["quantization"] = asdict(ret["quantization"])
+        return ret
--- a/vllm/transformers_utils/configs/bagel.py
+++ b/vllm/transformers_utils/configs/bagel.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers.models.qwen2 import Qwen2Config
+
+
+class BagelConfig(PretrainedConfig):
+    """Configuration class for BAGEL model."""
+
+    model_type = "bagel"
+
+    def __init__(
+        self,
+        visual_gen: bool = True,
+        visual_und: bool = True,
+        llm_config: dict | Qwen2Config | None = None,
+        vit_config: dict | SiglipVisionConfig | None = None,
+        vae_config: dict | None = None,
+        latent_patch_size: int = 2,
+        max_latent_size: int = 32,
+        vit_max_num_patch_per_side: int = 70,
+        connector_act: str = "gelu_pytorch_tanh",
+        interpolate_pos: bool = False,
+        timestep_shift: float = 1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.visual_gen = visual_gen
+        self.visual_und = visual_und
+
+        # Convert dict configs to proper config objects
+        if isinstance(llm_config, dict):
+            self.llm_config = Qwen2Config(**llm_config)
+        else:
+            self.llm_config = llm_config or Qwen2Config()
+
+        if isinstance(vit_config, dict):
+            self.vit_config = SiglipVisionConfig(**vit_config)
+        else:
+            self.vit_config = vit_config or SiglipVisionConfig()
+
+        self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}
+        self.latent_patch_size = latent_patch_size
+        self.max_latent_size = max_latent_size
+        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
+        self.connector_act = connector_act
+        self.interpolate_pos = interpolate_pos
+        self.timestep_shift = timestep_shift
+
+    @property
+    def hidden_size(self) -> int:
+        """Return the hidden size of the language model."""
+        return self.llm_config.hidden_size
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/zai-org/ChatGLM2-6B
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(
+        self,
+        num_layers=28,
+        padded_vocab_size=65024,
+        hidden_size=4096,
+        ffn_hidden_size=13696,
+        kv_channels=128,
+        num_attention_heads=32,
+        seq_length=2048,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        layernorm_epsilon=1e-5,
+        rmsnorm=True,
+        apply_residual_connection_post_layernorm=False,
+        post_layer_norm=True,
+        add_bias_linear=False,
+        add_qkv_bias=False,
+        interleaved_qkv=False,
+        bias_dropout_fusion=True,
+        multi_query_attention=False,
+        multi_query_group_num=1,
+        apply_query_key_layer_scaling=True,
+        attention_softmax_in_fp32=True,
+        fp32_residual_connection=False,
+        quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
+        **kwargs,
+    ):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        # It is to be compatible with long lora.
+        self.max_position_embeddings = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm
+        )
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/colmodernvbert.py
+++ b/vllm/transformers_utils/configs/colmodernvbert.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Configuration for ColModernVBERT visual document retrieval model.
+
+ColModernVBERT combines SigLIP vision encoder + ModernBERT text encoder
+with a pixel shuffle connector and ColBERT-style 128-dim per-token embeddings.
+
+Reference: https://huggingface.co/ModernVBERT/colmodernvbert-merged
+"""
+
+from transformers import ModernBertConfig, PretrainedConfig, SiglipVisionConfig
+
+
+class ColModernVBertConfig(PretrainedConfig):
+    model_type = "colmodernvbert"
+
+    def __init__(
+        self,
+        embedding_dim: int = 128,
+        vlm_config: dict | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embedding_dim = embedding_dim
+
+        if vlm_config is None:
+            vlm_config = {}
+
+        # Top-level VLM fields
+        self.image_token_id = vlm_config.get("image_token_id", 50407)
+        self.pixel_shuffle_factor = vlm_config.get("pixel_shuffle_factor", 4)
+        self.hidden_size = vlm_config.get("hidden_size", 768)
+        additional_vocab_size = vlm_config.get("additional_vocab_size", 40)
+
+        # Text config (ModernBERT)
+        text_cfg = vlm_config.get("text_config", {})
+        base_vocab = text_cfg.get("vocab_size", 50368)
+        self.text_config = ModernBertConfig(
+            vocab_size=base_vocab + additional_vocab_size,
+            hidden_size=text_cfg.get("hidden_size", 768),
+            intermediate_size=text_cfg.get("intermediate_size", 1152),
+            num_hidden_layers=text_cfg.get("num_hidden_layers", 22),
+            num_attention_heads=text_cfg.get("num_attention_heads", 12),
+            mlp_bias=text_cfg.get("mlp_bias", False),
+            max_position_embeddings=vlm_config.get("max_position_embeddings", 8192),
+        )
+
+        # Vision config (SigLIP)
+        vis_cfg = vlm_config.get("vision_config", {})
+        self.vision_config = SiglipVisionConfig(
+            hidden_size=vis_cfg.get("embed_dim", 768),
+            image_size=vis_cfg.get("image_size", 512),
+            patch_size=vis_cfg.get("patch_size", 16),
+            num_hidden_layers=vis_cfg.get("num_hidden_layers", 12),
+            intermediate_size=vis_cfg.get("intermediate_size", 3072),
+            num_attention_heads=vis_cfg.get("num_attention_heads", 12),
+        )
+
+    @property
+    def image_seq_len(self) -> int:
+        ps = self.vision_config.image_size // self.vision_config.patch_size
+        return (ps * ps) // (self.pixel_shuffle_factor**2)
+
+    def get_text_config(self, **kwargs):
+        return self.text_config
--- a/vllm/transformers_utils/configs/colqwen3.py
+++ b/vllm/transformers_utils/configs/colqwen3.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ColQwen3 configuration that extends Qwen3VLConfig with embedding projection
+fields. This allows ColQwen3 models to be loaded without trust_remote_code
+by mapping their custom model_type (colqwen3, ops_colqwen3, etc.) to a
+standard config class that vLLM understands.
+
+Supported model_types:
+- colqwen3 (TomoroAI/tomoro-colqwen3-embed-8b)
+- ops_colqwen3 (OpenSearch-AI/Ops-Colqwen3-4B)
+- qwen3_vl_nemotron_embed (nvidia/nemotron-colembed-vl-8b-v2)
+"""
+
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
+
+
+class ColQwen3Config(Qwen3VLConfig):
+    """Configuration class for ColQwen3 models.
+
+    Extends Qwen3VLConfig with additional fields used by ColQwen3 variants
+    for the embedding projection layer.
+    """
+
+    # Accept any ColQwen3 variant model_type
+    model_type = "colqwen3"
+
+    def __init__(
+        self,
+        embed_dim: int | None = None,
+        dims: int | None = None,
+        dim: int | None = None,
+        projection_dim: int | None = None,
+        colbert_dim: int | None = None,
+        pooling: str | None = None,
+        **kwargs,
+    ):
+        # Store embedding projection config fields
+        self.embed_dim = embed_dim
+        self.dims = dims
+        self.dim = dim
+        self.projection_dim = projection_dim
+        self.colbert_dim = colbert_dim
+        self.pooling = pooling
+
+        super().__init__(**kwargs)
+
+
+class OpsColQwen3Config(ColQwen3Config):
+    """Configuration for OpenSearch-AI ColQwen3 variants."""
+
+    model_type = "ops_colqwen3"
+
+
+class Qwen3VLNemotronEmbedConfig(ColQwen3Config):
+    """Configuration for NVIDIA Nemotron ColEmbed variants."""
+
+    model_type = "qwen3_vl_nemotron_embed"
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
+
+from transformers import DeepseekV2Config, PretrainedConfig
+
+
+class VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "vit_so400m_patch14_siglip_384.webli"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(
+        self,
+        model_name: str = "vit_so400m_patch14_siglip_384.webli",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekVLV2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    architectures: list[str] | None = None
+    vision_config: VisionEncoderConfig
+    projector_config: MlpProjectorConfig
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if self.architectures is None:
+            self.architectures = ["DeepseekVLV2ForCausalLM"]
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        self.text_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.vocab_size = self.text_config.vocab_size
+
+        # update model_type for OCR models
+        if "DeepseekOCRForCausalLM" in self.architectures:
+            self.model_type = "deepseek_ocr"
+        elif "DeepseekOCR2ForCausalLM" in self.architectures:
+            self.model_type = "deepseek_ocr2"
--- a/vllm/transformers_utils/configs/dotsocr.py
+++ b/vllm/transformers_utils/configs/dotsocr.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.qwen2 import Qwen2Config
+
+
+class DotsVisionConfig(PretrainedConfig):
+    model_type: str = "dots_vit"
+
+    def __init__(
+        self,
+        embed_dim: int = 1536,  # vision encoder embed size
+        hidden_size: int = 1536,  # after merger hidden size
+        intermediate_size: int = 4224,
+        num_hidden_layers: int = 42,
+        num_attention_heads: int = 12,
+        num_channels: int = 3,
+        patch_size: int = 14,
+        spatial_merge_size: int = 2,
+        temporal_patch_size: int = 1,
+        rms_norm_eps: float = 1e-5,
+        use_bias: bool = False,
+        attn_implementation="flash_attention_2",
+        initializer_range=0.02,
+        init_merger_std=0.02,
+        is_causal=False,  # ve causal forward
+        post_norm=True,
+        gradient_checkpointing=False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.rms_norm_eps = rms_norm_eps
+        self.use_bias = use_bias
+        self.attn_implementation = attn_implementation
+        self.initializer_range = initializer_range
+        self.init_merger_std = init_merger_std
+        self.is_causal = is_causal
+        self.post_norm = post_norm
+        self.gradient_checkpointing = gradient_checkpointing
+
+
+class DotsOCRConfig(Qwen2Config):
+    model_type = "dots_ocr"
+
+    def __init__(
+        self,
+        image_token_id=151665,
+        video_token_id=151656,
+        vision_config: dict | None = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_config = DotsVisionConfig(**(vision_config or {}))
+
+    def save_pretrained(self, save_directory, **kwargs):
+        self._auto_class = None
+        super().save_pretrained(save_directory, **kwargs)
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
+
+
+class EAGLEConfig(PretrainedConfig):
+    model_type = "eagle"
+
+    def __init__(
+        self,
+        model: PretrainedConfig | dict | None = None,
+        truncated_vocab_size: int | None = None,
+        method: str | None = "eagle",
+        **kwargs,
+    ):
+        model_config: PretrainedConfig | DeepseekV2Config | None
+        if isinstance(model, dict):
+            model_config = AutoConfig.for_model(**model)
+        else:
+            model_config = model
+
+        for k, v in kwargs.items():
+            if k != "architectures" and k != "model_type" and hasattr(model_config, k):
+                setattr(model_config, k, v)
+
+        self.model = model_config
+
+        if self.model is None:
+            self.truncated_vocab_size = None
+        else:
+            self.truncated_vocab_size = (
+                self.model.vocab_size
+                if truncated_vocab_size is None
+                else truncated_vocab_size
+            )
+
+        # Eagle model name should follow naming convention of
+        # LlamaForCausalLM -> EagleLlamaForCausalLM
+        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
+        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
+        if method == "eagle":
+            assert self.model is not None, (
+                "model should not be None when method is eagle"
+            )
+            kwargs["architectures"] = [
+                f"Eagle{arch}" if not arch.startswith("Eagle") else arch
+                for arch in self.model.architectures
+            ]
+
+        elif method == "eagle3":
+            assert self.model is not None, (
+                "model should not be None when method is eagle3"
+            )
+            kwargs["architectures"] = [
+                arch
+                if arch.startswith("Eagle3") or arch.endswith("Eagle3")
+                else f"Eagle3{arch}"
+                for arch in self.model.architectures
+            ]
+        else:
+            raise ValueError(
+                f"Invalid method {method}. Supported methods are eagle and eagle3."
+            )
+
+        super().__init__(**kwargs)
+
+        if self.model is not None:
+            for k, v in self.model.to_dict().items():
+                if k not in kwargs:
+                    setattr(self, k, v)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str | os.PathLike,
+        **kwargs,
+    ) -> "EAGLEConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+        return cls.from_dict(config_dict, **kwargs)
+
+    def to_json_string(self, use_diff: bool = True) -> str:
+        # we override use_diff to False as initializing
+        # EAGLEConfig with default arguments is not supported
+        del use_diff
+        return super().to_json_string(use_diff=False)
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class RWConfig(PretrainedConfig):
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "num_kv_heads": "n_head_kv",
+    }
+
+    def __init__(
+        self,
+        vocab_size=250880,
+        hidden_size=64,
+        n_layer=2,
+        n_head=8,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        multi_query=True,
+        n_head_kv=None,
+        alibi=False,
+        bias=False,
+        parallel_attn=False,
+        new_decoder_architecture=False,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.multi_query = multi_query
+        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
+        self.alibi = alibi
+        self.bias = bias
+        self.parallel_attn = parallel_attn
+        self.new_decoder_architecture = new_decoder_architecture
+
+        if self.hidden_size == 8192:
+            # Hack for falcon-40b
+            self.new_decoder_architecture = True
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+    @property
+    def head_dim(self):
+        return self.hidden_size // self.n_head
+
+    @property
+    def rotary(self):
+        return not self.alibi
--- a/vllm/transformers_utils/configs/flex_olmo.py
+++ b/vllm/transformers_utils/configs/flex_olmo.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class FlexOlmoConfig(PretrainedConfig):
+    model_type = "flex_olmo"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=100352,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        use_cache=True,
+        pad_token_id=100277,
+        bos_token_id=None,
+        eos_token_id=100257,
+        tie_word_embeddings=False,
+        rope_parameters: dict[str, Any] | None = None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_experts_per_tok=5,
+        num_experts=7,
+        output_router_logits=False,
+        router_aux_loss_coef=0.01,
+        norm_topk_prob=False,
+        **kwargs,
+    ):
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["FlexOlmoForCausalLM"]
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 500000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.norm_topk_prob = norm_topk_prob
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_parameters is not None and "type" in self.rope_parameters:
+            self.rope_parameters["rope_type"] = self.rope_parameters["type"]
--- a/vllm/transformers_utils/configs/funaudiochat.py
+++ b/vllm/transformers_utils/configs/funaudiochat.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+from transformers import PretrainedConfig
+
+# NOTE: Temporary shim for FunAudioChat checkpoints.
+# These checkpoints use `model_type="funaudiochat"`, which is not currently
+# recognized by released Transformers, and the public checkpoint does not
+# provide an `auto_map` to enable `trust_remote_code=True`.
+# Remove this file once Transformers adds native support (or the checkpoint
+# provides an `auto_map`) and vLLM can rely on `AutoConfig.from_pretrained()`.
+
+
+class FunAudioChatAudioEncoderConfig(PretrainedConfig):
+    model_type = "funaudiochat_audio_encoder"
+
+    def __init__(
+        self,
+        _attn_implementation: str | None = None,
+        num_mel_bins: int = 128,
+        encoder_layers: int = 32,
+        encoder_attention_heads: int = 20,
+        encoder_ffn_dim: int = 5120,
+        d_model: int = 1280,
+        dropout: float = 0.0,
+        attention_dropout: float = 0.0,
+        activation_function: str = "gelu",
+        activation_dropout: float = 0.0,
+        scale_embedding: bool = False,
+        initializer_range: float = 0.02,
+        max_source_positions: int = 1500,
+        n_window: int = 100,
+        output_dim: int = 3584,
+        bos_token_id: int | None = None,
+        codebook_size: int | None = None,
+        continuous_features_mode: str = "replace",
+        crq_transformer_config: dict | None = None,
+        eos_token_id: int | None = None,
+        group_size: int = 5,
+        enable_audio_invert_tower: bool = True,
+        pad_token_id: int | None = None,
+        **kwargs,
+    ) -> None:
+        attn_impl = kwargs.pop("_attn_implementation", None) or _attn_implementation
+        super().__init__(**kwargs)
+        # Match HF default for attention implementation selection.
+        self._attn_implementation = attn_impl or "sdpa"
+
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_function = activation_function
+        self.activation_dropout = activation_dropout
+        self.num_hidden_layers = encoder_layers
+        self.initializer_range = initializer_range
+        self.scale_embedding = scale_embedding
+        self.max_source_positions = max_source_positions
+        self.n_window = n_window
+        self.output_dim = output_dim
+
+        self.bos_token_id = bos_token_id
+        self.codebook_size = codebook_size
+        self.continuous_features_mode = continuous_features_mode
+        self.crq_transformer_config = crq_transformer_config
+        self.eos_token_id = eos_token_id
+        self.group_size = group_size
+        self.enable_audio_invert_tower = enable_audio_invert_tower
+        self.pad_token_id = pad_token_id
+
+
+class FunAudioChatConfig(PretrainedConfig):
+    model_type = "funaudiochat"
+    attribute_map = {
+        "audio_token_id": "audio_token_index",
+    }
+
+    def __init__(
+        self,
+        audio_config: PretrainedConfig | dict | None = None,
+        text_config: PretrainedConfig | dict | None = None,
+        audio_token_index: int = 151646,
+        ignore_index: int = -100,
+        hidden_size: int | None = None,
+        **kwargs,
+    ) -> None:
+        self.audio_token_index = audio_token_index
+        self.ignore_index = ignore_index
+
+        if isinstance(audio_config, dict):
+            audio_config.setdefault(
+                "model_type", FunAudioChatAudioEncoderConfig.model_type
+            )
+            audio_config = FunAudioChatAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = FunAudioChatAudioEncoderConfig()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            # Default to qwen2 for backwards compatibility; FunAudioChat uses
+            # qwen3 in practice for recent checkpoints.
+            text_config.setdefault("model_type", "qwen2")
+            import transformers
+
+            text_cls = transformers.CONFIG_MAPPING[text_config["model_type"]]
+            text_config = text_cls(**text_config)
+        elif text_config is None:
+            import transformers
+
+            text_config = transformers.CONFIG_MAPPING["qwen2"]()
+        self.text_config = text_config
+
+        self.hidden_size = (
+            int(self.text_config.hidden_size)
+            if hidden_size is None
+            else int(hidden_size)
+        )
+
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/hunyuan_vl.py
+++ b/vllm/transformers_utils/configs/hunyuan_vl.py
@@ -0,0 +1,322 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/configuration_hunyuan_vl.py
+
+from transformers import PretrainedConfig
+
+
+class HunYuanVLVisionConfig(PretrainedConfig):
+    model_type = "hunyuan_vl"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_act="gelu",
+        hidden_size=1152,
+        intermediate_size=4304,
+        interpolate_mode="bilinear",
+        rms_norm_eps=1e-05,
+        learnable_mlp_pooling_size=0,
+        num_attention_heads=16,
+        num_key_value_heads=None,
+        num_channels=3,
+        num_hidden_layers=27,
+        out_hidden_size=4096,
+        patch_size=16,
+        remove_prenorm=True,
+        spatial_merge_size=2,
+        temporal_patch_size=1,
+        resize_resolution=2048,
+        img_max_token_num=4096,
+        max_image_size=2048,
+        video_max_image_size=768,
+        video_min_image_size=256,
+        min_image_size=512,
+        anyres_vit_max_image_size=2048,
+        max_vit_seq_len=16384,
+        text_hidden_size=3072,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.interpolate_mode = interpolate_mode
+        self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
+        self.num_attention_heads = num_attention_heads
+        if not num_key_value_heads:
+            self.num_key_value_heads = num_attention_heads
+        else:
+            self.num_key_value_heads = num_key_value_heads
+        self.num_channels = num_channels
+        self.num_hidden_layers = num_hidden_layers
+        self.out_hidden_size = out_hidden_size
+        self.patch_size = patch_size
+        self.remove_prenorm = remove_prenorm
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.rms_norm_eps = rms_norm_eps
+
+        self.resize_resolution = resize_resolution
+        self.img_max_token_num = img_max_token_num
+        self.max_image_size = max_image_size
+        self.min_image_size = min_image_size
+        self.video_max_image_size = video_max_image_size
+        self.video_min_image_size = video_min_image_size
+        self.anyres_vit_max_image_size = anyres_vit_max_image_size
+        self.max_vit_seq_len = max_vit_seq_len
+        self.text_hidden_size = text_hidden_size
+
+
+class HunYuanVLTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate an
+    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the HunYuan-7B.
+    Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 290943):
+            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HunYuanVLTextConfig`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations or shared MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        eod_token_id (int, *optional*, defaults to 3):
+            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
+            Example: In multi-document processing, this token helps the model distinguish between separate documents.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+    """  # noqa: E501
+
+    model_type = "hunyuan_vl_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=290943,
+        hidden_size=4096,
+        intermediate_size: int = 11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        eod_token_id=3,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # self._rope_scaling_validation()   # TODO: Need validation?
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and "
+                f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                "`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
+                f"got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None and rope_scaling_alpha is None:
+            raise ValueError(
+                "`rope_scaling`'s factor or alpha field must be have one, "
+                "got both of none"
+            )
+        if rope_scaling_factor is not None and (
+            not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
+        ):
+            raise ValueError(
+                "`rope_scaling`'s factor field must be a float > 1.0, "
+                f"got {rope_scaling_factor}"
+            )
+        if rope_scaling_alpha is not None and (
+            not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
+        ):
+            raise ValueError(
+                "`rope_scaling`'s alpha field must be a float > 1.0, "
+                f"got {rope_scaling_alpha}"
+            )
+
+
+class HunYuanVLConfig(PretrainedConfig):
+    model_type = "hunyuan_vl"
+    sub_configs = {
+        "vision_config": HunYuanVLVisionConfig,
+        "text_config": HunYuanVLTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        im_start_id=120118,
+        im_end_id=120119,
+        image_token_id=120120,
+        im_newline_id=120121,
+        video_start_id=120122,
+        video_end_id=120123,
+        **kwargs,
+    ):
+        # We need to init super() here so that it does not reset values
+        # that are in text config to the BaseClass defaults. The Base
+        # config has many text related defaults and not all defaults are
+        # same as for `HunYuanVLTextConfig`.
+        super().__init__(**kwargs)
+
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+
+        self.image_token_id = image_token_id
+        self.im_start_id = im_start_id
+        self.im_end_id = im_end_id
+        self.im_newline_id = im_newline_id
+        self.video_start_id = video_start_id
+        self.video_end_id = video_end_id
+
+        self.vision_config.text_hidden_size = self.text_config.hidden_size
+
+        # Attention implementation to use. It sets it recursively on sub-configs
+        # so we call it again in the end.
+        self._attn_implementation = kwargs.pop("attn_implementation", None)
+
+    def __setattr__(self, key, value):
+        if (
+            (text_config := super().__getattribute__("__dict__").get("text_config"))
+            is not None
+            and key not in ["dtype", "_attn_implementation_internal"]
+            and key in text_config.__dict__
+        ):
+            setattr(text_config, key, value)
+        else:
+            super().__setattr__(key, value)
+
+    def __getattribute__(self, key):
+        if "text_config" in super().__getattribute__("__dict__") and key not in [
+            "_name_or_path",
+            "model_type",
+            "dtype",
+            "_attn_implementation_internal",
+        ]:
+            text_config = super().__getattribute__("text_config")
+            if key in text_config.__dict__:
+                return getattr(text_config, key)
+
+        return super().__getattribute__(key)
--- a/vllm/transformers_utils/configs/isaac.py
+++ b/vllm/transformers_utils/configs/isaac.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from transformers import Qwen3Config
+from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig
+
+
+class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
+    """Vision configuration for Isaac with Pixel Shuffle support.
+
+    Extends Siglip2VisionConfig with additional fields for pixel shuffle.
+    """
+
+    model_type = "pixel_shuffle_siglip2"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        pixel_shuffle_scale_factor: int = 1,
+        num_patches: int = 256,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Add our custom fields
+        self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor
+        self.num_patches = num_patches
+
+
+class IsaacConfig(Qwen3Config):
+    """Configuration class for Isaac multimodal model."""
+
+    model_type = "isaac"
+    sub_configs = {
+        "vision_config": PixelShuffleSiglip2VisionConfig,
+        "text_config": Qwen3Config,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        vision_patch_size: int = 16,
+        vision_max_num_patches: int = 256,
+        vision_min_num_patches: int | None = None,
+        pixel_shuffle_scale: int = 1,
+        max_sequence_length: int = 16384,
+        vision_token: str = "<image>",
+        vision_attn_implementation: str | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if isinstance(text_config, dict):
+            # from HF config
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init text config.
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        else:
+            # from Qwen3Config
+            self.text_config = text_config
+
+        # EventStreamProcessor parameters (for backward compatibility)
+        self.video_patch_size = vision_patch_size
+        self.vision_max_num_patches = vision_max_num_patches
+        self.vision_min_num_patches = vision_min_num_patches
+        self.pixel_shuffle_scale = pixel_shuffle_scale
+
+        # Processing parameters
+        self.max_sequence_length = max_sequence_length
+        self.vision_token = vision_token
+
+        # Handle vision config - PixelShuffleSiglip2VisionConfig instance
+        if isinstance(vision_config, dict):
+            self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = PixelShuffleSiglip2VisionConfig()
+        else:
+            self.vision_config = vision_config
+
+        # Ensure compatibility with pretrained checkpoints
+        self.vision_config.pixel_shuffle_scale_factor = getattr(
+            self.vision_config,
+            "pixel_shuffle_scale_factor",
+            pixel_shuffle_scale,
+        )
+        self.vision_config.num_patches = getattr(
+            self.vision_config,
+            "num_patches",
+            vision_max_num_patches,
+        )
+        self.vision_attn_implementation = vision_attn_implementation
+
+
+__all__ = [
+    "IsaacConfig",
+    "PixelShuffleSiglip2VisionConfig",
+]
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -0,0 +1,243 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2023 Cerebras Systems.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""JAIS configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class JAISConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a
+    [`JAISModel`]. It is used to instantiate a JAIS model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used
+    to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the JAIS model. Defines the number of different
+            tokens that can be represented by the
+            `inputs_ids` passed when calling [`JAISModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used
+            with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set
+            it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list
+            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in
+            the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
+            Whether to additionally scale attention weights
+            by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention
+            (dot-product)
+            and upcast attention dot-product/softmax to float() when training
+            with mixed precision.
+        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
+            Positional embedding can be either `"alibi"` or `"learned"`.
+        mup_width_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale learning rate and initializers. Calculated
+            as (`d_model,0 / d_model`), where
+            `d_model` is the model's width and `d_model,0` is the proxy
+            model's width.
+        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale token and position embeddings.
+        mup_output_alpha (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale output logits
+            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
+        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
+            Scale attention weights by dividing by hidden_size instead of
+            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
+            well.
+        alibi_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for ALiBi
+            embeddings. Currently only supports linear
+            scaling strategy. Can specify either the scaling `factor` (must be
+            a float greater than 1) for fixed scaling
+            or `train_seq_len` for dynamic scaling on input samples with
+            sequence length > `train_seq_len`. The expected
+            formats are `{"type": strategy name, "factor": scaling factor}` or
+            `{"type": strategy name,
+            "train_seq_len": training sequence length}`.
+        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
+            architecture names for Jais.
+
+    Example:
+
+    ```python
+    >>> from transformers import JAISConfig, JAISModel
+
+    >>> # Initializing a JAIS configuration
+    >>> configuration = JAISConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = JAISModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "jais"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        position_embedding_type="learned",
+        mup_width_scale=1.0,
+        mup_embeddings_scale=1.0,
+        mup_output_alpha=1.0,
+        mup_scale_qk_dot_by_d=False,
+        alibi_scaling=None,
+        architectures=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.position_embedding_type = position_embedding_type
+        self.mup_width_scale = mup_width_scale
+        self.mup_embeddings_scale = mup_embeddings_scale
+        self.mup_output_alpha = mup_output_alpha
+        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
+
+        self.alibi_scaling = alibi_scaling
+        self._alibi_scaling_validation()
+        if architectures is None:
+            architectures = ["JAISLMHeadModel"]
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            architectures=architectures,
+            **kwargs,
+        )
+
+    def _alibi_scaling_validation(self):
+        """
+        Validate the `alibi_scaling` configuration.
+        """
+        if self.alibi_scaling is None:
+            return
+
+        if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
+            raise ValueError(
+                "`alibi_scaling` must be a dictionary with two fields, "
+                "`type` and `factor` or `type` and `train_seq_len`, "
+                f"got {self.alibi_scaling}"
+            )
+        alibi_scaling_type = self.alibi_scaling.get("type", None)
+        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
+        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
+        if alibi_scaling_type is None or alibi_scaling_type != "linear":
+            raise ValueError(
+                f"`alibi_scaling`'s type field must be 'linear', "
+                f"got {alibi_scaling_type}"
+            )
+        if (
+            alibi_scaling_factor is not None
+            and not isinstance(alibi_scaling_factor, float)
+            or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
+        ):
+            raise ValueError(
+                f"`alibi_scaling`'s factor field must be a float > 1.0, "
+                f"got {alibi_scaling_factor}"
+            )
+        if (
+            alibi_dynamic_scaling is not None
+            and not isinstance(alibi_dynamic_scaling, int)
+            or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
+        ):
+            raise ValueError(
+                f"`alibi_scaling`'s `train_seq_len` field must be an "
+                f"integer > 1, got {alibi_dynamic_scaling}"
+            )
--- a/vllm/transformers_utils/configs/kimi_k25.py
+++ b/vllm/transformers_utils/configs/kimi_k25.py
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Kimi-K2.5 Model Configuration.
+
+This configuration supports video-chunk as an internal modality type.
+A video-chunk is the smallest independently processable unit of video.
+"""
+
+from transformers import DeepseekV3Config
+from transformers.configuration_utils import PretrainedConfig
+
+
+class KimiK25VisionConfig(PretrainedConfig):
+    model_type = "kimi_k25_vision"
+
+    def __init__(
+        self,
+        # Vision Tower
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        init_pos_emb_time: int = 4,
+        pos_emb_type: str = "divided_fixed",
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        video_attn_type: str = "spatial_temporal",
+        merge_type: str = "sd2_tpool",
+        # MM Projector
+        mm_projector_type: str = "patchmerger",
+        mm_hidden_size: int | None = None,
+        projector_hidden_act: str = "gelu",
+        projector_ln_eps: float = 1e-5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Vision Tower
+        self.patch_size = patch_size
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        self.init_pos_emb_time = init_pos_emb_time
+        self.pos_emb_type = pos_emb_type
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.merge_kernel_size = merge_kernel_size
+        self.video_attn_type = video_attn_type
+        self.merge_type = merge_type
+        # MM Projector
+        self.mm_projector_type = mm_projector_type
+        if mm_hidden_size is not None:
+            self.mm_hidden_size = mm_hidden_size
+        else:
+            self.mm_hidden_size = hidden_size
+        self.projector_hidden_act = projector_hidden_act
+        self.projector_ln_eps = projector_ln_eps
+
+
+class KimiK25Config(PretrainedConfig):
+    """Kimi-K2.5 model configuration.
+
+    Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
+    A video-chunk consists of multiple consecutive frames
+    that are processed together with temporal pooling.
+
+    Args:
+        vision_config: Configuration for the vision tower and projector.
+        text_config: Configuration for the text model (DeepseekV3).
+        ignore_index: The ignore index for the loss function.
+        media_placeholder_token_id: The token ID for media placeholders.
+        pad_token_id: The token ID for padding.
+    """
+
+    model_type = "kimi_k25"
+
+    def __init__(
+        self,
+        vision_config: dict | KimiK25VisionConfig | None = None,
+        text_config: dict | DeepseekV3Config | None = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        use_unified_vision_chunk: bool = False,
+        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
+        **kwargs,
+    ):
+        # Vision config
+        if vision_config is None:
+            vision_config = KimiK25VisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = KimiK25VisionConfig(**vision_config)
+        self.vision_config: KimiK25VisionConfig = vision_config
+
+        # Text config
+        if text_config is None:
+            text_config = DeepseekV3Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV3Config(**text_config)
+        self.text_config: DeepseekV3Config = text_config
+
+        # Set mm_hidden_size to text hidden size if not explicitly set
+        if self.vision_config.mm_hidden_size == self.vision_config.hidden_size:
+            self.vision_config.mm_hidden_size = self.text_config.hidden_size
+
+        # Other config
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+        self.use_unified_vision_chunk = use_unified_vision_chunk
+        self.video_placeholder = video_placeholder
+
+        # Propagate quantization config from text model
+        if getattr(self.text_config, "quantization_config", None) is not None:
+            self.quantization_config = self.text_config.quantization_config
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+    @property
+    def hidden_size(self) -> int:
+        """Get hidden size from text config for compatibility."""
+        return self.text_config.hidden_size
+
+    @property
+    def vocab_size(self) -> int:
+        """Get vocab size from text config for compatibility."""
+        return self.text_config.vocab_size
--- a/vllm/transformers_utils/configs/kimi_linear.py
+++ b/vllm/transformers_utils/configs/kimi_linear.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class KimiLinearConfig(PretrainedConfig):
+    model_type = "kimi_linear"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        model_type="kimi_linear",
+        vocab_size=163840,
+        hidden_size=4096,
+        head_dim=None,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        rope_parameters=None,
+        tie_word_embeddings=False,
+        moe_intermediate_size: int | None = None,
+        moe_renormalize: bool = True,
+        moe_router_activation_func: str = "sigmoid",
+        num_experts: int | None = None,
+        num_experts_per_token: int | None = None,
+        num_shared_experts: int = 0,
+        routed_scaling_factor: float = 1.0,
+        first_k_dense_replace: int = 0,
+        moe_layer_freq: int = 1,
+        use_grouped_topk: bool = True,
+        num_expert_group: int = 1,
+        topk_group: int = 1,
+        q_lora_rank: int | None = None,
+        kv_lora_rank: int | None = None,
+        qk_nope_head_dim: int | None = None,
+        qk_rope_head_dim: int | None = None,
+        v_head_dim: int | None = None,
+        mla_use_nope: bool | None = False,
+        num_nextn_predict_layers: int = 0,
+        linear_attn_config: dict | None = None,
+        **kwargs,
+    ):
+        self.model_type = model_type
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 10000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        self.rope_parameters = rope_parameters
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.mla_use_nope = mla_use_nope
+        # moe config
+        self.num_experts = num_experts
+        self.num_experts_per_token = num_experts_per_token
+        self.moe_renormalize = moe_renormalize
+        self.num_shared_experts = num_shared_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.moe_router_activation_func = moe_router_activation_func
+        assert self.moe_router_activation_func in ("softmax", "sigmoid")
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_layer_freq = moe_layer_freq
+        self.use_grouped_topk = use_grouped_topk
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+
+        if linear_attn_config is not None:
+            assert linear_attn_config["kda_layers"] is not None
+            assert linear_attn_config["full_attn_layers"] is not None
+        self.linear_attn_config = linear_attn_config
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def is_mla(self):
+        return (
+            self.q_lora_rank is not None
+            or self.kv_lora_rank is not None
+            or self.qk_nope_head_dim is not None
+            or self.qk_rope_head_dim is not None
+            or self.v_head_dim is not None
+            or self.mla_use_nope is True
+        )
+
+    @property
+    def is_moe(self):
+        return self.num_experts is not None
+
+    @property
+    def is_linear_attn(self) -> bool:
+        return not (
+            self.linear_attn_config is None
+            or (
+                isinstance(self.linear_attn_config, dict)
+                and self.linear_attn_config["kda_layers"] is not None
+                and len(self.linear_attn_config["kda_layers"]) == 0
+            )
+        )
+
+    def is_kda_layer(self, layer_idx: int):
+        return (
+            self.linear_attn_config is not None
+            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
+        )
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+
+from transformers import DeepseekV2Config
+from transformers.configuration_utils import PretrainedConfig
+
+from vllm.transformers_utils.configs.moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: dict | MoonViTConfig | None = None,
+        text_config: dict | DeepseekV2Config | None = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs,
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
--- a/vllm/transformers_utils/configs/lfm2_moe.py
+++ b/vllm/transformers_utils/configs/lfm2_moe.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Lfm2MoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LFM2-8B-A1B model.
+    e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Lfm2Model`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 7168):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 1792):
+            Intermediate size of the routed expert.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_parameters (`dict`, *optional*):
+            The parameters of the RoPE embeddings.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the conv layers.
+        conv_L_cache (`int`, *optional*, defaults to 3):
+            L_cache dim in the conv layers.
+        num_dense_layers (`int`, *optional*, defaults to 2):
+            Number of dense Lfm2MoeMLP layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 32):
+            Number of routed experts.
+        use_expert_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use the expert bias on the routing weights.
+        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+            Scaling factor for routed experts in MoE models.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        layer_types (`Optional`, *optional*):
+            Type of each layers.
+
+    ```python
+    >>> from transformers import Lfm2MoeModel, Lfm2MoeConfig
+
+    >>> # Initializing a LFM2 Moe model
+    >>> configuration = Lfm2MoeConfig()
+
+    >>> # Initializing a model from the LFM2-8B-A1B style configuration
+    >>> model = Lfm2MoeModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""  # noqa: E501
+
+    model_type = "lfm2_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2048,
+        intermediate_size: int = 7168,
+        moe_intermediate_size: int = 1792,
+        num_hidden_layers: int = 32,
+        pad_token_id: int = 0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        rope_parameters: dict[str, Any] | None = None,
+        max_position_embeddings: int = 128_000,
+        use_cache: bool = True,
+        norm_eps: float = 0.00001,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        conv_bias: bool = False,
+        conv_L_cache: int = 3,
+        num_dense_layers: int = 2,
+        num_experts_per_tok: int = 4,
+        num_experts: int = 32,
+        use_expert_bias: bool = True,
+        routed_scaling_factor: float = 1.0,
+        norm_topk_prob: bool = True,
+        layer_types: list[str] | None = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        rope_theta = kwargs.pop("rope_theta", 1000000.0)
+        if rope_parameters is None:
+            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+        self.rope_parameters = rope_parameters
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.norm_eps = norm_eps
+
+        # attn operator config
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        # custom operator config
+        self.conv_bias = conv_bias
+        self.conv_L_cache = conv_L_cache
+
+        # moe config
+        self.num_dense_layers = num_dense_layers
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.use_expert_bias = use_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+        self.norm_topk_prob = norm_topk_prob
+        self.layer_types = layer_types
+
+        tie_word_embeddings = kwargs.get(
+            "tie_embedding", tie_word_embeddings
+        )  # to fit original config keys
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Lfm2MoeConfig"]
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+from transformers import PretrainedConfig
+
+
+class MedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(
+        self,
+        hidden_size: int = 4096,
+        vocab_size: int = 32001,
+        num_heads: int = 5,
+        num_hidden_layers: int = 1,
+        max_paths: int = 64,
+        topk: int = 10,
+        truncated_vocab_size: int | None = None,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.max_paths = max_paths
+        self.topk = topk
+        self.max_seq_len = int(2**20)
+        self.truncated_vocab_size = (
+            vocab_size if truncated_vocab_size is None else truncated_vocab_size
+        )
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["MedusaModel"]
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str | os.PathLike,
+        **kwargs,
+    ) -> "MedusaConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+        for k in list(config_dict.keys()):
+            if "num" in k:
+                if "heads" in k:
+                    config_dict["num_heads"] = config_dict.pop(k)
+                elif "layers" in k:
+                    config_dict["num_hidden_layers"] = config_dict.pop(k)
+        return cls.from_dict(config_dict, **kwargs)
+
+    @property
+    def num_attention_heads(self):
+        return 0
+
+    @property
+    def num_lookahead_tokens(self):
+        return self.num_heads
+
+    @num_lookahead_tokens.setter
+    def num_lookahead_tokens(self, num_lookahead_tokens: int):
+        self.num_heads = num_lookahead_tokens
--- a/vllm/transformers_utils/configs/midashenglm.py
+++ b/vllm/transformers_utils/configs/midashenglm.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import PretrainedConfig
+from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
+    Qwen2_5OmniTextConfig,
+)
+
+
+class DashengConfig(PretrainedConfig):
+    model_type = "midashenglm_dasheng_encoder"
+
+    def __init__(
+        self,
+        embed_dim: int = 768,
+        outputdim: int = 527,
+        patch_size: int | tuple[int, int] = 16,
+        patch_stride: int | tuple[int, int] = 16,
+        input_channels: int = 1,
+        target_length: int = 1012,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        init_values: float | None = None,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        f_min: float = 0.0,
+        f_max: float = 8000.0,
+        center: bool = True,
+        win_length: int = 512,
+        hop_length: int = 160,
+        sample_rate: int = 16000,
+        n_fft: int = 512,
+        n_mels: int = 64,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.outputdim = outputdim
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.input_channels = input_channels
+        self.target_length = target_length
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.init_values = init_values
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.f_min = f_min
+        self.f_max = f_max
+        self.center = center
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        super().__init__(**kwargs)
+
+
+class MiDashengLMConfig(PretrainedConfig):
+    model_type = "midashenglm"
+
+    def __init__(
+        self,
+        audio_encoder_config: dict | None = None,
+        subsample_factor: int = 5,
+        text_config: dict | None = None,
+        audio_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
+        self.subsample_factor = subsample_factor
+        self.text_config = (
+            Qwen2_5OmniTextConfig(**text_config)
+            if text_config
+            else Qwen2_5OmniTextConfig()
+        )
+        self.text_config.rope_parameters = None  # uses_mrope is false
+        self.audio_token_id = audio_token_id
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -0,0 +1,293 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers import PretrainedConfig, WhisperConfig
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def adapt_config_dict(
+    config_dict: dict[str, Any],
+    defaults: dict[str, Any],
+) -> PretrainedConfig:
+    config_dict = _remap_general_mistral_args(config_dict)
+    config_dict = _remap_mistral_sliding_window(config_dict)
+
+    if bool(config_dict.get("quantization")):
+        config_dict = _remap_mistral_quantization_args(config_dict)
+
+    is_moe = bool(config_dict.get("moe"))
+    is_mistral_large_3 = (
+        is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0
+    )
+    if config_dict.get("model_type") == "mamba":
+        config_dict["architectures"] = ["Mamba2ForCausalLM"]
+    elif is_moe and is_mistral_large_3:
+        config_dict = _remap_moe_args(config_dict)
+        config_dict["model_type"] = "deepseek_v3"
+        config_dict["architectures"] = ["MistralLarge3ForCausalLM"]
+
+        assert "llama_4_scaling" in config_dict, (
+            "MistralLarge3 expect llama4 scaling config."
+        )
+        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
+        assert all(
+            [
+                key in config_dict["llama_4_scaling"]
+                for key in llama_4_scaling_config_keys
+            ]
+        ), (
+            "llama_4_scaling config should define the keys: "
+            f"{','.join(llama_4_scaling_config_keys)}"
+        )
+    elif is_moe:
+        config_dict["architectures"] = ["MixtralForCausalLM"]
+    else:
+        config_dict["architectures"] = ["MistralForCausalLM"]
+
+    if bool(config_dict.get("yarn")):
+        config_dict = _remap_mistral_yarn_args(config_dict)
+
+    if bool(config_dict.get("llama_4_scaling")):
+        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
+        assert all(
+            [
+                key in config_dict["llama_4_scaling"]
+                for key in llama_4_scaling_config_keys
+            ]
+        ), (
+            "llama_4_scaling config should define the keys: "
+            f"{','.join(llama_4_scaling_config_keys)}"
+        )
+
+    is_vision = (config_dict.get("multimodal") or {}).get(
+        "vision_encoder_args"
+    ) or config_dict.get("vision_encoder")
+    is_audio = bool(
+        ((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
+            "encoder_args"
+        )
+    )
+
+    assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
+
+    if is_vision:
+        config_dict = _remap_mistral_vision_args(config_dict)
+    if is_audio:
+        config_dict = _remap_mistral_audio_args(config_dict)
+
+    for k, v in defaults.items():
+        config_dict.setdefault(k, v)
+
+    config = PretrainedConfig.from_dict(config_dict)
+
+    logger.debug("Initialized config %s", config)
+
+    return config
+
+
+def _remap_mistral_vision_args(config: dict) -> dict:
+    if config.get("multimodal"):
+        vision_config = config.pop("multimodal")
+    else:
+        vision_config = config.pop("vision_encoder")
+
+    quant_config = config.get("quantization_config")
+    config = {
+        "model_type": "pixtral",
+        "architectures": ["PixtralForConditionalGeneration"],
+        "text_config": PretrainedConfig.from_dict(config),
+        "vision_config": PretrainedConfig.from_dict(vision_config),
+    }
+    if quant_config:
+        config["quantization_config"] = quant_config
+    return config
+
+
+def _remap_mistral_yarn_args(config: dict) -> dict:
+    yarn_config_map = {
+        "factor": "factor",
+        "original_max_position_embeddings": "original_max_position_embeddings",
+        "beta": "beta_fast",
+        "alpha": "beta_slow",
+        "apply_scale": "apply_yarn_scaling",
+    }
+    yarn_config = config.get("yarn") or {}
+    config["rope_parameters"] = {
+        "rope_type": "yarn",
+        "mscale_all_dim": 1,
+    }
+
+    if rope_theta := config.pop("rope_theta", None):
+        config["rope_parameters"]["rope_theta"] = rope_theta
+
+    for old_name, new_name in yarn_config_map.items():
+        if old_name in yarn_config:
+            config["rope_parameters"][new_name] = yarn_config.pop(old_name)
+
+    assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
+
+    return config
+
+
+def _remap_general_mistral_args(config: dict) -> dict:
+    # Mistral key -> HF key
+    config_mapping = {
+        "dim": "hidden_size",
+        "norm_eps": "rms_norm_eps",
+        "n_kv_heads": "num_key_value_heads",
+        "n_layers": "num_hidden_layers",
+        "n_heads": "num_attention_heads",
+        "hidden_dim": "intermediate_size",
+    }
+    # HF key -> (Mistral key, default value)
+    top_level_mapping_with_default = {
+        "model_type": ("model_type", "transformer"),
+        "hidden_act": ("activation", "silu"),
+        "tie_word_embeddings": ("tied_embeddings", False),
+        "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
+        "max_position_embeddings": ("max_position_embeddings", 128_000),
+    }
+
+    for key, new_key in config_mapping.items():
+        if key in config:
+            config[new_key] = config.pop(key)
+
+    for new_key, (key, default_value) in top_level_mapping_with_default.items():
+        config[new_key] = config.pop(key, default_value)
+
+    return config
+
+
+def _remap_mistral_sliding_window(config: dict) -> dict:
+    # Remap sliding_window (list) -> layer_types (list) + sliding window (int)
+    # for HF compatibility
+    # Mistral configs may define sliding_window as list[int]. Convert it
+    # to int and add the layer_types list[str] to make it HF compatible
+    if sliding_window := config.get("sliding_window"):
+        if isinstance(sliding_window, list):
+            pattern_repeats = config["num_hidden_layers"] // len(sliding_window)
+            layer_types = sliding_window * pattern_repeats
+            config["layer_types"] = [
+                "full_attention" if layer_type is None else "sliding_attention"
+                for layer_type in layer_types
+            ]
+            assert len(set(sliding_window) - {None}) <= 1, sliding_window
+            config["sliding_window"] = next(filter(None, sliding_window), None)
+        elif isinstance(sliding_window, int) and config.get("layer_types") is None:
+            config["layer_types"] = ["sliding_attention"] * config["num_hidden_layers"]
+        else:
+            raise ValueError(f"Unsupported sliding_window type: {sliding_window}")
+
+    return config
+
+
+def _remap_mistral_quantization_args(config: dict) -> dict:
+    if config.get("quantization"):
+        quantization = config.pop("quantization", {})
+        if quantization.get("qformat_weight") == "fp8_e4m3":
+            qscheme_act = quantization.get("qscheme_act")
+            assert qscheme_act in ("NO_SCALES", "TENSOR", None), (
+                "Only NO_SCALES and TENSOR (default) are supported for qscheme_act"
+            )
+            is_dynamic = qscheme_act == "NO_SCALES"
+            config["quantization_config"] = {
+                "quant_method": "fp8",
+                "activation_scheme": "dynamic" if is_dynamic else "static",
+            }
+        elif (
+            str(quantization.get("quant_method", "")).lower().replace("_", "-")
+            == "compressed-tensors"
+        ):
+            # Pass through compressed-tensors config, while normalizing
+            # quant_method to the canonical community spelling.
+            quantization["quant_method"] = "compressed-tensors"
+            config["quantization_config"] = quantization
+        else:
+            raise ValueError(f"Found unknown quantization='{quantization}' in config")
+
+    return config
+
+
+def _remap_mistral_audio_args(config: dict) -> dict:
+    whisper_args = config["multimodal"].pop("whisper_model_args")
+    encoder_args = whisper_args["encoder_args"]
+    downsample_args = whisper_args["downsample_args"]
+    downsample_factor = downsample_args["downsample_factor"]
+
+    # make sure that k/v blocks can be allocated with
+    # unified k/v cache class and pool whisper k/v cache blocks
+    # with downsample_factor:1 ratio
+    if encoder_args.get("causal"):
+        block_pool_size = downsample_factor
+        config["projection_size"] = downsample_factor * encoder_args["dim"]
+    else:
+        block_pool_size = 1
+
+    architecture = (
+        "VoxtralRealtimeGeneration"
+        if encoder_args.get("causal")
+        else "VoxtralForConditionalGeneration"
+    )
+
+    quant_config = config.get("quantization_config")
+    config = {
+        "model_type": "voxtral",
+        "architectures": [architecture],
+        "text_config": PretrainedConfig.from_dict(config),
+        "audio_config": WhisperConfig(
+            num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
+            window_size=encoder_args["audio_encoding_args"]["window_size"],
+            sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
+            hop_length=encoder_args["audio_encoding_args"]["hop_length"],
+            downsample_factor=downsample_factor,
+            d_model=encoder_args["dim"],
+            encoder_layers=encoder_args["n_layers"],
+            encoder_ffn_dim=encoder_args["hidden_dim"],
+            encoder_attention_heads=encoder_args["n_heads"],
+            encoder_head_dim=encoder_args["head_dim"],
+            vocab_size=encoder_args["vocab_size"],
+            max_source_positions=encoder_args["max_source_positions"],
+            is_encoder_decoder=False,  # Override WhisperConfig default
+            is_causal=encoder_args.get("causal", False),
+            sliding_window=encoder_args.get("sliding_window", None),
+            block_pool_size=block_pool_size,
+            pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
+            global_log_mel_max=encoder_args["audio_encoding_args"].get(
+                "global_log_mel_max"
+            ),
+            # only needed for RoPE
+            max_position_embeddings=block_pool_size * config["max_position_embeddings"],
+        ),
+    }
+    if quant_config:
+        config["quantization_config"] = quant_config
+    return config
+
+
+def _remap_moe_args(config: dict) -> dict:
+    moe_config_map = {
+        "route_every_n": "moe_layer_freq",
+        "first_k_dense_replace": "first_k_dense_replace",
+        "num_experts_per_tok": "num_experts_per_tok",
+        "num_experts": "n_routed_experts",
+        "expert_hidden_dim": "moe_intermediate_size",
+        "routed_scale": "routed_scaling_factor",
+        "num_shared_experts": "n_shared_experts",
+        "num_expert_groups": "n_group",
+        "num_expert_groups_per_tok": "topk_group",
+    }
+    moe_config = config.get("moe", {})
+    for old_name, new_name in moe_config_map.items():
+        if old_name in moe_config:
+            value = moe_config.pop(old_name)
+            config[new_name] = value
+
+    config["topk_method"] = None
+    config["norm_topk_prob"] = True
+    config["scoring_func"] = "softmax"
+
+    return config
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from transformers import PretrainedConfig
+
+
+class MLPSpeculatorConfig(PretrainedConfig):
+    model_type = "mlp_speculator"
+
+    attribute_map = {
+        "hidden_size": "emb_dim",
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 32000,
+        emb_dim: int = 4096,
+        inner_dim: int = 0,
+        n_predict: int = 3,
+        top_k_tokens_per_head: list[int] | None = None,
+        n_candidates: int = 5,
+        tie_weights: bool = False,
+        scale_input: bool = False,
+        **kwargs,
+    ):
+        """
+        Initialize an MLPSpeculatorConfig
+
+        Args:
+            vocab_size: int
+                the model vocab size
+            emb_dim: int
+                the model embedding dimension
+            inner_dim: int
+                the inner dimension of the model. If 0, will be the emb_dim.
+            n_predict: int
+                the number of lookaheads for the speculator
+            top_k_tokens_per_head: list[int]
+                Number of tokens to consider from each head when forming the
+                candidate tree.
+                For each candidate branch in the tree, head n produces topk[n]
+                additional sub-branches.
+                NOTE: This parameter is currently unused.
+            n_candidates: int
+                number of child candidates to create per sequence
+            tie_weights: bool
+                If true, use a single set of weights for every model
+                head/stage after the first. The initial projection
+                from the base model may have a different size, so that
+                stays separate.
+            scale_input: bool
+                if True, will scale the initial hidden states from
+                the base model.
+        """
+        if top_k_tokens_per_head is None:
+            top_k_tokens_per_head = [5, 4, 3]
+        assert len(top_k_tokens_per_head) == n_predict
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.inner_dim = inner_dim
+        self.n_predict = n_predict
+        self.top_k_tokens_per_head = top_k_tokens_per_head
+        self.n_candidates = n_candidates
+        self.num_lookahead_tokens = n_predict
+        self.tie_weights = tie_weights
+        self.scale_input = scale_input
+
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/moonvit.py
+++ b/vllm/transformers_utils/configs/moonvit.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -0,0 +1,220 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Nemotron model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronModel`]. It is used to instantiate a Nemotron model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Nemotron-8B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be
+    used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Nemotron model. Defines the number of
+            different tokens that can be represented by the
+            `inputs_ids` passed when calling [`NemotronModel`]
+        hidden_size (`int`, *optional*, defaults to 6144):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        head_dim (`int`, *optional*):
+            Projection weights dimension in multi-head attention. Set to
+            hidden_size // num_attention_heads if None
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention
+            (MQA) otherwise GQA is used. When converting a multi-head
+            checkpoint to a GQA checkpoint, each group key and value
+            head should be constructed by meanpooling all the original
+            heads within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
+            is not specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the
+            decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        initializer_range (`float`, *optional*, defaults to 0.0134):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_parameters (`dict`, *optional*):
+            The parameters of the RoPE embeddings. Expected contents:
+                `rope_theta` (`float`): The base period of the RoPE embeddings.
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear',
+                    'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
+                    original RoPE implementation.
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
+                    Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output
+            projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj and down_proj layers in the MLP
+            layers.
+
+    ```python
+    >>> from transformers import NemotronModel, NemotronConfig
+    >>> # Initializing a Nemotron nemotron-15b style configuration
+    >>> configuration = NemotronConfig()
+    >>> # Initializing a model from the nemotron-15b style configuration
+    >>> model = NemotronModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "nemotron"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=6144,
+        intermediate_size=24576,
+        num_hidden_layers=32,
+        num_attention_heads=48,
+        head_dim=None,
+        num_key_value_heads=None,
+        hidden_act="relu2",
+        max_position_embeddings=4096,
+        initializer_range=0.0134,
+        norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=2,
+        eos_token_id=3,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        head_dim = head_dim or kwargs.get("kv_channels")
+        self.head_dim = (
+            head_dim if head_dim is not None else (hidden_size // num_attention_heads)
+        )
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 10000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        # for backward compatibility
+        partial_rotary_factor = (
+            kwargs.get("rope_percent")
+            or kwargs.get("rope_percentage")
+            or kwargs.get("partial_rotary_factor")
+            or 0.5
+        )
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+        self.rope_parameters = rope_parameters
+        self._rope_parameters_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_parameters_validation(self):
+        """
+        Validate the `rope_parameters` configuration.
+        """
+        if self.rope_parameters is None:
+            return
+
+        rope_type: str | None = self.rope_parameters.get("rope_type", None)
+        factor: float | None = self.rope_parameters.get("factor", None)
+
+        if rope_type not in {"default", "linear", "dynamic"}:
+            raise ValueError(
+                "`rope_type` must be one of ['default', 'linear', 'dynamic'], "
+                f"got {rope_type}"
+            )
+        if rope_type != "default":
+            if factor is None:
+                raise ValueError(
+                    "If `rope_type` is not 'default', `rope_parameters` "
+                    "must include a `factor` field. Got `None`."
+                )
+            if not isinstance(factor, float) or factor <= 1.0:
+                raise ValueError(
+                    "`rope_parameters`'s factor field must be a float > 1, got "
+                    f"{factor}"
+                )
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -0,0 +1,287 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has an output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        mtp_hybrid_override_pattern (`str`, *optional*, defaults to `"*E"`):
+            The pattern of the MTP layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        mtp_hybrid_override_pattern="*E",
+        num_attention_heads=32,
+        head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  #  Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        n_routed_experts=8,
+        n_shared_experts=1,
+        moe_intermediate_size=7688,
+        moe_shared_expert_intermediate_size=7688,
+        moe_latent_size=None,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        norm_topk_prob=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.mtp_hybrid_override_pattern = mtp_hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
+            "hybrid_override_pattern must have same length as num_hidden_layers"
+        )
+        assert re.match(r"^[*-ME]+$", self.hybrid_override_pattern), (
+            "hybrid_override_pattern must only contain characters 'M', '*', '-', or 'E'"
+        )
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
+        self.moe_latent_size = moe_latent_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.norm_topk_prob = norm_topk_prob
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return [
+            "mamba"
+            if self.hybrid_override_pattern[i] == "M"
+            else "attention"
+            if self.hybrid_override_pattern[i] == "*"
+            else "mlp"
+            if self.hybrid_override_pattern[i] == "-"
+            else "moe"
+            for i in range(self.num_hidden_layers)
+        ]
--- a/vllm/transformers_utils/configs/olmo3.py
+++ b/vllm/transformers_utils/configs/olmo3.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Olmo3Config(PretrainedConfig):
+    model_type = "olmo3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        sliding_window=4096,
+        layer_types=None,
+        **kwargs,
+    ):
+        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
+        # in vLLM.
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["Olmo2ForCausalLM"]
+        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
+            kwargs["architectures"].remove("Olmo3ForCausalLM")
+            kwargs["architectures"].append("Olmo2ForCausalLM")
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 10000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        self.rms_norm_eps = rms_norm_eps
+
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa: E501
+# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
+# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
+# Ovis Config with AimV2 config registration removed for Transformers compatibility
+from typing import Any
+
+from transformers import AutoConfig, PretrainedConfig
+
+
+class AIMv2Config(PretrainedConfig):
+    """This is the configuration class to store the configuration of an [`AIMv2Model`].
+    Instantiating a configuration with the defaults will yield a similar configuration
+    to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
+    Args:
+        hidden_size: Dimension of the hidden representations.
+        intermediate_size: Dimension of the SwiGLU representations.
+        num_hidden_layers: Number of hidden layers in the Transformer.
+        num_attention_heads: Number of attention heads for each attention layer
+            in the Transformer.
+        num_channels: Number of input channels.
+        image_size: Image size.
+        patch_size: Patch size.
+        rms_norm_eps: Epsilon value used for the RMS normalization layer.
+        attention_dropout: Dropout ratio for attention probabilities.
+        projection_dropout: Dropout ratio for the projection layer after the attention.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        use_bias: Whether to add a bias in the feed-forward and projection layers.
+        kwargs: Keyword arguments for the [`PretrainedConfig`].
+    """
+
+    model_type: str = "aimv2"
+
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        intermediate_size: int = 2816,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 8,
+        num_channels: int = 3,
+        image_size: int = 224,
+        patch_size: int = 14,
+        rms_norm_eps: float = 1e-5,
+        attention_dropout: float = 0.0,
+        projection_dropout: float = 0.0,
+        qkv_bias: bool = False,
+        use_bias: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.rms_norm_eps = rms_norm_eps
+
+        self.projection_dropout = projection_dropout
+        self.qkv_bias = qkv_bias
+        self.use_bias = use_bias
+
+
+# ----------------------------------------------------------------------
+#                     Visual Tokenizer Configuration
+# ----------------------------------------------------------------------
+class BaseVisualTokenizerConfig(PretrainedConfig):
+    def __init__(
+        self,
+        vocab_size=16384,
+        tokenize_function="softmax",
+        tau=1.0,
+        depths=None,
+        drop_cls_token=False,
+        backbone_config: PretrainedConfig | dict | None = None,
+        hidden_stride: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.tokenize_function = tokenize_function
+        self.tau = tau
+        if isinstance(depths, str):
+            depths = [int(x) for x in depths.split("|")]
+        self.depths = depths
+        self.backbone_kwargs = dict[str, Any]()
+        self.drop_cls_token = drop_cls_token
+        if backbone_config is not None:
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
+                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            )
+            if not isinstance(backbone_config, PretrainedConfig):
+                model_type = backbone_config["model_type"]
+                if model_type != "aimv2":
+                    backbone_config.pop("model_type")
+                    backbone_config = AutoConfig.for_model(
+                        model_type, **backbone_config
+                    )
+                else:
+                    backbone_config = AIMv2Config(**backbone_config)
+        self.backbone_config = backbone_config
+        self.hidden_stride = hidden_stride
+
+
+class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
+    model_type = "aimv2_visual_tokenizer"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.drop_cls_token:
+            self.drop_cls_token = False
+        if self.depths:
+            assert len(self.depths) == 1
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
+
+
+class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
+    model_type = "siglip_visual_tokenizer"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.drop_cls_token:
+            self.drop_cls_token = False
+        if self.depths:
+            assert len(self.depths) == 1
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
+
+
+AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
+AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
+
+
+# ----------------------------------------------------------------------
+#                           Ovis Configuration
+# ----------------------------------------------------------------------
+class OvisConfig(PretrainedConfig):
+    model_type = "ovis"
+
+    def __init__(
+        self,
+        llm_config: PretrainedConfig | dict | None = None,
+        visual_tokenizer_config: PretrainedConfig | dict | None = None,
+        multimodal_max_length=8192,
+        hidden_size=None,
+        conversation_formatter_class=None,
+        llm_attn_implementation=None,
+        disable_tie_weight=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if llm_config is not None:
+            assert isinstance(llm_config, (PretrainedConfig, dict)), (
+                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            )
+            if not isinstance(llm_config, PretrainedConfig):
+                model_type = llm_config["model_type"]
+                llm_config.pop("model_type")
+                llm_config = AutoConfig.for_model(model_type, **llm_config)
+
+        # map llm_config to text_config
+        self.text_config = llm_config
+        if visual_tokenizer_config is not None:
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
+                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            )
+            if not isinstance(visual_tokenizer_config, PretrainedConfig):
+                model_type = visual_tokenizer_config["model_type"]
+                visual_tokenizer_config.pop("model_type")
+                visual_tokenizer_config = AutoConfig.for_model(
+                    model_type, **visual_tokenizer_config
+                )
+
+        self.visual_tokenizer_config = visual_tokenizer_config
+        self.multimodal_max_length = multimodal_max_length
+        self.hidden_size = hidden_size
+        self.conversation_formatter_class = conversation_formatter_class
+        self.llm_attn_implementation = llm_attn_implementation
+        self.disable_tie_weight = disable_tie_weight
--- a/vllm/transformers_utils/configs/qwen3_5.py
+++ b/vllm/transformers_utils/configs/qwen3_5.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3.5 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+
+
+class Qwen3_5TextConfig(PretrainedConfig):
+    model_type = "qwen3_5_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=248320,
+        hidden_size=4096,
+        intermediate_size=12288,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        layer_types=None,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
+        **kwargs,
+    ):
+        kwargs["ignore_keys_at_rope_validation"] = [
+            "mrope_section",
+            "mrope_interleaved",
+        ]
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+        self.rope_parameters = rope_parameters
+        kwargs.setdefault("partial_rotary_factor", 0.25)
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            interval_pattern = kwargs.get("full_attention_interval", 4)
+            self.layer_types = [
+                "linear_attention"
+                if bool((i + 1) % interval_pattern)
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+
+        # linear attention part
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+        super().__init__(**kwargs)
+        # Set these AFTER super().__init__() because transformers v4's
+        # PretrainedConfig.__init__ has these as explicit params with different
+        # defaults (e.g. tie_word_embeddings=True) that would overwrite our values.
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+
+
+class Qwen3_5VisionConfig(PretrainedConfig):
+    model_type = "qwen3_5"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+
+
+class Qwen3_5Config(PretrainedConfig):
+    model_type = "qwen3_5"
+    sub_configs = {
+        "vision_config": Qwen3_5VisionConfig,
+        "text_config": Qwen3_5TextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=248056,
+        video_token_id=248057,
+        vision_start_token_id=248053,
+        vision_end_token_id=248054,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs)
+        # Set after super().__init__() to avoid v4 PretrainedConfig overwrite
+        self.tie_word_embeddings = tie_word_embeddings
+
+
+__all__ = ["Qwen3_5Config", "Qwen3_5TextConfig"]
--- a/vllm/transformers_utils/configs/qwen3_5_moe.py
+++ b/vllm/transformers_utils/configs/qwen3_5_moe.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3.5-MoE model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+
+
+class Qwen3_5MoeTextConfig(PretrainedConfig):
+    model_type = "qwen3_5_moe_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
+        "layers.*.mlp.shared_expert.gate_proj": "colwise",
+        "layers.*.mlp.shared_expert.up_proj": "colwise",
+        "layers.*.mlp.shared_expert.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=248320,
+        hidden_size=2048,
+        num_hidden_layers=40,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=8,
+        num_experts=256,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        layer_types=None,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
+        **kwargs,
+    ):
+        kwargs["ignore_keys_at_rope_validation"] = [
+            "mrope_section",
+            "mrope_interleaved",
+        ]
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+        self.rope_parameters = rope_parameters
+        kwargs.setdefault("partial_rotary_factor", 0.25)
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            interval_pattern = kwargs.get("full_attention_interval", 4)
+            self.layer_types = [
+                "linear_attention"
+                if bool((i + 1) % interval_pattern)
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+
+        # linear attention part
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        super().__init__(**kwargs)
+        # Set these AFTER super().__init__() because transformers v4's
+        # PretrainedConfig.__init__ has these as explicit params with different
+        # defaults (e.g. tie_word_embeddings=True) that would overwrite our values.
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+
+
+class Qwen3_5MoeVisionConfig(PretrainedConfig):
+    model_type = "qwen3_5_moe"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+
+
+class Qwen3_5MoeConfig(PretrainedConfig):
+    model_type = "qwen3_5_moe"
+    sub_configs = {
+        "vision_config": Qwen3_5MoeVisionConfig,
+        "text_config": Qwen3_5MoeTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=248056,
+        video_token_id=248057,
+        vision_start_token_id=248053,
+        vision_end_token_id=248054,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs)
+        # Set after super().__init__() to avoid v4 PretrainedConfig overwrite
+        self.tie_word_embeddings = tie_word_embeddings
+
+
+__all__ = ["Qwen3_5MoeConfig", "Qwen3_5MoeTextConfig"]
--- a/vllm/transformers_utils/configs/qwen3_asr.py
+++ b/vllm/transformers_utils/configs/qwen3_asr.py
@@ -0,0 +1,436 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa
+# mypy: ignore-errors
+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3ASRAudioEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRAudioEncoder`]. It is used to instantiate a
+    Qwen3-ASR audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the Qwen2-Audio
+    architecture.
+
+    e.g. [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_mel_bins (`int`, *optional*, defaults to 128):
+            Number of mel features used per input features. Should correspond to the value used in the
+            `Qwen3ASRProcessor` class.
+        encoder_layers (`int`, *optional*, defaults to 32):
+            Number of encoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 5120):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        d_model (`int`, *optional*, defaults to 1280):
+            Dimensionality of the layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        max_source_positions (`int`, *optional*, defaults to 1500):
+            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
+        n_window (`int`, *optional*, defaults to 100):
+            The chunk for conv and flash attn in AudioEncoder.
+        output_dim (`int`, *optional*, defaults to 3584):
+            The output dimension of AudioEncoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import Qwen3ASRAudioEncoderConfig, Qwen3ASRAudioEncoder
+
+    >>> # Initializing a Qwen3ASRAudioEncoderConfig
+    >>> configuration = Qwen3ASRAudioEncoderConfig()
+
+    >>> # Initializing a Qwen3ASRAudioEncoder (with random weights)
+    >>> model = Qwen3ASRAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_audio_encoder"
+
+    def __init__(
+        self,
+        num_mel_bins=128,
+        encoder_layers=32,
+        encoder_attention_heads=20,
+        encoder_ffn_dim=5120,
+        d_model=1280,
+        dropout=0,
+        attention_dropout=0,
+        activation_function="gelu",
+        activation_dropout=0,
+        scale_embedding=False,
+        initializer_range=0.02,
+        max_source_positions=1500,
+        n_window=100,
+        output_dim=3584,
+        n_window_infer=400,
+        conv_chunksize=500,
+        downsample_hidden_size=480,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_function = activation_function
+        self.activation_dropout = activation_dropout
+        self.num_hidden_layers = encoder_layers
+        self.initializer_range = initializer_range
+        self.scale_embedding = (
+            scale_embedding  # scale factor will be sqrt(d_model) if True
+        )
+        self.max_source_positions = max_source_positions
+        self.n_window = n_window
+        self.output_dim = output_dim
+        self.n_window_infer = n_window_infer
+        self.conv_chunksize = conv_chunksize
+        self.downsample_hidden_size = downsample_hidden_size
+
+
+class Qwen3ASRTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRTextModel`]. It is used to instantiate a
+    Qwen3-ASR model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-ASR-1.7B [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3ASR model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen3ASRModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen3ASRTextModel, Qwen3ASRTextConfig
+
+    >>> # Initializing a Qwen3ASR style configuration
+    >>> configuration = Qwen3ASRTextConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-7B style configuration
+    >>> model = Qwen3ASRTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_text"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3ASRThinkerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3ASRThinker`]. It is used to instantiate a
+    Qwen3-ASR-Thinker model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the thinker component of the Qwen3-Omni
+    architecture.
+
+    e.g. [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        audio_config (`dict`, *optional*):
+            The config dictionary of the audio backbone.
+        text_config (`dict`, *optional*):
+            The config dictionary of the text backbone.
+        audio_token_id (`int`, *optional*, defaults to 151646):
+            The audio token id to encode the audio prompt.
+        audio_start_token_id (`int`, *optional*, defaults to 151647):
+            The audio start token id to encode the audio prompt.
+        user_token_id (`int`, *optional*, defaults to 872):
+            The user token id to encode the user token.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import Qwen3ASRThinkerModel, Qwen3ASRThinkerConfig
+
+    >>> # Initializing a default Qwen3ASRThinkerConfig
+    >>> configuration = Qwen3ASRThinkerConfig()
+
+    >>> # Initializing a model (with random weights) from the default configuration
+    >>> model = Qwen3ASRThinkerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr_thinker"
+
+    attribute_map = {}
+    sub_configs = {
+        "audio_config": Qwen3ASRAudioEncoderConfig,
+        "text_config": Qwen3ASRTextConfig,
+    }
+
+    def __init__(
+        self,
+        audio_config=None,
+        text_config=None,
+        audio_token_id=151646,
+        audio_start_token_id=151647,
+        user_token_id=872,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.user_token_id = user_token_id
+        self.audio_start_token_id = audio_start_token_id
+        self.initializer_range = initializer_range
+
+        if isinstance(audio_config, dict):
+            audio_config = Qwen3ASRAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Qwen3ASRAudioEncoderConfig()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config = Qwen3ASRTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Qwen3ASRTextConfig()
+        self.text_config = text_config
+        self.audio_token_id = audio_token_id
+
+
+class Qwen3ASRConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`Qwen3ASRForConditionalGeneration`]. It is used to instantiate a Qwen3ASR
+    model according to the specified sub-models configurations, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    [Qwen/Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        thinker_config (`dict`, *optional*): Configuration of the underlying thinker sub-model.
+        support_languages (`List[str]`, *optional*): The languages supported by the model.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     Qwen3ASRThinkerConfig,
+    ...     Qwen3ASRForConditionalGeneration,
+    ...     Qwen3ASRConfig,
+    ... )
+
+    >>> # Initializing a Qwen3ASR style configuration
+    >>> configuration = Qwen3ASRConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = Qwen3ASRForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_asr"
+    sub_configs = {
+        "thinker_config": Qwen3ASRThinkerConfig,
+    }
+
+    def __init__(
+        self,
+        thinker_config=None,
+        support_languages=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if thinker_config is None:
+            thinker_config = {}
+            logger.info(
+                "thinker_config is None. Initializing thinker model with default values"
+            )
+
+        self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config)
+        self.support_languages = support_languages
+
+    def get_text_config(self, decoder=False) -> "PretrainedConfig":
+        """
+        Returns the config that is meant to be used with text IO. On most models, it is the original config instance
+        itself. On specific composite models, it is under a set of valid names.
+
+        Args:
+            decoder (`Optional[bool]`, *optional*, defaults to `False`):
+                If set to `True`, then only search for decoder config names.
+        """
+        # Overridden for deeply nested config like Qwen2.5-Omni. We don't have any omni model
+        # except for Qwen yet. This has to be generalized if more deeply nested configs are
+        # added. NOTE: currently method used only by vLLM
+        return self.thinker_config.get_text_config()
+
+
+__all__ = ["Qwen3ASRConfig", "Qwen3ASRThinkerConfig", "Qwen3ASRAudioEncoderConfig"]
--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -0,0 +1,277 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3-Next model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3NextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
+    Qwen3-Next model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_parameters (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_theta` (`float`): The base period of the RoPE embeddings.
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.25):
+                    Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+        linear_num_key_heads (`int`, *optional*, defaults to 16):
+            Number of key heads used in linear attention layers.
+        linear_num_value_heads (`int`, *optional*, defaults to 32):
+            Number of value heads used in linear attention layers.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the routed expert.
+        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the shared expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 10):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 512):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        layer_types (`list[str]`, *optional*):
+            Types of each layer (attention or linear).
+
+    ```python
+    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
+
+    >>> # Initializing a Qwen3Next style configuration
+    >>> configuration =  Qwen3NextConfig()
+
+    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
+    >>> model = Qwen3NextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """  # noqa: E501
+
+    model_type = "qwen3_next"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=48,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        decoder_sparse_step=1,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=10,
+        num_experts=512,
+        norm_topk_prob=True,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        layer_types=None,
+        **kwargs,
+    ):
+        if mlp_only_layers is None:
+            mlp_only_layers = []
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 10000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+        self.rope_parameters = rope_parameters
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "linear_attention" if bool((i + 1) % 4) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        # linear attention part
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = mlp_only_layers
+
+
+__all__ = ["Qwen3NextConfig"]
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Radio vision model configuration"""
+
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
+    "vit_small_patch16_224": (384, 12, 6, 1536),
+    "vit_base_patch16_224": (768, 12, 12, 3072),
+    "vit_large_patch16_224": (1024, 24, 16, 4096),
+    "vit_huge_patch16_224": (1280, 32, 16, 5120),
+}
+
+OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+class RadioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a Radio
+    vision model. It is used to instantiate a Radio model according to the
+    specified arguments, defining the model architecture.
+
+    Args:
+        model_name: Name of the vision transformer model
+            (e.g., "vit_base_patch16_224"). Used to determine architecture
+            dimensions from `VIT_TIMM_DIM_BY_NAME`.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        cpe_max_size: Maximum image size for position embeddings.
+        norm_mean: Mean values for image normalization (RGB channels).
+            Defaults to (0.48145466, 0.4578275, 0.40821073)).
+        norm_std: Standard deviation values for image normalization
+            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
+        register_multiple: Number of register tokens to use.
+        teachers: A list of teacher model configurations. Each teacher configuration is
+            a dict with keys like "name" and some may have "use_summary".
+        cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
+    """
+
+    model_type = "radio"
+
+    def __init__(
+        self,
+        model_name: str,
+        image_size: int = 224,
+        patch_size: int = 16,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        norm_type: str = "layer_norm",
+        layer_norm_eps: float = 1e-6,
+        initializer_factor: float = 1.0,
+        hidden_act: str = "gelu",
+        cpe_max_size: int = 2048,
+        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
+        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
+        register_multiple: int | None = None,
+        teachers: list[dict[str, Any]] | None = None,
+        cls_token_per_teacher: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        (
+            self.hidden_size,
+            self.num_hidden_layers,
+            self.num_attention_heads,
+            self.intermediate_size,
+        ) = VIT_TIMM_DIM_BY_NAME[model_name]
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.norm_type = norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_factor = initializer_factor
+        self.hidden_act = hidden_act
+        self.cpe_max_size = cpe_max_size
+        self.norm_mean = (
+            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
+        )
+        self.norm_std = (
+            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
+        )
+        self.register_multiple = register_multiple
+        self.teachers = teachers if teachers is not None else []
+        self.cls_token_per_teacher = cls_token_per_teacher
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/speculators/init.py
+++ b/vllm/transformers_utils/configs/speculators/init.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--- a/vllm/transformers_utils/configs/speculators/algos.py
+++ b/vllm/transformers_utils/configs/speculators/algos.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+SUPPORTED_SPECULATORS_TYPES = {}
+
+
+def register_speculator(name):
+    def decorator(fn):
+        SUPPORTED_SPECULATORS_TYPES[name] = fn
+        return fn
+
+    return decorator
+
+
+@register_speculator("eagle3")
+def update_eagle3(config_dict: dict, pre_trained_config: dict) -> None:
+    """
+    Apply Eagle-3 specific configuration transformations to the `dict` used to
+    construct the Transformers PreTrainedConfig.
+
+    Eagle-3 specific fields:
+    - draft_vocab_size: Size of the draft model's vocabulary
+    - target_hidden_size: Hidden size of the target model
+    - norm_before_residual: Whether to apply norm before residual connection
+    - eagle_aux_hidden_state_layer_ids: List of layer indices from the base
+        model to use as auxiliary inputs for the Eagle3 drafter. These layers
+        provide intermediate hidden states that help the drafter make better
+        predictions. This is the standard field used in Eagle3 checkpoints.
+    """
+
+    pre_trained_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
+    if config_dict.get("target_hidden_size") is not None:
+        pre_trained_config["target_hidden_size"] = config_dict["target_hidden_size"]
+    pre_trained_config["norm_before_residual"] = config_dict.get(
+        "norm_before_residual", True
+    )
+    pre_trained_config["architectures"] = ["Eagle3LlamaForCausalLM"]
+    if config_dict.get("eagle_aux_hidden_state_layer_ids"):
+        pre_trained_config["eagle_aux_hidden_state_layer_ids"] = config_dict[
+            "eagle_aux_hidden_state_layer_ids"
+        ]
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import Any
+
+from transformers import PretrainedConfig
+
+from vllm.transformers_utils.configs.speculators.algos import (
+    SUPPORTED_SPECULATORS_TYPES,
+)
+
+__all__ = ["SpeculatorsConfig"]
+
+
+class SpeculatorsConfig(PretrainedConfig):
+    model_type = "speculators"
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str | os.PathLike,
+        **kwargs,
+    ) -> "SpeculatorsConfig":
+        """Load speculators Eagle config and convert to vLLM format."""
+        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        vllm_config = cls.extract_transformers_pre_trained_config(config_dict)
+        return cls(**vllm_config)
+
+    @classmethod
+    def extract_transformers_pre_trained_config(
+        cls, config_dict: dict[str, Any]
+    ) -> dict[str, Any]:
+        """
+        Extract standard Transformers PreTrainedConfig config from speculators config.
+        """
+        speculators_model_type = config_dict.get("speculators_model_type")
+        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
+            raise ValueError(
+                f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
+                "Please ensure you're loading a speculators-format model."
+            )
+
+        # Start with transformer layer configuration if present
+        pre_trained_config = config_dict.get("transformer_layer_config", {})
+        # Apply anything specific to the supported algorithm
+        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
+        algo_updater(config_dict=config_dict, pre_trained_config=pre_trained_config)
+        return pre_trained_config
+
+    @classmethod
+    def extract_vllm_speculative_config(
+        cls, config_dict: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Extract vLLM speculative config from speculators config."""
+        # validate fields
+        # TODO: @dsikka - use speculators pydantic model to validate
+        cls.validate_speculators_config(config_dict=config_dict)
+        # Convert from speculators config -> format that can be ingested by vLLM
+        return cls.build_vllm_speculative_config(config_dict=config_dict)
+
+    @classmethod
+    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
+        try:
+            spec_config = config_dict["speculators_config"]
+            methods = spec_config["proposal_methods"]
+            first_method = methods[0]
+            _ = first_method["speculative_tokens"]
+            _ = spec_config["verifier"]["name_or_path"]
+            _ = config_dict["speculators_model_type"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise ValueError("Invalid speculators config structure") from e
+
+        if "transformer_layer_config" not in config_dict:
+            raise ValueError("Must provide transformer_layer_config")
+
+        if not isinstance(config_dict["transformer_layer_config"], dict):
+            raise TypeError(
+                "'transformer_layer_config' must be a dictionary if provided"
+            )
+
+    @classmethod
+    def build_vllm_speculative_config(
+        cls, config_dict: dict[str, Any]
+    ) -> dict[str, Any]:
+        """
+        Build vLLM-compatible speculative configuration from speculators format.
+
+        This method extracts and transforms speculative configuration from the
+        speculators format into the structure expected by vLLM.
+
+        Args:
+            config_dict: Configuration dictionary in speculators format
+
+        Returns:
+            Dictionary with vLLM-compatible speculative configuration
+        """
+        # Extract speculators configuration
+        spec_config = config_dict["speculators_config"]
+
+        # Currently we only support one proposal method
+        proposal_methods = spec_config.get("proposal_methods")
+        if not proposal_methods:
+            raise ValueError("No proposal methods found in speculators config")
+
+        first_method = proposal_methods[0]
+        num_speculative_tokens = first_method.get("speculative_tokens")
+
+        if num_speculative_tokens is None:
+            raise ValueError(
+                f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
+            )
+
+        # Build base vLLM speculative configuration
+        return {
+            "method": config_dict.get("speculators_model_type"),
+            "num_speculative_tokens": num_speculative_tokens,
+        }
--- a/vllm/transformers_utils/configs/step3_vl.py
+++ b/vllm/transformers_utils/configs/step3_vl.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_parameters: dict[str, Any] | None = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int, ...] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
+        rope_theta = kwargs.pop("rope_theta", 500000.0)
+        if "rope_theta" not in rope_parameters:
+            rope_parameters["rope_theta"] = rope_theta
+        self.rope_parameters = rope_parameters
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: dict | Step3VisionEncoderConfig | None = None,
+        text_config: dict | Step3TextConfig | None = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/step3p5.py
+++ b/vllm/transformers_utils/configs/step3p5.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3p5Config(PretrainedConfig):
+    model_type = "step3p5"
+
+    def __init__(
+        self,
+        hidden_size: int = 5120,
+        intermediate_size: int = 13312,
+        num_attention_heads: int = 40,
+        num_attention_groups: int = 8,
+        num_hidden_layers: int = 48,
+        max_seq_len: int = 4096,
+        vocab_size: int = 65536,
+        rms_norm_eps: float = 1e-5,
+        moe_every_n_layer: int = 2,
+        use_moe: bool = False,
+        moe_intermediate_size: int = 10240,
+        moe_num_experts: int = 16,
+        moe_top_k: int = 4,
+        moe_layer_offset: int = 0,
+        rope_theta: float | list[float] | None = 500000,
+        rope_scaling: dict[str, Any] | None = None,
+        head_dim: int | None = None,
+        share_expert_dim: int | None = None,
+        norm_expert_weight: bool = True,
+        bos_token_id: list[int] | int | None = None,
+        eos_token_id: list[int] | int | None = None,
+        moe_router_activation: str = "softmax",
+        moe_router_scaling_factor: float = 1.0,
+        att_impl_type: str = "GQA",
+        use_head_wise_attn_gate: bool = False,
+        use_moe_router_bias: bool = True,
+        need_fp32_gate: bool = True,
+        layer_types: list[str] | None = None,
+        use_rope_layers: list[bool] | None = None,
+        yarn_only_types: list[str] | None = None,
+        attention_other_setting: dict[str, Any] | None = None,
+        num_nextn_predict_layers: int = 0,
+        swiglu_limits: list[float] | None = None,
+        swiglu_limits_shared: list[float] | None = None,
+        max_position_embeddings: int | None = None,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.use_moe = use_moe
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_every_n_layer = moe_every_n_layer
+        self.moe_num_experts = moe_num_experts
+        self.num_experts_per_tok = moe_top_k
+        self.moe_top_k = moe_top_k
+        self.moe_layer_offset = moe_layer_offset
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.head_dim = head_dim
+        if share_expert_dim is None:
+            self.share_expert_dim = self.moe_intermediate_size * self.moe_top_k
+        else:
+            self.share_expert_dim = share_expert_dim
+        self.norm_expert_weight = norm_expert_weight
+
+        self.max_position_embeddings = max_position_embeddings
+        self.moe_router_activation = moe_router_activation
+        self.moe_router_scaling_factor = moe_router_scaling_factor
+        self.use_moe_router_bias = use_moe_router_bias
+        self.need_fp32_gate = need_fp32_gate
+
+        self.att_impl_type = att_impl_type
+        self.use_head_wise_attn_gate = use_head_wise_attn_gate
+        self.layer_types = layer_types
+        self.use_rope_layers = use_rope_layers
+        self.yarn_only_types = yarn_only_types
+        self.attention_other_setting = attention_other_setting
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.swiglu_limits = swiglu_limits
+        self.swiglu_limits_shared = swiglu_limits_shared
+
+        resolved_bos_token_id = 1 if bos_token_id is None else bos_token_id
+        resolved_eos_token_id = [2, 3] if eos_token_id is None else eos_token_id
+        self.bos_token_id = resolved_bos_token_id
+        self.eos_token_id = resolved_eos_token_id
+
+        super().__init__(
+            bos_token_id=resolved_bos_token_id,
+            eos_token_id=resolved_eos_token_id,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/configs/tarsier2.py
+++ b/vllm/transformers_utils/configs/tarsier2.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import Qwen2VLConfig
+
+
+class Tarsier2Config(Qwen2VLConfig):
+    """
+    Tarsier2's config.json is written such that AutoConfig.from_pretrained will create
+    a deeply nested config consisting of:
+
+    - LlavaConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+
+    When it should really just be a single Qwen2VLConfig.
+
+    This class is a hack to stop AutoConfig from creating the nested config structure.
+    """
+
+    model_type = "tarsier2"
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
+from typing import Any
+
+import transformers
+
+
+class UltravoxConfig(transformers.PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
+    Ultravox model according to the specified arguments, defining the model
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+    Args:
+        audio_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom audio config or dict.
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        audio_model_id (`str`, *optional*):
+            The model ID of the audio backbone.
+        text_model_id (`str`, *optional*):
+            The model ID of the text backbone.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        audio_token_index (`int`, *optional*, defaults to 32000):
+            The audio token index to encode the audio prompt.
+        stack_factor (`int`, *optional*, defaults to 8):
+            Audio downsampling factor for the multimodal projector.
+        norm_init (`float`, *optional*, defaults to 0.4):
+            The initialization value for the layer normalization.
+        projector_act (`str`, *optional*, defaults to `"swiglu"`):
+            The activation function used by the multimodal projector.
+        projector_ln_mid (`bool`, *optional*, defaults to `False`):
+            Whether to apply layer normalization at the middle of the
+            projector or at the end. Versions v0.4.1 and below
+            use `False`, but v0.5 and above use `True`.
+    """
+
+    wrapped_model_config: transformers.PretrainedConfig
+    model_type = "ultravox"
+    audio_token = "<|audio|>"
+    is_composition = False
+
+    def __init__(
+        self,
+        audio_config: dict[str, Any] | None = None,
+        text_config: dict[str, Any] | None = None,
+        audio_model_id: str | None = None,
+        text_model_id: str | None = None,
+        ignore_index: int = -100,
+        audio_token_index: int = 32000,
+        hidden_size: int = 4096,
+        stack_factor: int = 8,
+        norm_init: float = 0.4,
+        projector_act: str = "swiglu",
+        projector_ln_mid: bool = False,
+        num_projector_layers: int = 0,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.audio_token_index = audio_token_index
+
+        self.hidden_size = hidden_size
+        self.stack_factor = stack_factor
+        self.norm_init = norm_init
+        self.projector_act = projector_act
+        self.projector_ln_mid = projector_ln_mid
+        self.num_projector_layers = num_projector_layers
+
+        # N.B. May set the wrapped_model_config below.
+        self.text_model_id = text_model_id
+        if text_model_id is None:
+            text_config = text_config or {}
+            self.wrapped_model_config = transformers.CONFIG_MAPPING[
+                text_config.get("model_type", "llama")
+            ](**text_config)
+
+        # N.B. May set the audio_config below.
+        self.audio_model_id = audio_model_id
+        if audio_model_id is None:
+            self.audio_model_id = None
+            audio_config = audio_config or {}
+            self.audio_config = transformers.CONFIG_MAPPING[
+                audio_config.get("model_type", "whisper")
+            ](**audio_config)
+
+        super().__init__(**kwargs)
+
+    def __setattr__(self, key, value):
+        # Since --hf-overrides are applied _after_ the UltravoxConfig is
+        # instantiated, load the configs implicitly when assigning text_model_id
+        # or audio_model_id. This allows:
+        #
+        #   --hf-overrides.text_model_id=<quantized variant>
+        #
+        # to behave as intended.
+        if key == "text_model_id" and value is not None:
+            from vllm.transformers_utils.config import get_config
+
+            self.wrapped_model_config = get_config(value, trust_remote_code=False)
+        elif key == "audio_model_id" and value is not None:
+            from vllm.transformers_utils.config import get_config
+
+            self.audio_config = get_config(value, trust_remote_code=False)
+
+        return super().__setattr__(key, value)
+
+    @property
+    def text_config(self) -> transformers.PretrainedConfig:
+        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
+        # the full model, but the text config is the text config of the inner
+        # model.
+        return self.wrapped_model_config.get_text_config()