add qwen3

2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/init.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/init.py
@@ -0,0 +1,38 @@
+from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
+from vllm.transformers_utils.configs.dbrx import DbrxConfig
+from vllm.transformers_utils.configs.eagle import EAGLEConfig
+from vllm.transformers_utils.configs.exaone import ExaoneConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig
+from vllm.transformers_utils.configs.internvl import InternVLChatConfig
+from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.medusa import MedusaConfig
+from vllm.transformers_utils.configs.mllama import MllamaConfig
+from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
+from vllm.transformers_utils.configs.mpt import MPTConfig
+from vllm.transformers_utils.configs.nemotron import NemotronConfig
+from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.solar import SolarConfig
+from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+
+__all__ = [
+    "ChatGLMConfig",
+    "DbrxConfig",
+    "MPTConfig",
+    "RWConfig",
+    "H2OVLChatConfig",
+    "InternVLChatConfig",
+    "JAISConfig",
+    "MedusaConfig",
+    "EAGLEConfig",
+    "ExaoneConfig",
+    "MllamaConfig",
+    "MLPSpeculatorConfig",
+    "NemotronConfig",
+    "NVLM_D_Config",
+    "SolarConfig",
+    "UltravoxConfig",
+]
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/init.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/init.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/chatglm.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/chatglm.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/dbrx.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/dbrx.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/eagle.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/eagle.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/exaone.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/exaone.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/falcon.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/falcon.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/h2ovl.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/h2ovl.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/internvl.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/internvl.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/jais.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/jais.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/medusa.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/medusa.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mllama.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mllama.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mlp_speculator.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mlp_speculator.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mpt.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/mpt.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/nemotron.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/nemotron.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/nvlm_d.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/nvlm_d.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/solar.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/solar.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/ultravox.cpython-310.pyc
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/pycache/ultravox.cpython-310.pyc
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/arctic.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/arctic.py
@@ -0,0 +1,204 @@
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
+""" Arctic model configuration"""
+
+from dataclasses import asdict, dataclass
+from typing import Any, Dict
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
+}
+
+
+@dataclass
+class ArcticLoraConfig:
+    lora_r: int = 64
+    lora_alpha: float = 16
+    shard_base_weights: bool = False
+
+
+@dataclass
+class ArcticQuantizationConfig:
+    q_bits: int = 8
+    rounding: str = "nearest"
+    mantissa_bits: int = 3
+    group_size: int = 128
+
+
+class ArcticConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
+    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ArcticModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
+            allows sequence of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to root per-token, can be also interpreted as the `top-p` routing
+            parameter
+        num_local_experts (`int`, *optional*, defaults to 8):
+            Number of experts per Sparse MLP layer.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import ArcticModel, ArcticConfig
+
+    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
+    >>> configuration = ArcticConfig()
+
+    >>> # Initializing a model from the Arctic 7B style configuration
+    >>> model = ArcticModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "arctic"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=1,
+        num_local_experts=8,
+        router_aux_loss_coef=0.001,
+        moe_layer_frequency=2,
+        parallel_attn_mlp_res=False,
+        moe_train_capacity_factor=1,
+        moe_eval_capacity_factor=1,
+        enable_expert_tensor_parallelism=False,
+        moe_min_capacity=0,
+        moe_token_dropping=True,
+        quantization=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.moe_layer_frequency = moe_layer_frequency
+        self.moe_train_capacity_factor = moe_train_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_token_dropping = moe_token_dropping
+        self.parallel_attn_mlp_res = parallel_attn_mlp_res
+        if isinstance(quantization, dict):
+            self.quantization = ArcticQuantizationConfig(**quantization)
+        else:
+            self.quantization = quantization
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
+        result = super().from_dict(config_dict, **kwargs)
+        config = result[0] if isinstance(result, tuple) else result
+        if isinstance(config.quantization, dict):
+            config.quantization = ArcticQuantizationConfig(**config.quantization)
+        return result
+
+    def to_dict(self) -> Dict[str, Any]:
+        ret = super().to_dict()
+        if isinstance(ret["quantization"], ArcticQuantizationConfig):
+            ret["quantization"] = asdict(ret["quantization"])
+        return ret
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/chatglm.py
@@ -0,0 +1,69 @@
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(self,
+                 num_layers=28,
+                 padded_vocab_size=65024,
+                 hidden_size=4096,
+                 ffn_hidden_size=13696,
+                 kv_channels=128,
+                 num_attention_heads=32,
+                 seq_length=2048,
+                 hidden_dropout=0.0,
+                 attention_dropout=0.0,
+                 layernorm_epsilon=1e-5,
+                 rmsnorm=True,
+                 apply_residual_connection_post_layernorm=False,
+                 post_layer_norm=True,
+                 add_bias_linear=False,
+                 add_qkv_bias=False,
+                 interleaved_qkv=False,
+                 bias_dropout_fusion=True,
+                 multi_query_attention=False,
+                 multi_query_group_num=1,
+                 apply_query_key_layer_scaling=True,
+                 attention_softmax_in_fp32=True,
+                 fp32_residual_connection=False,
+                 quantization_bit=0,
+                 pre_seq_len=None,
+                 prefix_projection=False,
+                 **kwargs):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        # It is to be compatible with long lora.
+        self.max_position_embeddings = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/dbrx.py
@@ -0,0 +1,278 @@
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
+"""Dbrx configuration."""
+
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    [`DbrxAttention`] class. It is used to instantiate attention layers
+    according to the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to None):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (float): The base frequency for rope.
+    """
+
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,
+        kv_n_heads: int = 1,
+        rope_theta: float = 10000.0,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["attn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all configurations of "
+                "models and can yield errors.",
+                config_dict["model_type"], cls.model_type)
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    """Configuration class for Dbrx FFN.
+
+    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+    the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+            The dict should have a key 'name' with the value being the name of
+            the activation function along with any additional keyword arguments.
+        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+            This should only be used for benchmarking purposes.
+    """
+
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        if ffn_act_fn is None:
+            ffn_act_fn = {"name": "silu"}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["ffn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all "
+                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+    """Configuration class for Dbrx.
+
+    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 6144):
+            Dimensionality of the embeddings and hidden states.
+        n_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        max_seq_len (`int`, *optional*, defaults to 32768):
+            The maximum sequence length of the model.
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`DbrxModel`].
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied to the attention output before combining with residual.
+        emb_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the embedding layer.
+        attn_config (`dict`, *optional*):
+            A dictionary used to configure the model's attention module.
+        ffn_config (`dict`, *optional*):
+            A dictionary used to configure the model's FFN module.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+
+    Example:
+    ```python
+    >>> from transformers import DbrxConfig, DbrxModel
+
+    >>> # Initializing a Dbrx configuration
+    >>> configuration = DbrxConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DbrxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dbrx"
+    attribute_map = {
+        "num_attention_heads": "n_heads",
+        "hidden_size": "d_model",
+        "num_hidden_layers": "n_layers",
+        "max_position_embeddings": "max_seq_len",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+        if tie_word_embeddings:
+            raise ValueError(
+                "tie_word_embeddings is not supported for Dbrx models."
+            )
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/eagle.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/eagle.py
@@ -0,0 +1,49 @@
+import os
+from typing import Optional, Union
+
+from transformers import AutoConfig, PretrainedConfig
+
+
+class EAGLEConfig(PretrainedConfig):
+    model_type = "eagle"
+
+    def __init__(self,
+                 model: Union[PretrainedConfig, dict, None] = None,
+                 truncated_vocab_size: Optional[int] = None,
+                 **kwargs):
+
+        model_config = None if model is None else (AutoConfig.for_model(
+            **model) if isinstance(model, dict) else model)
+
+        for k, v in kwargs.items():
+            if k != "architectures" and k != "model_type" and hasattr(
+                    model_config, k):
+                setattr(model_config, k, v)
+
+        self.model = model_config
+
+        if self.model is None:
+            self.truncated_vocab_size = None
+        else:
+            self.truncated_vocab_size = self.model.vocab_size if \
+                truncated_vocab_size is None else truncated_vocab_size
+
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["EAGLEModel"]
+
+        super().__init__(**kwargs)
+
+        if self.model is not None:
+            for k, v in self.model.to_dict().items():
+                if not hasattr(self, k):
+                    setattr(self, k, v)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "EAGLEConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)
+        return cls.from_dict(config_dict, **kwargs)
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/exaone.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/exaone.py
@@ -0,0 +1,189 @@
+# Copied from
+# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
+# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Exaone model configuration"""
+
+from typing import Dict
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
+
+
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:
+    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Exaone
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
+    and can be used to control the model outputs. Read the documentation from :
+    class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+            Vocabulary size of the GPT Lingvo model. Defines the number of
+            different tokens that can be represented by the :obj:`inputs_ids`
+            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
+            size of the model.
+            Defines the different tokens that can be represented by the
+            `inputs_ids` passed to the forward method of :class:
+            `~transformers.EXAONEModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (:obj:`int`, `optional`, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi
+            Head Attention (MHA), if `num_key_value_heads=1 the model will use
+            Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint,
+            each group key and value head should be constructed by meanpooling
+            all the original heads within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
+            specified, will default to `num_attention_heads`.
+        rotary_pct (`float`, *optional*, defaults to 0.25):
+            percentage of hidden dimensions to allocate to rotary embeddings
+        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
+            the Transformer encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`,
+        defaults to :obj:`"gelu_new"`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
+            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probabilitiy for all fully connected layers in the
+            embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            :class:`~transformers.EXAONEModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+            Only relevant if ``config.is_decoder=True``.
+        gradient_checkpointing (:obj:`bool`, `optional`,
+        defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense
+            of slower backward pass.
+        Example::
+
+            >>> from transformers import ExoneModel, ExaoneConfig
+
+            >>> # Initializing a EXAONE configuration
+            >>> configuration = ExaoneConfig()
+
+            >>> # Initializing a model from configuration
+            >>> model = ExoneModel(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
+    """
+
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rotary_pct=0.25,
+        resid_dropout=0.0,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs,
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.resid_dropout = resid_dropout
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rotary_pct = rotary_pct
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
+        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
+        self.use_gated = kwargs.pop("use_gated", False)
+        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
+        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
+        self.rotary_type = kwargs.pop("rotary_type", None)
+        self.scaling_factor = kwargs.pop("scaling_factor", 1)
+        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
+        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
+        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
+        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
+        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
+        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
+                                                 (rotary_pct == 0.25))
+        if self.use_rotary_pos:
+            self.use_absolute_pos = False
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/falcon.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/falcon.py
@@ -0,0 +1,87 @@
+# Adapted from
+# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+from transformers.configuration_utils import PretrainedConfig
+
+
+class RWConfig(PretrainedConfig):
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "num_kv_heads": "n_head_kv",
+    }
+
+    def __init__(
+        self,
+        vocab_size=250880,
+        hidden_size=64,
+        n_layer=2,
+        n_head=8,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        multi_query=True,
+        n_head_kv=None,
+        alibi=False,
+        bias=False,
+        parallel_attn=False,
+        new_decoder_architecture=False,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.multi_query = multi_query
+        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
+        self.alibi = alibi
+        self.bias = bias
+        self.parallel_attn = parallel_attn
+        self.new_decoder_architecture = new_decoder_architecture
+
+        if self.hidden_size == 8192:
+            # Hack for falcon-40b
+            self.new_decoder_architecture = True
+
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
+
+    @property
+    def head_dim(self):
+        return self.hidden_size // self.n_head
+
+    @property
+    def rotary(self):
+        return not self.alibi
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/h2ovl.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/h2ovl.py
@@ -0,0 +1,13 @@
+# Adapted from
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .internvl import InternVLChatConfig
+
+
+class H2OVLChatConfig(InternVLChatConfig):
+    model_type = "h2ovl_chat"
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/internvl.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/internvl.py
@@ -0,0 +1,51 @@
+# Adapted from
+# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2024 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from transformers.configuration_utils import PretrainedConfig
+
+
+class InternVLChatConfig(PretrainedConfig):
+    model_type = 'internvl_chat'
+    is_composition = True
+
+    def __init__(self,
+                 vision_config=None,
+                 llm_config=None,
+                 use_backbone_lora=0,
+                 use_llm_lora=0,
+                 select_layer=-1,
+                 force_image_size=None,
+                 downsample_ratio=0.5,
+                 template=None,
+                 dynamic_image_size=False,
+                 use_thumbnail=False,
+                 ps_version='v1',
+                 min_dynamic_patch=1,
+                 max_dynamic_patch=6,
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+
+        if llm_config is None:
+            llm_config = {}
+
+        self.vision_config = PretrainedConfig(**vision_config)
+        self.text_config = PretrainedConfig(**llm_config)
+
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.ps_version = ps_version  # pixel shuffle version
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/jais.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/jais.py
@@ -0,0 +1,235 @@
+# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2023 Cerebras Systems.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""JAIS configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class JAISConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a
+    [`JAISModel`]. It is used to instantiate a JAIS model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used
+    to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the JAIS model. Defines the number of different
+            tokens that can be represented by the
+            `inputs_ids` passed when calling [`JAISModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used
+            with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set
+            it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list
+            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in
+            the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
+            defaults to `False`):
+            Whether to additionally scale attention weights by
+            `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention
+            (dot-product)
+            and upcast attention dot-product/softmax to float() when training
+            with mixed precision.
+        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
+            Positional embedding can be either `"alibi"` or `"learned"`.
+        mup_width_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale learning rate and initializers. Calculated
+            as (`d_model,0 / d_model`), where
+            `d_model` is the model's width and `d_model,0` is the proxy
+            model's width.
+        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale token and position embeddings.
+        mup_output_alpha (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale output logits
+            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
+        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
+            Scale attention weights by dividing by hidden_size instead of
+            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
+            well.
+        alibi_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for ALiBi
+            embeddings. Currently only supports linear
+            scaling strategy. Can specify either the scaling `factor` (must be
+            a float greater than 1) for fixed scaling
+            or `train_seq_len` for dynamic scaling on input samples with
+            sequence length > `train_seq_len`. The expected
+            formats are `{"type": strategy name, "factor": scaling factor}` or
+            `{"type": strategy name,
+            "train_seq_len": training sequence length}`.
+        architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
+            architecture names for Jais.
+
+    Example:
+
+    ```python
+    >>> from transformers import JAISConfig, JAISModel
+
+    >>> # Initializing a JAIS configuration
+    >>> configuration = JAISConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = JAISModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "jais"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        position_embedding_type="learned",
+        mup_width_scale=1.0,
+        mup_embeddings_scale=1.0,
+        mup_output_alpha=1.0,
+        mup_scale_qk_dot_by_d=False,
+        alibi_scaling=None,
+        architectures=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.position_embedding_type = position_embedding_type
+        self.mup_width_scale = mup_width_scale
+        self.mup_embeddings_scale = mup_embeddings_scale
+        self.mup_output_alpha = mup_output_alpha
+        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
+
+        self.alibi_scaling = alibi_scaling
+        self._alibi_scaling_validation()
+        if architectures is None:
+            architectures = ["JAISLMHeadModel"]
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            architectures=architectures,
+            **kwargs,
+        )
+
+    def _alibi_scaling_validation(self):
+        """
+        Validate the `alibi_scaling` configuration.
+        """
+        if self.alibi_scaling is None:
+            return
+
+        if (not isinstance(self.alibi_scaling, dict)
+                or len(self.alibi_scaling) != 2):
+            raise ValueError(
+                "`alibi_scaling` must be a dictionary with two fields,"
+                "`type` and `factor` or `type` and `train_seq_len`, "
+                f"got {self.alibi_scaling}")
+        alibi_scaling_type = self.alibi_scaling.get("type", None)
+        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
+        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
+        if alibi_scaling_type is None or alibi_scaling_type != "linear":
+            raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
+                             f"got {alibi_scaling_type}")
+        if (alibi_scaling_factor is not None
+                and not isinstance(alibi_scaling_factor, float)
+                or (alibi_scaling_factor is not None
+                    and alibi_scaling_factor <= 1.0)):
+            raise ValueError(
+                f"`alibi_scaling`'s factor field must be a float > 1.0,"
+                f"got {alibi_scaling_factor}")
+        if (alibi_dynamic_scaling is not None
+                and not isinstance(alibi_dynamic_scaling, int)
+                or (alibi_dynamic_scaling is not None
+                    and alibi_dynamic_scaling <= 1)):
+            raise ValueError(
+                f"`alibi_scaling`'s `train_seq_len` field must be an"
+                f"integer > 1, got {alibi_dynamic_scaling}")
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/medusa.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/medusa.py
@@ -0,0 +1,60 @@
+import os
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class MedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(self,
+                 hidden_size: int = 4096,
+                 vocab_size: int = 32001,
+                 num_heads: int = 5,
+                 num_hidden_layers: int = 1,
+                 max_paths: int = 64,
+                 topk: int = 10,
+                 truncated_vocab_size: Optional[int] = None,
+                 **kwargs):
+
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.max_paths = max_paths
+        self.topk = topk
+        self.max_seq_len = int(2**20)
+        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
+            else truncated_vocab_size
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["MedusaModel"]
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "MedusaConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)
+        for k in list(config_dict.keys()):
+            if 'num' in k:
+                if 'heads' in k:
+                    config_dict["num_heads"] = config_dict.pop(k)
+                elif 'layers' in k:
+                    config_dict["num_hidden_layers"] = config_dict.pop(k)
+        return cls.from_dict(config_dict, **kwargs)
+
+    @property
+    def num_attention_heads(self):
+        return 0
+
+    @property
+    def num_lookahead_tokens(self):
+        return self.num_heads
+
+    @num_lookahead_tokens.setter
+    def num_lookahead_tokens(self, num_lookahead_tokens: int):
+        self.num_heads = num_lookahead_tokens
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/mllama.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/mllama.py
@@ -0,0 +1,28 @@
+from transformers.models.mllama import configuration_mllama as mllama_hf_config
+
+
+class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
+    '''
+    Use this class to override is_encoder_decoder:
+    - transformers regards mllama as is_encoder_decoder=False
+    - vllm needs is_encoder_decoder=True to enable cross-attention
+    '''
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.is_encoder_decoder = True
+
+
+class MllamaConfig(mllama_hf_config.MllamaConfig):
+
+    def __init__(
+        self,
+        text_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config = MllamaTextConfig(**text_config)
+        super().__init__(text_config=text_config, **kwargs)
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/mlp_speculator.py
@@ -0,0 +1,65 @@
+from typing import List, Optional
+
+from transformers import PretrainedConfig
+
+
+class MLPSpeculatorConfig(PretrainedConfig):
+    model_type = "mlp_speculator"
+
+    attribute_map = {
+        "hidden_size": "emb_dim",
+    }
+
+    def __init__(self,
+                 vocab_size: int = 32000,
+                 emb_dim: int = 4096,
+                 inner_dim: int = 0,
+                 n_predict: int = 3,
+                 top_k_tokens_per_head: Optional[List[int]] = None,
+                 n_candidates: int = 5,
+                 tie_weights: bool = False,
+                 scale_input: bool = False,
+                 **kwargs):
+        """
+        Initialize an MLPSpeculatorConfig
+
+        Args:
+            vocab_size: int
+                the model vocab size
+            emb_dim: int
+                the model embedding dimension
+            inner_dim: int
+                the inner dimension of the model. If 0, will be the emb_dim.
+            n_predict: int
+                the number of lookaheads for the speculator
+            top_k_tokens_per_head: List[int]
+                Number of tokens to consider from each head when forming the
+                candidate tree.
+                For each candidate branch in the tree, head n produces topk[n]
+                additional sub-branches.
+                NOTE: This parameter is currently unused.
+            n_candidates: int
+                number of child candidates to create per sequence
+            tie_weights: bool
+                If true, use a single set of weights for every model
+                head/stage after the first. The initial projection
+                from the base model may have a different size, so that
+                stays separate.
+            scale_input: bool
+                if True, will scale the initial hidden states from
+                the base model.
+        """
+        if top_k_tokens_per_head is None:
+            top_k_tokens_per_head = [5, 4, 3]
+        assert len(top_k_tokens_per_head) == n_predict
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.inner_dim = inner_dim
+        self.n_predict = n_predict
+        self.top_k_tokens_per_head = top_k_tokens_per_head
+        self.n_candidates = n_candidates
+        self.num_lookahead_tokens = n_predict
+        self.tie_weights = tie_weights
+        self.scale_input = scale_input
+
+        super().__init__(**kwargs)
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/mpt.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/mpt.py
@@ -0,0 +1,177 @@
+# Copied from
+# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
+"""A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Dict, Optional, Union
+
+from transformers import PretrainedConfig
+
+attn_config_defaults: Dict = {
+    'attn_type': 'multihead_attention',
+    'attn_pdrop': 0.0,
+    'attn_impl': 'triton',
+    'qk_ln': False,
+    'clip_qkv': None,
+    'softmax_scale': None,
+    'prefix_lm': False,
+    'attn_uses_sequence_id': False,
+    'alibi': False,
+    'alibi_bias_max': 8
+}
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
+init_config_defaults: Dict = {
+    'name': 'kaiming_normal_',
+    'fan_mode': 'fan_in',
+    'init_nonlinearity': 'relu',
+    'init_div_is_residual': True,
+    'emb_init_std': None,
+    'emb_init_uniform_lim': None,
+    'init_std': None,
+    'init_gain': 0.0
+}
+
+
+class MPTConfig(PretrainedConfig):
+    model_type = 'mpt'
+    attribute_map = {
+        'num_attention_heads': 'n_heads',
+        'hidden_size': 'd_model',
+        'num_hidden_layers': 'n_layers',
+    }
+
+    # pylint: disable=dangerous-default-value
+    def __init__(self,
+                 d_model: int = 2048,
+                 n_heads: int = 16,
+                 n_layers: int = 24,
+                 expansion_ratio: int = 4,
+                 max_seq_len: int = 2048,
+                 vocab_size: int = 50368,
+                 resid_pdrop: float = 0.0,
+                 emb_pdrop: float = 0.0,
+                 learned_pos_emb: bool = True,
+                 attn_config: Dict = attn_config_defaults,
+                 ffn_config: Dict = ffn_config_defaults,
+                 init_device: str = 'cpu',
+                 logit_scale: Optional[Union[float, str]] = None,
+                 no_bias: bool = False,
+                 embedding_fraction: float = 1.0,
+                 norm_type: str = 'low_precision_layernorm',
+                 use_cache: bool = False,
+                 init_config: Dict = init_config_defaults,
+                 fc_type: str = 'torch',
+                 verbose: Optional[int] = None,
+                 **kwargs: Any):
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.ffn_config = ffn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.fc_type = fc_type
+        if verbose is not None:
+            warnings.warn(DeprecationWarning(
+                'verbose argument for MPTConfig is now ignored and '
+                'will be removed. Use python_log_level instead.'),
+                          stacklevel=2)
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False):
+            self.learned_pos_emb = False
+            warnings.warn(
+                f'alibi is turned on, setting `learned_pos_emb` '
+                f'to {self.learned_pos_emb}`',
+                stacklevel=2)
+        super().__init__(**kwargs)
+        self._validate_config()
+
+    def _set_config_defaults(
+            self, config: Dict[str, Any],
+            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
+        for (k, v) in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+
+    def _validate_config(self) -> None:
+        self.attn_config = self._set_config_defaults(self.attn_config,
+                                                     attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config,
+                                                    ffn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config,
+                                                     init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any(
+                prob < 0 or prob > 1 for prob in
+            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
+             ]):
+            raise ValueError(
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
+                "probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(
+                f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
+                'torch', 'triton'
+        ]:
+            raise NotImplementedError(
+                'alibi only implemented with torch and triton attention.')
+        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'attn_uses_sequence_id only implemented with torch '
+                'and triton attention.')
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                'model.embedding_fraction must be between 0 (exclusive) '
+                'and 1 (inclusive)!')
+        if isinstance(self.logit_scale,
+                      str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(
+                f"self.logit_scale={self.logit_scale!r} is not recognized as "
+                "an option; use numeric value or 'inv_sqrt_d_model'.")
+        if self.init_config.get('name', None) is None:
+            raise ValueError(
+                f"self.init_config={self.init_config!r} 'name' needs to be set."
+            )
+        if not self.learned_pos_emb and (not self.attn_config['alibi']):
+            warnings.warn(
+                'Positional information not being provided to the model.',
+                stacklevel=2)
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                # pylint: disable=import-outside-toplevel
+                import transformer_engine.pytorch as te
+                del te
+            except Exception as exc:
+                raise ImportError(
+                    'TransformerEngine import fail. `fc_type: te` requires '
+                    'TransformerEngine be installed. '
+                    'The required version of transformer_engine also requires '
+                    'FlashAttention v1.0.6 is installed:\n'
+                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
+                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
+                ) from exc
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/nemotron.py
@@ -0,0 +1,202 @@
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Nemotron model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronModel`]. It is used to instantiate an Nemotron model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Nemotron-8B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be
+    used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Nemotron model. Defines the number of
+            different tokens that can be represented by the
+            `inputs_ids` passed when calling [`NemotronModel`]
+        hidden_size (`int`, *optional*, defaults to 6144):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        head_dim (`int`, *optional*):
+            Projection weights dimension in multi-head attention. Set to
+            hidden_size // num_attention_heads if None
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention
+            (MQA) otherwise GQA is used. When converting a multi-head
+            checkpoint to a GQA checkpoint, each group key and value
+            head should be constructed by meanpooling all the original
+            heads within that group. For more details checkout 
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
+            is not specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the
+            decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        initializer_range (`float`, *optional*, defaults to 0.0134):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output
+            projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj and down_proj layers in the MLP
+            layers.
+
+    ```python
+    >>> from transformers import NemotronModel, NemotronConfig
+    >>> # Initializing a Nemotron nemotron-15b style configuration
+    >>> configuration = NemotronConfig()
+    >>> # Initializing a model from the nemotron-15b style configuration
+    >>> model = NemotronModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "nemotron"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=6144,
+        intermediate_size=24576,
+        num_hidden_layers=32,
+        num_attention_heads=48,
+        head_dim=None,
+        num_key_value_heads=None,
+        hidden_act="relu2",
+        max_position_embeddings=4096,
+        initializer_range=0.0134,
+        norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=2,
+        eos_token_id=3,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.5,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        head_dim = head_dim or kwargs.get("kv_channels")
+        self.head_dim = head_dim if head_dim is not None else (
+            hidden_size // num_attention_heads)
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # for backward compatibility
+        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
+            "rope_percentage") or partial_rotary_factor
+        self.partial_rotary_factor = partial_rotary_factor
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling,
+                          dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, "
+                f"`type` and `factor`, got {self.rope_scaling}")
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in [
+                "linear", "dynamic"
+        ]:
+            raise ValueError(
+                "`rope_scaling`'s type field must be one of ['linear', "
+                f"'dynamic'], got {rope_scaling_type}")
+        if rope_scaling_factor is None or not isinstance(
+                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(
+                "`rope_scaling`'s factor field must be a float > 1, got "
+                f"{rope_scaling_factor}")
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/nvlm_d.py
@@ -0,0 +1,12 @@
+# Adapted from
+# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
+# --------------------------------------------------------
+# NVLM-D
+# Copyright (c) 2024 NVIDIA
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+from .internvl import InternVLChatConfig
+
+
+class NVLM_D_Config(InternVLChatConfig):
+    model_type = 'NVLM_D'
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/solar.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/solar.py
@@ -0,0 +1,244 @@
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Solar model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class SolarConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store
+    the configuration of a [`SolarModel`].
+    It is used to instantiate an LLaMA model
+    according to the specified arguments,
+    defining the model architecture.
+    Instantiating a configuration with the
+    defaults will yield a similar
+    configuration to that of the LLaMA-7B.
+    Configuration objects inherit from [`PretrainedConfig`]
+    and can be used to control the model outputs.
+    Read the documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the LLaMA model.
+            Defines the number of different tokens
+            that can be represented by the `inputs_ids`
+            passed when calling [`SolarModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer
+            in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that
+            should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`,
+            the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA)
+            otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint,
+            each group key and value head should be constructed
+            by meanpooling all the original heads within that group.
+            For more details checkout [this paper]
+            (https://arxiv.org/pdf/2305.13245.pdf).
+            If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string)
+            in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Solar 1 supports up to 2048 tokens,
+            Solar 2 up to 4096, CodeSolar up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of
+            the truncated_normal_initializer for initializing
+            all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return
+            the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank
+            used during pretraining.
+            Please refer to [this
+            document](https://huggingface.co/docs/
+            transformers/main/
+            perf_train_gpu_many#tensor-parallelism)
+             to understand more about it. This value is
+            necessary to ensure exact reproducibility
+            of the pretraining results.
+            Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for
+            the RoPE embeddings.
+            Currently supports two scaling
+            strategies: linear and dynamic.
+            Their scaling factor must be a float greater than 1.
+            The expected format is
+            `{"type": strategy name, "factor": scaling factor}`.
+            When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum.
+            See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
+            dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking
+            API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value
+            and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj
+            layers in the MLP layers.
+        sliding_window (`int`, *optional*, defaults to 2047):
+            Sliding window attention window size. If not specified,
+            will default to `2047`.
+    ```python
+    >>> from transformers import SolarModel, SolarConfig
+    >>> # Initializing a Solar-pro style configuration
+    >>> configuration = SolarConfig()
+    >>> # Initializing a model from the Solar-pro style configuration
+    >>> model = SolarModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "solar"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        sliding_window=2047,
+        bskcn_1=None,
+        bskcn_2=None,
+        bskcn_3=None,
+        bskcn_4=None,
+        bskcn_tv=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.sliding_window = sliding_window
+        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
+        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
+        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
+        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
+        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if (not isinstance(self.rope_scaling, dict)
+                or len(self.rope_scaling) != 2):
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields,"
+                " `type` and `factor`, "
+                f"got {self.rope_scaling}")
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in [
+                "linear",
+                "dynamic",
+        ]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of "
+                             f"['linear', 'dynamic'], got {rope_scaling_type}")
+        if (rope_scaling_factor is None
+                or not isinstance(rope_scaling_factor, float)
+                or rope_scaling_factor <= 1.0):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float > 1,"
+                f" got {rope_scaling_factor}")
--- a/vllm-v0.6.2/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/configs/ultravox.py
@@ -0,0 +1,99 @@
+# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
+from typing import Any, Dict, Optional
+
+import transformers
+
+
+class UltravoxConfig(transformers.PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
+    Ultravox model according to the specified arguments, defining the model
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+    Args:
+        audio_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom audio config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig`
+            or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        audio_token_index (`int`, *optional*, defaults to 32000):
+            The audio token index to encode the audio prompt.
+        stack_factor (`int`, *optional*, defaults to 8):
+            Audio downsampling factor for the multimodal projector.
+        norm_init (`float`, *optional*, defaults to 0.4):
+            The initialization value for the layer normalization.
+        projector_act (`str`, *optional*, defaults to `"swiglu"`):
+            The activation function used by the multimodal projector.
+        text_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the text model.
+        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the audio model.
+    """
+
+    model_type = "ultravox"
+    is_composition = False
+
+    def __init__(
+        self,
+        audio_config: Optional[Dict[str, Any]] = None,
+        text_config: Optional[Dict[str, Any]] = None,
+        audio_model_id: Optional[str] = None,
+        text_model_id: Optional[str] = None,
+        ignore_index: int = -100,
+        audio_token_index: int = 32000,
+        hidden_size: int = 4096,
+        stack_factor: int = 8,
+        norm_init: float = 0.4,
+        projector_act: str = "swiglu",
+        text_model_lora_config: Optional[Dict[str, Any]] = None,
+        audio_model_lora_config: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+
+        self.audio_model_id = audio_model_id
+        self.text_model_id = text_model_id
+        self.audio_token_index = audio_token_index
+
+        self.hidden_size = hidden_size
+        self.stack_factor = stack_factor
+        self.norm_init = norm_init
+        self.projector_act = projector_act
+
+        if text_model_id is not None:
+            # Avoid circular import
+            from vllm.transformers_utils.config import get_config
+
+            self.text_config = get_config(text_model_id,
+                                          trust_remote_code=False)
+        else:
+            text_config = text_config or {}
+            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
+                "model_type", "llama")](**text_config)
+
+        if audio_model_id is not None:
+            # Avoid circular import
+            from vllm.transformers_utils.config import get_config
+
+            self.audio_config = get_config(audio_model_id,
+                                           trust_remote_code=False)
+        else:
+            audio_config = audio_config or {}
+            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+                "model_type", "whisper")](**audio_config)
+
+        self.text_model_lora_config = text_model_lora_config or {}
+        self.audio_model_lora_config = audio_model_lora_config or {}
+
+        self.vocab_size = self.text_config.vocab_size
+
+        self.initializer_range = self.text_config.initializer_range
+
+        super().__init__(**kwargs)