[gpt-oss] Add gpt-oss bf16 support

2025-08-13 21:25:57 +08:00
parent 5d2e7edf78
commit 17ea2ec6aa
1232 changed files with 777 additions and 36 deletions
--- a/vllm/transformers_utils/init.py
+++ b/vllm/transformers_utils/init.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import envs
+
+if envs.VLLM_USE_MODELSCOPE:
+    try:
+        # Patch here, before each import happens
+        import modelscope
+        from packaging import version
+
+        # patch_hub begins from modelscope>=1.18.1
+        if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
+            raise ImportError(
+                'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
+                'install by `pip install modelscope -U`')
+        from modelscope.utils.hf_util import patch_hub
+
+        # Patch hub to download models from modelscope to speed up.
+        patch_hub()
+    except ImportError as err:
+        raise ImportError(
+            "Please install modelscope>=1.18.1 via "
+            "`pip install modelscope>=1.18.1` to use ModelScope.") from err
--- a/vllm/transformers_utils/chat_templates/init.py
+++ b/vllm/transformers_utils/chat_templates/init.py
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .registry import get_chat_template_fallback_path
+
+__all__ = ["get_chat_template_fallback_path"]
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from pathlib import Path
+from typing import Callable, Optional, Union
+
+from vllm.logger import init_logger
+
+logger = init_logger(__file__)
+
+CHAT_TEMPLATES_DIR = Path(__file__).parent
+
+ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
+
+
+def _get_qwen_chat_template_fallback(
+        tokenizer_name_or_path: str) -> Optional[Path]:
+    if tokenizer_name_or_path.endswith("-Chat"):
+        return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
+
+    return CHAT_TEMPLATES_DIR / "template_basic.jinja"
+
+
+# yapf: disable
+_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
+    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
+    "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
+    "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
+    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
+    "qwen": _get_qwen_chat_template_fallback,
+}
+# yapf: enable
+
+
+def register_chat_template_fallback_path(
+    model_type: str,
+    chat_template: ChatTemplatePath,
+) -> None:
+    if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
+        logger.warning(
+            "Model type %s already has a chat template registered. "
+            "It will be overwritten by the new chat template %s.", model_type,
+            chat_template)
+
+    _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template
+
+
+def get_chat_template_fallback_path(
+    model_type: str,
+    tokenizer_name_or_path: str,
+) -> Optional[Path]:
+    chat_template = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)
+    if callable(chat_template):
+        chat_template = chat_template(tokenizer_name_or_path)
+
+    if chat_template is None:
+        return None
+
+    return chat_template
--- a/vllm/transformers_utils/chat_templates/template_basic.jinja
+++ b/vllm/transformers_utils/chat_templates/template_basic.jinja
@@ -0,0 +1,3 @@
+{%- for message in messages -%}
+    {{- message['content'] -}}
+{%- endfor -%}
--- a/vllm/transformers_utils/chat_templates/template_blip2.jinja
+++ b/vllm/transformers_utils/chat_templates/template_blip2.jinja
@@ -0,0 +1,11 @@
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- 'Question: ' + message['content'] + ' ' -}}
+    {%- elif message['role'] == 'assistant' -%}
+        {{- 'Answer: ' + message['content'] + ' ' -}}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{- 'Answer:' -}}
+{% endif %}
--- a/vllm/transformers_utils/chat_templates/template_chatml.jinja
+++ b/vllm/transformers_utils/chat_templates/template_chatml.jinja
@@ -0,0 +1,10 @@
+{%- for message in messages -%}
+    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
+    {%- if (loop.last and add_generation_prompt) or not loop.last -%}
+        {{- '<|im_end|>' + '\n' -}}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
+    {{- '<|im_start|>assistant\n' -}}
+{%- endif -%}
--- a/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja
+++ b/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja
@@ -0,0 +1,23 @@
+{%- if messages[0]['role'] == 'system' -%}
+    {%- set system_message = messages[0]['content'] -%}
+    {%- set messages = messages[1:] -%}
+{%- else -%}
+    {% set system_message = '' -%}
+{%- endif -%}
+
+{{ bos_token + system_message }}
+{%- for message in messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif -%}
+
+    {%- if message['role'] == 'user' -%}
+        {{ '<|User|>: ' + message['content'] + '\n\n' }}
+    {%- elif message['role'] == 'assistant' -%}
+        {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{ '<|Assistant|>: ' }}
+{%- endif -%}
--- a/vllm/transformers_utils/chat_templates/template_fuyu.jinja
+++ b/vllm/transformers_utils/chat_templates/template_fuyu.jinja
@@ -0,0 +1,3 @@
+{%- for message in messages -%}
+    {{- message['content'] + '\n' -}}
+{%- endfor -%}
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -0,0 +1,887 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import enum
+import json
+import os
+import time
+from functools import cache, partial
+from pathlib import Path
+from typing import Any, Callable, Literal, Optional, TypeVar, Union
+
+import huggingface_hub
+from huggingface_hub import get_safetensors_metadata, hf_hub_download
+from huggingface_hub import list_repo_files as hf_list_repo_files
+from huggingface_hub import try_to_load_from_cache
+from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
+                                   HFValidationError, LocalEntryNotFoundError,
+                                   RepositoryNotFoundError,
+                                   RevisionNotFoundError)
+from torch import nn
+from transformers import GenerationConfig, PretrainedConfig
+from transformers.models.auto.image_processing_auto import (
+    get_image_processor_config)
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
+from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
+
+from vllm import envs
+from vllm.logger import init_logger
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
+                                             DbrxConfig, DeepseekVLV2Config,
+                                             EAGLEConfig, ExaoneConfig,
+                                             H2OVLChatConfig,
+                                             InternVLChatConfig, JAISConfig,
+                                             KimiVLConfig, MedusaConfig,
+                                             MiniMaxText01Config,
+                                             MiniMaxVL01Config, MllamaConfig,
+                                             MLPSpeculatorConfig, MPTConfig,
+                                             NemotronConfig, NVLM_D_Config,
+                                             OvisConfig, RWConfig,
+                                             SkyworkR1VChatConfig, SolarConfig,
+                                             Telechat2Config, UltravoxConfig)
+# yapf: enable
+from vllm.transformers_utils.utils import check_gguf_file
+from vllm.utils import resolve_obj_by_qualname
+
+if envs.VLLM_USE_MODELSCOPE:
+    from modelscope import AutoConfig
+else:
+    from transformers import AutoConfig
+
+MISTRAL_CONFIG_NAME = "params.json"
+
+logger = init_logger(__name__)
+
+_CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = {
+    "mllama": MllamaConfig
+}
+
+_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
+    "chatglm": ChatGLMConfig,
+    "cohere2": Cohere2Config,
+    "dbrx": DbrxConfig,
+    "deepseek_vl_v2": DeepseekVLV2Config,
+    "kimi_vl": KimiVLConfig,
+    "mpt": MPTConfig,
+    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
+    "jais": JAISConfig,
+    "mlp_speculator": MLPSpeculatorConfig,
+    "medusa": MedusaConfig,
+    "eagle": EAGLEConfig,
+    "exaone": ExaoneConfig,
+    "h2ovl_chat": H2OVLChatConfig,
+    "internvl_chat": InternVLChatConfig,
+    "minimax_text_01": MiniMaxText01Config,
+    "minimax_vl_01": MiniMaxVL01Config,
+    "nemotron": NemotronConfig,
+    "NVLM_D": NVLM_D_Config,
+    "ovis": OvisConfig,
+    "solar": SolarConfig,
+    "skywork_chat": SkyworkR1VChatConfig,
+    "telechat": Telechat2Config,
+    "ultravox": UltravoxConfig,
+    **_CONFIG_REGISTRY_OVERRIDE_HF
+}
+
+
+class ConfigFormat(str, enum.Enum):
+    AUTO = "auto"
+    HF = "hf"
+    MISTRAL = "mistral"
+
+
+_R = TypeVar("_R")
+
+
+def with_retry(
+    func: Callable[[], _R],
+    log_msg: str,
+    max_retries: int = 2,
+    retry_delay: int = 2,
+) -> _R:
+    for attempt in range(max_retries):
+        try:
+            return func()
+        except Exception as e:
+            if attempt == max_retries - 1:
+                logger.error("%s: %s", log_msg, e)
+                raise
+            logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1,
+                         max_retries)
+            time.sleep(retry_delay)
+            retry_delay *= 2
+
+    raise AssertionError("Should not be reached")
+
+
+# @cache doesn't cache exceptions
+@cache
+def list_repo_files(
+    repo_id: str,
+    *,
+    revision: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> list[str]:
+
+    def lookup_files() -> list[str]:
+        # directly list files if model is local
+        if (local_path := Path(repo_id)).exists():
+            return [
+                str(file.relative_to(local_path))
+                for file in local_path.rglob('*') if file.is_file()
+            ]
+        # if model is remote, use hf_hub api to list files
+        try:
+            if envs.VLLM_USE_MODELSCOPE:
+                from vllm.transformers_utils.utils import (
+                    modelscope_list_repo_files)
+                return modelscope_list_repo_files(repo_id,
+                                                  revision=revision,
+                                                  token=os.getenv(
+                                                      "MODELSCOPE_API_TOKEN",
+                                                      None))
+            return hf_list_repo_files(repo_id,
+                                      revision=revision,
+                                      repo_type=repo_type,
+                                      token=token)
+        except huggingface_hub.errors.OfflineModeIsEnabled:
+            # Don't raise in offline mode,
+            # all we know is that we don't have this
+            # file cached.
+            return []
+
+    return with_retry(lookup_files, "Error retrieving file list")
+
+
+def file_exists(
+    repo_id: str,
+    file_name: str,
+    *,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> bool:
+    file_list = list_repo_files(repo_id,
+                                repo_type=repo_type,
+                                revision=revision,
+                                token=token)
+    return file_name in file_list
+
+
+# In offline mode the result can be a false negative
+def file_or_path_exists(model: Union[str, Path], config_name: str,
+                        revision: Optional[str]) -> bool:
+    if (local_path := Path(model)).exists():
+        return (local_path / config_name).is_file()
+
+    # Offline mode support: Check if config file is cached already
+    cached_filepath = try_to_load_from_cache(repo_id=model,
+                                             filename=config_name,
+                                             revision=revision)
+    if isinstance(cached_filepath, str):
+        # The config file exists in cache- we can continue trying to load
+        return True
+
+    # NB: file_exists will only check for the existence of the config file on
+    # hf_hub. This will fail in offline mode.
+
+    # Call HF to check if the file exists
+    return file_exists(str(model),
+                       config_name,
+                       revision=revision,
+                       token=os.getenv('HF_TOKEN', None))
+
+
+def patch_rope_scaling(config: PretrainedConfig) -> None:
+    """Provide backwards compatibility for RoPE."""
+    text_config = getattr(config, "text_config", None)
+    if text_config is not None:
+        patch_rope_scaling(text_config)
+
+    rope_scaling = getattr(config, "rope_scaling", None)
+    if rope_scaling is not None:
+        patch_rope_scaling_dict(rope_scaling)
+
+
+def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
+    if "rope_type" in rope_scaling and "type" in rope_scaling:
+        rope_type = rope_scaling["rope_type"]
+        rope_type_legacy = rope_scaling["type"]
+        if rope_type != rope_type_legacy:
+            raise ValueError(
+                f"Found conflicts between 'rope_type={rope_type}' (modern "
+                f"field) and 'type={rope_type_legacy}' (legacy field). "
+                "You should only specify one of them.")
+
+    if "rope_type" not in rope_scaling and "type" in rope_scaling:
+        rope_scaling["rope_type"] = rope_scaling["type"]
+        logger.info("Replacing legacy 'type' key with 'rope_type'")
+
+    if "rope_type" not in rope_scaling:
+        raise ValueError("rope_scaling should have a 'rope_type' key")
+
+    if rope_scaling["rope_type"] == "su":
+        rope_scaling["rope_type"] = "longrope"
+        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
+    elif rope_scaling["rope_type"] == "mrope":
+        assert "mrope_section" in rope_scaling
+        rope_scaling["rope_type"] = "default"
+        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
+
+
+def _uses_mrope(config: PretrainedConfig) -> bool:
+    rope_scaling = getattr(config, "rope_scaling", None)
+    if rope_scaling is None:
+        return False
+
+    return "mrope_section" in rope_scaling
+
+
+def uses_mrope(config: PretrainedConfig) -> bool:
+    """Detect if the model with this config uses M-ROPE."""
+    return _uses_mrope(config) or thinker_uses_mrope(config)
+
+
+def thinker_uses_mrope(config: PretrainedConfig) -> bool:
+    """Detect if the model contains a thinker config and it uses M-ROPE."""
+    thinker_config = getattr(config, "thinker_config", None)
+    if thinker_config is None:
+        return False
+
+    thinker_text_config = getattr(thinker_config, "text_config", None)
+    if thinker_text_config is None:
+        return False
+
+    return uses_mrope(thinker_text_config)
+
+
+def is_encoder_decoder(config: PretrainedConfig) -> bool:
+    """Detect if the model with this config is used as an encoder/decoder."""
+    text_config = getattr(config, "text_config", None)
+    if text_config is not None:
+        return is_encoder_decoder(text_config)
+
+    return getattr(config, "is_encoder_decoder", False)
+
+
+def get_config(
+    model: Union[str, Path],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+    code_revision: Optional[str] = None,
+    config_format: ConfigFormat = ConfigFormat.AUTO,
+    **kwargs,
+) -> PretrainedConfig:
+    # Separate model folder from file path for GGUF models
+
+    is_gguf = check_gguf_file(model)
+    if is_gguf:
+        kwargs["gguf_file"] = Path(model).name
+        model = Path(model).parent
+
+    if config_format == ConfigFormat.AUTO:
+        try:
+            if is_gguf or file_or_path_exists(
+                    model, HF_CONFIG_NAME, revision=revision):
+                config_format = ConfigFormat.HF
+            elif file_or_path_exists(model,
+                                     MISTRAL_CONFIG_NAME,
+                                     revision=revision):
+                config_format = ConfigFormat.MISTRAL
+            else:
+                raise ValueError(
+                    "Could not detect config format for no config file found. "
+                    "Ensure your model has either config.json (HF format) "
+                    "or params.json (Mistral format).")
+
+        except Exception as e:
+            error_message = (
+                "Invalid repository ID or local directory specified:"
+                " '{model}'.\nPlease verify the following requirements:\n"
+                "1. Provide a valid Hugging Face repository ID.\n"
+                "2. Specify a local directory that contains a recognized "
+                "configuration file.\n"
+                "   - For Hugging Face models: ensure the presence of a "
+                "'config.json'.\n"
+                "   - For Mistral models: ensure the presence of a "
+                "'params.json'.\n"
+                "3. For GGUF: pass the local path of the GGUF checkpoint.\n"
+                "   Loading GGUF from a remote repo directly is not yet "
+                "supported.\n").format(model=model)
+
+            raise ValueError(error_message) from e
+
+    if config_format == ConfigFormat.HF:
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model,
+            revision=revision,
+            code_revision=code_revision,
+            token=os.getenv('HF_TOKEN', None),
+            **kwargs,
+        )
+
+        # Use custom model class if it's in our registry
+        model_type = config_dict.get("model_type")
+        if model_type in _CONFIG_REGISTRY:
+            config_class = _CONFIG_REGISTRY[model_type]
+            config = config_class.from_pretrained(
+                model,
+                revision=revision,
+                code_revision=code_revision,
+                token=os.getenv('HF_TOKEN', None),
+                **kwargs,
+            )
+        else:
+            try:
+                config = AutoConfig.from_pretrained(
+                    model,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    code_revision=code_revision,
+                    token=os.getenv('HF_TOKEN', None),
+                    **kwargs,
+                )
+            except ValueError as e:
+                if (not trust_remote_code
+                        and "requires you to execute the configuration file"
+                        in str(e)):
+                    err_msg = (
+                        "Failed to load the model config. If the model "
+                        "is a custom model not yet available in the "
+                        "HuggingFace transformers library, consider setting "
+                        "`trust_remote_code=True` in LLM or using the "
+                        "`--trust-remote-code` flag in the CLI.")
+                    raise RuntimeError(err_msg) from e
+                else:
+                    raise e
+
+    elif config_format == ConfigFormat.MISTRAL:
+        config = load_params_config(model, revision, **kwargs)
+    else:
+        supported_formats = [
+            fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO
+        ]
+        raise ValueError(
+            f"Unsupported config format: {config_format}. "
+            f"Supported formats are: {', '.join(supported_formats)}. "
+            f"Ensure your model uses one of these configuration formats "
+            f"or specify the correct format explicitly.")
+
+    # Special architecture mapping check for GGUF models
+    if is_gguf:
+        if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
+            raise RuntimeError(
+                f"Can't get gguf config for {config.model_type}.")
+        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
+        config.update({"architectures": [model_type]})
+
+    patch_rope_scaling(config)
+
+    if trust_remote_code:
+        maybe_register_config_serialize_by_value()
+
+    return config
+
+
+def try_get_local_file(model: Union[str, Path],
+                       file_name: str,
+                       revision: Optional[str] = 'main') -> Optional[Path]:
+    file_path = Path(model) / file_name
+    if file_path.is_file():
+        return file_path
+    else:
+        try:
+            cached_filepath = try_to_load_from_cache(repo_id=model,
+                                                     filename=file_name,
+                                                     revision=revision)
+            if isinstance(cached_filepath, str):
+                return Path(cached_filepath)
+        except HFValidationError:
+            ...
+    return None
+
+
+def get_hf_file_to_dict(file_name: str,
+                        model: Union[str, Path],
+                        revision: Optional[str] = 'main'):
+    """
+    Downloads a file from the Hugging Face Hub and returns
+    its contents as a dictionary.
+
+    Parameters:
+    - file_name (str): The name of the file to download.
+    - model (str): The name of the model on the Hugging Face Hub.
+    - revision (str): The specific version of the model.
+
+    Returns:
+    - config_dict (dict): A dictionary containing
+    the contents of the downloaded file.
+    """
+
+    file_path = try_get_local_file(model=model,
+                                   file_name=file_name,
+                                   revision=revision)
+
+    if file_path is None:
+        try:
+            hf_hub_file = hf_hub_download(model, file_name, revision=revision)
+        except huggingface_hub.errors.OfflineModeIsEnabled:
+            return None
+        except (RepositoryNotFoundError, RevisionNotFoundError,
+                EntryNotFoundError, LocalEntryNotFoundError) as e:
+            logger.debug("File or repository not found in hf_hub_download", e)
+            return None
+        except HfHubHTTPError as e:
+            logger.warning(
+                "Cannot connect to Hugging Face Hub. Skipping file "
+                "download for '%s':",
+                file_name,
+                exc_info=e)
+            return None
+        file_path = Path(hf_hub_file)
+
+    if file_path is not None and file_path.is_file():
+        with open(file_path) as file:
+            return json.load(file)
+
+    return None
+
+
+@cache
+def get_pooling_config(model: str, revision: Optional[str] = 'main'):
+    """
+    This function gets the pooling and normalize
+    config from the model - only applies to
+    sentence-transformers models.
+
+    Args:
+        model (str): The name of the Hugging Face model.
+        revision (str, optional): The specific version
+        of the model to use. Defaults to 'main'.
+
+    Returns:
+        dict: A dictionary containing the pooling
+        type and whether normalization is used.
+    """
+
+    modules_file_name = "modules.json"
+
+    modules_dict = None
+    if file_or_path_exists(model=model,
+                           config_name=modules_file_name,
+                           revision=revision):
+        modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
+
+    if modules_dict is None:
+        return None
+
+    logger.info("Found sentence-transformers modules configuration.")
+
+    pooling = next((item for item in modules_dict
+                    if item["type"] == "sentence_transformers.models.Pooling"),
+                   None)
+    normalize = bool(
+        next((item for item in modules_dict
+              if item["type"] == "sentence_transformers.models.Normalize"),
+             False))
+
+    if pooling:
+
+        pooling_file_name = "{}/config.json".format(pooling["path"])
+        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
+        pooling_type_name = next(
+            (item for item, val in pooling_dict.items() if val is True), None)
+
+        if pooling_type_name is not None:
+            pooling_type_name = get_pooling_config_name(pooling_type_name)
+
+        logger.info("Found pooling configuration.")
+        return {"pooling_type": pooling_type_name, "normalize": normalize}
+
+    return None
+
+
+def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
+    if "pooling_mode_" in pooling_name:
+        pooling_name = pooling_name.replace("pooling_mode_", "")
+
+    if "_" in pooling_name:
+        pooling_name = pooling_name.split("_")[0]
+
+    if "lasttoken" in pooling_name:
+        pooling_name = "last"
+
+    supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
+    pooling_type_name = pooling_name.upper()
+
+    try:
+        if pooling_type_name in supported_pooling_types:
+            return pooling_type_name
+    except NotImplementedError as e:
+        logger.debug("Pooling type not supported", e)
+        return None
+    return None
+
+
+@cache
+def get_sentence_transformer_tokenizer_config(model: str,
+                                              revision: Optional[str] = 'main'
+                                              ):
+    """
+    Returns the tokenization configuration dictionary for a
+    given Sentence Transformer BERT model.
+
+    Parameters:
+    - model (str): The name of the Sentence Transformer
+    BERT model.
+    - revision (str, optional): The revision of the m
+    odel to use. Defaults to 'main'.
+
+    Returns:
+    - dict: A dictionary containing the configuration parameters
+    for the Sentence Transformer BERT model.
+    """
+    sentence_transformer_config_files = [
+        "sentence_bert_config.json",
+        "sentence_roberta_config.json",
+        "sentence_distilbert_config.json",
+        "sentence_camembert_config.json",
+        "sentence_albert_config.json",
+        "sentence_xlm-roberta_config.json",
+        "sentence_xlnet_config.json",
+    ]
+    encoder_dict = None
+
+    for config_file in sentence_transformer_config_files:
+        if try_get_local_file(model=model,
+                              file_name=config_file,
+                              revision=revision) is not None:
+            encoder_dict = get_hf_file_to_dict(config_file, model, revision)
+            if encoder_dict:
+                break
+
+    if not encoder_dict and not model.startswith("/"):
+        try:
+            # If model is on HuggingfaceHub, get the repo files
+            repo_files = list_repo_files(model,
+                                         revision=revision,
+                                         token=os.getenv('HF_TOKEN', None))
+        except Exception:
+            repo_files = []
+
+        for config_name in sentence_transformer_config_files:
+            if config_name in repo_files:
+                encoder_dict = get_hf_file_to_dict(config_name, model,
+                                                   revision)
+                if encoder_dict:
+                    break
+
+    if not encoder_dict:
+        return None
+
+    logger.info("Found sentence-transformers tokenize configuration.")
+
+    if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")):
+        return encoder_dict
+    return None
+
+
+def maybe_register_config_serialize_by_value() -> None:
+    """Try to register HF model configuration class to serialize by value
+
+        If trust_remote_code is set, and the model's config file specifies an
+        `AutoConfig` class, then the config class is typically an instance of
+        a custom class imported from the HF modules cache.
+
+        Examples:
+
+        >>> from transformers import AutoConfig
+        >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
+        >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
+        >>> import transformers_modules # error, not initialized
+        >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
+        >>> import transformers_modules # success, initialized
+        >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
+
+        In the DeepSeek example, the config class is an instance of a custom
+        class that is not serializable by default. This class will not be
+        importable in spawned workers, and won't exist at all on
+        other nodes, which breaks serialization of the config.
+
+        In this function we tell the cloudpickle serialization library to pass
+        instances of these generated classes by value instead of by reference,
+        i.e. the class definition is serialized along with its data so that the
+        class module does not need to be importable on the receiving end.
+
+        See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
+    """ # noqa
+    try:
+        import transformers_modules
+    except ImportError:
+        # the config does not need trust_remote_code
+        return
+
+    try:
+        import cloudpickle
+        cloudpickle.register_pickle_by_value(transformers_modules)
+
+        # ray vendors its own version of cloudpickle
+        from vllm.executor.ray_utils import ray
+        if ray:
+            ray.cloudpickle.register_pickle_by_value(transformers_modules)
+
+        # multiprocessing uses pickle to serialize arguments when using spawn
+        # Here we get pickle to use cloudpickle to serialize config objects
+        # that contain instances of the custom config class to avoid
+        # serialization problems if the generated module (and model) has a `.`
+        # in its name
+        import multiprocessing
+        import pickle
+
+        from vllm.config import VllmConfig
+
+        def _reduce_config(config: VllmConfig):
+            return (pickle.loads, (cloudpickle.dumps(config), ))
+
+        multiprocessing.reducer.register(VllmConfig, _reduce_config)
+
+    except Exception as e:
+        logger.warning(
+            "Unable to register remote classes used by"
+            " trust_remote_code with by-value serialization. This may"
+            " lead to a later error. If remote code is not needed"
+            " remove `--trust-remote-code`",
+            exc_info=e)
+
+
+def load_params_config(model: Union[str, Path], revision: Optional[str],
+                       **kwargs) -> PretrainedConfig:
+    # This function loads a params.json config which
+    # should be used when loading models in mistral format
+
+    config_file_name = "params.json"
+
+    config_dict = get_hf_file_to_dict(config_file_name, model, revision)
+    if config_dict is None:
+        raise ValueError(
+            f"Failed to load mistral '{config_file_name}' config for model "
+            f"{model}. Please check if the model is a mistral-format model "
+            f"and if the config file exists.")
+    assert isinstance(config_dict, dict)
+
+    config_mapping = {
+        "dim": "hidden_size",
+        "norm_eps": "rms_norm_eps",
+        "n_kv_heads": "num_key_value_heads",
+        "n_layers": "num_hidden_layers",
+        "n_heads": "num_attention_heads",
+        "hidden_dim": "intermediate_size",
+    }
+
+    def recurse_elems(elem: Any):
+        if isinstance(elem, dict):
+            config_dict = {}
+            for key, value in elem.items():
+                key = config_mapping.get(key, key)
+                config_dict[key] = recurse_elems(value)
+
+            return config_dict
+        else:
+            return elem
+
+    config_dict["model_type"] = config_dict.get("model_type", "transformer")
+    config_dict["hidden_act"] = config_dict.get("activation", "silu")
+    config_dict["tie_word_embeddings"] = config_dict.get(
+        "tie_embeddings", False)
+
+    if config_dict.get("max_position_embeddings") is None:
+        max_position_embeddings = 128_000
+        try:
+            trust_remote_code_val = kwargs.get("trust_remote_code", False)
+            hf_config = get_config(model=model,
+                                   trust_remote_code=trust_remote_code_val,
+                                   revision=revision,
+                                   config_format=ConfigFormat.HF)
+            if hf_value := hf_config.get_text_config().max_position_embeddings:
+                max_position_embeddings = hf_value
+        except Exception as e:
+            logger.warning(
+                "The params.json file is missing 'max_position_embeddings'"
+                " and could not get a value from the HF config."
+                " Defaulting to 128000",
+                exc_info=e)
+        config_dict["max_position_embeddings"] = max_position_embeddings
+
+    if config_dict.get("quantization") is not None:
+        quantization = config_dict.get("quantization", {})
+        if quantization.get("qformat_weight") == "fp8_e4m3":
+            # This maps to the FP8 static per-tensor quantization scheme
+            quantization_config = {
+                "quant_method": "fp8",
+                "activation_scheme": "static"
+            }
+        elif quantization.get("quant_method") == "compressed-tensors":
+            # Pass through the quantization config to compressed-tensors
+            quantization_config = quantization
+        else:
+            raise ValueError(
+                f"Found unknown quantization='{quantization}' in config")
+
+        config_dict["quantization_config"] = quantization_config
+
+    config_type: Literal["text",
+                         "multimodal"] = "multimodal" if config_dict.get(
+                             "vision_encoder") is not None else "text"
+
+    if config_dict.get("moe") is not None:
+        config_dict["architectures"] = ["MixtralForCausalLM"]
+    else:
+        config_dict["architectures"] = ["MistralForCausalLM"]
+
+    if config_type == "multimodal":
+        multimodal_config = config_dict.pop("vision_encoder")
+        quantization_config = config_dict.get("quantization_config", {})
+
+        config_dict = {
+            "text_config": config_dict,
+            "vision_config": multimodal_config
+        }
+        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
+        config_dict["model_type"] = "pixtral"
+        if quantization_config:
+            config_dict["quantization_config"] = quantization_config
+
+    config_dict.update(kwargs)
+
+    config_dict = recurse_elems(config_dict)
+
+    # transform to HF config format
+    if config_type == "multimodal":
+        config_dict["text_config"] = PretrainedConfig(
+            **config_dict["text_config"])
+        config_dict["vision_config"] = PretrainedConfig(
+            **config_dict["vision_config"])
+
+    return PretrainedConfig(**config_dict)
+
+
+def get_hf_image_processor_config(
+    model: Union[str, Path],
+    hf_token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    **kwargs,
+) -> dict[str, Any]:
+    # ModelScope does not provide an interface for image_processor
+    if envs.VLLM_USE_MODELSCOPE:
+        return dict()
+    # Separate model folder from file path for GGUF models
+    if check_gguf_file(model):
+        model = Path(model).parent
+    return get_image_processor_config(model,
+                                      token=hf_token,
+                                      revision=revision,
+                                      **kwargs)
+
+
+def get_hf_text_config(config: PretrainedConfig):
+    """Get the "sub" config relevant to llm for multi modal models.
+    No op for pure text models.
+    """
+    # This block should be unnecessary after https://github.com/huggingface/transformers/pull/37517
+    if hasattr(config, "thinker_config"):
+        # TODO(suyang.fy): Refactor code.
+        #  For Qwen2.5-Omni, change hf_text_config to
+        #  thinker_config.text_config.
+        return config.thinker_config.text_config
+
+    text_config = config.get_text_config()
+
+    if text_config is not config:
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(text_config, "num_attention_heads")
+
+    return text_config
+
+
+def try_get_generation_config(
+    model: str,
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[GenerationConfig]:
+    try:
+        return GenerationConfig.from_pretrained(
+            model,
+            revision=revision,
+        )
+    except OSError:  # Not found
+        try:
+            config = get_config(
+                model,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+            )
+            return GenerationConfig.from_model_config(config)
+        except OSError:  # Not found
+            return None
+
+
+def get_cross_encoder_activation_function(config: PretrainedConfig):
+
+    function_name: Optional[str] = None
+    if hasattr(config, "sentence_transformers") and "activation_fn" in \
+        config.sentence_transformers:
+        function_name = config.sentence_transformers["activation_fn"]
+
+    elif (hasattr(config, "sbert_ce_default_activation_function")
+          and config.sbert_ce_default_activation_function is not None):
+        function_name = config.sbert_ce_default_activation_function
+
+    if function_name is not None:
+        assert function_name.startswith("torch.nn.modules."), \
+            "Loading of activation functions is restricted to " \
+            "torch.nn.modules for security reasons"
+        return resolve_obj_by_qualname(function_name)()
+    else:
+        return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
+
+
+def try_get_safetensors_metadata(
+    model: str,
+    *,
+    revision: Optional[str] = None,
+):
+    get_safetensors_metadata_partial = partial(
+        get_safetensors_metadata,
+        model,
+        revision=revision,
+        token=os.getenv('HF_TOKEN', None),
+    )
+
+    try:
+        return with_retry(get_safetensors_metadata_partial,
+                          "Error retrieving safetensors")
+    except Exception:
+        return None
+
+
+def try_get_tokenizer_config(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[dict[str, Any]]:
+    try:
+        return get_tokenizer_config(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+    except Exception:
+        return None
--- a/vllm/transformers_utils/configs/init.py
+++ b/vllm/transformers_utils/configs/init.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
+from vllm.transformers_utils.configs.cohere2 import Cohere2Config
+from vllm.transformers_utils.configs.dbrx import DbrxConfig
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
+from vllm.transformers_utils.configs.eagle import EAGLEConfig
+from vllm.transformers_utils.configs.exaone import ExaoneConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from vllm.transformers_utils.configs.falcon import RWConfig
+from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig
+from vllm.transformers_utils.configs.internvl import InternVLChatConfig
+from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
+from vllm.transformers_utils.configs.medusa import MedusaConfig
+from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config
+from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config
+from vllm.transformers_utils.configs.mllama import MllamaConfig
+from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
+from vllm.transformers_utils.configs.moonvit import MoonViTConfig
+from vllm.transformers_utils.configs.mpt import MPTConfig
+from vllm.transformers_utils.configs.nemotron import NemotronConfig
+from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
+from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.ovis import OvisConfig
+from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig
+from vllm.transformers_utils.configs.solar import SolarConfig
+from vllm.transformers_utils.configs.telechat2 import Telechat2Config
+from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+
+__all__ = [
+    "ChatGLMConfig",
+    "Cohere2Config",
+    "DbrxConfig",
+    "DeepseekVLV2Config",
+    "MPTConfig",
+    "RWConfig",
+    "H2OVLChatConfig",
+    "InternVLChatConfig",
+    "JAISConfig",
+    "MedusaConfig",
+    "EAGLEConfig",
+    "ExaoneConfig",
+    "MiniMaxText01Config",
+    "MiniMaxVL01Config",
+    "MllamaConfig",
+    "MLPSpeculatorConfig",
+    "MoonViTConfig",
+    "KimiVLConfig",
+    "NemotronConfig",
+    "NemotronHConfig",
+    "NVLM_D_Config",
+    "OvisConfig",
+    "SkyworkR1VChatConfig",
+    "SolarConfig",
+    "Telechat2Config",
+    "UltravoxConfig",
+]
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
+""" Arctic model configuration"""
+
+from dataclasses import asdict, dataclass
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
+}
+
+
+@dataclass
+class ArcticLoRAConfig:
+    lora_r: int = 64
+    lora_alpha: float = 16
+    shard_base_weights: bool = False
+
+
+@dataclass
+class ArcticQuantizationConfig:
+    q_bits: int = 8
+    rounding: str = "nearest"
+    mantissa_bits: int = 3
+    group_size: int = 128
+
+
+class ArcticConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
+    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ArcticModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
+            allows sequence of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to root per-token, can be also interpreted as the `top-p` routing
+            parameter
+        num_local_experts (`int`, *optional*, defaults to 8):
+            Number of experts per Sparse MLP layer.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import ArcticModel, ArcticConfig
+
+    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
+    >>> configuration = ArcticConfig()
+
+    >>> # Initializing a model from the Arctic 7B style configuration
+    >>> model = ArcticModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "arctic"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=1,
+        num_local_experts=8,
+        router_aux_loss_coef=0.001,
+        moe_layer_frequency=2,
+        parallel_attn_mlp_res=False,
+        moe_train_capacity_factor=1,
+        moe_eval_capacity_factor=1,
+        enable_expert_tensor_parallelism=False,
+        moe_min_capacity=0,
+        moe_token_dropping=True,
+        quantization=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.moe_layer_frequency = moe_layer_frequency
+        self.moe_train_capacity_factor = moe_train_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_token_dropping = moe_token_dropping
+        self.parallel_attn_mlp_res = parallel_attn_mlp_res
+        if isinstance(quantization, dict):
+            self.quantization = ArcticQuantizationConfig(**quantization)
+        else:
+            self.quantization = quantization
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
+        result = super().from_dict(config_dict, **kwargs)
+        config = result[0] if isinstance(result, tuple) else result
+        if isinstance(config.quantization, dict):
+            config.quantization = ArcticQuantizationConfig(**config.quantization)
+        return result
+
+    def to_dict(self) -> dict[str, Any]:
+        ret = super().to_dict()
+        if isinstance(ret["quantization"], ArcticQuantizationConfig):
+            ret["quantization"] = asdict(ret["quantization"])
+        return ret
--- a/vllm/transformers_utils/configs/chatglm.py
+++ b/vllm/transformers_utils/configs/chatglm.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(self,
+                 num_layers=28,
+                 padded_vocab_size=65024,
+                 hidden_size=4096,
+                 ffn_hidden_size=13696,
+                 kv_channels=128,
+                 num_attention_heads=32,
+                 seq_length=2048,
+                 hidden_dropout=0.0,
+                 attention_dropout=0.0,
+                 layernorm_epsilon=1e-5,
+                 rmsnorm=True,
+                 apply_residual_connection_post_layernorm=False,
+                 post_layer_norm=True,
+                 add_bias_linear=False,
+                 add_qkv_bias=False,
+                 interleaved_qkv=False,
+                 bias_dropout_fusion=True,
+                 multi_query_attention=False,
+                 multi_query_group_num=1,
+                 apply_query_key_layer_scaling=True,
+                 attention_softmax_in_fp32=True,
+                 fp32_residual_connection=False,
+                 quantization_bit=0,
+                 pre_seq_len=None,
+                 prefix_projection=False,
+                 **kwargs):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        # It is to be compatible with long lora.
+        self.max_position_embeddings = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/cohere2.py
+++ b/vllm/transformers_utils/configs/cohere2.py
@@ -0,0 +1,195 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+class Cohere2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere
+    model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CohereModel`]
+        hidden_size (`int`, *optional*, defaults to 8192):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22528):
+            Dimension of the MLP representations.
+        logit_scale (`float`, *optional*, defaults to 0.0625):
+            The scaling factor for the output logits.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 5):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 255001):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window attention context.
+        sliding_window_pattern (`int`, *optional*, defaults to 4):
+            Pattern for the sliding window attention.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
+
+    ```python
+    >>> from transformers import Cohere2Model, Cohere2Config
+
+    >>> # Initializing a Cohere Nextmodel configuration
+    >>> configuration = Cohere2Config()
+
+    >>> # Initializing a model from the Cohere2 configuration
+    >>> model = Cohere2Model(configuration) # doctest: +SKIP
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config # doctest: +SKIP
+    ```
+    """
+
+    model_type = "cohere2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=8192,
+        intermediate_size=22528,
+        logit_scale=0.0625,
+        num_hidden_layers=40,
+        num_attention_heads=64,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=5,
+        eos_token_id=255001,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        sliding_window=4096,
+        sliding_window_pattern=4,
+        cache_implementation="hybrid",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.logit_scale = logit_scale
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.sliding_window_pattern = sliding_window_pattern
+        # Need to specify head_dim in the config so it can be used in the attention forward functions
+        self.head_dim = hidden_size // num_attention_heads
+        self.cache_implementation = cache_implementation
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["Cohere2Config"]
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -0,0 +1,280 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
+"""Dbrx configuration."""
+
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    [`DbrxAttention`] class. It is used to instantiate attention layers
+    according to the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to None):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (float): The base frequency for rope.
+    """
+
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,
+        kv_n_heads: int = 1,
+        rope_theta: float = 10000.0,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["attn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all configurations of "
+                "models and can yield errors.",
+                config_dict["model_type"], cls.model_type)
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    """Configuration class for Dbrx FFN.
+
+    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+    the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+            The dict should have a key 'name' with the value being the name of
+            the activation function along with any additional keyword arguments.
+        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+            This should only be used for benchmarking purposes.
+    """
+
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        if ffn_act_fn is None:
+            ffn_act_fn = {"name": "silu"}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["ffn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all "
+                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+    """Configuration class for Dbrx.
+
+    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 6144):
+            Dimensionality of the embeddings and hidden states.
+        n_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        max_seq_len (`int`, *optional*, defaults to 32768):
+            The maximum sequence length of the model.
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`DbrxModel`].
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied to the attention output before combining with residual.
+        emb_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the embedding layer.
+        attn_config (`dict`, *optional*):
+            A dictionary used to configure the model's attention module.
+        ffn_config (`dict`, *optional*):
+            A dictionary used to configure the model's FFN module.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+
+    Example:
+    ```python
+    >>> from transformers import DbrxConfig, DbrxModel
+
+    >>> # Initializing a Dbrx configuration
+    >>> configuration = DbrxConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DbrxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dbrx"
+    attribute_map = {
+        "num_attention_heads": "n_heads",
+        "hidden_size": "d_model",
+        "num_hidden_layers": "n_layers",
+        "max_position_embeddings": "max_seq_len",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+        if tie_word_embeddings:
+            raise ValueError(
+                "tie_word_embeddings is not supported for Dbrx models."
+            )
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "vit_so400m_patch14_siglip_384.webli"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(self,
+                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
+                 image_size: int = 384,
+                 patch_size: int = 16,
+                 width: int = 1024,
+                 layers: int = 24,
+                 heads: int = 16,
+                 mlp_ratio: int = 4,
+                 global_pool: str = "map",
+                 ignore_head: bool = True,
+                 class_token: bool = False,
+                 num_classes: int = 0,
+                 use_checkpoint: bool = False,
+                 **kwargs):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(self,
+                 projector_type: str = "downsample_mlp_gelu",
+                 input_dim: int = 1152,
+                 n_embed: int = 2048,
+                 depth: int = 2,
+                 mlp_ratio: int = 1,
+                 downsample_ratio: int = 2,
+                 **kwargs):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method='gready',
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func='softmax',
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVLV2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    vision_config: VisionEncoderConfig
+    projector_config: MlpProjectorConfig
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )
+
+    def __init__(self,
+                 tile_tag: str = "tile_tag",
+                 global_view_pos: str = "head",
+                 candidate_resolutions: tuple[tuple[int,
+                                                    int]] = ((384, 384), ),
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        self.text_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.vocab_size = self.text_config.vocab_size
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from typing import Optional, Union
+
+from transformers import AutoConfig, PretrainedConfig
+
+import vllm.envs as envs
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+
+
+class EAGLEConfig(PretrainedConfig):
+    model_type = "eagle"
+
+    def __init__(self,
+                 model: Union[PretrainedConfig, dict, None] = None,
+                 truncated_vocab_size: Optional[int] = None,
+                 method: Optional[str] = 'eagle',
+                 **kwargs):
+
+        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
+        if isinstance(model, dict):
+            archs = model.get("architectures", [])
+            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
+            if any(target_arch in archs for target_arch in target_archs):
+                # AutoConfig does not support DeepSeek MoE models yet
+                model_config = DeepseekV2Config(**model)
+            else:
+                model_config = AutoConfig.for_model(**model)
+        else:
+            model_config = model
+
+        for k, v in kwargs.items():
+            if k != "architectures" and k != "model_type" and hasattr(
+                    model_config, k):
+                setattr(model_config, k, v)
+
+        self.model = model_config
+
+        if self.model is None:
+            self.truncated_vocab_size = None
+        else:
+            self.truncated_vocab_size = self.model.vocab_size if \
+                truncated_vocab_size is None else truncated_vocab_size
+
+        if not envs.VLLM_USE_V1:
+            kwargs["architectures"] = ["EAGLEModel"]
+        else:
+            # Eagle model name should follow naming convention of
+            # LlamaForCausalLM -> EagleLlamaForCausalLM
+            if method == "eagle":
+                assert self.model is not None, \
+                    "model should not be None when method is eagle"
+                kwargs["architectures"] = [
+                    f"Eagle{arch}" if not arch.startswith("Eagle") \
+                        else arch for arch in self.model.architectures
+                ]
+            elif method == "eagle3":
+                assert self.model is not None, \
+                    "model should not be None when method is eagle3"
+                kwargs["architectures"] = [
+                    f"Eagle3{arch}" if not arch.startswith("Eagle3") \
+                        else arch for arch in self.model.architectures
+                ]
+            else:
+                raise ValueError(f"Invalid method {method}. \
+                    Supported methods are eagle and eagle3.")
+
+        super().__init__(**kwargs)
+
+        if self.model is not None:
+            for k, v in self.model.to_dict().items():
+                if k not in kwargs:
+                    setattr(self, k, v)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "EAGLEConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)
+        return cls.from_dict(config_dict, **kwargs)
--- a/vllm/transformers_utils/configs/exaone.py
+++ b/vllm/transformers_utils/configs/exaone.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copied from
+# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
+# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Exaone model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {}
+
+
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:
+    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Exaone
+
+    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
+    and can be used to control the model outputs. Read the documentation from :
+    class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size ({obj}`int`, `optional`, defaults to 50257):
+            Vocabulary size of the GPT Lingvo model. Defines the number of
+            different tokens that can be represented by the {obj}`inputs_ids`
+            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
+            size of the model.
+            Defines the different tokens that can be represented by the
+            `inputs_ids` passed to the forward method of :class:
+            `~transformers.EXAONEModel`.
+        hidden_size ({obj}`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers ({obj}`int`, `optional`, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi
+            Head Attention (MHA), if `num_key_value_heads=1 the model will use
+            Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint,
+            each group key and value head should be constructed by meanpooling
+            all the original heads within that group. For more details checkout
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
+            specified, will default to `num_attention_heads`.
+        rotary_pct (`float`, *optional*, defaults to 0.25):
+            percentage of hidden dimensions to allocate to rotary embeddings
+        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
+            the Transformer encoder.
+        activation_function ({obj}`str` or {obj}`function`, `optional`,
+        defaults to {obj}`"gelu_new"`):
+            The non-linear activation function (function or string) in the
+            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
+            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
+        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
+            The dropout probabilitiy for all fully connected layers in the
+            embeddings, encoder, and pooler.
+        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
+            The vocabulary size of the {obj}`token_type_ids` passed when calling
+            {class}`~transformers.EXAONEModel`.
+        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+            Only relevant if ``config.is_decoder=True``.
+        gradient_checkpointing ({obj}`bool`, `optional`,
+        defaults to {obj}`False`):
+            If True, use gradient checkpointing to save memory at the expense
+            of slower backward pass.
+        Example::
+
+            >>> from transformers import ExoneModel, ExaoneConfig
+
+            >>> # Initializing a EXAONE configuration
+            >>> configuration = ExaoneConfig()
+
+            >>> # Initializing a model from configuration
+            >>> model = ExoneModel(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
+    """
+
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rotary_pct=0.25,
+        resid_dropout=0.0,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs,
+    ):
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.resid_dropout = resid_dropout
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rotary_pct = rotary_pct
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.use_logit_cap = kwargs.pop("use_logit_cap", False)
+        self.ln_no_scale = kwargs.pop("ln_no_scale", False)
+        self.use_gated = kwargs.pop("use_gated", False)
+        self.use_emb_norm = kwargs.pop("use_emb_norm", False)
+        self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
+        self.rotary_type = kwargs.pop("rotary_type", None)
+        self.scaling_factor = kwargs.pop("scaling_factor", 1)
+        self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
+        self.use_extra_logit = kwargs.pop("use_extra_logit", True)
+        self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
+        self.rotary_base = kwargs.pop("rotary_base", 10000.0)
+        self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
+        self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
+                                                 (rotary_pct == 0.25))
+        if self.use_rotary_pos:
+            self.use_absolute_pos = False
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+from transformers.configuration_utils import PretrainedConfig
+
+
+class RWConfig(PretrainedConfig):
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "num_kv_heads": "n_head_kv",
+    }
+
+    def __init__(
+        self,
+        vocab_size=250880,
+        hidden_size=64,
+        n_layer=2,
+        n_head=8,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        multi_query=True,
+        n_head_kv=None,
+        alibi=False,
+        bias=False,
+        parallel_attn=False,
+        new_decoder_architecture=False,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.multi_query = multi_query
+        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
+        self.alibi = alibi
+        self.bias = bias
+        self.parallel_attn = parallel_attn
+        self.new_decoder_architecture = new_decoder_architecture
+
+        if self.hidden_size == 8192:
+            # Hack for falcon-40b
+            self.new_decoder_architecture = True
+
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
+
+    @property
+    def head_dim(self):
+        return self.hidden_size // self.n_head
+
+    @property
+    def rotary(self):
+        return not self.alibi
--- a/vllm/transformers_utils/configs/h2ovl.py
+++ b/vllm/transformers_utils/configs/h2ovl.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+
+from .internvl import InternVLChatConfig
+
+
+class H2OVLChatConfig(InternVLChatConfig):
+    model_type = "h2ovl_chat"
--- a/vllm/transformers_utils/configs/internvl.py
+++ b/vllm/transformers_utils/configs/internvl.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2024 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from transformers.configuration_utils import PretrainedConfig
+
+
+class InternVLChatConfig(PretrainedConfig):
+    model_type = 'internvl_chat'
+    is_composition = True
+
+    def __init__(self,
+                 vision_config=None,
+                 llm_config=None,
+                 use_backbone_lora=0,
+                 use_llm_lora=0,
+                 select_layer=-1,
+                 force_image_size=None,
+                 downsample_ratio=0.5,
+                 template=None,
+                 dynamic_image_size=False,
+                 use_thumbnail=False,
+                 ps_version='v1',
+                 min_dynamic_patch=1,
+                 max_dynamic_patch=6,
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+
+        if llm_config is None:
+            llm_config = {}
+
+        self.vision_config = PretrainedConfig(**vision_config)
+        self.text_config = PretrainedConfig(**llm_config)
+
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.ps_version = ps_version  # pixel shuffle version
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2023 Cerebras Systems.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""JAIS configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class JAISConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a
+    [`JAISModel`]. It is used to instantiate a JAIS model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used
+    to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the JAIS model. Defines the number of different
+            tokens that can be represented by the
+            `inputs_ids` passed when calling [`JAISModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used
+            with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set
+            it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list
+            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in
+            the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
+            defaults to `False`):
+            Whether to additionally scale attention weights by
+            `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention
+            (dot-product)
+            and upcast attention dot-product/softmax to float() when training
+            with mixed precision.
+        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
+            Positional embedding can be either `"alibi"` or `"learned"`.
+        mup_width_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale learning rate and initializers. Calculated
+            as (`d_model,0 / d_model`), where
+            `d_model` is the model's width and `d_model,0` is the proxy
+            model's width.
+        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale token and position embeddings.
+        mup_output_alpha (`float`, *optional*, defaults to 1.0):
+            muP parameter to scale output logits
+            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
+        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
+            Scale attention weights by dividing by hidden_size instead of
+            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
+            well.
+        alibi_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for ALiBi
+            embeddings. Currently only supports linear
+            scaling strategy. Can specify either the scaling `factor` (must be
+            a float greater than 1) for fixed scaling
+            or `train_seq_len` for dynamic scaling on input samples with
+            sequence length > `train_seq_len`. The expected
+            formats are `{"type": strategy name, "factor": scaling factor}` or
+            `{"type": strategy name,
+            "train_seq_len": training sequence length}`.
+        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
+            architecture names for Jais.
+
+    Example:
+
+    ```python
+    >>> from transformers import JAISConfig, JAISModel
+
+    >>> # Initializing a JAIS configuration
+    >>> configuration = JAISConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = JAISModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "jais"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        position_embedding_type="learned",
+        mup_width_scale=1.0,
+        mup_embeddings_scale=1.0,
+        mup_output_alpha=1.0,
+        mup_scale_qk_dot_by_d=False,
+        alibi_scaling=None,
+        architectures=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        self.position_embedding_type = position_embedding_type
+        self.mup_width_scale = mup_width_scale
+        self.mup_embeddings_scale = mup_embeddings_scale
+        self.mup_output_alpha = mup_output_alpha
+        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
+
+        self.alibi_scaling = alibi_scaling
+        self._alibi_scaling_validation()
+        if architectures is None:
+            architectures = ["JAISLMHeadModel"]
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            architectures=architectures,
+            **kwargs,
+        )
+
+    def _alibi_scaling_validation(self):
+        """
+        Validate the `alibi_scaling` configuration.
+        """
+        if self.alibi_scaling is None:
+            return
+
+        if (not isinstance(self.alibi_scaling, dict)
+                or len(self.alibi_scaling) != 2):
+            raise ValueError(
+                "`alibi_scaling` must be a dictionary with two fields, "
+                "`type` and `factor` or `type` and `train_seq_len`, "
+                f"got {self.alibi_scaling}")
+        alibi_scaling_type = self.alibi_scaling.get("type", None)
+        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
+        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
+        if alibi_scaling_type is None or alibi_scaling_type != "linear":
+            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
+                             f"got {alibi_scaling_type}")
+        if (alibi_scaling_factor is not None
+                and not isinstance(alibi_scaling_factor, float)
+                or (alibi_scaling_factor is not None
+                    and alibi_scaling_factor <= 1.0)):
+            raise ValueError(
+                f"`alibi_scaling`'s factor field must be a float > 1.0, "
+                f"got {alibi_scaling_factor}")
+        if (alibi_dynamic_scaling is not None
+                and not isinstance(alibi_dynamic_scaling, int)
+                or (alibi_dynamic_scaling is not None
+                    and alibi_dynamic_scaling <= 1)):
+            raise ValueError(
+                f"`alibi_scaling`'s `train_seq_len` field must be an "
+                f"integer > 1, got {alibi_dynamic_scaling}")
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+from vllm.transformers_utils.configs.moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(self,
+                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+                 ignore_index: int = -100,
+                 media_placeholder_token_id: int = 163605,
+                 pad_token_id: int = 0,
+                 **kwargs):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class MedusaConfig(PretrainedConfig):
+    model_type = "medusa"
+
+    def __init__(self,
+                 hidden_size: int = 4096,
+                 vocab_size: int = 32001,
+                 num_heads: int = 5,
+                 num_hidden_layers: int = 1,
+                 max_paths: int = 64,
+                 topk: int = 10,
+                 truncated_vocab_size: Optional[int] = None,
+                 **kwargs):
+
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.num_heads = num_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.max_paths = max_paths
+        self.topk = topk
+        self.max_seq_len = int(2**20)
+        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
+            else truncated_vocab_size
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["MedusaModel"]
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "MedusaConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs)
+        for k in list(config_dict.keys()):
+            if 'num' in k:
+                if 'heads' in k:
+                    config_dict["num_heads"] = config_dict.pop(k)
+                elif 'layers' in k:
+                    config_dict["num_hidden_layers"] = config_dict.pop(k)
+        return cls.from_dict(config_dict, **kwargs)
+
+    @property
+    def num_attention_heads(self):
+        return 0
+
+    @property
+    def num_lookahead_tokens(self):
+        return self.num_heads
+
+    @num_lookahead_tokens.setter
+    def num_lookahead_tokens(self, num_lookahead_tokens: int):
+        self.num_heads = num_lookahead_tokens
--- a/vllm/transformers_utils/configs/minimax_text_01.py
+++ b/vllm/transformers_utils/configs/minimax_text_01.py
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+""" MiniMaxText01 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MiniMaxText01Config(PretrainedConfig):
+    model_type = "MiniMaxText01"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=8,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/configs/minimax_vl_01.py
+++ b/vllm/transformers_utils/configs/minimax_vl_01.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""MiniMaxVL01 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.auto import CONFIG_MAPPING
+
+from .minimax_text_01 import MiniMaxText01Config
+
+
+class MiniMaxVL01Config(PretrainedConfig):
+    model_type = "minimax_vl_01"
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        image_grid_pinpoints=None,
+        tie_word_embeddings=False,
+        image_seq_length=576,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.image_seq_length = image_seq_length
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError("vision_feature_select_strategy should " +
+                             "be one of 'default', 'full'." +
+                             f"Got: {vision_feature_select_strategy}")
+
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        image_grid_pinpoints = (
+            image_grid_pinpoints if image_grid_pinpoints is not None else
+            [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]])
+        self.image_grid_pinpoints = image_grid_pinpoints
+
+        if isinstance(vision_config, dict):
+            if "model_type" not in vision_config:
+                vision_config["model_type"] = "clip_vision_model"
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](
+                **vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+
+        self.vision_config = vision_config
+
+        if text_config is not None:
+            text_config = MiniMaxText01Config(**text_config)
+        else:
+            text_config = MiniMaxText01Config()
+
+        self.text_config = text_config
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/vllm/transformers_utils/configs/mllama.py
+++ b/vllm/transformers_utils/configs/mllama.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.models.mllama import configuration_mllama as mllama_hf_config
+
+
+class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
+    '''
+    Use this class to override is_encoder_decoder:
+    - transformers regards mllama as is_encoder_decoder=False
+    - vllm needs is_encoder_decoder=True to enable cross-attention
+    '''
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.is_encoder_decoder = True
+
+
+class MllamaConfig(mllama_hf_config.MllamaConfig):
+
+    def __init__(
+        self,
+        text_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config = MllamaTextConfig(**text_config)
+        super().__init__(text_config=text_config, **kwargs)
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+from transformers import PretrainedConfig
+
+
+class MLPSpeculatorConfig(PretrainedConfig):
+    model_type = "mlp_speculator"
+
+    attribute_map = {
+        "hidden_size": "emb_dim",
+    }
+
+    def __init__(self,
+                 vocab_size: int = 32000,
+                 emb_dim: int = 4096,
+                 inner_dim: int = 0,
+                 n_predict: int = 3,
+                 top_k_tokens_per_head: Optional[list[int]] = None,
+                 n_candidates: int = 5,
+                 tie_weights: bool = False,
+                 scale_input: bool = False,
+                 **kwargs):
+        """
+        Initialize an MLPSpeculatorConfig
+
+        Args:
+            vocab_size: int
+                the model vocab size
+            emb_dim: int
+                the model embedding dimension
+            inner_dim: int
+                the inner dimension of the model. If 0, will be the emb_dim.
+            n_predict: int
+                the number of lookaheads for the speculator
+            top_k_tokens_per_head: list[int]
+                Number of tokens to consider from each head when forming the
+                candidate tree.
+                For each candidate branch in the tree, head n produces topk[n]
+                additional sub-branches.
+                NOTE: This parameter is currently unused.
+            n_candidates: int
+                number of child candidates to create per sequence
+            tie_weights: bool
+                If true, use a single set of weights for every model
+                head/stage after the first. The initial projection
+                from the base model may have a different size, so that
+                stays separate.
+            scale_input: bool
+                if True, will scale the initial hidden states from
+                the base model.
+        """
+        if top_k_tokens_per_head is None:
+            top_k_tokens_per_head = [5, 4, 3]
+        assert len(top_k_tokens_per_head) == n_predict
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.inner_dim = inner_dim
+        self.n_predict = n_predict
+        self.top_k_tokens_per_head = top_k_tokens_per_head
+        self.n_candidates = n_candidates
+        self.num_lookahead_tokens = n_predict
+        self.tie_weights = tie_weights
+        self.scale_input = scale_input
+
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/moonvit.py
+++ b/vllm/transformers_utils/configs/moonvit.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+            self,
+            patch_size: int = 14,
+            init_pos_emb_height: int = 64,
+            init_pos_emb_width: int = 64,
+            num_attention_heads: int = 16,
+            num_hidden_layers: int = 27,
+            hidden_size: int = 1152,
+            intermediate_size: int = 4304,
+            merge_kernel_size: tuple[int, int] = (2, 2),
+            **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copied from
+# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
+"""A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Optional, Union
+
+from transformers import PretrainedConfig
+
+attn_config_defaults: dict = {
+    'attn_type': 'multihead_attention',
+    'attn_pdrop': 0.0,
+    'attn_impl': 'triton',
+    'qk_ln': False,
+    'clip_qkv': None,
+    'softmax_scale': None,
+    'prefix_lm': False,
+    'attn_uses_sequence_id': False,
+    'alibi': False,
+    'alibi_bias_max': 8
+}
+ffn_config_defaults: dict = {'ffn_type': 'mptmlp'}
+init_config_defaults: dict = {
+    'name': 'kaiming_normal_',
+    'fan_mode': 'fan_in',
+    'init_nonlinearity': 'relu',
+    'init_div_is_residual': True,
+    'emb_init_std': None,
+    'emb_init_uniform_lim': None,
+    'init_std': None,
+    'init_gain': 0.0
+}
+
+
+class MPTConfig(PretrainedConfig):
+    model_type = 'mpt'
+    attribute_map = {
+        'num_attention_heads': 'n_heads',
+        'hidden_size': 'd_model',
+        'num_hidden_layers': 'n_layers',
+    }
+
+    # pylint: disable=dangerous-default-value
+    def __init__(self,
+                 d_model: int = 2048,
+                 n_heads: int = 16,
+                 n_layers: int = 24,
+                 expansion_ratio: int = 4,
+                 max_seq_len: int = 2048,
+                 vocab_size: int = 50368,
+                 resid_pdrop: float = 0.0,
+                 emb_pdrop: float = 0.0,
+                 learned_pos_emb: bool = True,
+                 attn_config: dict = attn_config_defaults,
+                 ffn_config: dict = ffn_config_defaults,
+                 init_device: str = 'cpu',
+                 logit_scale: Optional[Union[float, str]] = None,
+                 no_bias: bool = False,
+                 embedding_fraction: float = 1.0,
+                 norm_type: str = 'low_precision_layernorm',
+                 use_cache: bool = False,
+                 init_config: dict = init_config_defaults,
+                 fc_type: str = 'torch',
+                 verbose: Optional[int] = None,
+                 **kwargs: Any):
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.ffn_config = ffn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.fc_type = fc_type
+        if verbose is not None:
+            warnings.warn(DeprecationWarning(
+                'verbose argument for MPTConfig is now ignored and '
+                'will be removed. Use python_log_level instead.'),
+                          stacklevel=2)
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False):
+            self.learned_pos_emb = False
+            warnings.warn(
+                f'alibi is turned on, setting `learned_pos_emb` '
+                f'to {self.learned_pos_emb}`',
+                stacklevel=2)
+        super().__init__(**kwargs)
+        self._validate_config()
+
+    def _set_config_defaults(
+            self, config: dict[str, Any],
+            config_defaults: dict[str, Any]) -> dict[str, Any]:
+        for (k, v) in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+
+    def _validate_config(self) -> None:
+        self.attn_config = self._set_config_defaults(self.attn_config,
+                                                     attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config,
+                                                    ffn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config,
+                                                     init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any(
+                prob < 0 or prob > 1 for prob in
+            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
+             ]):
+            raise ValueError(
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
+                "probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(
+                f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
+                'torch', 'triton'
+        ]:
+            raise NotImplementedError(
+                'alibi only implemented with torch and triton attention.')
+        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'attn_uses_sequence_id only implemented with torch '
+                'and triton attention.')
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                'model.embedding_fraction must be between 0 (exclusive) '
+                'and 1 (inclusive)!')
+        if isinstance(self.logit_scale,
+                      str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(
+                f"self.logit_scale={self.logit_scale!r} is not recognized as "
+                "an option; use numeric value or 'inv_sqrt_d_model'.")
+        if self.init_config.get('name', None) is None:
+            raise ValueError(
+                f"self.init_config={self.init_config!r} 'name' needs to be set."
+            )
+        if not self.learned_pos_emb and (not self.attn_config['alibi']):
+            warnings.warn(
+                'Positional information not being provided to the model.',
+                stacklevel=2)
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                # pylint: disable=import-outside-toplevel
+                import transformer_engine.pytorch as te
+                del te
+            except Exception as exc:
+                raise ImportError(
+                    'TransformerEngine import fail. `fc_type: te` requires '
+                    'TransformerEngine be installed. '
+                    'The required version of transformer_engine also requires '
+                    'FlashAttention v1.0.6 is installed:\n'
+                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
+                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
+                ) from exc
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Nemotron model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronModel`]. It is used to instantiate an Nemotron model
+    according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Nemotron-8B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be
+    used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Nemotron model. Defines the number of
+            different tokens that can be represented by the
+            `inputs_ids` passed when calling [`NemotronModel`]
+        hidden_size (`int`, *optional*, defaults to 6144):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 24576):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
+        head_dim (`int`, *optional*):
+            Projection weights dimension in multi-head attention. Set to
+            hidden_size // num_attention_heads if None
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention
+            (MQA) otherwise GQA is used. When converting a multi-head
+            checkpoint to a GQA checkpoint, each group key and value
+            head should be constructed by meanpooling all the original
+            heads within that group. For more details checkout 
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
+            is not specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the
+            decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        initializer_range (`float`, *optional*, defaults to 0.0134):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output
+            projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj and down_proj layers in the MLP
+            layers.
+
+    ```python
+    >>> from transformers import NemotronModel, NemotronConfig
+    >>> # Initializing a Nemotron nemotron-15b style configuration
+    >>> configuration = NemotronConfig()
+    >>> # Initializing a model from the nemotron-15b style configuration
+    >>> model = NemotronModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "nemotron"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=6144,
+        intermediate_size=24576,
+        num_hidden_layers=32,
+        num_attention_heads=48,
+        head_dim=None,
+        num_key_value_heads=None,
+        hidden_act="relu2",
+        max_position_embeddings=4096,
+        initializer_range=0.0134,
+        norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=2,
+        eos_token_id=3,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.5,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        head_dim = head_dim or kwargs.get("kv_channels")
+        self.head_dim = head_dim if head_dim is not None else (
+            hidden_size // num_attention_heads)
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # for backward compatibility
+        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
+            "rope_percentage") or partial_rotary_factor
+        self.partial_rotary_factor = partial_rotary_factor
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(
+                self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, "
+                f"`type` and `factor`, got {self.rope_scaling}")
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in [
+                "linear", "dynamic"
+        ]:
+            raise ValueError(
+                "`rope_scaling`'s type field must be one of ['linear', "
+                f"'dynamic'], got {rope_scaling_type}")
+        if rope_scaling_factor is None or not isinstance(
+                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(
+                "`rope_scaling`'s factor field must be a float > 1, got "
+                f"{rope_scaling_factor}")
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -0,0 +1,258 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has a output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        attention_head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  #  Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
+            "hybrid_override_pattern must have same length as "
+            "num_hidden_layers")
+        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
+            "hybrid_override_pattern must only contain characters "
+            "'M', '*', or '-'")
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return [
+            "mamba" if self.hybrid_override_pattern[i] == "M" else
+            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
+            for i in range(self.num_hidden_layers)
+        ]
--- a/vllm/transformers_utils/configs/nvlm_d.py
+++ b/vllm/transformers_utils/configs/nvlm_d.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
+# --------------------------------------------------------
+# NVLM-D
+# Copyright (c) 2024 NVIDIA
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+from .internvl import InternVLChatConfig
+
+
+class NVLM_D_Config(InternVLChatConfig):
+    model_type = 'NVLM_D'
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# copied from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
+# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
+from typing import Any, Optional, Union
+
+from transformers import AutoConfig, PretrainedConfig
+
+
+class AIMv2Config(PretrainedConfig):
+    """This is the configuration class to store the configuration of an [`AIMv2Model`].
+
+    Instantiating a configuration with the defaults will yield a similar configuration
+    to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
+
+    Args:
+        hidden_size: Dimension of the hidden representations.
+        intermediate_size: Dimension of the SwiGLU representations.
+        num_hidden_layers: Number of hidden layers in the Transformer.
+        num_attention_heads: Number of attention heads for each attention layer
+            in the Transformer.
+        num_channels: Number of input channels.
+        image_size: Image size.
+        patch_size: Patch size.
+        rms_norm_eps: Epsilon value used for the RMS normalization layer.
+        attention_dropout: Dropout ratio for attention probabilities.
+        projection_dropout: Dropout ratio for the projection layer after the attention.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        use_bias: Whether to add a bias in the feed-forward and projection layers.
+        kwargs: Keyword arguments for the [`PretrainedConfig`].
+    """
+
+    model_type: str = "aimv2"
+
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        intermediate_size: int = 2816,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 8,
+        num_channels: int = 3,
+        image_size: int = 224,
+        patch_size: int = 14,
+        rms_norm_eps: float = 1e-5,
+        attention_dropout: float = 0.0,
+        projection_dropout: float = 0.0,
+        qkv_bias: bool = False,
+        use_bias: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.rms_norm_eps = rms_norm_eps
+
+        self.projection_dropout = projection_dropout
+        self.qkv_bias = qkv_bias
+        self.use_bias = use_bias
+
+
+IGNORE_ID = -100
+IMAGE_TOKEN_ID = -200
+IMAGE_TOKEN = "<image>"
+IMAGE_ATOM_ID = -300
+IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305]
+
+AutoConfig.register("aimv2", AIMv2Config, exist_ok=True)
+
+
+# ----------------------------------------------------------------------
+#                     Visual Tokenizer Configuration
+# ----------------------------------------------------------------------
+class BaseVisualTokenizerConfig(PretrainedConfig):
+
+    def __init__(self,
+                 vocab_size=16384,
+                 tokenize_function="softmax",
+                 tau=1.0,
+                 depths=None,
+                 drop_cls_token=False,
+                 backbone_config: Optional[Union[PretrainedConfig,
+                                                 dict]] = None,
+                 hidden_stride: int = 1,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.tokenize_function = tokenize_function
+        self.tau = tau
+        if isinstance(depths, str):
+            depths = [int(x) for x in depths.split('|')]
+        self.depths = depths
+        self.backbone_kwargs = dict[str, Any]()
+        self.drop_cls_token = drop_cls_token
+        if backbone_config is not None:
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
+                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            if not isinstance(backbone_config, PretrainedConfig):
+                model_type = backbone_config['model_type']
+                backbone_config.pop('model_type')
+                backbone_config = AutoConfig.for_model(model_type,
+                                                       **backbone_config)
+        self.backbone_config = backbone_config
+        self.hidden_stride = hidden_stride
+
+
+class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
+    model_type = "aimv2_visual_tokenizer"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.drop_cls_token:
+            self.drop_cls_token = False
+        if self.depths:
+            assert len(self.depths) == 1
+            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+
+
+class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
+    model_type = "siglip_visual_tokenizer"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.drop_cls_token:
+            self.drop_cls_token = False
+        if self.depths:
+            assert len(self.depths) == 1
+            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+
+
+AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
+AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
+
+
+# ----------------------------------------------------------------------
+#                           Ovis Configuration
+# ----------------------------------------------------------------------
+class OvisConfig(PretrainedConfig):
+    model_type = "ovis"
+
+    def __init__(self,
+                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
+                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
+                                                         dict]] = None,
+                 multimodal_max_length=8192,
+                 hidden_size=None,
+                 conversation_formatter_class=None,
+                 llm_attn_implementation=None,
+                 disable_tie_weight=False,
+                 **kwargs):
+        super().__init__(**kwargs)
+        if llm_config is not None:
+            assert isinstance(llm_config, (PretrainedConfig, dict)), \
+                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            if not isinstance(llm_config, PretrainedConfig):
+                model_type = llm_config['model_type']
+                llm_config.pop('model_type')
+                llm_config = AutoConfig.for_model(model_type, **llm_config)
+
+        # map llm_config to text_config
+        self.text_config = llm_config
+        if visual_tokenizer_config is not None:
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
+                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            if not isinstance(visual_tokenizer_config, PretrainedConfig):
+                model_type = visual_tokenizer_config['model_type']
+                visual_tokenizer_config.pop('model_type')
+                visual_tokenizer_config = AutoConfig.for_model(
+                    model_type, **visual_tokenizer_config)
+
+        self.visual_tokenizer_config = visual_tokenizer_config
+        self.multimodal_max_length = multimodal_max_length
+        self.hidden_size = hidden_size
+        self.conversation_formatter_class = conversation_formatter_class
+        self.llm_attn_implementation = llm_attn_implementation
+        self.disable_tie_weight = disable_tie_weight
--- a/vllm/transformers_utils/configs/skyworkr1v.py
+++ b/vllm/transformers_utils/configs/skyworkr1v.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py
+# --------------------------------------------------------
+# SkyworkR1V
+# Copyright (c) 2025 Skywork
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from transformers.configuration_utils import PretrainedConfig
+
+
+class SkyworkR1VChatConfig(PretrainedConfig):
+    model_type = 'internvl_chat'
+    is_composition = True
+
+    def __init__(self,
+                 vision_config=None,
+                 llm_config=None,
+                 use_backbone_lora=0,
+                 use_llm_lora=0,
+                 select_layer=-1,
+                 force_image_size=None,
+                 downsample_ratio=0.5,
+                 template=None,
+                 dynamic_image_size=False,
+                 use_thumbnail=False,
+                 ps_version='v1',
+                 min_dynamic_patch=1,
+                 max_dynamic_patch=6,
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+
+        if llm_config is None:
+            llm_config = {}
+
+        self.vision_config = PretrainedConfig(**vision_config)
+        self.text_config = PretrainedConfig(**llm_config)
+
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.ps_version = ps_version  # pixel shuffle version
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
--- a/vllm/transformers_utils/configs/solar.py
+++ b/vllm/transformers_utils/configs/solar.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Solar model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class SolarConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store
+    the configuration of a [`SolarModel`].
+    It is used to instantiate an LLaMA model
+    according to the specified arguments,
+    defining the model architecture.
+    Instantiating a configuration with the
+    defaults will yield a similar
+    configuration to that of the LLaMA-7B.
+    Configuration objects inherit from [`PretrainedConfig`]
+    and can be used to control the model outputs.
+    Read the documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the LLaMA model.
+            Defines the number of different tokens
+            that can be represented by the `inputs_ids`
+            passed when calling [`SolarModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer
+            in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that
+            should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`,
+            the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA)
+            otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint,
+            each group key and value head should be constructed
+            by meanpooling all the original heads within that group.
+            For more details checkout [this paper]
+            (https://arxiv.org/pdf/2305.13245.pdf).
+            If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string)
+            in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+            Solar 1 supports up to 2048 tokens,
+            Solar 2 up to 4096, CodeSolar up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of
+            the truncated_normal_initializer for initializing
+            all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return
+            the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank
+            used during pretraining.
+            Please refer to [this
+            document](https://huggingface.co/docs/
+            transformers/main/
+            perf_train_gpu_many#tensor-parallelism)
+             to understand more about it. This value is
+            necessary to ensure exact reproducibility
+            of the pretraining results.
+            Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for
+            the RoPE embeddings.
+            Currently supports two scaling
+            strategies: linear and dynamic.
+            Their scaling factor must be a float greater than 1.
+            The expected format is
+            `{"type": strategy name, "factor": scaling factor}`.
+            When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum.
+            See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
+            dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking
+            API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value
+            and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj
+            layers in the MLP layers.
+        sliding_window (`int`, *optional*, defaults to 2047):
+            Sliding window attention window size. If not specified,
+            will default to `2047`.
+    ```python
+    >>> from transformers import SolarModel, SolarConfig
+    >>> # Initializing a Solar-pro style configuration
+    >>> configuration = SolarConfig()
+    >>> # Initializing a model from the Solar-pro style configuration
+    >>> model = SolarModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "solar"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        sliding_window=2047,
+        bskcn_1=None,
+        bskcn_2=None,
+        bskcn_3=None,
+        bskcn_4=None,
+        bskcn_tv=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.sliding_window = sliding_window
+        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
+        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
+        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
+        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
+        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if (not isinstance(self.rope_scaling, dict)
+                or len(self.rope_scaling) != 2):
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields,"
+                " `type` and `factor`, "
+                f"got {self.rope_scaling}")
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in [
+                "linear",
+                "dynamic",
+        ]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of "
+                             f"['linear', 'dynamic'], got {rope_scaling_type}")
+        if (rope_scaling_factor is None
+                or not isinstance(rope_scaling_factor, float)
+                or rope_scaling_factor <= 1.0):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float > 1,"
+                f" got {rope_scaling_factor}")
--- a/vllm/transformers_utils/configs/telechat2.py
+++ b/vllm/transformers_utils/configs/telechat2.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py
+""" Telechat configuration compatible with LlamaConfig. """
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Telechat2Config(PretrainedConfig):
+
+    model_type = "telechat"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "intermediate_size": "ffn_hidden_size",
+        "rms_norm_eps": "layer_norm_epsilon"
+    }
+
+    def __init__(
+        self,
+        vocab_size=160256,
+        hidden_size=4096,
+        n_layer=30,
+        n_head=32,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        apply_residual_connection_post_layernorm=False,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        ffn_hidden_size=12288,
+        training_seqlen=8192,
+        logn=True,
+        embed_layernorm=False,
+        hidden_act="silu",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.logn = logn
+        self.training_seqlen = training_seqlen
+        self.embed_layernorm = embed_layernorm
+        self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)
+        self.ffn_hidden_size = ffn_hidden_size
+        self.hidden_act = hidden_act
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
+from typing import Any, Optional
+
+import transformers
+
+
+class UltravoxConfig(transformers.PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
+    Ultravox model according to the specified arguments, defining the model
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+    Args:
+        audio_config (`Union[AutoConfig, dict]`,  *optional*):
+            Custom audio config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig`
+            or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        audio_token_index (`int`, *optional*, defaults to 32000):
+            The audio token index to encode the audio prompt.
+        stack_factor (`int`, *optional*, defaults to 8):
+            Audio downsampling factor for the multimodal projector.
+        norm_init (`float`, *optional*, defaults to 0.4):
+            The initialization value for the layer normalization.
+        projector_act (`str`, *optional*, defaults to `"swiglu"`):
+            The activation function used by the multimodal projector.
+        text_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the text model.
+        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the audio model.
+        projector_ln_mid (`bool`, *optional*, defaults to `False`):
+            Whether to apply layer normalization at the middle of the
+            projector or at the end. Versions v0.4.1 and below
+            use `False`, but v0.5 and above use `True`.
+    """
+
+    model_type = "ultravox"
+    is_composition = False
+
+    def __init__(
+        self,
+        audio_config: Optional[dict[str, Any]] = None,
+        text_config: Optional[dict[str, Any]] = None,
+        audio_model_id: Optional[str] = None,
+        text_model_id: Optional[str] = None,
+        ignore_index: int = -100,
+        audio_token_index: int = 32000,
+        hidden_size: int = 4096,
+        stack_factor: int = 8,
+        norm_init: float = 0.4,
+        projector_act: str = "swiglu",
+        text_model_lora_config: Optional[dict[str, Any]] = None,
+        audio_model_lora_config: Optional[dict[str, Any]] = None,
+        projector_ln_mid: bool = False,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+
+        self.audio_model_id = audio_model_id
+        self.text_model_id = text_model_id
+        self.audio_token_index = audio_token_index
+
+        self.hidden_size = hidden_size
+        self.stack_factor = stack_factor
+        self.norm_init = norm_init
+        self.projector_act = projector_act
+        self.projector_ln_mid = projector_ln_mid
+
+        if text_model_id is not None:
+            # Avoid circular import
+            from vllm.transformers_utils.config import get_config
+
+            self.text_config = get_config(text_model_id,
+                                          trust_remote_code=False)
+        else:
+            text_config = text_config or {}
+            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
+                "model_type", "llama")](**text_config)
+
+        if audio_model_id is not None:
+            # Avoid circular import
+            from vllm.transformers_utils.config import get_config
+
+            self.audio_config = get_config(audio_model_id,
+                                           trust_remote_code=False)
+        else:
+            audio_config = audio_config or {}
+            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+                "model_type", "whisper")](**audio_config)
+
+        self.text_model_lora_config = text_model_lora_config or {}
+        self.audio_model_lora_config = audio_model_lora_config or {}
+
+        self.vocab_size = self.text_config.vocab_size
+
+        self.initializer_range = self.text_config.initializer_range
+
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
+                           Sequence, SequenceGroup)
+
+from .detokenizer_utils import (convert_prompt_ids_to_tokens,
+                                detokenize_incrementally)
+from .tokenizer import AnyTokenizer
+from .tokenizer_group import TokenizerGroup
+
+
+class Detokenizer:
+    """Provides methods to decode the output of a model into text."""
+
+    def __init__(self, tokenizer_group: TokenizerGroup):
+        self.tokenizer_group = tokenizer_group
+
+    def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
+        """Returns the HF tokenizer to use for a given sequence."""
+        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
+
+    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
+                                       prompt_logprobs: list[Optional[dict[
+                                           int, Logprob]]],
+                                       position_offset: int) -> None:
+        """Decodes the logprobs for the prompt of a sequence group.
+
+        Args:
+            seq_group: The sequence group to decode.
+            prompt_logprobs: The logprobs to decode.
+            position_offset: Offset of the first index of the logprobs 
+                relative to the start of the sequence (for chunked prefill).
+        
+        Returns:
+            The prompt logprobs with the decoded tokens.
+        """
+        prms = seq_group.sampling_params
+        assert prms is not None
+
+        # We can pick any sequence for the prompt.
+        seq = seq_group.get_seqs()[0]
+        # Only prompt, without the generated token.
+        all_token_ids = seq.get_token_ids()
+        prompt_token_ids = all_token_ids[:-1]
+        tokenizer = self.get_tokenizer_for_seq(seq)
+        prefix_offset = 0
+        read_offset = 0
+        next_iter_prefix_offset = 0
+        next_iter_read_offset = 0
+        next_iter_tokens: list[str] = []
+        prev_tokens = None
+
+        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
+                prompt_logprobs):
+
+            # Absolute token position equals the index in the logprobs
+            # list plus the offset of the entire logprobs list relative
+            # to the start of the sequence.
+            token_position = token_position_in_logprob + position_offset
+            if not prompt_logprobs_for_token:
+                continue
+            for token_id, sample_logprob in prompt_logprobs_for_token.items():
+                if (sample_logprob.decoded_token is None
+                        and token_id != VLLM_INVALID_TOKEN_ID):
+                    prompt_token_ids_with_token = (
+                        prompt_token_ids[:token_position] + [token_id])
+                    (new_tokens, new_text, new_prefix_offset,
+                     new_read_offset) = detokenize_incrementally(
+                         tokenizer=tokenizer,
+                         all_input_ids=prompt_token_ids_with_token,
+                         prev_tokens=prev_tokens,
+                         prefix_offset=prefix_offset,
+                         read_offset=read_offset,
+                         skip_special_tokens=prms.skip_special_tokens,
+                         spaces_between_special_tokens=prms.
+                         spaces_between_special_tokens,
+                     )
+
+                    sample_logprob.decoded_token = new_text
+
+                    # Use the offsets & prev tokens corresponding to
+                    # real tokens to ensure detokenization is consistent
+                    # actual with prompt.
+                    if token_id == all_token_ids[token_position]:
+                        next_iter_prefix_offset = new_prefix_offset
+                        next_iter_read_offset = new_read_offset
+                        next_iter_tokens = new_tokens
+
+            # Advance to the next token position.
+            prefix_offset = next_iter_prefix_offset
+            read_offset = next_iter_read_offset
+            if prev_tokens is None:
+                prev_tokens = next_iter_tokens.copy()
+            else:
+                prev_tokens.extend(next_iter_tokens)
+
+    def decode_sequence_inplace(self, seq: Sequence,
+                                prms: SamplingParams) -> int:
+        """Decodes the new token for a sequence. In-place operation.
+
+        Args:
+            seq: The sequence to decode.
+            prms: The sampling parameters used to generate the sequence.
+
+        Returns:
+            The number of characters added to the output text.
+        """
+        all_input_ids = seq.get_token_ids()
+        token_id_generated_this_iteration = all_input_ids[-1]
+        tokenizer = self.get_tokenizer_for_seq(seq)
+
+        # Convert prompt token IDs to tokens if necessary.
+        # Do it here so that we don't have to repeat this
+        # computation for each logprob.
+        if seq.tokens is None:
+            (seq.tokens, seq.prefix_offset,
+             seq.read_offset) = convert_prompt_ids_to_tokens(
+                 tokenizer=tokenizer,
+                 prompt_ids=all_input_ids[:-1],
+                 skip_special_tokens=prms.skip_special_tokens,
+             )
+
+        (new_tokens, new_decoded_token_text, prefix_offset,
+         read_offset) = detokenize_incrementally(
+             tokenizer=tokenizer,
+             all_input_ids=all_input_ids,
+             prev_tokens=seq.tokens,
+             prefix_offset=seq.prefix_offset,
+             read_offset=seq.read_offset,
+             skip_special_tokens=prms.skip_special_tokens,
+             spaces_between_special_tokens=prms.spaces_between_special_tokens,
+         )
+
+        # Decode logprobs
+        logprobs = seq.output_logprobs[-1]
+        if logprobs:
+            previous_tokens = all_input_ids[:-1]
+            for token_id, sample_logprob in logprobs.items():
+                # If the token was generated this iteration,
+                # use the provided text.
+                if token_id == token_id_generated_this_iteration:
+                    sample_logprob.decoded_token = new_decoded_token_text
+                    continue
+
+                if (sample_logprob.decoded_token is None
+                        and token_id != VLLM_INVALID_TOKEN_ID):
+                    all_input_ids_with_logprob = previous_tokens + [token_id]
+                    (_, new_text, _, _) = detokenize_incrementally(
+                        tokenizer=tokenizer,
+                        all_input_ids=all_input_ids_with_logprob,
+                        prev_tokens=seq.tokens,
+                        prefix_offset=seq.prefix_offset,
+                        read_offset=seq.read_offset,
+                        skip_special_tokens=prms.skip_special_tokens,
+                        spaces_between_special_tokens=prms.
+                        spaces_between_special_tokens,
+                    )
+                    sample_logprob.decoded_token = new_text
+
+        seq.tokens.extend(new_tokens)
+        seq.prefix_offset = prefix_offset
+        seq.read_offset = read_offset
+        seq.output_text += new_decoded_token_text
+
+        return len(new_decoded_token_text)
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+from .tokenizer import AnyTokenizer
+
+
+def _replace_none_with_empty(tokens: list[Optional[str]]):
+    for i, token in enumerate(tokens):
+        if token is None:
+            tokens[i] = ""
+
+
+def _convert_tokens_to_string_with_added_encoders(
+    tokenizer: AnyTokenizer,
+    output_tokens: list[str],
+    skip_special_tokens: bool,
+    spaces_between_special_tokens: bool,
+) -> str:
+    # Adapted from
+    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
+    # NOTE(woosuk): The following code is slow because it runs a for loop over
+    # the output_tokens. In Python, running a for loop over a list can be slow
+    # even when the loop body is very simple.
+    sub_texts: list[str] = []
+    current_sub_text: list[str] = []
+    all_special_tokens = set(tokenizer.all_special_tokens)
+    for token in output_tokens:
+        if skip_special_tokens and token in all_special_tokens:
+            continue
+        if token in tokenizer.get_added_vocab():
+            if current_sub_text:
+                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+                sub_texts.append(sub_text)
+                current_sub_text = []
+            sub_texts.append(token)
+        else:
+            current_sub_text.append(token)
+    if current_sub_text:
+        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+        sub_texts.append(sub_text)
+    if spaces_between_special_tokens:
+        return " ".join(sub_texts)
+    else:
+        return "".join(sub_texts)
+
+
+# 5 is an arbitrary value that should work for all
+# tokenizers (bigger = more conservative).
+INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+
+def convert_prompt_ids_to_tokens(
+    tokenizer: AnyTokenizer,
+    prompt_ids: list[int],
+    skip_special_tokens: bool = False,
+) -> tuple[list[str], int, int]:
+    """Converts the prompt ids to tokens and returns the tokens and offsets
+    for incremental detokenization.
+
+    Note that not all tokens are converted to strings. Only the tokens that
+    are necessary for incremental detokenization are converted to strings.
+    """
+    # We do not need to convert the whole prompt to tokens.
+    # Offset a little more in case we have special tokens.
+    new_tokens = tokenizer.convert_ids_to_tokens(
+        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
+        skip_special_tokens=skip_special_tokens)
+    read_offset = len(new_tokens)
+    prefix_offset = max(
+        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+    # This is required to guard against out-of-vocab prompt token ids
+    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
+    return new_tokens, prefix_offset, read_offset
+
+
+def convert_ids_list_to_tokens(
+    tokenizer: AnyTokenizer,
+    token_ids: list[int],
+) -> list[str]:
+    """Detokenize the input ids individually.
+
+    Args:
+      tokenizer: tokenizer used by model under test
+      token_ids: convert these tokens (Python list form)
+
+    Returns:
+      Python list of token string representations
+    
+    """
+    token_str_lst = tokenizer.convert_ids_to_tokens(token_ids)
+    _replace_none_with_empty(token_str_lst)  # type: ignore
+    return token_str_lst
+
+
+# Based on
+# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
+# under Apache 2.0 license
+def detokenize_incrementally(
+    tokenizer: AnyTokenizer,
+    all_input_ids: list[int],
+    prev_tokens: Optional[list[str]],
+    prefix_offset: int,
+    read_offset: int,
+    skip_special_tokens: bool = False,
+    spaces_between_special_tokens: bool = True,
+) -> tuple[list[str], str, int, int]:
+    """Detokenizes the input ids incrementally and returns the new tokens
+    and the new text.
+
+    If `prev_tokens` is None, this function will convert the input ids to
+    tokens and return the tokens and the new text. Otherwise, it will return the
+    new tokens and the new text.
+
+    This function will also return the new prefix offset and the new read
+    offset to be used in the next iteration.
+
+    The offsets are necessary to defeat cleanup algorithms in the decode which
+    decide to add a space or not depending on the surrounding ids.
+
+    Args:
+        tokenizer: The tokenizer to use.
+        all_input_ids: The input ids. The last id is the new token id.
+        prev_tokens: The previous tokens. If None, this function will convert
+            the input ids to tokens and return the tokens and the new text.
+        prefix_offset: The prefix offset.
+        read_offset: The read offset.
+        skip_special_tokens: Whether to skip special tokens.
+        spaces_between_special_tokens: Whether to add spaces between special
+            tokens.
+    """
+    new_token_id = all_input_ids[-1]
+    # This is the first iteration for this sequence
+    is_first_iter = prev_tokens is None
+    if is_first_iter:
+        (prev_tokens, prefix_offset,
+         read_offset) = convert_prompt_ids_to_tokens(
+             tokenizer,
+             all_input_ids[:-1],
+             skip_special_tokens=skip_special_tokens)
+    assert prev_tokens is not None
+
+    # If the new token id is out of bounds, return an empty string.
+    if 0 <= new_token_id < len(tokenizer):
+        # Put new_token_id in a list so skip_special_tokens is respected
+        new_tokens = tokenizer.convert_ids_to_tokens(
+            [new_token_id], skip_special_tokens=skip_special_tokens)
+        if isinstance(new_tokens, str):
+            new_tokens = [new_tokens]
+    else:
+        new_tokens = [""]
+    output_tokens = prev_tokens + new_tokens
+
+    # If this is the first iteration, return all tokens.
+    if is_first_iter:
+        new_tokens = output_tokens
+
+    # The prefix text is necessary only to defeat cleanup algorithms in
+    # the decode which decide to add a space or not depending on the
+    # surrounding ids.
+    if tokenizer.is_fast or not tokenizer.get_added_vocab():
+        prefix_text = tokenizer.convert_tokens_to_string(
+            output_tokens[prefix_offset:read_offset])
+        new_text = tokenizer.convert_tokens_to_string(
+            output_tokens[prefix_offset:])
+    else:
+        prefix_text = _convert_tokens_to_string_with_added_encoders(
+            tokenizer,
+            output_tokens[prefix_offset:read_offset],
+            skip_special_tokens=skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+        new_text = _convert_tokens_to_string_with_added_encoders(
+            tokenizer,
+            output_tokens[prefix_offset:],
+            skip_special_tokens=skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+
+    if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
+        # utf-8 char at the end means it's a potential unfinished byte sequence
+        # from byte fallback tokenization.
+        # If it's in the middle, it's probably a real invalid id generated
+        # by the model
+        return new_tokens, "", prefix_offset, read_offset
+
+    new_text = new_text[len(prefix_text):]
+    return new_tokens, new_text, read_offset, len(output_tokens)
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Optional, Union, cast
+
+from transformers.processing_utils import ProcessorMixin
+from typing_extensions import TypeVar
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
+_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+
+
+class HashableDict(dict):
+    """
+    A dictionary that can be hashed by lru_cache.
+    """
+
+    # NOTE: pythonic dict is not hashable,
+    # we override on it directly for simplicity
+    def __hash__(self) -> int:  # type: ignore[override]
+        return hash(frozenset(self.items()))
+
+
+class HashableList(list):
+    """
+    A list that can be hashed by lru_cache.
+    """
+
+    def __hash__(self) -> int:  # type: ignore[override]
+        return hash(tuple(self))
+
+
+def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
+    mm_config = model_config.get_multimodal_config()
+    base_kwargs = mm_config.mm_processor_kwargs
+    if base_kwargs is None:
+        base_kwargs = {}
+
+    merged_kwargs = {**base_kwargs, **kwargs}
+
+    # NOTE: Pythonic dict is not hashable and will raise unhashable type
+    # error when calling `cached_get_processor`, therefore we need to
+    # wrap it to a hashable dict.
+    for key, value in merged_kwargs.items():
+        if isinstance(value, dict):
+            merged_kwargs[key] = HashableDict(value)
+        if isinstance(value, list):
+            merged_kwargs[key] = HashableList(value)
+    return merged_kwargs
+
+
+def get_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    **kwargs: Any,
+) -> _P:
+    """Load a processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoProcessor
+
+    processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
+                         isinstance(processor_cls, tuple) else processor_cls)
+
+    try:
+        processor = processor_factory.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the processor. If the processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    if not isinstance(processor, processor_cls):
+        raise TypeError("Invalid type of HuggingFace processor. "
+                        f"Expected type: {processor_cls}, but "
+                        f"found type: {type(processor)}")
+
+    return processor
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+def cached_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    **kwargs: Any,
+) -> _P:
+    return cached_get_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
+
+
+def get_feature_extractor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    **kwargs: Any,
+):
+    """Load an audio feature extractor for the given model name 
+    via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoFeatureExtractor
+    from transformers.feature_extraction_utils import FeatureExtractionMixin
+    try:
+        feature_extractor = AutoFeatureExtractor.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the feature extractor. If the feature "
+                "extractor is a custom extractor not yet available in the "
+                "HuggingFace transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    return cast(FeatureExtractionMixin, feature_extractor)
+
+
+cached_get_feature_extractor = lru_cache(get_feature_extractor)
+
+
+def cached_feature_extractor_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_feature_extractor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
+
+
+def get_image_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    **kwargs: Any,
+):
+    """Load an image processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoImageProcessor
+    from transformers.image_processing_utils import BaseImageProcessor
+
+    try:
+        processor = AutoImageProcessor.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the image processor. If the image processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(BaseImageProcessor, processor)
+
+
+cached_get_image_processor = lru_cache(get_image_processor)
+
+
+def cached_image_processor_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_image_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        **_merge_mm_kwargs(model_config, **kwargs),
+    )
--- a/vllm/transformers_utils/processors/init.py
+++ b/vllm/transformers_utils/processors/init.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.transformers_utils.processors.deepseek_vl2 import (
+    DeepseekVLV2Processor)
+from vllm.transformers_utils.processors.ovis import OvisProcessor
+
+__all__ = ["DeepseekVLV2Processor", "OvisProcessor"]
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@@ -0,0 +1,363 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import math
+
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+class ImageTransform:
+
+    def __init__(self,
+                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 normalize: bool = True):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: tuple[tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = 'left'  # must set this，padding side with make a difference in batch inference
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def select_best_resolution(self, image_size):
+        # used for cropping
+        original_width, original_height = image_size
+        best_fit = None
+        max_effective_resolution = 0
+        min_wasted_resolution = float("inf")
+
+        for width, height in self.candidate_resolutions:
+            scale = min(width / original_width, height / original_height)
+            downscaled_width, downscaled_height = int(
+                original_width * scale), int(original_height * scale)
+            effective_resolution = min(downscaled_width * downscaled_height,
+                                       original_width * original_height)
+            wasted_resolution = (width * height) - effective_resolution
+
+            if effective_resolution > max_effective_resolution or (
+                    effective_resolution == max_effective_resolution
+                    and wasted_resolution < min_wasted_resolution):
+                max_effective_resolution = effective_resolution
+                min_wasted_resolution = wasted_resolution
+                best_fit = (width, height)
+
+        return best_fit
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: list[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str,
+        images: list[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (list[dict]): conversations with a list of messages;
+            images (list[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (list[int]): the number of image tokens
+        """
+
+        assert (prompt is not None and images is not None
+                ), "prompt and images must be used at the same time."
+
+        sft_format = prompt
+        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
+            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) |
+                   (input_ids == self.image_token_id)] = self.ignore_id
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            # Remove the ending eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        input_ids = input_ids.unsqueeze(0)
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: list[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (list[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (list[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            inference_mode=inference_mode,
+        )
+
+        return prepare
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: list[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width, best_height = self.image_size, self.image_size
+
+            """process the global view"""
+            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
+                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(image, (best_width, best_height),
+                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+
+
+AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -0,0 +1,420 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import cached_property
+from typing import Union
+
+import PIL
+import torch
+from transformers import AutoProcessor, BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
+                                           Unpack)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from vllm.multimodal.image import convert_image_mode
+
+__all__ = ['OvisProcessor']
+IGNORE_ID = -100
+
+class OvisProcessorKwargs(ProcessingKwargs, total=False):   # type: ignore[call-arg]
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {
+            'max_partition':9,
+            'covering_threshold':0.9,
+            'convert_to_rgb':True,
+        'return_tensors':'pt'},
+    }
+
+
+
+class OvisProcessor(ProcessorMixin):
+    r"""
+    Constructs a Ovis processor which wraps a Ovis image processor and a Qwen2 tokenizer into a single processor.
+    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"]
+
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        image_pad_token=None,
+        image_segment_len=255,
+        **kwargs,
+    ):
+        self.image_token = "<image>"
+        self.image_pad_token = image_pad_token
+        self.image_segment_len = image_segment_len
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    @cached_property
+    def extra_special_tokens(self):
+        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
+        extra_special_tokens = {
+            "image_token": -200,
+            "image_atom": -300,
+            "image_start": -301,
+            "image_prefix": -302,
+            "image_col_sep": -303,
+            "image_row_sep": -304,
+            "image_end": -305,
+            'image_pad': image_pad_token_id,
+        }
+        return extra_special_tokens
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        **kwargs: Unpack[OvisProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+            Args:
+                images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                    The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                    tensor. Both channels-first and channels-last formats are supported.
+                text (`str`, `list[str]`, `list[list[str]]`):
+                    The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                    (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                    `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+                videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                    The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                    tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+                return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                    If set, will return tensors of a particular framework. Acceptable values are:
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
+            Returns:
+                [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+                - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+                - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+                  `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+                  `None`).
+                - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+                - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+                - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+                - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+                - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            OvisProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process all images first
+        image_features = {}
+        if images is not None:
+            processed_images = []
+            image_placeholders_list = []
+            grids = []
+
+            # Process each image
+            for image in images if isinstance(images, list) else [images]:
+                pixel_values, image_placeholders, grid = self.preprocess_image(
+                    image=image, **output_kwargs["images_kwargs"]
+                )
+                processed_images.append(pixel_values)
+                image_placeholders_list.append(image_placeholders)
+                grids.append(grid)
+
+            # assign all processed images
+            if processed_images:
+                image_features["image_placeholders"] = image_placeholders_list
+
+        # Process text input
+        if text is not None:
+
+            if not isinstance(text, list):
+                text = [text]
+
+            tokenized_batched_text = self._tokenize_with_image_symbol(text)
+            image_token_id = self.get_token_value("image_token")
+            replaced_ids_list = []
+            idx = 0
+            for ids_tensor in tokenized_batched_text:
+                if image_token_id in ids_tensor and "image_placeholders" in image_features:
+                    if idx < len(image_features["image_placeholders"]):
+                        # Converts in list for ease of use
+                        ids_list = ids_tensor.tolist()
+
+                        new_ids = []
+
+                        # replace placeholders
+                        for i, token_id in enumerate(ids_list):
+                            if token_id == image_token_id:
+                                placeholder_ids = image_features["image_placeholders"][idx]
+                                new_ids.extend(placeholder_ids)
+                                idx += 1
+                            else:
+                                new_ids.append(token_id)
+
+                        # Converts back to tensors
+                        ids_tensor = torch.tensor(new_ids, dtype=torch.long)
+                    else:
+                        raise RuntimeError(
+                            'Mismatch between the images you provided and the number of placeholder present in the text')
+
+                replaced_ids_list.append(ids_tensor)
+
+            if replaced_ids_list:
+                replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
+            else:
+                replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
+
+            # Create the output with text features
+            output = BatchFeature(
+                data={
+                    "input_ids": replaced_and_tokenized_ids,
+                }
+            )
+
+            # Add image features if present
+            if image_features:
+                output["pixel_values"] = processed_images
+                output['grids'] = grids
+
+            return output
+
+        # If only images were provided
+        return BatchFeature(data=image_features)
+
+    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
+        batch_token_ids = []
+        for text in text_list:
+            text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
+                           text.split(self.image_token)]
+            token_ids = []
+            num_chuck = len(text_chunks)
+            for i, chunk in enumerate(text_chunks):
+                token_ids.extend(chunk)
+                if i < num_chuck - 1:
+                    token_ids.append(self.get_token_value("image_token"))
+            batch_token_ids.append(token_ids)
+        return torch.tensor(batch_token_ids, dtype=torch.long)
+
+    def get_image_size(self):
+        size = self.image_processor.size
+        if 'shortest_edge' in size:
+            width = height = size['shortest_edge']
+        elif "height" in size and "width" in size:
+            width = size['width']
+            height = size['height']
+        else:
+            raise ValueError( "Can't parse image size from image_processor config.")
+        return height, width
+
+    def get_token_value(self, tok):
+        return self.extra_special_tokens[tok]
+
+    def construct_image_indicators(self, grid):
+        image_placeholders = [self.get_token_value('image_start'),
+                              self.get_token_value('image_atom'),
+                              self.get_token_value('image_prefix')]
+        if grid[0] * grid[1] > 1:
+            for r in range(grid[0]):
+                for c in range(grid[1]):
+                    image_placeholders.append(self.get_token_value('image_atom') )
+                    if c < grid[1] - 1:
+                        image_placeholders.append(self.get_token_value('image_col_sep'))
+                if r < grid[0] - 1:
+                    image_placeholders.append(self.get_token_value('image_row_sep'))
+        image_placeholders.append(self.get_token_value('image_end'))
+        return image_placeholders
+
+    def construct_image_placeholders(self, grid):
+
+        image_placeholders = self.construct_image_indicators(grid)
+
+        image_atom_token_id = self.get_token_value('image_atom')
+        # Extract the padding token ID from tokenizer
+        image_padding_token_id = self.get_token_value('image_pad')
+
+        # Create a new list with padding tokens inserted
+        padded_placeholder_tokens = []
+        for token in image_placeholders:
+            padded_placeholder_tokens.append(image_padding_token_id)
+            if token == image_atom_token_id:
+                padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
+        return padded_placeholder_tokens
+
+    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+        def _preprocess(img: PIL.Image.Image, side):
+            # first resize and preprocess
+            w, h = img.size
+            if w == h:
+                new_width = new_height = side
+            elif w > h:
+                new_width = side
+                new_height = int(h / w * new_width)
+            else:
+                new_height = side
+                new_width = int(w / h * new_height)
+            new_size = dict(height=new_height, width=new_width)
+            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+
+            # then pad to square
+            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
+            new_height, new_width = pixel_values.shape[2:]
+            if new_height == new_width:
+                square_values[:, :, :, :] = pixel_values
+            elif new_height > new_width:
+                from_index = (side - new_width) // 2
+                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+            else:
+                from_index = (side - new_height) // 2
+                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+
+            return square_values
+
+        def _partition(img, grid) -> list[tuple[int, int, int, int]]:
+            w, h = img.size
+            row_height = h // grid[0]
+            col_width = w // grid[1]
+
+            partition = []
+            for row in range(grid[0]):
+                for col in range(grid[1]):
+                    left = col * col_width
+                    upper = row * row_height
+                    right = w if col == grid[1] - 1 else (col + 1) * col_width
+                    lower = h if row == grid[0] - 1 else (row + 1) * row_height
+                    partition.append((left, upper, right, lower))
+
+            return partition
+
+        def _covering_area(left, upper, right, lower, side):
+            w = right - left
+            h = lower - upper
+            w, h = max(w, h), min(w, h)
+            if w > side:
+                h = h / w * side
+                w = side
+            return w * h
+
+        def _get_best_grid(img, side):
+            img_area = img.size[0] * img.size[1]
+
+            candidate_grids = []
+            for i in range(1, max_partition + 1):
+                for j in range(1, max_partition + 1):
+                    if i * j <= max_partition:
+                        candidate_grids.append((i, j))
+
+            all_grids = []
+            good_grids = []
+            for grid in candidate_grids:
+                partition = _partition(img, grid)
+                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                assert covering_ratio <= 1.0
+                all_grids.append((grid, covering_ratio))
+                if covering_ratio > covering_threshold:
+                    good_grids.append((grid, covering_ratio))
+
+            if len(good_grids) > 0:
+                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+            else:
+                # pick the partition with maximum covering_ratio and break the tie using #sub_images
+                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
+
+        if convert_to_rgb:
+            image = convert_image_mode(image, 'RGB')
+
+
+        sides = self.get_image_size()
+        if sides[0] != sides[1]:
+            raise ValueError('get_image_size() returns non-square size')
+        side = sides[0]
+        grid = _get_best_grid(image, side)
+        partition = _partition(image, grid)
+        crops = [image.crop(p) for p in partition]
+        if len(crops) > 1:
+            crops.insert(0, image)
+        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
+        image_placeholders = self.construct_image_placeholders(grid)
+        return pixel_values, image_placeholders, grid
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        return names_from_processor + ["second_per_grid_ts"]
+
+
+AutoProcessor.register("OvisProcessor", OvisProcessor)
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import fnmatch
+import os
+import shutil
+import signal
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+from vllm.utils import PlaceholderModule
+
+try:
+    import boto3
+except ImportError:
+    boto3 = PlaceholderModule("boto3")  # type: ignore[assignment]
+
+
+def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path for path in paths if any(
+            fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path for path in paths
+        if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def glob(s3=None,
+         path: str = "",
+         allow_pattern: Optional[list[str]] = None) -> list[str]:
+    """
+    List full file names from S3 path and filter by allow pattern.
+
+    Args:
+        s3: S3 client to use.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+
+    Returns:
+        list[str]: List of full S3 paths allowed by the pattern
+    """
+    if s3 is None:
+        s3 = boto3.client("s3")
+    if not path.endswith("/"):
+        path = path + "/"
+    bucket_name, _, paths = list_files(s3,
+                                       path=path,
+                                       allow_pattern=allow_pattern)
+    return [f"s3://{bucket_name}/{path}" for path in paths]
+
+
+def list_files(
+        s3,
+        path: str,
+        allow_pattern: Optional[list[str]] = None,
+        ignore_pattern: Optional[list[str]] = None
+) -> tuple[str, str, list[str]]:
+    """
+    List files from S3 path and filter by pattern.
+
+    Args:
+        s3: S3 client to use.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+        ignore_pattern: A list of patterns of which files not to pull.
+
+    Returns:
+        tuple[str, str, list[str]]: A tuple where:
+            - The first element is the bucket name
+            - The second element is string represent the bucket 
+              and the prefix as a dir like string
+            - The third element is a list of files allowed or 
+              disallowed by pattern
+    """
+    parts = path.removeprefix('s3://').split('/')
+    prefix = '/'.join(parts[1:])
+    bucket_name = parts[0]
+
+    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+    paths = [obj['Key'] for obj in objects.get('Contents', [])]
+
+    paths = _filter_ignore(paths, ["*/"])
+    if allow_pattern is not None:
+        paths = _filter_allow(paths, allow_pattern)
+
+    if ignore_pattern is not None:
+        paths = _filter_ignore(paths, ignore_pattern)
+
+    return bucket_name, prefix, paths
+
+
+class S3Model:
+    """
+    A class representing a S3 model mirrored into a temporary directory.
+
+    Attributes:
+        s3: S3 client.
+        dir: The temporary created directory.
+
+    Methods:
+        pull_files(): Pull model from S3 to the temporary directory.
+    """
+
+    def __init__(self) -> None:
+        self.s3 = boto3.client('s3')
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            existing_handler = signal.getsignal(sig)
+            signal.signal(sig, self._close_by_signal(existing_handler))
+
+        self.dir = tempfile.mkdtemp()
+
+    def __del__(self):
+        self._close()
+
+    def _close(self) -> None:
+        if os.path.exists(self.dir):
+            shutil.rmtree(self.dir)
+
+    def _close_by_signal(self, existing_handler=None):
+
+        def new_handler(signum, frame):
+            self._close()
+            if existing_handler:
+                existing_handler(signum, frame)
+
+        return new_handler
+
+    def pull_files(self,
+                   s3_model_path: str = "",
+                   allow_pattern: Optional[list[str]] = None,
+                   ignore_pattern: Optional[list[str]] = None) -> None:
+        """
+        Pull files from S3 storage into the temporary directory.
+
+        Args:
+            s3_model_path: The S3 path of the model.
+            allow_pattern: A list of patterns of which files to pull.
+            ignore_pattern: A list of patterns of which files not to pull.
+
+        """
+        if not s3_model_path.endswith("/"):
+            s3_model_path = s3_model_path + "/"
+
+        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
+                                                  allow_pattern,
+                                                  ignore_pattern)
+        if len(files) == 0:
+            return
+
+        for file in files:
+            destination_file = os.path.join(
+                self.dir,
+                file.removeprefix(base_dir).lstrip("/"))
+            local_dir = Path(destination_file).parent
+            os.makedirs(local_dir, exist_ok=True)
+            self.s3.download_file(bucket_name, file, destination_file)
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import contextlib
+import copy
+import os
+import warnings
+from functools import lru_cache
+from pathlib import Path
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import huggingface_hub
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
+                                                    TokenizerRegistry)
+from vllm.transformers_utils.tokenizers import MistralTokenizer
+from vllm.transformers_utils.utils import check_gguf_file
+from vllm.utils import make_async
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+
+logger = init_logger(__name__)
+
+AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
+                     TokenizerBase]
+
+
+def decode_tokens(
+    tokenizer: AnyTokenizer,
+    token_ids: list[int],
+    *,
+    skip_special_tokens: Optional[bool] = None,
+) -> str:
+    """
+    Backend-agnostic equivalent of HF's
+    `tokenizer.decode(token_ids, ...)`.
+
+    `skip_special_tokens=None` means to use the backend's default
+    settings.
+    """
+    if skip_special_tokens is not None:
+        return tokenizer.decode(token_ids,
+                                skip_special_tokens=skip_special_tokens)
+
+    return tokenizer.decode(token_ids)
+
+
+def encode_tokens(
+    tokenizer: AnyTokenizer,
+    text: str,
+    *,
+    truncation: Optional[bool] = None,
+    max_length: Optional[int] = None,
+    add_special_tokens: Optional[bool] = None,
+) -> list[int]:
+    """
+    Backend-agnostic equivalent of HF's
+    `tokenizer.encode(text, ...)`.
+
+    `add_special_tokens=None` means to use the backend's default
+    settings.
+    """
+
+    kw_args: dict[str, Any] = {}
+    if max_length is not None:
+        kw_args["max_length"] = max_length
+
+    if truncation is not None:
+        kw_args["truncation"] = truncation
+
+    if add_special_tokens is not None:
+        kw_args["add_special_tokens"] = add_special_tokens
+
+    return tokenizer.encode(text, **kw_args)
+
+
+def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
+    """
+    By default, transformers will recompute multiple tokenizer properties
+    each time they are called, leading to a significant slowdown.
+    This proxy caches these properties for faster access.
+    """
+    cached_tokenizer = copy.copy(tokenizer)
+
+    tokenizer_all_special_ids = tokenizer.all_special_ids
+    tokenizer_all_special_tokens = tokenizer.all_special_tokens
+    tokenizer_all_special_tokens_extended = (
+        tokenizer.all_special_tokens_extended)
+    tokenizer_vocab = tokenizer.get_vocab()
+    tokenizer_len = len(tokenizer)
+
+    max_token_id = max(tokenizer_vocab.values())
+    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
+    # are added and included in the implementation of the vocab_size
+    # property, but not in get_vocab(); if there is an implementation
+    # of vocab size, we should take the greater value.
+    if hasattr(tokenizer, "vocab_size"):
+        with contextlib.suppress(NotImplementedError):
+            max_token_id = max(max_token_id, tokenizer.vocab_size)
+
+    class CachedTokenizer(tokenizer.__class__):  # type: ignore
+
+        @property
+        def all_special_ids(self) -> list[int]:
+            return tokenizer_all_special_ids
+
+        @property
+        def all_special_tokens(self) -> list[str]:
+            return tokenizer_all_special_tokens
+
+        @property
+        def all_special_tokens_extended(self) -> list[str]:
+            return tokenizer_all_special_tokens_extended
+
+        @property
+        def max_token_id(self) -> int:
+            return max_token_id
+
+        def get_vocab(self) -> dict[str, int]:
+            return tokenizer_vocab
+
+        def __len__(self) -> int:
+            return tokenizer_len
+
+        def __reduce__(self):
+            return get_cached_tokenizer, (tokenizer, )
+
+    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
+
+    cached_tokenizer.__class__ = CachedTokenizer
+    return cached_tokenizer
+
+
+def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
+    """Patch _pad method to accept `padding_side` for older tokenizers."""
+    orig_pad = tokenizer._pad
+
+    def _pad(
+        self: PreTrainedTokenizer,
+        *args,
+        padding_side: Optional[str] = None,
+        **kwargs,
+    ):
+        if padding_side is not None and padding_side != self.padding_side:
+            msg = ("`padding_side` argument is not supported by "
+                   f"{type(tokenizer).__name__} and will be ignored.")
+            warnings.warn(msg, stacklevel=2)
+
+        return orig_pad(*args, **kwargs)
+
+    tokenizer._pad = MethodType(_pad, tokenizer)
+
+
+def get_tokenizer(
+    tokenizer_name: Union[str, Path],
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    revision: Optional[str] = None,
+    download_dir: Optional[str] = None,
+    **kwargs,
+) -> AnyTokenizer:
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
+    """
+    if envs.VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        # pylint: disable=C.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # avoid circuit import
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not os.path.exists(tokenizer_name):
+            # Use file lock to prevent multiple processes from
+            # downloading the same file at the same time.
+            with get_lock(tokenizer_name, download_dir):
+                tokenizer_path = snapshot_download(
+                    model_id=tokenizer_name,
+                    cache_dir=download_dir,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Ignore weights - we only need the tokenizer.
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+                tokenizer_name = tokenizer_path
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
+    if "truncation_side" not in kwargs:
+        kwargs["truncation_side"] = "left"
+
+    # Separate model folder from file path for GGUF models
+    is_gguf = check_gguf_file(tokenizer_name)
+    if is_gguf:
+        kwargs["gguf_file"] = Path(tokenizer_name).name
+        tokenizer_name = Path(tokenizer_name).parent
+
+    # if tokenizer is from official mistral org
+    is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
+    if is_from_mistral_org and tokenizer_mode != "mistral":
+        warnings.warn(
+            'It is strongly recommended to run mistral models with '
+            '`--tokenizer-mode "mistral"` to ensure correct '
+            'encoding and decoding.',
+            FutureWarning,
+            stacklevel=2)
+
+    tokenizer: AnyTokenizer
+    if tokenizer_mode == "mistral":
+        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
+                                                     revision=revision)
+    elif tokenizer_mode == "custom":
+        tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
+                                                    *args,
+                                                    revision=revision,
+                                                    download_dir=download_dir,
+                                                    **kwargs)
+    else:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+        except ValueError as e:
+            # If the error pertains to the tokenizer class not existing or not
+            # currently being imported,
+            # suggest using the --trust-remote-code flag.
+            if not trust_remote_code and (
+                    "does not exist or is not currently imported." in str(e)
+                    or "requires you to execute the tokenizer file" in str(e)):
+                err_msg = ("Failed to load the tokenizer. If the tokenizer "
+                           "is a custom tokenizer not yet available in the "
+                           "HuggingFace transformers library, consider "
+                           "setting `trust_remote_code=True` in LLM or using "
+                           "the `--trust-remote-code` flag in the CLI.")
+                raise RuntimeError(err_msg) from e
+            else:
+                raise e
+
+        # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
+        if type(tokenizer).__name__ in ("ChatGLMTokenizer",
+                                        "ChatGLM4Tokenizer"):
+            assert isinstance(tokenizer, PreTrainedTokenizer)
+            patch_padding_side(tokenizer)
+
+        if not isinstance(tokenizer, PreTrainedTokenizerFast):
+            logger.warning(
+                "Using a slow tokenizer. This might cause a significant "
+                "slowdown. Consider using a fast tokenizer instead.")
+        tokenizer = get_cached_tokenizer(tokenizer)
+
+    return tokenizer
+
+
+cached_get_tokenizer = lru_cache(get_tokenizer)
+
+
+def cached_tokenizer_from_config(
+    model_config: "ModelConfig",
+    **kwargs: Any,
+):
+    return cached_get_tokenizer(
+        model_config.tokenizer,
+        tokenizer_mode=model_config.tokenizer_mode,
+        tokenizer_revision=model_config.tokenizer_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        **kwargs,
+    )
+
+
+def get_lora_tokenizer(lora_request: LoRARequest, *args,
+                       **kwargs) -> Optional[AnyTokenizer]:
+    if lora_request is None:
+        return None
+    try:
+        tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
+    except Exception as e:
+        # No tokenizer was found in the LoRA folder,
+        # use base model tokenizer
+        logger.warning(
+            "No tokenizer found in %s, using base model tokenizer instead. "
+            "(Exception: %s)", lora_request.lora_path, e)
+        tokenizer = None
+    return tokenizer
+
+
+get_lora_tokenizer_async = make_async(get_lora_tokenizer)
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+
+
+class TokenizerBase(ABC):
+
+    @property
+    @abstractmethod
+    def all_special_tokens_extended(self) -> list[str]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def all_special_tokens(self) -> list[str]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def all_special_ids(self) -> list[int]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def bos_token_id(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def eos_token_id(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def sep_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def pad_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def is_fast(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def vocab_size(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def max_token_id(self) -> int:
+        raise NotImplementedError()
+
+    def __len__(self) -> int:
+        return self.vocab_size
+
+    @abstractmethod
+    def __call__(
+        self,
+        text: Union[str, list[str], list[int]],
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_vocab(self) -> dict[str, int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_added_vocab(self) -> dict[str, int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def encode_one(
+        self,
+        text: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> list[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def encode(self,
+               text: str,
+               truncation: Optional[bool] = None,
+               max_length: Optional[int] = None,
+               add_special_tokens: Optional[bool] = None) -> list[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply_chat_template(self,
+                            messages: list["ChatCompletionMessageParam"],
+                            tools: Optional[list[dict[str, Any]]] = None,
+                            **kwargs) -> list[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def decode(self,
+               ids: Union[list[int], int],
+               skip_special_tokens: bool = True) -> str:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def convert_ids_to_tokens(
+        self,
+        ids: list[int],
+        skip_special_tokens: bool = True,
+    ) -> list[str]:
+        raise NotImplementedError()
+
+
+class TokenizerRegistry:
+    # Tokenizer name -> (tokenizer module, tokenizer class)
+    REGISTRY: dict[str, tuple[str, str]] = {}
+
+    @staticmethod
+    def register(name: str, module: str, class_name: str) -> None:
+        TokenizerRegistry.REGISTRY[name] = (module, class_name)
+
+    @staticmethod
+    def get_tokenizer(
+        tokenizer_name: str,
+        *args,
+        **kwargs,
+    ) -> TokenizerBase:
+        tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
+        if tokenizer_cls is None:
+            raise ValueError(f"Tokenizer {tokenizer_name} not found.")
+
+        tokenizer_module = importlib.import_module(tokenizer_cls[0])
+        class_ = getattr(tokenizer_module, tokenizer_cls[1])
+        return class_.from_pretrained(*args, **kwargs)
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
+                                               get_lora_tokenizer,
+                                               get_lora_tokenizer_async,
+                                               get_tokenizer)
+from vllm.utils import LRUCache
+
+
+class TokenizerGroup:
+    """A group of tokenizers that can be used for LoRA adapters."""
+
+    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
+                 max_input_length: Optional[int], **tokenizer_config):
+        self.tokenizer_id = tokenizer_id
+        self.tokenizer_config = tokenizer_config
+        self.enable_lora = enable_lora
+        self.max_input_length = max_input_length
+        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
+        max_loras = tokenizer_config.get("max_loras", 0)
+        self.lora_tokenizers = LRUCache[int, AnyTokenizer](
+            capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
+
+    def get_max_input_len(self,
+                          lora_request: Optional[LoRARequest] = None
+                          ) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        return self.max_input_length
+
+    def _raise_if_input_too_long(self,
+                                 encoded_tokens: list[int],
+                                 lora_request: Optional[LoRARequest] = None):
+        input_length = len(encoded_tokens)
+        if lora_request:
+            max_input_length = (lora_request.long_lora_max_len
+                                or self.max_input_length)
+        else:
+            max_input_length = self.max_input_length
+        if max_input_length is not None and input_length > max_input_length:
+            raise ValueError("Input too long.", input_length, max_input_length)
+
+    def encode(self,
+               prompt: str,
+               max_length: Optional[int] = None,
+               truncation: Optional[bool] = None,
+               lora_request: Optional[LoRARequest] = None,
+               add_special_tokens: Optional[bool] = None) -> list[int]:
+
+        tokenizer = self.get_lora_tokenizer(lora_request)
+        ret = encode_tokens(tokenizer,
+                            prompt,
+                            max_length=max_length,
+                            truncation=truncation,
+                            add_special_tokens=add_special_tokens)
+        self._raise_if_input_too_long(ret, lora_request)
+        return ret
+
+    async def encode_async(
+            self,
+            prompt: str,
+            max_length: Optional[int] = None,
+            truncation: Optional[bool] = None,
+            lora_request: Optional[LoRARequest] = None,
+            add_special_tokens: Optional[bool] = None) -> list[int]:
+        tokenizer = await self.get_lora_tokenizer_async(lora_request)
+        ret = encode_tokens(tokenizer,
+                            prompt,
+                            max_length=max_length,
+                            truncation=truncation,
+                            add_special_tokens=add_special_tokens)
+        self._raise_if_input_too_long(ret, lora_request)
+        return ret
+
+    def get_lora_tokenizer(
+        self,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> AnyTokenizer:
+        if not lora_request or not self.enable_lora:
+            return self.tokenizer
+        if lora_request.lora_int_id not in self.lora_tokenizers:
+            tokenizer = (get_lora_tokenizer(
+                lora_request, **self.tokenizer_config) or self.tokenizer)
+            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+            return tokenizer
+        else:
+            return self.lora_tokenizers[lora_request.lora_int_id]
+
+    async def get_lora_tokenizer_async(
+        self,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> AnyTokenizer:
+        if not lora_request or not self.enable_lora:
+            return self.tokenizer
+        if lora_request.lora_int_id not in self.lora_tokenizers:
+            tokenizer = (await get_lora_tokenizer_async(
+                lora_request, **self.tokenizer_config) or self.tokenizer)
+            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+            return tokenizer
+        else:
+            return self.lora_tokenizers[lora_request.lora_int_id]
+
+
+def init_tokenizer_from_configs(model_config: ModelConfig,
+                                scheduler_config: SchedulerConfig,
+                                lora_config: Optional[LoRAConfig]):
+    return TokenizerGroup(
+        tokenizer_id=model_config.tokenizer,
+        enable_lora=bool(lora_config),
+        max_num_seqs=scheduler_config.max_num_seqs,
+        max_loras=lora_config.max_loras if lora_config else 0,
+        max_input_length=None,
+        tokenizer_mode=model_config.tokenizer_mode,
+        trust_remote_code=model_config.trust_remote_code,
+        revision=model_config.tokenizer_revision,
+        truncation_side=model_config.truncation_side)
--- a/vllm/transformers_utils/tokenizers/init.py
+++ b/vllm/transformers_utils/tokenizers/init.py
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
+                      truncate_tool_call_ids, validate_request_params)
+
+__all__ = [
+    "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
+    "validate_request_params"
+]
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -0,0 +1,493 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional, Union, cast
+
+import huggingface_hub
+import regex as re
+from huggingface_hub import HfApi, hf_hub_download
+
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer_base import TokenizerBase
+from vllm.utils import is_list_of
+
+if TYPE_CHECKING:
+    # make sure `mistral_common` is lazy imported,
+    # so that users who only use non-mistral models
+    # will not be bothered by the dependency.
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    from mistral_common.tokens.tokenizers.mistral import (
+        MistralTokenizer as PublicMistralTokenizer)
+
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class Encoding:
+    input_ids: Union[list[int], list[list[int]]]
+
+
+def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
+    # SEE: https://github.com/vllm-project/vllm/pull/9951
+    # Credits go to: @gcalmettes
+    # NOTE: There is currently a bug in pydantic where attributes
+    # declared as iterables are replaced in in the instances by
+    # pydantic-core ValidatorIterator instance. In particular, this
+    # affects tool_calls defined in ChatCompletionAssistantMessageParam
+    # model:
+    # see:
+    #   - https://github.com/pydantic/pydantic/issues/9467
+    # As a result, tool_calls from assistant messages are never
+    # deserialized in the request object if the tool_calls iterator is
+    # not consumed. This affect messages passed to the MistralTokenizer
+    # since no chat template is applied and therefore the tools_calls
+    # iterator is not directly consumed.
+    # Issue is tracked on Pydantic side, with resolution planned for
+    # v2.11 release. In the meantime, the official workaround is to
+    # consume the iterator so the tool_calls are correctly deserialized
+    # in the OpenAI ChatCompletionAssistantMessageParam object
+    # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
+    # Official Pydantic Issues:
+    #   - https://github.com/pydantic/pydantic/issues/9541
+    # TODO: remove when pydantic v2.11 is released
+    for i, message in enumerate(request.messages):
+        if message.get("role") == 'assistant':
+            tool_calls_validator = message.get("tool_calls", ().__iter__())
+            validated_tool_calls = []
+            while True:
+                try:
+                    tool_call = next(tool_calls_validator)  # type: ignore
+                    validated_tool_calls.append(tool_call)
+                except StopIteration:
+                    break
+
+            request.messages[i]["tool_calls"] = validated_tool_calls
+
+
+def truncate_tool_call_ids(request: "ChatCompletionRequest"):
+    """Truncates tool call IDs for Mistral's ID requirements."""
+    for i, message in enumerate(request.messages):
+        if message.get("role") == 'assistant':
+            tool_calls = message.get("tool_calls", [])
+            for tool_call in tool_calls:
+                if len(tool_call["id"]) > 9:
+                    logger.warning(
+                        "Truncating tool call ID: %s to %s",
+                        tool_call["id"],
+                        tool_call["id"][-9:],
+                    )
+                    tool_call["id"] = tool_call["id"][-9:]
+
+            request.messages[i]["tool_calls"] = tool_calls
+
+        elif message.get("role") in {"tool_results", "tool"}:
+            if "tool_call_id" in message:
+                tool_call_id = message["tool_call_id"]
+
+                if len(tool_call_id) > 9:
+                    logger.warning(
+                        "Truncating tool_call_id: %s to %s",
+                        tool_call_id,
+                        tool_call_id[-9:],
+                    )
+                    tool_call_id = tool_call_id[-9:]
+                request.messages[i]["tool_call_id"] = tool_call_id
+
+
+def validate_request_params(request: "ChatCompletionRequest"):
+    if (request.skip_special_tokens is not None
+            and not request.skip_special_tokens):
+        raise ValueError("skip_special_tokens=False is not supported "
+                         "for Mistral tokenizers.")
+
+
+def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
+    repo_cache = os.path.join(
+        huggingface_hub.constants.HF_HUB_CACHE,
+        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+            ["models", *repo_id.split("/")]))
+
+    if revision is None:
+        revision_file = os.path.join(repo_cache, "refs", "main")
+        if os.path.isfile(revision_file):
+            with open(revision_file) as file:
+                revision = file.read()
+
+    if revision:
+        revision_dir = os.path.join(repo_cache, "snapshots", revision)
+        if os.path.isdir(revision_dir):
+            return os.listdir(revision_dir)
+
+    return []
+
+
+def find_tokenizer_file(files: list[str]):
+    file_pattern = re.compile(
+        r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$")
+
+    matched_files = [file for file in files if file_pattern.match(file)]
+    if len(matched_files) > 1:
+        raise OSError(
+            f"Found {len(matched_files)} files matching the "
+            f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral "
+            f"tokenizer is present in {files}.")
+    elif len(matched_files) == 0:
+        raise OSError(
+            f"Found {len(matched_files)} files matching the "
+            f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
+            f"tokenizer is present in {files}.")
+
+    return matched_files[0]
+
+
+def make_mistral_chat_completion_request(
+        messages: list["ChatCompletionMessageParam"],
+        tools: Optional[list[dict[str,
+                                  Any]]] = None) -> "ChatCompletionRequest":
+    last_message = cast(dict[str, Any], messages[-1])
+    if last_message["role"] == "assistant":
+        last_message["prefix"] = True
+
+    # mistral-common requires AssistantMessage content to be string [1].
+    #
+    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
+    for message in messages:
+        # Remove reasoning_content as unsupported by Mistral
+        _ = message.pop("reasoning_content", None)  # type: ignore
+
+        # Convert list text content to string
+        if message.get("role") in ("assistant", "tool"):
+            content = message.get("content")
+            if isinstance(content, list):
+                content = "\n".join(chunk.get("text") for chunk in content)
+                message["content"] = content
+
+    # The Mistral client, in comparison to the OpenAI client, requires the
+    # "parameters" dict to be present, even if it's empty.
+    if tools:
+        for function in [
+                tool["function"] for tool in tools
+                if tool["type"] == "function"
+        ]:
+            if function.get("parameters") is None:
+                function["parameters"] = {}
+
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    return ChatCompletionRequest(messages=messages,
+                                 tools=tools)  # type: ignore[type-var]
+
+
+class MistralTokenizer(TokenizerBase):
+
+    def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
+        self.mistral = tokenizer
+        self.instruct = tokenizer.instruct_tokenizer
+        _mistral_version_str = self.instruct.tokenizer.version.value
+        self.version: int = int(_mistral_version_str.split("v")[-1])
+
+        tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
+        from mistral_common.tokens.tokenizers.tekken import (
+            SpecialTokenPolicy, Tekkenizer)
+        self.is_tekken = isinstance(tokenizer_, Tekkenizer)
+        from mistral_common.tokens.tokenizers.sentencepiece import (
+            SentencePieceTokenizer)
+        self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
+        if self.is_tekken:
+            # Make sure special tokens will not raise
+            tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
+        elif self.is_spm:
+            pass
+        else:
+            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
+
+        self._vocab = tokenizer_.vocab()
+        # Convert to a dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to <20>
+        self._vocab_dict = {
+            token: idx
+            for idx, token in enumerate(self._vocab)
+        }
+        self.tokenizer = tokenizer_
+        self._max_token_id = self.vocab_size - 1
+
+    @classmethod
+    def from_pretrained(cls,
+                        path_or_repo_id: str,
+                        *,
+                        revision: Optional[str] = None) -> "MistralTokenizer":
+        if not Path(path_or_repo_id).exists():
+            assert len(path_or_repo_id.split("/")) == 2, (
+                "You have either provided a non-existent path: "
+                "{path_or_repo_id} or an invalid HF Hub repo id.")
+            tokenizer_file = cls._download_mistral_tokenizer_from_hf(
+                path_or_repo_id, revision)
+        elif Path(path_or_repo_id).is_dir():
+            tokenizer_file_name = find_tokenizer_file(
+                os.listdir(path_or_repo_id))
+            tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
+        else:
+            assert Path(
+                path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
+            tokenizer_file = str(Path(path_or_repo_id))
+
+        from mistral_common.tokens.tokenizers.mistral import (
+            MistralTokenizer as PublicMistralTokenizer)
+        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
+        return cls(mistral_tokenizer)
+
+    @staticmethod
+    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
+                                            revision: Optional[str]) -> str:
+        try:
+            hf_api = HfApi()
+            files = hf_api.list_repo_files(repo_id=tokenizer_name,
+                                           revision=revision)
+        except ConnectionError as exc:
+            files = list_local_repo_files(repo_id=tokenizer_name,
+                                          revision=revision)
+
+            if len(files) == 0:
+                raise exc
+
+        filename = find_tokenizer_file(files)
+
+        tokenizer_file = hf_hub_download(tokenizer_name,
+                                         filename=filename,
+                                         revision=revision)
+        return tokenizer_file
+
+    # the following attributes are set to fit vLLM's design and are used
+    # by the guided structured output backends.
+    @property
+    def all_special_tokens_extended(self) -> list[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
+
+        # tekken defines its own extended special tokens list
+        if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
+            special_tokens = self.tokenizer.SPECIAL_TOKENS
+        else:
+            special_tokens = list(SpecialTokens)
+        return [
+            s.value if isinstance(s, SpecialTokens) else s
+            for s in special_tokens
+        ]
+
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return self.all_special_tokens_extended
+
+    @property
+    def all_special_ids(self) -> list[int]:
+        return [
+            self.all_special_tokens.index(t) for t in self.all_special_tokens
+        ]
+
+    @property
+    def bos_token_id(self) -> int:
+        return self.tokenizer.bos_id
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_id
+
+    @property
+    def sep_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def pad_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def is_fast(self) -> bool:
+        return True
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab)
+
+    @property
+    def max_token_id(self) -> int:
+        return self._max_token_id
+
+    def __len__(self) -> int:
+        return self.vocab_size
+
+    def __call__(
+        self,
+        text: Union[str, list[str], list[int]],
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ):
+        input_ids: Union[list[int], list[list[int]]]
+        # For list[str], original prompt text
+        if is_list_of(text, str):
+            input_ids_: list[list[int]] = []
+            for p in text:
+                each_input_ids = self.encode_one(p, truncation, max_length)
+                input_ids_.append(each_input_ids)
+            input_ids = input_ids_
+        # For list[int], apply chat template output, already tokens.
+        elif is_list_of(text, int):
+            input_ids = text
+        # For str, single prompt text
+        else:
+            input_ids = self.encode_one(text, truncation, max_length)
+        return Encoding(input_ids=input_ids)
+
+    def get_vocab(self) -> dict[str, int]:
+        # NB: the dictionary form of the vocabulary collapses token ids that map
+        # to the same string but have different bytes
+        return self._vocab_dict
+
+    def get_added_vocab(self) -> dict[str, int]:
+        # Mistral tokenizers have no added vocabulary
+        return {}
+
+    def encode_one(
+        self,
+        text: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> list[int]:
+        # Mistral Tokenizers should not add special tokens
+        input_ids = self.encode(text)
+
+        if truncation:
+            input_ids = input_ids[:max_length]
+        return input_ids
+
+    def encode(self,
+               text: str,
+               truncation: Optional[bool] = None,
+               max_length: Optional[int] = None,
+               add_special_tokens: Optional[bool] = None) -> list[int]:
+        # `encode` should only be used for prompt completion
+        # it should never be used for chat_completion.
+        # For chat completion use `apply_chat_template`
+        if add_special_tokens is not None:
+            return self.tokenizer.encode(text,
+                                         bos=add_special_tokens,
+                                         eos=add_special_tokens)
+        else:
+            return self.tokenizer.encode(text, bos=True, eos=False)
+
+    def apply_chat_template(self,
+                            messages: list["ChatCompletionMessageParam"],
+                            tools: Optional[list[dict[str, Any]]] = None,
+                            **kwargs) -> list[int]:
+
+        request = make_mistral_chat_completion_request(messages, tools)
+        encoded = self.mistral.encode_chat_completion(request)
+
+        # encode-decode to get clean prompt
+        return encoded.tokens
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
+        if self.is_tekken:
+            tokens = [
+                t for t in tokens
+                if (t is SpecialTokens.tool_calls
+                    or t not in self.tokenizer._all_special_tokens)
+            ]
+
+            if any(isinstance(t, bytes) for t in tokens):
+                # we need to encode and decode all tokens again
+                shift = self.tokenizer.num_special_tokens
+
+                def _token_to_id(t: str):
+                    t_bytes = t.encode("utf-8") \
+                        if not isinstance(t, bytes) else t
+                    try:
+                        return shift + \
+                            self.tokenizer._tekken_token2id_nospecial[t_bytes]
+                    except KeyError:
+                        logger.warning(
+                            "Failed to convert token %s to id,"
+                            " replacing with <unk>", t_bytes)
+                        return self.tokenizer.unk_id
+
+                ids = [_token_to_id(t) for t in tokens]
+                decoded = self.tokenizer.decode(ids)
+            else:
+                decoded = "".join(tokens)
+        else:
+            # make sure certain special tokens like Tool calls are
+            # not decoded
+            special_tokens = {SpecialTokens.tool_calls}
+            regular_tokens: list[str] = []
+            decoded_list = []
+
+            for token in tokens:
+                if token in special_tokens:
+                    if regular_tokens:
+                        decoded_list.append(
+                            self.tokenizer.decode(regular_tokens))
+                        regular_tokens = []
+                    decoded_list.append(token)
+                else:
+                    regular_tokens.append(token)
+
+            if regular_tokens:
+                decoded_list.append(
+                    self.tokenizer.decode(regular_tokens))  # type: ignore
+
+            decoded = ''.join(decoded_list)
+
+        return decoded
+
+    # WARN: Outlines logits processors can overwrite this method.
+    # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer
+    # for more.
+    def decode(self,
+               ids: Union[list[int], int],
+               skip_special_tokens: bool = True) -> str:
+        assert (
+            skip_special_tokens
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+
+        if isinstance(ids, int):
+            ids = [ids]
+        return self.tokenizer.decode(ids)
+
+    def convert_ids_to_tokens(
+        self,
+        ids: list[int],
+        skip_special_tokens: bool = True,
+    ) -> list[str]:
+        from mistral_common.tokens.tokenizers.base import SpecialTokens
+
+        # TODO(Patrick) - potentially allow special tokens to not be skipped
+        assert (
+            skip_special_tokens
+        ), "skip_special_tokens=False is not supported for Mistral tokenizers."
+
+        assert self.is_tekken or self.is_spm, type(self.tokenizer)
+
+        if self.is_tekken:
+            # skip special tokens except tool call
+            ids = [
+                i for i in ids if i > self.tokenizer.num_special_tokens or i ==
+                self.tokenizer.get_control_token(SpecialTokens.tool_calls)
+            ]
+
+        tokens = [self.tokenizer.id_to_piece(id) for id in ids]
+
+        if any("<EFBFBD>" in t for t in tokens) and self.is_tekken:
+            # if a decoded token contains the replacement character, then the
+            # token has an incomplete UTF-8 character so we must use bytes
+            # See: https://github.com/vllm-project/vllm/pull/8640
+            #      https://github.com/vllm-project/vllm/pull/9625
+            # if underlying tokenizeir is sentencepiece, we just add "<22>"
+            tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
+
+        return tokens
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from functools import cache
+from os import PathLike
+from pathlib import Path
+from typing import Optional, Union
+
+from vllm.envs import VLLM_MODEL_REDIRECT_PATH
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def is_s3(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith('s3://')
+
+
+def check_gguf_file(model: Union[str, PathLike]) -> bool:
+    """Check if the file is a GGUF model."""
+    model = Path(model)
+    if not model.is_file():
+        return False
+    elif model.suffix == ".gguf":
+        return True
+
+    try:
+        with model.open("rb") as f:
+            header = f.read(4)
+
+        return header == b"GGUF"
+    except Exception as e:
+        logger.debug("Error reading file %s: %s", model, e)
+        return False
+
+
+def modelscope_list_repo_files(
+    repo_id: str,
+    revision: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> list[str]:
+    """List files in a modelscope repo."""
+    from modelscope.hub.api import HubApi
+    api = HubApi()
+    api.login(token)
+    # same as huggingface_hub.list_repo_files
+    files = [
+        file['Path'] for file in api.get_model_files(
+            model_id=repo_id, revision=revision, recursive=True)
+        if file['Type'] == 'blob'
+    ]
+    return files
+
+
+def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
+    with open(path) as f:
+        try:
+            return json.loads(f.read())
+        except Exception:
+            return dict[str, str]()
+
+
+def _maybe_space_split_dict(path: Union[str, PathLike]) -> dict[str, str]:
+    parsed_dict = dict[str, str]()
+    with open(path) as f:
+        for line in f.readlines():
+            try:
+                model_name, redirect_name = line.strip().split()
+                parsed_dict[model_name] = redirect_name
+            except Exception:
+                pass
+    return parsed_dict
+
+
+@cache
+def maybe_model_redirect(model: str) -> str:
+    """
+    Use model_redirect to redirect the model name to a local folder.
+
+    :param model: hf model name
+    :return: maybe redirect to a local folder
+    """
+
+    model_redirect_path = VLLM_MODEL_REDIRECT_PATH
+
+    if not model_redirect_path:
+        return model
+
+    if not Path(model_redirect_path).exists():
+        return model
+
+    redirect_dict = (_maybe_json_dict(model_redirect_path)
+                     or _maybe_space_split_dict(model_redirect_path))
+    if (redirect_model := redirect_dict.get(model)):
+        logger.info("model redirect: [ %s ] -> [ %s ]", model, redirect_model)
+        return redirect_model
+
+    return model