Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import envs


def _activate_modelscope_hub() -> None:
    """Route model downloads through ModelScope by patching the HF hub.

    Must run before anything else imports huggingface_hub so that the
    patch is already in place for every subsequent import.

    Raises:
        ImportError: if modelscope is missing or older than 1.18.1 (the
            first release that ships ``patch_hub``).
    """
    try:
        import modelscope
        from packaging import version

        # patch_hub begins from modelscope>=1.18.1
        if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
            raise ImportError(
                "Using vLLM with ModelScope needs modelscope>=1.18.1, please "
                "install by `pip install modelscope -U`"
            )

        from modelscope.utils.hf_util import patch_hub

        # Patch hub to download models from modelscope to speed up.
        patch_hub()
    except ImportError as err:
        raise ImportError(
            "Please install modelscope>=1.18.1 via "
            "`pip install modelscope>=1.18.1` to use ModelScope."
        ) from err


if envs.VLLM_USE_MODELSCOPE:
    _activate_modelscope_hub()

View File

@@ -0,0 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Bundled chat-template fallbacks.

Public API: :func:`get_chat_template_fallback_path` from :mod:`.registry`.
"""

from .registry import get_chat_template_fallback_path

__all__ = ["get_chat_template_fallback_path"]

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from pathlib import Path
from typing import TypeAlias
from vllm.logger import init_logger
logger = init_logger(__file__)

# Directory containing the bundled *.jinja fallback templates (this package).
CHAT_TEMPLATES_DIR = Path(__file__).parent

# A registered fallback is either a concrete template path, or a callable that
# maps a tokenizer name/path to a template path (returning None when no
# fallback applies).
ChatTemplatePath: TypeAlias = Path | Callable[[str], Path | None]
def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
    """Pick the bundled fallback template for a Qwen-family tokenizer."""
    # "-Chat" checkpoints expect ChatML markup; every other variant falls
    # back to the plain concatenation template.
    template_name = (
        "template_chatml.jinja"
        if tokenizer_name_or_path.endswith("-Chat")
        else "template_basic.jinja"
    )
    return CHAT_TEMPLATES_DIR / template_name
def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
    """Pick the bundled fallback template for a MiniCPM-V tokenizer."""
    # MiniCPM-V-4.5 ("4.5" / "4_5" in the name) uses a dedicated template;
    # other versions use chatml template.
    is_v45 = any(tag in tokenizer_name_or_path for tag in ("4.5", "4_5"))
    template_name = "template_minicpmv45.jinja" if is_v45 else "template_chatml.jinja"
    return CHAT_TEMPLATES_DIR / template_name
# Static entries map a model type straight to a bundled template file;
# callable entries choose a template from the tokenizer name/path at lookup
# time (see get_chat_template_fallback_path).
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
    "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "deepseek_ocr": CHAT_TEMPLATES_DIR / "template_deepseek_ocr.jinja",
    "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
    "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
    "minicpmv": _get_minicpmv_chat_template_fallback,
    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "qwen": _get_qwen_chat_template_fallback,
    "siglip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "siglip2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
}
def register_chat_template_fallback_path(
    model_type: str,
    chat_template: ChatTemplatePath,
) -> None:
    """Register a fallback chat template for *model_type*.

    Overwrites any previously registered fallback for the same model type,
    emitting a warning so the override is visible in the logs.
    """
    if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
        logger.warning(
            "Model type %s already has a chat template registered. "
            "It will be overwritten by the new chat template %s.",
            model_type,
            chat_template,
        )
    _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template
def get_chat_template_fallback_path(
    model_type: str,
    tokenizer_name_or_path: str,
) -> Path | None:
    """Resolve the bundled fallback chat template for a model type.

    Returns None when no fallback is registered, or when a registered
    resolver callable declines this tokenizer.
    """
    fallback = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)
    # An entry may be a resolver callable rather than a concrete path.
    if callable(fallback):
        fallback = fallback(tokenizer_name_or_path)
    return fallback

View File

@@ -0,0 +1,3 @@
{#- Minimal pass-through template: emit each message's content verbatim,
    with no role markers or separators. -#}
{%- for message in messages -%}
{{- message['content'] -}}
{%- endfor -%}

View File

@@ -0,0 +1,11 @@
{#- Question/Answer dialogue format: user turns render as "Question: ...",
    assistant turns as "Answer: ..."; any other role is dropped. -#}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- 'Question: ' + message['content'] + ' ' -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Answer: ' + message['content'] + ' ' -}}
{%- endif -%}
{%- endfor -%}
{#- Open an answer turn for the model to complete. -#}
{%- if add_generation_prompt -%}
{{- 'Answer:' -}}
{% endif %}

View File

@@ -0,0 +1,10 @@
{#- ChatML format: each turn is <|im_start|>role\ncontent<|im_end|>. The
    closing <|im_end|> of the final turn is only emitted when a generation
    prompt will follow. -#}
{%- for message in messages -%}
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '<|im_end|>' + '\n' -}}
{%- endif -%}
{%- endfor -%}
{#- Open a fresh assistant turn unless the last message already is one. -#}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '<|im_start|>assistant\n' -}}
{%- endif -%}

View File

@@ -0,0 +1,14 @@
{#- Emits BOS plus an optional leading system message, then the remaining
    turn contents concatenated with no role markers. Roles must strictly
    alternate user/assistant after the system turn. -#}
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{% set system_message = '' -%}
{%- endif -%}
{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}
{{ message['content'] }}
{%- endfor -%}

View File

@@ -0,0 +1,23 @@
{#- DeepSeek-VL2 style: BOS + optional system message, then strictly
    alternating turns tagged <|User|>: / <|Assistant|>:, with EOS closing
    each assistant turn. -#}
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{% set system_message = '' -%}
{%- endif -%}
{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}
{%- if message['role'] == 'user' -%}
{{ '<|User|>: ' + message['content'] + '\n\n' }}
{%- elif message['role'] == 'assistant' -%}
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
{%- endif -%}
{%- endfor -%}
{#- Open an assistant turn for the model to complete. -#}
{%- if add_generation_prompt -%}
{{ '<|Assistant|>: ' }}
{%- endif -%}

View File

@@ -0,0 +1,3 @@
{#- Pass-through template: each message's content on its own line, with no
    role markers. -#}
{%- for message in messages -%}
{{- message['content'] + '\n' -}}
{%- endfor -%}

View File

@@ -0,0 +1,93 @@
{#- Qwen3-style ChatML template with tool calling and optional <think>
    reasoning blocks. Inputs: messages, tools, add_generation_prompt,
    enable_thinking. -#}
{%- set enable_thinking = enable_thinking | default(false) %}
{#- System turn: when tools are given, embed their JSON signatures and the
    <tool_call> calling convention; otherwise emit a plain system turn. -#}
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{#- Scan backwards for the last genuine user query (one that is not a
    wrapped <tool_response>); reasoning is only re-emitted for assistant
    turns after that index. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{#- Main rendering loop over the conversation. -#}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{#- Split explicit reasoning out of the content; fall back to parsing an
    inline <think>...</think> span when no reasoning field is present. -#}
{%- set content = message.content %}
{%- set reasoning = '' %}
{%- if message.reasoning is defined and message.reasoning is not none %}
{%- set reasoning = message.reasoning %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{#- Serialize any tool calls issued by the assistant. -#}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{#- Consecutive tool results are merged into a single user turn of
    <tool_response> blocks. -#}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{#- Open the assistant turn; pre-seed an empty <think> block when thinking
    is disabled, or an opening <think> tag when it is enabled. -#}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- endif %}
{%- if enable_thinking is defined and enable_thinking is true %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from pathlib import Path
from transformers import PretrainedConfig
class ConfigParserBase(ABC):
    """Abstract interface for loading a model's configuration.

    Implementations turn a model name or local path into both the raw config
    dict and the instantiated ``PretrainedConfig``.
    """

    @abstractmethod
    def parse(
        self,
        model: str | Path,
        trust_remote_code: bool,
        revision: str | None = None,
        code_revision: str | None = None,
        **kwargs,
    ) -> tuple[dict, PretrainedConfig]:
        """Load the configuration for *model*.

        Args:
            model: Model identifier or local path.
            trust_remote_code: Whether custom code shipped with the model may
                be executed while loading (forwarded to the underlying loader).
            revision: Optional model revision to load from.
            code_revision: Optional revision for the model's code.
            **kwargs: Implementation-specific options.

        Returns:
            A tuple of (raw config dict, parsed ``PretrainedConfig``).
        """
        raise NotImplementedError

View File

@@ -1,16 +1,102 @@
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.mpt import MPTConfig
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Model configs may be defined in this directory for the following reasons:
- There is no configuration file defined by HF Hub or Transformers library.
- There is a need to override the existing config to support vLLM.
- The HF model_type isn't recognized by the Transformers library but can
be mapped to an existing Transformers config, such as
deepseek-ai/DeepSeek-V3.2-Exp.
"""
from __future__ import annotations
import importlib
# Lazy-import table: maps an exported config class name to the module that
# defines it. Classes are resolved on first attribute access (see the
# module-level __getattr__), keeping this package cheap to import.
# NOTE(review): __all__ also advertises "DbrxConfig" and "MPTConfig", which
# have no entry here and therefore cannot be resolved lazily — confirm
# whether they should map to `transformers` like DeepseekV3Config.
_CLASS_TO_MODULE: dict[str, str] = {
    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
    "BagelConfig": "vllm.transformers_utils.configs.bagel",
    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
    "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
    "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
    "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
    "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
    # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
    # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
    # `FalconConfig` class from the official HuggingFace transformers library.
    "RWConfig": "vllm.transformers_utils.configs.falcon",
    "JAISConfig": "vllm.transformers_utils.configs.jais",
    "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
    "MedusaConfig": "vllm.transformers_utils.configs.medusa",
    "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
    "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
    "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
    "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
    "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
    "NemotronConfig": "vllm.transformers_utils.configs.nemotron",
    "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
    "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
    "OvisConfig": "vllm.transformers_utils.configs.ovis",
    "RadioConfig": "vllm.transformers_utils.configs.radio",
    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
    "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
    "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
    "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
    "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
    "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
    # Special case: DeepseekV3Config is from HuggingFace Transformers
    "DeepseekV3Config": "transformers",
}
# Public API of this package; names resolve through the lazy __getattr__.
# NOTE(review): "DbrxConfig" and "MPTConfig" appear here but have no
# _CLASS_TO_MODULE entry, so accessing them raises AttributeError — verify
# the intended source module for these two.
__all__ = [
    "AfmoeConfig",
    "BagelConfig",
    "ChatGLMConfig",
    "DbrxConfig",
    "MPTConfig",
    "DeepseekVLV2Config",
    "DeepseekV3Config",
    "DotsOCRConfig",
    "EAGLEConfig",
    "FlexOlmoConfig",
    "HunYuanVLConfig",
    "HunYuanVLTextConfig",
    "HunYuanVLVisionConfig",
    "RWConfig",
    "JAISConfig",
    "Lfm2MoeConfig",
    "MedusaConfig",
    "MiDashengLMConfig",
    "MLPSpeculatorConfig",
    "MoonViTConfig",
    "KimiLinearConfig",
    "KimiVLConfig",
    "NemotronConfig",
    "NemotronHConfig",
    "Olmo3Config",
    "OvisConfig",
    "RadioConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",
    "Step3VLConfig",
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
    "Qwen3NextConfig",
    "Tarsier2Config",
]
def __getattr__(name: str):
    """PEP 562 hook: import a config class's module on first access."""
    if name not in _CLASS_TO_MODULE:
        raise AttributeError(f"module 'configs' has no attribute '{name}'")
    # Defer the (potentially heavy) module import until the class is needed.
    return getattr(importlib.import_module(_CLASS_TO_MODULE[name]), name)
def __dir__():
    """Advertise the lazily-provided names for dir() and tab completion."""
    # sorted() already returns a fresh list, so the intermediate list() copy
    # was redundant (flake8-comprehensions C414).
    return sorted(__all__)

View File

@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class AfmoeConfig(PretrainedConfig):
    """Configuration for AFMoE mixture-of-experts models.

    Any unrecognized keyword arguments are forwarded to
    :class:`~transformers.PretrainedConfig`.
    """

    model_type = "afmoe"

    def __init__(
        self,
        vocab_size: int = 200_192,
        hidden_size: int = 2048,
        intermediate_size: int = 6144,
        moe_intermediate_size: int = 1408,
        num_hidden_layers: int = 32,
        num_dense_layers: int = 1,
        num_attention_heads: int = 16,
        num_key_value_heads: int | None = None,
        head_dim: int = 128,
        hidden_act: str = "silu",
        max_position_embeddings: int = 131072,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-5,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        rope_parameters: dict | None = None,
        rope_scaling: dict | None = None,
        num_experts: int = 64,
        num_experts_per_tok: int = 6,
        num_shared_experts: int = 2,
        num_expert_groups: int = 1,
        num_limited_groups: int = 1,
        score_func: str = "sigmoid",
        route_norm: bool = True,
        route_scale: float = 1.0,
        global_attn_every_n_layers: int = 4,
        sliding_window: int = 2048,
        layer_types: list[str] | None = None,
        attention_dropout: float = 0.0,
        mup_enabled: bool = False,
        n_group: int = 1,
        topk_group: int = 1,
        **kwargs,
    ):
        # Core transformer geometry.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_dense_layers = num_dense_layers
        self.num_attention_heads = num_attention_heads
        # Grouped-query attention: default the KV-head count to full MHA.
        self.num_key_value_heads = num_key_value_heads or num_attention_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Rotary-embedding setup. Always consume a legacy `rope_theta` kwarg
        # (even when rope_parameters is given) so it never leaks through to
        # PretrainedConfig.__init__.
        legacy_rope_theta = kwargs.pop("rope_theta", 10000.0)
        default_rope = {"rope_type": "default", "rope_theta": legacy_rope_theta}
        self.rope_parameters = default_rope if rope_parameters is None else rope_parameters
        self.rope_scaling = rope_scaling

        # Mixture-of-experts routing parameters.
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.num_shared_experts = num_shared_experts
        self.num_expert_groups = num_expert_groups
        self.num_limited_groups = num_limited_groups
        self.score_func = score_func
        self.route_norm = route_norm
        self.route_scale = route_scale
        self.n_group = n_group
        self.topk_group = topk_group

        # Attention/window layout and misc settings.
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.sliding_window = sliding_window
        self.layer_types = layer_types
        self.attention_dropout = attention_dropout
        self.mup_enabled = mup_enabled

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
__all__ = ["AfmoeConfig"]

View File

@@ -0,0 +1,216 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
"""Arctic model configuration"""
from dataclasses import asdict, dataclass
from typing import Any
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
# Reference checkpoint hosting the canonical config.json for this model type.
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
}
@dataclass
class ArcticLoRAConfig:
    """LoRA adapter settings embedded in an Arctic checkpoint config."""

    # Adapter rank and scaling factor (standard LoRA hyperparameters —
    # confirm exact semantics against the Snowflake reference implementation).
    lora_r: int = 64
    lora_alpha: float = 16
    # Whether base weights are sharded alongside the adapter weights.
    shard_base_weights: bool = False
@dataclass
class ArcticQuantizationConfig:
    """Weight-quantization settings for Arctic checkpoints."""

    # Bits per quantized value.
    q_bits: int = 8
    # Rounding mode used when quantizing.
    rounding: str = "nearest"
    # Mantissa bits retained.
    mantissa_bits: int = 3
    # Quantization group size.
    group_size: int = 128
class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_theta` (`float`): The base period of the RoPE embeddings.
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to root per-token, can be also interpreted as the `top-p` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from the Arctic 7B style configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_parameters: dict[str, Any] | None = None,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Legacy checkpoints pass a bare `rope_theta`; consume it from kwargs
        # unconditionally so it is not forwarded to PretrainedConfig.__init__.
        rope_theta = kwargs.pop("rope_theta", 1e6)
        if rope_parameters is None:
            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
        self.rope_parameters = rope_parameters
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        # Quantization may arrive as a plain dict (e.g. from JSON); normalize
        # it into the dataclass form.
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
        # Depending on `return_unused_kwargs`, the parent implementation may
        # return either the config or a (config, unused_kwargs) tuple.
        result = super().from_dict(config_dict, **kwargs)
        config = result[0] if isinstance(result, tuple) else result
        # Re-hydrate the nested quantization dict into its dataclass form.
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> dict[str, Any]:
        # Serialize the quantization dataclass back to a plain dict so the
        # result stays JSON-compatible.
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.models.qwen2 import Qwen2Config
class BagelConfig(PretrainedConfig):
    """Configuration class for BAGEL model."""

    model_type = "bagel"

    def __init__(
        self,
        visual_gen: bool = True,
        visual_und: bool = True,
        llm_config: dict | Qwen2Config | None = None,
        vit_config: dict | SiglipVisionConfig | None = None,
        vae_config: dict | None = None,
        latent_patch_size: int = 2,
        max_latent_size: int = 32,
        vit_max_num_patch_per_side: int = 70,
        connector_act: str = "gelu_pytorch_tanh",
        interpolate_pos: bool = False,
        timestep_shift: float = 1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Flags for the visual generation / understanding branches.
        self.visual_gen = visual_gen
        self.visual_und = visual_und

        # Sub-configs may arrive as plain dicts (e.g. loaded from JSON);
        # normalize them to config objects, falling back to defaults.
        self.llm_config = (
            Qwen2Config(**llm_config)
            if isinstance(llm_config, dict)
            else llm_config or Qwen2Config()
        )
        self.vit_config = (
            SiglipVisionConfig(**vit_config)
            if isinstance(vit_config, dict)
            else vit_config or SiglipVisionConfig()
        )
        # The VAE config remains a plain dict.
        self.vae_config = vae_config or {"z_channels": 16, "downsample": 8}

        self.latent_patch_size = latent_patch_size
        self.max_latent_size = max_latent_size
        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
        self.connector_act = connector_act
        self.interpolate_pos = interpolate_pos
        self.timestep_shift = timestep_shift

    @property
    def hidden_size(self) -> int:
        """Hidden size of the underlying language model."""
        return self.llm_config.hidden_size

View File

@@ -1,6 +1,8 @@
# coding=utf-8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
# https://github.com/zai-org/ChatGLM2-6B
from transformers import PretrainedConfig
@@ -11,33 +13,35 @@ class ChatGLMConfig(PretrainedConfig):
"n_head_kv": "multi_query_group_num",
}
def __init__(self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs):
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs,
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
@@ -46,12 +50,15 @@ class ChatGLMConfig(PretrainedConfig):
self.kv_channels = kv_channels
self.num_attention_heads = num_attention_heads
self.seq_length = seq_length
# It is to be compatible with long lora.
self.max_position_embeddings = seq_length
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = (
apply_residual_connection_post_layernorm)
apply_residual_connection_post_layernorm
)
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias

View File

@@ -1,278 +0,0 @@
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
"""Dbrx configuration."""
from typing import Any, Optional
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to None):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (float): The base frequency for rope.
    """

    def __init__(
        self,
        attn_pdrop: float = 0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        # `model_type` leaks in when this sub-config is extracted from a full
        # dbrx config dict; discard it before the unknown-key check.
        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        # Reject any remaining, unrecognized keyword arguments loudly.
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        """Load this sub-config, unwrapping it from a full dbrx config dict."""
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # When pointed at a full dbrx checkpoint, pull out the nested
        # attention sub-config.
        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["attn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all configurations of "
                "models and can yield errors.",
                config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)
class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
    """

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        # NOTE(review): unlike DbrxAttentionConfig, kwargs are NOT forwarded
        # to super().__init__() here, so every unknown kwarg (other than
        # model_type) triggers the ValueError below — confirm intentional.
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        # `model_type` leaks in when this sub-config is extracted from a full
        # dbrx config dict; discard it before the unknown-key check.
        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        """Load this sub-config, unwrapping it from a full dbrx config dict."""
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        # When pointed at a full dbrx checkpoint, pull out the nested FFN
        # sub-config.
        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["ffn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all "
                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)
class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    Used to instantiate a [`DbrxModel`] according to the specified arguments,
    defining the model architecture. Inherits from [`PretrainedConfig`]; read
    its documentation for the generic options.

    Args:
        d_model (`int`, *optional*, defaults to 6144):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 48):
            Number of attention heads per attention layer in the encoder.
        n_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 32768):
            Maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 100352):
            Vocabulary size of the Dbrx model; defines the maximum number of
            different tokens representable by `inputs_ids` passed to
            [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout applied to the attention output before the residual add.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            Configuration of the model's attention module.
        ffn_config (`dict`, *optional*):
            Configuration of the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether the model should return the last key/values attentions.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Stddev of the truncated-normal initializer for weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether router logits are returned by the model; enabling this
            also allows the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel
    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()
    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dbrx"
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        # Coerce each sub-config: accept None (use defaults), a plain dict
        # (expand into the config class), or an already-built config object.
        if attn_config is None:
            attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            attn_config = DbrxAttentionConfig(**attn_config)
        self.attn_config = attn_config

        if ffn_config is None:
            ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            ffn_config = DbrxFFNConfig(**ffn_config)
        self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        # Tied embeddings are explicitly unsupported; reject early.
        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Dbrx models."
            )

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
from transformers import DeepseekV2Config, PretrainedConfig
class VisionEncoderConfig(PretrainedConfig):
    """Configuration of the DeepSeek-VL2 SigLIP vision encoder."""

    model_type: str = "vision"
    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384
    patch_size: int = 16
    width: int = 1024
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(
        self,
        model_name: str = "vit_so400m_patch14_siglip_384.webli",
        image_size: int = 384,
        patch_size: int = 16,
        width: int = 1024,
        layers: int = 24,
        heads: int = 16,
        mlp_ratio: int = 4,
        global_pool: str = "map",
        ignore_head: bool = True,
        class_token: bool = False,
        num_classes: int = 0,
        use_checkpoint: bool = False,
        **kwargs,
    ):
        # Instance attributes shadow the class-level defaults above;
        # weight_init/deterministic/num_recomputing_layers stay class-level.
        overrides = {
            "model_name": model_name,
            "image_size": image_size,
            "patch_size": patch_size,
            "width": width,
            "layers": layers,
            "heads": heads,
            "mlp_ratio": mlp_ratio,
            "global_pool": global_pool,
            "ignore_head": ignore_head,
            "class_token": class_token,
            "num_classes": num_classes,
            "use_checkpoint": use_checkpoint,
        }
        for attr_name, attr_value in overrides.items():
            setattr(self, attr_name, attr_value)
        super().__init__(**kwargs)
class MlpProjectorConfig(PretrainedConfig):
    """Configuration of the DeepSeek-VL2 vision-to-text MLP projector."""

    model_type = "mlp_projector"
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    token_pooling: bool = False

    def __init__(
        self,
        projector_type: str = "downsample_mlp_gelu",
        input_dim: int = 1152,
        n_embed: int = 2048,
        depth: int = 2,
        mlp_ratio: int = 1,
        downsample_ratio: int = 2,
        **kwargs,
    ):
        # Instance attributes shadow the class-level defaults above;
        # token_pooling stays class-level.
        overrides = {
            "projector_type": projector_type,
            "input_dim": input_dim,
            "n_embed": n_embed,
            "depth": depth,
            "mlp_ratio": mlp_ratio,
            "downsample_ratio": downsample_ratio,
        }
        for attr_name, attr_value in overrides.items():
            setattr(self, attr_name, attr_value)
        super().__init__(**kwargs)
class DeepseekVLV2Config(PretrainedConfig):
    """Top-level DeepSeek-VL2 config: vision encoder + projector + DeepseekV2 text model."""

    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)

    def __init__(
        self,
        tile_tag: str = "tile_tag",
        global_view_pos: str = "head",
        candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Sub-configs are read out of kwargs (after super() has consumed
        # them), each falling back to its own defaults when absent.
        self.vision_config = VisionEncoderConfig(**kwargs.get("vision_config", {}))
        self.projector_config = MlpProjectorConfig(**kwargs.get("projector_config", {}))
        self.text_config = DeepseekV2Config(**kwargs.get("language_config", {}))

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

        # update model_type for OCR model
        declared_archs = self.architectures or kwargs.get("architectures", [])
        if "DeepseekOCRForCausalLM" in declared_archs:
            self.model_type = "deepseek_ocr"

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
class DotsVisionConfig(PretrainedConfig):
    """Configuration of the dots.ocr vision (ViT) tower."""

    model_type: str = "dots_vit"

    def __init__(
        self,
        embed_dim: int = 1536,  # vision encoder embed size
        hidden_size: int = 1536,  # after merger hidden size
        intermediate_size: int = 4224,
        num_hidden_layers: int = 42,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 1,
        rms_norm_eps: float = 1e-5,
        use_bias: bool = False,
        attn_implementation="flash_attention_2",
        initializer_range=0.02,
        init_merger_std=0.02,
        is_causal=False,  # ve causal forward
        post_norm=True,
        gradient_checkpointing=False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        overrides = {
            "embed_dim": embed_dim,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "num_channels": num_channels,
            "patch_size": patch_size,
            "spatial_merge_size": spatial_merge_size,
            "temporal_patch_size": temporal_patch_size,
            "rms_norm_eps": rms_norm_eps,
            "use_bias": use_bias,
            "attn_implementation": attn_implementation,
            "initializer_range": initializer_range,
            "init_merger_std": init_merger_std,
            "is_causal": is_causal,
            "post_norm": post_norm,
            "gradient_checkpointing": gradient_checkpointing,
        }
        for attr_name, attr_value in overrides.items():
            setattr(self, attr_name, attr_value)
class DotsOCRConfig(Qwen2Config):
    """Qwen2-based configuration for the dots.ocr multimodal model."""

    model_type = "dots_ocr"

    def __init__(
        self,
        image_token_id=151665,
        video_token_id=151656,
        vision_config: dict | None = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        # A missing vision section falls back to DotsVisionConfig defaults.
        if vision_config:
            self.vision_config = DotsVisionConfig(**vision_config)
        else:
            self.vision_config = DotsVisionConfig()

    def save_pretrained(self, save_directory, **kwargs):
        """Serialize the config, dropping the custom auto-class hook first."""
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
class EAGLEConfig(PretrainedConfig):
    """Config wrapper for EAGLE / EAGLE-3 speculative-decoding draft models.

    Wraps the target model's config in ``self.model`` and rewrites the
    ``architectures`` list so the draft maps to the ``Eagle*`` model classes.
    """

    model_type = "eagle"

    def __init__(
        self,
        model: PretrainedConfig | dict | None = None,
        truncated_vocab_size: int | None = None,
        method: str | None = "eagle",
        **kwargs,
    ):
        model_config: PretrainedConfig | DeepseekV2Config | None
        if isinstance(model, dict):
            # A plain dict checkpoint: rebuild the concrete config class.
            model_config = AutoConfig.for_model(**model)
        else:
            model_config = model

        # Forward overlapping kwargs onto the wrapped config, except the
        # identity fields that belong to this wrapper itself.
        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            # Default to the wrapped model's full vocab when no truncation
            # is requested.
            self.truncated_vocab_size = (
                self.model.vocab_size
                if truncated_vocab_size is None
                else truncated_vocab_size
            )

        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
        if method == "eagle":
            assert self.model is not None, (
                "model should not be None when method is eagle"
            )
            kwargs["architectures"] = [
                f"Eagle{arch}" if not arch.startswith("Eagle") else arch
                for arch in self.model.architectures
            ]
        elif method == "eagle3":
            assert self.model is not None, (
                "model should not be None when method is eagle3"
            )
            kwargs["architectures"] = [
                arch
                if arch.startswith("Eagle3") or arch.endswith("Eagle3")
                else f"Eagle3{arch}"
                for arch in self.model.architectures
            ]
        else:
            raise ValueError(
                f"Invalid method {method}. Supported methods are eagle and eagle3."
            )

        # NOTE: kwargs (including the rewritten "architectures") must be
        # passed to super() before the mirroring loop below, which skips any
        # key already present in kwargs.
        super().__init__(**kwargs)

        if self.model is not None:
            # Mirror the wrapped config's fields onto this wrapper so callers
            # can read e.g. hidden_size directly, without clobbering values
            # explicitly provided via kwargs.
            for k, v in self.model.to_dict().items():
                if k not in kwargs:
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike,
        **kwargs,
    ) -> "EAGLEConfig":
        """Load an EAGLE config from a local path or hub identifier."""
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )
        return cls.from_dict(config_dict, **kwargs)

    def to_json_string(self, use_diff: bool = True) -> str:
        # we override use_diff to False as initializing
        # EAGLEConfig with default arguments is not supported
        del use_diff
        return super().to_json_string(use_diff=False)

View File

@@ -1,3 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
@@ -16,6 +19,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig
@@ -74,9 +78,7 @@ class RWConfig(PretrainedConfig):
# Hack for falcon-40b
self.new_decoder_architecture = True
super().__init__(bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def head_dim(self):

View File

@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class FlexOlmoConfig(PretrainedConfig):
    """Configuration for Flex-Olmo, an OLMo-style MoE decoder-only model."""

    model_type = "flex_olmo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100352,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
        pad_token_id=100277,
        bos_token_id=None,
        eos_token_id=100257,
        tie_word_embeddings=False,
        rope_parameters: dict[str, Any] | None = None,
        attention_bias=False,
        attention_dropout=0.0,
        num_experts_per_tok=5,
        num_experts=7,
        output_router_logits=False,
        router_aux_loss_coef=0.01,
        norm_topk_prob=False,
        **kwargs,
    ):
        kwargs.setdefault("architectures", ["FlexOlmoForCausalLM"])
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Backward compatibility: MHA checkpoints omit the KV head count.
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Prefer a legacy `rope_scaling` kwarg, then `rope_parameters`,
        # then a plain default rope.
        legacy_rope = kwargs.pop("rope_scaling", None)
        rope_params = legacy_rope or rope_parameters or {"rope_type": "default"}
        rope_params.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0))
        self.rope_parameters = rope_params
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.norm_topk_prob = norm_topk_prob
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_parameters is not None and "type" in self.rope_parameters:
            self.rope_parameters["rope_type"] = self.rope_parameters["type"]

View File

@@ -0,0 +1,322 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/configuration_hunyuan_vl.py
from transformers import PretrainedConfig
class HunYuanVLVisionConfig(PretrainedConfig):
    """Vision-tower configuration for HunYuan-VL."""

    model_type = "hunyuan_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_act="gelu",
        hidden_size=1152,
        intermediate_size=4304,
        interpolate_mode="bilinear",
        rms_norm_eps=1e-05,
        learnable_mlp_pooling_size=0,
        num_attention_heads=16,
        num_key_value_heads=None,
        num_channels=3,
        num_hidden_layers=27,
        out_hidden_size=4096,
        patch_size=16,
        remove_prenorm=True,
        spatial_merge_size=2,
        temporal_patch_size=1,
        resize_resolution=2048,
        img_max_token_num=4096,
        max_image_size=2048,
        video_max_image_size=768,
        video_min_image_size=256,
        min_image_size=512,
        anyres_vit_max_image_size=2048,
        max_vit_seq_len=16384,
        text_hidden_size=3072,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # GQA fallback: an unset (or zero) KV-head count means full MHA.
        self.num_key_value_heads = num_key_value_heads or num_attention_heads
        overrides = {
            "hidden_act": hidden_act,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "interpolate_mode": interpolate_mode,
            "learnable_mlp_pooling_size": learnable_mlp_pooling_size,
            "num_attention_heads": num_attention_heads,
            "num_channels": num_channels,
            "num_hidden_layers": num_hidden_layers,
            "out_hidden_size": out_hidden_size,
            "patch_size": patch_size,
            "remove_prenorm": remove_prenorm,
            "spatial_merge_size": spatial_merge_size,
            "temporal_patch_size": temporal_patch_size,
            "rms_norm_eps": rms_norm_eps,
            "resize_resolution": resize_resolution,
            "img_max_token_num": img_max_token_num,
            "max_image_size": max_image_size,
            "min_image_size": min_image_size,
            "video_max_image_size": video_max_image_size,
            "video_min_image_size": video_min_image_size,
            "anyres_vit_max_image_size": anyres_vit_max_image_size,
            "max_vit_seq_len": max_vit_seq_len,
            "text_hidden_size": text_hidden_size,
        }
        for attr_name, attr_value in overrides.items():
            setattr(self, attr_name, attr_value)
class HunYuanVLTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HunYuanVLTextConfig`]. It is used to instantiate an
    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the HunYuan-7B.
    Hunyuan-7B-Instruct [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 290943):
            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`HunYuanVLTextConfig`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations or shared MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        eod_token_id (int, *optional*, defaults to 3):
            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
            Example: In multi-document processing, this token helps the model distinguish between separate documents.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
    """  # noqa: E501

    model_type = "hunyuan_vl_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=290943,
        hidden_size=4096,
        intermediate_size: int = 11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        eod_token_id=3,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # self._rope_scaling_validation()  # TODO: Need validation?
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Fix: previously `eod_token_id` was accepted (and documented) but
        # silently dropped; store it so callers can read it back.
        self.eod_token_id = eod_token_id
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is not a two-field dict with a
                'linear'/'dynamic' type and a valid `factor` or `alpha`.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and "
                f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
                f"got {rope_scaling_type}"
            )
        # Exactly one of `factor` / `alpha` is expected; both missing is an error.
        if rope_scaling_factor is None and rope_scaling_alpha is None:
            raise ValueError(
                "`rope_scaling`'s factor or alpha field must be have one, "
                "got both of none"
            )
        if rope_scaling_factor is not None and (
            not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1.0, "
                f"got {rope_scaling_factor}"
            )
        if rope_scaling_alpha is not None and (
            not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s alpha field must be a float > 1.0, "
                f"got {rope_scaling_alpha}"
            )
class HunYuanVLConfig(PretrainedConfig):
    """Composite HunYuan-VL config: a vision tower plus a text decoder.

    Reads and writes of text-model fields are transparently delegated to
    ``self.text_config`` via the ``__setattr__``/``__getattribute__``
    overrides below.
    """

    model_type = "hunyuan_vl"
    sub_configs = {
        "vision_config": HunYuanVLVisionConfig,
        "text_config": HunYuanVLTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        im_start_id=120118,
        im_end_id=120119,
        image_token_id=120120,
        im_newline_id=120121,
        video_start_id=120122,
        video_end_id=120123,
        **kwargs,
    ):
        # We need to init super() here so that it does not reset values
        # that are in text config to the BaseClass defaults. The Base
        # config has many text related defaults and not all defaults are
        # same as for `HunYuanVLTextConfig`.
        super().__init__(**kwargs)
        # NOTE(review): if vision_config/text_config is already a config
        # object (neither dict nor None) the attribute is never assigned
        # here — presumably callers only pass dicts or None; confirm.
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        # Special-token ids used to delimit image/video spans in the prompt.
        self.image_token_id = image_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.im_newline_id = im_newline_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id
        # The vision tower needs to know the text model's width.
        self.vision_config.text_hidden_size = self.text_config.hidden_size
        # Attention implementation to use. It sets it recursively on sub-configs
        # so we call it again in the end.
        self._attn_implementation = kwargs.pop("attn_implementation", None)

    def __setattr__(self, key, value):
        # Delegate writes of text-model fields to the nested text_config,
        # except the wrapper-owned keys listed below.
        if (
            (text_config := super().__getattribute__("__dict__").get("text_config"))
            is not None
            and key not in ["dtype", "_attn_implementation_internal"]
            and key in text_config.__dict__
        ):
            setattr(text_config, key, value)
        else:
            super().__setattr__(key, value)

    def __getattribute__(self, key):
        # Delegate reads the same way, once text_config exists; a few
        # identity/bookkeeping keys always resolve on the wrapper itself.
        if "text_config" in super().__getattribute__("__dict__") and key not in [
            "_name_or_path",
            "model_type",
            "dtype",
            "_attn_implementation_internal",
        ]:
            text_config = super().__getattribute__("text_config")
            if key in text_config.__dict__:
                return getattr(text_config, key)
        return super().__getattribute__(key)

View File

@@ -1,6 +1,7 @@
# coding=utf-8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright 2023 Cerebras Systems.
#
@@ -73,10 +74,9 @@ class JAISConfig(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
scale_attn_by_inverse_layer_idx (`bool`, *optional*,
defaults to `False`):
Whether to additionally scale attention weights by
`1 / layer_idx + 1`.
scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
Whether to additionally scale attention weights
by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention
(dot-product)
@@ -98,7 +98,7 @@ class JAISConfig(PretrainedConfig):
Scale attention weights by dividing by hidden_size instead of
sqrt(hidden_size). Need to set scale_attn_weights to `True` as
well.
alibi_scaling (`Dict`, *optional*):
alibi_scaling (`dict`, *optional*):
Dictionary containing the scaling configuration for ALiBi
embeddings. Currently only supports linear
scaling strategy. Can specify either the scaling `factor` (must be
@@ -108,7 +108,7 @@ class JAISConfig(PretrainedConfig):
formats are `{"type": strategy name, "factor": scaling factor}` or
`{"type": strategy name,
"train_seq_len": training sequence length}`.
architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
architecture names for Jais.
Example:
@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
if self.alibi_scaling is None:
return
if (not isinstance(self.alibi_scaling, dict)
or len(self.alibi_scaling) != 2):
if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
raise ValueError(
"`alibi_scaling` must be a dictionary with two fields,"
"`alibi_scaling` must be a dictionary with two fields, "
"`type` and `factor` or `type` and `train_seq_len`, "
f"got {self.alibi_scaling}")
f"got {self.alibi_scaling}"
)
alibi_scaling_type = self.alibi_scaling.get("type", None)
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
if alibi_scaling_type is None or alibi_scaling_type != "linear":
raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None
and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0,"
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None
and alibi_dynamic_scaling <= 1)):
f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}"
)
if (
alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an"
f"integer > 1, got {alibi_dynamic_scaling}")
f"`alibi_scaling`'s factor field must be a float > 1.0, "
f"got {alibi_scaling_factor}"
)
if (
alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an "
f"integer > 1, got {alibi_dynamic_scaling}"
)

View File

@@ -0,0 +1,148 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
class KimiLinearConfig(PretrainedConfig):
    """Configuration for Kimi-Linear models.

    Kimi-Linear interleaves full-attention layers (optionally MLA-style,
    see `is_mla`) with Kimi Delta Attention (KDA) linear-attention layers,
    and may replace dense MLPs with a grouped-top-k MoE.
    """

    model_type = "kimi_linear"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type="kimi_linear",
        vocab_size=163840,
        hidden_size=4096,
        head_dim=None,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        rope_parameters=None,
        tie_word_embeddings=False,
        moe_intermediate_size: int | None = None,
        moe_renormalize: bool = True,
        moe_router_activation_func: str = "sigmoid",
        num_experts: int | None = None,
        num_experts_per_token: int | None = None,
        num_shared_experts: int = 0,
        routed_scaling_factor: float = 1.0,
        first_k_dense_replace: int = 0,
        moe_layer_freq: int = 1,
        use_grouped_topk: bool = True,
        num_expert_group: int = 1,
        topk_group: int = 1,
        q_lora_rank: int | None = None,
        kv_lora_rank: int | None = None,
        qk_nope_head_dim: int | None = None,
        qk_rope_head_dim: int | None = None,
        v_head_dim: int | None = None,
        mla_use_nope: bool | None = False,
        num_nextn_predict_layers: int = 0,
        linear_attn_config: dict | None = None,
        **kwargs,
    ):
        self.model_type = model_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # Default head_dim evenly splits hidden_size across attention heads.
        self.head_dim = (
            head_dim if head_dim is not None else hidden_size // num_attention_heads
        )
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
        # (the legacy `rope_scaling` key takes precedence when both exist).
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        # A legacy top-level `rope_theta` is folded into `rope_parameters`
        # only when the dict does not already carry its own value.
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        self.rope_parameters = rope_parameters
        # MLA (multi-head latent attention) dims; all-None plus
        # mla_use_nope=False means plain attention (see `is_mla`).
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.mla_use_nope = mla_use_nope
        # moe config
        self.num_experts = num_experts
        self.num_experts_per_token = num_experts_per_token
        self.moe_renormalize = moe_renormalize
        self.num_shared_experts = num_shared_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.moe_router_activation_func = moe_router_activation_func
        assert self.moe_router_activation_func in ("softmax", "sigmoid")
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.moe_layer_freq = moe_layer_freq
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.num_nextn_predict_layers = num_nextn_predict_layers
        # When a layer map is given it must name both the KDA layers and
        # the full-attention layers.
        if linear_attn_config is not None:
            assert linear_attn_config["kda_layers"] is not None
            assert linear_attn_config["full_attn_layers"] is not None
        self.linear_attn_config = linear_attn_config
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def is_mla(self):
        """True when any MLA-specific field is set."""
        return (
            self.q_lora_rank is not None
            or self.kv_lora_rank is not None
            or self.qk_nope_head_dim is not None
            or self.qk_rope_head_dim is not None
            or self.v_head_dim is not None
            or self.mla_use_nope is True
        )

    @property
    def is_moe(self):
        """True when the model uses a mixture-of-experts MLP."""
        return self.num_experts is not None

    @property
    def is_linear_attn(self) -> bool:
        """True when `linear_attn_config` is present and does not declare an
        explicitly empty `kda_layers` list."""
        return not (
            self.linear_attn_config is None
            or (
                isinstance(self.linear_attn_config, dict)
                and self.linear_attn_config["kda_layers"] is not None
                and len(self.linear_attn_config["kda_layers"]) == 0
            )
        )

    def is_kda_layer(self, layer_idx: int):
        """True when 0-based `layer_idx` is listed as a KDA layer.

        NOTE(review): the +1 suggests `kda_layers` entries are 1-indexed —
        confirm against checkpoint configs.
        """
        return (
            self.linear_attn_config is not None
            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
        )

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers import DeepseekV2Config
from transformers.configuration_utils import PretrainedConfig
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
class KimiVLConfig(PretrainedConfig):
    """Configuration for Kimi-VL: a MoonViT vision encoder paired with a
    DeepseekV2 language backbone."""

    model_type = "kimi_vl"

    def __init__(
        self,
        vision_config: dict | MoonViTConfig | None = None,
        text_config: dict | DeepseekV2Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        **kwargs,
    ):
        # Accept ready-made config objects or plain dicts for both towers.
        if vision_config is None:
            vision_config = MoonViTConfig()
        if isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV2Config()
        if isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        self.text_config = text_config

        # Label id ignored by the training loss.
        self.ignore_index = ignore_index
        # Token id marking where media embeddings are spliced into the text.
        self.media_placeholder_token_id = media_placeholder_token_id
        super().__init__(pad_token_id=pad_token_id, **kwargs)

View File

@@ -0,0 +1,163 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class Lfm2MoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LFM2-8B-A1B model.

    e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Lfm2Model`]
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 7168):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1792):
            Intermediate size of the routed expert.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_parameters (`dict`, *optional*):
            The parameters of the RoPE embeddings.
        max_position_embeddings (`int`, *optional*, defaults to 128000):
            The maximum sequence length that this model might ever be used with.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the conv layers.
        conv_L_cache (`int`, *optional*, defaults to 3):
            L_cache dim in the conv layers.
        num_dense_layers (`int`, *optional*, defaults to 2):
            Number of dense Lfm2MoeMLP layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
        num_experts_per_tok (`int`, *optional*, defaults to 4):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 32):
            Number of routed experts.
        use_expert_bias (`bool`, *optional*, defaults to `True`):
            Whether to use the expert bias on the routing weights.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts in MoE models.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        layer_types (`Optional`, *optional*):
            Type of each layers.

    ```python
    >>> from transformers import Lfm2MoeModel, Lfm2MoeConfig

    >>> # Initializing a LFM2 Moe model
    >>> configuration = Lfm2MoeConfig()

    >>> # Initializing a model from the LFM2-8B-A1B style configuration
    >>> model = Lfm2MoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```""" # noqa: E501

    model_type = "lfm2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 65536,
        hidden_size: int = 2048,
        intermediate_size: int = 7168,
        moe_intermediate_size: int = 1792,
        num_hidden_layers: int = 32,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = True,
        rope_parameters: dict[str, Any] | None = None,
        max_position_embeddings: int = 128_000,
        use_cache: bool = True,
        norm_eps: float = 0.00001,
        num_attention_heads: int = 32,
        num_key_value_heads: int = 8,
        conv_bias: bool = False,
        conv_L_cache: int = 3,
        num_dense_layers: int = 2,
        num_experts_per_tok: int = 4,
        num_experts: int = 32,
        use_expert_bias: bool = True,
        routed_scaling_factor: float = 1.0,
        norm_topk_prob: bool = True,
        layer_types: list[str] | None = None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        # Legacy configs carry a top-level `rope_theta`; fold it into
        # `rope_parameters` when the latter is absent.
        # NOTE(review): when `rope_parameters` is supplied without a theta,
        # the popped `rope_theta` is silently discarded — confirm intended.
        rope_theta = kwargs.pop("rope_theta", 1000000.0)
        if rope_parameters is None:
            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
        self.rope_parameters = rope_parameters
        self.max_position_embeddings = max_position_embeddings
        self.use_cache = use_cache
        self.norm_eps = norm_eps
        # attn operator config
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # custom operator config
        self.conv_bias = conv_bias
        self.conv_L_cache = conv_L_cache
        # moe config
        self.num_dense_layers = num_dense_layers
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.use_expert_bias = use_expert_bias
        self.routed_scaling_factor = routed_scaling_factor
        self.norm_topk_prob = norm_topk_prob
        self.layer_types = layer_types
        # Original checkpoints may use `tie_embedding` instead of the HF
        # `tie_word_embeddings` key; prefer it when present.
        tie_word_embeddings = kwargs.get(
            "tie_embedding", tie_word_embeddings
        )  # to fit original config keys
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["Lfm2MoeConfig"]

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import PretrainedConfig
class MedusaConfig(PretrainedConfig):
    """Configuration for Medusa speculative-decoding draft heads."""

    model_type = "medusa"

    def __init__(
        self,
        hidden_size: int = 4096,
        vocab_size: int = 32001,
        num_heads: int = 5,
        num_hidden_layers: int = 1,
        max_paths: int = 64,
        topk: int = 10,
        truncated_vocab_size: int | None = None,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        # Effectively unbounded sequence length for the draft heads.
        self.max_seq_len = int(2**20)
        # Heads may predict over a truncated vocabulary; fall back to the
        # full vocab when no truncation is configured.
        if truncated_vocab_size is None:
            self.truncated_vocab_size = vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size
        kwargs.setdefault("architectures", ["MedusaModel"])
        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike,
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )
        # Checkpoints name these fields inconsistently (e.g.
        # "medusa_num_heads" / "medusa_num_layers"); normalize any key
        # containing "num" plus "heads"/"layers".
        for key in list(config_dict):
            if "num" not in key:
                continue
            if "heads" in key:
                config_dict["num_heads"] = config_dict.pop(key)
            elif "layers" in key:
                config_dict["num_hidden_layers"] = config_dict.pop(key)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        # Medusa heads are MLPs and contribute no attention heads.
        return 0

    @property
    def num_lookahead_tokens(self):
        # One speculative token per Medusa head.
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import PretrainedConfig
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
Qwen2_5OmniTextConfig,
)
class DashengConfig(PretrainedConfig):
    """Configuration for the Dasheng audio encoder used by MiDashengLM."""

    model_type = "midashenglm_dasheng_encoder"

    def __init__(
        self,
        embed_dim: int = 768,
        outputdim: int = 527,
        patch_size: int | tuple[int, int] = 16,
        patch_stride: int | tuple[int, int] = 16,
        input_channels: int = 1,
        target_length: int = 1012,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        init_values: float | None = None,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        f_min: float = 0.0,
        f_max: float = 8000.0,
        center: bool = True,
        win_length: int = 512,
        hop_length: int = 160,
        sample_rate: int = 16000,
        n_fft: int = 512,
        n_mels: int = 64,
        **kwargs,
    ):
        # --- Mel-spectrogram frontend parameters ---
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.win_length = win_length
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max
        self.center = center
        # --- Patch embedding over the spectrogram ---
        self.input_channels = input_channels
        self.patch_size = patch_size
        self.patch_stride = patch_stride
        self.target_length = target_length
        # --- Transformer encoder ---
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        # --- Output head ---
        self.outputdim = outputdim
        super().__init__(**kwargs)
class MiDashengLMConfig(PretrainedConfig):
    """Configuration for MiDashengLM: a Dasheng audio encoder feeding a
    Qwen2.5-Omni text backbone."""

    model_type = "midashenglm"

    def __init__(
        self,
        audio_encoder_config: dict | None = None,
        subsample_factor: int = 5,
        text_config: dict | None = None,
        audio_token_id: int | None = None,
        **kwargs,
    ):
        self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
        self.subsample_factor = subsample_factor
        if text_config:
            self.text_config = Qwen2_5OmniTextConfig(**text_config)
        else:
            self.text_config = Qwen2_5OmniTextConfig()
        self.text_config.rope_parameters = None  # uses_mrope is false
        self.audio_token_id = audio_token_id
        super().__init__(**kwargs)

View File

@@ -0,0 +1,235 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers import PretrainedConfig, WhisperConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
def adapt_config_dict(
    config_dict: dict[str, Any],
    defaults: dict[str, Any],
) -> PretrainedConfig:
    """Convert a Mistral-format config dict into an HF ``PretrainedConfig``.

    Renames Mistral keys to their HF equivalents, selects the vLLM
    architecture (Mamba2 / MistralLarge3 / Mixtral / Mistral, optionally
    wrapped for vision or audio models), then fills in ``defaults`` for
    any keys still missing.
    """
    config_dict = _remap_general_mistral_args(config_dict)
    if bool(config_dict.get("quantization")):
        config_dict = _remap_mistral_quantization_args(config_dict)
    is_moe = bool(config_dict.get("moe"))
    # MistralLarge3 is distinguished from plain Mixtral-style MoE by the
    # presence of shared experts.
    is_mistral_large_3 = (
        is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0
    )
    if config_dict.get("model_type") == "mamba":
        config_dict["architectures"] = ["Mamba2ForCausalLM"]
    elif is_moe and is_mistral_large_3:
        # MistralLarge3 reuses the DeepseekV3 MoE config layout.
        config_dict = _remap_moe_args(config_dict)
        config_dict["model_type"] = "deepseek_v3"
        config_dict["architectures"] = ["MistralLarge3ForCausalLM"]
        assert "llama_4_scaling" in config_dict, (
            "MistralLarge3 expect llama4 scaling config."
        )
        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
        assert all(
            [
                key in config_dict["llama_4_scaling"]
                for key in llama_4_scaling_config_keys
            ]
        ), (
            "llama_4_scaling config should define the keys: "
            f"{','.join(llama_4_scaling_config_keys)}"
        )
    elif is_moe:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]
    if bool(config_dict.get("yarn")):
        config_dict = _remap_mistral_yarn_args(config_dict)
    # NOTE(review): this repeats the llama_4_scaling key validation done in
    # the MistralLarge3 branch above — candidate for a shared helper.
    if bool(config_dict.get("llama_4_scaling")):
        llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"]
        assert all(
            [
                key in config_dict["llama_4_scaling"]
                for key in llama_4_scaling_config_keys
            ]
        ), (
            "llama_4_scaling config should define the keys: "
            f"{','.join(llama_4_scaling_config_keys)}"
        )
    # Multimodal detection: both the nested "multimodal" layout and the
    # older top-level "vision_encoder" layout are recognized.
    is_vision = (config_dict.get("multimodal") or {}).get(
        "vision_encoder_args"
    ) or config_dict.get("vision_encoder")
    is_audio = bool(
        ((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
            "encoder_args"
        )
    )
    assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
    if is_vision:
        config_dict = _remap_mistral_vision_args(config_dict)
    if is_audio:
        config_dict = _remap_mistral_audio_args(config_dict)
    # Caller-supplied defaults only apply where the config has no value.
    for k, v in defaults.items():
        config_dict.setdefault(k, v)
    config = PretrainedConfig.from_dict(config_dict)
    logger.debug("Initialized config %s", config)
    return config
def _remap_mistral_vision_args(config: dict) -> dict:
    """Wrap a vision-capable Mistral config into a Pixtral-style
    text_config/vision_config pair."""
    # Newer checkpoints nest the encoder under "multimodal"; older ones use
    # a top-level "vision_encoder" section.
    if config.get("multimodal"):
        vision_args = config.pop("multimodal")
    else:
        vision_args = config.pop("vision_encoder")
    quantization = config.get("quantization_config")
    remapped = {
        "model_type": "pixtral",
        "architectures": ["PixtralForConditionalGeneration"],
        "text_config": PretrainedConfig.from_dict(config),
        "vision_config": PretrainedConfig.from_dict(vision_args),
    }
    if quantization:
        remapped["quantization_config"] = quantization
    return remapped
def _remap_mistral_yarn_args(config: dict) -> dict:
yarn_config_map = {
"factor": "factor",
"original_max_position_embeddings": "original_max_position_embeddings",
"beta": "beta_fast",
"alpha": "beta_slow",
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_parameters"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}
if rope_theta := config.pop("rope_theta", None):
config["rope_parameters"]["rope_theta"] = rope_theta
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
return config
def _remap_general_mistral_args(config: dict) -> dict:
# Mistral key -> HF key
config_mapping = {
"dim": "hidden_size",
"norm_eps": "rms_norm_eps",
"n_kv_heads": "num_key_value_heads",
"n_layers": "num_hidden_layers",
"n_heads": "num_attention_heads",
"hidden_dim": "intermediate_size",
}
# HF key -> (Mistral key, default value)
top_level_mapping_with_default = {
"model_type": ("model_type", "transformer"),
"hidden_act": ("activation", "silu"),
"tie_word_embeddings": ("tied_embeddings", False),
"max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
"max_position_embeddings": ("max_position_embeddings", 128_000),
}
for key, new_key in config_mapping.items():
if key in config:
config[new_key] = config.pop(key)
for new_key, (key, default_value) in top_level_mapping_with_default.items():
config[new_key] = config.pop(key, default_value)
return config
def _remap_mistral_quantization_args(config: dict) -> dict:
if config.get("quantization"):
quantization = config.pop("quantization", {})
if quantization.get("qformat_weight") == "fp8_e4m3":
qscheme_act = quantization.get("qscheme_act")
assert qscheme_act in ("NO_SCALES", "TENSOR", None), (
"Only NO_SCALES and TENSOR (default) are supported for qscheme_act"
)
is_dynamic = qscheme_act == "NO_SCALES"
config["quantization_config"] = {
"quant_method": "fp8",
"activation_scheme": "dynamic" if is_dynamic else "static",
}
else:
raise ValueError(f"Found unknown quantization='{quantization}' in config")
return config
def _remap_mistral_audio_args(config: dict) -> dict:
    """Wrap an audio-capable Mistral config into a Voxtral-style
    text_config/audio_config pair."""
    whisper_args = config["multimodal"].pop("whisper_model_args")
    encoder_args = whisper_args["encoder_args"]
    audio_enc = encoder_args["audio_encoding_args"]
    downsample_args = whisper_args["downsample_args"]
    quantization = config.get("quantization_config")
    audio_config = WhisperConfig(
        num_mel_bins=audio_enc["num_mel_bins"],
        window_size=audio_enc["window_size"],
        sampling_rate=audio_enc["sampling_rate"],
        hop_length=audio_enc["hop_length"],
        downsample_factor=downsample_args["downsample_factor"],
        d_model=encoder_args["dim"],
        encoder_layers=encoder_args["n_layers"],
        encoder_ffn_dim=encoder_args["hidden_dim"],
        encoder_attention_heads=encoder_args["n_heads"],
        vocab_size=encoder_args["vocab_size"],
        max_source_positions=encoder_args["max_source_positions"],
        # WhisperConfig defaults to an encoder-decoder model; Voxtral only
        # uses the encoder.
        is_encoder_decoder=False,
    )
    remapped = {
        "model_type": "whixtral",
        "architectures": ["VoxtralForConditionalGeneration"],
        "text_config": PretrainedConfig.from_dict(config),
        "audio_config": audio_config,
    }
    if quantization:
        remapped["quantization_config"] = quantization
    return remapped
def _remap_moe_args(config: dict) -> dict:
moe_config_map = {
"route_every_n": "moe_layer_freq",
"first_k_dense_replace": "first_k_dense_replace",
"num_experts_per_tok": "num_experts_per_tok",
"num_experts": "n_routed_experts",
"expert_hidden_dim": "moe_intermediate_size",
"routed_scale": "routed_scaling_factor",
"num_shared_experts": "n_shared_experts",
"num_expert_groups": "n_group",
"num_expert_groups_per_tok": "topk_group",
}
moe_config = config.get("moe", {})
for old_name, new_name in moe_config_map.items():
if old_name in moe_config:
value = moe_config.pop(old_name)
config[new_name] = value
config["topk_method"] = None
config["norm_topk_prob"] = True
config["scoring_func"] = "softmax"
return config

View File

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig
class MLPSpeculatorConfig(PretrainedConfig):
    """Configuration for an MLP speculator (speculative-decoding drafter).

    Args:
        vocab_size: the model vocab size.
        emb_dim: the model embedding dimension.
        inner_dim: inner dimension of the model; 0 means "use emb_dim".
        n_predict: number of lookaheads for the speculator.
        top_k_tokens_per_head: number of tokens to consider from each head
            when forming the candidate tree; head n produces topk[n]
            sub-branches per branch. NOTE: currently unused.
        n_candidates: number of child candidates to create per sequence.
        tie_weights: if True, share one set of weights across every head
            after the first (the initial projection from the base model may
            differ in size, so it stays separate).
        scale_input: if True, scale the initial hidden states from the
            base model.
    """

    model_type = "mlp_speculator"
    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(
        self,
        vocab_size: int = 32000,
        emb_dim: int = 4096,
        inner_dim: int = 0,
        n_predict: int = 3,
        top_k_tokens_per_head: list[int] | None = None,
        n_candidates: int = 5,
        tie_weights: bool = False,
        scale_input: bool = False,
        **kwargs,
    ):
        top_k_tokens_per_head = (
            [5, 4, 3] if top_k_tokens_per_head is None else top_k_tokens_per_head
        )
        # One top-k entry is required per predictor head.
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        # One speculative token per predictor head.
        self.num_lookahead_tokens = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.tie_weights = tie_weights
        self.scale_input = scale_input
        super().__init__(**kwargs)

View File

@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers.configuration_utils import PretrainedConfig
class MoonViTConfig(PretrainedConfig):
    """Configuration for the MoonViT vision tower used by Kimi-VL."""

    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Patch embedding and positional-embedding grid size at init.
        self.patch_size = patch_size
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer tower.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        # Kernel used by the patch merger after the tower.
        self.merge_kernel_size = merge_kernel_size

View File

@@ -1,178 +0,0 @@
# coding=utf-8
# Copied from
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
"""A HuggingFace-style model configuration."""
import warnings
from typing import Any, Dict, Optional, Union
from transformers import PretrainedConfig
# Defaults merged into `MPTConfig.attn_config` by `_validate_config` for
# any keys the user config omits.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}

# Default feed-forward block settings.
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}

# Default weight-initialization settings.
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}
class MPTConfig(PretrainedConfig):
    """MPT (MosaicML Pretrained Transformer) model configuration.

    NOTE(review): `attn_config`, `ffn_config` and `init_config` default to
    shared module-level dicts, and `_validate_config` mutates them in place
    (e.g. it writes `fc_type` into `ffn_config`), so defaults can leak
    across instances — confirm this is intended before reusing this class.
    """
    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Dict = attn_config_defaults,
                 ffn_config: Dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.ffn_config = ffn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        self.fc_type = fc_type
        # `verbose` is accepted for backward compatibility only.
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        # Drop training-only keys some checkpoints carry before they reach
        # PretrainedConfig.__init__.
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        # ALiBi replaces learned positional embeddings.
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}`',
                stacklevel=2)
        super().__init__(**kwargs)

        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        """Fill missing keys of `config` from `config_defaults`, in place."""
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self) -> None:
        """Apply sub-config defaults and reject inconsistent settings."""
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
        )):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        # prefix_lm / alibi / attn_uses_sequence_id are unsupported by the
        # 'flash' implementation.
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        # The 'te' fc_type requires TransformerEngine to be importable.
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import fail. `fc_type: te` requires '
                    'TransformerEngine be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 is installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias

View File

@@ -0,0 +1,220 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Nemotron model configuration"""
from transformers import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate a Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.
    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention
            (MQA) otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`dict`, *optional*):
            The parameters of the RoPE embeddings. Expected contents:
            `rope_theta` (`float`): The base period of the RoPE embeddings.
            `rope_type` (`str`):
                The sub-variant of RoPE to use. This implementation only
                accepts ['default', 'linear', 'dynamic'] (enforced by
                `_rope_parameters_validation`), with 'default' being the
                original RoPE implementation.
            `partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
                Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier used by the AutoConfig registry / `config.json` files.
    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Legacy checkpoints may carry the head dimension as `kv_channels`.
        # NOTE(review): this is `get`, not `pop`, so `kv_channels` stays in
        # kwargs and is forwarded to `super().__init__` — presumably so it
        # survives in the serialized config; confirm before changing.
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = (
            head_dim if head_dim is not None else (hidden_size // num_attention_heads)
        )
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
        # Precedence: legacy `rope_scaling` kwarg > explicit `rope_parameters`
        # argument > a default-RoPE dict.
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        # A `rope_theta` already present inside the dict wins over the
        # top-level kwarg (which is popped either way).
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        # for backward compatibility
        # First truthy legacy alias wins; note a falsy explicit value (0/0.0)
        # also falls through to the 0.5 default.
        partial_rotary_factor = (
            kwargs.get("rope_percent")
            or kwargs.get("rope_percentage")
            or kwargs.get("partial_rotary_factor")
            or 0.5
        )
        if "partial_rotary_factor" not in rope_parameters:
            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
        self.rope_parameters = rope_parameters
        # Fail fast on malformed RoPE settings before the base-class init.
        self._rope_parameters_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_parameters_validation(self):
        """
        Validate the `rope_parameters` configuration.

        Raises:
            ValueError: if `rope_type` is not one of 'default', 'linear',
                'dynamic', or if a non-default type lacks a float `factor`
                strictly greater than 1.
        """
        if self.rope_parameters is None:
            return
        rope_type: str | None = self.rope_parameters.get("rope_type", None)
        factor: float | None = self.rope_parameters.get("factor", None)
        if rope_type not in {"default", "linear", "dynamic"}:
            raise ValueError(
                "`rope_type` must be one of ['default', 'linear', 'dynamic'], "
                f"got {rope_type}"
            )
        if rope_type != "default":
            if factor is None:
                raise ValueError(
                    "If `rope_type` is not 'default', `rope_parameters` "
                    "must include a `factor` field. Got `None`."
                )
            # NOTE(review): an integer factor (e.g. 2 from JSON) is rejected
            # by the isinstance(float) check — confirm this strictness is
            # intended before relaxing it.
            if not isinstance(factor, float) or factor <= 1.0:
                raise ValueError(
                    "`rope_parameters`'s factor field must be a float > 1, got "
                    f"{factor}"
                )

View File

@@ -0,0 +1,284 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NemotronH model configuration"""
import regex as re
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class NemotronHConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.

    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  # Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,  # * ADDED
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        n_routed_experts=8,
        n_shared_experts=1,
        moe_intermediate_size=7688,
        moe_shared_expert_intermediate_size=7688,
        moe_latent_size=None,
        num_experts_per_tok=2,
        routed_scaling_factor=1.0,
        n_group=1,
        topk_group=1,
        norm_topk_prob=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout
        # Validate hybrid_override_pattern
        # M: Mamba2, *: Attention, -: MLP
        # Use explicit raises (not `assert`) so validation survives `python -O`.
        if len(self.hybrid_override_pattern) != self.num_hidden_layers:
            raise ValueError(
                "hybrid_override_pattern must have same length as num_hidden_layers"
            )
        # BUGFIX: the previous pattern r"^[*-M]+$" made '*-M' a character
        # RANGE (0x2A-0x4D), wrongly accepting digits, 'A'..'L', '+', '?',
        # '@', etc.  Placing '-' last makes it a literal hyphen.
        if not re.match(r"^[M*-]+$", self.hybrid_override_pattern):
            raise ValueError(
                "hybrid_override_pattern must only contain characters 'M', '*', or '-'"
            )
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32
        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep
        self.use_mamba_kernels = use_mamba_kernels
        # Mamba arguments are stored under the attribute names the mixer
        # implementation reads (e.g. `mamba_d_conv` -> `conv_kernel`).
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual
        # MoE-related fields, stored verbatim for the model implementation.
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
        self.moe_latent_size = moe_latent_size
        self.num_experts_per_tok = num_experts_per_tok
        self.routed_scaling_factor = routed_scaling_factor
        self.n_group = n_group
        self.topk_group = topk_group
        self.norm_topk_prob = norm_topk_prob
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        """Per-layer block type derived from `hybrid_override_pattern`.

        NOTE: the trailing "moe" arm is only reachable for characters outside
        {'M', '*', '-'}, which the pattern validation in __init__ rejects.
        """
        return [
            "mamba"
            if self.hybrid_override_pattern[i] == "M"
            else "attention"
            if self.hybrid_override_pattern[i] == "*"
            else "mlp"
            if self.hybrid_override_pattern[i] == "-"
            else "moe"
            for i in range(self.num_hidden_layers)
        ]

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
class Olmo3Config(PretrainedConfig):
    """Configuration for OLMo-3 checkpoints.

    Loads an OLMo-3 `config.json` while rewriting the declared architecture
    so vLLM serves it through its Olmo2 implementation.
    """

    model_type = "olmo3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=None,
        eos_token_id=50279,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        rms_norm_eps=1e-5,
        sliding_window=4096,
        layer_types=None,
        **kwargs,
    ):
        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
        # in vLLM.
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["Olmo2ForCausalLM"]
        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
            kwargs["architectures"].remove("Olmo3ForCausalLM")
            kwargs["architectures"].append("Olmo2ForCausalLM")
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
        # NOTE(review): these pops happen AFTER super().__init__(**kwargs)
        # above already consumed kwargs, so any `rope_scaling`/`rope_theta`
        # kwargs have also been handed to the base class — presumably
        # intentional so they remain on the config; confirm before reordering.
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        self.rope_parameters = rope_parameters
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.sliding_window = sliding_window
        self.layer_types = layer_types
        # Default layout: every 4th layer uses full attention, the rest use
        # sliding-window attention.
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

View File

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
# Ovis Config with AimV2 config registration removed for Transformers compatibility
from typing import Any
from transformers import AutoConfig, PretrainedConfig
class AIMv2Config(PretrainedConfig):
    """Configuration class for an [`AIMv2Model`] vision backbone.

    Instantiating with the defaults yields a configuration similar to
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the SwiGLU representations.
        num_hidden_layers: Number of hidden layers in the Transformer.
        num_attention_heads: Number of attention heads per attention layer.
        num_channels: Number of input channels.
        image_size: Input image size.
        patch_size: Patch size.
        rms_norm_eps: Epsilon for the RMS normalization layers.
        attention_dropout: Dropout ratio for attention probabilities.
        projection_dropout: Dropout ratio for the post-attention projection.
        qkv_bias: Whether the query/key/value projections carry a bias.
        use_bias: Whether the feed-forward and projection layers carry a bias.
        kwargs: Extra keyword arguments forwarded to [`PretrainedConfig`].
    """

    model_type: str = "aimv2"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        projection_dropout: float = 0.0,
        qkv_bias: bool = False,
        use_bias: bool = False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        # Transformer trunk dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Input / patch-embedding geometry.
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        # Normalisation and regularisation.
        self.rms_norm_eps = rms_norm_eps
        self.attention_dropout = attention_dropout
        self.projection_dropout = projection_dropout
        # Bias switches.
        self.qkv_bias = qkv_bias
        self.use_bias = use_bias
# ----------------------------------------------------------------------
# Visual Tokenizer Configuration
# ----------------------------------------------------------------------
class BaseVisualTokenizerConfig(PretrainedConfig):
    """Common configuration shared by Ovis visual tokenizers.

    Holds the tokenizer vocabulary/temperature settings and resolves the
    nested vision-backbone config (AIMv2 or any AutoConfig-registered type).
    """

    def __init__(
        self,
        vocab_size=16384,
        tokenize_function="softmax",
        tau=1.0,
        depths=None,
        drop_cls_token=False,
        backbone_config: PretrainedConfig | dict | None = None,
        hidden_stride: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.tokenize_function = tokenize_function
        self.tau = tau
        # Serialized configs may encode depths as a pipe-separated string,
        # e.g. "2|2|6|2".
        if isinstance(depths, str):
            depths = [int(x) for x in depths.split("|")]
        self.depths = depths
        # Extra kwargs subclasses want forwarded to the backbone; filled in
        # by subclass __init__ (e.g. a truncated num_hidden_layers).
        self.backbone_kwargs = dict[str, Any]()
        self.drop_cls_token = drop_cls_token
        if backbone_config is not None:
            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
            )
            # A dict is rehydrated into a concrete config class; "aimv2" maps
            # to the local AIMv2Config since it is not AutoConfig-registered.
            if not isinstance(backbone_config, PretrainedConfig):
                model_type = backbone_config["model_type"]
                if model_type != "aimv2":
                    backbone_config.pop("model_type")
                    backbone_config = AutoConfig.for_model(
                        model_type, **backbone_config
                    )
                else:
                    # NOTE(review): "model_type" is NOT popped here, so it is
                    # forwarded to AIMv2Config(**...) as a kwarg — presumably
                    # harmless via PretrainedConfig kwargs handling; confirm.
                    backbone_config = AIMv2Config(**backbone_config)
        self.backbone_config = backbone_config
        self.hidden_stride = hidden_stride
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Visual tokenizer configuration backed by an AIMv2 vision encoder."""

    model_type = "aimv2_visual_tokenizer"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # CLS-token dropping is not supported here; force-disable it.
        if self.drop_cls_token:
            self.drop_cls_token = False
        if self.depths:
            # Exactly one depth is supported; it truncates the backbone's
            # number of hidden layers.  Raise (not assert) so the check
            # survives `python -O`.
            if len(self.depths) != 1:
                raise ValueError(
                    f"expected exactly one depth, got {self.depths!r}"
                )
            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Visual tokenizer configuration backed by a SigLIP vision encoder."""

    model_type = "siglip_visual_tokenizer"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # CLS-token dropping is not supported here; force-disable it.
        if self.drop_cls_token:
            self.drop_cls_token = False
        if self.depths:
            # Exactly one depth is supported; it truncates the backbone's
            # number of hidden layers.  Raise (not assert) so the check
            # survives `python -O`.
            if len(self.depths) != 1:
                raise ValueError(
                    f"expected exactly one depth, got {self.depths!r}"
                )
            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
# Register the visual tokenizer configs so AutoConfig can resolve them by
# their `model_type` strings when loading serialized Ovis checkpoints.
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
# ----------------------------------------------------------------------
# Ovis Configuration
# ----------------------------------------------------------------------
class OvisConfig(PretrainedConfig):
    """Top-level configuration for the Ovis multimodal model.

    Composes an LLM config (exposed as `text_config`) with a visual
    tokenizer config, both of which may arrive as serialized dicts.
    """

    model_type = "ovis"

    def __init__(
        self,
        llm_config: PretrainedConfig | dict | None = None,
        visual_tokenizer_config: PretrainedConfig | dict | None = None,
        multimodal_max_length=8192,
        hidden_size=None,
        conversation_formatter_class=None,
        llm_attn_implementation=None,
        disable_tie_weight=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if llm_config is not None:
            assert isinstance(llm_config, (PretrainedConfig, dict)), (
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
            )
            # Rehydrate a serialized dict into its concrete config class.
            if not isinstance(llm_config, PretrainedConfig):
                model_type = llm_config["model_type"]
                llm_config.pop("model_type")
                llm_config = AutoConfig.for_model(model_type, **llm_config)
        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
            )
            # Resolved through AutoConfig: the tokenizer config classes are
            # registered at module import time (see AutoConfig.register above).
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
                model_type = visual_tokenizer_config["model_type"]
                visual_tokenizer_config.pop("model_type")
                visual_tokenizer_config = AutoConfig.for_model(
                    model_type, **visual_tokenizer_config
                )
        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

View File

@@ -0,0 +1,277 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3-Next model configuration"""
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.utils import logging
logger = logging.get_logger(__name__)
class Qwen3NextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
    Qwen3-Next model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
            `rope_theta` (`float`): The base period of the RoPE embeddings.
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
        layer_types (`list[str]`, *optional*):
            Types of each layer (attention or linear).
    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
    >>> # Initializing a Qwen3Next style configuration
    >>> configuration = Qwen3NextConfig()
    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """  # noqa: E501

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Tensor-parallel sharding plan for the base model weights.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel plan: stage name -> (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        layer_types=None,
        **kwargs,
    ):
        # Avoid a shared mutable default for `mlp_only_layers`.
        if mlp_only_layers is None:
            mlp_only_layers = []
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
        # (back-compat: older checkpoints store `rope_scaling`/`rope_theta`/
        # `partial_rotary_factor` as top-level keys, which arrive via kwargs).
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
        if "partial_rotary_factor" not in rope_parameters:
            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
        self.rope_parameters = rope_parameters
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.layer_types = layer_types
        # Default layout: every 4th layer (indices 3, 7, 11, ...) is full
        # attention; all other layers use linear attention.
        if self.layer_types is None:
            self.layer_types = [
                "linear_attention" if bool((i + 1) % 4) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)
        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads
        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = mlp_only_layers


__all__ = ["Qwen3NextConfig"]

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Radio vision model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
"vit_small_patch16_224": (384, 12, 6, 1536),
"vit_base_patch16_224": (768, 12, 12, 3072),
"vit_large_patch16_224": (1024, 24, 16, 4096),
"vit_huge_patch16_224": (1280, 32, 16, 5120),
}
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
class RadioConfig(PretrainedConfig):
    """Configuration for a Radio vision encoder.

    The transformer dimensions (hidden size, depth, heads, MLP width) are not
    passed directly; they are looked up from ``VIT_TIMM_DIM_BY_NAME`` using
    ``model_name``.

    Args:
        model_name: timm-style ViT identifier
            (e.g. ``"vit_base_patch16_224"``); must be a key of
            ``VIT_TIMM_DIM_BY_NAME``.
        image_size: The size (resolution) of each input image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether the query/key/value projections carry bias terms.
        qk_normalization: Whether queries and keys are normalized.
        norm_type: Which normalization layer variant to use.
        layer_norm_eps: Epsilon for the normalization layers.
        initializer_factor: Scale factor applied when initializing weights.
        hidden_act: Activation function used in the encoder.
        max_img_size: Largest image size the position embeddings support.
        norm_mean: Per-channel (RGB) mean for image normalization;
            defaults to the OpenAI CLIP statistics.
        norm_std: Per-channel (RGB) std for image normalization;
            defaults to the OpenAI CLIP statistics.
        reg_tokens: Number of register tokens, if any.
    """

    model_type = "radio"

    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
        reg_tokens: int | None = None,
        **kwargs,
    ):
        self.model_name = model_name
        # Architecture dimensions are derived from the model-name lookup
        # table rather than taken as explicit arguments.
        dims = VIT_TIMM_DIM_BY_NAME[model_name]
        self.hidden_size = dims[0]
        self.num_hidden_layers = dims[1]
        self.num_attention_heads = dims[2]
        self.intermediate_size = dims[3]
        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size
        # Store normalization stats as plain lists so the config serializes
        # cleanly; pass anything else through untouched.
        self.norm_mean = (
            norm_mean if not isinstance(norm_mean, (tuple, list)) else list(norm_mean)
        )
        self.norm_std = (
            norm_std if not isinstance(norm_std, (tuple, list)) else list(norm_std)
        )
        self.reg_tokens = reg_tokens
        super().__init__(**kwargs)

View File

@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Registry mapping a speculators algorithm name to its config-update hook.
SUPPORTED_SPECULATORS_TYPES = {}


def register_speculator(name):
    """Decorator factory: register the decorated updater under *name*.

    The updater is stored in ``SUPPORTED_SPECULATORS_TYPES`` and returned
    unchanged, so the decorated function remains directly callable.
    """

    def decorator(fn):
        SUPPORTED_SPECULATORS_TYPES[name] = fn
        return fn

    return decorator


@register_speculator("eagle3")
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
    """
    Apply Eagle-3 specific configuration transformations.

    Eagle-3 specific fields:
    - draft_vocab_size: Size of the draft model's vocabulary
    - target_hidden_size: Hidden size of the target model
    - norm_before_residual: Whether to apply norm before residual connection
    - eagle_aux_hidden_state_layer_ids: List of layer indices from the base
      model to use as auxiliary inputs for the Eagle3 drafter. These layers
      provide intermediate hidden states that help the drafter make better
      predictions. This is the standard field used in Eagle3 checkpoints.
    """
    vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
    # Only forward target_hidden_size when the checkpoint actually sets it.
    target_hidden = config_dict.get("target_hidden_size")
    if target_hidden is not None:
        vllm_config["target_hidden_size"] = target_hidden
    vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
    vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
    # Auxiliary hidden-state layer ids are optional; skip falsy/missing.
    aux_layers = config_dict.get("eagle_aux_hidden_state_layer_ids")
    if aux_layers:
        vllm_config["eagle_aux_hidden_state_layer_ids"] = aux_layers

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Any
from transformers import PretrainedConfig
from vllm.transformers_utils.configs.speculators.algos import (
SUPPORTED_SPECULATORS_TYPES,
)
__all__ = ["SpeculatorsConfig"]
class SpeculatorsConfig(PretrainedConfig):
    """Adapter that loads speculators-format checkpoints and converts them
    into the speculative-decoding config layout vLLM expects."""

    model_type = "speculators"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike,
        **kwargs,
    ) -> "SpeculatorsConfig":
        """Load speculators Eagle config and convert to vLLM format."""
        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        vllm_config = cls.extract_vllm_speculative_config(config_dict)
        return cls(**vllm_config)

    @classmethod
    def extract_vllm_speculative_config(
        cls, config_dict: dict[str, Any]
    ) -> dict[str, Any]:
        """Validate and convert a raw speculators config dict.

        Args:
            config_dict: Configuration dictionary in speculators format.

        Returns:
            Dictionary in vLLM's speculative config format.

        Raises:
            ValueError: If the algorithm is unsupported or required fields
                are missing.
        """
        speculators_model_type = config_dict.get("speculators_model_type")
        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
            # Include the offending value and only the supported algorithm
            # *names* (the registry maps names to function objects, which
            # would render unreadably in the message).
            raise ValueError(
                f"Unsupported speculators_model_type {speculators_model_type!r}. "
                f"Expected one of: {sorted(SUPPORTED_SPECULATORS_TYPES)}. "
                "Please ensure you're loading a speculators-format model."
            )
        # validate fields
        # TODO: @dsikka - use speculators pydantic model to validate
        cls.validate_speculators_config(config_dict=config_dict)
        # Convert from speculators config -> format that can be ingested by vLLM
        vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
        # Apply anything specific to the supported algorithm
        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
        return vllm_config

    @classmethod
    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
        """Check the minimal speculators-format structure.

        EAFP: index into the expected nesting and translate any shape error
        (missing key, empty list, wrong type) into a single ValueError.
        """
        try:
            spec_config = config_dict["speculators_config"]
            methods = spec_config["proposal_methods"]
            first_method = methods[0]
            _ = first_method["speculative_tokens"]
            _ = spec_config["verifier"]["name_or_path"]
            _ = config_dict["speculators_model_type"]
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError("Invalid speculators config structure") from e

        if "transformer_layer_config" not in config_dict:
            raise ValueError("Must provide transformer_layer_config")
        if not isinstance(config_dict["transformer_layer_config"], dict):
            raise TypeError(
                "'transformer_layer_config' must be a dictionary if provided"
            )

    @classmethod
    def build_vllm_speculative_config(
        cls, config_dict: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Build vLLM-compatible speculative configuration from speculators format.

        This method extracts and transforms speculative configuration from the
        speculators format into the structure expected by vLLM.

        Args:
            config_dict: Configuration dictionary in speculators format

        Returns:
            Dictionary with vLLM-compatible speculative configuration
        """
        # Extract speculators configuration
        spec_config = config_dict["speculators_config"]

        # Currently we only support one proposal method
        proposal_methods = spec_config.get("proposal_methods")
        if not proposal_methods:
            raise ValueError("No proposal methods found in speculators config")

        first_method = proposal_methods[0]
        num_speculative_tokens = first_method.get("speculative_tokens")
        if num_speculative_tokens is None:
            raise ValueError(
                f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
            )

        # Build base vLLM speculative configuration
        vllm_config = {
            "method": config_dict.get("speculators_model_type"),
            "num_speculative_tokens": num_speculative_tokens,
            "target_model": spec_config.get("verifier")["name_or_path"],
        }

        # Merge transformer layer configuration if present
        transformer_config = config_dict.get("transformer_layer_config", {})
        vllm_config.update(transformer_config)
        return vllm_config

View File

@@ -0,0 +1,178 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class Step3VisionEncoderConfig(PretrainedConfig):
    """Configuration for the Step3 vision encoder tower.

    Holds the transformer shape (width, depth, heads), the input
    patchification parameters, and per-layer settings.
    """

    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size=1792,
        intermediate_size=3072,
        output_hidden_size=4096,
        num_hidden_layers=63,
        num_attention_heads=16,
        num_channels=3,
        image_size=728,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        # Transformer shape.
        self.hidden_size, self.intermediate_size = hidden_size, intermediate_size
        # Width of the encoder's output features (may differ from hidden_size).
        self.output_hidden_size = output_hidden_size
        self.num_hidden_layers, self.num_attention_heads = (
            num_hidden_layers,
            num_attention_heads,
        )
        # Input / patchification parameters.
        self.num_channels = num_channels
        self.patch_size, self.image_size = patch_size, image_size
        # Per-layer settings.
        self.layer_norm_eps, self.hidden_act = layer_norm_eps, hidden_act
        super().__init__(**kwargs)
class Step3TextConfig(PretrainedConfig):
    """Configuration for the Step3 text (language-model) tower.

    Combines standard transformer hyperparameters with grouped attention
    (``num_attention_groups``), MoE routing settings, and RoPE parameters
    handled with the same ``rope_scaling`` back-compat path as other
    configs in this package.

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the dense MLP representations.
        num_attention_heads: Number of attention heads per layer.
        num_attention_groups: Number of key/value groups for grouped
            attention.
        num_hidden_layers: Number of decoder layers.
        max_seq_len: Maximum sequence length.
        vocab_size: Vocabulary size.
        rms_norm_eps: Epsilon used by the RMS normalization layers.
        moe_intermediate_size: Intermediate size of each routed expert.
        moe_num_experts: Number of routed experts.
        moe_top_k: Number of experts selected per token.
        rope_parameters: RoPE configuration dict; ``rope_theta`` defaults
            to 500000.0 when not provided (legacy ``rope_scaling`` /
            ``rope_theta`` kwargs are also honored).
        max_position_embedding: Maximum position embedding length.
        share_expert_dim: Intermediate size of the shared expert.
        share_q_dim: Dimension of the shared query projection.
        head_dim: Per-head projection dimension.
        norm_expert_weight: Whether to normalize routed expert weights.
        moe_layers_enum: Indices of the layers that use MoE blocks.
            Defaults to layers 4 through 59 inclusive.
    """

    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_parameters: dict[str, Any] | None = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        # Same value as the previous explicit 56-element literal (4..59),
        # expressed compactly; tuples are immutable so a shared default is
        # safe.
        moe_layers_enum: tuple[int, ...] = tuple(range(4, 60)),
        **kwargs,
    ) -> None:
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
        # (back-compat: older checkpoints pass `rope_scaling`/`rope_theta`
        # as top-level keys, arriving here via kwargs).
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
        rope_theta = kwargs.pop("rope_theta", 500000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
        self.rope_parameters = rope_parameters
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum
        super().__init__(**kwargs)
class Step3VLConfig(PretrainedConfig):
    """Top-level configuration for the Step3 vision-language model.

    Wraps a vision encoder config and a text config, plus the projector
    settings that connect the two towers.
    """

    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: dict | Step3VisionEncoderConfig | None = None,
        text_config: dict | Step3TextConfig | None = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        # Coerce sub-configs: a dict is expanded into the matching config
        # class; None selects that class's defaults.
        if isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**vision_config)
        elif vision_config is None:
            vision_config = Step3VisionEncoderConfig()
        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config = Step3TextConfig(**text_config)
        elif text_config is None:
            text_config = Step3TextConfig()
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        # Mirror the text tower's hidden size at the top level for callers
        # that read `config.hidden_size` directly.
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id
        super().__init__(**kwargs)

View File

@@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import Qwen2VLConfig
class Tarsier2Config(Qwen2VLConfig):
    """
    Tarsier2's config.json is written such that AutoConfig.from_pretrained will create
    a deeply nested config consisting of:
    - LlavaConfig
        - Qwen2VLConfig
            - Qwen2VLTextConfig
            - Qwen2VLVisionConfig
        - Qwen2VLConfig
            - Qwen2VLTextConfig
            - Qwen2VLVisionConfig
    When it should really just be a single Qwen2VLConfig.
    This class is a hack to stop AutoConfig from creating the nested config structure.
    """

    # A distinct model_type lets AutoConfig resolve directly to this flat
    # Qwen2VL-compatible config instead of recursing into LlavaConfig.
    model_type = "tarsier2"

View File

@@ -0,0 +1,120 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
from typing import Any
import transformers
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        hidden_size (`int`, *optional*, defaults to 4096):
            Hidden size used by the multimodal projector.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
        num_projector_layers (`int`, *optional*, defaults to 0):
            Number of projector layers.
    """

    # Config of the wrapped text (or multi-modal) backbone; set either from
    # `text_config` in __init__ or implicitly via the `text_model_id`
    # assignment hook in __setattr__ below.
    wrapped_model_config: transformers.PretrainedConfig

    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False

    def __init__(
        self,
        audio_config: dict[str, Any] | None = None,
        text_config: dict[str, Any] | None = None,
        audio_model_id: str | None = None,
        text_model_id: str | None = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        projector_ln_mid: bool = False,
        num_projector_layers: int = 0,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.audio_token_index = audio_token_index
        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid
        self.num_projector_layers = num_projector_layers

        # N.B. May set the wrapped_model_config below.
        # (The assignment triggers __setattr__, which loads the referenced
        # config when text_model_id is not None.)
        self.text_model_id = text_model_id
        if text_model_id is None:
            text_config = text_config or {}
            self.wrapped_model_config = transformers.CONFIG_MAPPING[
                text_config.get("model_type", "llama")
            ](**text_config)

        # N.B. May set the audio_config below.
        self.audio_model_id = audio_model_id
        if audio_model_id is None:
            # NOTE(review): this reassignment looks redundant — audio_model_id
            # is already None on this branch; presumably kept for symmetry with
            # the __setattr__ hook. Confirm before removing.
            self.audio_model_id = None
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[
                audio_config.get("model_type", "whisper")
            ](**audio_config)

        super().__init__(**kwargs)

    def __setattr__(self, key, value):
        # Since --hf-overrides are applied _after_ the UltravoxConfig is
        # instantiated, load the configs implicitly when assigning text_model_id
        # or audio_model_id. This allows:
        #
        # --hf-overrides.text_model_id=<quantized variant>
        #
        # to behave as intended.
        if key == "text_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.wrapped_model_config = get_config(value, trust_remote_code=False)
        elif key == "audio_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(value, trust_remote_code=False)

        return super().__setattr__(key, value)

    @property
    def text_config(self) -> transformers.PretrainedConfig:
        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
        # the full model, but the text config is the text config of the inner
        # model.
        return self.wrapped_model_config.get_text_config()

View File

@@ -1,313 +0,0 @@
from typing import Dict, List, Optional, Tuple, Union
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
# Used eg. for marking rejected tokens in spec decoding.
INVALID_TOKEN_ID = -1
class Detokenizer:
"""Provides methods to decode the output of a model into text."""
    def __init__(self, tokenizer_group: BaseTokenizerGroup):
        # The group resolves the per-request tokenizer (e.g. a LoRA-specific
        # one) used for all decoding in this detokenizer.
        self.tokenizer_group = tokenizer_group
    def get_tokenizer_for_seq(self,
                              sequence: Sequence) -> "PreTrainedTokenizer":
        """Returns the HF tokenizer to use for a given sequence."""
        # Resolution honors the sequence's LoRA request, which may select a
        # tokenizer different from the base model's.
        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
    def decode_prompt_logprobs_inplace(
            self, seq_group: SequenceGroup,
            prompt_logprobs: List[Optional[Dict[int, Logprob]]]) -> None:
        """Decodes the logprobs for the prompt of a sequence group.

        Fills in `Logprob.decoded_token` in place for each candidate token at
        each prompt position, using incremental detokenization so multi-token
        characters are rendered consistently.

        Args:
            seq_group: The sequence group to decode.
            prompt_logprobs: The logprobs to decode.

        Returns:
            The prompt logprobs with the decoded tokens.
        """
        prms = seq_group.sampling_params
        # We can pick any sequence for the prompt.
        seq = next(iter(seq_group.seqs_dict.values()))
        # Only prompt, without the generated token.
        all_token_ids = seq.get_token_ids()
        prompt_token_ids = all_token_ids[:-1]
        tokenizer = self.get_tokenizer_for_seq(seq)
        # Incremental-detokenization state, advanced one prompt position at a
        # time. The *next_iter_* variables buffer the state produced by the
        # real prompt token so candidate (non-prompt) tokens at the same
        # position don't corrupt it.
        prefix_offset = 0
        read_offset = 0
        next_iter_prefix_offset = 0
        next_iter_read_offset = 0
        next_iter_tokens = []
        prev_tokens = None

        for token_position, prompt_logprobs_for_token in enumerate(
                prompt_logprobs):
            if not prompt_logprobs_for_token:
                continue
            for token_id, sample_logprob in prompt_logprobs_for_token.items():
                # Skip tokens already decoded and placeholder (invalid) ids.
                if (sample_logprob.decoded_token is None
                        and token_id != INVALID_TOKEN_ID):
                    # Decode as if this candidate token followed the real
                    # prompt prefix up to this position.
                    prompt_token_ids_with_token = (
                        prompt_token_ids[:token_position] + [token_id])
                    (new_tokens, new_text, new_prefix_offset,
                     new_read_offset) = detokenize_incrementally(
                         tokenizer=tokenizer,
                         all_input_ids=prompt_token_ids_with_token,
                         prev_tokens=prev_tokens,
                         prefix_offset=prefix_offset,
                         read_offset=read_offset,
                         skip_special_tokens=prms.skip_special_tokens,
                         spaces_between_special_tokens=prms.
                         spaces_between_special_tokens,
                     )

                    sample_logprob.decoded_token = new_text

                    # Use the offsets & prev tokens corresponding to
                    # real tokens to ensure detokenization is consistent
                    # with the actual prompt.
                    if token_id == all_token_ids[token_position]:
                        next_iter_prefix_offset = new_prefix_offset
                        next_iter_read_offset = new_read_offset
                        next_iter_tokens = new_tokens

            # Advance to the next token position.
            prefix_offset = next_iter_prefix_offset
            read_offset = next_iter_read_offset
            if prev_tokens is None:
                prev_tokens = next_iter_tokens
            else:
                prev_tokens.extend(next_iter_tokens)
def decode_sequence_inplace(self, seq: Sequence,
                            prms: SamplingParams) -> int:
    """Decodes the new token for a sequence. In-place operation.

    Args:
        seq: The sequence to decode.
        prms: The sampling parameters used to generate the sequence.

    Returns:
        The number of characters added to the output text.
    """
    all_input_ids = seq.get_token_ids()
    token_id_generated_this_iteration = all_input_ids[-1]
    tokenizer = self.get_tokenizer_for_seq(seq)

    # Convert prompt token IDs to tokens if necessary.
    # Do it here so that we don't have to repeat this
    # computation for each logprob.
    if seq.tokens is None:
        (seq.tokens, seq.prefix_offset,
         seq.read_offset) = convert_prompt_ids_to_tokens(
             tokenizer=tokenizer,
             prompt_ids=all_input_ids[:-1],
             skip_special_tokens=prms.skip_special_tokens,
         )

    # Incrementally detokenize the token generated this iteration.
    (new_tokens, new_decoded_token_text, prefix_offset,
     read_offset) = detokenize_incrementally(
         tokenizer=tokenizer,
         all_input_ids=all_input_ids,
         prev_tokens=seq.tokens,
         prefix_offset=seq.prefix_offset,
         read_offset=seq.read_offset,
         skip_special_tokens=prms.skip_special_tokens,
         spaces_between_special_tokens=prms.spaces_between_special_tokens,
     )

    # Decode logprobs
    logprobs = seq.output_logprobs[-1]
    if logprobs:
        previous_tokens = all_input_ids[:-1]
        for token_id, sample_logprob in logprobs.items():
            # If the token was generated this iteration,
            # use the provided text.
            if token_id == token_id_generated_this_iteration:
                sample_logprob.decoded_token = new_decoded_token_text
                continue

            # Other candidates are decoded lazily, skipping invalid ids.
            # NOTE: the sequence's pre-update offsets are reused on purpose
            # so each candidate is decoded from the same starting state.
            if (sample_logprob.decoded_token is None
                    and token_id != INVALID_TOKEN_ID):
                all_input_ids_with_logprob = previous_tokens + [token_id]
                (_, new_text, _, _) = detokenize_incrementally(
                    tokenizer=tokenizer,
                    all_input_ids=all_input_ids_with_logprob,
                    prev_tokens=seq.tokens,
                    prefix_offset=seq.prefix_offset,
                    read_offset=seq.read_offset,
                    skip_special_tokens=prms.skip_special_tokens,
                    spaces_between_special_tokens=prms.
                    spaces_between_special_tokens,
                )
                sample_logprob.decoded_token = new_text

    # Commit the incremental-detokenization state back onto the sequence
    # (must happen after all logprob decoding, which reads the old state).
    seq.tokens.extend(new_tokens)
    seq.prefix_offset = prefix_offset
    seq.read_offset = read_offset
    seq.output_text += new_decoded_token_text

    return len(new_decoded_token_text)
def _convert_tokens_to_string_with_added_encoders(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
output_tokens: List[str],
skip_special_tokens: bool,
spaces_between_special_tokens: bool,
) -> str:
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# NOTE(woosuk): The following code is slow because it runs a for loop over
# the output_tokens. In Python, running a for loop over a list can be slow
# even when the loop body is very simple.
sub_texts: List[str] = []
current_sub_text: List[str] = []
all_special_tokens = set(tokenizer.all_special_tokens)
for token in output_tokens:
if skip_special_tokens and token in all_special_tokens:
continue
if token in tokenizer.get_added_vocab():
if current_sub_text:
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
sub_texts.append(sub_text)
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
sub_texts.append(sub_text)
if spaces_between_special_tokens:
return " ".join(sub_texts)
else:
return "".join(sub_texts)
# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5


def convert_prompt_ids_to_tokens(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    prompt_ids: List[int],
    skip_special_tokens: bool = False,
) -> Tuple[List[str], int, int]:
    """Converts the prompt ids to tokens and returns the tokens and offsets
    for incremental detokenization.

    Only the tail of the prompt is converted to strings: incremental
    detokenization never needs to look further back than its window.
    """
    # Take a slightly larger window than the detokenization offset in case
    # some ids turn out to be (skipped) special tokens.
    window = prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:]
    new_tokens = tokenizer.convert_ids_to_tokens(
        window, skip_special_tokens=skip_special_tokens)
    read_offset = len(new_tokens)
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    return new_tokens, prefix_offset, read_offset
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.

    Returns:
        A tuple of (new tokens, new decoded text, new prefix offset,
        new read offset).
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset,
         read_offset) = convert_prompt_ids_to_tokens(
             tokenizer,
             all_input_ids[:-1],
             skip_special_tokens=skip_special_tokens)
    assert prev_tokens is not None

    # If the new token id is out of bounds, return an empty string.
    if new_token_id >= len(tokenizer):
        new_tokens = [""]
    else:
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    output_tokens = prev_tokens + new_tokens

    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        # Fast path: no added-vocab special casing needed.
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset

    new_text = new_text[len(prefix_text):]
    return new_tokens, new_text, read_offset, len(output_tokens)

View File

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers.dynamic_module_utils import get_class_from_dynamic_module
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: str | os.PathLike | None = None,
    force_download: bool = False,
    resume_download: bool | None = None,
    proxies: dict[str, str] | None = None,
    token: bool | str | None = None,
    revision: str | None = None,
    local_files_only: bool = False,
    repo_type: str | None = None,
    code_revision: str | None = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> type | None:
    """Best-effort wrapper around
    `transformers.dynamic_module_utils.get_class_from_dynamic_module`.

    Returns the resolved class, or None if loading fails for any reason.
    When `warn_on_fail` is set, the failure (with traceback) is logged.
    """
    try:
        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        if warn_on_fail:
            hub_name = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                hub_name,
                exc_info=True,
            )
        return None

View File

@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""GGUF utility functions."""
from functools import cache
from os import PathLike
from pathlib import Path
import gguf
import regex as re
from gguf.constants import Keys, VisionProjectorType
from gguf.quants import GGMLQuantizationType
from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
from vllm.logger import init_logger
from .repo_utils import list_filtered_repo_files
logger = init_logger(__name__)
@cache
def check_gguf_file(model: str | PathLike) -> bool:
"""Check if the file is a GGUF model."""
model = Path(model)
if not model.is_file():
return False
elif model.suffix == ".gguf":
return True
try:
with model.open("rb") as f:
header = f.read(4)
return header == b"GGUF"
except Exception as e:
logger.debug("Error reading file %s: %s", model, e)
return False
@cache
def is_remote_gguf(model: str | Path) -> bool:
    """Check if the model is a remote GGUF model."""
    repo_spec = str(model)
    # repo_id:quant_type, e.g. "Qwen/Qwen3-0.6B:Q4_K_M".
    pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$"
    if not re.fullmatch(pattern, repo_spec):
        return False
    quant_type = repo_spec.rsplit(":", 1)[1]
    return is_valid_gguf_quant_type(quant_type)
def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
    """Check if the quant type is a valid GGUF quant type."""
    # Valid quant types are exactly the GGMLQuantizationType member names.
    member = getattr(GGMLQuantizationType, gguf_quant_type, None)
    return member is not None
def split_remote_gguf(model: str | Path) -> tuple[str, str]:
    """Split the model into repo_id and quant type."""
    spec = str(model)
    if not is_remote_gguf(spec):
        raise ValueError(
            f"Wrong GGUF model or invalid GGUF quant type: {spec}.\n"
            "- It should be in repo_id:quant_type format.\n"
            f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}",
        )
    repo_id, quant_type = spec.rsplit(":", 1)
    return repo_id, quant_type
def is_gguf(model: str | Path) -> bool:
    """Check if the model is a GGUF model.

    Args:
        model: Model name, path, or Path object to check.

    Returns:
        True if the model is a local GGUF file or a remote
        ``repo_id:quant_type`` GGUF reference, False otherwise.
    """
    spec = str(model)
    # Local file first, then the remote repo_id:quant_type form.
    return check_gguf_file(spec) or is_remote_gguf(spec)
def detect_gguf_multimodal(model: str) -> Path | None:
"""Check if GGUF model has multimodal projector file.
Args:
model: Model path string
Returns:
Path to mmproj file if found, None otherwise
"""
if not model.endswith(".gguf"):
return None
try:
model_path = Path(model)
if not model_path.is_file():
return None
model_dir = model_path.parent
mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"]
for pattern in mmproj_patterns:
mmproj_files = list(model_dir.glob(pattern))
if mmproj_files:
return mmproj_files[0]
return None
except Exception:
return None
def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None":
    """Extract vision config parameters from mmproj.gguf metadata.

    Reads vision encoder configuration from GGUF metadata fields using
    standardized GGUF constants. Automatically detects the projector type
    (e.g., gemma3, llama4) and applies model-specific parameters accordingly.

    The function extracts standard CLIP vision parameters from GGUF metadata
    and applies projector-type-specific customizations. For unknown projector
    types, it uses safe defaults from SiglipVisionConfig.

    Args:
        mmproj_path: Path to mmproj.gguf file (str or Path)

    Returns:
        SiglipVisionConfig if extraction succeeds, None if any required
        field is missing from the GGUF metadata

    Raises:
        Exception: Exceptions from GGUF reading (file not found, corrupted
            file, etc.) propagate directly from gguf.GGUFReader
    """
    reader = gguf.GGUFReader(str(mmproj_path))

    # Detect projector type to apply model-specific parameters
    projector_type = None
    projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE)
    if projector_type_field:
        try:
            # The field value is stored as raw bytes in the last part.
            projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8")
        except (AttributeError, UnicodeDecodeError) as e:
            logger.warning("Failed to decode projector type from GGUF: %s", e)

    # Map GGUF field constants to SiglipVisionConfig parameters.
    # Uses official GGUF constants from gguf-py for standardization.
    # Format: {gguf_constant: (param_name, dtype)}
    VISION_CONFIG_FIELDS = {
        Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int),
        Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int),
        Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int),
        Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int),
        Keys.ClipVision.IMAGE_SIZE: ("image_size", int),
        Keys.ClipVision.PATCH_SIZE: ("patch_size", int),
        Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float),
    }

    # Extract and validate all required fields; any missing field aborts.
    config_params = {}
    for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items():
        field = reader.get_field(gguf_key)
        if field is None:
            logger.warning(
                "Missing required vision config field '%s' in mmproj.gguf",
                gguf_key,
            )
            return None
        # Extract scalar value from GGUF field and convert to target type
        config_params[param_name] = dtype(field.parts[-1])

    # Apply model-specific parameters based on projector type
    if projector_type == VisionProjectorType.GEMMA3:
        # Gemma3 doesn't use the vision pooling head (multihead attention)
        # This is a vLLM-specific parameter used in SiglipVisionTransformer
        config_params["vision_use_head"] = False
        logger.info("Detected Gemma3 projector, disabling vision pooling head")
    # Add other projector-type-specific customizations here as needed
    # elif projector_type == VisionProjectorType.LLAMA4:
    #     config_params["vision_use_head"] = ...

    # Create config with extracted parameters
    # Note: num_channels and attention_dropout use SiglipVisionConfig defaults
    # (3 and 0.0 respectively) which are correct for all models
    config = SiglipVisionConfig(**config_params)

    if projector_type:
        logger.info(
            "Extracted vision config from mmproj.gguf (projector_type: %s)",
            projector_type,
        )
    else:
        logger.info("Extracted vision config from mmproj.gguf metadata")
    return config
def maybe_patch_hf_config_from_gguf(
    model: str,
    hf_config: PretrainedConfig,
) -> PretrainedConfig:
    """Patch a HuggingFace config for GGUF checkpoints.

    When a companion ``mmproj.gguf`` projector file sits next to the model
    and the model is a Gemma3 variant, the text-only config is upgraded to a
    full ``Gemma3Config`` carrying the vision configuration extracted from
    the projector file. Otherwise the config is returned unchanged.

    Args:
        model: Model path string.
        hf_config: HuggingFace config to patch.

    Returns:
        The (possibly replaced) HuggingFace config.
    """
    mmproj_path = detect_gguf_multimodal(model)
    if mmproj_path is None:
        return hf_config

    vision_config = extract_vision_config_from_gguf(str(mmproj_path))
    text_config = hf_config.get_text_config()
    is_gemma3 = hf_config.model_type in ("gemma3", "gemma3_text")
    if vision_config is not None and is_gemma3:
        hf_config = Gemma3Config.from_text_vision_configs(
            text_config=text_config,
            vision_config=vision_config,
            architectures=["Gemma3ForConditionalGeneration"],
        )
    return hf_config
def get_gguf_file_path_from_hf(
    repo_id: str | Path,
    quant_type: str,
    revision: str | None = None,
) -> str:
    """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.

    Args:
        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
        revision: Optional revision/branch name

    Returns:
        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf").

    Raises:
        ValueError: If no file in the repo matches the quantization type.
    """
    repo_id = str(repo_id)
    # Match flat and nested layouts, with or without shard suffixes.
    gguf_patterns = [
        f"*-{quant_type}.gguf",
        f"*-{quant_type}-*.gguf",
        f"*/*-{quant_type}.gguf",
        f"*/*-{quant_type}-*.gguf",
    ]
    matching_files = list_filtered_repo_files(
        repo_id,
        allow_patterns=gguf_patterns,
        revision=revision,
    )
    if len(matching_files) == 0:
        # BUG FIX: ValueError does not %-format logging-style arguments; the
        # original raise produced a tuple with unfilled "%s" placeholders.
        raise ValueError(
            f"Could not find GGUF file for repo {repo_id} "
            f"with quantization {quant_type}."
        )
    # Sort to ensure consistent ordering (prefer non-sharded files)
    matching_files.sort(key=lambda x: (x.count("-"), x))
    gguf_filename = matching_files[0]
    return gguf_filename

View File

@@ -0,0 +1,424 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import inspect
from functools import lru_cache
from typing import TYPE_CHECKING, Any, cast, get_args, get_type_hints
from transformers import (
AutoFeatureExtractor,
AutoImageProcessor,
AutoProcessor,
AutoVideoProcessor,
)
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.image_processing_utils import BaseImageProcessor
from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar
from vllm.transformers_utils.gguf_utils import is_gguf
from vllm.transformers_utils.utils import convert_model_repo_to_path
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
from vllm.config import ModelConfig
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
class HashableDict(dict):
    """
    A dictionary that can be hashed by lru_cache.

    Plain dicts are deliberately unhashable; this subclass opts back in by
    hashing the (key, value) pairs as a frozenset, so equal contents hash
    identically regardless of insertion order.
    """

    def __hash__(self) -> int:  # type: ignore[override]
        items_snapshot = frozenset(self.items())
        return hash(items_snapshot)
class HashableList(list):
    """
    A list that can be hashed by lru_cache.

    Hashes as the equivalent tuple of elements (which must themselves be
    hashable).
    """

    def __hash__(self) -> int:  # type: ignore[override]
        return hash(tuple(self))
def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
    """Resolve the callable that will instantiate the given processor class."""
    # A tuple of candidates (or the generic mixin) defers resolution of the
    # concrete class to AutoProcessor.
    use_auto = isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin
    if use_auto:
        return AutoProcessor.from_pretrained
    # Standalone (non-HF) processor classes are constructed directly.
    return getattr(processor_cls, "from_pretrained", processor_cls)
@lru_cache
def _collect_dynamic_keys_from_processing_kwargs(kwargs_cls: type) -> set[str]:
dynamic_kwargs: set[str] = set()
if kwargs_cls is None:
return dynamic_kwargs
# get kwargs annotations in processor
# merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
kwargs_type_annotations = get_type_hints(kwargs_cls)
for kw_type in ("text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"):
if kw_type in kwargs_type_annotations:
kw_annotations = get_type_hints(kwargs_type_annotations[kw_type])
for kw_name in kw_annotations:
dynamic_kwargs.add(kw_name)
dynamic_kwargs |= {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
return dynamic_kwargs
def _merge_mm_kwargs(
    model_config: "ModelConfig",
    processor_cls: type | tuple[type, ...],
    /,
    **kwargs,
):
    """Merge caller kwargs with the model's mm-processor kwargs, keeping only
    those the processor factory accepts and wrapping unhashable values so the
    result is usable as an lru_cache key.
    """
    mm_config = model_config.get_multimodal_config()
    merged = mm_config.merge_mm_processor_kwargs(kwargs)

    factory = _get_processor_factory_fn(processor_cls)
    allowed = get_allowed_kwarg_only_overrides(
        factory,
        merged,
        requires_kw_only=False,
        allow_var_kwargs=True,
    )

    # NOTE: plain dicts/lists are unhashable and would break the
    # `cached_get_processor` lru_cache downstream, so wrap them.
    for name, value in allowed.items():
        if isinstance(value, dict):
            allowed[name] = HashableDict(value)
        if isinstance(value, list):
            allowed[name] = HashableList(value)

    return allowed
def get_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load a processor for the given model name via HuggingFace.

    Args:
        processor_name: Model repo ID or local path.
        *args: Positional arguments forwarded to the loader.
        revision: Repo revision; defaults to "main" when None.
        trust_remote_code: Allow executing code shipped with the model repo.
        processor_cls: Expected processor class (or tuple of classes); also
            determines which loading entry point is used.
        **kwargs: Extra keyword arguments forwarded to the loader.

    Raises:
        RuntimeError: If loading fails while ``trust_remote_code`` is False
            (enabling it may fix custom processors).
        TypeError: If the loaded processor is not an instance of
            ``processor_cls``.
    """
    if revision is None:
        revision = "main"
    try:
        processor_name = convert_model_repo_to_path(processor_name)
        if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
            # Generic request: let AutoProcessor resolve the concrete class.
            processor = AutoProcessor.from_pretrained(
                processor_name,
                *args,
                revision=revision,
                trust_remote_code=trust_remote_code,
                **kwargs,
            )
        elif issubclass(processor_cls, ProcessorMixin):
            processor = processor_cls.from_pretrained(
                processor_name,
                *args,
                revision=revision,
                trust_remote_code=trust_remote_code,
                **kwargs,
            )
        else:
            # Processors that are standalone classes unrelated to HF
            processor = processor_cls(*args, **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        else:
            raise e

    if not isinstance(processor, processor_cls):
        raise TypeError(
            "Invalid type of HuggingFace processor. "
            f"Expected type: {processor_cls}, but "
            f"found type: {type(processor)}"
        )

    return processor


# Memoized variant; all arguments must be hashable (see HashableDict/List).
cached_get_processor = lru_cache(get_processor)
@lru_cache
def get_processor_kwargs_from_processor(processor: _P) -> set[str]:
    """Collect the names of dynamic (per-call) kwargs a processor accepts.

    Returns an empty set when introspection fails for any reason.

    NOTE(review): lru_cache keyed on the processor instance keeps that
    instance alive for the cache's lifetime — presumably acceptable because
    processors are themselves cached module-wide; verify if this changes.
    """
    try:
        # get kwargs annotations in processor
        call_kwargs = inspect.signature(type(processor).__call__).parameters.get(
            "kwargs"
        )
        call_kwargs_annotations = call_kwargs.annotation if call_kwargs else None
        # if the processor has explicit kwargs annotation, use it
        if call_kwargs_annotations not in (None, inspect._empty):
            # get_type_hints will parse all type annotations at runtime,
            # and if an annotation refers to a type or
            # name that hasn't been imported or defined, it will raise an error.
            # So we use __annotations__ to get the raw annotations directly.
            return _collect_dynamic_keys_from_processing_kwargs(
                get_args(call_kwargs_annotations)[0]
            )
        # otherwise, try to get from ProcessingKwargs
        else:
            module_name = type(processor).__module__
            mod = importlib.import_module(module_name)
            # find *ProcessingKwargs in the module
            processor_kwargs: set[str] = set()
            for name, obj in vars(mod).items():
                if name.endswith("ProcessingKwargs"):
                    processor_kwargs = (
                        processor_kwargs
                        | _collect_dynamic_keys_from_processing_kwargs(obj)
                    )
            return processor_kwargs
    except Exception:
        # Best-effort: treat "unknown" as "no dynamic kwargs".
        return set()
def cached_get_processor_without_dynamic_kwargs(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load a (cached) processor while dropping per-call dynamic kwargs.

    A kwarg-free probe instance is loaded first to discover which kwargs are
    dynamic; those are filtered out before loading the final instance, which
    keeps the lru_cache key stable across calls.
    """
    # Step 1: probe instance with default kwargs only.
    probe = cached_get_processor(
        processor_name,
        revision=revision,
        trust_remote_code=trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
    )
    # Step 2: discover the dynamic keys from the probe.
    dynamic_keys = get_processor_kwargs_from_processor(probe)
    # Step 3: keep only the static kwargs.
    static_kwargs = {
        name: value for name, value in kwargs.items() if name not in dynamic_keys
    }
    # Step 4: load the final instance with the filtered kwargs.
    return cached_get_processor(
        processor_name,
        revision=revision,
        trust_remote_code=trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **static_kwargs,
    )
def cached_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load the (cached) processor described by a model config.

    GGUF checkpoints cannot carry a processor, so they are resolved to the
    original tokenizer repo instead.
    """
    if not is_gguf(model_config.model):
        source = model_config.model
        revision = model_config.revision
    else:
        assert not is_gguf(model_config.tokenizer), (
            "For multimodal GGUF models, the original tokenizer "
            "should be used to correctly load processor."
        )
        source = model_config.tokenizer
        revision = model_config.tokenizer_revision
    return cached_get_processor_without_dynamic_kwargs(
        source,
        revision=revision,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **_merge_mm_kwargs(model_config, processor_cls, **kwargs),
    )
def get_feature_extractor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an audio feature extractor for the given model name
    via HuggingFace.

    Args:
        processor_name: Model repo ID or local path.
        *args: Positional arguments forwarded to ``from_pretrained``.
        revision: Optional repo revision/branch.
        trust_remote_code: Allow executing code shipped with the model repo.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Raises:
        RuntimeError: If loading fails while ``trust_remote_code`` is False
            (enabling it may fix custom extractors).
    """
    try:
        # Resolve the repo reference to a loadable path (behavior defined by
        # convert_model_repo_to_path elsewhere in vllm.transformers_utils).
        processor_name = convert_model_repo_to_path(processor_name)
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the feature extractor. If the feature "
                "extractor is a custom extractor not yet available in the "
                "HuggingFace transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        else:
            raise e
    return cast(FeatureExtractionMixin, feature_extractor)


# Memoized variant; all arguments must be hashable.
cached_get_feature_extractor = lru_cache(get_feature_extractor)
def cached_feature_extractor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Load the (cached) audio feature extractor for a model config."""
    merged_kwargs = _merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs)
    return cached_get_feature_extractor(
        model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        **merged_kwargs,
    )
def get_image_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    Args:
        processor_name: Model repo ID or local path.
        *args: Positional arguments forwarded to ``from_pretrained``.
        revision: Optional repo revision/branch.
        trust_remote_code: Allow executing code shipped with the model repo.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Raises:
        RuntimeError: If loading fails while ``trust_remote_code`` is False
            (enabling it may fix custom processors).
    """
    try:
        # Resolve the repo reference to a loadable path (behavior defined by
        # convert_model_repo_to_path elsewhere in vllm.transformers_utils).
        processor_name = convert_model_repo_to_path(processor_name)
        processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the image processor. If the image processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(BaseImageProcessor, processor)


# Memoized variant; all arguments must be hashable.
cached_get_image_processor = lru_cache(get_image_processor)
def cached_image_processor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Load the (cached) image processor for a model config.

    GGUF checkpoints are resolved to the original tokenizer repo, since the
    GGUF file itself cannot carry an image processor.
    """
    if not is_gguf(model_config.model):
        source = model_config.model
        revision = model_config.revision
    else:
        assert not is_gguf(model_config.tokenizer), (
            "For multimodal GGUF models, the original tokenizer "
            "should be used to correctly load image processor."
        )
        source = model_config.tokenizer
        revision = model_config.tokenizer_revision
    return cached_get_image_processor(
        source,
        revision=revision,
        trust_remote_code=model_config.trust_remote_code,
        **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
    )
def get_video_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls_overrides: type[_V] | None = None,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    Args:
        processor_name: Model repo ID or local path.
        *args: Positional arguments forwarded to ``from_pretrained``.
        revision: Optional repo revision/branch.
        trust_remote_code: Allow executing code shipped with the model repo.
        processor_cls_overrides: Concrete processor class to use instead of
            AutoVideoProcessor resolution.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Raises:
        RuntimeError: If loading fails while ``trust_remote_code`` is False
            (enabling it may fix custom processors).
    """
    try:
        # Resolve the repo reference to a loadable path (behavior defined by
        # convert_model_repo_to_path elsewhere in vllm.transformers_utils).
        processor_name = convert_model_repo_to_path(processor_name)
        processor_cls = processor_cls_overrides or AutoVideoProcessor
        processor = processor_cls.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the video processor. If the video processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(BaseVideoProcessor, processor)


# Memoized variant; all arguments must be hashable.
cached_get_video_processor = lru_cache(get_video_processor)
def cached_video_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: type[_V] | None = None,
    **kwargs: Any,
):
    """Load the (cached) video processor for a model config."""
    merged_kwargs = _merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs)
    return cached_get_video_processor(
        model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
        **merged_kwargs,
    )

View File

@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-modal processors may be defined in this directory for the following
reasons:
- There is no processing file defined by HF Hub or Transformers library.
- There is a need to override the existing processor to support vLLM.
"""
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
__all__ = [
"BagelProcessor",
"DeepseekVLV2Processor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",
"OvisProcessor",
"Ovis2_5Processor",
]

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""BAGEL processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class BagelProcessor(ProcessorMixin):
    """
    Constructs a BAGEL processor which wraps a
    SigLIP image processor and a Qwen2 tokenizer.
    """

    # Sub-processors managed (loaded/saved) by ProcessorMixin.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput]
        | None = None,
        images: ImageInput | None = None,
        **kwargs,
    ):
        """
        Main method to prepare for the model one or several sequences(s) and image(s).

        Returns the tokenizer output (with ``pixel_values`` merged in when
        images are also given), the image-processor output when only images
        are given, or None when neither input is provided.
        """
        if images is not None:
            # Process images with the image processor
            # Ensure return_tensors is set to "pt" for PyTorch tensors
            image_kwargs = {**kwargs}
            if "return_tensors" not in image_kwargs:
                image_kwargs["return_tensors"] = "pt"
            pixel_values = self.image_processor(images, **image_kwargs)
        else:
            pixel_values = None

        text_inputs = self.tokenizer(text, **kwargs) if text is not None else None

        if pixel_values is not None and text_inputs is not None:
            # Merge image features into the text encoding.
            text_inputs["pixel_values"] = pixel_values["pixel_values"]
            return text_inputs
        elif pixel_values is not None:
            return pixel_values
        else:
            return text_inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's batch_decode.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's decode.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        # De-duplicated union of tokenizer and image-processor input names,
        # preserving order of first appearance.
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


# Make the processor discoverable via AutoProcessor.from_pretrained.
AutoProcessor.register("BagelProcessor", BagelProcessor)

View File

@@ -0,0 +1,438 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/image_process.py
import math
import torch
import torchvision.transforms as T
from PIL import Image, ImageOps
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
from transformers.processing_utils import ProcessorMixin
# TODO(Isotr0py): change modes for variants
# see: https://github.com/deepseek-ai/DeepSeek-OCR/blob/8cf003d38821fa1b19c73da3bd1b0dc262ea8136/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py#L1-L6
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
# Default "Gundam" mode configuration (see the variant table above).
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True
# TODO(Isotr0py): Expose as mm_kwargs
MIN_CROPS = 2
MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the (cols, rows) grid from *target_ratios* whose aspect ratio is
    closest to *aspect_ratio*.

    On a tie, the later candidate wins only if the source image carries enough
    pixels (more than half the area of the candidate tiling) to justify it.
    """
    best = (1, 1)
    smallest_diff = float("inf")
    pixels = width * height
    for candidate in target_ratios:
        diff = abs(aspect_ratio - candidate[0] / candidate[1])
        if diff < smallest_diff:
            smallest_diff = diff
            best = candidate
        elif diff == smallest_diff:
            if pixels > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
                best = candidate
    return best
def calculate_aspect_ratios(
    min_num: int = MIN_CROPS, max_num: int = MAX_CROPS
) -> list[tuple[int, int]]:
    """Enumerate all (cols, rows) tile grids whose tile count lies in
    ``[min_num, max_num]``, sorted by total tile count (ascending).

    Duplicate grids produced for different ``n`` collapse in the set.
    """
    # Set comprehension (C401) and a chained comparison replace the original
    # set(generator) + double-condition form; same result.
    target_ratios: set[tuple[int, int]] = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    return sorted(target_ratios, key=lambda ratio: ratio[0] * ratio[1])
def count_tiles(
    orig_width,
    orig_height,
    min_num=MIN_CROPS,
    max_num=MAX_CROPS,
    image_size=640,
    use_thumbnail=False,
):
    """Return the (cols, rows) tiling grid chosen for an image of the given size.

    ``use_thumbnail`` is accepted for signature parity with
    ``dynamic_preprocess`` but has no effect here.
    """
    candidates = calculate_aspect_ratios(min_num, max_num)
    # Pick the candidate grid whose aspect ratio best matches the image.
    return find_closest_aspect_ratio(
        orig_width / orig_height, candidates, orig_width, orig_height, image_size
    )
def dynamic_preprocess(
    image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
):
    """Split *image* into a grid of ``image_size`` x ``image_size`` tiles.

    The grid shape is chosen to best match the image's aspect ratio. Returns
    the list of tile images (plus an extra thumbnail when ``use_thumbnail``
    and more than one tile) and the chosen (cols, rows) grid.
    """
    orig_width, orig_height = image.size
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        calculate_aspect_ratios(min_num, max_num),
        orig_width,
        orig_height,
        image_size,
    )
    cols, rows = grid

    # Resize to an exact multiple of the tile size, then cut row-major.
    resized = image.resize((image_size * cols, image_size * rows))
    tiles = []
    for row in range(rows):
        for col in range(cols):
            left = col * image_size
            top = row * image_size
            tiles.append(
                resized.crop((left, top, left + image_size, top + image_size))
            )
    assert len(tiles) == cols * rows
    if use_thumbnail and len(tiles) != 1:
        # Append a squashed full-image thumbnail as a global view.
        tiles.append(image.resize((image_size, image_size)))
    return tiles, grid
class ImageTransform:
    """Callable that converts a PIL image to a tensor, optionally normalized."""

    def __init__(
        self,
        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
    ):
        self.mean = mean
        self.std = std
        self.normalize = normalize
        # Build the pipeline once; Normalize is appended only when requested.
        steps = [T.ToTensor()]
        if normalize:
            steps.append(T.Normalize(mean, std))
        self.transform = T.Compose(steps)

    def __call__(self, pil_img: Image.Image):
        return self.transform(pil_img)
class DeepseekOCRProcessor(ProcessorMixin):
    """Processor for DeepSeek-OCR: tokenizes prompts containing ``<image>``
    tags and prepares global-view and cropped-tile pixel tensors.

    NOTE(review): the ``patch_size`` and ``downsample_ratio`` constructor
    parameters are ignored — the values are hard-coded to 16 and 4 below.
    """

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]
    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        patch_size: int = 16,
        downsample_ratio: int = 4,
        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<▁pad▁>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):
        # Sizes come from the module-level mode constants, not from arguments.
        self.image_size = IMAGE_SIZE
        self.base_size = BASE_SIZE
        self.patch_size = 16
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = 4
        self.image_transform = ImageTransform(
            mean=image_mean, std=image_std, normalize=normalize
        )
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = "left"  # padding side matters for batch inference
        # add the pad_token as special token to use 'tokenizer.pad_token'
        # and 'tokenizer.pad_token_id'
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": pad_token})
        # add image token
        # NOTE(review): assumes image_token already exists in the vocab;
        # vocab.get returns None otherwise — TODO confirm upstream guarantees it.
        self.image_token_id = self.tokenizer.vocab.get(image_token)
        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id
        super().__init__(
            tokenizer,
            **kwargs,
        )
    @property
    def bos_id(self):
        # Beginning-of-sequence token id, delegated to the tokenizer.
        return self.tokenizer.bos_token_id
    @property
    def eos_id(self):
        # End-of-sequence token id, delegated to the tokenizer.
        return self.tokenizer.eos_token_id
    @property
    def pad_id(self):
        # Padding token id, delegated to the tokenizer.
        return self.tokenizer.pad_token_id
    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Encode *text* without tokenizer-added specials, optionally wrapping
        it with BOS/EOS ids."""
        t = self.tokenizer.encode(text, add_special_tokens=False)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t
    def decode(self, t: list[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        return self.tokenizer.decode(t, **kwargs)
    def process_one(
        self,
        prompt: str,
        images: list[Image.Image],
        crop_mode: bool = CROP_MODE,
    ):
        """
        Args:
            prompt (str): the formatted prompt;
            images (List[ImageType]): the list of images;
            crop_mode (bool): if True, then crop the image;
        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (List[int]): the number of image tokens
        """
        assert prompt is not None and images is not None, (
            "prompt and images must be used at the same time."
        )
        sft_format = prompt
        (
            input_ids,
            pixel_values,
            images_crop,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
            _,
        ) = self.tokenize_with_images(
            conversation=sft_format,
            images=images,
            bos=True,
            eos=True,
            cropping=crop_mode,
        )
        # Package everything as a BatchFeature of PyTorch tensors.
        prepare = BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_crop=images_crop,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )
        return prepare
    def __call__(
        self,
        *,
        prompt: str,
        images: list[Image.Image],
        crop_mode: bool = CROP_MODE,
        **kwargs,
    ):
        # Thin keyword-only wrapper around process_one.
        prepare = self.process_one(
            prompt=prompt,
            images=images,
            crop_mode=crop_mode,
        )
        return prepare
    def tokenize_with_images(
        self,
        conversation: str,
        images: list[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Splits the conversation on the image token, interleaving text token
        ids with runs of image-token placeholders sized to the global view
        (and, when the image is large enough and cropping is on, the local
        tile grid). Returns input ids, pixel tensors for global views and
        crops, a per-position image mask, per-image tile grids, per-image
        token counts, and the original image sizes.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)
        images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
            [],
            [],
            [],
            [],
        )
        image_shapes = []
        num_image_tokens = []
        tokenized_str = []
        for text_sep, image in zip(text_splits, images):
            # Text chunk preceding this image.
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)
            image_shapes.append(image.size)
            images_crop_raw = []
            # Small images are never tiled; large ones are tiled only when
            # cropping is enabled.
            if image.size[0] <= 640 and image.size[1] <= 640:
                crop_ratio = [1, 1]
            elif cropping:
                images_crop_raw, crop_ratio = dynamic_preprocess(
                    image, image_size=IMAGE_SIZE
                )
            else:
                crop_ratio = [1, 1]
            if self.image_size <= 640 and not cropping:
                image = image.resize((self.image_size, self.image_size))
            # Global view: pad to a base_size square using the mean color.
            global_view = ImageOps.pad(
                image,
                (self.base_size, self.base_size),
                color=tuple(int(x * 255) for x in self.image_transform.mean),
            )
            images_list.append(self.image_transform(global_view))
            num_width_tiles, num_height_tiles = crop_ratio
            images_spatial_crop.append([num_width_tiles, num_height_tiles])
            if num_width_tiles > 1 or num_height_tiles > 1:
                for cropped_image in images_crop_raw:
                    images_crop_list.append(self.image_transform(cropped_image))
            # Feature-map side lengths after vision patching + downsampling.
            num_queries = math.ceil(
                (self.image_size // self.patch_size) / self.downsample_ratio
            )
            num_queries_base = math.ceil(
                (self.base_size // self.patch_size) / self.downsample_ratio
            )
            # Global view: num_queries_base rows of (num_queries_base + 1)
            # placeholders (the +1 acts as a row separator).
            tokenized_image = (
                [self.image_token_id] * num_queries_base + [self.image_token_id]
            ) * num_queries_base
            # Separator between the global view and the local tiles.
            tokenized_image += [self.image_token_id]
            if num_width_tiles > 1 or num_height_tiles > 1:
                # Local tiles: one row separator per feature row.
                local_row = [self.image_token_id] * (num_queries * num_width_tiles + 1)
                tokenized_image += local_row * (num_queries * num_height_tiles)
            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))
        """process the last text split"""
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)
        """add the bos and eos tokens"""
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]
        assert len(tokenized_str) == len(images_seq_mask), (
            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} "
            f"is not equal to images_seq_mask's length {len(images_seq_mask)}."
        )
        # target_ids mirrors tokenized_str with image positions masked out.
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
                masked_tokenized_str.append(token_index)
            else:
                masked_tokenized_str.append(self.ignore_id)
        assert (
            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
        ), (
            f"tokenized_str's length {len(tokenized_str)}, "
            f"input_ids' length {len(masked_tokenized_str)}, "
            f"images_seq_mask's length {len(images_seq_mask)}, are not equal."
        )
        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
            self.ignore_id
        )
        input_ids[input_ids < 0] = self.pad_id
        # Remove the ending eos token
        assert input_ids[-1] == self.eos_id
        input_ids = input_ids[:-1]
        target_ids = target_ids[:-1]
        images_seq_mask = images_seq_mask[:-1]
        if len(images_list) == 0:
            # No images: emit empty tensors with the expected trailing dims.
            pixel_values = torch.zeros((0, 3, self.base_size, self.base_size))
            images_spatial_crop = torch.zeros((0, 2), dtype=torch.long)
            images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
            if images_crop_list:
                images_crop = torch.stack(images_crop_list, dim=0)
            else:
                images_crop = torch.zeros((0, 3, self.image_size, self.image_size))
        # Add a batch dimension of 1.
        input_ids = input_ids.unsqueeze(0)
        return (
            input_ids,
            pixel_values,
            images_crop,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
            image_shapes,
        )
AutoProcessor.register("DeepseekOCRProcessor", DeepseekOCRProcessor)

View File

@@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import math
from typing import Any
import torch
import torchvision.transforms as T
from PIL import Image, ImageOps
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
from transformers.processing_utils import ProcessorMixin
class ImageTransform:
    """Callable wrapper turning a PIL image into a (optionally normalized) tensor."""

    def __init__(
        self,
        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
    ):
        self.mean = mean
        self.std = std
        self.normalize = normalize
        # Assemble the torchvision pipeline up front.
        pipeline = [T.ToTensor()]
        if normalize:
            pipeline.append(T.Normalize(mean, std))
        self.transform = T.Compose(pipeline)

    def __call__(self, pil_img: Image.Image):
        return self.transform(pil_img)
class DeepseekVLV2Processor(ProcessorMixin):
    """Processor for DeepSeek-VL2: tokenizes prompts with ``<image>`` tags and
    prepares global + local (anyres-cropped) pixel tensors per image."""

    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    attributes = ["tokenizer"]
    def __init__(
        self,
        tokenizer: LlamaTokenizerFast,
        candidate_resolutions: tuple[tuple[int, int]],
        patch_size: int,
        downsample_ratio: int,
        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
        normalize: bool = True,
        image_token: str = "<image>",
        pad_token: str = "<▁pad▁>",
        add_special_token: bool = False,
        sft_format: str = "deepseek",
        mask_prompt: bool = True,
        ignore_id: int = -100,
        **kwargs,
    ):
        self.candidate_resolutions = candidate_resolutions
        # Tile side length taken from the first candidate resolution.
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio
        self.image_transform = ImageTransform(
            mean=image_mean, std=image_std, normalize=normalize
        )
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = "left"  # padding side matters for batch inference
        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
        if tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": pad_token})
        # add image token (only if it is not already in the vocab)
        image_token_id = self.tokenizer.vocab.get(image_token)
        if image_token_id is None:
            special_tokens = [image_token]
            special_tokens_dict = {"additional_special_tokens": special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
        self.image_token_id = self.tokenizer.vocab.get(image_token)
        # add five special tokens for grounding-related tasks
        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)
        # add special tokens for SFT data
        special_tokens = ["<|User|>", "<|Assistant|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)
        self.image_token = image_token
        self.pad_token = pad_token
        self.add_special_token = add_special_token
        self.sft_format = sft_format
        self.mask_prompt = mask_prompt
        self.ignore_id = ignore_id
        super().__init__(
            tokenizer,
            **kwargs,
        )
    def select_best_resolution(self, image_size):
        # used for cropping
        # Choose the candidate resolution maximizing effective resolution and,
        # on ties, minimizing wasted pixels.
        original_width, original_height = image_size
        best_fit = None
        max_effective_resolution = 0
        min_wasted_resolution = float("inf")
        for width, height in self.candidate_resolutions:
            scale = min(width / original_width, height / original_height)
            downscaled_width, downscaled_height = (
                int(original_width * scale),
                int(original_height * scale),
            )
            effective_resolution = min(
                downscaled_width * downscaled_height, original_width * original_height
            )
            wasted_resolution = (width * height) - effective_resolution
            if effective_resolution > max_effective_resolution or (
                effective_resolution == max_effective_resolution
                and wasted_resolution < min_wasted_resolution
            ):
                max_effective_resolution = effective_resolution
                min_wasted_resolution = wasted_resolution
                best_fit = (width, height)
        return best_fit
    @property
    def bos_id(self):
        # Beginning-of-sequence token id, delegated to the tokenizer.
        return self.tokenizer.bos_token_id
    @property
    def eos_id(self):
        # End-of-sequence token id, delegated to the tokenizer.
        return self.tokenizer.eos_token_id
    @property
    def pad_id(self):
        # Padding token id, delegated to the tokenizer.
        return self.tokenizer.pad_token_id
    def encode(self, text: str, bos: bool = True, eos: bool = False):
        """Encode *text* without tokenizer-added specials, optionally wrapping
        it with BOS/EOS ids."""
        t = self.tokenizer.encode(text, add_special_tokens=False)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t
    def decode(self, t: list[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        return self.tokenizer.decode(t, **kwargs)
    def process_one(
        self,
        prompt: str,
        images: list[Image.Image],
        inference_mode: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            prompt (str): the formatted prompt;
            images (list[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs: Additional keyword arguments.
        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - target_ids (torch.LongTensor): [N + image tokens]
                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (list[int]): the number of image tokens
        """
        assert prompt is not None and images is not None, (
            "prompt and images must be used at the same time."
        )
        sft_format = prompt
        # Cropping is only applied for small image counts (<= 2).
        (
            tokenized_str,
            images_list,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
        ) = self.tokenize_with_images(
            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
        )
        # target_ids mirrors tokenized_str with image positions masked out.
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
                masked_tokenized_str.append(token_index)
            else:
                masked_tokenized_str.append(self.ignore_id)
        assert (
            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
        ), (
            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
        )
        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
            self.ignore_id
        )
        input_ids[input_ids < 0] = self.pad_id
        if inference_mode:
            # Remove the ending eos token
            assert input_ids[-1] == self.eos_id
            input_ids = input_ids[:-1]
            target_ids = target_ids[:-1]
            images_seq_mask = images_seq_mask[:-1]
        if len(images_list) == 0:
            # No images: emit single zero tensors with the expected dims.
            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
        else:
            pixel_values = torch.stack(images_list, dim=0)
            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
        # Add a batch dimension of 1.
        input_ids = input_ids.unsqueeze(0)
        prepare = BatchFeature(
            data=dict(
                input_ids=input_ids,
                pixel_values=pixel_values,
                images_seq_mask=images_seq_mask,
                images_spatial_crop=images_spatial_crop,
                num_image_tokens=num_image_tokens,
            ),
            tensor_type="pt",
        )
        return prepare
    def __call__(
        self,
        *,
        text: str,
        images: list[Image.Image],
        inference_mode: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            text (str): the formatted prompt;
            images (list[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
            **kwargs:
        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
                - input_ids (torch.LongTensor): [N + image tokens]
                - images (torch.FloatTensor): [n_images, 3, H, W]
                - image_id (int): the id of the image token
                - num_image_tokens (list[int]): the number of image tokens
        """
        prepare = self.process_one(
            prompt=text,
            images=images,
            inference_mode=inference_mode,
        )
        return prepare
    def tokenize_with_images(
        self,
        conversation: str,
        images: list[Image.Image],
        bos: bool = True,
        eos: bool = True,
        cropping: bool = True,
    ):
        """Tokenize text with <image> tags.

        Each image contributes a padded global view plus anyres local tiles;
        the token stream holds matching runs of image-token placeholders.
        """
        assert conversation.count(self.image_token) == len(images)
        text_splits = conversation.split(self.image_token)
        images_list, images_seq_mask, images_spatial_crop = [], [], []
        num_image_tokens = []
        tokenized_str = []
        for text_sep, image in zip(text_splits, images):
            """encode text_sep"""
            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
            tokenized_str += tokenized_sep
            images_seq_mask += [False] * len(tokenized_sep)
            """select best resolution for anyres"""
            if cropping:
                best_width, best_height = self.select_best_resolution(image.size)
            else:
                best_width, best_height = self.image_size, self.image_size
            """process the global view"""
            global_view = ImageOps.pad(
                image,
                (self.image_size, self.image_size),
                color=tuple(int(x * 255) for x in self.image_transform.mean),
            )
            images_list.append(self.image_transform(global_view))
            """process the local views"""
            local_view = ImageOps.pad(
                image,
                (best_width, best_height),
                color=tuple(int(x * 255) for x in self.image_transform.mean),
            )
            for i in range(0, best_height, self.image_size):
                for j in range(0, best_width, self.image_size):
                    images_list.append(
                        self.image_transform(
                            local_view.crop(
                                (j, i, j + self.image_size, i + self.image_size)
                            )
                        )
                    )
            """record height / width crop num"""
            num_width_tiles, num_height_tiles = (
                best_width // self.image_size,
                best_height // self.image_size,
            )
            images_spatial_crop.append([num_width_tiles, num_height_tiles])
            """add image tokens"""
            # Feature-map side length after vision patching + downsampling.
            h = w = math.ceil(
                (self.image_size // self.patch_size) / self.downsample_ratio
            )
            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
            tokenized_image += (
                [self.image_token_id]
                * (num_height_tiles * h)
                * (num_width_tiles * w + 1)
            )
            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
            num_image_tokens.append(len(tokenized_image))
        """process the last text split"""
        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)
        """add the bos and eos tokens"""
        if bos:
            tokenized_str = [self.bos_id] + tokenized_str
            images_seq_mask = [False] + images_seq_mask
        if eos:
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]
        assert len(tokenized_str) == len(images_seq_mask), (
            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
        )
        return (
            tokenized_str,
            images_list,
            images_seq_mask,
            images_spatial_crop,
            num_image_tokens,
        )
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

View File

@@ -0,0 +1,233 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
import numpy as np
import torch
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.video_utils import VideoInput
class HunYuanVLProcessor(ProcessorMixin):
    """Processor for HunYuan-VL: expands ``<image>`` tags into per-patch image
    tokens and builds 4-row (text/w/h/t) position ids for image regions.

    NOTE(review): several token ids (image/start/end/pad) are hard-coded
    rather than read from the tokenizer — see the inline TODO comments.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"  # ("AutoTokenizer", None)
    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        chat_template=None,
        **kwargs,
    ):
        # TODO Fix the init
        self.tokenizer = tokenizer
        self.image_token_id = 120120  # self.tokenizer.image_token_id
        self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
        self.im_start_token_id = 120118  # self.tokenizer.im_start_id
        self.im_start_token = self.tokenizer.convert_ids_to_tokens(
            self.im_start_token_id
        )
        self.im_end_token_id = 120119  # self.tokenizer.im_end_id
        self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
        # Last vocab entry is used as a temporary placeholder during expansion.
        self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.vocab_size - 1
        )
        self.pad_id = 120002  # self.tokenizer.pad_token_id
        super().__init__(
            image_processor, tokenizer, video_processor, chat_template=chat_template
        )
    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize *text*, expanding each ``<image>`` tag into the number of
        image tokens implied by that image's patch grid, and attach image
        features, 4-row position ids, attention mask, and image-span indices.

        NOTE(review): *videos* is accepted but unused in this method.
        """
        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images)
            image_grid_thw = image_inputs["image_grid_thw"]
        if not isinstance(text, list):
            text = [text]
        text = text.copy()  # below lines change text in-place
        # Running total of image tokens, used to locate each image's start.
        image_tokens_cumsum = [0]
        if images is not None:
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    grid_h, grid_w = image_grid_thw[index][-2:]
                    patch_h = grid_h // self.image_processor.merge_size
                    patch_w = grid_w // self.image_processor.merge_size
                    # patch_h rows of (patch_w + 1) tokens (row separator)
                    # plus 2 delimiters.
                    num_image_tokens = patch_h * (patch_w + 1) + 2
                    image_tokens_cumsum.append(
                        image_tokens_cumsum[-1] + num_image_tokens
                    )
                    # text[i] = text[i].replace(self.image_token, self.im_start_token + self.placeholder_token * num_image_tokens + self.im_end_token, 1) # noqa: E501
                    # Use a placeholder first so already-expanded runs are not
                    # matched again by the while-loop condition.
                    text[i] = text[i].replace(
                        self.image_token, self.placeholder_token * num_image_tokens, 1
                    )
                    index += 1
                text[i] = text[i].replace(self.placeholder_token, self.image_token)
                # text[i] = self.tokenizer.bos_token + text[i]
        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
        input_ids = text_inputs["input_ids"]
        # Four parallel position rows: sequential text positions plus
        # width/height/time coordinates overwritten inside image regions.
        position_ids = torch.arange(len(input_ids[0]))
        position_ids_w = torch.arange(len(input_ids[0]))
        position_ids_h = torch.arange(len(input_ids[0]))
        position_ids_t = torch.arange(len(input_ids[0]))
        if images is not None:
            image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
                0
            ]
            for i in range(len(image_grid_thw)):
                grid_h, grid_w = image_grid_thw[i][-2:]
                patch_h = grid_h // self.image_processor.merge_size
                patch_w = grid_w // self.image_processor.merge_size
                # First token of image i within the flattened image-token list.
                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
                replace_num = (patch_w + 1) * patch_h
                position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64
                )
                patch_h_list = []
                for h in range(patch_h):
                    patch_h_list += [h] * (patch_w + 1)
                position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
                    patch_h_list, dtype=torch.int64
                )
                position_ids_t[start_pos : start_pos + replace_num] = 0
        position_ids = torch.stack(
            [position_ids, position_ids_w, position_ids_h, position_ids_t]
        ).unsqueeze(0)
        text_inputs["position_ids"] = position_ids
        attention_mask = input_ids.ne(self.pad_id)
        text_inputs["attention_mask"] = attention_mask
        # Per-sequence [start, end) index pairs of the image token spans.
        text_inputs["imgs_pos"] = [self.get_imgs_pos(e) for e in input_ids]
        # image_inputs["imgs"] = [[image_inputs["pixel_values"]]]
        return_tensors = kwargs.pop("return_tensors", None)
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)
    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)
    def post_process_image_text_to_text(
        self,
        generated_outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        # Deliberately unimplemented: fails loudly if ever called.
        assert 0
    def apply_chat_template(self, *args, **kwargs):
        # Delegate chat templating entirely to the tokenizer.
        token_ids = self.tokenizer.apply_chat_template(*args, **kwargs)
        return token_ids
    def get_imgs_pos(self, doc_ids):
        """Return [start, end) index pairs for each image span in *doc_ids*,
        located via the image start/end marker token ids."""
        doc_ids = np.array(doc_ids, dtype=np.int64)
        img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
        img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
        imgs_pos = np.concatenate(
            (
                np.reshape(img_begin_index + 1, (-1, 1)),
                np.reshape(img_end_index, (-1, 1)),
            ),
            axis=-1,
        ).tolist()
        return imgs_pos
    @property
    def model_input_names(self):
        # Union of tokenizer and image-processor input names, de-duplicated.
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
def split_image_into_patch_blocks(
    pixel_values: torch.Tensor,  # shape: [batch_size, 3, H, W]
    patch_size: int = 16,  # e.g. 16
    adaptor_patch_div: int = 4,  # each patch axis splits into patch_size // adaptor_patch_div chunks of length adaptor_patch_div  # noqa: E501
) -> torch.Tensor:
    """
    Rearrange a batch of images into per-patch tensors of size
    [3, patch_size, patch_size].

    Each image is divided into an (H // patch_size) x (W // patch_size) grid
    of patches. Within every patch, pixels are regrouped by the reshape /
    permute below into `adaptor_patch_div`-sized sub-chunks (a pixel-shuffle
    style reordering), so the output patches are rearrangements of the input
    pixels rather than contiguous crops.

    Args:
        pixel_values: Input image tensor of shape [batch_size, 3, H, W].
        patch_size: Side length of a patch, e.g. 16. Must divide H and W.
        adaptor_patch_div: Sub-chunk size; must divide patch_size.
    Returns:
        patches: A tensor of shape [N, 3, patch_size, patch_size] where
            N = batch_size * (H // patch_size) * (W // patch_size).
            (The original docstring's extra `(patch_size // adaptor_patch_div)**2`
            factor was wrong: the element count fixes N to one output patch per
            input patch.)
    """  # noqa: E501
    batch_size, channels, height, width = pixel_values.shape
    assert channels == 3, "Pixel values must have 3 channels in dim=1"
    assert height % patch_size == 0 and width % patch_size == 0, (
        "H and W must be divisible by patch_size"
    )
    # Required for the reshape below; previously an un-checked precondition
    # that surfaced as an opaque reshape error.
    assert patch_size % adaptor_patch_div == 0, (
        "patch_size must be divisible by adaptor_patch_div"
    )
    patch_height_num = height // patch_size
    patch_width_num = width // patch_size
    # Reshape to [B, 3, ph, ps, pw, ps]
    img = pixel_values.reshape(
        batch_size, 3, patch_height_num, patch_size, patch_width_num, patch_size
    )
    # Further split each ps x ps patch axis into (ps // aps) chunks of aps
    img = img.reshape(
        batch_size,
        3,
        patch_height_num,
        patch_size // adaptor_patch_div,  # ps // aps
        adaptor_patch_div,
        patch_width_num,
        patch_size // adaptor_patch_div,  # ps // aps
        adaptor_patch_div,
    )
    # Permute to group the sub-chunks: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
    img = img.permute(0, 2, 5, 3, 6, 1, 4, 7)
    # Collapse into [B * ph * pw, 3, patch_size, patch_size]
    patches = img.reshape(-1, 3, patch_size, patch_size)
    return patches
AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)

View File

@@ -0,0 +1,477 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/image_processing_hunyuan_vl.py
"""Image processor class for HunYuanVL."""
# isort conflicts with ruff for transformers imports
# isort: skip_file
import math
import numpy as np
import torchvision.transforms as transforms
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
convert_to_rgb,
)
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
make_flat_list_of_images,
make_list_of_images,
valid_images,
validate_preprocess_arguments,
)
from transformers.utils import TensorType, logging
from transformers.video_utils import VideoInput, make_batched_videos
logger = logging.get_logger(__name__)
def smart_resize(
    height: int,
    width: int,
    factor: int = 16,
    min_pixels: int = 512 * 512,
    max_pixels: int = 2048 * 2048,
):
    """Compute a target (height, width) for resizing an image.

    The returned dimensions satisfy three properties:
    1. Both are divisible by ``factor``.
    2. Their product lies within ``[min_pixels, max_pixels]``.
    3. The input aspect ratio is preserved as closely as possible.

    Raises:
        ValueError: if the aspect ratio of the input exceeds 200.
    """
    ratio = max(height, width) / min(height, width)
    if ratio > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {ratio}"
        )
    # Start from the nearest multiples of `factor`.
    out_h = round(height / factor) * factor
    out_w = round(width / factor) * factor
    area = out_h * out_w
    if area > max_pixels:
        # Too many pixels: shrink both sides (rounding down), but never
        # below one full `factor`.
        scale = math.sqrt((height * width) / max_pixels)
        out_h = max(factor, math.floor(height / scale / factor) * factor)
        out_w = max(factor, math.floor(width / scale / factor) * factor)
    elif area < min_pixels:
        # Too few pixels: grow both sides, rounding up.
        scale = math.sqrt(min_pixels / (height * width))
        out_h = math.ceil(height * scale / factor) * factor
        out_w = math.ceil(width * scale / factor) * factor
    return out_h, out_w
class HunYuanVLImageProcessor(BaseImageProcessor):
    """Image processor for HunYuanVL.

    Images are resized via ``smart_resize`` so both sides are multiples of
    ``patch_size * merge_size`` and the pixel count lies within
    ``[min_pixels, max_pixels]``, then flattened into one row per spatial
    patch of length ``channels * patch_size * patch_size``.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        size: dict[str, int] | None = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: int | float = 1 / 255,
        do_normalize: bool = True,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        do_convert_rgb: bool = True,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        patch_size: int = 16,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if size is not None and (
            "shortest_edge" not in size or "longest_edge" not in size
        ):
            raise ValueError(
                "size must contain 'shortest_edge' and 'longest_edge' keys."
            )
        elif size is None:
            # BUGFIX: this branch used to be a bare `else`, which also ran for
            # a *valid* user-provided `size` and silently replaced it with the
            # defaults.
            size = {"shortest_edge": 512 * 512, "longest_edge": 2048 * 2048}
        # backward compatibility: override size with min_pixels and max_pixels
        # if they are provided.
        if min_pixels is not None:
            size["shortest_edge"] = min_pixels
        if max_pixels is not None:
            size["longest_edge"] = max_pixels
        self.min_pixels = size["shortest_edge"]
        self.max_pixels = size["longest_edge"]
        self.size = size
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.do_convert_rgb = do_convert_rgb

    def _preprocess(
        self,
        images: ImageInput | VideoInput,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        resample: PILImageResampling = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        patch_size: int = 16,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        do_convert_rgb: bool | None = None,
        data_format: ChannelDimension | None = ChannelDimension.FIRST,
        input_data_format: str | ChannelDimension | None = None,
    ):
        """
        Preprocess an image (or the frames of one video) into flattened patches.
        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size of the vision encoder to llm encoder.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image.
        Returns:
            A tuple of the flattened patches of shape
            `[grid_h * grid_w, channel * patch_size * patch_size]` and the
            `(grid_t, grid_h, grid_w)` grid.
        """  # noqa: E501
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        # All frames are assumed to share the first frame's size (PIL inputs).
        width, height = images[0].width, images[0].height
        resized_width, resized_height = width, height
        # Resolve pixel bounds once. BUGFIX: the per-call `size` argument used
        # to be accepted here but silently ignored in favor of `self.*`.
        if size is not None:
            min_pixels = size["shortest_edge"]
            max_pixels = size["longest_edge"]
        else:
            min_pixels = self.min_pixels
            max_pixels = self.max_pixels
        # BUGFIX: likewise, the resolved `image_mean` / `image_std` arguments
        # were ignored in favor of `self.image_mean` / `self.image_std`.
        mean = image_mean if image_mean is not None else self.image_mean
        std = image_std if image_std is not None else self.image_std
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height=height,
                    width=width,
                    factor=patch_size * merge_size,
                    min_pixels=min_pixels,
                    max_pixels=max_pixels,
                )
                # NOTE(review): `resample` is not forwarded to PIL here, so
                # PIL's default filter is used — confirm intended.
                image = image.resize((resized_width, resized_height))
            if do_normalize:
                # ToTensor already rescales uint8 input to [0, 1], so
                # `do_rescale` / `rescale_factor` are not applied separately
                # on this path.
                image = transforms.Compose(
                    [
                        transforms.ToTensor(),
                        transforms.Normalize(mean, std),
                    ]
                )(image)
            processed_images.append(image)
        patches = np.array(processed_images)
        channel = patches.shape[1]
        # NOTE(review): for a single image with temporal_patch_size=2 this
        # yields grid_t == 0 — confirm downstream only consumes grid_h/grid_w.
        grid_t = patches.shape[0] // temporal_patch_size
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        # Split into merge_size x merge_size groups of patch_size x patch_size
        # patches; the leading 1 assumes a single (possibly stacked) frame.
        patches = patches.reshape(
            1,
            channel,
            grid_h // merge_size,
            merge_size,
            patch_size,
            grid_w // merge_size,
            merge_size,
            patch_size,
        )
        patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7)
        flatten_patches = patches.reshape(
            1 * grid_h * grid_w, channel * patch_size * patch_size
        )
        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool | None = None,
        size: dict[str, int] | None = None,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        resample: PILImageResampling = None,
        do_rescale: bool | None = None,
        rescale_factor: float | None = None,
        do_normalize: bool | None = None,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        patch_size: int | None = None,
        temporal_patch_size: int | None = None,
        merge_size: int | None = None,
        do_convert_rgb: bool | None = None,
        return_tensors: str | TensorType | None = None,
        data_format: ChannelDimension | None = ChannelDimension.FIRST,
        input_data_format: str | ChannelDimension | None = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess (deprecated; see warning below). Expects a single or batch of videos with pixel
                values ranging from 0 to 255. If passing in videos with pixel values between 0 and 1, set
                `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Pixel-count bounds after resizing, given as `shortest_edge` (min pixels) and `longest_edge`
                (max pixels).
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
                The min pixels of the image to resize the image.
            max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
                The max pixels of the image to resize the image.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                The spatial patch size of the vision encoder.
            temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
                The temporal patch size of the vision encoder.
            merge_size (`int`, *optional*, defaults to `self.merge_size`):
                The merge size of the vision encoder to llm encoder.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image.
        """  # noqa: E501
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        if size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError(
                    "size must contain 'shortest_edge' and 'longest_edge' keys."
                )
            min_pixels = size["shortest_edge"]
            # BUGFIX: `longest_edge` was validated but never applied, so an
            # explicit `size` argument could not change the upper bound.
            max_pixels = size["longest_edge"]
        elif min_pixels is not None and max_pixels is not None:
            # backward compatibility: override size with min_pixels and max_pixels
            # if they are provided.
            size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
        else:
            size = {**self.size}
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        patch_size = patch_size if patch_size is not None else self.patch_size
        temporal_patch_size = (
            temporal_patch_size
            if temporal_patch_size is not None
            else self.temporal_patch_size
        )
        merge_size = merge_size if merge_size is not None else self.merge_size
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )
        if images is not None:
            images = make_flat_list_of_images(images)
        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                # `patches` is 2D; extend() appends its per-patch rows.
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update(
                {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
            )
        # kept for BC only and should be removed after v5.0
        if videos is not None:
            logger.warning(
                "`HunYuanVLV1ImageProcessor` works only with image inputs "
                "and doesn't process videos anymore. "
                "This is a deprecated behavior and will be removed in v5.0. "
                "Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. "
            )
            videos = make_batched_videos(videos)
            pixel_values_videos, vision_grid_thws_videos = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    size=size,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    patch_size=patch_size,
                    temporal_patch_size=temporal_patch_size,
                    merge_size=merge_size,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values_videos.extend(patches)
                vision_grid_thws_videos.append(video_grid_thw)
            data.update(
                {
                    "pixel_values_videos": np.array(pixel_values_videos),
                    "video_grid_thw": np.array(vision_grid_thws_videos),
                }
            )
        return BatchFeature(data=data, tensor_type=return_tensors)

    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
        """
        A utility that returns number of image patches for a given image size.
        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*):
                Any kwargs to override defaults of the image processor.
        Returns:
            `int`: Number of image patches per image.
        """
        # BUGFIX: `images_kwargs` defaults to None but was previously probed
        # with `in` / indexing directly, raising TypeError for the default.
        images_kwargs = images_kwargs or {}
        min_pixels = images_kwargs.get("min_pixels", self.size["shortest_edge"])
        max_pixels = images_kwargs.get("max_pixels", self.size["longest_edge"])
        patch_size = images_kwargs.get("patch_size", self.patch_size)
        merge_size = images_kwargs.get("merge_size", self.merge_size)
        factor = patch_size * merge_size
        resized_height, resized_width = smart_resize(
            height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
        )
        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
        # HunYuanVL-specific count: one extra token per grid row plus two
        # boundary tokens — presumably row separators and begin/end markers;
        # confirm against the model's placeholder construction.
        return grid_h * (grid_w + 1) + 2
# Register with AutoImageProcessor so checkpoints naming
# "HunYuanVLImageProcessor" resolve to this class.
AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor)

View File

@@ -0,0 +1,453 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import cached_property
import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode
# Public API of this module.
__all__ = ["OvisProcessor"]
# Conventional "ignore" label id (matches torch.nn.CrossEntropyLoss's default
# ignore_index); not referenced in this visible chunk — confirm usage.
IGNORE_ID = -100
class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    # Default processing kwargs, merged with caller overrides via
    # `ProcessorMixin._merge_kwargs` in `OvisProcessor.__call__`.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            # Max number of sub-image tiles per image (grid rows*cols <= 9).
            "max_partition": 9,
            # Minimum covered fraction for a tiling grid to count as "good".
            "covering_threshold": 0.9,
            "convert_to_rgb": True,
            "return_tensors": "pt",
        },
    }
class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_pad_token=None,
        image_segment_len=255,
        **kwargs,
    ):
        # Literal placeholder marking image positions in raw text prompts.
        self.image_token = "<image>"
        # Token *string* used to pad image regions; its vocab id is resolved
        # lazily in `extra_special_tokens` (needs the tokenizer to exist).
        self.image_pad_token = image_pad_token
        # Number of extra pad tokens inserted after each image atom token.
        self.image_segment_len = image_segment_len
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    @cached_property
    def extra_special_tokens(self):
        # Map of symbolic image-structure markers to ids. The negative ids are
        # internal sentinels (not real vocab entries); only "image_pad"
        # resolves to an actual tokenizer vocab id.
        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
        extra_special_tokens = {
            "image_token": -200,
            "image_atom": -300,
            "image_start": -301,
            "image_prefix": -302,
            "image_col_sep": -303,
            "image_row_sep": -304,
            "image_end": -305,
            "image_pad": image_pad_token_id,
        }
        return extra_special_tokens

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[OvisProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        `preprocess_image` if `images` is not `None`.
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model, with each `<image>` occurrence expanded to
              its placeholder token sequence. Returned when `text` is not `None`.
            - **pixel_values** -- Per-image pixel tensors. Returned when both `images` and `text` are not `None`.
            - **grids** -- Tiling grid (rows, cols) chosen for each image. Returned when both `images` and `text`
              are not `None`.
        """
        # Merge the class defaults (`OvisProcessorKwargs._defaults`) with any
        # caller-supplied overrides.
        output_kwargs = self._merge_kwargs(
            OvisProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # Process all images first
        image_features = {}
        if images is not None:
            processed_images = []
            image_placeholders_list = []
            grids = []
            # Process each image
            for image in images if isinstance(images, list) else [images]:
                pixel_values, image_placeholders, grid = self.preprocess_image(
                    image=image, **output_kwargs["images_kwargs"]
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)
            # assign all processed images
            if processed_images:
                image_features["image_placeholders"] = image_placeholders_list
        # Process text input
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            tokenized_batched_text = self._tokenize_with_image_symbol(text)
            image_token_id = self.get_token_value("image_token")
            replaced_ids_list = []
            # `idx` indexes placeholder sequences across the whole batch: each
            # <image> occurrence consumes the next preprocessed image in order.
            idx = 0
            for ids_tensor in tokenized_batched_text:
                if (
                    image_token_id in ids_tensor
                    and "image_placeholders" in image_features
                ):
                    if idx < len(image_features["image_placeholders"]):
                        # Converts in list for ease of use
                        ids_list = ids_tensor.tolist()
                        new_ids = []
                        # replace placeholders
                        for i, token_id in enumerate(ids_list):
                            if token_id == image_token_id:
                                placeholder_ids = image_features["image_placeholders"][
                                    idx
                                ]
                                new_ids.extend(placeholder_ids)
                                idx += 1
                            else:
                                new_ids.append(token_id)
                        # Converts back to tensors
                        ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                    else:
                        raise RuntimeError(
                            "Mismatch between the images you provided and the number of placeholder present in the text"
                        )
                replaced_ids_list.append(ids_tensor)
            # NOTE(review): torch.stack requires all sequences in the batch to
            # have equal length after placeholder expansion; ragged batches
            # would raise here — confirm callers only pass uniform batches.
            if replaced_ids_list:
                replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
            else:
                replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
            # Create the output with text features
            output = BatchFeature(
                data={
                    "input_ids": replaced_and_tokenized_ids,
                }
            )
            # Add image features if present
            if image_features:
                output["pixel_values"] = processed_images
                output["grids"] = grids
            return output
        # If only images were provided
        # NOTE(review): this path returns only "image_placeholders"; the
        # computed `processed_images` and `grids` are dropped — confirm
        # image-only calls are not expected to yield pixel values.
        return BatchFeature(data=image_features)

    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
        # Tokenize each text around its `<image>` markers, re-inserting the
        # internal image_token sentinel id between the tokenized chunks.
        batch_token_ids = []
        for text in text_list:
            text_chunks = [
                self.tokenizer(chunk, add_special_tokens=False).input_ids
                for chunk in text.split(self.image_token)
            ]
            token_ids = []
            num_chuck = len(text_chunks)
            for i, chunk in enumerate(text_chunks):
                token_ids.extend(chunk)
                if i < num_chuck - 1:
                    token_ids.append(self.get_token_value("image_token"))
            batch_token_ids.append(token_ids)
        # NOTE(review): building one tensor from per-sample lists requires all
        # samples to have equal token length; ragged input would raise here.
        return torch.tensor(batch_token_ids, dtype=torch.long)

    def get_image_size(self):
        # Derive the (square) working side length from the wrapped image
        # processor's `size` config; supports both config layouts.
        size = self.image_processor.size
        if "shortest_edge" in size:
            width = height = size["shortest_edge"]
        elif "height" in size and "width" in size:
            width = size["width"]
            height = size["height"]
        else:
            raise ValueError("Can't parse image size from image_processor config.")
        return height, width

    def get_token_value(self, tok):
        # Look up the id of a symbolic image-structure token (see
        # `extra_special_tokens`).
        return self.extra_special_tokens[tok]

    def construct_image_indicators(self, grid):
        # Build the symbolic token layout for one image: start/atom/prefix,
        # then (for multi-tile grids) one atom per tile with column/row
        # separators, then the end marker.
        image_placeholders = [
            self.get_token_value("image_start"),
            self.get_token_value("image_atom"),
            self.get_token_value("image_prefix"),
        ]
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
                    image_placeholders.append(self.get_token_value("image_atom"))
                    if c < grid[1] - 1:
                        image_placeholders.append(self.get_token_value("image_col_sep"))
                if r < grid[0] - 1:
                    image_placeholders.append(self.get_token_value("image_row_sep"))
        image_placeholders.append(self.get_token_value("image_end"))
        return image_placeholders

    def construct_image_placeholders(self, grid):
        # Expand the symbolic layout into real pad-token ids: one pad per
        # indicator token, plus `image_segment_len` extra pads after each
        # image atom (so the final sequence consists solely of pad ids).
        image_placeholders = self.construct_image_indicators(grid)
        image_atom_token_id = self.get_token_value("image_atom")
        # Extract the padding token ID from tokenizer
        image_padding_token_id = self.get_token_value("image_pad")
        # Create a new list with padding tokens inserted
        padded_placeholder_tokens = []
        for token in image_placeholders:
            padded_placeholder_tokens.append(image_padding_token_id)
            if token == image_atom_token_id:
                padded_placeholder_tokens.extend(
                    [image_padding_token_id] * self.image_segment_len
                )
        return padded_placeholder_tokens

    def preprocess_image(
        self,
        image: PIL.Image.Image,
        max_partition,
        covering_threshold,
        convert_to_rgb,
        return_tensors,
    ):
        # Tile one image into a grid of crops, preprocess each crop into a
        # padded square tensor, and return (pixel_values, placeholder ids,
        # chosen grid).
        def _preprocess(img: PIL.Image.Image, side):
            # Resize so the longer side equals `side` (aspect preserved),
            # run the wrapped image processor, then center-pad to a square.
            # first resize and preprocess
            w, h = img.size
            if w == h:
                new_width = new_height = side
            elif w > h:
                new_width = side
                new_height = int(h / w * new_width)
            else:
                new_height = side
                new_width = int(w / h * new_height)
            new_size = dict(height=new_height, width=new_width)
            pixel_values = self.image_processor.preprocess(
                img, size=new_size, return_tensors=return_tensors
            )["pixel_values"]
            # then pad to square
            square_values = torch.zeros(
                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
            )
            new_height, new_width = pixel_values.shape[2:]
            if new_height == new_width:
                square_values[:, :, :, :] = pixel_values
            elif new_height > new_width:
                from_index = (side - new_width) // 2
                square_values[:, :, :, from_index : from_index + new_width] = (
                    pixel_values
                )
            else:
                from_index = (side - new_height) // 2
                square_values[:, :, from_index : from_index + new_height, :] = (
                    pixel_values
                )
            return square_values

        def _partition(img, grid) -> list[tuple[int, int, int, int]]:
            # Split the image into grid[0] x grid[1] crop boxes; the last
            # row/column absorbs any remainder pixels.
            w, h = img.size
            row_height = h // grid[0]
            col_width = w // grid[1]
            partition = []
            for row in range(grid[0]):
                for col in range(grid[1]):
                    left = col * col_width
                    upper = row * row_height
                    right = w if col == grid[1] - 1 else (col + 1) * col_width
                    lower = h if row == grid[0] - 1 else (row + 1) * row_height
                    partition.append((left, upper, right, lower))
            return partition

        def _covering_area(left, upper, right, lower, side):
            # Area of a crop after scaling its longer edge down to `side`
            # (no upscaling): how much useful content the crop contributes.
            w = right - left
            h = lower - upper
            w, h = max(w, h), min(w, h)
            if w > side:
                h = h / w * side
                w = side
            return w * h

        def _get_best_grid(img, side):
            # Try every grid with at most `max_partition` tiles and score it
            # by the fraction of image area its crops cover at `side`.
            img_area = img.size[0] * img.size[1]
            candidate_grids = []
            for i in range(1, max_partition + 1):
                for j in range(1, max_partition + 1):
                    if i * j <= max_partition:
                        candidate_grids.append((i, j))
            all_grids = []
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
                covering_ratio = (
                    sum([_covering_area(*p, side) for p in partition]) / img_area
                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
                    good_grids.append((grid, covering_ratio))
            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
                    0
                ]
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

        if convert_to_rgb:
            image = convert_image_mode(image, "RGB")
        sides = self.get_image_size()
        if sides[0] != sides[1]:
            raise ValueError("get_image_size() returns non-square size")
        side = sides[0]
        grid = _get_best_grid(image, side)
        partition = _partition(image, grid)
        crops = [image.crop(p) for p in partition]
        # For multi-tile grids, prepend the full image as a global view.
        if len(crops) > 1:
            crops.insert(0, image)
        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
        image_placeholders = self.construct_image_placeholders(grid)
        # NOTE(review): `pixel_values` is already a tensor (torch.cat above);
        # torch.tensor(...) re-copies it and emits a UserWarning — confirm
        # whether the copy is intentional.
        return torch.tensor(pixel_values), image_placeholders, torch.tensor(grid)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.
        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        # Union of tokenizer and image-processor input names (order-preserving
        # de-duplication via dict.fromkeys), plus "second_per_grid_ts".
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]
# Make the processor discoverable via `AutoProcessor.from_pretrained` for
# checkpoints whose config names "OvisProcessor" as the processor class.
AutoProcessor.register("OvisProcessor", OvisProcessor)

View File

@@ -0,0 +1,468 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from functools import cached_property
import numpy as np
import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
__all__ = ["Ovis2_5Processor"]
IMAGE_TOKEN = "<image>"
VIDEO_TOKEN = "<video>"
MIN_PIXELS = 448 * 448
MAX_PIXELS = 1792 * 1792
class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Typed keyword arguments accepted by `Ovis2_5Processor.__call__`.

    `_defaults` seeds the per-modality settings when callers do not
    override them: no padding for text; RGB conversion plus the global
    pixel budget (MIN_PIXELS..MAX_PIXELS) for images and videos.
    """
    # Per-modality defaults merged in by `ProcessorMixin._merge_kwargs`.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "convert_to_rgb": True,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS,
        },
        "videos_kwargs": {
            "convert_to_rgb": True,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS,
        },
    }
class Ovis2_5Processor(ProcessorMixin):
r"""
Constructs an Ovis processor which wraps an Ovis image processor
and a Qwen2 tokenizer into a single processor.
[`OvisProcessor`] offers all the functionalities of
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
for more information.
Args:
image_processor ([`Qwen2VLImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`Qwen2TokenizerFast`], *optional*):
The tokenizer is a required input.
chat_template (`str`, *optional*): A Jinja template which will
be used to convert lists of messages in a chat into
a tokenizable string.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template", "image_pad_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(
self,
image_processor=None,
tokenizer=None,
chat_template=None,
image_pad_token=None,
patch_size=16,
hidden_stride=2,
temporal_patch_size=1,
**kwargs,
):
self.image_token = IMAGE_TOKEN
self.video_token = VIDEO_TOKEN
self.image_pad_token = "<|image_pad|>"
self.patch_size = patch_size
self.hidden_stride = hidden_stride
self.temporal_patch_size = temporal_patch_size
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@cached_property
def extra_special_tokens(self):
image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
extra_special_tokens = {
"image_token": -200,
"video_token": -201,
"visual_atom": -300,
"image_start": -301,
"image_end": -302,
"video_start": -303,
"video_end": -304,
"image_pad": image_pad_token_id,
}
return extra_special_tokens
    def __call__(
        self,
        images: ImageInput = None,
        videos: np.ndarray | list[ImageInput] = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[Ovis2_5ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s)
        and image(s). This method forwards the `text` and `kwargs` arguments
        to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text`
        is not `None` to encode the text. To prepare the vision inputs,
        this method forwards the `vision_infos` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`]
        if `vision_infos` is not `None`.
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`,
                `list[PIL.Image.Image]`, `list[np.ndarray]`,
                `list[torch.Tensor]`):
                The image or batch of images to be prepared.
                Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats
                are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded.
                Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as
                list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with
                a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`,
                `list[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video
                can be a 4D NumPy array or PyTorch tensor, or a nested
                list of 3D frames. Both channels-first and channels-last
                formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework.
                Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- list of token ids to be fed to a model.
              Returned when `text` is not `None`.
            - **attention_mask** -- list of indices specifying which tokens
              should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"*
              is in `self.model_input_names` and if `text` is not `None`).
            - **pixel_values** -- Pixel values to be fed to a model.
              Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to
              a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- list of image 3D grid in LLM. Returned
              when `images` is not `None`.
            - **video_grid_thw** -- list of video 3D grid in LLM. Returned
              when `videos` is not `None`.
            - **second_per_grid_ts** -- list of video seconds per time grid.
              Returned when `videos` is not `None`.
        """
        # Merge caller kwargs with the per-modality defaults declared on
        # `Ovis2_5ProcessorKwargs`.
        output_kwargs = self._merge_kwargs(
            Ovis2_5ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # Process all images first
        visual_features = {}
        output = BatchFeature()
        if images is not None:
            processed_images = []
            image_placeholders_list = []
            grids = []
            # Process each image
            for image in images if isinstance(images, list) else [images]:
                # Returns (flattened patches, placeholder token ids, grid).
                pixel_values, image_placeholders, grid = self.preprocess_multidata(
                    images=image, **output_kwargs["images_kwargs"]
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)
            # assign all processed images
            if processed_images:
                visual_features["image_placeholders"] = image_placeholders_list
                output["pixel_values"] = processed_images
                output["grids"] = grids
        if videos is not None:
            processed_videos = []
            videos_placeholders_list = []
            grids = []
            # Process each video
            for video in videos if isinstance(videos, list) else [videos]:
                pixel_values, video_placeholders, grid = self.preprocess_multidata(
                    video=video, **output_kwargs["videos_kwargs"]
                )
                processed_videos.append(pixel_values)
                videos_placeholders_list.append(video_placeholders)
                grids.append(grid)
            # assign all processed videos
            if processed_videos:
                visual_features["video_placeholders"] = videos_placeholders_list
                output["video_pixel_values"] = processed_videos
                output["video_grids"] = grids
        # Process text input
        if text is not None:
            if not isinstance(text, list):
                text = [text]
            # Tokenize with <image>/<video> markers mapped to internal
            # negative sentinel ids.
            tokenized_batched_text = self._tokenize_with_visual_symbol(text)
            image_token_id = self.get_token_value("image_token")
            video_token_id = self.get_token_value("video_token")
            replaced_ids_list = []
            # Placeholder lists are consumed in order across the whole
            # batch: each sentinel occurrence advances the running index.
            image_idx = 0
            video_idx = 0
            for ids_tensor in tokenized_batched_text:
                has_image_tokens = (
                    image_token_id in ids_tensor
                    and "image_placeholders" in visual_features
                    and image_idx < len(visual_features["image_placeholders"])
                )
                has_video_tokens = (
                    video_token_id in ids_tensor
                    and "video_placeholders" in visual_features
                    and video_idx < len(visual_features["video_placeholders"])
                )
                if has_image_tokens or has_video_tokens:
                    # Convert to list for easier manipulation
                    ids_list = ids_tensor.tolist()
                    new_ids = []
                    # Replace placeholders
                    for token_id in ids_list:
                        if token_id == image_token_id:
                            new_ids.extend(
                                visual_features["image_placeholders"][image_idx]
                            )
                            image_idx += 1
                        elif token_id == video_token_id:
                            new_ids.extend(
                                visual_features["video_placeholders"][video_idx]
                            )
                            video_idx += 1
                        else:
                            new_ids.append(token_id)
                    # Convert back to tensor
                    ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                replaced_ids_list.append(ids_tensor)
            if replaced_ids_list:
                # NOTE(review): torch.stack requires every sequence in the
                # batch to have equal length after placeholder expansion;
                # ragged batches would raise here — presumably batch size
                # is 1 in practice. TODO confirm with callers.
                replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
            else:
                replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
            output["input_ids"] = replaced_and_tokenized_ids
            return output
        # If only images were provided
        return BatchFeature(data=visual_features)
def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
token_ids = []
video_token_id = self.get_token_value("video_token")
image_token_id = self.get_token_value("image_token")
video_split_texts = text.split(self.video_token)
for j, video_segment in enumerate(video_split_texts):
image_split_texts = video_segment.split(self.image_token)
text_chunks = [
self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in image_split_texts
]
segment_tokens = []
for i, chunk in enumerate(text_chunks):
segment_tokens.extend(chunk)
if i < len(text_chunks) - 1:
segment_tokens.append(image_token_id)
token_ids.extend(segment_tokens)
if j < len(video_split_texts) - 1:
token_ids.append(video_token_id)
batch_token_ids.append(token_ids)
return torch.tensor(batch_token_ids, dtype=torch.long)
# Copied from qwen2_vl
def smart_resize(
self,
height: int,
width: int,
factor: int = 28,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
):
"""Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range
['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if height < factor or width < factor:
print(
f"height:{height} or width:{width} must be larger than factor:{factor}"
)
if height < width:
width = round(factor / height * width)
height = factor
else:
height = round(factor / width * height)
width = factor
elif max(height, width) / min(height, width) > 200:
print(
f"absolute aspect ratio must be smaller than 200, "
f"got {max(height, width) / min(height, width)}"
)
if height > width:
height = 200 * width
else:
width = 200 * height
h_bar = round(height / factor) * factor
w_bar = round(width / factor) * factor
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = math.floor(height / beta / factor) * factor
w_bar = math.floor(width / beta / factor) * factor
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = math.ceil(height * beta / factor) * factor
w_bar = math.ceil(width * beta / factor) * factor
return h_bar, w_bar
def get_token_value(self, tok):
return self.extra_special_tokens[tok]
def construct_visual_indicators(self, grid, is_video: bool = False):
if is_video:
start_token = self.get_token_value("video_start")
end_token = self.get_token_value("video_end")
else:
start_token = self.get_token_value("image_start")
end_token = self.get_token_value("image_end")
image_placeholders = [start_token, self.get_token_value("visual_atom")]
if grid[0] * grid[1] > 1:
for r in range(grid[0]):
for c in range(grid[1]):
image_placeholders.append(self.get_token_value("visual_atom"))
image_placeholders.append(end_token)
return image_placeholders
def construct_visual_placeholders(self, grid, is_video: bool = False):
visual_placeholders = self.construct_visual_indicators((1, 1), is_video)
image_atom_token_id = self.get_token_value("visual_atom")
# Extract the padding token ID from tokenizer
image_padding_token_id = self.get_token_value("image_pad")
num_image_atoms = grid[0] * grid[1] * grid[2]
num_image_atoms //= self.hidden_stride**2
num_image_atoms //= self.temporal_patch_size
# Create a new list with padding tokens inserted
padded_placeholder_tokens = []
for token in visual_placeholders:
if token == image_atom_token_id:
padded_placeholder_tokens.extend(
[image_padding_token_id] * num_image_atoms
)
else:
padded_placeholder_tokens.append(image_padding_token_id)
return padded_placeholder_tokens
    def preprocess_multidata(
        self,
        images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
        video: list[PIL.Image.Image] | np.ndarray | None = None,
        convert_to_rgb: bool | None = True,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        return_tensors: str | None = "pt",
    ):
        """Preprocess one image or one video into flattened vision patches.

        Returns a tuple of:
        - flattened patches, shape ``(t*h*w, c * temporal_patch_size *
          patch_size**2)``;
        - the placeholder token ids for this input;
        - the grid as a ``(1, 3)`` tensor holding ``[t, h, w]``.

        NOTE(review): `return_tensors` is accepted but never read in this
        body — the outputs are always torch tensors. TODO confirm intent.
        """
        is_video = False
        if images is not None:
            if not isinstance(images, list):
                images = [images]
        elif video is not None:
            is_video = True
            # type of video in dummy_mm_data is np.ndarray
            if isinstance(video, np.ndarray):
                # Convert each (H, W, C) frame to a PIL image.
                images = []
                for i in range(video.shape[0]):
                    image = PIL.Image.fromarray(video[i].astype(np.uint8))
                    images.append(image)
            elif isinstance(video, list):
                images = video
        else:
            raise ValueError("Either images or video should be provided.")
        # Clamp min_pixels so it never exceeds max_pixels.
        min_pixels = min(
            max_pixels if max_pixels is not None else MAX_PIXELS,
            min_pixels if min_pixels is not None else MIN_PIXELS,
        )
        images = [
            image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image
            for image in images
        ]
        # All frames are resized using the FIRST frame's dimensions; PIL's
        # `size` is (width, height).
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            # Target size: multiples of patch_size*hidden_stride within
            # the pixel budget (loop-invariant; recomputed per frame).
            resized_height, resized_width = self.smart_resize(
                height,
                width,
                factor=self.patch_size * self.hidden_stride,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
            )
            new_size = dict(height=resized_height, width=resized_width)
            image_pt = self.image_processor.preprocess(
                image, size=new_size, return_tensors="np"
            )["pixel_values"][0]
            processed_images.append(image_pt)
        patches = np.array(processed_images)
        # Pad the frame axis (by repeating the last frame) to a multiple of
        # temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            num_to_pad = self.temporal_patch_size - (
                patches.shape[0] % self.temporal_patch_size
            )
            repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h = resized_height // self.patch_size
        grid_w = resized_width // self.patch_size
        # Split (t, c, H, W) into (t, tp, c, h-blocks, stride, p, w-blocks,
        # stride, p) so spatially-adjacent patches can be grouped.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.hidden_stride,
            self.hidden_stride,
            self.patch_size,
            grid_w // self.hidden_stride,
            self.hidden_stride,
            self.patch_size,
        )
        # Reorder so each hidden_stride x hidden_stride neighborhood is
        # contiguous before flattening.
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        )
        visual_placeholders = self.construct_visual_placeholders(
            [grid_t, grid_h, grid_w], is_video
        )
        return (
            torch.tensor(flatten_patches),
            visual_placeholders,
            torch.tensor([[grid_t, grid_h, grid_w]]),
        )
# Make the processor discoverable via `AutoProcessor.from_pretrained` for
# checkpoints whose config names "Ovis2_5Processor" as the processor class.
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)

View File

@@ -0,0 +1,287 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utilities for model repo interaction."""
import fnmatch
import json
import os
import time
from collections.abc import Callable
from functools import cache
from pathlib import Path
from typing import TypeVar
import huggingface_hub
from huggingface_hub import (
hf_hub_download,
try_to_load_from_cache,
)
from huggingface_hub import list_repo_files as hf_list_repo_files
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from vllm import envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def _get_hf_token() -> str | None:
"""
Get the HuggingFace token from environment variable.
Returns None if the token is not set, is an empty string,
or contains only whitespace.
This follows the same pattern as huggingface_hub library which
treats empty string tokens as None to avoid authentication errors.
"""
token = os.getenv("HF_TOKEN")
if token and token.strip():
return token
return None
_R = TypeVar("_R")


def with_retry(
    func: Callable[[], _R],
    log_msg: str,
    max_retries: int = 2,
    retry_delay: int = 2,
) -> _R:
    """Call `func`, retrying up to `max_retries` times in total.

    The delay doubles after each failed attempt. Intermediate failures are
    logged and retried; the final failure is logged and re-raised.
    """
    delay = retry_delay
    for attempt in range(1, max_retries + 1):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries:
                logger.error("%s: %s", log_msg, e)
                raise
            logger.error(
                "%s: %s, retrying %d of %d", log_msg, e, attempt, max_retries
            )
            time.sleep(delay)
            delay *= 2
    raise AssertionError("Should not be reached")
# @cache doesn't cache exceptions
@cache
def list_repo_files(
    repo_id: str,
    *,
    revision: str | None = None,
    repo_type: str | None = None,
    token: str | bool | None = None,
) -> list[str]:
    """List all files in a local directory, ModelScope repo, or HF Hub repo.

    Results are memoized per argument tuple; failed lookups are retried
    (via `with_retry`) and, because exceptions are not cached, retried
    again on the next call.
    """
    def lookup_files() -> list[str]:
        # directly list files if model is local
        if (local_path := Path(repo_id)).exists():
            return [
                str(file.relative_to(local_path))
                for file in local_path.rglob("*")
                if file.is_file()
            ]
        # if model is remote, use hf_hub api to list files
        try:
            if envs.VLLM_USE_MODELSCOPE:
                from vllm.transformers_utils.utils import modelscope_list_repo_files
                return modelscope_list_repo_files(
                    repo_id,
                    revision=revision,
                    token=os.getenv("MODELSCOPE_API_TOKEN", None),
                )
            return hf_list_repo_files(
                repo_id, revision=revision, repo_type=repo_type, token=token
            )
        except huggingface_hub.errors.OfflineModeIsEnabled:
            # Don't raise in offline mode,
            # all we know is that we don't have this
            # file cached.
            return []
    return with_retry(lookup_files, "Error retrieving file list")
def list_filtered_repo_files(
    model_name_or_path: str,
    allow_patterns: list[str],
    revision: str | None = None,
    repo_type: str | None = None,
    token: str | bool | None = None,
) -> list[str]:
    """List repo files whose *basenames* match any of `allow_patterns`.

    Args:
        model_name_or_path: Local path or remote repo id.
        allow_patterns: fnmatch-style patterns applied to basenames only,
            so e.g. "*.json" matches files in any subdirectory.
        revision/repo_type/token: Forwarded to `list_repo_files`.

    Returns:
        Matching file paths; a file matching several patterns appears once
        per pattern. An empty list is returned (after logging) when the
        listing itself fails.
    """
    try:
        all_files = list_repo_files(
            repo_id=model_name_or_path,
            revision=revision,
            token=token,
            repo_type=repo_type,
        )
    except Exception:
        # Fix: the original message ran `model_name_or_path` and
        # `repo_type` together with no separator between the two
        # implicitly-concatenated string literals.
        logger.error(
            "Error retrieving file list. Please ensure your "
            "`model_name_or_path`, `repo_type`, `token` and `revision` "
            "arguments are correctly set. Returning an empty list."
        )
        return []
    file_list = []
    # Filter patterns on filenames
    for pattern in allow_patterns:
        file_list.extend(
            [
                file
                for file in all_files
                if fnmatch.fnmatch(os.path.basename(file), pattern)
            ]
        )
    return file_list
def file_exists(
    repo_id: str,
    file_name: str,
    *,
    repo_type: str | None = None,
    revision: str | None = None,
    token: str | bool | None = None,
) -> bool:
    """Whether `file_name` appears in the repo's (cached) file listing."""
    return file_name in list_repo_files(
        repo_id, repo_type=repo_type, revision=revision, token=token
    )
# In offline mode the result can be a false negative
def file_or_path_exists(
model: str | Path, config_name: str, revision: str | None
) -> bool:
if (local_path := Path(model)).exists():
return (local_path / config_name).is_file()
# Offline mode support: Check if config file is cached already
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=config_name, revision=revision
)
if isinstance(cached_filepath, str):
# The config file exists in cache - we can continue trying to load
return True
# NB: file_exists will only check for the existence of the config file on
# hf_hub. This will fail in offline mode.
# Call HF to check if the file exists
return file_exists(
str(model), config_name, revision=revision, token=_get_hf_token()
)
def get_model_path(model: str | Path, revision: str | None = None):
    """Resolve `model` to a local filesystem path.

    Returns `model` unchanged when it already exists on disk; otherwise
    resolves the snapshot from the local download cache (ModelScope or
    HF Hub, depending on VLLM_USE_MODELSCOPE).
    """
    if os.path.exists(model):
        return model
    # NOTE(review): this branch is only expected in offline mode (snapshot
    # already cached); `assert` is stripped under `python -O`, so it is a
    # debug-time guard rather than input validation.
    assert huggingface_hub.constants.HF_HUB_OFFLINE
    common_kwargs = {
        "local_files_only": huggingface_hub.constants.HF_HUB_OFFLINE,
        "revision": revision,
    }
    if envs.VLLM_USE_MODELSCOPE:
        # Lazy import: modelscope is an optional dependency.
        from modelscope.hub.snapshot_download import snapshot_download
        return snapshot_download(model_id=model, **common_kwargs)
    from huggingface_hub import snapshot_download
    return snapshot_download(repo_id=model, **common_kwargs)
def get_hf_file_bytes(
    file_name: str, model: str | Path, revision: str | None = "main"
) -> bytes | None:
    """Return the raw bytes of `file_name` from a local path/cache or the Hub."""
    path = try_get_local_file(model=model, file_name=file_name, revision=revision)
    if path is None:
        # Not available locally — download from the Hub.
        downloaded = hf_hub_download(
            model, file_name, revision=revision, token=_get_hf_token()
        )
        path = Path(downloaded)
    if path is not None and path.is_file():
        return path.read_bytes()
    return None
def try_get_local_file(
model: str | Path, file_name: str, revision: str | None = "main"
) -> Path | None:
file_path = Path(model) / file_name
if file_path.is_file():
return file_path
else:
try:
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=file_name, revision=revision
)
if isinstance(cached_filepath, str):
return Path(cached_filepath)
except ValueError:
...
return None
def get_hf_file_to_dict(
    file_name: str, model: str | Path, revision: str | None = "main"
):
    """
    Downloads a file from the Hugging Face Hub and returns
    its contents as a dictionary.
    Parameters:
    - file_name (str): The name of the file to download.
    - model (str): The name of the model on the Hugging Face Hub.
    - revision (str): The specific version of the model.
    Returns:
    - config_dict (dict): A dictionary containing the contents of the
      downloaded file, or None when the file cannot be found or fetched.
    """
    file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
    if file_path is None:
        try:
            hf_hub_file = hf_hub_download(model, file_name, revision=revision)
        except huggingface_hub.errors.OfflineModeIsEnabled:
            return None
        except (
            RepositoryNotFoundError,
            RevisionNotFoundError,
            EntryNotFoundError,
            LocalEntryNotFoundError,
        ) as e:
            # Fix: the original passed `e` as a lazy-format argument with no
            # corresponding %s placeholder in the message, producing a
            # malformed logging call.
            logger.debug("File or repository not found in hf_hub_download: %s", e)
            return None
        except HfHubHTTPError as e:
            logger.warning(
                "Cannot connect to Hugging Face Hub. Skipping file download for '%s':",
                file_name,
                exc_info=e,
            )
            return None
        file_path = Path(hf_hub_file)
    if file_path is not None and file_path.is_file():
        with open(file_path) as file:
            return json.load(file)
    return None

View File

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import os
import shutil
import signal
from vllm import envs
from vllm.assets.base import get_cache_dir
from vllm.logger import init_logger
from vllm.utils.import_utils import PlaceholderModule
logger = init_logger(__name__)
SUPPORTED_SCHEMES = ["s3://", "gs://"]
try:
from runai_model_streamer import list_safetensors as runai_list_safetensors
from runai_model_streamer import pull_files as runai_pull_files
except ImportError:
runai_model_streamer = PlaceholderModule("runai_model_streamer") # type: ignore[assignment]
runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
runai_list_safetensors = runai_model_streamer.placeholder_attr("list_safetensors")
def list_safetensors(path: str = "") -> list[str]:
    """Enumerate safetensors objects under `path` via the Run:ai streamer.

    Args:
        path: Object-storage prefix (e.g. ``s3://bucket/model/``) to list.

    Returns:
        Full object-storage paths of the safetensors files found.
    """
    found = runai_list_safetensors(path)
    return found
def is_runai_obj_uri(model_or_path: str) -> bool:
    """True when the path uses a supported object-storage scheme (s3/gs)."""
    lowered = model_or_path.lower()
    return any(lowered.startswith(scheme) for scheme in SUPPORTED_SCHEMES)
class ObjectStorageModel:
    """
    A class representing an ObjectStorage model mirrored into a
    temporary directory.
    Attributes:
        dir: The temporary created directory.
    Methods:
        pull_files(): Pull model from object storage to the temporary directory.
    """
    def __init__(self, url: str) -> None:
        # Optionally install SIGINT/SIGTERM handlers that remove the cache
        # directory on shutdown, chaining to any pre-existing handler.
        if envs.VLLM_ASSETS_CACHE_MODEL_CLEAN:
            for sig in (signal.SIGINT, signal.SIGTERM):
                existing_handler = signal.getsignal(sig)
                signal.signal(sig, self._close_by_signal(existing_handler))
        # Cache dir keyed by a short hash of the URL so the same model maps
        # to a stable location across runs.
        dir_name = os.path.join(
            get_cache_dir(),
            "model_streamer",
            hashlib.sha256(str(url).encode()).hexdigest()[:8],
        )
        # Start from a clean directory: any stale mirror is discarded.
        if os.path.exists(dir_name):
            shutil.rmtree(dir_name)
        os.makedirs(dir_name)
        self.dir = dir_name
        logger.debug("Init object storage, model cache path is: %s", dir_name)
    def _close(self) -> None:
        # Best-effort removal of the mirror directory.
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)
    def _close_by_signal(self, existing_handler=None):
        # Wrap cleanup around the previously registered handler so external
        # signal behavior is preserved.
        def new_handler(signum, frame):
            self._close()
            if existing_handler:
                existing_handler(signum, frame)
        return new_handler
    def pull_files(
        self,
        model_path: str = "",
        allow_pattern: list[str] | None = None,
        ignore_pattern: list[str] | None = None,
    ) -> None:
        """
        Pull files from object storage into the temporary directory.
        Args:
            model_path: The object storage path of the model.
            allow_pattern: A list of patterns of which files to pull.
            ignore_pattern: A list of patterns of which files not to pull.
        """
        # The streamer expects a directory-style prefix ending in "/".
        if not model_path.endswith("/"):
            model_path = model_path + "/"
        runai_pull_files(model_path, self.dir, allow_pattern, ignore_pattern)

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import fnmatch
from typing import TYPE_CHECKING, Optional
from vllm.utils.import_utils import PlaceholderModule
if TYPE_CHECKING:
from botocore.client import BaseClient
try:
import boto3
except ImportError:
boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
return [
path
for path in paths
if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
]
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
return [
path
for path in paths
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
]
def glob(
    s3: Optional["BaseClient"] = None,
    path: str = "",
    allow_pattern: list[str] | None = None,
) -> list[str]:
    """
    List full S3 object URIs under `path`, filtered by `allow_pattern`.

    Args:
        s3: S3 client to use; a default client is created when omitted.
        path: The S3 prefix to list from.
        allow_pattern: A list of patterns of which files to include.

    Returns:
        Full ``s3://`` URIs allowed by the pattern.
    """
    client = s3 if s3 is not None else boto3.client("s3")
    # Normalize to a directory-style prefix.
    prefix = path if path.endswith("/") else path + "/"
    bucket_name, _, keys = list_files(client, path=prefix, allow_pattern=allow_pattern)
    return [f"s3://{bucket_name}/{key}" for key in keys]
def list_files(
    s3: "BaseClient",
    path: str,
    allow_pattern: list[str] | None = None,
    ignore_pattern: list[str] | None = None,
) -> tuple[str, str, list[str]]:
    """
    List object keys under an S3 path, applying allow/ignore patterns.

    Args:
        s3: S3 client to use.
        path: The ``s3://bucket/prefix`` path to list from.
        allow_pattern: A list of patterns of which files to include.
        ignore_pattern: A list of patterns of which files to exclude.

    Returns:
        tuple[str, str, list[str]]: (bucket name, prefix string, the
        filtered object keys).
    """
    without_scheme = path.removeprefix("s3://")
    bucket_name, _, prefix = without_scheme.partition("/")
    listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    keys = [entry["Key"] for entry in listing.get("Contents", [])]
    # Drop "directory" placeholder keys that end with a slash.
    keys = _filter_ignore(keys, ["*/"])
    if allow_pattern is not None:
        keys = _filter_allow(keys, allow_pattern)
    if ignore_pattern is not None:
        keys = _filter_ignore(keys, ignore_pattern)
    return bucket_name, prefix, keys

View File

@@ -1,149 +1,127 @@
import os
from typing import Optional, Union
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import huggingface_hub
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
import warnings
from typing import Any
from typing_extensions import deprecated
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizers import BaichuanTokenizer
from vllm.utils import make_async
from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
def get_cached_tokenizer(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
"""Get tokenizer with cached properties.
def __getattr__(name: str):
if name == "AnyTokenizer":
warnings.warn(
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
"`vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
This will patch the tokenizer object in place.
return TokenizerLike
if name == "get_tokenizer":
from vllm.tokenizers import get_tokenizer
By default, transformers will recompute multiple tokenizer properties
each time they are called, leading to a significant slowdown. This
function caches these properties for faster access."""
warnings.warn(
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
"has been moved to `vllm.tokenizers.get_tokenizer`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
tokenizer_all_special_ids = set(tokenizer.all_special_ids)
tokenizer_all_special_tokens_extended = (
tokenizer.all_special_tokens_extended)
tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
tokenizer_len = len(tokenizer)
return get_tokenizer
if name == "cached_get_tokenizer":
from vllm.tokenizers import cached_get_tokenizer
class CachedTokenizer(tokenizer.__class__): # type: ignore
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@property
def all_special_ids(self):
return tokenizer_all_special_ids
return cached_get_tokenizer
if name == "cached_tokenizer_from_config":
from vllm.tokenizers import cached_tokenizer_from_config
@property
def all_special_tokens(self):
return tokenizer_all_special_tokens
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@property
def all_special_tokens_extended(self):
return tokenizer_all_special_tokens_extended
return cached_tokenizer_from_config
if name == "init_tokenizer_from_configs":
from vllm.tokenizers import cached_tokenizer_from_config
def __len__(self):
return tokenizer_len
warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
return cached_tokenizer_from_config
tokenizer.__class__ = CachedTokenizer
return tokenizer
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.

    Args:
        tokenizer_name: HF/ModelScope repo id or a local path.
        tokenizer_mode: ``"slow"`` forces ``use_fast=False``; any other
            value leaves the caller's choice untouched.
        trust_remote_code: forwarded to transformers; needed for custom
            tokenizer classes.
        revision: repo revision to load.
        download_dir: cache directory (used for the ModelScope path only).

    Raises:
        ValueError: if ``use_fast=True`` is combined with slow mode.
        RuntimeError: if loading fails in a way that suggests
            ``--trust-remote-code`` is required.
    """
    if VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            tokenizer_path = snapshot_download(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name,
            *args,
            trust_remote_code=trust_remote_code,
            revision=revision,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the tokenizer class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        if (not trust_remote_code and
                ("does not exist or is not currently imported." in str(e)
                 or "requires you to execute the tokenizer file" in str(e))):
            err_msg = (
                "Failed to load the tokenizer. If the tokenizer is a custom "
                "tokenizer not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    except AttributeError as e:
        if "BaichuanTokenizer" in str(e):
            # This is for the error "'BaichuanTokenizer' object has no
            # attribute 'sp_model'".
            tokenizer = BaichuanTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs)
        else:
            raise e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return get_cached_tokenizer(tokenizer)


@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
def decode_tokens(
    tokenizer: TokenizerLike,
    token_ids: list[int],
    *,
    skip_special_tokens: bool | None = None,
) -> str:
    """
    Backend-agnostic equivalent of HF's
    `tokenizer.decode(token_ids, ...)`.

    `skip_special_tokens=None` means to use the backend's default
    settings.
    """
    # Only forward the option when explicitly set, so backend defaults apply.
    kw_args: dict[str, Any] = {}
    if skip_special_tokens is not None:
        kw_args["skip_special_tokens"] = skip_special_tokens

    return tokenizer.decode(token_ids, **kw_args)
def get_lora_tokenizer(lora_request: Optional[LoRARequest], *args,
                       **kwargs) -> Optional[PreTrainedTokenizer]:
    """Load the tokenizer bundled inside a LoRA adapter folder, if any.

    Returns None when no request is given, or when the adapter folder does
    not contain a tokenizer (callers then fall back to the base-model
    tokenizer).
    """
    # The parameter is Optional: callers legitimately pass None to mean
    # "no adapter" (the previous annotation claimed a non-None LoRARequest).
    if lora_request is None:
        return None
    try:
        tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
                                  **kwargs)
    except OSError as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            "No tokenizer found in %s, using base model tokenizer instead. "
            "(Exception: %s)", lora_request.lora_local_path, e)
        tokenizer = None
    return tokenizer
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
def encode_tokens(
tokenizer: TokenizerLike,
text: str,
*,
truncation: bool | None = None,
max_length: int | None = None,
add_special_tokens: bool | None = None,
) -> list[int]:
"""
Backend-agnostic equivalent of HF's
`tokenizer.encode(text, ...)`.
`add_special_tokens=None` means to use the backend's default
settings.
"""
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
kw_args: dict[str, Any] = {}
if max_length is not None:
kw_args["max_length"] = max_length
if truncation is not None:
kw_args["truncation"] = truncation
if add_special_tokens is not None:
kw_args["add_special_tokens"] = add_special_tokens
return tokenizer.encode(text, **kw_args)

View File

@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
def __getattr__(name: str):
    """PEP 562 module hook: serve the pre-0.14 names from this module with
    a DeprecationWarning, redirecting to their new home in
    ``vllm.tokenizers``."""
    _renames = {
        "TokenizerBase": "TokenizerLike",
        "TokenizerRegistry": "TokenizerRegistry",
    }
    target = _renames.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Import first so a broken install raises ImportError, not a warning.
    from vllm import tokenizers

    attr = getattr(tokenizers, target)
    warnings.warn(
        f"`vllm.transformers_utils.tokenizer_base.{name}` has been "
        f"moved to `vllm.tokenizers.{target}`. "
        "The old name will be removed in v0.14.",
        DeprecationWarning,
        stacklevel=2,
    )
    return attr

View File

@@ -1,33 +0,0 @@
from typing import Optional
from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
TokenizerGroup)
if ray:
from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool)
else:
RayTokenizerGroupPool = None # type: ignore
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    """Build a tokenizer group from the given pool config.

    No config means a plain in-process TokenizerGroup; a "ray" pool type
    yields a Ray-backed pool (requires ray to be installed).
    """
    if tokenizer_pool_config is None:
        return TokenizerGroup(**init_kwargs)

    pool_type = tokenizer_pool_config.pool_type
    if pool_type != "ray":
        raise ValueError(f"Unknown pool type: {pool_type}")
    if RayTokenizerGroupPool is None:
        raise ImportError(
            "RayTokenizerGroupPool is not available. Please install "
            "the ray package to use the Ray tokenizer group pool.")
    return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
                                             **init_kwargs)
__all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]

View File

@@ -1,55 +0,0 @@
from abc import ABC, abstractmethod
from typing import List, Optional
from transformers import PreTrainedTokenizer
from vllm.lora.request import LoRARequest
class BaseTokenizerGroup(ABC):
    """Abstract interface for a group of tokenizers supporting LoRA adapters."""

    @abstractmethod
    def ping(self) -> bool:
        """Health-check the tokenizer group; True when it is alive."""

    @abstractmethod
    def get_max_input_len(
            self,
            lora_request: Optional[LoRARequest] = None) -> Optional[int]:
        """Return the maximum input length allowed for this LoRA request."""

    @abstractmethod
    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Tokenize ``prompt`` synchronously using the tokenizer group."""

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Tokenize ``prompt`` without blocking the event loop."""

    @abstractmethod
    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Return the tokenizer matching ``lora_request`` (or the base one)."""

    @abstractmethod
    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Async variant of :meth:`get_lora_tokenizer`."""

View File

@@ -1,169 +0,0 @@
import asyncio
import os
from typing import List, Optional
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
from transformers import PreTrainedTokenizer
from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
TokenizerGroup)
class RayTokenizerGroupPool(BaseTokenizerGroup):
    """A Ray-based pool of TokenizerGroups for async tokenization.

    Keeps one local TokenizerGroup for cheap metadata/tokenizer access,
    plus ``num_actors`` Ray actors that do the actual encoding. Idle
    actors are handed out through an asyncio queue.
    """

    # Class to use for workers making up the pool.
    _worker_cls = TokenizerGroup

    @classmethod
    def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig,
                    **init_kwargs) -> "RayTokenizerGroupPool":
        # Build the Ray actor options from the pool config.
        # NOTE(review): when `extra_config` is set, the setdefault calls
        # below mutate that dict in place on the shared config object —
        # confirm no caller relies on it staying untouched.
        ray_actor_options = (tokenizer_pool_config.extra_config or {
            "num_cpus": 0
        })
        # Prefer placing actors on the driver's node (soft affinity, so
        # Ray may still schedule them elsewhere).
        ray_actor_options.setdefault(
            "scheduling_strategy",
            NodeAffinitySchedulingStrategy(
                node_id=ray.get_runtime_context().get_node_id(), soft=True))

        # Carry over the env vars to the actors.
        # This is necessary for API keys and such.
        ray_actor_options.setdefault("runtime_env", {})
        _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])

        init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
        init_kwargs["ray_actor_options"] = ray_actor_options

        return cls(**init_kwargs)

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], num_actors: int,
                 ray_actor_options: dict, **tokenizer_config):
        # Store a local copy of the TokenizerGroup for quick access
        # to underlying HF tokenizers.
        self._local_tokenizer_group = self._worker_cls(
            tokenizer_id=tokenizer_id,
            enable_lora=enable_lora,
            max_num_seqs=max_num_seqs,
            max_input_length=max_input_length,
            **tokenizer_config,
        )

        # One remote TokenizerGroup actor per pool slot, all configured
        # identically to the local copy.
        ray_tokenizer_group_cls = ray.remote(
            self._worker_cls).options(**ray_actor_options)
        self.tokenizer_actors = [
            ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
                                           max_num_seqs, max_input_length,
                                           **tokenizer_config)
            for _ in range(num_actors)
        ]
        # Created lazily (see _ensure_queue_initialized) so the queue is
        # bound to the event loop that actually uses it.
        self._idle_actors: Optional[asyncio.Queue] = None

    @property
    def pool_size(self) -> int:
        # Number of remote tokenizer actors in the pool.
        return len(self.tokenizer_actors)

    def ping(self):
        # Blocks until every actor has answered; raises if any actor died.
        return ray.get(
            [actor.ping.remote() for actor in self.tokenizer_actors])

    def _ensure_queue_initialized(self):
        # Seed the idle-actor queue with all actors on first use.
        if self._idle_actors is None:
            self._idle_actors = asyncio.Queue()
            for actor in self.tokenizer_actors:
                self._idle_actors.put_nowait(actor)

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        The actor is then put back in the queue for future use.
        This is blocking.
        """
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        # Unlike encode_async, the sync path cannot await the queue, so an
        # empty pool is an error rather than a wait.
        if self._idle_actors.empty():
            raise RuntimeError("No idle actors available.")
        actor = self._idle_actors.get_nowait()
        try:
            ret = ray.get(
                actor.encode.remote(request_id=request_id,
                                    prompt=prompt,
                                    lora_request=lora_request))
        finally:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
        return ret

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        If there are no idle actors, we wait until one becomes
        available.
        The actor is then put back in the queue for future use.
        This is non-blocking.
        """
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        actor = await self._idle_actors.get()
        try:
            ret = await actor.encode.remote(request_id=request_id,
                                            prompt=prompt,
                                            lora_request=lora_request)
        finally:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
        return ret

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self._local_tokenizer_group.get_max_input_len(lora_request)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        # Served from the local TokenizerGroup; no actor round-trip needed.
        return self._local_tokenizer_group.get_lora_tokenizer(lora_request)

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        # Served from the local TokenizerGroup; no actor round-trip needed.
        return await self._local_tokenizer_group.get_lora_tokenizer_async(
            lora_request)
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
"""Copy over all current process environment variables to the runtime_env.
The variables in runtime_env will take precedence over the current process
environment variables.
runtime_env will be modified in place."""
env_vars = os.environ.copy()
runtime_env.setdefault("env_vars", {})
env_vars.update(runtime_env["env_vars"])
runtime_env["env_vars"] = env_vars

View File

@@ -1,78 +0,0 @@
from typing import List, Optional
from transformers import PreTrainedTokenizer
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (get_lora_tokenizer,
get_lora_tokenizer_async,
get_tokenizer)
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.utils import LRUCache
class TokenizerGroup(BaseTokenizerGroup):
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(tokenizer_id, **tokenizer_config)
        if enable_lora:
            # LRU-bounded cache of per-adapter tokenizers.
            self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
                capacity=max_num_seqs)
        else:
            self.lora_tokenizers = None

    def ping(self) -> bool:
        """The in-process group is always alive."""
        return True

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Maximum input length; independent of the LoRA request here."""
        return self.max_input_length

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Tokenize ``prompt`` with the adapter-specific tokenizer."""
        return self.get_lora_tokenizer(lora_request).encode(prompt)

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Async variant of :meth:`encode`."""
        selected = await self.get_lora_tokenizer_async(lora_request)
        return selected.encode(prompt)

    def get_lora_tokenizer(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Return the cached adapter tokenizer, loading it on first use."""
        if not self.enable_lora or not lora_request:
            return self.tokenizer
        lora_id = lora_request.lora_int_id
        if lora_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(lora_id)
        # Fall back to the base tokenizer when the adapter ships none.
        loaded = (get_lora_tokenizer(lora_request, **self.tokenizer_config)
                  or self.tokenizer)
        self.lora_tokenizers.put(lora_id, loaded)
        return loaded

    async def get_lora_tokenizer_async(
            self,
            lora_request: Optional[LoRARequest] = None
    ) -> "PreTrainedTokenizer":
        """Async variant of :meth:`get_lora_tokenizer`."""
        if not self.enable_lora or not lora_request:
            return self.tokenizer
        lora_id = lora_request.lora_int_id
        if lora_id in self.lora_tokenizers:
            return self.lora_tokenizers.get(lora_id)
        # Fall back to the base tokenizer when the adapter ships none.
        loaded = (await get_lora_tokenizer_async(
            lora_request, **self.tokenizer_config) or self.tokenizer)
        self.lora_tokenizers.put(lora_id, loaded)
        return loaded

View File

@@ -1,5 +0,0 @@
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
__all__ = [
"BaichuanTokenizer",
]

View File

@@ -1,256 +0,0 @@
# Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in
# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
# Copyright (c) 2024 - 2024 Moore Threads Technology Co., Ltd("Moore Threads"). All rights reserved.
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
PRETRAINED_VOCAB_FILES_MAP = { # type: ignore
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} # type: ignore
class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    # Standard HF tokenizer class attributes.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = ({} if sp_model_kwargs is None else
                                sp_model_kwargs)
        # Wrap plain strings in AddedToken (no whitespace stripping) so they
        # go through HF's special-token handling unchanged.
        bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
                     if isinstance(bos_token, str) else bos_token)
        eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
                     if isinstance(eos_token, str) else eos_token)
        unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
                     if isinstance(unk_token, str) else unk_token)
        pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
                     if isinstance(pad_token, str) else pad_token)
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # NOTE(review): sp_model is initialized *before* super().__init__
        # (which may call vocab_size) — this looks like the fix for the
        # "'BaichuanTokenizer' object has no attribute 'sp_model'" error
        # referenced in the file header; confirm against upstream issue.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        # The SentencePiece processor is a native object that cannot be
        # pickled; drop it here and rebuild it in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        # Reload the SentencePiece model from the saved vocab_file path.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        # Include tokens added after training on top of the base vocab.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens (string) in a single string."""
        # Accumulate runs of ordinary tokens and decode each run with the
        # SentencePiece model; special tokens are emitted verbatim.
        current_sub_tokens: List[str] = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using
            # sentencepiece model
            if token in self.all_special_tokens:
                # Insert a single space between ordinary text and a
                # following special token (but not at the very start).
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        # Flush any trailing run of ordinary tokens.
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self,
                        save_directory,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) "
                             "should be a directory")
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            VOCAB_FILES_NAMES["vocab_file"],
        )

        # Copy the on-disk vocab file when it exists and differs from the
        # destination; otherwise serialize the in-memory model.
        if os.path.abspath(self.vocab_file) != os.path.abspath(
                out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file, )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # Wrap each sequence in the configured BOS/EOS markers.
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens
        added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to
                `False`):
                Whether or not the token list is already formatted with
                special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]:
            1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # Mark only the BOS/EOS positions this tokenizer would add itself.
        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)

    def create_token_type_ids_from_sequences(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a
        sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids)
            according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

View File

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
import struct
from functools import cache
from os import PathLike
from pathlib import Path
from typing import Any
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def is_s3(model_or_path: str) -> bool:
    """Whether the given model reference is an ``s3://`` URI (case-insensitive)."""
    normalized = model_or_path.lower()
    return normalized.startswith("s3://")
def is_gcs(model_or_path: str) -> bool:
    """Whether the given model reference is a ``gs://`` (GCS) URI (case-insensitive)."""
    normalized = model_or_path.lower()
    return normalized.startswith("gs://")
def is_cloud_storage(model_or_path: str) -> bool:
    """Whether the path uses any supported cloud-storage scheme (S3 or GCS)."""
    return any(check(model_or_path) for check in (is_s3, is_gcs))
def modelscope_list_repo_files(
    repo_id: str,
    revision: str | None = None,
    token: str | bool | None = None,
) -> list[str]:
    """Return the paths of all blob files in a ModelScope repo."""
    # Lazy import so modelscope is only required when actually used.
    from modelscope.hub.api import HubApi

    api = HubApi()
    api.login(token)
    # same as huggingface_hub.list_repo_files
    entries = api.get_model_files(
        model_id=repo_id, revision=revision, recursive=True
    )
    return [entry["Path"] for entry in entries if entry["Type"] == "blob"]
def _maybe_json_dict(path: str | PathLike) -> dict[str, str]:
with open(path) as f:
try:
return json.loads(f.read())
except Exception:
return dict[str, str]()
def _maybe_space_split_dict(path: str | PathLike) -> dict[str, str]:
parsed_dict = dict[str, str]()
with open(path) as f:
for line in f.readlines():
try:
model_name, redirect_name = line.strip().split()
parsed_dict[model_name] = redirect_name
except Exception:
pass
return parsed_dict
@cache
def maybe_model_redirect(model: str) -> str:
    """
    Resolve *model* through the optional VLLM_MODEL_REDIRECT_PATH mapping
    file (JSON or space-separated), returning the redirected name when a
    mapping exists and the original name otherwise.

    :param model: hf model name
    :return: maybe redirect to a local folder
    """
    redirect_file = envs.VLLM_MODEL_REDIRECT_PATH

    if not redirect_file:
        return model
    if not Path(redirect_file).exists():
        return model

    # Try JSON first, then the space-separated two-column format.
    mapping = _maybe_json_dict(redirect_file) or _maybe_space_split_dict(
        redirect_file
    )
    target = mapping.get(model)
    if not target:
        return model

    logger.info("model redirect: [ %s ] -> [ %s ]", model, target)
    return target
def parse_safetensors_file_metadata(path: str | PathLike) -> dict[str, Any]:
with open(path, "rb") as f:
length_of_metadata = struct.unpack("<Q", f.read(8))[0]
metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))
return metadata
def convert_model_repo_to_path(model_repo: str) -> str:
    """When VLLM_USE_MODELSCOPE is enabled and *model_repo* is not already
    a local path, map it into the ModelScope model cache directory."""
    use_as_is = not envs.VLLM_USE_MODELSCOPE or Path(model_repo).exists()
    if use_as_is:
        return model_repo

    # Lazy import: modelscope is only needed on this branch.
    from modelscope.utils.file_utils import get_model_cache_root

    return os.path.join(get_model_cache_root(), model_repo)