init
This commit is contained in:
24
vllm/transformers_utils/__init__.py
Normal file
24
vllm/transformers_utils/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import envs
|
||||
|
||||
if envs.VLLM_USE_MODELSCOPE:
|
||||
try:
|
||||
# Patch here, before each import happens
|
||||
import modelscope
|
||||
from packaging import version
|
||||
|
||||
# patch_hub begins from modelscope>=1.18.1
|
||||
if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
|
||||
raise ImportError(
|
||||
'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
|
||||
'install by `pip install modelscope -U`')
|
||||
from modelscope.utils.hf_util import patch_hub
|
||||
|
||||
# Patch hub to download models from modelscope to speed up.
|
||||
patch_hub()
|
||||
except ImportError as err:
|
||||
raise ImportError(
|
||||
"Please install modelscope>=1.18.1 via "
|
||||
"`pip install modelscope>=1.18.1` to use ModelScope.") from err
|
||||
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/config.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/config.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/processor.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/processor.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/runai_utils.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/runai_utils.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/utils.cpython-312.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/utils.cpython-312.pyc
Normal file
Binary file not shown.
5
vllm/transformers_utils/chat_templates/__init__.py
Normal file
5
vllm/transformers_utils/chat_templates/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from .registry import get_chat_template_fallback_path
|
||||
|
||||
__all__ = ["get_chat_template_fallback_path"]
|
||||
Binary file not shown.
Binary file not shown.
70
vllm/transformers_utils/chat_templates/registry.py
Normal file
70
vllm/transformers_utils/chat_templates/registry.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__file__)
|
||||
|
||||
CHAT_TEMPLATES_DIR = Path(__file__).parent
|
||||
|
||||
ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
|
||||
|
||||
|
||||
def _get_qwen_chat_template_fallback(
|
||||
tokenizer_name_or_path: str) -> Optional[Path]:
|
||||
if tokenizer_name_or_path.endswith("-Chat"):
|
||||
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
||||
|
||||
return CHAT_TEMPLATES_DIR / "template_basic.jinja"
|
||||
|
||||
|
||||
def _get_minicpmv_chat_template_fallback(
|
||||
tokenizer_name_or_path: str) -> Optional[Path]:
|
||||
# MiniCPM-V-4.5 version uses a dedicated template
|
||||
if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path:
|
||||
return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"
|
||||
|
||||
# Other versions use chatml template
|
||||
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
||||
|
||||
|
||||
# yapf: disable
|
||||
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
|
||||
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
|
||||
"chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||
"deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
|
||||
"fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
|
||||
"minicpmv": _get_minicpmv_chat_template_fallback,
|
||||
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
|
||||
"qwen": _get_qwen_chat_template_fallback,
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def register_chat_template_fallback_path(
|
||||
model_type: str,
|
||||
chat_template: ChatTemplatePath,
|
||||
) -> None:
|
||||
if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
|
||||
logger.warning(
|
||||
"Model type %s already has a chat template registered. "
|
||||
"It will be overwritten by the new chat template %s.", model_type,
|
||||
chat_template)
|
||||
|
||||
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template
|
||||
|
||||
|
||||
def get_chat_template_fallback_path(
|
||||
model_type: str,
|
||||
tokenizer_name_or_path: str,
|
||||
) -> Optional[Path]:
|
||||
chat_template = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)
|
||||
if callable(chat_template):
|
||||
chat_template = chat_template(tokenizer_name_or_path)
|
||||
|
||||
if chat_template is None:
|
||||
return None
|
||||
|
||||
return chat_template
|
||||
@@ -0,0 +1,3 @@
|
||||
{%- for message in messages -%}
|
||||
{{- message['content'] -}}
|
||||
{%- endfor -%}
|
||||
11
vllm/transformers_utils/chat_templates/template_blip2.jinja
Normal file
11
vllm/transformers_utils/chat_templates/template_blip2.jinja
Normal file
@@ -0,0 +1,11 @@
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- 'Question: ' + message['content'] + ' ' -}}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{- 'Answer: ' + message['content'] + ' ' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- 'Answer:' -}}
|
||||
{% endif %}
|
||||
10
vllm/transformers_utils/chat_templates/template_chatml.jinja
Normal file
10
vllm/transformers_utils/chat_templates/template_chatml.jinja
Normal file
@@ -0,0 +1,10 @@
|
||||
{%- for message in messages -%}
|
||||
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
|
||||
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
|
||||
{{- '<|im_end|>' + '\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
|
||||
{{- '<|im_start|>assistant\n' -}}
|
||||
{%- endif -%}
|
||||
@@ -0,0 +1,23 @@
|
||||
{%- if messages[0]['role'] == 'system' -%}
|
||||
{%- set system_message = messages[0]['content'] -%}
|
||||
{%- set messages = messages[1:] -%}
|
||||
{%- else -%}
|
||||
{% set system_message = '' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{{ bos_token + system_message }}
|
||||
{%- for message in messages -%}
|
||||
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
|
||||
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{ '<|User|>: ' + message['content'] + '\n\n' }}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{{ '<|Assistant|>: ' }}
|
||||
{%- endif -%}
|
||||
@@ -0,0 +1,3 @@
|
||||
{%- for message in messages -%}
|
||||
{{- message['content'] + '\n' -}}
|
||||
{%- endfor -%}
|
||||
@@ -0,0 +1,93 @@
|
||||
{%- set enable_thinking = enable_thinking | default(false) %}
|
||||
{%- if tools %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- messages[0].content + '\n\n' }}
|
||||
{%- endif %}
|
||||
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set content = message.content %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in message.content %}
|
||||
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{%- if loop.last or (not loop.last and reasoning_content) %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
|
||||
{%- if message.tool_calls %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if (loop.first and content) or (not loop.first) %}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.function %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<tool_call>\n{"name": "' }}
|
||||
{{- tool_call.name }}
|
||||
{{- '", "arguments": ' }}
|
||||
{%- if tool_call.arguments is string %}
|
||||
{{- tool_call.arguments }}
|
||||
{%- else %}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{%- endif %}
|
||||
{{- '}\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- message.content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- endif %}
|
||||
{%- if enable_thinking is defined and enable_thinking is true %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
1102
vllm/transformers_utils/config.py
Normal file
1102
vllm/transformers_utils/config.py
Normal file
File diff suppressed because it is too large
Load Diff
20
vllm/transformers_utils/config_parser_base.py
Normal file
20
vllm/transformers_utils/config_parser_base.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ConfigParserBase(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def parse(self,
|
||||
model: Union[str, Path],
|
||||
trust_remote_code: bool,
|
||||
revision: Optional[str] = None,
|
||||
code_revision: Optional[str] = None,
|
||||
**kwargs) -> tuple[dict, PretrainedConfig]:
|
||||
raise NotImplementedError
|
||||
63
vllm/transformers_utils/configs/__init__.py
Normal file
63
vllm/transformers_utils/configs/__init__.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Model configs may be defined in this directory for the following reasons:
|
||||
|
||||
- There is no configuration file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing config to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
|
||||
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
|
||||
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
|
||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.jais import JAISConfig
|
||||
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
|
||||
from vllm.transformers_utils.configs.medusa import MedusaConfig
|
||||
from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
|
||||
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
from vllm.transformers_utils.configs.nemotron import NemotronConfig
|
||||
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
|
||||
from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
|
||||
from vllm.transformers_utils.configs.olmo3 import Olmo3Config
|
||||
from vllm.transformers_utils.configs.ovis import OvisConfig
|
||||
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
|
||||
from vllm.transformers_utils.configs.radio import RadioConfig
|
||||
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
|
||||
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
|
||||
Step3VisionEncoderConfig,
|
||||
Step3VLConfig)
|
||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||
|
||||
__all__ = [
|
||||
"ChatGLMConfig",
|
||||
"DeepseekVLV2Config",
|
||||
"DeepseekV3Config",
|
||||
"DotsOCRConfig",
|
||||
"EAGLEConfig",
|
||||
"RWConfig",
|
||||
"JAISConfig",
|
||||
"MedusaConfig",
|
||||
"MiDashengLMConfig",
|
||||
"MLPSpeculatorConfig",
|
||||
"MoonViTConfig",
|
||||
"KimiVLConfig",
|
||||
"NemotronConfig",
|
||||
"NemotronHConfig",
|
||||
"Nemotron_Nano_VL_Config",
|
||||
"Olmo3Config",
|
||||
"OvisConfig",
|
||||
"RadioConfig",
|
||||
"SpeculatorsConfig",
|
||||
"UltravoxConfig",
|
||||
"Step3VLConfig",
|
||||
"Step3VisionEncoderConfig",
|
||||
"Step3TextConfig",
|
||||
"Qwen3NextConfig",
|
||||
]
|
||||
207
vllm/transformers_utils/configs/arctic.py
Normal file
207
vllm/transformers_utils/configs/arctic.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||
""" Arctic model configuration"""
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticLoRAConfig:
|
||||
lora_r: int = 64
|
||||
lora_alpha: float = 16
|
||||
shard_base_weights: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticQuantizationConfig:
|
||||
q_bits: int = 8
|
||||
rounding: str = "nearest"
|
||||
mantissa_bits: int = 3
|
||||
group_size: int = 128
|
||||
|
||||
|
||||
class ArcticConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
|
||||
Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
|
||||
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`ArcticModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
|
||||
The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
|
||||
allows sequence of up to 4096*32 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_theta (`float`, *optional*, defaults to 1000000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
sliding_window (`int`, *optional*):
|
||||
Sliding window attention window size. If not specified, will default to `4096`.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 2):
|
||||
The number of experts to root per-token, can be also interpreted as the `top-p` routing
|
||||
parameter
|
||||
num_local_experts (`int`, *optional*, defaults to 8):
|
||||
Number of experts per Sparse MLP layer.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
```python
|
||||
>>> from transformers import ArcticModel, ArcticConfig
|
||||
|
||||
>>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
|
||||
>>> configuration = ArcticConfig()
|
||||
|
||||
>>> # Initializing a model from the Arctic 7B style configuration
|
||||
>>> model = ArcticModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "arctic"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=14336,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=1e6,
|
||||
sliding_window=None,
|
||||
attention_dropout=0.0,
|
||||
num_experts_per_tok=1,
|
||||
num_local_experts=8,
|
||||
router_aux_loss_coef=0.001,
|
||||
moe_layer_frequency=2,
|
||||
parallel_attn_mlp_res=False,
|
||||
moe_train_capacity_factor=1,
|
||||
moe_eval_capacity_factor=1,
|
||||
enable_expert_tensor_parallelism=False,
|
||||
moe_min_capacity=0,
|
||||
moe_token_dropping=True,
|
||||
quantization=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.sliding_window = sliding_window
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_local_experts = num_local_experts
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.moe_layer_frequency = moe_layer_frequency
|
||||
self.moe_train_capacity_factor = moe_train_capacity_factor
|
||||
self.moe_eval_capacity_factor = moe_eval_capacity_factor
|
||||
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
|
||||
self.moe_min_capacity = moe_min_capacity
|
||||
self.moe_token_dropping = moe_token_dropping
|
||||
self.parallel_attn_mlp_res = parallel_attn_mlp_res
|
||||
if isinstance(quantization, dict):
|
||||
self.quantization = ArcticQuantizationConfig(**quantization)
|
||||
else:
|
||||
self.quantization = quantization
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
|
||||
result = super().from_dict(config_dict, **kwargs)
|
||||
config = result[0] if isinstance(result, tuple) else result
|
||||
if isinstance(config.quantization, dict):
|
||||
config.quantization = ArcticQuantizationConfig(**config.quantization)
|
||||
return result
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
ret = super().to_dict()
|
||||
if isinstance(ret["quantization"], ArcticQuantizationConfig):
|
||||
ret["quantization"] = asdict(ret["quantization"])
|
||||
return ret
|
||||
72
vllm/transformers_utils/configs/chatglm.py
Normal file
72
vllm/transformers_utils/configs/chatglm.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/zai-org/ChatGLM2-6B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ChatGLMConfig(PretrainedConfig):
|
||||
model_type = "chatglm"
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "num_layers",
|
||||
"n_head_kv": "multi_query_group_num",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
num_layers=28,
|
||||
padded_vocab_size=65024,
|
||||
hidden_size=4096,
|
||||
ffn_hidden_size=13696,
|
||||
kv_channels=128,
|
||||
num_attention_heads=32,
|
||||
seq_length=2048,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layernorm_epsilon=1e-5,
|
||||
rmsnorm=True,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
post_layer_norm=True,
|
||||
add_bias_linear=False,
|
||||
add_qkv_bias=False,
|
||||
interleaved_qkv=False,
|
||||
bias_dropout_fusion=True,
|
||||
multi_query_attention=False,
|
||||
multi_query_group_num=1,
|
||||
apply_query_key_layer_scaling=True,
|
||||
attention_softmax_in_fp32=True,
|
||||
fp32_residual_connection=False,
|
||||
quantization_bit=0,
|
||||
pre_seq_len=None,
|
||||
prefix_projection=False,
|
||||
**kwargs):
|
||||
self.num_layers = num_layers
|
||||
self.vocab_size = padded_vocab_size
|
||||
self.padded_vocab_size = padded_vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.kv_channels = kv_channels
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.seq_length = seq_length
|
||||
# It is to be compatible with long lora.
|
||||
self.max_position_embeddings = seq_length
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layernorm_epsilon = layernorm_epsilon
|
||||
self.rmsnorm = rmsnorm
|
||||
self.apply_residual_connection_post_layernorm = (
|
||||
apply_residual_connection_post_layernorm)
|
||||
self.post_layer_norm = post_layer_norm
|
||||
self.add_bias_linear = add_bias_linear
|
||||
self.add_qkv_bias = add_qkv_bias
|
||||
self.bias_dropout_fusion = bias_dropout_fusion
|
||||
self.multi_query_attention = multi_query_attention
|
||||
self.multi_query_group_num = multi_query_group_num
|
||||
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
|
||||
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
|
||||
self.fp32_residual_connection = fp32_residual_connection
|
||||
self.quantization_bit = quantization_bit
|
||||
self.pre_seq_len = pre_seq_len
|
||||
self.prefix_projection = prefix_projection
|
||||
self.interleaved_qkv = interleaved_qkv
|
||||
super().__init__(**kwargs)
|
||||
101
vllm/transformers_utils/configs/deepseek_v3.py
Normal file
101
vllm/transformers_utils/configs/deepseek_v3.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekV3Config(PretrainedConfig):
|
||||
|
||||
model_type = "deepseek_v3"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=129280,
|
||||
hidden_size=7168,
|
||||
intermediate_size=18432,
|
||||
moe_intermediate_size=2048,
|
||||
num_hidden_layers=61,
|
||||
num_nextn_predict_layers=1,
|
||||
num_attention_heads=128,
|
||||
num_key_value_heads=128,
|
||||
n_shared_experts=1,
|
||||
n_routed_experts=256,
|
||||
ep_size=1,
|
||||
routed_scaling_factor=2.5,
|
||||
kv_lora_rank=512,
|
||||
q_lora_rank=1536,
|
||||
qk_rope_head_dim=64,
|
||||
v_head_dim=128,
|
||||
qk_nope_head_dim=128,
|
||||
topk_method='noaux_tc',
|
||||
n_group=8,
|
||||
topk_group=4,
|
||||
num_experts_per_tok=8,
|
||||
moe_layer_freq=1,
|
||||
first_k_dense_replace=3,
|
||||
norm_topk_prob=True,
|
||||
scoring_func='sigmoid',
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=0,
|
||||
eos_token_id=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_nextn_predict_layers = num_nextn_predict_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.n_shared_experts = n_shared_experts
|
||||
self.n_routed_experts = n_routed_experts
|
||||
self.ep_size = ep_size
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.kv_lora_rank = kv_lora_rank
|
||||
self.q_lora_rank = q_lora_rank
|
||||
self.qk_rope_head_dim = qk_rope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.topk_method = topk_method
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.moe_layer_freq = moe_layer_freq
|
||||
self.first_k_dense_replace = first_k_dense_replace
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.scoring_func = scoring_func
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
216
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
216
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
@@ -0,0 +1,216 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class VisionEncoderConfig(PretrainedConfig):
|
||||
model_type: str = "vision"
|
||||
|
||||
model_name: str = "vit_so400m_patch14_siglip_384.webli"
|
||||
image_size: int = 384
|
||||
patch_size: int = 16
|
||||
width: int = 1024
|
||||
layers: int = 24
|
||||
heads: int = 16
|
||||
mlp_ratio: int = 4
|
||||
global_pool: str = "map"
|
||||
ignore_head: bool = True
|
||||
class_token: bool = False
|
||||
num_classes: int = 0
|
||||
use_checkpoint: bool = False
|
||||
weight_init: str = "skip"
|
||||
deterministic: bool = False
|
||||
num_recomputing_layers: int = 0
|
||||
|
||||
def __init__(self,
|
||||
model_name: str = "vit_so400m_patch14_siglip_384.webli",
|
||||
image_size: int = 384,
|
||||
patch_size: int = 16,
|
||||
width: int = 1024,
|
||||
layers: int = 24,
|
||||
heads: int = 16,
|
||||
mlp_ratio: int = 4,
|
||||
global_pool: str = "map",
|
||||
ignore_head: bool = True,
|
||||
class_token: bool = False,
|
||||
num_classes: int = 0,
|
||||
use_checkpoint: bool = False,
|
||||
**kwargs):
|
||||
self.model_name = model_name
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.width = width
|
||||
self.layers = layers
|
||||
self.heads = heads
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.global_pool = global_pool
|
||||
self.ignore_head = ignore_head
|
||||
self.class_token = class_token
|
||||
self.num_classes = num_classes
|
||||
self.use_checkpoint = use_checkpoint
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MlpProjectorConfig(PretrainedConfig):
|
||||
model_type = "mlp_projector"
|
||||
projector_type: str = "downsample_mlp_gelu"
|
||||
input_dim: int = 1152
|
||||
n_embed: int = 2048
|
||||
depth: int = 2
|
||||
mlp_ratio: int = 1
|
||||
downsample_ratio: int = 2
|
||||
token_pooling: bool = False
|
||||
|
||||
def __init__(self,
|
||||
projector_type: str = "downsample_mlp_gelu",
|
||||
input_dim: int = 1152,
|
||||
n_embed: int = 2048,
|
||||
depth: int = 2,
|
||||
mlp_ratio: int = 1,
|
||||
downsample_ratio: int = 2,
|
||||
**kwargs):
|
||||
self.projector_type = projector_type
|
||||
self.input_dim = input_dim
|
||||
self.n_embed = n_embed
|
||||
self.depth = depth
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.downsample_ratio = downsample_ratio
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class DeepseekV2Config(PretrainedConfig):
|
||||
|
||||
model_type = "deepseek_v2"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=102400,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
moe_intermediate_size=1407,
|
||||
num_hidden_layers=30,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=32,
|
||||
n_shared_experts=None,
|
||||
n_routed_experts=None,
|
||||
ep_size=1,
|
||||
routed_scaling_factor=1.0,
|
||||
kv_lora_rank=512,
|
||||
q_lora_rank=1536,
|
||||
qk_rope_head_dim=64,
|
||||
v_head_dim=128,
|
||||
qk_nope_head_dim=128,
|
||||
topk_method='gready',
|
||||
n_group=None,
|
||||
topk_group=None,
|
||||
num_experts_per_tok=None,
|
||||
moe_layer_freq=1,
|
||||
first_k_dense_replace=0,
|
||||
norm_topk_prob=False,
|
||||
scoring_func='softmax',
|
||||
aux_loss_alpha=0.001,
|
||||
seq_aux=True,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=100000,
|
||||
eos_token_id=100001,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
use_mla=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.n_shared_experts = n_shared_experts
|
||||
self.n_routed_experts = n_routed_experts
|
||||
self.ep_size = ep_size
|
||||
self.routed_scaling_factor = routed_scaling_factor
|
||||
self.kv_lora_rank = kv_lora_rank
|
||||
self.q_lora_rank = q_lora_rank
|
||||
self.qk_rope_head_dim = qk_rope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.topk_method = topk_method
|
||||
self.n_group = n_group
|
||||
self.topk_group = topk_group
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.moe_layer_freq = moe_layer_freq
|
||||
self.first_k_dense_replace = first_k_dense_replace
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.scoring_func = scoring_func
|
||||
self.aux_loss_alpha = aux_loss_alpha
|
||||
self.seq_aux = seq_aux
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = float(rms_norm_eps)
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.use_mla = use_mla
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class DeepseekVLV2Config(PretrainedConfig):
|
||||
model_type = "deepseek_vl_v2"
|
||||
vision_config: VisionEncoderConfig
|
||||
projector_config: MlpProjectorConfig
|
||||
|
||||
tile_tag: str = "2D"
|
||||
global_view_pos: str = "head"
|
||||
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )
|
||||
|
||||
def __init__(self,
|
||||
tile_tag: str = "tile_tag",
|
||||
global_view_pos: str = "head",
|
||||
candidate_resolutions: tuple[tuple[int,
|
||||
int]] = ((384, 384), ),
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
vision_config = kwargs.get("vision_config", {})
|
||||
self.vision_config = VisionEncoderConfig(**vision_config)
|
||||
|
||||
projector_config = kwargs.get("projector_config", {})
|
||||
self.projector_config = MlpProjectorConfig(**projector_config)
|
||||
|
||||
language_config = kwargs.get("language_config", {})
|
||||
self.text_config = DeepseekV2Config(**language_config)
|
||||
|
||||
self.tile_tag = tile_tag
|
||||
self.global_view_pos = global_view_pos
|
||||
self.candidate_resolutions = candidate_resolutions
|
||||
self.vocab_size = self.text_config.vocab_size
|
||||
69
vllm/transformers_utils/configs/dotsocr.py
Normal file
69
vllm/transformers_utils/configs/dotsocr.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, Optional
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.models.qwen2 import Qwen2Config
|
||||
|
||||
|
||||
class DotsVisionConfig(PretrainedConfig):
|
||||
model_type: str = "dots_vit"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dim: int = 1536, # vision encoder embed size
|
||||
hidden_size: int = 1536, # after merger hidden size
|
||||
intermediate_size: int = 4224,
|
||||
num_hidden_layers: int = 42,
|
||||
num_attention_heads: int = 12,
|
||||
num_channels: int = 3,
|
||||
patch_size: int = 14,
|
||||
spatial_merge_size: int = 2,
|
||||
temporal_patch_size: int = 1,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
use_bias: bool = False,
|
||||
attn_implementation="flash_attention_2",
|
||||
initializer_range=0.02,
|
||||
init_merger_std=0.02,
|
||||
is_causal=False, # ve causal forward
|
||||
post_norm=True,
|
||||
gradient_checkpointing=False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.embed_dim = embed_dim
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_bias = use_bias
|
||||
self.attn_implementation = attn_implementation
|
||||
self.initializer_range = initializer_range
|
||||
self.init_merger_std = init_merger_std
|
||||
self.is_causal = is_causal
|
||||
self.post_norm = post_norm
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
|
||||
class DotsOCRConfig(Qwen2Config):
|
||||
model_type = "dots_ocr"
|
||||
|
||||
def __init__(self,
|
||||
image_token_id=151665,
|
||||
video_token_id=151656,
|
||||
vision_config: Optional[dict] = None,
|
||||
*args,
|
||||
**kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.image_token_id = image_token_id
|
||||
self.video_token_id = video_token_id
|
||||
self.vision_config = DotsVisionConfig(**(vision_config or {}))
|
||||
|
||||
def save_pretrained(self, save_directory, **kwargs):
|
||||
self._auto_class = None
|
||||
super().save_pretrained(save_directory, **kwargs)
|
||||
84
vllm/transformers_utils/configs/eagle.py
Normal file
84
vllm/transformers_utils/configs/eagle.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
||||
|
||||
|
||||
class EAGLEConfig(PretrainedConfig):
|
||||
model_type = "eagle"
|
||||
|
||||
def __init__(self,
|
||||
model: Union[PretrainedConfig, dict, None] = None,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
method: Optional[str] = 'eagle',
|
||||
**kwargs):
|
||||
|
||||
model_config: Union[PretrainedConfig, DeepseekV2Config, None]
|
||||
if isinstance(model, dict):
|
||||
archs = model.get("architectures", [])
|
||||
target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
|
||||
if any(target_arch in archs for target_arch in target_archs):
|
||||
# AutoConfig does not support DeepSeek MoE models yet
|
||||
model_config = DeepseekV2Config(**model)
|
||||
else:
|
||||
model_config = AutoConfig.for_model(**model)
|
||||
else:
|
||||
model_config = model
|
||||
|
||||
for k, v in kwargs.items():
|
||||
if k != "architectures" and k != "model_type" and hasattr(
|
||||
model_config, k):
|
||||
setattr(model_config, k, v)
|
||||
|
||||
self.model = model_config
|
||||
|
||||
if self.model is None:
|
||||
self.truncated_vocab_size = None
|
||||
else:
|
||||
self.truncated_vocab_size = self.model.vocab_size if \
|
||||
truncated_vocab_size is None else truncated_vocab_size
|
||||
|
||||
# Eagle model name should follow naming convention of
|
||||
# LlamaForCausalLM -> EagleLlamaForCausalLM
|
||||
# LlamaForCausalLM -> Eagle3LlamaForCausalLM
|
||||
# LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
|
||||
if method == "eagle":
|
||||
assert self.model is not None, \
|
||||
"model should not be None when method is eagle"
|
||||
kwargs["architectures"] = [
|
||||
f"Eagle{arch}" if not arch.startswith("Eagle") \
|
||||
else arch for arch in self.model.architectures
|
||||
]
|
||||
|
||||
elif method == "eagle3":
|
||||
assert self.model is not None, \
|
||||
"model should not be None when method is eagle3"
|
||||
kwargs["architectures"] = [
|
||||
arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
|
||||
else f"Eagle3{arch}" for arch in self.model.architectures
|
||||
]
|
||||
else:
|
||||
raise ValueError(f"Invalid method {method}. "
|
||||
"Supported methods are eagle and eagle3.")
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if self.model is not None:
|
||||
for k, v in self.model.to_dict().items():
|
||||
if k not in kwargs:
|
||||
setattr(self, k, v)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
**kwargs,
|
||||
) -> "EAGLEConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
90
vllm/transformers_utils/configs/falcon.py
Normal file
90
vllm/transformers_utils/configs/falcon.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Falcon configuration"""
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class RWConfig(PretrainedConfig):
|
||||
model_type = "falcon"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "n_layer",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_kv_heads": "n_head_kv",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=250880,
|
||||
hidden_size=64,
|
||||
n_layer=2,
|
||||
n_head=8,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
multi_query=True,
|
||||
n_head_kv=None,
|
||||
alibi=False,
|
||||
bias=False,
|
||||
parallel_attn=False,
|
||||
new_decoder_architecture=False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.vocab_size = vocab_size
|
||||
# Backward compatibility with n_embed kwarg
|
||||
n_embed = kwargs.pop("n_embed", None)
|
||||
self.hidden_size = hidden_size if n_embed is None else n_embed
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
self.multi_query = multi_query
|
||||
self.n_head_kv = 1 if n_head_kv is None else n_head_kv
|
||||
self.alibi = alibi
|
||||
self.bias = bias
|
||||
self.parallel_attn = parallel_attn
|
||||
self.new_decoder_architecture = new_decoder_architecture
|
||||
|
||||
if self.hidden_size == 8192:
|
||||
# Hack for falcon-40b
|
||||
self.new_decoder_architecture = True
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs)
|
||||
|
||||
@property
|
||||
def head_dim(self):
|
||||
return self.hidden_size // self.n_head
|
||||
|
||||
@property
|
||||
def rotary(self):
|
||||
return not self.alibi
|
||||
237
vllm/transformers_utils/configs/jais.py
Normal file
237
vllm/transformers_utils/configs/jais.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2023 Cerebras Systems.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""JAIS configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class JAISConfig(PretrainedConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`JAISModel`]. It is used to instantiate a JAIS model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used
|
||||
to control the model outputs. Read the documentation from
|
||||
[`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 50257):
|
||||
Vocabulary size of the JAIS model. Defines the number of different
|
||||
tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`JAISModel`].
|
||||
n_positions (`int`, *optional*, defaults to 1024):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
n_embd (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer encoder.
|
||||
n_inner (`int`, *optional*, defaults to None):
|
||||
Dimensionality of the inner feed-forward layers. `None` will set
|
||||
it to 4 times n_embd
|
||||
activation_function (`str`, *optional*, defaults to `"gelu"`):
|
||||
Activation function, to be selected in the list
|
||||
`["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
|
||||
resid_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in
|
||||
the embeddings, encoder, and pooler.
|
||||
embd_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon to use in the layer normalization layers.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
scale_attn_weights (`bool`, *optional*, defaults to `True`):
|
||||
Scale attention weights by dividing by sqrt(hidden_size)..
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models).
|
||||
scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
|
||||
Whether to additionally scale attention weights
|
||||
by `1 / layer_idx + 1`.
|
||||
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
|
||||
Whether to scale keys (K) prior to computing attention
|
||||
(dot-product)
|
||||
and upcast attention dot-product/softmax to float() when training
|
||||
with mixed precision.
|
||||
position_embedding_type (`str`, *optional*, defaults to `"learned"`):
|
||||
Positional embedding can be either `"alibi"` or `"learned"`.
|
||||
mup_width_scale (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale learning rate and initializers. Calculated
|
||||
as (`d_model,0 / d_model`), where
|
||||
`d_model` is the model's width and `d_model,0` is the proxy
|
||||
model's width.
|
||||
mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale token and position embeddings.
|
||||
mup_output_alpha (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale output logits
|
||||
(`output_logits_scale = mup_output_alpha * mup_width_scale`).
|
||||
mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
|
||||
Scale attention weights by dividing by hidden_size instead of
|
||||
sqrt(hidden_size). Need to set scale_attn_weights to `True` as
|
||||
well.
|
||||
alibi_scaling (`dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for ALiBi
|
||||
embeddings. Currently only supports linear
|
||||
scaling strategy. Can specify either the scaling `factor` (must be
|
||||
a float greater than 1) for fixed scaling
|
||||
or `train_seq_len` for dynamic scaling on input samples with
|
||||
sequence length > `train_seq_len`. The expected
|
||||
formats are `{"type": strategy name, "factor": scaling factor}` or
|
||||
`{"type": strategy name,
|
||||
"train_seq_len": training sequence length}`.
|
||||
architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
|
||||
architecture names for Jais.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import JAISConfig, JAISModel
|
||||
|
||||
>>> # Initializing a JAIS configuration
|
||||
>>> configuration = JAISConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the configuration
|
||||
>>> model = JAISModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "jais"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"hidden_size": "n_embd",
|
||||
"max_position_embeddings": "n_positions",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
n_inner=None,
|
||||
activation_function="gelu_new",
|
||||
resid_pdrop=0.1,
|
||||
embd_pdrop=0.1,
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
scale_attn_weights=True,
|
||||
use_cache=True,
|
||||
bos_token_id=50256,
|
||||
eos_token_id=50256,
|
||||
scale_attn_by_inverse_layer_idx=False,
|
||||
reorder_and_upcast_attn=False,
|
||||
position_embedding_type="learned",
|
||||
mup_width_scale=1.0,
|
||||
mup_embeddings_scale=1.0,
|
||||
mup_output_alpha=1.0,
|
||||
mup_scale_qk_dot_by_d=False,
|
||||
alibi_scaling=None,
|
||||
architectures=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.n_inner = n_inner
|
||||
self.activation_function = activation_function
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.scale_attn_weights = scale_attn_weights
|
||||
self.use_cache = use_cache
|
||||
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
|
||||
self.reorder_and_upcast_attn = reorder_and_upcast_attn
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.mup_width_scale = mup_width_scale
|
||||
self.mup_embeddings_scale = mup_embeddings_scale
|
||||
self.mup_output_alpha = mup_output_alpha
|
||||
self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
|
||||
|
||||
self.alibi_scaling = alibi_scaling
|
||||
self._alibi_scaling_validation()
|
||||
if architectures is None:
|
||||
architectures = ["JAISLMHeadModel"]
|
||||
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
architectures=architectures,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _alibi_scaling_validation(self):
|
||||
"""
|
||||
Validate the `alibi_scaling` configuration.
|
||||
"""
|
||||
if self.alibi_scaling is None:
|
||||
return
|
||||
|
||||
if (not isinstance(self.alibi_scaling, dict)
|
||||
or len(self.alibi_scaling) != 2):
|
||||
raise ValueError(
|
||||
"`alibi_scaling` must be a dictionary with two fields, "
|
||||
"`type` and `factor` or `type` and `train_seq_len`, "
|
||||
f"got {self.alibi_scaling}")
|
||||
alibi_scaling_type = self.alibi_scaling.get("type", None)
|
||||
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
|
||||
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
|
||||
if alibi_scaling_type is None or alibi_scaling_type != "linear":
|
||||
raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
|
||||
f"got {alibi_scaling_type}")
|
||||
if (alibi_scaling_factor is not None
|
||||
and not isinstance(alibi_scaling_factor, float)
|
||||
or (alibi_scaling_factor is not None
|
||||
and alibi_scaling_factor <= 1.0)):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s factor field must be a float > 1.0, "
|
||||
f"got {alibi_scaling_factor}")
|
||||
if (alibi_dynamic_scaling is not None
|
||||
and not isinstance(alibi_dynamic_scaling, int)
|
||||
or (alibi_dynamic_scaling is not None
|
||||
and alibi_dynamic_scaling <= 1)):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s `train_seq_len` field must be an "
|
||||
f"integer > 1, got {alibi_dynamic_scaling}")
|
||||
37
vllm/transformers_utils/configs/kimi_vl.py
Normal file
37
vllm/transformers_utils/configs/kimi_vl.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
|
||||
|
||||
class KimiVLConfig(PretrainedConfig):
|
||||
model_type = "kimi_vl"
|
||||
|
||||
def __init__(self,
|
||||
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
|
||||
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
|
||||
ignore_index: int = -100,
|
||||
media_placeholder_token_id: int = 163605,
|
||||
pad_token_id: int = 0,
|
||||
**kwargs):
|
||||
if vision_config is None:
|
||||
vision_config = MoonViTConfig()
|
||||
elif isinstance(vision_config, dict):
|
||||
vision_config = MoonViTConfig(**vision_config)
|
||||
self.vision_config = vision_config
|
||||
|
||||
if text_config is None:
|
||||
text_config = DeepseekV2Config()
|
||||
elif isinstance(text_config, dict):
|
||||
text_config = DeepseekV2Config(**text_config)
|
||||
self.text_config = text_config
|
||||
|
||||
self.ignore_index = ignore_index
|
||||
self.media_placeholder_token_id = media_placeholder_token_id
|
||||
|
||||
super().__init__(pad_token_id=pad_token_id, **kwargs)
|
||||
63
vllm/transformers_utils/configs/medusa.py
Normal file
63
vllm/transformers_utils/configs/medusa.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MedusaConfig(PretrainedConfig):
|
||||
model_type = "medusa"
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int = 4096,
|
||||
vocab_size: int = 32001,
|
||||
num_heads: int = 5,
|
||||
num_hidden_layers: int = 1,
|
||||
max_paths: int = 64,
|
||||
topk: int = 10,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
**kwargs):
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.vocab_size = vocab_size
|
||||
self.num_heads = num_heads
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_paths = max_paths
|
||||
self.topk = topk
|
||||
self.max_seq_len = int(2**20)
|
||||
self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
|
||||
else truncated_vocab_size
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["MedusaModel"]
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
**kwargs,
|
||||
) -> "MedusaConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
for k in list(config_dict.keys()):
|
||||
if 'num' in k:
|
||||
if 'heads' in k:
|
||||
config_dict["num_heads"] = config_dict.pop(k)
|
||||
elif 'layers' in k:
|
||||
config_dict["num_hidden_layers"] = config_dict.pop(k)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return 0
|
||||
|
||||
@property
|
||||
def num_lookahead_tokens(self):
|
||||
return self.num_heads
|
||||
|
||||
@num_lookahead_tokens.setter
|
||||
def num_lookahead_tokens(self, num_lookahead_tokens: int):
|
||||
self.num_heads = num_lookahead_tokens
|
||||
101
vllm/transformers_utils/configs/midashenglm.py
Normal file
101
vllm/transformers_utils/configs/midashenglm.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
|
||||
# Copyright 2024 The Qwen team.
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
|
||||
Qwen2_5OmniTextConfig)
|
||||
|
||||
|
||||
class DashengConfig(PretrainedConfig):
|
||||
model_type = "midashenglm_dasheng_encoder"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dim: int = 768,
|
||||
outputdim: int = 527,
|
||||
patch_size: Union[int, tuple[int, int]] = 16,
|
||||
patch_stride: Union[int, tuple[int, int]] = 16,
|
||||
input_channels: int = 1,
|
||||
target_length: int = 1012,
|
||||
depth: int = 12,
|
||||
num_heads: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
qkv_bias: bool = True,
|
||||
init_values: Optional[float] = None,
|
||||
drop_rate: float = 0.0,
|
||||
attn_drop_rate: float = 0.0,
|
||||
f_min: float = 0.0,
|
||||
f_max: float = 8000.0,
|
||||
center: bool = True,
|
||||
win_length: int = 512,
|
||||
hop_length: int = 160,
|
||||
sample_rate: int = 16000,
|
||||
n_fft: int = 512,
|
||||
n_mels: int = 64,
|
||||
**kwargs,
|
||||
):
|
||||
self.embed_dim = embed_dim
|
||||
self.outputdim = outputdim
|
||||
self.patch_size = patch_size
|
||||
self.patch_stride = patch_stride
|
||||
self.input_channels = input_channels
|
||||
self.target_length = target_length
|
||||
self.depth = depth
|
||||
self.num_heads = num_heads
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.qkv_bias = qkv_bias
|
||||
self.init_values = init_values
|
||||
self.drop_rate = drop_rate
|
||||
self.attn_drop_rate = attn_drop_rate
|
||||
self.f_min = f_min
|
||||
self.f_max = f_max
|
||||
self.center = center
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.sample_rate = sample_rate
|
||||
self.n_fft = n_fft
|
||||
self.n_mels = n_mels
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MiDashengLMConfig(PretrainedConfig):
|
||||
model_type = "midashenglm"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_encoder_config: Optional[dict] = None,
|
||||
subsample_factor: int = 5,
|
||||
text_config: Optional[dict] = None,
|
||||
audio_token_id: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.audio_encoder_config = DashengConfig(
|
||||
**(audio_encoder_config or {}))
|
||||
self.subsample_factor = subsample_factor
|
||||
self.text_config = (Qwen2_5OmniTextConfig(
|
||||
**text_config) if text_config else Qwen2_5OmniTextConfig())
|
||||
self.text_config.rope_scaling = None # uses_mrope is false
|
||||
self.audio_token_id = audio_token_id
|
||||
super().__init__(**kwargs)
|
||||
165
vllm/transformers_utils/configs/mistral.py
Normal file
165
vllm/transformers_utils/configs/mistral.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers import PretrainedConfig, WhisperConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def adapt_config_dict(config_dict: dict[str, Any],
|
||||
**kwargs) -> PretrainedConfig:
|
||||
config_dict.update(kwargs)
|
||||
config_dict = _remap_general_mistral_args(config_dict)
|
||||
|
||||
if bool(config_dict.get("quantization")):
|
||||
config_dict = _remap_mistral_quantization_args(config_dict)
|
||||
|
||||
if bool(config_dict.get("moe")):
|
||||
config_dict["architectures"] = ["MixtralForCausalLM"]
|
||||
else:
|
||||
config_dict["architectures"] = ["MistralForCausalLM"]
|
||||
|
||||
if bool(config_dict.get("yarn")):
|
||||
config_dict = _remap_mistral_yarn_args(config_dict)
|
||||
|
||||
is_vision = ((config_dict.get("multimodal")
|
||||
or {}).get("vision_encoder_args")
|
||||
or config_dict.get("vision_encoder"))
|
||||
is_audio = bool(
|
||||
((config_dict.get("multimodal") or {}).get("whisper_model_args")
|
||||
or {}).get("encoder_args"))
|
||||
|
||||
assert not (is_vision and is_audio), \
|
||||
"Vision and audio are mutually exclusive"
|
||||
|
||||
if is_vision:
|
||||
config_dict = _remap_mistral_vision_args(config_dict)
|
||||
if is_audio:
|
||||
config_dict = _remap_mistral_audio_args(config_dict)
|
||||
|
||||
config = PretrainedConfig.from_dict(config_dict)
|
||||
|
||||
logger.debug("Initialized config %s", config)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_vision_args(config: dict) -> dict:
|
||||
if config.get("multimodal"):
|
||||
vision_config = config.pop("multimodal")
|
||||
else:
|
||||
vision_config = config.pop("vision_encoder")
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type": "pixtral",
|
||||
"architectures": ["PixtralForConditionalGeneration"],
|
||||
"text_config": PretrainedConfig.from_dict(config),
|
||||
"vision_config": PretrainedConfig.from_dict(vision_config),
|
||||
}
|
||||
if quant_config:
|
||||
config["quantization_config"] = quant_config
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_yarn_args(config: dict) -> dict:
|
||||
# Direct remaps: yarn.X -> rope_scaling.Y
|
||||
# Source keys are from mistral.model.args.YarnArgs
|
||||
_map = {
|
||||
"beta": "beta_fast",
|
||||
"alpha": "beta_slow",
|
||||
}
|
||||
yarn_config = config.get("yarn") or {}
|
||||
renamed_yarn_config = {_map.get(k, k): v for k, v in yarn_config.items()}
|
||||
config["rope_scaling"] = {
|
||||
"rope_type": "yarn",
|
||||
"mscale_all_dim": 1, # We hardcoded this to 1
|
||||
**renamed_yarn_config
|
||||
}
|
||||
return config
|
||||
|
||||
|
||||
def _remap_general_mistral_args(config: dict) -> dict:
|
||||
# Mistral key -> HF key
|
||||
config_mapping = {
|
||||
"dim": "hidden_size",
|
||||
"norm_eps": "rms_norm_eps",
|
||||
"n_kv_heads": "num_key_value_heads",
|
||||
"n_layers": "num_hidden_layers",
|
||||
"n_heads": "num_attention_heads",
|
||||
"hidden_dim": "intermediate_size",
|
||||
}
|
||||
# HF key -> (Mistral key, default value)
|
||||
top_level_mapping_with_default = {
|
||||
"model_type": ("model_type", "transformer"),
|
||||
"hidden_act": ("activation", "silu"),
|
||||
"tie_word_embeddings": ("tied_embeddings", False),
|
||||
"max_seq_len": ("max_seq_len", 128_000),
|
||||
"max_position_embeddings": ("max_position_embeddings", 128_000),
|
||||
}
|
||||
|
||||
for key, new_key in config_mapping.items():
|
||||
if key in config:
|
||||
config[new_key] = config.pop(key)
|
||||
|
||||
for new_key, (key,
|
||||
default_value) in top_level_mapping_with_default.items():
|
||||
config[new_key] = config.pop(key, default_value)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_quantization_args(config: dict) -> dict:
|
||||
quantization = config.get("quantization", {})
|
||||
if quantization.get("qformat_weight") == "fp8_e4m3":
|
||||
# This maps to the FP8 static per-tensor quantization scheme
|
||||
quantization_config = {
|
||||
"quant_method": "fp8",
|
||||
"activation_scheme": "static"
|
||||
}
|
||||
elif quantization.get("quant_method") == "compressed-tensors":
|
||||
# Pass through the quantization config to compressed-tensors
|
||||
quantization_config = quantization
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Found unknown quantization='{quantization}' in config")
|
||||
|
||||
config["quantization_config"] = quantization_config
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
whisper_args = config["multimodal"].pop("whisper_model_args")
|
||||
encoder_args = whisper_args["encoder_args"]
|
||||
downsample_args = whisper_args["downsample_args"]
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type":
|
||||
"whixtral",
|
||||
"architectures": ["VoxtralForConditionalGeneration"],
|
||||
"text_config":
|
||||
PretrainedConfig.from_dict(config),
|
||||
"audio_config":
|
||||
WhisperConfig(
|
||||
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
||||
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
||||
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
||||
hop_length=encoder_args["audio_encoding_args"]["hop_length"],
|
||||
downsample_factor=downsample_args["downsample_factor"],
|
||||
d_model=encoder_args["dim"],
|
||||
encoder_layers=encoder_args["n_layers"],
|
||||
encoder_ffn_dim=encoder_args["hidden_dim"],
|
||||
encoder_attention_heads=encoder_args["n_heads"],
|
||||
vocab_size=encoder_args["vocab_size"],
|
||||
max_source_positions=encoder_args["max_source_positions"],
|
||||
is_encoder_decoder=False, # Override WhisperConfig default
|
||||
)
|
||||
}
|
||||
if quant_config:
|
||||
config["quantization_config"] = quant_config
|
||||
return config
|
||||
68
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
68
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MLPSpeculatorConfig(PretrainedConfig):
|
||||
model_type = "mlp_speculator"
|
||||
|
||||
attribute_map = {
|
||||
"hidden_size": "emb_dim",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 32000,
|
||||
emb_dim: int = 4096,
|
||||
inner_dim: int = 0,
|
||||
n_predict: int = 3,
|
||||
top_k_tokens_per_head: Optional[list[int]] = None,
|
||||
n_candidates: int = 5,
|
||||
tie_weights: bool = False,
|
||||
scale_input: bool = False,
|
||||
**kwargs):
|
||||
"""
|
||||
Initialize an MLPSpeculatorConfig
|
||||
|
||||
Args:
|
||||
vocab_size: int
|
||||
the model vocab size
|
||||
emb_dim: int
|
||||
the model embedding dimension
|
||||
inner_dim: int
|
||||
the inner dimension of the model. If 0, will be the emb_dim.
|
||||
n_predict: int
|
||||
the number of lookaheads for the speculator
|
||||
top_k_tokens_per_head: list[int]
|
||||
Number of tokens to consider from each head when forming the
|
||||
candidate tree.
|
||||
For each candidate branch in the tree, head n produces topk[n]
|
||||
additional sub-branches.
|
||||
NOTE: This parameter is currently unused.
|
||||
n_candidates: int
|
||||
number of child candidates to create per sequence
|
||||
tie_weights: bool
|
||||
If true, use a single set of weights for every model
|
||||
head/stage after the first. The initial projection
|
||||
from the base model may have a different size, so that
|
||||
stays separate.
|
||||
scale_input: bool
|
||||
if True, will scale the initial hidden states from
|
||||
the base model.
|
||||
"""
|
||||
if top_k_tokens_per_head is None:
|
||||
top_k_tokens_per_head = [5, 4, 3]
|
||||
assert len(top_k_tokens_per_head) == n_predict
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.inner_dim = inner_dim
|
||||
self.n_predict = n_predict
|
||||
self.top_k_tokens_per_head = top_k_tokens_per_head
|
||||
self.n_candidates = n_candidates
|
||||
self.num_lookahead_tokens = n_predict
|
||||
self.tie_weights = tie_weights
|
||||
self.scale_input = scale_input
|
||||
|
||||
super().__init__(**kwargs)
|
||||
33
vllm/transformers_utils/configs/moonvit.py
Normal file
33
vllm/transformers_utils/configs/moonvit.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class MoonViTConfig(PretrainedConfig):
|
||||
model_type = "moonvit"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
init_pos_emb_height: int = 64,
|
||||
init_pos_emb_width: int = 64,
|
||||
num_attention_heads: int = 16,
|
||||
num_hidden_layers: int = 27,
|
||||
hidden_size: int = 1152,
|
||||
intermediate_size: int = 4304,
|
||||
merge_kernel_size: tuple[int, int] = (2, 2),
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.patch_size = patch_size
|
||||
# Positional embedding config
|
||||
self.init_pos_emb_height = init_pos_emb_height
|
||||
self.init_pos_emb_width = init_pos_emb_width
|
||||
# Transformer config
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
# Patch merger config
|
||||
self.merge_kernel_size = merge_kernel_size
|
||||
205
vllm/transformers_utils/configs/nemotron.py
Normal file
205
vllm/transformers_utils/configs/nemotron.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Nemotron model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronModel`]. It is used to instantiate a Nemotron model
|
||||
according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar
|
||||
configuration to that of the Nemotron-8B.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be
|
||||
used to control the model outputs. Read the documentation from
|
||||
[`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 256000):
|
||||
Vocabulary size of the Nemotron model. Defines the number of
|
||||
different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`NemotronModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 6144):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 24576):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer decoder.
|
||||
head_dim (`int`, *optional*):
|
||||
Projection weights dimension in multi-head attention. Set to
|
||||
hidden_size // num_attention_heads if None
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention
|
||||
(MQA) otherwise GQA is used. When converting a multi-head
|
||||
checkpoint to a GQA checkpoint, each group key and value
|
||||
head should be constructed by meanpooling all the original
|
||||
heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
|
||||
is not specified, will default to `num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
The non-linear activation function (function or string) in the
|
||||
decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.0134):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 2):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 3):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
|
||||
Percentage of the query and keys which will have rotary embedding.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output
|
||||
projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj and down_proj layers in the MLP
|
||||
layers.
|
||||
|
||||
```python
|
||||
>>> from transformers import NemotronModel, NemotronConfig
|
||||
>>> # Initializing a Nemotron nemotron-15b style configuration
|
||||
>>> configuration = NemotronConfig()
|
||||
>>> # Initializing a model from the nemotron-15b style configuration
|
||||
>>> model = NemotronModel(configuration)
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "nemotron"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=256000,
|
||||
hidden_size=6144,
|
||||
intermediate_size=24576,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=48,
|
||||
head_dim=None,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="relu2",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.0134,
|
||||
norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=2,
|
||||
eos_token_id=3,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
partial_rotary_factor=0.5,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
head_dim = head_dim or kwargs.get("kv_channels")
|
||||
self.head_dim = head_dim if head_dim is not None else (
|
||||
hidden_size // num_attention_heads)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.norm_eps = norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
# for backward compatibility
|
||||
partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
|
||||
"rope_percentage") or partial_rotary_factor
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(
|
||||
self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields, "
|
||||
f"`type` and `factor`, got {self.rope_scaling}")
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in [
|
||||
"linear", "dynamic"
|
||||
]:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s type field must be one of ['linear', "
|
||||
f"'dynamic'], got {rope_scaling_type}")
|
||||
if rope_scaling_factor is None or not isinstance(
|
||||
rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor field must be a float > 1, got "
|
||||
f"{rope_scaling_factor}")
|
||||
259
vllm/transformers_utils/configs/nemotron_h.py
Normal file
259
vllm/transformers_utils/configs/nemotron_h.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""NemotronH model configuration"""
|
||||
|
||||
import regex as re
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronHConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronHModel`]. It is used to instantiate a NemotronH model according
|
||||
to the specified arguments, defining the model architecture. Instantiating
|
||||
a configuration with the defaults will yield a similar configuration to
|
||||
that of the NemotronH-v0.1 model.
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 131072):
|
||||
Vocabulary size of the NemotronH model. Defines the number of
|
||||
different tokens that can be represented by the `inputs_ids`
|
||||
passed when calling [`NemotronHModel`]
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be
|
||||
tied. Note that this is only relevant if the model has an output
|
||||
word embedding layer.
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 21504):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 52):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
hybrid_override_pattern (`str`, *optional*, defaults to
|
||||
`"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
|
||||
The pattern of the hybrid model. The pattern is a string of
|
||||
characters where each character represents
|
||||
M: Mamba2, *: Attention, -: MLP
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer encoder.
|
||||
attention_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each attention head.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if `num_key_value_heads=1` the model
|
||||
will use Multi Query Attention (MQA) otherwise GQA is used.
|
||||
mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
|
||||
The non-linear activation function in the MLP layers.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in attention layers.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in MLP layers.
|
||||
use_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the model.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
residual_in_fp32 (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not residuals should be in `float32`. If set to `False`
|
||||
residuals will keep the same `dtype` as the rest of the model.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
|
||||
Number of prompt logits to calculate during generation. If `None`,
|
||||
all logits will be calculated. If an integer value, only last
|
||||
`num_logits_to_keep` logits will be calculated.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
sliding_window (`int`, *optional*, defaults to None):
|
||||
Sliding window attention window size.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
hidden_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the hidden states.
|
||||
use_mamba_kernels (`bool`, *optional*, defaults to `True`):
|
||||
Flag indicating whether or not to use the fast mamba kernels.
|
||||
These are available only if `mamba-ssm` and `causal-conv1d`
|
||||
are installed, and the mamba modules are running on a CUDA device.
|
||||
ssm_state_size (`int`, *optional*, defaults to 128):
|
||||
The dimension of the mamba state space latents.
|
||||
mamba_num_heads (`int`, *optional*, defaults to 128):
|
||||
Number of heads in Mamba layers.
|
||||
mamba_n_groups (`int`, *optional*, defaults to 8):
|
||||
Number of groups in Mamba layers.
|
||||
mamba_head_dim (`int`, *optional*, defaults to 64):
|
||||
Dimension of each Mamba head.
|
||||
mamba_d_conv (`int`, *optional*, defaults to 4):
|
||||
The size of the mamba convolution kernel.
|
||||
mamba_expand (`int`, *optional*, defaults to 2):
|
||||
Expanding factor used to determine the mamba intermediate size.
|
||||
mamba_hidden_act (`str`, *optional*, defaults to "silu"):
|
||||
The non-linear activation function in the Mamba layers.
|
||||
mamba_dt_min (`float`, *optional*, defaults to 0.001):
|
||||
Minimum value for the time step in Mamba.
|
||||
mamba_dt_max (`float`, *optional*, defaults to 0.1):
|
||||
Maximum value for the time step in Mamba.
|
||||
mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
|
||||
Limits for the time step in Mamba.
|
||||
mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
|
||||
Floor value for time step initialization in Mamba.
|
||||
mamba_conv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use bias in the convolution layer of the mamba mixer
|
||||
block.
|
||||
mamba_proj_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the input and output projections of the
|
||||
mamba mixer block.
|
||||
mamba_chunk_size (`int`, *optional*, defaults to 256):
|
||||
Size of chunks for Mamba processing.
|
||||
rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the pre-normalization residual connections.
|
||||
"""
|
||||
|
||||
model_type = "nemotron_h"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=131072,
|
||||
tie_word_embeddings=False,
|
||||
hidden_size=4096,
|
||||
intermediate_size=21504,
|
||||
num_hidden_layers=52,
|
||||
hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
|
||||
num_attention_heads=32,
|
||||
head_dim=128,
|
||||
num_key_value_heads=8, # nemo: num_query_groups
|
||||
mlp_hidden_act="relu2",
|
||||
attention_bias=False,
|
||||
mlp_bias=False,
|
||||
use_bias=False,
|
||||
initializer_range=0.02, # nemo: init_method_std
|
||||
layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
|
||||
residual_in_fp32=False, # Megatron Core default value
|
||||
use_cache=True,
|
||||
num_logits_to_keep=1,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
sliding_window=None,
|
||||
max_position_embeddings=4096,
|
||||
attention_dropout=0.0,
|
||||
hidden_dropout=0.0, # * ADDED
|
||||
use_mamba_kernels=True,
|
||||
ssm_state_size=128, # mamba_state_size
|
||||
mamba_num_heads=128,
|
||||
mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads
|
||||
mamba_head_dim=64,
|
||||
mamba_d_conv=4,
|
||||
mamba_expand=2,
|
||||
mamba_hidden_act="silu",
|
||||
mamba_dt_min=0.001,
|
||||
mamba_dt_max=0.1,
|
||||
mamba_dt_limit=(0.0, float("inf")),
|
||||
mamba_dt_init_floor=1e-4,
|
||||
mamba_conv_bias=True,
|
||||
mamba_proj_bias=False,
|
||||
mamba_chunk_size=256,
|
||||
rescale_prenorm_residual=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.tie_word_embeddings = tie_word_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.hybrid_override_pattern = hybrid_override_pattern
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
self.sliding_window = sliding_window
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.attention_dropout = attention_dropout
|
||||
self.hidden_dropout = hidden_dropout
|
||||
|
||||
# Validate hybrid_override_pattern
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
|
||||
"hybrid_override_pattern must have same length as "
|
||||
"num_hidden_layers")
|
||||
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
|
||||
"hybrid_override_pattern must only contain characters "
|
||||
"'M', '*', or '-'")
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.mlp_hidden_act = mlp_hidden_act
|
||||
self.attention_bias = attention_bias
|
||||
self.mlp_bias = mlp_bias
|
||||
self.use_bias = use_bias
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.residual_in_fp32 = residual_in_fp32
|
||||
|
||||
self.use_cache = use_cache
|
||||
self.num_logits_to_keep = num_logits_to_keep
|
||||
|
||||
self.use_mamba_kernels = use_mamba_kernels
|
||||
self.n_groups = mamba_n_groups
|
||||
self.mamba_head_dim = mamba_head_dim
|
||||
self.ssm_state_size = ssm_state_size
|
||||
self.mamba_num_heads = mamba_num_heads
|
||||
self.conv_kernel = mamba_d_conv
|
||||
self.expand = mamba_expand
|
||||
self.mamba_hidden_act = mamba_hidden_act
|
||||
self.time_step_min = mamba_dt_min
|
||||
self.time_step_max = mamba_dt_max
|
||||
self.time_step_limit = mamba_dt_limit
|
||||
self.time_step_floor = mamba_dt_init_floor
|
||||
self.use_conv_bias = mamba_conv_bias
|
||||
self.mamba_proj_bias = mamba_proj_bias
|
||||
self.chunk_size = mamba_chunk_size
|
||||
self.rescale_prenorm_residual = rescale_prenorm_residual
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def layers_block_type(self):
|
||||
return [
|
||||
"mamba" if self.hybrid_override_pattern[i] == "M" else
|
||||
"attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
56
vllm/transformers_utils/configs/nemotron_vl.py
Normal file
56
vllm/transformers_utils/configs/nemotron_vl.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# Adapted from
|
||||
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
|
||||
# --------------------------------------------------------
|
||||
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
|
||||
# LICENSE is in incl_licenses directory.
|
||||
# --------------------------------------------------------
|
||||
|
||||
from transformers import LlamaConfig
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
||||
|
||||
|
||||
class Nemotron_Nano_VL_Config(PretrainedConfig):
|
||||
model_type = 'Llama_Nemotron_Nano_VL'
|
||||
is_composition = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_config=None,
|
||||
llm_config=None,
|
||||
force_image_size=None,
|
||||
downsample_ratio=0.5,
|
||||
template=None,
|
||||
ps_version='v1',
|
||||
image_tag_type="internvl",
|
||||
projector_hidden_size=4096,
|
||||
vit_hidden_size=1280,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if vision_config is not None:
|
||||
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
|
||||
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
|
||||
self.vision_config = vision_auto_config(**vision_config)
|
||||
else:
|
||||
self.vision_config = PretrainedConfig()
|
||||
|
||||
if llm_config is None:
|
||||
self.text_config = LlamaConfig()
|
||||
else:
|
||||
self.text_config = LlamaConfig(**llm_config)
|
||||
|
||||
# Assign configuration values
|
||||
self.force_image_size = force_image_size
|
||||
self.downsample_ratio = downsample_ratio
|
||||
self.template = template # TODO move out of here and into the tokenizer
|
||||
self.ps_version = ps_version # Pixel shuffle version
|
||||
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
|
||||
self.projector_hidden_size = projector_hidden_size
|
||||
self.vit_hidden_size = vit_hidden_size
|
||||
80
vllm/transformers_utils/configs/olmo3.py
Normal file
80
vllm/transformers_utils/configs/olmo3.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Olmo3Config(PretrainedConfig):
|
||||
|
||||
model_type = "olmo3"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50304,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=None,
|
||||
eos_token_id=50279,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
rms_norm_eps=1e-5,
|
||||
sliding_window=4096,
|
||||
layer_types=None,
|
||||
**kwargs,
|
||||
):
|
||||
# This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
|
||||
# in vLLM.
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["Olmo2ForCausalLM"]
|
||||
elif "Olmo3ForCausalLM" in kwargs["architectures"]:
|
||||
kwargs["architectures"].remove("Olmo3ForCausalLM")
|
||||
kwargs["architectures"].append("Olmo2ForCausalLM")
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.sliding_window = sliding_window
|
||||
self.layer_types = layer_types
|
||||
if self.layer_types is None:
|
||||
self.layer_types = [
|
||||
"sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
176
vllm/transformers_utils/configs/ovis.py
Normal file
176
vllm/transformers_utils/configs/ovis.py
Normal file
@@ -0,0 +1,176 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||
# Ovis Config with AimV2 config registration removed for Transformers compatibility
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
|
||||
class AIMv2Config(PretrainedConfig):
|
||||
"""This is the configuration class to store the configuration of an [`AIMv2Model`].
|
||||
Instantiating a configuration with the defaults will yield a similar configuration
|
||||
to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
|
||||
Args:
|
||||
hidden_size: Dimension of the hidden representations.
|
||||
intermediate_size: Dimension of the SwiGLU representations.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer.
|
||||
num_attention_heads: Number of attention heads for each attention layer
|
||||
in the Transformer.
|
||||
num_channels: Number of input channels.
|
||||
image_size: Image size.
|
||||
patch_size: Patch size.
|
||||
rms_norm_eps: Epsilon value used for the RMS normalization layer.
|
||||
attention_dropout: Dropout ratio for attention probabilities.
|
||||
projection_dropout: Dropout ratio for the projection layer after the attention.
|
||||
qkv_bias: Whether to add a bias to the queries, keys and values.
|
||||
use_bias: Whether to add a bias in the feed-forward and projection layers.
|
||||
kwargs: Keyword arguments for the [`PretrainedConfig`].
|
||||
"""
|
||||
|
||||
model_type: str = "aimv2"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 1024,
|
||||
intermediate_size: int = 2816,
|
||||
num_hidden_layers: int = 24,
|
||||
num_attention_heads: int = 8,
|
||||
num_channels: int = 3,
|
||||
image_size: int = 224,
|
||||
patch_size: int = 14,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
attention_dropout: float = 0.0,
|
||||
projection_dropout: float = 0.0,
|
||||
qkv_bias: bool = False,
|
||||
use_bias: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.attention_dropout = attention_dropout
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
|
||||
self.projection_dropout = projection_dropout
|
||||
self.qkv_bias = qkv_bias
|
||||
self.use_bias = use_bias
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Visual Tokenizer Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class BaseVisualTokenizerConfig(PretrainedConfig):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size=16384,
|
||||
tokenize_function="softmax",
|
||||
tau=1.0,
|
||||
depths=None,
|
||||
drop_cls_token=False,
|
||||
backbone_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
hidden_stride: int = 1,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.tokenize_function = tokenize_function
|
||||
self.tau = tau
|
||||
if isinstance(depths, str):
|
||||
depths = [int(x) for x in depths.split('|')]
|
||||
self.depths = depths
|
||||
self.backbone_kwargs = dict[str, Any]()
|
||||
self.drop_cls_token = drop_cls_token
|
||||
if backbone_config is not None:
|
||||
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
|
||||
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
|
||||
if not isinstance(backbone_config, PretrainedConfig):
|
||||
model_type = backbone_config['model_type']
|
||||
if model_type != "aimv2":
|
||||
backbone_config.pop('model_type')
|
||||
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
|
||||
else:
|
||||
backbone_config = AIMv2Config(**backbone_config)
|
||||
self.backbone_config = backbone_config
|
||||
self.hidden_stride = hidden_stride
|
||||
|
||||
|
||||
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "aimv2_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
|
||||
|
||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
|
||||
model_type = "siglip_visual_tokenizer"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if self.drop_cls_token:
|
||||
self.drop_cls_token = False
|
||||
if self.depths:
|
||||
assert len(self.depths) == 1
|
||||
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
|
||||
|
||||
|
||||
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
|
||||
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Ovis Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class OvisConfig(PretrainedConfig):
|
||||
model_type = "ovis"
|
||||
|
||||
def __init__(self,
|
||||
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
|
||||
visual_tokenizer_config: Optional[Union[PretrainedConfig,
|
||||
dict]] = None,
|
||||
multimodal_max_length=8192,
|
||||
hidden_size=None,
|
||||
conversation_formatter_class=None,
|
||||
llm_attn_implementation=None,
|
||||
disable_tie_weight=False,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
if llm_config is not None:
|
||||
assert isinstance(llm_config, (PretrainedConfig, dict)), \
|
||||
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
|
||||
if not isinstance(llm_config, PretrainedConfig):
|
||||
model_type = llm_config['model_type']
|
||||
llm_config.pop('model_type')
|
||||
llm_config = AutoConfig.for_model(model_type, **llm_config)
|
||||
|
||||
# map llm_config to text_config
|
||||
self.text_config = llm_config
|
||||
if visual_tokenizer_config is not None:
|
||||
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
|
||||
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
|
||||
if not isinstance(visual_tokenizer_config, PretrainedConfig):
|
||||
model_type = visual_tokenizer_config['model_type']
|
||||
visual_tokenizer_config.pop('model_type')
|
||||
visual_tokenizer_config = AutoConfig.for_model(
|
||||
model_type, **visual_tokenizer_config)
|
||||
|
||||
self.visual_tokenizer_config = visual_tokenizer_config
|
||||
self.multimodal_max_length = multimodal_max_length
|
||||
self.hidden_size = hidden_size
|
||||
self.conversation_formatter_class = conversation_formatter_class
|
||||
self.llm_attn_implementation = llm_attn_implementation
|
||||
self.disable_tie_weight = disable_tie_weight
|
||||
275
vllm/transformers_utils/configs/qwen3_next.py
Normal file
275
vllm/transformers_utils/configs/qwen3_next.py
Normal file
@@ -0,0 +1,275 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Qwen3-Next model configuration"""
|
||||
|
||||
from transformers.configuration_utils import (PretrainedConfig,
|
||||
layer_type_validation)
|
||||
from transformers.modeling_rope_utils import rope_config_validation
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Qwen3NextConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
|
||||
Qwen3-Next model according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of
|
||||
Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 151936):
|
||||
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids`.
|
||||
hidden_size (`int`, *optional*, defaults to 2048):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 5632):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 48):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 2):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||
hidden_act (`str`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 32768):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
|
||||
'llama3'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
|
||||
pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
|
||||
computation. If unspecified, it defaults to value recommended by the implementation, using the
|
||||
`factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
`short_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`long_factor` (`List[float]`, *optional*):
|
||||
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
|
||||
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
|
||||
size divided by the number of attention heads divided by 2
|
||||
`low_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
|
||||
`high_freq_factor` (`float`, *optional*):
|
||||
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
|
||||
partial_rotary_factor (`float`, *optional*, defaults to 0.25):
|
||||
Percentage of the query and keys which will have rotary embedding.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
head_dim (`int`, *optional*, defaults to 256):
|
||||
Projection weights dimension in multi-head attention.
|
||||
linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
|
||||
Kernel size of the convolution used in linear attention layers.
|
||||
linear_key_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each key head in linear attention.
|
||||
linear_value_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each value head in linear attention.
|
||||
linear_num_key_heads (`int`, *optional*, defaults to 16):
|
||||
Number of key heads used in linear attention layers.
|
||||
linear_num_value_heads (`int`, *optional*, defaults to 32):
|
||||
Number of value heads used in linear attention layers.
|
||||
decoder_sparse_step (`int`, *optional*, defaults to 1):
|
||||
The frequency of the MoE layer.
|
||||
moe_intermediate_size (`int`, *optional*, defaults to 512):
|
||||
Intermediate size of the routed expert.
|
||||
shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
|
||||
Intermediate size of the shared expert.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 10):
|
||||
Number of selected experts.
|
||||
num_experts (`int`, *optional*, defaults to 512):
|
||||
Number of routed experts.
|
||||
norm_topk_prob (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the topk probabilities.
|
||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the router logits should be returned by the model. Enabling this will also
|
||||
allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
|
||||
Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
|
||||
The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
|
||||
If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
|
||||
layer_types (`list[str]`, *optional*):
|
||||
Types of each layer (attention or linear).
|
||||
|
||||
```python
|
||||
>>> from transformers import Qwen3NextModel, Qwen3NextConfig
|
||||
|
||||
>>> # Initializing a Qwen3Next style configuration
|
||||
>>> configuration = Qwen3NextConfig()
|
||||
|
||||
>>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
|
||||
>>> model = Qwen3NextModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
""" # noqa: E501
|
||||
|
||||
model_type = "qwen3_next"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.experts.*.gate_proj": "colwise",
|
||||
"layers.*.mlp.experts.*.up_proj": "colwise",
|
||||
"layers.*.mlp.experts.*.down_proj": "rowwise",
|
||||
"layers.*.mlp.shared_experts.gate_proj": "colwise",
|
||||
"layers.*.mlp.shared_experts.up_proj": "colwise",
|
||||
"layers.*.mlp.shared_experts.down_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=151936,
|
||||
hidden_size=2048,
|
||||
intermediate_size=5632,
|
||||
num_hidden_layers=48,
|
||||
num_attention_heads=16,
|
||||
num_key_value_heads=2,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=32768,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
partial_rotary_factor=0.25,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
head_dim=256,
|
||||
linear_conv_kernel_dim=4,
|
||||
linear_key_head_dim=128,
|
||||
linear_value_head_dim=128,
|
||||
linear_num_key_heads=16,
|
||||
linear_num_value_heads=32,
|
||||
decoder_sparse_step=1,
|
||||
moe_intermediate_size=512,
|
||||
shared_expert_intermediate_size=512,
|
||||
num_experts_per_tok=10,
|
||||
num_experts=512,
|
||||
norm_topk_prob=True,
|
||||
output_router_logits=False,
|
||||
router_aux_loss_coef=0.001,
|
||||
mlp_only_layers=None,
|
||||
layer_types=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mlp_only_layers is None:
|
||||
mlp_only_layers = []
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.head_dim = head_dim
|
||||
rope_config_validation(self)
|
||||
|
||||
self.layer_types = layer_types
|
||||
if self.layer_types is None:
|
||||
self.layer_types = [
|
||||
"linear_attention" if bool((i + 1) % 4) else "full_attention"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
layer_type_validation(self.layer_types)
|
||||
|
||||
# linear attention part
|
||||
self.linear_conv_kernel_dim = linear_conv_kernel_dim
|
||||
self.linear_key_head_dim = linear_key_head_dim
|
||||
self.linear_value_head_dim = linear_value_head_dim
|
||||
self.linear_num_key_heads = linear_num_key_heads
|
||||
self.linear_num_value_heads = linear_num_value_heads
|
||||
|
||||
# MoE arguments
|
||||
self.decoder_sparse_step = decoder_sparse_step
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.shared_expert_intermediate_size = shared_expert_intermediate_size
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_experts = num_experts
|
||||
self.norm_topk_prob = norm_topk_prob
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.mlp_only_layers = mlp_only_layers
|
||||
|
||||
|
||||
__all__ = ["Qwen3NextConfig"]
|
||||
91
vllm/transformers_utils/configs/radio.py
Normal file
91
vllm/transformers_utils/configs/radio.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Radio vision model configuration"""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
|
||||
"vit_small_patch16_224": (384, 12, 6, 1536),
|
||||
"vit_base_patch16_224": (768, 12, 12, 3072),
|
||||
"vit_large_patch16_224": (1024, 24, 16, 4096),
|
||||
"vit_huge_patch16_224": (1280, 32, 16, 5120),
|
||||
}
|
||||
|
||||
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
|
||||
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
|
||||
|
||||
|
||||
class RadioConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a Radio
|
||||
vision model. It is used to instantiate a Radio model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Args:
|
||||
model_name: Name of the vision transformer model
|
||||
(e.g., "vit_base_patch16_224"). Used to determine architecture
|
||||
dimensions from `VIT_TIMM_DIM_BY_NAME`.
|
||||
image_size: The size (resolution) of each image.
|
||||
patch_size: The size (resolution) of each patch.
|
||||
qkv_bias: Whether to add a bias to the queries, keys and values.
|
||||
qk_normalization: Whether to apply normalization to queries and keys.
|
||||
norm_type: The normalization type to use.
|
||||
layer_norm_eps: The epsilon used by the layer normalization layers.
|
||||
initializer_factor: A factor for initializing all weight matrices.
|
||||
hidden_act: The non-linear activation function in the encoder.
|
||||
max_img_size: Maximum image size for position embeddings.
|
||||
norm_mean: Mean values for image normalization (RGB channels).
|
||||
Defaults to (0.48145466, 0.4578275, 0.40821073)).
|
||||
norm_std: Standard deviation values for image normalization
|
||||
(RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
|
||||
reg_tokens: Number of register tokens to use.
|
||||
"""
|
||||
|
||||
model_type = "radio"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
image_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
qkv_bias: bool = True,
|
||||
qk_normalization: bool = False,
|
||||
norm_type: str = "layer_norm",
|
||||
layer_norm_eps: float = 1e-6,
|
||||
initializer_factor: float = 1.0,
|
||||
hidden_act: str = "gelu",
|
||||
max_img_size: int = 2048,
|
||||
norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
|
||||
norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
|
||||
reg_tokens: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.model_name = model_name
|
||||
(
|
||||
self.hidden_size,
|
||||
self.num_hidden_layers,
|
||||
self.num_attention_heads,
|
||||
self.intermediate_size,
|
||||
) = VIT_TIMM_DIM_BY_NAME[model_name]
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.qkv_bias = qkv_bias
|
||||
self.qk_normalization = qk_normalization
|
||||
self.norm_type = norm_type
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.initializer_factor = initializer_factor
|
||||
self.hidden_act = hidden_act
|
||||
self.max_img_size = max_img_size
|
||||
self.norm_mean = list(norm_mean) if isinstance(norm_mean,
|
||||
(tuple,
|
||||
list)) else norm_mean
|
||||
self.norm_std = list(norm_std) if isinstance(norm_std,
|
||||
(tuple,
|
||||
list)) else norm_std
|
||||
self.reg_tokens = reg_tokens
|
||||
super().__init__(**kwargs)
|
||||
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
32
vllm/transformers_utils/configs/speculators/algos.py
Normal file
32
vllm/transformers_utils/configs/speculators/algos.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
SUPPORTED_SPECULATORS_TYPES = {}
|
||||
|
||||
|
||||
def register_speculator(name):
|
||||
|
||||
def decorator(fn):
|
||||
SUPPORTED_SPECULATORS_TYPES[name] = fn
|
||||
return fn
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@register_speculator("eagle3")
|
||||
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
|
||||
"""
|
||||
Apply Eagle-3 specific configuration transformations.
|
||||
|
||||
Eagle-3 specific fields:
|
||||
- draft_vocab_size: Size of the draft model's vocabulary
|
||||
- target_hidden_size: Hidden size of the target model
|
||||
- norm_before_residual: Whether to apply norm before residual connection
|
||||
"""
|
||||
|
||||
vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
|
||||
if config_dict.get("target_hidden_size") is not None:
|
||||
vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
|
||||
vllm_config["norm_before_residual"] = config_dict.get(
|
||||
"norm_before_residual", True)
|
||||
vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
|
||||
111
vllm/transformers_utils/configs/speculators/base.py
Normal file
111
vllm/transformers_utils/configs/speculators/base.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Any, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.speculators.algos import (
|
||||
SUPPORTED_SPECULATORS_TYPES)
|
||||
|
||||
__all__ = ["SpeculatorsConfig"]
|
||||
|
||||
|
||||
class SpeculatorsConfig(PretrainedConfig):
|
||||
model_type = "speculators"
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
**kwargs,
|
||||
) -> "SpeculatorsConfig":
|
||||
"""Load speculators Eagle config and convert to vLLM format."""
|
||||
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
|
||||
**kwargs)
|
||||
|
||||
vllm_config = cls.extract_vllm_speculative_config(config_dict)
|
||||
return cls(**vllm_config)
|
||||
|
||||
@classmethod
|
||||
def extract_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
|
||||
speculators_model_type = config_dict.get("speculators_model_type")
|
||||
if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
|
||||
raise ValueError(
|
||||
f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
|
||||
"Please ensure you're loading a speculators-format model.")
|
||||
|
||||
# validate fields
|
||||
# TODO: @dsikka - use speculators pydantic model to validate
|
||||
cls.validate_speculators_config(config_dict=config_dict)
|
||||
# Convert from speculators config -> format that can be ingested by vLLM
|
||||
vllm_config = cls.build_vllm_speculative_config(
|
||||
config_dict=config_dict)
|
||||
# Apply anything specific to the supported algorithm
|
||||
algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
|
||||
algo_updater(config_dict=config_dict, vllm_config=vllm_config)
|
||||
return vllm_config
|
||||
|
||||
@classmethod
|
||||
def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
|
||||
try:
|
||||
spec_config = config_dict["speculators_config"]
|
||||
methods = spec_config["proposal_methods"]
|
||||
first_method = methods[0]
|
||||
_ = first_method["speculative_tokens"]
|
||||
_ = spec_config["verifier"]["name_or_path"]
|
||||
_ = config_dict["speculators_model_type"]
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
raise ValueError("Invalid speculators config structure") from e
|
||||
|
||||
if "transformer_layer_config" not in config_dict:
|
||||
raise ValueError("Must provide transformer_layer_config")
|
||||
|
||||
if not isinstance(config_dict["transformer_layer_config"], dict):
|
||||
raise TypeError(
|
||||
"'transformer_layer_config' must be a dictionary if provided")
|
||||
|
||||
@classmethod
|
||||
def build_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Build vLLM-compatible speculative configuration from speculators format.
|
||||
|
||||
This method extracts and transforms speculative configuration from the
|
||||
speculators format into the structure expected by vLLM.
|
||||
|
||||
Args:
|
||||
config_dict: Configuration dictionary in speculators format
|
||||
|
||||
Returns:
|
||||
Dictionary with vLLM-compatible speculative configuration
|
||||
"""
|
||||
# Extract speculators configuration
|
||||
spec_config = config_dict["speculators_config"]
|
||||
|
||||
# Currently we only support one proposal method
|
||||
proposal_methods = spec_config.get("proposal_methods")
|
||||
if not proposal_methods:
|
||||
raise ValueError("No proposal methods found in speculators config")
|
||||
|
||||
first_method = proposal_methods[0]
|
||||
num_speculative_tokens = first_method.get("speculative_tokens")
|
||||
|
||||
if num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
"Missing 'speculative_tokens' in proposal method. "
|
||||
f"Got: {first_method}")
|
||||
|
||||
# Build base vLLM speculative configuration
|
||||
vllm_config = {
|
||||
"method": config_dict.get("speculators_model_type"),
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"target_model": spec_config.get("verifier")["name_or_path"]
|
||||
}
|
||||
|
||||
# Merge transformer layer configuration if present
|
||||
transformer_config = config_dict.get("transformer_layer_config", {})
|
||||
vllm_config.update(transformer_config)
|
||||
|
||||
return vllm_config
|
||||
123
vllm/transformers_utils/configs/step3_vl.py
Normal file
123
vllm/transformers_utils/configs/step3_vl.py
Normal file
@@ -0,0 +1,123 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Step3VisionEncoderConfig(PretrainedConfig):
|
||||
model_type = "step3_vision_encoder"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1792,
|
||||
intermediate_size=3072,
|
||||
output_hidden_size=4096,
|
||||
num_hidden_layers=63,
|
||||
num_attention_heads=16,
|
||||
num_channels=3,
|
||||
image_size=728,
|
||||
patch_size=14,
|
||||
hidden_act="quick_gelu",
|
||||
layer_norm_eps=1e-5,
|
||||
**kwargs,
|
||||
):
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.output_hidden_size = output_hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3TextConfig(PretrainedConfig):
|
||||
model_type = "step3_text"
|
||||
architectures = ["Step3TextForCausalLM"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 7168,
|
||||
intermediate_size: int = 18432,
|
||||
num_attention_heads: int = 64,
|
||||
num_attention_groups: int = 1,
|
||||
num_hidden_layers: int = 61,
|
||||
max_seq_len: int = 65536,
|
||||
vocab_size: int = 128815,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
moe_intermediate_size: int = 5120,
|
||||
moe_num_experts: int = 48,
|
||||
moe_top_k: int = 3,
|
||||
rope_theta: float = 500000,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embedding: int = 65536,
|
||||
share_expert_dim: int = 5120,
|
||||
share_q_dim: int = 2048,
|
||||
head_dim: int = 256,
|
||||
norm_expert_weight: bool = False,
|
||||
moe_layers_enum: tuple[int,
|
||||
...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
|
||||
25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
|
||||
35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
|
||||
45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
|
||||
55, 56, 57, 58, 59),
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_attention_groups = num_attention_groups
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.moe_intermediate_size = moe_intermediate_size
|
||||
self.moe_num_experts = moe_num_experts
|
||||
self.moe_top_k = moe_top_k
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.max_position_embedding = max_position_embedding
|
||||
self.share_expert_dim = share_expert_dim
|
||||
self.share_q_dim = share_q_dim
|
||||
self.head_dim = head_dim
|
||||
self.norm_expert_weight = norm_expert_weight
|
||||
self.moe_layers_enum = moe_layers_enum
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3VLConfig(PretrainedConfig):
|
||||
model_type = "step3_vl"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
|
||||
text_config: Optional[Union[dict, Step3TextConfig]] = None,
|
||||
understand_projector_stride: int = 1,
|
||||
projector_bias: bool = True,
|
||||
image_token_id: int = 128001,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
if vision_config is None:
|
||||
vision_config = Step3VisionEncoderConfig()
|
||||
elif isinstance(vision_config, dict):
|
||||
vision_config = Step3VisionEncoderConfig(**vision_config)
|
||||
self.vision_config = vision_config
|
||||
|
||||
if text_config is None:
|
||||
text_config = Step3TextConfig()
|
||||
elif isinstance(text_config, dict):
|
||||
text_config = Step3TextConfig(**text_config)
|
||||
self.text_config = text_config
|
||||
|
||||
self.understand_projector_stride = understand_projector_stride
|
||||
self.projector_bias = projector_bias
|
||||
self.hidden_size = text_config.hidden_size
|
||||
self.image_token_id = image_token_id
|
||||
|
||||
super().__init__(**kwargs)
|
||||
116
vllm/transformers_utils/configs/ultravox.py
Normal file
116
vllm/transformers_utils/configs/ultravox.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
||||
from typing import Any, Optional
|
||||
|
||||
import transformers
|
||||
|
||||
|
||||
class UltravoxConfig(transformers.PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`UltravoxForConditionalGeneration`]. It is used to instantiate an
|
||||
Ultravox model according to the specified arguments, defining the model
|
||||
architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to
|
||||
control the model outputs. Read the documentation from [`PretrainedConfig`]
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
audio_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
Custom audio config or dict.
|
||||
text_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
The config object of the text backbone.
|
||||
audio_model_id (`str`, *optional*):
|
||||
The model ID of the audio backbone.
|
||||
text_model_id (`str`, *optional*):
|
||||
The model ID of the text backbone.
|
||||
ignore_index (`int`, *optional*, defaults to -100):
|
||||
The ignore index for the loss function.
|
||||
audio_token_index (`int`, *optional*, defaults to 32000):
|
||||
The audio token index to encode the audio prompt.
|
||||
stack_factor (`int`, *optional*, defaults to 8):
|
||||
Audio downsampling factor for the multimodal projector.
|
||||
norm_init (`float`, *optional*, defaults to 0.4):
|
||||
The initialization value for the layer normalization.
|
||||
projector_act (`str`, *optional*, defaults to `"swiglu"`):
|
||||
The activation function used by the multimodal projector.
|
||||
projector_ln_mid (`bool`, *optional*, defaults to `False`):
|
||||
Whether to apply layer normalization at the middle of the
|
||||
projector or at the end. Versions v0.4.1 and below
|
||||
use `False`, but v0.5 and above use `True`.
|
||||
"""
|
||||
wrapped_model_config: transformers.PretrainedConfig
|
||||
model_type = "ultravox"
|
||||
audio_token = "<|audio|>"
|
||||
is_composition = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_config: Optional[dict[str, Any]] = None,
|
||||
text_config: Optional[dict[str, Any]] = None,
|
||||
audio_model_id: Optional[str] = None,
|
||||
text_model_id: Optional[str] = None,
|
||||
ignore_index: int = -100,
|
||||
audio_token_index: int = 32000,
|
||||
hidden_size: int = 4096,
|
||||
stack_factor: int = 8,
|
||||
norm_init: float = 0.4,
|
||||
projector_act: str = "swiglu",
|
||||
projector_ln_mid: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.ignore_index = ignore_index
|
||||
self.audio_token_index = audio_token_index
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.stack_factor = stack_factor
|
||||
self.norm_init = norm_init
|
||||
self.projector_act = projector_act
|
||||
self.projector_ln_mid = projector_ln_mid
|
||||
|
||||
# N.B. May set the wrapped_model_config below.
|
||||
self.text_model_id = text_model_id
|
||||
if text_model_id is None:
|
||||
text_config = text_config or {}
|
||||
self.wrapped_model_config = transformers.CONFIG_MAPPING[
|
||||
text_config.get("model_type", "llama")](**text_config)
|
||||
|
||||
# N.B. May set the audio_config below.
|
||||
self.audio_model_id = audio_model_id
|
||||
if audio_model_id is None:
|
||||
self.audio_model_id = None
|
||||
audio_config = audio_config or {}
|
||||
self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
|
||||
"model_type", "whisper")](**audio_config)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def __setattr__(self, key, value):
|
||||
# Since --hf-overrides are applied _after_ the UltravoxConfig is
|
||||
# instantiated, load the configs implicitly when assigning text_model_id
|
||||
# or audio_model_id. This allows:
|
||||
#
|
||||
# --hf-overrides.text_model_id=<quantized variant>
|
||||
#
|
||||
# to behave as intended.
|
||||
if key == "text_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.wrapped_model_config = get_config(value,
|
||||
trust_remote_code=False)
|
||||
elif key == "audio_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.audio_config = get_config(value, trust_remote_code=False)
|
||||
|
||||
return super().__setattr__(key, value)
|
||||
|
||||
@property
|
||||
def text_config(self) -> transformers.PretrainedConfig:
|
||||
# When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
|
||||
# the full model, but the text config is the text config of the inner
|
||||
# model.
|
||||
return self.wrapped_model_config.get_text_config()
|
||||
199
vllm/transformers_utils/detokenizer_utils.py
Normal file
199
vllm/transformers_utils/detokenizer_utils.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from .tokenizer import AnyTokenizer
|
||||
|
||||
|
||||
def _replace_none_with_empty(tokens: list[Optional[str]]):
|
||||
for i, token in enumerate(tokens):
|
||||
if token is None:
|
||||
tokens[i] = ""
|
||||
|
||||
|
||||
def _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer: AnyTokenizer,
|
||||
output_tokens: list[str],
|
||||
skip_special_tokens: bool,
|
||||
spaces_between_special_tokens: bool,
|
||||
) -> str:
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
|
||||
# NOTE(woosuk): The following code is slow because it runs a for loop over
|
||||
# the output_tokens. In Python, running a for loop over a list can be slow
|
||||
# even when the loop body is very simple.
|
||||
# Performance improvements: avoid repeated attribute and function lookups;
|
||||
# localize frequently used objects;
|
||||
|
||||
sub_texts: list[str] = []
|
||||
current_sub_text: list[str] = []
|
||||
convert_tokens_to_string = tokenizer.convert_tokens_to_string
|
||||
added_vocab_set = set(tokenizer.get_added_vocab())
|
||||
all_special_tokens = set(
|
||||
tokenizer.all_special_tokens) if skip_special_tokens else ()
|
||||
|
||||
for token in output_tokens:
|
||||
# Use precomputed set for skip-special check
|
||||
if token in all_special_tokens:
|
||||
continue
|
||||
if token in added_vocab_set:
|
||||
if current_sub_text:
|
||||
sub_texts.append(convert_tokens_to_string(current_sub_text))
|
||||
current_sub_text.clear()
|
||||
sub_texts.append(token)
|
||||
else:
|
||||
current_sub_text.append(token)
|
||||
if current_sub_text:
|
||||
sub_texts.append(convert_tokens_to_string(current_sub_text))
|
||||
if spaces_between_special_tokens:
|
||||
return " ".join(sub_texts)
|
||||
return "".join(sub_texts)
|
||||
|
||||
|
||||
# 5 is an arbitrary value that should work for all
|
||||
# tokenizers (bigger = more conservative).
|
||||
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
||||
|
||||
|
||||
def convert_prompt_ids_to_tokens(
|
||||
tokenizer: AnyTokenizer,
|
||||
prompt_ids: list[int],
|
||||
skip_special_tokens: bool = False,
|
||||
) -> tuple[list[str], int, int]:
|
||||
"""Converts the prompt ids to tokens and returns the tokens and offsets
|
||||
for incremental detokenization.
|
||||
|
||||
Note that not all tokens are converted to strings. Only the tokens that
|
||||
are necessary for incremental detokenization are converted to strings.
|
||||
"""
|
||||
# We do not need to convert the whole prompt to tokens.
|
||||
# Offset a little more in case we have special tokens.
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
read_offset = len(new_tokens)
|
||||
prefix_offset = max(
|
||||
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
|
||||
# This is required to guard against out-of-vocab prompt token ids
|
||||
_replace_none_with_empty(new_tokens) # type: ignore[arg-type]
|
||||
return new_tokens, prefix_offset, read_offset
|
||||
|
||||
|
||||
def convert_ids_list_to_tokens(
|
||||
tokenizer: AnyTokenizer,
|
||||
token_ids: list[int],
|
||||
) -> list[str]:
|
||||
"""Detokenize the input ids individually.
|
||||
|
||||
Args:
|
||||
tokenizer: tokenizer used by model under test
|
||||
token_ids: convert these tokens (Python list form)
|
||||
|
||||
Returns:
|
||||
Python list of token string representations
|
||||
|
||||
"""
|
||||
token_str_lst = []
|
||||
for token_id in token_ids:
|
||||
# use default skip_special_tokens.
|
||||
token_str = tokenizer.decode([token_id])
|
||||
if token_str is None:
|
||||
token_str = ""
|
||||
token_str_lst.append(token_str)
|
||||
return token_str_lst
|
||||
|
||||
|
||||
# Based on
|
||||
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
|
||||
# under Apache 2.0 license
|
||||
def detokenize_incrementally(
|
||||
tokenizer: AnyTokenizer,
|
||||
all_input_ids: list[int],
|
||||
prev_tokens: Optional[list[str]],
|
||||
prefix_offset: int,
|
||||
read_offset: int,
|
||||
skip_special_tokens: bool = False,
|
||||
spaces_between_special_tokens: bool = True,
|
||||
) -> tuple[list[str], str, int, int]:
|
||||
"""Detokenizes the input ids incrementally and returns the new tokens
|
||||
and the new text.
|
||||
|
||||
If `prev_tokens` is None, this function will convert the input ids to
|
||||
tokens and return the tokens and the new text. Otherwise, it will return the
|
||||
new tokens and the new text.
|
||||
|
||||
This function will also return the new prefix offset and the new read
|
||||
offset to be used in the next iteration.
|
||||
|
||||
The offsets are necessary to defeat cleanup algorithms in the decode which
|
||||
decide to add a space or not depending on the surrounding ids.
|
||||
|
||||
Args:
|
||||
tokenizer: The tokenizer to use.
|
||||
all_input_ids: The input ids. The last id is the new token id.
|
||||
prev_tokens: The previous tokens. If None, this function will convert
|
||||
the input ids to tokens and return the tokens and the new text.
|
||||
prefix_offset: The prefix offset.
|
||||
read_offset: The read offset.
|
||||
skip_special_tokens: Whether to skip special tokens.
|
||||
spaces_between_special_tokens: Whether to add spaces between special
|
||||
tokens.
|
||||
"""
|
||||
new_token_id = all_input_ids[-1]
|
||||
# This is the first iteration for this sequence
|
||||
is_first_iter = prev_tokens is None
|
||||
if is_first_iter:
|
||||
(prev_tokens, prefix_offset,
|
||||
read_offset) = convert_prompt_ids_to_tokens(
|
||||
tokenizer,
|
||||
all_input_ids[:-1],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
assert prev_tokens is not None
|
||||
|
||||
# If the new token id is out of bounds, return an empty string.
|
||||
if 0 <= new_token_id < len(tokenizer):
|
||||
# Put new_token_id in a list so skip_special_tokens is respected
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
[new_token_id], skip_special_tokens=skip_special_tokens)
|
||||
if isinstance(new_tokens, str):
|
||||
new_tokens = [new_tokens]
|
||||
else:
|
||||
new_tokens = [""]
|
||||
output_tokens = prev_tokens + new_tokens
|
||||
|
||||
# If this is the first iteration, return all tokens.
|
||||
if is_first_iter:
|
||||
new_tokens = output_tokens
|
||||
|
||||
# The prefix text is necessary only to defeat cleanup algorithms in
|
||||
# the decode which decide to add a space or not depending on the
|
||||
# surrounding ids.
|
||||
if tokenizer.is_fast or not tokenizer.get_added_vocab():
|
||||
prefix_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:read_offset])
|
||||
new_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:])
|
||||
else:
|
||||
prefix_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:read_offset],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
new_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
|
||||
if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
|
||||
# utf-8 char at the end means it's a potential unfinished byte sequence
|
||||
# from byte fallback tokenization.
|
||||
# If it's in the middle, it's probably a real invalid id generated
|
||||
# by the model
|
||||
return new_tokens, "", prefix_offset, read_offset
|
||||
|
||||
new_text = new_text[len(prefix_text):]
|
||||
return new_tokens, new_text, read_offset, len(output_tokens)
|
||||
60
vllm/transformers_utils/dynamic_module.py
Normal file
60
vllm/transformers_utils/dynamic_module.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def try_get_class_from_dynamic_module(
|
||||
class_reference: str,
|
||||
pretrained_model_name_or_path: str,
|
||||
cache_dir: Optional[Union[str, os.PathLike]] = None,
|
||||
force_download: bool = False,
|
||||
resume_download: Optional[bool] = None,
|
||||
proxies: Optional[dict[str, str]] = None,
|
||||
token: Optional[Union[bool, str]] = None,
|
||||
revision: Optional[str] = None,
|
||||
local_files_only: bool = False,
|
||||
repo_type: Optional[str] = None,
|
||||
code_revision: Optional[str] = None,
|
||||
warn_on_fail: bool = True,
|
||||
**kwargs,
|
||||
) -> Optional[type]:
|
||||
"""
|
||||
As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
|
||||
but ignoring any errors.
|
||||
"""
|
||||
try:
|
||||
return get_class_from_dynamic_module(
|
||||
class_reference,
|
||||
pretrained_model_name_or_path,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
resume_download=resume_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
repo_type=repo_type,
|
||||
code_revision=code_revision,
|
||||
**kwargs,
|
||||
)
|
||||
except Exception:
|
||||
location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
|
||||
|
||||
if warn_on_fail:
|
||||
logger.warning(
|
||||
"Unable to load %s from %s on %s.",
|
||||
class_reference,
|
||||
pretrained_model_name_or_path,
|
||||
location,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
return None
|
||||
299
vllm/transformers_utils/processor.py
Normal file
299
vllm/transformers_utils/processor.py
Normal file
@@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
from transformers import (AutoFeatureExtractor, AutoImageProcessor,
|
||||
AutoProcessor, AutoVideoProcessor)
|
||||
from transformers.feature_extraction_utils import FeatureExtractionMixin
|
||||
from transformers.image_processing_utils import BaseImageProcessor
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.video_processing_utils import BaseVideoProcessor
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
from vllm.utils import get_allowed_kwarg_only_overrides
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
|
||||
|
||||
|
||||
class HashableDict(dict):
|
||||
"""
|
||||
A dictionary that can be hashed by lru_cache.
|
||||
"""
|
||||
|
||||
# NOTE: pythonic dict is not hashable,
|
||||
# we override on it directly for simplicity
|
||||
def __hash__(self) -> int: # type: ignore[override]
|
||||
return hash(frozenset(self.items()))
|
||||
|
||||
|
||||
class HashableList(list):
|
||||
"""
|
||||
A list that can be hashed by lru_cache.
|
||||
"""
|
||||
|
||||
def __hash__(self) -> int: # type: ignore[override]
|
||||
return hash(tuple(self))
|
||||
|
||||
|
||||
def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
|
||||
if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
|
||||
return AutoProcessor.from_pretrained
|
||||
if hasattr(processor_cls, "from_pretrained"):
|
||||
return processor_cls.from_pretrained
|
||||
|
||||
return processor_cls
|
||||
|
||||
|
||||
def _merge_mm_kwargs(
|
||||
model_config: "ModelConfig",
|
||||
processor_cls: Union[type, tuple[type, ...]],
|
||||
/,
|
||||
**kwargs,
|
||||
):
|
||||
mm_config = model_config.get_multimodal_config()
|
||||
merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
|
||||
|
||||
factory = _get_processor_factory_fn(processor_cls)
|
||||
allowed_kwargs = get_allowed_kwarg_only_overrides(
|
||||
factory,
|
||||
merged_kwargs,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
# NOTE: Pythonic dict is not hashable and will raise unhashable type
|
||||
# error when calling `cached_get_processor`, therefore we need to
|
||||
# wrap it to a hashable dict.
|
||||
for key, value in allowed_kwargs.items():
|
||||
if isinstance(value, dict):
|
||||
allowed_kwargs[key] = HashableDict(value)
|
||||
if isinstance(value, list):
|
||||
allowed_kwargs[key] = HashableList(value)
|
||||
|
||||
return allowed_kwargs
|
||||
|
||||
|
||||
def get_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
revision: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
**kwargs: Any,
|
||||
) -> _P:
|
||||
"""Load a processor for the given model name via HuggingFace."""
|
||||
if revision is None:
|
||||
revision = "main"
|
||||
|
||||
try:
|
||||
if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs,
|
||||
)
|
||||
elif issubclass(processor_cls, ProcessorMixin):
|
||||
processor = processor_cls.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
# Processors that are standalone classes unrelated to HF
|
||||
processor = processor_cls(*args, **kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the processor. If the processor is "
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
if not isinstance(processor, processor_cls):
|
||||
raise TypeError("Invalid type of HuggingFace processor. "
|
||||
f"Expected type: {processor_cls}, but "
|
||||
f"found type: {type(processor)}")
|
||||
|
||||
return processor
|
||||
|
||||
|
||||
cached_get_processor = lru_cache(get_processor)
|
||||
|
||||
|
||||
def cached_processor_from_config(
|
||||
model_config: "ModelConfig",
|
||||
processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
**kwargs: Any,
|
||||
) -> _P:
|
||||
return cached_get_processor(
|
||||
model_config.model,
|
||||
revision=model_config.revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
processor_cls=processor_cls, # type: ignore[arg-type]
|
||||
**_merge_mm_kwargs(model_config, processor_cls, **kwargs),
|
||||
)
|
||||
|
||||
|
||||
def get_feature_extractor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
revision: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load an audio feature extractor for the given model name
|
||||
via HuggingFace."""
|
||||
try:
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the feature extractor. If the feature "
|
||||
"extractor is a custom extractor not yet available in the "
|
||||
"HuggingFace transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
return cast(FeatureExtractionMixin, feature_extractor)
|
||||
|
||||
|
||||
cached_get_feature_extractor = lru_cache(get_feature_extractor)
|
||||
|
||||
|
||||
def cached_feature_extractor_from_config(
|
||||
model_config: "ModelConfig",
|
||||
**kwargs: Any,
|
||||
):
|
||||
return cached_get_feature_extractor(
|
||||
model_config.model,
|
||||
revision=model_config.revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
**_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs),
|
||||
)
|
||||
|
||||
|
||||
def get_image_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
revision: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load an image processor for the given model name via HuggingFace."""
|
||||
try:
|
||||
processor = AutoImageProcessor.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the image processor. If the image processor is "
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return cast(BaseImageProcessor, processor)
|
||||
|
||||
|
||||
cached_get_image_processor = lru_cache(get_image_processor)
|
||||
|
||||
|
||||
def cached_image_processor_from_config(
|
||||
model_config: "ModelConfig",
|
||||
**kwargs: Any,
|
||||
):
|
||||
return cached_get_image_processor(
|
||||
model_config.model,
|
||||
revision=model_config.revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
**_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
|
||||
)
|
||||
|
||||
|
||||
def get_video_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
revision: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
processor_cls_overrides: Optional[type[_V]] = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load a video processor for the given model name via HuggingFace."""
|
||||
try:
|
||||
processor_cls = processor_cls_overrides or AutoVideoProcessor
|
||||
processor = processor_cls.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the video processor. If the video processor is "
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return cast(BaseVideoProcessor, processor)
|
||||
|
||||
|
||||
cached_get_video_processor = lru_cache(get_video_processor)
|
||||
|
||||
|
||||
def cached_video_processor_from_config(
|
||||
model_config: "ModelConfig",
|
||||
processor_cls: Optional[type[_V]] = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
return cached_get_video_processor(
|
||||
model_config.model,
|
||||
revision=model_config.revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
processor_cls_overrides=processor_cls, # type: ignore[arg-type]
|
||||
**_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
|
||||
)
|
||||
16
vllm/transformers_utils/processors/__init__.py
Normal file
16
vllm/transformers_utils/processors/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Multi-modal processors may be defined in this directory for the following
|
||||
reasons:
|
||||
|
||||
- There is no processing file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing processor to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
||||
DeepseekVLV2Processor)
|
||||
from vllm.transformers_utils.processors.ovis import OvisProcessor
|
||||
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
|
||||
|
||||
__all__ = ["DeepseekVLV2Processor", "OvisProcessor", "Ovis2_5Processor"]
|
||||
362
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
362
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
@@ -0,0 +1,362 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
||||
# Copyright (c) 2023-2024 DeepSeek.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image, ImageOps
|
||||
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class ImageTransform:
|
||||
|
||||
def __init__(self,
|
||||
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True):
|
||||
self.mean = mean
|
||||
self.std = std
|
||||
self.normalize = normalize
|
||||
|
||||
transform_pipelines = [T.ToTensor()]
|
||||
|
||||
if normalize:
|
||||
transform_pipelines.append(T.Normalize(mean, std))
|
||||
|
||||
self.transform = T.Compose(transform_pipelines)
|
||||
|
||||
def __call__(self, pil_img: Image.Image):
|
||||
x = self.transform(pil_img)
|
||||
return x
|
||||
|
||||
|
||||
class DeepseekVLV2Processor(ProcessorMixin):
|
||||
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
|
||||
attributes = ["tokenizer"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: LlamaTokenizerFast,
|
||||
candidate_resolutions: tuple[tuple[int, int]],
|
||||
patch_size: int,
|
||||
downsample_ratio: int,
|
||||
image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
|
||||
normalize: bool = True,
|
||||
image_token: str = "<image>",
|
||||
pad_token: str = "<|▁pad▁|>",
|
||||
add_special_token: bool = False,
|
||||
sft_format: str = "deepseek",
|
||||
mask_prompt: bool = True,
|
||||
ignore_id: int = -100,
|
||||
**kwargs,
|
||||
):
|
||||
|
||||
self.candidate_resolutions = candidate_resolutions
|
||||
self.image_size = candidate_resolutions[0][0]
|
||||
self.patch_size = patch_size
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.normalize = normalize
|
||||
self.downsample_ratio = downsample_ratio
|
||||
|
||||
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
|
||||
self.tokenizer = tokenizer
|
||||
self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
|
||||
|
||||
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
|
||||
if tokenizer.pad_token is None:
|
||||
self.tokenizer.add_special_tokens({'pad_token': pad_token})
|
||||
|
||||
# add image token
|
||||
image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
if image_token_id is None:
|
||||
special_tokens = [image_token]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
self.image_token_id = self.tokenizer.vocab.get(image_token)
|
||||
|
||||
# add five special tokens for grounding-related tasks
|
||||
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
|
||||
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
|
||||
# add special tokens for SFT data
|
||||
special_tokens = ["<|User|>", "<|Assistant|>"]
|
||||
special_tokens_dict = {"additional_special_tokens": special_tokens}
|
||||
self.tokenizer.add_special_tokens(special_tokens_dict)
|
||||
|
||||
self.image_token = image_token
|
||||
self.pad_token = pad_token
|
||||
self.add_special_token = add_special_token
|
||||
self.sft_format = sft_format
|
||||
self.mask_prompt = mask_prompt
|
||||
self.ignore_id = ignore_id
|
||||
|
||||
super().__init__(
|
||||
tokenizer,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def select_best_resolution(self, image_size):
|
||||
# used for cropping
|
||||
original_width, original_height = image_size
|
||||
best_fit = None
|
||||
max_effective_resolution = 0
|
||||
min_wasted_resolution = float("inf")
|
||||
|
||||
for width, height in self.candidate_resolutions:
|
||||
scale = min(width / original_width, height / original_height)
|
||||
downscaled_width, downscaled_height = int(
|
||||
original_width * scale), int(original_height * scale)
|
||||
effective_resolution = min(downscaled_width * downscaled_height,
|
||||
original_width * original_height)
|
||||
wasted_resolution = (width * height) - effective_resolution
|
||||
|
||||
if effective_resolution > max_effective_resolution or (
|
||||
effective_resolution == max_effective_resolution
|
||||
and wasted_resolution < min_wasted_resolution):
|
||||
max_effective_resolution = effective_resolution
|
||||
min_wasted_resolution = wasted_resolution
|
||||
best_fit = (width, height)
|
||||
|
||||
return best_fit
|
||||
|
||||
@property
|
||||
def bos_id(self):
|
||||
return self.tokenizer.bos_token_id
|
||||
|
||||
@property
|
||||
def eos_id(self):
|
||||
return self.tokenizer.eos_token_id
|
||||
|
||||
@property
|
||||
def pad_id(self):
|
||||
return self.tokenizer.pad_token_id
|
||||
|
||||
def encode(self, text: str, bos: bool = True, eos: bool = False):
|
||||
t = self.tokenizer.encode(text, add_special_tokens=False)
|
||||
|
||||
if bos:
|
||||
t = [self.bos_id] + t
|
||||
if eos:
|
||||
t = t + [self.eos_id]
|
||||
|
||||
return t
|
||||
|
||||
def decode(self, t: list[int], **kwargs) -> str:
|
||||
return self.tokenizer.decode(t, **kwargs)
|
||||
|
||||
def process_one(
|
||||
self,
|
||||
prompt: str,
|
||||
images: list[Image.Image],
|
||||
inference_mode: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
prompt (str): the formatted prompt;
|
||||
images (list[ImageType]): the list of images;
|
||||
inference_mode (bool): if True, then remove the last eos token;
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
outputs (BaseProcessorOutput): the output of the processor,
|
||||
- input_ids (torch.LongTensor): [N + image tokens]
|
||||
- target_ids (torch.LongTensor): [N + image tokens]
|
||||
- pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
|
||||
- image_id (int): the id of the image token
|
||||
- num_image_tokens (list[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
assert (prompt is not None and images is not None
|
||||
), "prompt and images must be used at the same time."
|
||||
|
||||
sft_format = prompt
|
||||
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
|
||||
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
|
||||
masked_tokenized_str = []
|
||||
for token_index in tokenized_str:
|
||||
if token_index != self.image_token_id:
|
||||
masked_tokenized_str.append(token_index)
|
||||
else:
|
||||
masked_tokenized_str.append(self.ignore_id)
|
||||
|
||||
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
|
||||
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
|
||||
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
|
||||
|
||||
input_ids = torch.LongTensor(tokenized_str)
|
||||
target_ids = torch.LongTensor(masked_tokenized_str)
|
||||
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
|
||||
|
||||
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
|
||||
target_ids[(input_ids < 0) |
|
||||
(input_ids == self.image_token_id)] = self.ignore_id
|
||||
input_ids[input_ids < 0] = self.pad_id
|
||||
|
||||
if inference_mode:
|
||||
# Remove the ending eos token
|
||||
assert input_ids[-1] == self.eos_id
|
||||
input_ids = input_ids[:-1]
|
||||
target_ids = target_ids[:-1]
|
||||
images_seq_mask = images_seq_mask[:-1]
|
||||
|
||||
if len(images_list) == 0:
|
||||
pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
|
||||
images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
|
||||
else:
|
||||
pixel_values = torch.stack(images_list, dim=0)
|
||||
images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
|
||||
|
||||
input_ids = input_ids.unsqueeze(0)
|
||||
|
||||
prepare = BatchFeature(
|
||||
data=dict(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
images_seq_mask=images_seq_mask,
|
||||
images_spatial_crop=images_spatial_crop,
|
||||
num_image_tokens=num_image_tokens,
|
||||
),
|
||||
tensor_type="pt",
|
||||
)
|
||||
return prepare
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
*,
|
||||
text: str,
|
||||
images: list[Image.Image],
|
||||
inference_mode: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
text (str): the formatted prompt;
|
||||
images (list[ImageType]): the list of images;
|
||||
inference_mode (bool): if True, then remove the last eos token;
|
||||
**kwargs:
|
||||
|
||||
Returns:
|
||||
outputs (BaseProcessorOutput): the output of the processor,
|
||||
- input_ids (torch.LongTensor): [N + image tokens]
|
||||
- images (torch.FloatTensor): [n_images, 3, H, W]
|
||||
- image_id (int): the id of the image token
|
||||
- num_image_tokens (list[int]): the number of image tokens
|
||||
"""
|
||||
|
||||
prepare = self.process_one(
|
||||
prompt=text,
|
||||
images=images,
|
||||
inference_mode=inference_mode,
|
||||
)
|
||||
|
||||
return prepare
|
||||
|
||||
def tokenize_with_images(
|
||||
self,
|
||||
conversation: str,
|
||||
images: list[Image.Image],
|
||||
bos: bool = True,
|
||||
eos: bool = True,
|
||||
cropping: bool = True,
|
||||
):
|
||||
"""Tokenize text with <image> tags."""
|
||||
assert conversation.count(self.image_token) == len(images)
|
||||
text_splits = conversation.split(self.image_token)
|
||||
images_list, images_seq_mask, images_spatial_crop = [], [], []
|
||||
num_image_tokens = []
|
||||
tokenized_str = []
|
||||
for text_sep, image in zip(text_splits, images):
|
||||
"""encode text_sep"""
|
||||
tokenized_sep = self.encode(text_sep, bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
"""select best resolution for anyres"""
|
||||
if cropping:
|
||||
best_width, best_height = self.select_best_resolution(image.size)
|
||||
else:
|
||||
best_width, best_height = self.image_size, self.image_size
|
||||
|
||||
"""process the global view"""
|
||||
global_view = ImageOps.pad(image, (self.image_size, self.image_size),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
||||
images_list.append(self.image_transform(global_view))
|
||||
|
||||
"""process the local views"""
|
||||
local_view = ImageOps.pad(image, (best_width, best_height),
|
||||
color=tuple(int(x * 255) for x in self.image_transform.mean))
|
||||
for i in range(0, best_height, self.image_size):
|
||||
for j in range(0, best_width, self.image_size):
|
||||
images_list.append(
|
||||
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
|
||||
|
||||
"""record height / width crop num"""
|
||||
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
|
||||
images_spatial_crop.append([num_width_tiles, num_height_tiles])
|
||||
|
||||
"""add image tokens"""
|
||||
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
|
||||
# global views tokens h * (w + 1), 1 is for line separator
|
||||
tokenized_image = [self.image_token_id] * h * (w + 1)
|
||||
# add a separator between global and local views
|
||||
tokenized_image += [self.image_token_id]
|
||||
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
|
||||
|
||||
tokenized_str += tokenized_image
|
||||
images_seq_mask += [True] * len(tokenized_image)
|
||||
num_image_tokens.append(len(tokenized_image))
|
||||
|
||||
"""process the last text split"""
|
||||
tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
|
||||
tokenized_str += tokenized_sep
|
||||
images_seq_mask += [False] * len(tokenized_sep)
|
||||
|
||||
"""add the bos and eos tokens"""
|
||||
if bos:
|
||||
tokenized_str = [self.bos_id] + tokenized_str
|
||||
images_seq_mask = [False] + images_seq_mask
|
||||
if eos:
|
||||
tokenized_str = tokenized_str + [self.eos_id]
|
||||
images_seq_mask = images_seq_mask + [False]
|
||||
|
||||
assert len(tokenized_str) == len(
|
||||
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
|
||||
|
||||
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
|
||||
|
||||
|
||||
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
||||
420
vllm/transformers_utils/processors/ovis.py
Normal file
420
vllm/transformers_utils/processors/ovis.py
Normal file
@@ -0,0 +1,420 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
|
||||
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from functools import cached_property
|
||||
from typing import Union
|
||||
|
||||
import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
__all__ = ['OvisProcessor']
|
||||
IGNORE_ID = -100
|
||||
|
||||
class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"images_kwargs": {
|
||||
'max_partition':9,
|
||||
'covering_threshold':0.9,
|
||||
'convert_to_rgb':True,
|
||||
'return_tensors':'pt'},
|
||||
}
|
||||
|
||||
|
||||
|
||||
class OvisProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
|
||||
[`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
|
||||
[`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
|
||||
Args:
|
||||
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
||||
in a chat into a tokenizable string.
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
|
||||
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_processor=None,
|
||||
tokenizer=None,
|
||||
chat_template=None,
|
||||
image_pad_token=None,
|
||||
image_segment_len=255,
|
||||
**kwargs,
|
||||
):
|
||||
self.image_token = "<image>"
|
||||
self.image_pad_token = image_pad_token
|
||||
self.image_segment_len = image_segment_len
|
||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||
|
||||
@cached_property
|
||||
def extra_special_tokens(self):
|
||||
image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
|
||||
extra_special_tokens = {
|
||||
"image_token": -200,
|
||||
"image_atom": -300,
|
||||
"image_start": -301,
|
||||
"image_prefix": -302,
|
||||
"image_col_sep": -303,
|
||||
"image_row_sep": -304,
|
||||
"image_end": -305,
|
||||
'image_pad': image_pad_token_id,
|
||||
}
|
||||
return extra_special_tokens
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
||||
**kwargs: Unpack[OvisProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
||||
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
|
||||
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
|
||||
Args:
|
||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
||||
tensor. Both channels-first and channels-last formats are supported.
|
||||
text (`str`, `list[str]`, `list[list[str]]`):
|
||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
||||
The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
|
||||
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors of a particular framework. Acceptable values are:
|
||||
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
||||
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
||||
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
||||
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
|
||||
`None`).
|
||||
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
||||
- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
|
||||
- **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
|
||||
- **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
|
||||
- **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
|
||||
"""
|
||||
output_kwargs = self._merge_kwargs(
|
||||
OvisProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Process all images first
|
||||
image_features = {}
|
||||
if images is not None:
|
||||
processed_images = []
|
||||
image_placeholders_list = []
|
||||
grids = []
|
||||
|
||||
# Process each image
|
||||
for image in images if isinstance(images, list) else [images]:
|
||||
pixel_values, image_placeholders, grid = self.preprocess_image(
|
||||
image=image, **output_kwargs["images_kwargs"]
|
||||
)
|
||||
processed_images.append(pixel_values)
|
||||
image_placeholders_list.append(image_placeholders)
|
||||
grids.append(grid)
|
||||
|
||||
# assign all processed images
|
||||
if processed_images:
|
||||
image_features["image_placeholders"] = image_placeholders_list
|
||||
|
||||
# Process text input
|
||||
if text is not None:
|
||||
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
|
||||
tokenized_batched_text = self._tokenize_with_image_symbol(text)
|
||||
image_token_id = self.get_token_value("image_token")
|
||||
replaced_ids_list = []
|
||||
idx = 0
|
||||
for ids_tensor in tokenized_batched_text:
|
||||
if image_token_id in ids_tensor and "image_placeholders" in image_features:
|
||||
if idx < len(image_features["image_placeholders"]):
|
||||
# Converts in list for ease of use
|
||||
ids_list = ids_tensor.tolist()
|
||||
|
||||
new_ids = []
|
||||
|
||||
# replace placeholders
|
||||
for i, token_id in enumerate(ids_list):
|
||||
if token_id == image_token_id:
|
||||
placeholder_ids = image_features["image_placeholders"][idx]
|
||||
new_ids.extend(placeholder_ids)
|
||||
idx += 1
|
||||
else:
|
||||
new_ids.append(token_id)
|
||||
|
||||
# Converts back to tensors
|
||||
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
'Mismatch between the images you provided and the number of placeholder present in the text')
|
||||
|
||||
replaced_ids_list.append(ids_tensor)
|
||||
|
||||
if replaced_ids_list:
|
||||
replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
|
||||
else:
|
||||
replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
|
||||
|
||||
# Create the output with text features
|
||||
output = BatchFeature(
|
||||
data={
|
||||
"input_ids": replaced_and_tokenized_ids,
|
||||
}
|
||||
)
|
||||
|
||||
# Add image features if present
|
||||
if image_features:
|
||||
output["pixel_values"] = processed_images
|
||||
output['grids'] = grids
|
||||
|
||||
return output
|
||||
|
||||
# If only images were provided
|
||||
return BatchFeature(data=image_features)
|
||||
|
||||
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
|
||||
batch_token_ids = []
|
||||
for text in text_list:
|
||||
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
|
||||
text.split(self.image_token)]
|
||||
token_ids = []
|
||||
num_chuck = len(text_chunks)
|
||||
for i, chunk in enumerate(text_chunks):
|
||||
token_ids.extend(chunk)
|
||||
if i < num_chuck - 1:
|
||||
token_ids.append(self.get_token_value("image_token"))
|
||||
batch_token_ids.append(token_ids)
|
||||
return torch.tensor(batch_token_ids, dtype=torch.long)
|
||||
|
||||
def get_image_size(self):
|
||||
size = self.image_processor.size
|
||||
if 'shortest_edge' in size:
|
||||
width = height = size['shortest_edge']
|
||||
elif "height" in size and "width" in size:
|
||||
width = size['width']
|
||||
height = size['height']
|
||||
else:
|
||||
raise ValueError( "Can't parse image size from image_processor config.")
|
||||
return height, width
|
||||
|
||||
def get_token_value(self, tok):
|
||||
return self.extra_special_tokens[tok]
|
||||
|
||||
def construct_image_indicators(self, grid):
|
||||
image_placeholders = [self.get_token_value('image_start'),
|
||||
self.get_token_value('image_atom'),
|
||||
self.get_token_value('image_prefix')]
|
||||
if grid[0] * grid[1] > 1:
|
||||
for r in range(grid[0]):
|
||||
for c in range(grid[1]):
|
||||
image_placeholders.append(self.get_token_value('image_atom') )
|
||||
if c < grid[1] - 1:
|
||||
image_placeholders.append(self.get_token_value('image_col_sep'))
|
||||
if r < grid[0] - 1:
|
||||
image_placeholders.append(self.get_token_value('image_row_sep'))
|
||||
image_placeholders.append(self.get_token_value('image_end'))
|
||||
return image_placeholders
|
||||
|
||||
def construct_image_placeholders(self, grid):
|
||||
|
||||
image_placeholders = self.construct_image_indicators(grid)
|
||||
|
||||
image_atom_token_id = self.get_token_value('image_atom')
|
||||
# Extract the padding token ID from tokenizer
|
||||
image_padding_token_id = self.get_token_value('image_pad')
|
||||
|
||||
# Create a new list with padding tokens inserted
|
||||
padded_placeholder_tokens = []
|
||||
for token in image_placeholders:
|
||||
padded_placeholder_tokens.append(image_padding_token_id)
|
||||
if token == image_atom_token_id:
|
||||
padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
|
||||
return padded_placeholder_tokens
|
||||
|
||||
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
|
||||
def _preprocess(img: PIL.Image.Image, side):
|
||||
# first resize and preprocess
|
||||
w, h = img.size
|
||||
if w == h:
|
||||
new_width = new_height = side
|
||||
elif w > h:
|
||||
new_width = side
|
||||
new_height = int(h / w * new_width)
|
||||
else:
|
||||
new_height = side
|
||||
new_width = int(w / h * new_height)
|
||||
new_size = dict(height=new_height, width=new_width)
|
||||
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
|
||||
|
||||
# then pad to square
|
||||
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
|
||||
new_height, new_width = pixel_values.shape[2:]
|
||||
if new_height == new_width:
|
||||
square_values[:, :, :, :] = pixel_values
|
||||
elif new_height > new_width:
|
||||
from_index = (side - new_width) // 2
|
||||
square_values[:, :, :, from_index:from_index + new_width] = pixel_values
|
||||
else:
|
||||
from_index = (side - new_height) // 2
|
||||
square_values[:, :, from_index:from_index + new_height, :] = pixel_values
|
||||
|
||||
return square_values
|
||||
|
||||
def _partition(img, grid) -> list[tuple[int, int, int, int]]:
|
||||
w, h = img.size
|
||||
row_height = h // grid[0]
|
||||
col_width = w // grid[1]
|
||||
|
||||
partition = []
|
||||
for row in range(grid[0]):
|
||||
for col in range(grid[1]):
|
||||
left = col * col_width
|
||||
upper = row * row_height
|
||||
right = w if col == grid[1] - 1 else (col + 1) * col_width
|
||||
lower = h if row == grid[0] - 1 else (row + 1) * row_height
|
||||
partition.append((left, upper, right, lower))
|
||||
|
||||
return partition
|
||||
|
||||
def _covering_area(left, upper, right, lower, side):
|
||||
w = right - left
|
||||
h = lower - upper
|
||||
w, h = max(w, h), min(w, h)
|
||||
if w > side:
|
||||
h = h / w * side
|
||||
w = side
|
||||
return w * h
|
||||
|
||||
def _get_best_grid(img, side):
|
||||
img_area = img.size[0] * img.size[1]
|
||||
|
||||
candidate_grids = []
|
||||
for i in range(1, max_partition + 1):
|
||||
for j in range(1, max_partition + 1):
|
||||
if i * j <= max_partition:
|
||||
candidate_grids.append((i, j))
|
||||
|
||||
all_grids = []
|
||||
good_grids = []
|
||||
for grid in candidate_grids:
|
||||
partition = _partition(img, grid)
|
||||
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
|
||||
assert covering_ratio <= 1.0
|
||||
all_grids.append((grid, covering_ratio))
|
||||
if covering_ratio > covering_threshold:
|
||||
good_grids.append((grid, covering_ratio))
|
||||
|
||||
if len(good_grids) > 0:
|
||||
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
|
||||
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
|
||||
else:
|
||||
# pick the partition with maximum covering_ratio and break the tie using #sub_images
|
||||
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
|
||||
|
||||
if convert_to_rgb:
|
||||
image = convert_image_mode(image, 'RGB')
|
||||
|
||||
|
||||
sides = self.get_image_size()
|
||||
if sides[0] != sides[1]:
|
||||
raise ValueError('get_image_size() returns non-square size')
|
||||
side = sides[0]
|
||||
grid = _get_best_grid(image, side)
|
||||
partition = _partition(image, grid)
|
||||
crops = [image.crop(p) for p in partition]
|
||||
if len(crops) > 1:
|
||||
crops.insert(0, image)
|
||||
pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
|
||||
image_placeholders = self.construct_image_placeholders(grid)
|
||||
return pixel_values, image_placeholders, grid
|
||||
|
||||
def batch_decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
|
||||
refer to the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(*args, **kwargs)
|
||||
|
||||
def decode(self, *args, **kwargs):
|
||||
"""
|
||||
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
|
||||
the docstring of this method for more information.
|
||||
"""
|
||||
return self.tokenizer.decode(*args, **kwargs)
|
||||
|
||||
def post_process_image_text_to_text(self, generated_outputs):
|
||||
"""
|
||||
Post-process the output of the model to decode the text.
|
||||
Args:
|
||||
generated_outputs (`torch.Tensor` or `np.ndarray`):
|
||||
The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
|
||||
or `(sequence_length,)`.
|
||||
Returns:
|
||||
`list[str]`: The decoded text.
|
||||
"""
|
||||
return self.tokenizer.batch_decode(
|
||||
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||
)
|
||||
|
||||
@property
|
||||
def model_input_names(self):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
return names_from_processor + ["second_per_grid_ts"]
|
||||
|
||||
|
||||
AutoProcessor.register("OvisProcessor", OvisProcessor)
|
||||
458
vllm/transformers_utils/processors/ovis2_5.py
Normal file
458
vllm/transformers_utils/processors/ovis2_5.py
Normal file
@@ -0,0 +1,458 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
from functools import cached_property
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
__all__ = ['Ovis2_5Processor']
|
||||
IMAGE_TOKEN = "<image>"
|
||||
VIDEO_TOKEN = "<video>"
|
||||
MIN_PIXELS = 448 * 448
|
||||
MAX_PIXELS = 1792 * 1792
|
||||
|
||||
|
||||
class Ovis2_5ProcessorKwargs(ProcessingKwargs,
|
||||
total=False): # type: ignore[call-arg]
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"images_kwargs": {
|
||||
'convert_to_rgb': True,
|
||||
'min_pixels': MIN_PIXELS,
|
||||
'max_pixels': MAX_PIXELS,
|
||||
},
|
||||
"videos_kwargs": {
|
||||
'convert_to_rgb': True,
|
||||
'min_pixels': MIN_PIXELS,
|
||||
'max_pixels': MAX_PIXELS,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class Ovis2_5Processor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor
|
||||
and a Qwen2 tokenizer into a single processor.
|
||||
[`OvisProcessor`] offers all the functionalities of
|
||||
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
|
||||
See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
|
||||
for more information.
|
||||
Args:
|
||||
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
chat_template (`str`, *optional*): A Jinja template which will
|
||||
be used to convert lists of messages in a chat into
|
||||
a tokenizable string.
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template", "image_pad_token"]
|
||||
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_processor=None,
|
||||
tokenizer=None,
|
||||
chat_template=None,
|
||||
image_pad_token=None,
|
||||
patch_size=16,
|
||||
hidden_stride=2,
|
||||
temporal_patch_size=1,
|
||||
**kwargs,
|
||||
):
|
||||
self.image_token = IMAGE_TOKEN
|
||||
self.video_token = VIDEO_TOKEN
|
||||
self.image_pad_token = "<|image_pad|>"
|
||||
|
||||
self.patch_size = patch_size
|
||||
self.hidden_stride = hidden_stride
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
super().__init__(image_processor,
|
||||
tokenizer,
|
||||
chat_template=chat_template)
|
||||
|
||||
@cached_property
|
||||
def extra_special_tokens(self):
|
||||
image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
|
||||
extra_special_tokens = {
|
||||
"image_token": -200,
|
||||
"video_token": -201,
|
||||
"visual_atom": -300,
|
||||
"image_start": -301,
|
||||
"image_end": -302,
|
||||
"video_start": -303,
|
||||
"video_end": -304,
|
||||
'image_pad': image_pad_token_id,
|
||||
}
|
||||
return extra_special_tokens
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
videos: Union[np.ndarray, list[ImageInput]] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput],
|
||||
list[PreTokenizedInput]] = None,
|
||||
**kwargs: Unpack[Ovis2_5ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Main method to prepare for the model one or several sequences(s)
|
||||
and image(s). This method forwards the `text`and `kwargs` arguments
|
||||
to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text`
|
||||
is not `None` to encode the text. To prepare the vision inputs,
|
||||
this method forwards the `vision_infos` and `kwrags` arguments to
|
||||
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`]
|
||||
if `vision_infos` is not `None`.
|
||||
Args:
|
||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`,
|
||||
`list[PIL.Image.Image]`, `list[np.ndarray]`,
|
||||
`list[torch.Tensor]`):
|
||||
The image or batch of images to be prepared.
|
||||
Each image can be a PIL image, NumPy array or PyTorch
|
||||
tensor. Both channels-first and channels-last formats
|
||||
are supported.
|
||||
text (`str`, `list[str]`, `list[list[str]]`):
|
||||
The sequence or batch of sequences to be encoded.
|
||||
Each sequence can be a string or a list of strings
|
||||
(pretokenized string). If the sequences are provided as
|
||||
list of strings (pretokenized), you must set
|
||||
`is_split_into_words=True` (to lift the ambiguity with
|
||||
a batch of sequences).
|
||||
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`,
|
||||
`list[torch.Tensor]`):
|
||||
The image or batch of videos to be prepared. Each video
|
||||
can be a 4D NumPy array or PyTorch tensor, or a nested
|
||||
list of 3D frames. Both channels-first and channels-last
|
||||
formats are supported.
|
||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||
If set, will return tensors of a particular framework.
|
||||
Acceptable values are:
|
||||
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
||||
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
||||
- `'np'`: Return NumPy `np.ndarray` objects.
|
||||
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
||||
Returns:
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
- **input_ids** -- list of token ids to be fed to a model.
|
||||
Returned when `text` is not `None`.
|
||||
- **attention_mask** -- list of indices specifying which tokens
|
||||
should be attended to by the model (when
|
||||
`return_attention_mask=True` or if *"attention_mask"*
|
||||
is in `self.model_input_names` and if `text` is not `None`).
|
||||
- **pixel_values** -- Pixel values to be fed to a model.
|
||||
Returned when `images` is not `None`.
|
||||
- **pixel_values_videos** -- Pixel values of videos to be fed to
|
||||
a model. Returned when `videos` is not `None`.
|
||||
- **image_grid_thw** -- list of image 3D grid in LLM. Returned
|
||||
when `images` is not `None`.
|
||||
- **video_grid_thw** -- list of video 3D grid in LLM. Returned
|
||||
when `videos` is not `None`.
|
||||
- **second_per_grid_ts** -- list of video seconds per time grid.
|
||||
Returned when `videos` is not `None`.
|
||||
"""
|
||||
output_kwargs = self._merge_kwargs(
|
||||
Ovis2_5ProcessorKwargs,
|
||||
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
# Process all images first
|
||||
visual_features = {}
|
||||
output = BatchFeature()
|
||||
if images is not None:
|
||||
processed_images = []
|
||||
image_placeholders_list = []
|
||||
grids = []
|
||||
# Process each image
|
||||
for image in images if isinstance(images, list) else [images]:
|
||||
pixel_values, image_placeholders, grid = (
|
||||
self.preprocess_multidata(
|
||||
images=image, **output_kwargs["images_kwargs"]))
|
||||
processed_images.append(pixel_values)
|
||||
image_placeholders_list.append(image_placeholders)
|
||||
grids.append(grid)
|
||||
|
||||
# assign all processed images
|
||||
if processed_images:
|
||||
visual_features["image_placeholders"] = image_placeholders_list
|
||||
output["pixel_values"] = processed_images
|
||||
output["grids"] = grids
|
||||
|
||||
if videos is not None:
|
||||
processed_videos = []
|
||||
videos_placeholders_list = []
|
||||
grids = []
|
||||
# Process each video
|
||||
for video in videos if isinstance(videos, list) else [videos]:
|
||||
pixel_values, video_placeholders, grid = (
|
||||
self.preprocess_multidata(
|
||||
video=video, **output_kwargs["videos_kwargs"]))
|
||||
processed_videos.append(pixel_values)
|
||||
videos_placeholders_list.append(video_placeholders)
|
||||
grids.append(grid)
|
||||
# assign all processed videos
|
||||
if processed_videos:
|
||||
visual_features[
|
||||
"video_placeholders"] = videos_placeholders_list
|
||||
output["video_pixel_values"] = processed_videos
|
||||
output["video_grids"] = grids
|
||||
|
||||
# Process text input
|
||||
if text is not None:
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
tokenized_batched_text = self._tokenize_with_visual_symbol(text)
|
||||
image_token_id = self.get_token_value("image_token")
|
||||
video_token_id = self.get_token_value("video_token")
|
||||
replaced_ids_list = []
|
||||
image_idx = 0
|
||||
video_idx = 0
|
||||
for ids_tensor in tokenized_batched_text:
|
||||
has_image_tokens = (image_token_id in ids_tensor
|
||||
and "image_placeholders" in visual_features
|
||||
and image_idx < len(
|
||||
visual_features["image_placeholders"]))
|
||||
has_video_tokens = (video_token_id in ids_tensor
|
||||
and "video_placeholders" in visual_features
|
||||
and video_idx < len(
|
||||
visual_features["video_placeholders"]))
|
||||
if has_image_tokens or has_video_tokens:
|
||||
# Convert to list for easier manipulation
|
||||
ids_list = ids_tensor.tolist()
|
||||
new_ids = []
|
||||
|
||||
# Replace placeholders
|
||||
for token_id in ids_list:
|
||||
if token_id == image_token_id:
|
||||
new_ids.extend(
|
||||
visual_features["image_placeholders"]
|
||||
[image_idx])
|
||||
image_idx += 1
|
||||
elif token_id == video_token_id:
|
||||
new_ids.extend(
|
||||
visual_features["video_placeholders"]
|
||||
[video_idx])
|
||||
video_idx += 1
|
||||
else:
|
||||
new_ids.append(token_id)
|
||||
# Convert back to tensor
|
||||
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
|
||||
replaced_ids_list.append(ids_tensor)
|
||||
if replaced_ids_list:
|
||||
replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
|
||||
else:
|
||||
replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
|
||||
output["input_ids"] = replaced_and_tokenized_ids
|
||||
|
||||
return output
|
||||
# If only images were provided
|
||||
return BatchFeature(data=visual_features)
|
||||
|
||||
def _tokenize_with_visual_symbol(self,
|
||||
text_list: list[str]) -> torch.LongTensor:
|
||||
batch_token_ids = []
|
||||
for text in text_list:
|
||||
token_ids = []
|
||||
video_token_id = self.get_token_value("video_token")
|
||||
image_token_id = self.get_token_value("image_token")
|
||||
video_split_texts = text.split(self.video_token)
|
||||
|
||||
for j, video_segment in enumerate(video_split_texts):
|
||||
image_split_texts = video_segment.split(self.image_token)
|
||||
text_chunks = [
|
||||
self.tokenizer(chunk, add_special_tokens=False).input_ids
|
||||
for chunk in image_split_texts
|
||||
]
|
||||
segment_tokens = []
|
||||
for i, chunk in enumerate(text_chunks):
|
||||
segment_tokens.extend(chunk)
|
||||
if i < len(text_chunks) - 1:
|
||||
segment_tokens.append(image_token_id)
|
||||
token_ids.extend(segment_tokens)
|
||||
if j < len(video_split_texts) - 1:
|
||||
token_ids.append(video_token_id)
|
||||
|
||||
batch_token_ids.append(token_ids)
|
||||
return torch.tensor(batch_token_ids, dtype=torch.long)
|
||||
|
||||
# Copied from qwen2_vl
|
||||
def smart_resize(self,
|
||||
height: int,
|
||||
width: int,
|
||||
factor: int = 28,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS):
|
||||
"""Rescales the image so that the following conditions are met:
|
||||
1. Both dimensions (height and width) are divisible by 'factor'.
|
||||
2. The total number of pixels is within the range
|
||||
['min_pixels', 'max_pixels'].
|
||||
3. The aspect ratio of the image is maintained as closely as possible.
|
||||
"""
|
||||
if height < factor or width < factor:
|
||||
print(f"height:{height} or width:{width} must be "
|
||||
f"larger than factor:{factor}")
|
||||
if height < width:
|
||||
width = round(factor / height * width)
|
||||
height = factor
|
||||
else:
|
||||
height = round(factor / width * height)
|
||||
width = factor
|
||||
|
||||
elif max(height, width) / min(height, width) > 200:
|
||||
print(f"absolute aspect ratio must be smaller than 200, "
|
||||
f"got {max(height, width) / min(height, width)}")
|
||||
if height > width:
|
||||
height = 200 * width
|
||||
else:
|
||||
width = 200 * height
|
||||
|
||||
h_bar = round(height / factor) * factor
|
||||
w_bar = round(width / factor) * factor
|
||||
if h_bar * w_bar > max_pixels:
|
||||
beta = math.sqrt((height * width) / max_pixels)
|
||||
h_bar = math.floor(height / beta / factor) * factor
|
||||
w_bar = math.floor(width / beta / factor) * factor
|
||||
elif h_bar * w_bar < min_pixels:
|
||||
beta = math.sqrt(min_pixels / (height * width))
|
||||
h_bar = math.ceil(height * beta / factor) * factor
|
||||
w_bar = math.ceil(width * beta / factor) * factor
|
||||
return h_bar, w_bar
|
||||
|
||||
def get_token_value(self, tok):
|
||||
return self.extra_special_tokens[tok]
|
||||
|
||||
def construct_visual_indicators(self, grid, is_video: bool = False):
|
||||
if is_video:
|
||||
start_token = self.get_token_value('video_start')
|
||||
end_token = self.get_token_value('video_end')
|
||||
else:
|
||||
start_token = self.get_token_value('image_start')
|
||||
end_token = self.get_token_value('image_end')
|
||||
|
||||
image_placeholders = [start_token, self.get_token_value('visual_atom')]
|
||||
if grid[0] * grid[1] > 1:
|
||||
for r in range(grid[0]):
|
||||
for c in range(grid[1]):
|
||||
image_placeholders.append(
|
||||
self.get_token_value('visual_atom'))
|
||||
|
||||
image_placeholders.append(end_token)
|
||||
return image_placeholders
|
||||
|
||||
def construct_visual_placeholders(self, grid, is_video: bool = False):
|
||||
visual_placeholders = self.construct_visual_indicators((1, 1),
|
||||
is_video)
|
||||
|
||||
image_atom_token_id = self.get_token_value('visual_atom')
|
||||
# Extract the padding token ID from tokenizer
|
||||
image_padding_token_id = self.get_token_value('image_pad')
|
||||
|
||||
num_image_atoms = grid[0] * grid[1] * grid[2]
|
||||
num_image_atoms //= self.hidden_stride**2
|
||||
num_image_atoms //= self.temporal_patch_size
|
||||
|
||||
# Create a new list with padding tokens inserted
|
||||
padded_placeholder_tokens = []
|
||||
for token in visual_placeholders:
|
||||
if token == image_atom_token_id:
|
||||
padded_placeholder_tokens.extend([image_padding_token_id] *
|
||||
num_image_atoms)
|
||||
else:
|
||||
padded_placeholder_tokens.append(image_padding_token_id)
|
||||
return padded_placeholder_tokens
|
||||
|
||||
def preprocess_multidata(
|
||||
self,
|
||||
images: Optional[Union[PIL.Image.Image, list[PIL.Image.Image]]] = None,
|
||||
video: Optional[Union[list[PIL.Image.Image], np.ndarray]] = None,
|
||||
convert_to_rgb: Optional[bool] = True,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS,
|
||||
return_tensors: Optional[str] = 'pt',
|
||||
):
|
||||
is_video = False
|
||||
if images is not None:
|
||||
if not isinstance(images, list):
|
||||
images = [images]
|
||||
elif video is not None:
|
||||
is_video = True
|
||||
# type of vidoe in dummy_mm_data is np.ndarray
|
||||
if isinstance(video, np.ndarray):
|
||||
images = []
|
||||
for i in range(video.shape[0]):
|
||||
image = PIL.Image.fromarray(video[i].astype(np.uint8))
|
||||
images.append(image)
|
||||
elif isinstance(video, list):
|
||||
images = video
|
||||
min_pixels = min(max_pixels if max_pixels is not None else MAX_PIXELS,
|
||||
min_pixels if min_pixels is not None else MIN_PIXELS)
|
||||
images = [
|
||||
image.convert("RGB")
|
||||
if convert_to_rgb and image.mode != 'RGB' else image
|
||||
for image in images
|
||||
]
|
||||
|
||||
width, height = images[0].size
|
||||
resized_height, resized_width = height, width
|
||||
processed_images = []
|
||||
for image in images:
|
||||
resized_height, resized_width = self.smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=self.patch_size * self.hidden_stride,
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
)
|
||||
new_size = dict(height=resized_height, width=resized_width)
|
||||
image_pt = self.image_processor.preprocess(
|
||||
image, size=new_size, return_tensors="np")['pixel_values'][0]
|
||||
|
||||
processed_images.append(image_pt)
|
||||
|
||||
patches = np.array(processed_images)
|
||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||
num_to_pad = self.temporal_patch_size - (patches.shape[0] %
|
||||
self.temporal_patch_size)
|
||||
repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
|
||||
patches = np.concatenate([patches, repeats], axis=0)
|
||||
channel = patches.shape[1]
|
||||
grid_t = patches.shape[0] // self.temporal_patch_size
|
||||
grid_h = resized_height // self.patch_size
|
||||
grid_w = resized_width // self.patch_size
|
||||
|
||||
patches = patches.reshape(
|
||||
grid_t,
|
||||
self.temporal_patch_size,
|
||||
channel,
|
||||
grid_h // self.hidden_stride,
|
||||
self.hidden_stride,
|
||||
self.patch_size,
|
||||
grid_w // self.hidden_stride,
|
||||
self.hidden_stride,
|
||||
self.patch_size,
|
||||
)
|
||||
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
||||
flatten_patches = patches.reshape(
|
||||
grid_t * grid_h * grid_w, channel * self.temporal_patch_size *
|
||||
self.patch_size * self.patch_size)
|
||||
|
||||
visual_placeholders = self.construct_visual_placeholders(
|
||||
[grid_t, grid_h, grid_w], is_video)
|
||||
return torch.tensor(
|
||||
flatten_patches), visual_placeholders, torch.tensor(
|
||||
[[grid_t, grid_h, grid_w]])
|
||||
|
||||
|
||||
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
|
||||
104
vllm/transformers_utils/runai_utils.py
Normal file
104
vllm/transformers_utils/runai_utils.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
import signal
|
||||
from typing import Optional
|
||||
|
||||
from vllm import envs
|
||||
from vllm.assets.base import get_cache_dir
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
SUPPORTED_SCHEMES = ['s3://', 'gs://']
|
||||
|
||||
try:
|
||||
from runai_model_streamer import list_safetensors as runai_list_safetensors
|
||||
from runai_model_streamer import pull_files as runai_pull_files
|
||||
except (ImportError, OSError):
|
||||
# see https://github.com/run-ai/runai-model-streamer/issues/26
|
||||
# OSError will be raised on arm64 platform
|
||||
runai_model_streamer = PlaceholderModule(
|
||||
"runai_model_streamer") # type: ignore[assignment]
|
||||
runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
|
||||
runai_list_safetensors = runai_model_streamer.placeholder_attr(
|
||||
"list_safetensors")
|
||||
|
||||
|
||||
def list_safetensors(path: str = "") -> list[str]:
|
||||
"""
|
||||
List full file names from object path and filter by allow pattern.
|
||||
|
||||
Args:
|
||||
path: The object storage path to list from.
|
||||
|
||||
Returns:
|
||||
list[str]: List of full object storage paths allowed by the pattern
|
||||
"""
|
||||
return runai_list_safetensors(path)
|
||||
|
||||
|
||||
def is_runai_obj_uri(model_or_path: str) -> bool:
|
||||
return model_or_path.lower().startswith(tuple(SUPPORTED_SCHEMES))
|
||||
|
||||
|
||||
class ObjectStorageModel:
|
||||
"""
|
||||
A class representing an ObjectStorage model mirrored into a
|
||||
temporary directory.
|
||||
|
||||
Attributes:
|
||||
dir: The temporary created directory.
|
||||
|
||||
Methods:
|
||||
pull_files(): Pull model from object storage to the temporary directory.
|
||||
"""
|
||||
|
||||
def __init__(self, url: str) -> None:
|
||||
if envs.VLLM_ASSETS_CACHE_MODEL_CLEAN:
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
existing_handler = signal.getsignal(sig)
|
||||
signal.signal(sig, self._close_by_signal(existing_handler))
|
||||
|
||||
dir_name = os.path.join(
|
||||
get_cache_dir(), "model_streamer",
|
||||
hashlib.sha256(str(url).encode()).hexdigest()[:8])
|
||||
if os.path.exists(dir_name):
|
||||
shutil.rmtree(dir_name)
|
||||
os.makedirs(dir_name)
|
||||
self.dir = dir_name
|
||||
logger.debug("Init object storage, model cache path is: %s", dir_name)
|
||||
|
||||
def _close(self) -> None:
|
||||
if os.path.exists(self.dir):
|
||||
shutil.rmtree(self.dir)
|
||||
|
||||
def _close_by_signal(self, existing_handler=None):
|
||||
|
||||
def new_handler(signum, frame):
|
||||
self._close()
|
||||
if existing_handler:
|
||||
existing_handler(signum, frame)
|
||||
|
||||
return new_handler
|
||||
|
||||
def pull_files(self,
|
||||
model_path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None) -> None:
|
||||
"""
|
||||
Pull files from object storage into the temporary directory.
|
||||
|
||||
Args:
|
||||
model_path: The object storage path of the model.
|
||||
allow_pattern: A list of patterns of which files to pull.
|
||||
ignore_pattern: A list of patterns of which files not to pull.
|
||||
|
||||
"""
|
||||
if not model_path.endswith("/"):
|
||||
model_path = model_path + "/"
|
||||
runai_pull_files(model_path, self.dir, allow_pattern, ignore_pattern)
|
||||
93
vllm/transformers_utils/s3_utils.py
Normal file
93
vllm/transformers_utils/s3_utils.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import fnmatch
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from botocore.client import BaseClient
|
||||
|
||||
try:
|
||||
import boto3
|
||||
except ImportError:
|
||||
boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
|
||||
|
||||
|
||||
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths if any(
|
||||
fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths
|
||||
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def glob(s3: Optional["BaseClient"] = None,
|
||||
path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None) -> list[str]:
|
||||
"""
|
||||
List full file names from S3 path and filter by allow pattern.
|
||||
|
||||
Args:
|
||||
s3: S3 client to use.
|
||||
path: The S3 path to list from.
|
||||
allow_pattern: A list of patterns of which files to pull.
|
||||
|
||||
Returns:
|
||||
list[str]: List of full S3 paths allowed by the pattern
|
||||
"""
|
||||
if s3 is None:
|
||||
s3 = boto3.client("s3")
|
||||
if not path.endswith("/"):
|
||||
path = path + "/"
|
||||
bucket_name, _, paths = list_files(s3,
|
||||
path=path,
|
||||
allow_pattern=allow_pattern)
|
||||
return [f"s3://{bucket_name}/{path}" for path in paths]
|
||||
|
||||
|
||||
def list_files(
|
||||
s3: "BaseClient",
|
||||
path: str,
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None
|
||||
) -> tuple[str, str, list[str]]:
|
||||
"""
|
||||
List files from S3 path and filter by pattern.
|
||||
|
||||
Args:
|
||||
s3: S3 client to use.
|
||||
path: The S3 path to list from.
|
||||
allow_pattern: A list of patterns of which files to pull.
|
||||
ignore_pattern: A list of patterns of which files not to pull.
|
||||
|
||||
Returns:
|
||||
tuple[str, str, list[str]]: A tuple where:
|
||||
- The first element is the bucket name
|
||||
- The second element is string represent the bucket
|
||||
and the prefix as a dir like string
|
||||
- The third element is a list of files allowed or
|
||||
disallowed by pattern
|
||||
"""
|
||||
parts = path.removeprefix('s3://').split('/')
|
||||
prefix = '/'.join(parts[1:])
|
||||
bucket_name = parts[0]
|
||||
|
||||
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
|
||||
paths = [obj['Key'] for obj in objects.get('Contents', [])]
|
||||
|
||||
paths = _filter_ignore(paths, ["*/"])
|
||||
if allow_pattern is not None:
|
||||
paths = _filter_allow(paths, allow_pattern)
|
||||
|
||||
if ignore_pattern is not None:
|
||||
paths = _filter_ignore(paths, ignore_pattern)
|
||||
|
||||
return bucket_name, prefix, paths
|
||||
292
vllm/transformers_utils/tokenizer.py
Normal file
292
vllm/transformers_utils/tokenizer.py
Normal file
@@ -0,0 +1,292 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
import os
|
||||
import warnings
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import (
|
||||
get_sentence_transformer_tokenizer_config)
|
||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerBase
|
||||
else:
|
||||
ModelConfig = Any
|
||||
LoRARequest = Any
|
||||
TokenizerBase = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||
TokenizerBase]
|
||||
|
||||
|
||||
def decode_tokens(
|
||||
tokenizer: AnyTokenizer,
|
||||
token_ids: list[int],
|
||||
*,
|
||||
skip_special_tokens: Optional[bool] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Backend-agnostic equivalent of HF's
|
||||
`tokenizer.decode(token_ids, ...)`.
|
||||
|
||||
`skip_special_tokens=None` means to use the backend's default
|
||||
settings.
|
||||
"""
|
||||
if skip_special_tokens is not None:
|
||||
return tokenizer.decode(token_ids,
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
|
||||
return tokenizer.decode(token_ids)
|
||||
|
||||
|
||||
def encode_tokens(
|
||||
tokenizer: AnyTokenizer,
|
||||
text: str,
|
||||
*,
|
||||
truncation: Optional[bool] = None,
|
||||
max_length: Optional[int] = None,
|
||||
add_special_tokens: Optional[bool] = None,
|
||||
) -> list[int]:
|
||||
"""
|
||||
Backend-agnostic equivalent of HF's
|
||||
`tokenizer.encode(text, ...)`.
|
||||
|
||||
`add_special_tokens=None` means to use the backend's default
|
||||
settings.
|
||||
"""
|
||||
|
||||
kw_args: dict[str, Any] = {}
|
||||
if max_length is not None:
|
||||
kw_args["max_length"] = max_length
|
||||
|
||||
if truncation is not None:
|
||||
kw_args["truncation"] = truncation
|
||||
|
||||
if add_special_tokens is not None:
|
||||
kw_args["add_special_tokens"] = add_special_tokens
|
||||
|
||||
return tokenizer.encode(text, **kw_args)
|
||||
|
||||
|
||||
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
||||
"""
|
||||
By default, transformers will recompute multiple tokenizer properties
|
||||
each time they are called, leading to a significant slowdown.
|
||||
This proxy caches these properties for faster access.
|
||||
"""
|
||||
cached_tokenizer = copy.copy(tokenizer)
|
||||
|
||||
tokenizer_all_special_ids = tokenizer.all_special_ids
|
||||
tokenizer_all_special_tokens = tokenizer.all_special_tokens
|
||||
tokenizer_all_special_tokens_extended = (
|
||||
tokenizer.all_special_tokens_extended)
|
||||
tokenizer_vocab = tokenizer.get_vocab()
|
||||
tokenizer_len = len(tokenizer)
|
||||
|
||||
max_token_id = max(tokenizer_vocab.values())
|
||||
# Some tokenizers (e.g., QwenTokenizer) have special tokens that
|
||||
# are added and included in the implementation of the vocab_size
|
||||
# property, but not in get_vocab(); if there is an implementation
|
||||
# of vocab size, we should take the greater value.
|
||||
if hasattr(tokenizer, "vocab_size"):
|
||||
with contextlib.suppress(NotImplementedError):
|
||||
max_token_id = max(max_token_id, tokenizer.vocab_size)
|
||||
|
||||
class CachedTokenizer(tokenizer.__class__): # type: ignore
|
||||
|
||||
@property
|
||||
def all_special_ids(self) -> list[int]:
|
||||
return tokenizer_all_special_ids
|
||||
|
||||
@property
|
||||
def all_special_tokens(self) -> list[str]:
|
||||
return tokenizer_all_special_tokens
|
||||
|
||||
@property
|
||||
def all_special_tokens_extended(self) -> list[str]:
|
||||
return tokenizer_all_special_tokens_extended
|
||||
|
||||
@property
|
||||
def max_token_id(self) -> int:
|
||||
return max_token_id
|
||||
|
||||
def get_vocab(self) -> dict[str, int]:
|
||||
return tokenizer_vocab
|
||||
|
||||
def __len__(self) -> int:
|
||||
return tokenizer_len
|
||||
|
||||
def __reduce__(self):
|
||||
return get_cached_tokenizer, (tokenizer, )
|
||||
|
||||
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
|
||||
|
||||
cached_tokenizer.__class__ = CachedTokenizer
|
||||
return cached_tokenizer
|
||||
|
||||
|
||||
def get_tokenizer(
|
||||
tokenizer_name: Union[str, Path],
|
||||
*args,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
revision: Optional[str] = None,
|
||||
download_dir: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> AnyTokenizer:
|
||||
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope.
|
||||
"""
|
||||
if envs.VLLM_USE_MODELSCOPE:
|
||||
# download model from ModelScope hub,
|
||||
# lazy import so that modelscope is not required for normal use.
|
||||
# pylint: disable=C.
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
# avoid circuit import
|
||||
from vllm.model_executor.model_loader.weight_utils import get_lock
|
||||
|
||||
# Only set the tokenizer here, model will be downloaded on the workers.
|
||||
if not os.path.exists(tokenizer_name):
|
||||
# Use file lock to prevent multiple processes from
|
||||
# downloading the same file at the same time.
|
||||
with get_lock(tokenizer_name, download_dir):
|
||||
tokenizer_path = snapshot_download(
|
||||
model_id=tokenizer_name,
|
||||
cache_dir=download_dir,
|
||||
revision=revision,
|
||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||
# Ignore weights - we only need the tokenizer.
|
||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
||||
tokenizer_name = tokenizer_path
|
||||
|
||||
if tokenizer_mode == "slow":
|
||||
if kwargs.get("use_fast", False):
|
||||
raise ValueError(
|
||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||
kwargs["use_fast"] = False
|
||||
|
||||
if "truncation_side" not in kwargs:
|
||||
kwargs["truncation_side"] = "left"
|
||||
|
||||
# Separate model folder from file path for GGUF models
|
||||
is_gguf = check_gguf_file(tokenizer_name)
|
||||
if is_gguf:
|
||||
kwargs["gguf_file"] = Path(tokenizer_name).name
|
||||
tokenizer_name = Path(tokenizer_name).parent
|
||||
|
||||
# if tokenizer is from official mistral org
|
||||
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
|
||||
if is_from_mistral_org and tokenizer_mode != "mistral":
|
||||
warnings.warn(
|
||||
'It is strongly recommended to run mistral models with '
|
||||
'`--tokenizer-mode "mistral"` to ensure correct '
|
||||
'encoding and decoding.',
|
||||
FutureWarning,
|
||||
stacklevel=2)
|
||||
|
||||
tokenizer: AnyTokenizer
|
||||
if tokenizer_mode == "mistral":
|
||||
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
|
||||
revision=revision)
|
||||
elif tokenizer_mode == "custom":
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
|
||||
tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
|
||||
*args,
|
||||
revision=revision,
|
||||
download_dir=download_dir,
|
||||
**kwargs)
|
||||
else:
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
tokenizer_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the tokenizer class not existing or not
|
||||
# currently being imported,
|
||||
# suggest using the --trust-remote-code flag.
|
||||
if not trust_remote_code and (
|
||||
"does not exist or is not currently imported." in str(e)
|
||||
or "requires you to execute the tokenizer file" in str(e)):
|
||||
err_msg = ("Failed to load the tokenizer. If the tokenizer "
|
||||
"is a custom tokenizer not yet available in the "
|
||||
"HuggingFace transformers library, consider "
|
||||
"setting `trust_remote_code=True` in LLM or using "
|
||||
"the `--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
# The special_tokens in tokenizer should also be
|
||||
# controlled by do_lower_case in encoder_config
|
||||
encoder_config = get_sentence_transformer_tokenizer_config(
|
||||
tokenizer_name, revision)
|
||||
if isinstance(encoder_config, dict) and encoder_config.get(
|
||||
"do_lower_case", False):
|
||||
special_tokens_map = {
|
||||
k: v.lower()
|
||||
for k, v in tokenizer.special_tokens_map.items()
|
||||
}
|
||||
tokenizer.add_special_tokens(special_tokens_map)
|
||||
|
||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
logger.warning(
|
||||
"Using a slow tokenizer. This might cause a significant "
|
||||
"slowdown. Consider using a fast tokenizer instead.")
|
||||
tokenizer = get_cached_tokenizer(tokenizer)
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
|
||||
def cached_tokenizer_from_config(
|
||||
model_config: ModelConfig,
|
||||
**kwargs: Any,
|
||||
):
|
||||
return cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
revision=model_config.tokenizer_revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
def init_tokenizer_from_configs(model_config: ModelConfig):
|
||||
runner_type = model_config.runner_type
|
||||
if runner_type == "generate" or runner_type == "draft":
|
||||
truncation_side = "left"
|
||||
elif runner_type == "pooling":
|
||||
truncation_side = "right"
|
||||
else:
|
||||
assert_never(runner_type)
|
||||
|
||||
return get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
revision=model_config.tokenizer_revision,
|
||||
truncation_side=truncation_side,
|
||||
)
|
||||
154
vllm/transformers_utils/tokenizer_base.py
Normal file
154
vllm/transformers_utils/tokenizer_base.py
Normal file
@@ -0,0 +1,154 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
|
||||
|
||||
class TokenizerBase(ABC):
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def all_special_tokens_extended(self) -> list[str]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def all_special_tokens(self) -> list[str]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def all_special_ids(self) -> list[int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def bos_token_id(self) -> int:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def eos_token_id(self) -> int:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def sep_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def pad_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def is_fast(self) -> bool:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def vocab_size(self) -> int:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def max_token_id(self) -> int:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def truncation_side(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.vocab_size
|
||||
|
||||
@abstractmethod
|
||||
def __call__(
|
||||
self,
|
||||
text: Union[str, list[str], list[int]],
|
||||
text_pair: Optional[str] = None,
|
||||
add_special_tokens: bool = False,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
):
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def get_vocab(self) -> dict[str, int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def get_added_vocab(self) -> dict[str, int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def encode_one(
|
||||
self,
|
||||
text: str,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
) -> list[int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def encode(self,
|
||||
text: str,
|
||||
truncation: Optional[bool] = None,
|
||||
max_length: Optional[int] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> list[int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def apply_chat_template(self,
|
||||
messages: list["ChatCompletionMessageParam"],
|
||||
tools: Optional[list[dict[str, Any]]] = None,
|
||||
**kwargs) -> list[int]:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def convert_tokens_to_string(self, tokens: list[str]) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def decode(self,
|
||||
ids: Union[list[int], int],
|
||||
skip_special_tokens: bool = True) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def convert_ids_to_tokens(
|
||||
self,
|
||||
ids: list[int],
|
||||
skip_special_tokens: bool = True,
|
||||
) -> list[str]:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class TokenizerRegistry:
|
||||
# Tokenizer name -> (tokenizer module, tokenizer class)
|
||||
REGISTRY: dict[str, tuple[str, str]] = {}
|
||||
|
||||
@staticmethod
|
||||
def register(name: str, module: str, class_name: str) -> None:
|
||||
TokenizerRegistry.REGISTRY[name] = (module, class_name)
|
||||
|
||||
@staticmethod
|
||||
def get_tokenizer(
|
||||
tokenizer_name: str,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> TokenizerBase:
|
||||
tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
|
||||
if tokenizer_cls is None:
|
||||
raise ValueError(f"Tokenizer {tokenizer_name} not found.")
|
||||
|
||||
tokenizer_module = importlib.import_module(tokenizer_cls[0])
|
||||
class_ = getattr(tokenizer_module, tokenizer_cls[1])
|
||||
return class_.from_pretrained(*args, **kwargs)
|
||||
10
vllm/transformers_utils/tokenizers/__init__.py
Normal file
10
vllm/transformers_utils/tokenizers/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
|
||||
truncate_tool_call_ids, validate_request_params)
|
||||
|
||||
__all__ = [
|
||||
"MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
|
||||
"validate_request_params"
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
521
vllm/transformers_utils/tokenizers/mistral.py
Normal file
521
vllm/transformers_utils/tokenizers/mistral.py
Normal file
@@ -0,0 +1,521 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
import huggingface_hub
|
||||
import regex as re
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
from transformers.tokenization_utils_base import BatchEncoding
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerBase
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# make sure `mistral_common` is lazy imported,
|
||||
# so that users who only use non-mistral models
|
||||
# will not be bothered by the dependency.
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import (
|
||||
MistralTokenizer as PublicMistralTokenizer)
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
|
||||
# SEE: https://github.com/vllm-project/vllm/pull/9951
|
||||
# Credits go to: @gcalmettes
|
||||
# NOTE: There is currently a bug in pydantic where attributes
|
||||
# declared as iterables are replaced in in the instances by
|
||||
# pydantic-core ValidatorIterator instance. In particular, this
|
||||
# affects tool_calls defined in ChatCompletionAssistantMessageParam
|
||||
# model:
|
||||
# see:
|
||||
# - https://github.com/pydantic/pydantic/issues/9467
|
||||
# As a result, tool_calls from assistant messages are never
|
||||
# deserialized in the request object if the tool_calls iterator is
|
||||
# not consumed. This affect messages passed to the MistralTokenizer
|
||||
# since no chat template is applied and therefore the tools_calls
|
||||
# iterator is not directly consumed.
|
||||
# Issue is tracked on Pydantic side, with resolution planned for
|
||||
# v2.11 release. In the meantime, the official workaround is to
|
||||
# consume the iterator so the tool_calls are correctly deserialized
|
||||
# in the OpenAI ChatCompletionAssistantMessageParam object
|
||||
# https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
|
||||
# Official Pydantic Issues:
|
||||
# - https://github.com/pydantic/pydantic/issues/9541
|
||||
# TODO: remove when pydantic v2.11 is released
|
||||
for i, message in enumerate(request.messages):
|
||||
if message.get("role") == 'assistant':
|
||||
tool_calls_validator = message.get("tool_calls", ().__iter__())
|
||||
validated_tool_calls = []
|
||||
while True:
|
||||
try:
|
||||
tool_call = next(tool_calls_validator) # type: ignore
|
||||
validated_tool_calls.append(tool_call)
|
||||
except StopIteration:
|
||||
break
|
||||
|
||||
request.messages[i]["tool_calls"] = validated_tool_calls
|
||||
|
||||
|
||||
def truncate_tool_call_ids(request: "ChatCompletionRequest"):
|
||||
"""Truncates tool call IDs for Mistral's ID requirements."""
|
||||
for i, message in enumerate(request.messages):
|
||||
if message.get("role") == 'assistant':
|
||||
tool_calls = message.get("tool_calls", [])
|
||||
for tool_call in tool_calls:
|
||||
if len(tool_call["id"]) > 9:
|
||||
logger.warning(
|
||||
"Truncating tool call ID: %s to %s",
|
||||
tool_call["id"],
|
||||
tool_call["id"][-9:],
|
||||
)
|
||||
tool_call["id"] = tool_call["id"][-9:]
|
||||
|
||||
request.messages[i]["tool_calls"] = tool_calls
|
||||
|
||||
elif message.get("role") in {"tool_results", "tool"}:
|
||||
if "tool_call_id" in message:
|
||||
tool_call_id = message["tool_call_id"]
|
||||
|
||||
if len(tool_call_id) > 9:
|
||||
logger.warning(
|
||||
"Truncating tool_call_id: %s to %s",
|
||||
tool_call_id,
|
||||
tool_call_id[-9:],
|
||||
)
|
||||
tool_call_id = tool_call_id[-9:]
|
||||
request.messages[i]["tool_call_id"] = tool_call_id
|
||||
|
||||
|
||||
def validate_request_params(request: "ChatCompletionRequest"):
|
||||
if (request.skip_special_tokens is not None
|
||||
and not request.skip_special_tokens):
|
||||
raise ValueError("skip_special_tokens=False is not supported "
|
||||
"for Mistral tokenizers.")
|
||||
|
||||
|
||||
def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
|
||||
repo_cache = os.path.join(
|
||||
huggingface_hub.constants.HF_HUB_CACHE,
|
||||
huggingface_hub.constants.REPO_ID_SEPARATOR.join(
|
||||
["models", *repo_id.split("/")]))
|
||||
|
||||
if revision is None:
|
||||
revision_file = os.path.join(repo_cache, "refs", "main")
|
||||
if os.path.isfile(revision_file):
|
||||
with open(revision_file) as file:
|
||||
revision = file.read()
|
||||
|
||||
if revision:
|
||||
revision_dir = os.path.join(repo_cache, "snapshots", revision)
|
||||
if os.path.isdir(revision_dir):
|
||||
return os.listdir(revision_dir)
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def find_tokenizer_file(files: list[str]):
|
||||
file_pattern = re.compile(
|
||||
r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$")
|
||||
|
||||
matched_files = [file for file in files if file_pattern.match(file)]
|
||||
if len(matched_files) > 1:
|
||||
raise OSError(
|
||||
f"Found {len(matched_files)} files matching the "
|
||||
f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral "
|
||||
f"tokenizer is present in {files}.")
|
||||
elif len(matched_files) == 0:
|
||||
raise OSError(
|
||||
f"Found {len(matched_files)} files matching the "
|
||||
f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
|
||||
f"tokenizer is present in {files}.")
|
||||
|
||||
return matched_files[0]
|
||||
|
||||
|
||||
def _aggregate_content(content: list) -> list[dict[str, Any]]:
|
||||
aggregated_content: list[dict[str, Any]] = []
|
||||
for chunk in content:
|
||||
if chunk.get("type"
|
||||
) == "text" and aggregated_content and aggregated_content[
|
||||
-1].get("type") == "text":
|
||||
aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
|
||||
else:
|
||||
aggregated_content.append(chunk)
|
||||
if len(aggregated_content) == 1 and aggregated_content[0].get(
|
||||
"type") == "text":
|
||||
content = aggregated_content[0]["text"]
|
||||
return content
|
||||
|
||||
|
||||
def make_mistral_chat_completion_request(
|
||||
messages: list["ChatCompletionMessageParam"],
|
||||
tools: Optional[list[dict[str,
|
||||
Any]]] = None) -> "ChatCompletionRequest":
|
||||
last_message = cast(dict[str, Any], messages[-1])
|
||||
if last_message["role"] == "assistant":
|
||||
last_message["prefix"] = True
|
||||
|
||||
# mistral-common requires AssistantMessage content to be string [1].
|
||||
#
|
||||
# [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
|
||||
for message in messages:
|
||||
# Remove reasoning_content as unsupported by Mistral
|
||||
_ = message.pop("reasoning_content", None) # type: ignore
|
||||
|
||||
# Convert list text content to string
|
||||
if message.get("role") in ("assistant", "tool"):
|
||||
content: Any = message.get("content")
|
||||
if isinstance(content, list):
|
||||
content = _aggregate_content(content)
|
||||
message["content"] = content
|
||||
|
||||
# The Mistral client, in comparison to the OpenAI client, requires the
|
||||
# "parameters" dict and the "description" string to be present
|
||||
# even if they are empty.
|
||||
if tools:
|
||||
for function in [
|
||||
tool["function"] for tool in tools
|
||||
if tool["type"] == "function"
|
||||
]:
|
||||
if function.get("parameters") is None:
|
||||
function["parameters"] = {}
|
||||
if function.get("description") is None:
|
||||
function["description"] = ""
|
||||
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
return ChatCompletionRequest(messages=messages,
|
||||
tools=tools) # type: ignore[type-var]
|
||||
|
||||
|
||||
class MistralTokenizer(TokenizerBase):
|
||||
|
||||
def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
|
||||
self.mistral = tokenizer
|
||||
self.instruct = tokenizer.instruct_tokenizer
|
||||
_mistral_version_str = self.instruct.tokenizer.version.value
|
||||
self.version: int = int(_mistral_version_str.split("v")[-1])
|
||||
|
||||
tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
||||
|
||||
self.is_tekken = isinstance(tokenizer_, Tekkenizer)
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer)
|
||||
self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
|
||||
self._special_token_policy = (SpecialTokenPolicy.IGNORE
|
||||
if self.is_tekken else None)
|
||||
if not (self.is_tekken or self.is_spm):
|
||||
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
|
||||
|
||||
self._vocab = tokenizer_.vocab()
|
||||
# Convert to a dict[str, int] to match protocol, but this is a lossy
|
||||
# conversion. There may be multiple token ids that decode to the same
|
||||
# string due to partial UTF-8 byte sequences being converted to <20>
|
||||
self._vocab_dict = {
|
||||
token: idx
|
||||
for idx, token in enumerate(self._vocab)
|
||||
}
|
||||
self.tokenizer = tokenizer_
|
||||
self._max_token_id = self.vocab_size - 1
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
path_or_repo_id: str,
|
||||
*,
|
||||
revision: Optional[str] = None) -> "MistralTokenizer":
|
||||
if not Path(path_or_repo_id).exists():
|
||||
assert len(path_or_repo_id.split("/")) == 2, (
|
||||
"You have either provided a non-existent path: "
|
||||
"{path_or_repo_id} or an invalid HF Hub repo id.")
|
||||
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
|
||||
path_or_repo_id, revision)
|
||||
elif Path(path_or_repo_id).is_dir():
|
||||
tokenizer_file_name = find_tokenizer_file(
|
||||
os.listdir(path_or_repo_id))
|
||||
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
|
||||
else:
|
||||
assert Path(
|
||||
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
|
||||
tokenizer_file = str(Path(path_or_repo_id))
|
||||
|
||||
from mistral_common.tokens.tokenizers.mistral import (
|
||||
MistralTokenizer as PublicMistralTokenizer)
|
||||
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
|
||||
return cls(mistral_tokenizer)
|
||||
|
||||
@staticmethod
|
||||
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
|
||||
revision: Optional[str]) -> str:
|
||||
try:
|
||||
hf_api = HfApi()
|
||||
files = hf_api.list_repo_files(repo_id=tokenizer_name,
|
||||
revision=revision)
|
||||
except ConnectionError as exc:
|
||||
files = list_local_repo_files(repo_id=tokenizer_name,
|
||||
revision=revision)
|
||||
|
||||
if len(files) == 0:
|
||||
raise exc
|
||||
|
||||
filename = find_tokenizer_file(files)
|
||||
|
||||
tokenizer_file = hf_hub_download(tokenizer_name,
|
||||
filename=filename,
|
||||
revision=revision)
|
||||
return tokenizer_file
|
||||
|
||||
# the following attributes are set to fit vLLM's design and are used
|
||||
# by the structured output backends.
|
||||
@property
|
||||
def all_special_tokens_extended(self) -> list[str]:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
|
||||
# tekken defines its own extended special tokens list
|
||||
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
|
||||
special_tokens = self.tokenizer.SPECIAL_TOKENS
|
||||
else:
|
||||
special_tokens = list(SpecialTokens)
|
||||
return [
|
||||
s.value if isinstance(s, SpecialTokens) else s
|
||||
for s in special_tokens
|
||||
]
|
||||
|
||||
@property
|
||||
def all_special_tokens(self) -> list[str]:
|
||||
return self.all_special_tokens_extended
|
||||
|
||||
@property
|
||||
def all_special_ids(self) -> list[int]:
|
||||
return [
|
||||
self.all_special_tokens.index(t) for t in self.all_special_tokens
|
||||
]
|
||||
|
||||
@property
|
||||
def bos_token_id(self) -> int:
|
||||
return self.tokenizer.bos_id
|
||||
|
||||
@property
|
||||
def eos_token_id(self) -> int:
|
||||
return self.tokenizer.eos_id
|
||||
|
||||
@property
|
||||
def sep_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def pad_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self._vocab)
|
||||
|
||||
@property
|
||||
def max_token_id(self) -> int:
|
||||
return self._max_token_id
|
||||
|
||||
@property
|
||||
def truncation_side(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.vocab_size
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: Union[str, list[str], list[int]],
|
||||
text_pair: Optional[str] = None,
|
||||
add_special_tokens: bool = False,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
):
|
||||
input_ids: Union[list[int], list[list[int]]]
|
||||
# For list[str], original prompt text
|
||||
if is_list_of(text, str):
|
||||
input_ids_: list[list[int]] = []
|
||||
for p in text:
|
||||
each_input_ids = self.encode_one(p, truncation, max_length)
|
||||
input_ids_.append(each_input_ids)
|
||||
input_ids = input_ids_
|
||||
# For list[int], apply chat template output, already tokens.
|
||||
elif is_list_of(text, int):
|
||||
input_ids = text
|
||||
# For str, single prompt text
|
||||
else:
|
||||
input_ids = self.encode_one(text, truncation, max_length)
|
||||
return BatchEncoding({"input_ids": input_ids})
|
||||
|
||||
def get_vocab(self) -> dict[str, int]:
|
||||
# NB: the dictionary form of the vocabulary collapses token ids that map
|
||||
# to the same string but have different bytes
|
||||
return self._vocab_dict
|
||||
|
||||
def get_added_vocab(self) -> dict[str, int]:
|
||||
# Mistral tokenizers have no added vocabulary
|
||||
return {}
|
||||
|
||||
def encode_one(
|
||||
self,
|
||||
text: str,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
) -> list[int]:
|
||||
# Mistral Tokenizers should not add special tokens
|
||||
input_ids = self.encode(text)
|
||||
|
||||
if truncation:
|
||||
input_ids = input_ids[:max_length]
|
||||
return input_ids
|
||||
|
||||
def encode(self,
|
||||
text: str,
|
||||
truncation: Optional[bool] = None,
|
||||
max_length: Optional[int] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> list[int]:
|
||||
# `encode` should only be used for prompt completion
|
||||
# it should never be used for chat_completion.
|
||||
# For chat completion use `apply_chat_template`
|
||||
if add_special_tokens is not None:
|
||||
return self.tokenizer.encode(text,
|
||||
bos=add_special_tokens,
|
||||
eos=add_special_tokens)
|
||||
else:
|
||||
return self.tokenizer.encode(text, bos=True, eos=False)
|
||||
|
||||
def apply_chat_template(self,
|
||||
messages: list["ChatCompletionMessageParam"],
|
||||
tools: Optional[list[dict[str, Any]]] = None,
|
||||
**kwargs) -> list[int]:
|
||||
|
||||
request = make_mistral_chat_completion_request(messages, tools)
|
||||
encoded = self.mistral.encode_chat_completion(request)
|
||||
|
||||
# encode-decode to get clean prompt
|
||||
return encoded.tokens
|
||||
|
||||
def convert_tokens_to_string(self, tokens: list[str]) -> str:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
if self.is_tekken:
|
||||
tokens = [
|
||||
t for t in tokens
|
||||
if (t is SpecialTokens.tool_calls
|
||||
or t not in self.tokenizer._all_special_tokens)
|
||||
]
|
||||
|
||||
if any(isinstance(t, bytes) for t in tokens):
|
||||
# we need to encode and decode all tokens again
|
||||
shift = self.tokenizer.num_special_tokens
|
||||
|
||||
def _token_to_id(t: str):
|
||||
t_bytes = t.encode("utf-8") \
|
||||
if not isinstance(t, bytes) else t
|
||||
try:
|
||||
return shift + \
|
||||
self.tokenizer._tekken_token2id_nospecial[t_bytes]
|
||||
except KeyError:
|
||||
logger.warning(
|
||||
"Failed to convert token %s to id,"
|
||||
" replacing with <unk>", t_bytes)
|
||||
return self.tokenizer.unk_id
|
||||
|
||||
ids = [_token_to_id(t) for t in tokens]
|
||||
decoded = self.tokenizer.decode(ids,
|
||||
self._special_token_policy)
|
||||
else:
|
||||
decoded = "".join(tokens)
|
||||
else:
|
||||
# make sure certain special tokens like Tool calls are
|
||||
# not decoded
|
||||
special_tokens = {SpecialTokens.tool_calls}
|
||||
regular_tokens: list[str] = []
|
||||
decoded_list = []
|
||||
|
||||
for token in tokens:
|
||||
if token in special_tokens:
|
||||
if regular_tokens:
|
||||
decoded_list.append(
|
||||
self.tokenizer.decode(regular_tokens,
|
||||
self._special_token_policy))
|
||||
regular_tokens = []
|
||||
decoded_list.append(token)
|
||||
else:
|
||||
regular_tokens.append(token)
|
||||
|
||||
if regular_tokens:
|
||||
decoded_list.append(
|
||||
self.tokenizer.decode(regular_tokens,
|
||||
self._special_token_policy))
|
||||
|
||||
decoded = ''.join(decoded_list)
|
||||
|
||||
return decoded
|
||||
|
||||
def decode(self,
|
||||
ids: Union[list[int], int],
|
||||
skip_special_tokens: bool = True) -> str:
|
||||
assert (
|
||||
skip_special_tokens
|
||||
), "skip_special_tokens=False is not supported for Mistral tokenizers."
|
||||
|
||||
if isinstance(ids, int):
|
||||
ids = [ids]
|
||||
return self.tokenizer.decode(ids, self._special_token_policy)
|
||||
|
||||
def convert_ids_to_tokens(
|
||||
self,
|
||||
ids: list[int],
|
||||
skip_special_tokens: bool = True,
|
||||
) -> list[str]:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
from mistral_common.tokens.tokenizers.instruct import (
|
||||
InstructTokenizerV13)
|
||||
|
||||
# TODO(Patrick) - potentially allow special tokens to not be skipped
|
||||
assert (
|
||||
skip_special_tokens
|
||||
), "skip_special_tokens=False is not supported for Mistral tokenizers."
|
||||
|
||||
assert self.is_tekken or self.is_spm, type(self.tokenizer)
|
||||
|
||||
if self.is_tekken:
|
||||
# skip special tokens except tool call and think tokens
|
||||
non_skip_special_tokens = {
|
||||
self.tokenizer.get_control_token(SpecialTokens.tool_calls)
|
||||
}
|
||||
if isinstance(self.instruct, InstructTokenizerV13):
|
||||
if self.instruct.BEGIN_THINK:
|
||||
non_skip_special_tokens.add(self.instruct.BEGIN_THINK)
|
||||
if self.instruct.END_THINK:
|
||||
non_skip_special_tokens.add(self.instruct.END_THINK)
|
||||
ids = [
|
||||
i for i in ids if i > self.tokenizer.num_special_tokens
|
||||
or i in non_skip_special_tokens
|
||||
]
|
||||
|
||||
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
|
||||
|
||||
if any("<EFBFBD>" in t for t in tokens) and self.is_tekken:
|
||||
# if a decoded token contains the replacement character, then the
|
||||
# token has an incomplete UTF-8 character so we must use bytes
|
||||
# See: https://github.com/vllm-project/vllm/pull/8640
|
||||
# https://github.com/vllm-project/vllm/pull/9625
|
||||
# if underlying tokenizeir is sentencepiece, we just add "<22>"
|
||||
tokens = [
|
||||
self.tokenizer.id_to_byte_piece(id, self._special_token_policy)
|
||||
for id in ids
|
||||
]
|
||||
|
||||
return tokens
|
||||
108
vllm/transformers_utils/utils.py
Normal file
108
vllm/transformers_utils/utils.py
Normal file
@@ -0,0 +1,108 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import struct
|
||||
from functools import cache
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from vllm.envs import VLLM_MODEL_REDIRECT_PATH
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def is_s3(model_or_path: str) -> bool:
|
||||
return model_or_path.lower().startswith('s3://')
|
||||
|
||||
|
||||
def check_gguf_file(model: Union[str, PathLike]) -> bool:
|
||||
"""Check if the file is a GGUF model."""
|
||||
model = Path(model)
|
||||
if not model.is_file():
|
||||
return False
|
||||
elif model.suffix == ".gguf":
|
||||
return True
|
||||
|
||||
try:
|
||||
with model.open("rb") as f:
|
||||
header = f.read(4)
|
||||
|
||||
return header == b"GGUF"
|
||||
except Exception as e:
|
||||
logger.debug("Error reading file %s: %s", model, e)
|
||||
return False
|
||||
|
||||
|
||||
def modelscope_list_repo_files(
|
||||
repo_id: str,
|
||||
revision: Optional[str] = None,
|
||||
token: Union[str, bool, None] = None,
|
||||
) -> list[str]:
|
||||
"""List files in a modelscope repo."""
|
||||
from modelscope.hub.api import HubApi
|
||||
api = HubApi()
|
||||
api.login(token)
|
||||
# same as huggingface_hub.list_repo_files
|
||||
files = [
|
||||
file['Path'] for file in api.get_model_files(
|
||||
model_id=repo_id, revision=revision, recursive=True)
|
||||
if file['Type'] == 'blob'
|
||||
]
|
||||
return files
|
||||
|
||||
|
||||
def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
|
||||
with open(path) as f:
|
||||
try:
|
||||
return json.loads(f.read())
|
||||
except Exception:
|
||||
return dict[str, str]()
|
||||
|
||||
|
||||
def _maybe_space_split_dict(path: Union[str, PathLike]) -> dict[str, str]:
|
||||
parsed_dict = dict[str, str]()
|
||||
with open(path) as f:
|
||||
for line in f.readlines():
|
||||
try:
|
||||
model_name, redirect_name = line.strip().split()
|
||||
parsed_dict[model_name] = redirect_name
|
||||
except Exception:
|
||||
pass
|
||||
return parsed_dict
|
||||
|
||||
|
||||
@cache
|
||||
def maybe_model_redirect(model: str) -> str:
|
||||
"""
|
||||
Use model_redirect to redirect the model name to a local folder.
|
||||
|
||||
:param model: hf model name
|
||||
:return: maybe redirect to a local folder
|
||||
"""
|
||||
|
||||
model_redirect_path = VLLM_MODEL_REDIRECT_PATH
|
||||
|
||||
if not model_redirect_path:
|
||||
return model
|
||||
|
||||
if not Path(model_redirect_path).exists():
|
||||
return model
|
||||
|
||||
redirect_dict = (_maybe_json_dict(model_redirect_path)
|
||||
or _maybe_space_split_dict(model_redirect_path))
|
||||
if (redirect_model := redirect_dict.get(model)):
|
||||
logger.info("model redirect: [ %s ] -> [ %s ]", model, redirect_model)
|
||||
return redirect_model
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def parse_safetensors_file_metadata(
|
||||
path: Union[str, PathLike]) -> dict[str, Any]:
|
||||
with open(path, "rb") as f:
|
||||
length_of_metadata = struct.unpack('<Q', f.read(8))[0]
|
||||
metadata = json.loads(f.read(length_of_metadata).decode('utf-8'))
|
||||
return metadata
|
||||
Reference in New Issue
Block a user