first commit
This commit is contained in:
24
vllm/transformers_utils/__init__.py
Normal file
24
vllm/transformers_utils/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import envs
|
||||
|
||||
# When ModelScope is enabled, patch the hub at package import time so that
# later model downloads go through ModelScope.
if envs.VLLM_USE_MODELSCOPE:
    try:
        # Patch here, before each import happens
        import modelscope
        from packaging import version

        # patch_hub begins from modelscope>=1.18.1
        if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
            raise ImportError(
                'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
                'install by `pip install modelscope -U`')
        from modelscope.utils.hf_util import patch_hub

        # Patch hub to download models from modelscope to speed up.
        patch_hub()
    except ImportError as err:
        # Covers both a missing/too-old modelscope and the explicit version
        # error raised above; the original error is kept as the cause.
        # The version specifier is quoted so that pasting the suggested
        # command into a shell does not treat `>` as an output redirection.
        raise ImportError(
            "Please install modelscope>=1.18.1 via "
            "`pip install 'modelscope>=1.18.1'` to use ModelScope.") from err
|
||||
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/config.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/config.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/processor.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/processor.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/runai_utils.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/runai_utils.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/s3_utils.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/s3_utils.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/utils.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/utils.cpython-310.pyc
Normal file
Binary file not shown.
5
vllm/transformers_utils/chat_templates/__init__.py
Normal file
5
vllm/transformers_utils/chat_templates/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from .registry import get_chat_template_fallback_path
|
||||
|
||||
__all__ = ["get_chat_template_fallback_path"]
|
||||
Binary file not shown.
Binary file not shown.
70
vllm/transformers_utils/chat_templates/registry.py
Normal file
70
vllm/transformers_utils/chat_templates/registry.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# Name the logger after the module (not the file path) so that it belongs to
# the package's logger hierarchy and inherits its handlers and levels.
logger = init_logger(__name__)

# Directory containing this module; the bundled fallback templates are
# resolved relative to it.
CHAT_TEMPLATES_DIR = Path(__file__).parent

# A fallback is either a fixed template path, or a callable that receives the
# tokenizer name/path and returns a template path (or None).
ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
|
||||
|
||||
|
||||
def _get_qwen_chat_template_fallback(
        tokenizer_name_or_path: str) -> Optional[Path]:
    """Pick the fallback chat template for a Qwen tokenizer.

    Names ending in "-Chat" get the ChatML template; every other name gets
    the basic template.
    """
    is_chat_variant = tokenizer_name_or_path.endswith("-Chat")
    template_file = ("template_chatml.jinja"
                     if is_chat_variant else "template_basic.jinja")
    return CHAT_TEMPLATES_DIR / template_file
|
||||
|
||||
|
||||
def _get_minicpmv_chat_template_fallback(
        tokenizer_name_or_path: str) -> Optional[Path]:
    """Pick the fallback chat template for a MiniCPM-V tokenizer.

    A name containing "4.5" or "4_5" selects the dedicated MiniCPM-V-4.5
    template; all other names fall back to the ChatML template.
    """
    version_markers = ("4.5", "4_5")
    for marker in version_markers:
        if marker in tokenizer_name_or_path:
            # MiniCPM-V-4.5 ships with its own template.
            return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"

    # Every other version shares the ChatML template.
    return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
|
||||
|
||||
|
||||
# Fallback chat templates keyed by model type. Each value is either a fixed
# template path or a callable that chooses a path from the tokenizer
# name/path (see `ChatTemplatePath`).
# yapf: disable
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
    "chameleon": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
    "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
    "minicpmv": _get_minicpmv_chat_template_fallback,
    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "qwen": _get_qwen_chat_template_fallback,
}
# yapf: enable
|
||||
|
||||
|
||||
def register_chat_template_fallback_path(
    model_type: str,
    chat_template: ChatTemplatePath,
) -> None:
    """Register a fallback chat template for ``model_type``.

    If the model type already has an entry, a warning is logged and the
    entry is replaced.

    Args:
        model_type: Key under which the template is stored.
        chat_template: A template path, or a callable that maps a tokenizer
            name/path to a template path (or ``None``).
    """
    registry = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK
    already_registered = model_type in registry

    if already_registered:
        logger.warning(
            "Model type %s already has a chat template registered. "
            "It will be overwritten by the new chat template %s.", model_type,
            chat_template)

    registry[model_type] = chat_template
|
||||
|
||||
|
||||
def get_chat_template_fallback_path(
    model_type: str,
    tokenizer_name_or_path: str,
) -> Optional[Path]:
    """Look up the fallback chat template path for a model type.

    Args:
        model_type: Key to look up in the fallback registry.
        tokenizer_name_or_path: Passed to the registered entry when that
            entry is a callable.

    Returns:
        The registered path, the result of calling the registered callable,
        or ``None`` when the model type has no entry (or the callable
        returns ``None``).
    """
    entry = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)

    # Callable entries resolve the path lazily from the tokenizer name;
    # plain paths (and a missing entry, i.e. None) are returned unchanged.
    if callable(entry):
        return entry(tokenizer_name_or_path)
    return entry
|
||||
@@ -0,0 +1,3 @@
|
||||
{%- for message in messages -%}
|
||||
{{- message['content'] -}}
|
||||
{%- endfor -%}
|
||||
11
vllm/transformers_utils/chat_templates/template_blip2.jinja
Normal file
11
vllm/transformers_utils/chat_templates/template_blip2.jinja
Normal file
@@ -0,0 +1,11 @@
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- 'Question: ' + message['content'] + ' ' -}}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{- 'Answer: ' + message['content'] + ' ' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- 'Answer:' -}}
|
||||
{% endif %}
|
||||
10
vllm/transformers_utils/chat_templates/template_chatml.jinja
Normal file
10
vllm/transformers_utils/chat_templates/template_chatml.jinja
Normal file
@@ -0,0 +1,10 @@
|
||||
{%- for message in messages -%}
|
||||
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] -}}
|
||||
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
|
||||
{{- '<|im_end|>' + '\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
|
||||
{{- '<|im_start|>assistant\n' -}}
|
||||
{%- endif -%}
|
||||
@@ -0,0 +1,23 @@
|
||||
{%- if messages[0]['role'] == 'system' -%}
|
||||
{%- set system_message = messages[0]['content'] -%}
|
||||
{%- set messages = messages[1:] -%}
|
||||
{%- else -%}
|
||||
{% set system_message = '' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{{ bos_token + system_message }}
|
||||
{%- for message in messages -%}
|
||||
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
|
||||
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{ '<|User|>: ' + message['content'] + '\n\n' }}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{ '<|Assistant|>: ' + message['content'] + eos_token + '\n\n' }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{{ '<|Assistant|>: ' }}
|
||||
{%- endif -%}
|
||||
@@ -0,0 +1,3 @@
|
||||
{%- for message in messages -%}
|
||||
{{- message['content'] + '\n' -}}
|
||||
{%- endfor -%}
|
||||
@@ -0,0 +1,93 @@
|
||||
{%- set enable_thinking = enable_thinking | default(false) %}
|
||||
{%- if tools %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- messages[0].content + '\n\n' }}
|
||||
{%- endif %}
|
||||
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set content = message.content %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in message.content %}
|
||||
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{%- if loop.last or (not loop.last and reasoning_content) %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
|
||||
{%- if message.tool_calls %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if (loop.first and content) or (not loop.first) %}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.function %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<tool_call>\n{"name": "' }}
|
||||
{{- tool_call.name }}
|
||||
{{- '", "arguments": ' }}
|
||||
{%- if tool_call.arguments is string %}
|
||||
{{- tool_call.arguments }}
|
||||
{%- else %}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{%- endif %}
|
||||
{{- '}\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- message.content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- endif %}
|
||||
{%- if enable_thinking is defined and enable_thinking is true %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
1102
vllm/transformers_utils/config.py
Normal file
1102
vllm/transformers_utils/config.py
Normal file
File diff suppressed because it is too large
Load Diff
20
vllm/transformers_utils/config_parser_base.py
Normal file
20
vllm/transformers_utils/config_parser_base.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ConfigParserBase(ABC):
    """Abstract interface for model configuration parsers.

    Concrete subclasses must implement `parse`.
    """

    @abstractmethod
    def parse(self,
              model: Union[str, Path],
              trust_remote_code: bool,
              revision: Optional[str] = None,
              code_revision: Optional[str] = None,
              **kwargs) -> tuple[dict, PretrainedConfig]:
        """Parse the configuration of ``model``.

        Args:
            model: Model identifier or filesystem path.
            trust_remote_code: Flag forwarded by the caller.
                NOTE(review): presumably controls whether custom code from
                the model repository may be executed — confirm against
                implementations.
            revision: Optional revision of the model.
            code_revision: Optional revision of the model code.
            **kwargs: Additional implementation-specific options.

        Returns:
            A ``(dict, PretrainedConfig)`` tuple.
            NOTE(review): presumably the raw config dictionary and the
            parsed config object — confirm against implementations.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
|
||||
63
vllm/transformers_utils/configs/__init__.py
Normal file
63
vllm/transformers_utils/configs/__init__.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Model configs may be defined in this directory for the following reasons:
|
||||
|
||||
- There is no configuration file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing config to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
|
||||
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
|
||||
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
|
||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.jais import JAISConfig
|
||||
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
|
||||
from vllm.transformers_utils.configs.medusa import MedusaConfig
|
||||
from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
|
||||
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
from vllm.transformers_utils.configs.nemotron import NemotronConfig
|
||||
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
|
||||
from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
|
||||
from vllm.transformers_utils.configs.olmo3 import Olmo3Config
|
||||
from vllm.transformers_utils.configs.ovis import OvisConfig
|
||||
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
|
||||
from vllm.transformers_utils.configs.radio import RadioConfig
|
||||
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
|
||||
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
|
||||
Step3VisionEncoderConfig,
|
||||
Step3VLConfig)
|
||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||
|
||||
__all__ = [
|
||||
"ChatGLMConfig",
|
||||
"DeepseekVLV2Config",
|
||||
"DeepseekV3Config",
|
||||
"DotsOCRConfig",
|
||||
"EAGLEConfig",
|
||||
"RWConfig",
|
||||
"JAISConfig",
|
||||
"MedusaConfig",
|
||||
"MiDashengLMConfig",
|
||||
"MLPSpeculatorConfig",
|
||||
"MoonViTConfig",
|
||||
"KimiVLConfig",
|
||||
"NemotronConfig",
|
||||
"NemotronHConfig",
|
||||
"Nemotron_Nano_VL_Config",
|
||||
"Olmo3Config",
|
||||
"OvisConfig",
|
||||
"RadioConfig",
|
||||
"SpeculatorsConfig",
|
||||
"UltravoxConfig",
|
||||
"Step3VLConfig",
|
||||
"Step3VisionEncoderConfig",
|
||||
"Step3TextConfig",
|
||||
"Qwen3NextConfig",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/configs/__pycache__/ovis.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/configs/__pycache__/ovis.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
207
vllm/transformers_utils/configs/arctic.py
Normal file
207
vllm/transformers_utils/configs/arctic.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||
""" Arctic model configuration"""
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Maps the "arctic" key to the URL of the Snowflake Arctic instruct config.
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
}
|
||||
|
||||
|
||||
@dataclass
class ArcticLoRAConfig:
    """LoRA-related settings for Arctic, stored as a plain dataclass.

    NOTE(review): field semantics are inferred from their names only; this
    file does not use them — confirm against the model implementation.
    """
    # Default 64.
    lora_r: int = 64
    # Default 16 (an int literal for a float-annotated field).
    lora_alpha: float = 16
    # Default False.
    shard_base_weights: bool = False
|
||||
|
||||
|
||||
@dataclass
class ArcticQuantizationConfig:
    """Quantization settings for Arctic, stored as a plain dataclass.

    `ArcticConfig` builds an instance of this class from a `dict` and turns
    it back into a `dict` when serializing.

    NOTE(review): field semantics are inferred from their names only —
    confirm against the model implementation.
    """
    # Default 8.
    q_bits: int = 8
    # Default "nearest".
    rounding: str = "nearest"
    # Default 3.
    mantissa_bits: int = 3
    # Default 128.
    group_size: int = 128
|
||||
|
||||
|
||||
class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture.
    TODO(rsamdani): document which released model the default configuration corresponds to.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified (`None`), it is set to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with. The upstream docstring states that
            Arctic's sliding window attention allows sequences of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*, defaults to `None`):
            Sliding window attention window size.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        moe_layer_frequency (`int`, *optional*, defaults to 2):
            Stored on the config unchanged; semantics are defined by the model implementation.
        parallel_attn_mlp_res (`bool`, *optional*, defaults to `False`):
            Stored on the config unchanged; semantics are defined by the model implementation.
        moe_train_capacity_factor (`int`, *optional*, defaults to 1):
            Stored on the config unchanged; semantics are defined by the model implementation.
        moe_eval_capacity_factor (`int`, *optional*, defaults to 1):
            Stored on the config unchanged; semantics are defined by the model implementation.
        enable_expert_tensor_parallelism (`bool`, *optional*, defaults to `False`):
            Stored on the config unchanged; semantics are defined by the model implementation.
        moe_min_capacity (`int`, *optional*, defaults to 0):
            Stored on the config unchanged; semantics are defined by the model implementation.
        moe_token_dropping (`bool`, *optional*, defaults to `True`):
            Stored on the config unchanged; semantics are defined by the model implementation.
        quantization (`dict` or `ArcticQuantizationConfig`, *optional*):
            A `dict` is converted to an `ArcticQuantizationConfig`; any other value is stored unchanged.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from the Arctic 7B style configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        # Store every argument as an attribute; see the class docstring for
        # the meaning and default of each one.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        # A dict is promoted to the typed dataclass; anything else
        # (including None or an existing instance) is kept as given.
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "ArcticConfig":
        """Build a config from a dict, ensuring `quantization` is typed.

        Delegates to the parent `from_dict`, then converts a `dict`-valued
        `quantization` attribute into an `ArcticQuantizationConfig`.

        Returns:
            Whatever the parent `from_dict` returned: either the config, or a
            tuple whose first element is the config.
            NOTE(review): the tuple form presumably occurs when unused kwargs
            are requested from the parent — confirm against `PretrainedConfig`.
        """
        result = super().from_dict(config_dict, **kwargs)
        # The parent may hand back a tuple; the config is its first element.
        config = result[0] if isinstance(result, tuple) else result
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, flattening `quantization` to a plain dict.

        An `ArcticQuantizationConfig` value under the "quantization" key is
        replaced by its `dataclasses.asdict` representation.
        """
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret
|
||||
72
vllm/transformers_utils/configs/chatglm.py
Normal file
72
vllm/transformers_utils/configs/chatglm.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/zai-org/ChatGLM2-6B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ChatGLMConfig(PretrainedConfig):
    """Configuration class for ChatGLM models.

    Every constructor argument is stored as an attribute of the same name,
    with two additions derived from other arguments:

    - `vocab_size` is set to `padded_vocab_size`.
    - `max_position_embeddings` is set to `seq_length`.

    `attribute_map` exposes `num_hidden_layers` as an alias of `num_layers`
    and `n_head_kv` as an alias of `multi_query_group_num`.
    """
    model_type = "chatglm"
    # Alias name -> actual attribute name.
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        # The vocabulary size is taken from the padded vocabulary size.
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        # Remaining keyword arguments are handled by PretrainedConfig.
        super().__init__(**kwargs)
|
||||
101
vllm/transformers_utils/configs/deepseek_v3.py
Normal file
101
vllm/transformers_utils/configs/deepseek_v3.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DeepseekV3Config(PretrainedConfig):
    """Configuration class for DeepSeek-V3 models.

    Every constructor argument except the token ids and
    `tie_word_embeddings` is stored as an attribute of the same name; those
    four are forwarded to `PretrainedConfig.__init__` together with any
    extra keyword arguments. When `num_key_value_heads` is `None`, it is set
    to `num_attention_heads`.
    """

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Token ids and embedding tying are handled by PretrainedConfig.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||
216
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
216
vllm/transformers_utils/configs/deepseek_vl2.py
Normal file
@@ -0,0 +1,216 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class VisionEncoderConfig(PretrainedConfig):
    """Settings of the vision encoder embedded in ``DeepseekVLV2Config``.

    Every field has a class-level default. ``weight_init``,
    ``deterministic`` and ``num_recomputing_layers`` are declared at class
    level only; ``__init__`` neither accepts nor assigns them.
    """

    model_type: str = "vision"

    # Class-level defaults.
    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384
    patch_size: int = 16
    width: int = 1024
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(
        self,
        model_name: str = "vit_so400m_patch14_siglip_384.webli",
        image_size: int = 384,
        patch_size: int = 16,
        width: int = 1024,
        layers: int = 24,
        heads: int = 16,
        mlp_ratio: int = 4,
        global_pool: str = "map",
        ignore_head: bool = True,
        class_token: bool = False,
        num_classes: int = 0,
        use_checkpoint: bool = False,
        **kwargs,
    ):
        """Store the encoder settings, then defer to ``PretrainedConfig``.

        All named arguments are copied verbatim onto the instance; any
        remaining keyword arguments are forwarded to the base class.
        """
        # Backbone identity and input geometry.
        self.model_name = model_name
        self.image_size = image_size
        self.patch_size = patch_size

        # Transformer dimensions.
        self.width = width
        self.layers = layers
        self.heads = heads
        self.mlp_ratio = mlp_ratio

        # Head / pooling behaviour.
        self.global_pool = global_pool
        self.ignore_head = ignore_head
        self.class_token = class_token
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint

        # The base initialiser runs last, exactly as in the original.
        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MlpProjectorConfig(PretrainedConfig):
    """Settings of the MLP projector embedded in ``DeepseekVLV2Config``.

    ``token_pooling`` is declared at class level only; ``__init__`` neither
    accepts nor assigns it.
    """

    model_type = "mlp_projector"

    # Class-level defaults.
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    token_pooling: bool = False

    def __init__(
        self,
        projector_type: str = "downsample_mlp_gelu",
        input_dim: int = 1152,
        n_embed: int = 2048,
        depth: int = 2,
        mlp_ratio: int = 1,
        downsample_ratio: int = 2,
        **kwargs,
    ):
        """Store the projector settings, then defer to ``PretrainedConfig``.

        All named arguments are copied verbatim onto the instance; any
        remaining keyword arguments are forwarded to the base class.
        """
        self.projector_type = projector_type

        # Input / output widths of the projector.
        self.input_dim = input_dim
        self.n_embed = n_embed

        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.downsample_ratio = downsample_ratio

        # The base initialiser runs last, exactly as in the original.
        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class DeepseekV2Config(PretrainedConfig):
    """Configuration of a DeepSeek-V2 language model.

    Within this code base it is also used as the text backbone of
    ``DeepseekVLV2Config``. All named constructor arguments are stored on
    the instance under the same name; the token ids and
    ``tie_word_embeddings`` are handed to ``PretrainedConfig`` together with
    any extra keyword arguments.
    """

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # NOTE(review): 'gready' looks like a misspelling of 'greedy'. It is
        # a runtime value, so it is kept byte-for-byte; confirm what the
        # consumers compare against before changing it.
        topk_method='gready',
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        """Record every hyper-parameter and initialise the base config.

        ``num_key_value_heads=None`` falls back to ``num_attention_heads``.
        ``rms_norm_eps`` is coerced with ``float()`` before being stored.
        """
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Expert layout.
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor

        # Attention ranks and head dimensions.
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim

        # Expert routing.
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux

        # For backward compatibility: an unspecified KV head count means
        # "same as the number of attention heads".
        self.num_key_value_heads = (num_attention_heads
                                    if num_key_value_heads is None else
                                    num_key_value_heads)

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        # The base initialiser runs last, exactly as in the original.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||
|
||||
|
||||
class DeepseekVLV2Config(PretrainedConfig):
    """Top-level DeepSeek-VL2 configuration.

    Combines three nested configurations, read from the keyword arguments
    ``vision_config``, ``projector_config`` and ``language_config``:

    * ``vision_config``    -> ``VisionEncoderConfig``
    * ``projector_config`` -> ``MlpProjectorConfig``
    * ``text_config``      -> ``DeepseekV2Config`` (built from the
      ``language_config`` keyword argument)

    Each nested value may be a ``dict`` of constructor arguments, an
    already-built config object, or ``None``/absent (defaults are used).
    """

    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int], ...] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "tile_tag",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int, int],
                                              ...] = ((384, 384), ),
                 **kwargs):
        """Build the nested configs and record the tiling settings.

        Args:
            tile_tag: Stored as ``self.tile_tag``.
                NOTE(review): the constructor default ("tile_tag") differs
                from the class-level default ("2D"); kept unchanged for
                interface compatibility - confirm which one is intended.
            global_view_pos: Stored as ``self.global_view_pos``.
            candidate_resolutions: Stored as ``self.candidate_resolutions``.
            **kwargs: Forwarded to ``PretrainedConfig``; also the source of
                ``vision_config``, ``projector_config`` and
                ``language_config``.
        """
        super().__init__(**kwargs)

        self.vision_config = self._build_sub_config(
            VisionEncoderConfig, kwargs.get("vision_config"))
        self.projector_config = self._build_sub_config(
            MlpProjectorConfig, kwargs.get("projector_config"))
        # The language model arrives as `language_config` but is exposed
        # as `text_config`.
        self.text_config = self._build_sub_config(
            DeepseekV2Config, kwargs.get("language_config"))

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size

    @staticmethod
    def _build_sub_config(config_cls, value):
        """Coerce ``value`` into an instance of ``config_cls``.

        ``None`` yields a default-constructed config, a ``dict`` is expanded
        into constructor keyword arguments, and anything else is assumed to
        be an already-built config and is returned unchanged. Previously
        only the ``dict`` form worked; the other two raised ``TypeError``.
        """
        if value is None:
            return config_cls()
        if isinstance(value, dict):
            return config_cls(**value)
        return value
|
||||
69
vllm/transformers_utils/configs/dotsocr.py
Normal file
69
vllm/transformers_utils/configs/dotsocr.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, Optional
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.models.qwen2 import Qwen2Config
|
||||
|
||||
|
||||
class DotsVisionConfig(PretrainedConfig):
    """Vision-tower settings embedded in ``DotsOCRConfig``.

    Notes carried over from the original inline comments:

    * ``embed_dim`` is the vision encoder embed size.
    * ``hidden_size`` is the hidden size after the merger.
    * ``is_causal`` controls causal forward of the vision encoder ("ve").
    """

    model_type: str = "dots_vit"

    def __init__(
        self,
        embed_dim: int = 1536,
        hidden_size: int = 1536,
        intermediate_size: int = 4224,
        num_hidden_layers: int = 42,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 1,
        rms_norm_eps: float = 1e-5,
        use_bias: bool = False,
        attn_implementation="flash_attention_2",
        initializer_range=0.02,
        init_merger_std=0.02,
        is_causal=False,
        post_norm=True,
        gradient_checkpointing=False,
        **kwargs: Any,
    ):
        """Initialise the base config first, then store every named field."""
        # Unlike most configs in this package, the base initialiser runs
        # BEFORE the attributes are assigned; that order is preserved.
        super().__init__(**kwargs)

        # Model dimensions.
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Patch layout.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

        # Remaining options.
        self.rms_norm_eps = rms_norm_eps
        self.use_bias = use_bias
        self.attn_implementation = attn_implementation
        self.initializer_range = initializer_range
        self.init_merger_std = init_merger_std
        self.is_causal = is_causal
        self.post_norm = post_norm
        self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
|
||||
class DotsOCRConfig(Qwen2Config):
    """``Qwen2Config`` extended with image/video token ids and a
    ``DotsVisionConfig`` stored as ``vision_config``."""

    model_type = "dots_ocr"

    def __init__(self,
                 image_token_id=151665,
                 video_token_id=151656,
                 vision_config: Optional[dict] = None,
                 *args,
                 **kwargs):
        """Initialise the Qwen2 part, then attach the multimodal fields.

        Args:
            image_token_id: Stored as ``self.image_token_id``.
            video_token_id: Stored as ``self.video_token_id``.
            vision_config: Keyword arguments for ``DotsVisionConfig``; a
                falsy value (``None`` or an empty dict) selects the vision
                defaults.
            *args, **kwargs: Forwarded unchanged to ``Qwen2Config``.
        """
        super().__init__(*args, **kwargs)

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        vision_kwargs = vision_config if vision_config else {}
        self.vision_config = DotsVisionConfig(**vision_kwargs)

    def save_pretrained(self, save_directory, **kwargs):
        """Reset ``_auto_class`` to ``None``, then save via the parent."""
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)
|
||||
84
vllm/transformers_utils/configs/eagle.py
Normal file
84
vllm/transformers_utils/configs/eagle.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
||||
|
||||
|
||||
class EAGLEConfig(PretrainedConfig):
    """Configuration wrapper for an EAGLE / EAGLE3 model.

    It holds the wrapped model's configuration in ``self.model``, derives
    the ``architectures`` list from it according to ``method``, and finally
    mirrors the wrapped model's fields onto itself.
    """

    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 method: Optional[str] = 'eagle',
                 **kwargs):
        """Build the wrapper.

        Args:
            model: Config of the wrapped model, either as an object or as a
                dict. A dict whose ``architectures`` names a DeepSeek V2/V3
                causal LM is turned into a ``DeepseekV2Config``; any other
                dict goes through ``AutoConfig.for_model``.
            truncated_vocab_size: Stored as-is when given; otherwise the
                wrapped model's ``vocab_size`` is used (``None`` when there
                is no wrapped model).
            method: ``"eagle"`` or ``"eagle3"``.
            **kwargs: Extra fields. Those that already exist on the wrapped
                config (other than ``architectures`` / ``model_type``)
                overwrite its values; all are forwarded to
                ``PretrainedConfig``.

        Raises:
            ValueError: If ``method`` is neither ``"eagle"`` nor
                ``"eagle3"``.
        """
        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
        if not isinstance(model, dict):
            model_config = model
        else:
            archs = model.get("architectures", [])
            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
            is_deepseek = any(name in archs for name in target_archs)
            if is_deepseek:
                # AutoConfig does not support DeepSeek MoE models yet
                model_config = DeepseekV2Config(**model)
            else:
                model_config = AutoConfig.for_model(**model)

        # Push overrides down into the wrapped config, but never touch its
        # identity fields.
        for key, value in kwargs.items():
            if key in ("architectures", "model_type"):
                continue
            if hasattr(model_config, key):
                setattr(model_config, key, value)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        elif truncated_vocab_size is None:
            self.truncated_vocab_size = self.model.vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size

        # Eagle model name should follow naming convention of
        # LlamaForCausalLM -> EagleLlamaForCausalLM
        # LlamaForCausalLM -> Eagle3LlamaForCausalLM
        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
        if method == "eagle":
            assert self.model is not None, \
                "model should not be None when method is eagle"
            kwargs["architectures"] = [
                arch if arch.startswith("Eagle") else f"Eagle{arch}"
                for arch in self.model.architectures
            ]
        elif method == "eagle3":
            assert self.model is not None, \
                "model should not be None when method is eagle3"
            kwargs["architectures"] = [
                f"Eagle3{arch}" if not (arch.startswith("Eagle3")
                                        or arch.endswith("Eagle3")) else arch
                for arch in self.model.architectures
            ]
        else:
            raise ValueError(f"Invalid method {method}. "
                             "Supported methods are eagle and eagle3.")

        super().__init__(**kwargs)

        # Mirror the wrapped model's fields onto this config, without
        # overriding anything the caller passed explicitly (this includes
        # the "architectures" entry added above).
        if self.model is not None:
            for key, value in self.model.to_dict().items():
                if key in kwargs:
                    continue
                setattr(self, key, value)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        """Load the raw config dict and build an ``EAGLEConfig`` from it."""
        raw_config, remaining_kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(raw_config, **remaining_kwargs)
|
||||
90
vllm/transformers_utils/configs/falcon.py
Normal file
90
vllm/transformers_utils/configs/falcon.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Falcon configuration"""
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class RWConfig(PretrainedConfig):
    """Falcon ("RW") model configuration.

    ``attribute_map`` exposes ``n_layer``, ``n_head`` and ``n_head_kv``
    under the names ``num_hidden_layers``, ``num_attention_heads`` and
    ``num_kv_heads``.
    """

    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        """Store the Falcon hyper-parameters.

        A legacy ``n_embed`` keyword, when present and not ``None``, takes
        precedence over ``hidden_size``. ``n_head_kv=None`` is stored as 1.
        """
        self.vocab_size = vocab_size

        # Backward compatibility with n_embed kwarg
        legacy_width = kwargs.pop("n_embed", None)
        if legacy_width is None:
            self.hidden_size = hidden_size
        else:
            self.hidden_size = legacy_width

        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        if n_head_kv is None:
            self.n_head_kv = 1
        else:
            self.n_head_kv = n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        """Per-head width: ``hidden_size`` floor-divided by ``n_head``."""
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        """Logical negation of ``alibi``."""
        return not self.alibi
|
||||
237
vllm/transformers_utils/configs/jais.py
Normal file
237
vllm/transformers_utils/configs/jais.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2023 Cerebras Systems.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""JAIS configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        bos_token_id (`int`, *optional*, defaults to 50256):
            Beginning-of-sequence token id; stored on the config and
            forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 50256):
            End-of-sequence token id; stored on the config and forwarded
            to [`PretrainedConfig`].
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to
            `False`):
            Whether to additionally scale attention weights
            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        # Model dimensions.
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner

        # Activation, dropout and normalisation.
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        # Attention behaviour.
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Positional embedding and muP settings.
        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        # Validate the ALiBi scaling dict before the base initialiser runs.
        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()

        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.

        `None` is accepted. Anything else must be a two-entry dict whose
        `type` is "linear"; a `factor`, if present, must be a float > 1.0
        and a `train_seq_len`, if present, must be an integer > 1.
        A `ValueError` is raised otherwise.
        """
        scaling = self.alibi_scaling
        if scaling is None:
            return

        well_formed = isinstance(scaling, dict) and len(scaling) == 2
        if not well_formed:
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")

        alibi_scaling_type = scaling.get("type", None)
        alibi_scaling_factor = scaling.get("factor", None)
        alibi_dynamic_scaling = scaling.get("train_seq_len", None)

        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")

        # The type test comes first so the comparison is only evaluated for
        # values of the expected type.
        if alibi_scaling_factor is not None and (
                not isinstance(alibi_scaling_factor, float)
                or alibi_scaling_factor <= 1.0):
            raise ValueError(
                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")

        if alibi_dynamic_scaling is not None and (
                not isinstance(alibi_dynamic_scaling, int)
                or alibi_dynamic_scaling <= 1):
            raise ValueError(
                f"`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")
|
||||
37
vllm/transformers_utils/configs/kimi_vl.py
Normal file
37
vllm/transformers_utils/configs/kimi_vl.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
||||
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
|
||||
|
||||
class KimiVLConfig(PretrainedConfig):
    """Kimi-VL configuration: a ``MoonViTConfig`` vision part plus a
    ``DeepseekV2Config`` text part."""

    model_type = "kimi_vl"

    def __init__(self,
                 vision_config: Optional[Union[dict, MoonViTConfig]] = None,
                 text_config: Optional[Union[dict, DeepseekV2Config]] = None,
                 ignore_index: int = -100,
                 media_placeholder_token_id: int = 163605,
                 pad_token_id: int = 0,
                 **kwargs):
        """Normalise the nested configs and store the scalar settings.

        Args:
            vision_config: ``None`` (defaults), a dict of ``MoonViTConfig``
                arguments, or a ready-made object that is stored unchanged.
            text_config: ``None`` (defaults), a dict of
                ``DeepseekV2Config`` arguments, or a ready-made object that
                is stored unchanged.
            ignore_index: Stored as ``self.ignore_index``.
            media_placeholder_token_id: Stored as
                ``self.media_placeholder_token_id``.
            pad_token_id: Forwarded to ``PretrainedConfig``.
            **kwargs: Forwarded to ``PretrainedConfig``.
        """
        if isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        elif vision_config is None:
            vision_config = MoonViTConfig()
        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config = DeepseekV2Config(**text_config)
        elif text_config is None:
            text_config = DeepseekV2Config()
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)
|
||||
63
vllm/transformers_utils/configs/medusa.py
Normal file
63
vllm/transformers_utils/configs/medusa.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MedusaConfig(PretrainedConfig):
    """Configuration of a Medusa model.

    ``num_lookahead_tokens`` is an alias of ``num_heads``.
    """

    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):
        """Store the Medusa settings.

        ``truncated_vocab_size=None`` falls back to ``vocab_size``.
        ``max_seq_len`` is fixed at ``2**20``. ``architectures`` defaults
        to ``["MedusaModel"]`` when the caller does not supply it.
        """
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)

        if truncated_vocab_size is None:
            self.truncated_vocab_size = vocab_size
        else:
            self.truncated_vocab_size = truncated_vocab_size

        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        """Load a config dict and normalise its head/layer count keys.

        Any key containing both ``num`` and ``heads`` is renamed to
        ``num_heads``; any other key containing ``num`` and ``layers`` is
        renamed to ``num_hidden_layers``.
        """
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # Iterate over a snapshot of the keys: the dict is mutated below.
        for key in list(config_dict):
            if 'num' not in key:
                continue
            if 'heads' in key:
                config_dict["num_heads"] = config_dict.pop(key)
            elif 'layers' in key:
                config_dict["num_hidden_layers"] = config_dict.pop(key)

        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        """Always 0."""
        return 0

    @property
    def num_lookahead_tokens(self):
        """Alias of ``num_heads``."""
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens
|
||||
101
vllm/transformers_utils/configs/midashenglm.py
Normal file
101
vllm/transformers_utils/configs/midashenglm.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
|
||||
# Copyright 2024 The Qwen team.
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
|
||||
Qwen2_5OmniTextConfig)
|
||||
|
||||
|
||||
class DashengConfig(PretrainedConfig):
    """Configuration for the `midashenglm_dasheng_encoder` model type.

    Every named constructor argument is stored unchanged on the instance
    under an attribute of the same name. Any remaining keyword arguments are
    forwarded to `PretrainedConfig.__init__`, which runs after the
    attributes have been assigned.
    """

    model_type = "midashenglm_dasheng_encoder"

    def __init__(
        self,
        embed_dim: int = 768,
        outputdim: int = 527,
        patch_size: Union[int, tuple[int, int]] = 16,
        patch_stride: Union[int, tuple[int, int]] = 16,
        input_channels: int = 1,
        target_length: int = 1012,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        init_values: Optional[float] = None,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        f_min: float = 0.0,
        f_max: float = 8000.0,
        center: bool = True,
        win_length: int = 512,
        hop_length: int = 160,
        sample_rate: int = 16000,
        n_fft: int = 512,
        n_mels: int = 64,
        **kwargs,
    ):
        # Attributes are assigned in signature order.
        # NOTE(review): the names suggest the first group describes the
        # transformer encoder and the second the audio feature front-end;
        # confirm against the model implementation.
        self.embed_dim = embed_dim
        self.outputdim = outputdim
        self.patch_size = patch_size
        self.patch_stride = patch_stride
        self.input_channels = input_channels
        self.target_length = target_length
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.init_values = init_values
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate

        self.f_min = f_min
        self.f_max = f_max
        self.center = center
        self.win_length = win_length
        self.hop_length = hop_length
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.n_mels = n_mels

        # Base-class initialisation happens last, with the leftover kwargs.
        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class MiDashengLMConfig(PretrainedConfig):
    """Top-level configuration for the `midashenglm` model type.

    It bundles a `DashengConfig` (audio encoder) and a
    `Qwen2_5OmniTextConfig` (text model), both built from plain dicts.

    Args:
        audio_encoder_config: Keyword arguments for `DashengConfig`. A falsy
            value (``None`` or an empty dict) yields a default
            `DashengConfig`.
        subsample_factor: Stored unchanged on the instance.
        text_config: Keyword arguments for `Qwen2_5OmniTextConfig`. A falsy
            value yields a default `Qwen2_5OmniTextConfig`.
        audio_token_id: Stored unchanged on the instance.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "midashenglm"

    def __init__(
        self,
        audio_encoder_config: Optional[dict] = None,
        subsample_factor: int = 5,
        text_config: Optional[dict] = None,
        audio_token_id: Optional[int] = None,
        **kwargs,
    ):
        encoder_kwargs = audio_encoder_config if audio_encoder_config else {}
        self.audio_encoder_config = DashengConfig(**encoder_kwargs)
        self.subsample_factor = subsample_factor

        text_kwargs = text_config if text_config else {}
        self.text_config = Qwen2_5OmniTextConfig(**text_kwargs)
        # Any rope_scaling carried by the text config is always discarded.
        self.text_config.rope_scaling = None  # uses_mrope is false
        self.audio_token_id = audio_token_id

        super().__init__(**kwargs)
|
||||
165
vllm/transformers_utils/configs/mistral.py
Normal file
165
vllm/transformers_utils/configs/mistral.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
from transformers import PretrainedConfig, WhisperConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def adapt_config_dict(config_dict: dict[str, Any],
                      **kwargs) -> PretrainedConfig:
    """Convert a Mistral-format config dict into a `PretrainedConfig`.

    The steps run in this order: merge `kwargs` into `config_dict` (this
    mutates the caller's dict), rename the general Mistral keys, remap the
    optional `quantization` section, choose the architecture name from the
    presence of `moe`, remap the optional `yarn` section, and finally remap
    the vision or audio multimodal section if one is present.

    Args:
        config_dict: Raw Mistral configuration; updated in place.
        **kwargs: Extra entries merged into `config_dict` before remapping.

    Returns:
        The `PretrainedConfig` built from the remapped dictionary.

    Raises:
        AssertionError: If both a vision and an audio section are present.
    """
    config_dict.update(kwargs)
    config_dict = _remap_general_mistral_args(config_dict)

    if config_dict.get("quantization"):
        config_dict = _remap_mistral_quantization_args(config_dict)

    architecture = ("MixtralForCausalLM"
                    if config_dict.get("moe") else "MistralForCausalLM")
    config_dict["architectures"] = [architecture]

    if config_dict.get("yarn"):
        config_dict = _remap_mistral_yarn_args(config_dict)

    # A missing or null "multimodal" entry is treated as an empty mapping.
    multimodal_args = config_dict.get("multimodal") or {}
    has_vision = (multimodal_args.get("vision_encoder_args")
                  or config_dict.get("vision_encoder"))
    whisper_args = multimodal_args.get("whisper_model_args") or {}
    has_audio = bool(whisper_args.get("encoder_args"))

    assert not (has_vision and has_audio), \
        "Vision and audio are mutually exclusive"

    if has_vision:
        config_dict = _remap_mistral_vision_args(config_dict)
    if has_audio:
        config_dict = _remap_mistral_audio_args(config_dict)

    config = PretrainedConfig.from_dict(config_dict)

    logger.debug("Initialized config %s", config)

    return config
|
||||
|
||||
|
||||
def _remap_mistral_vision_args(config: dict) -> dict:
    """Wrap a Mistral config into a Pixtral-style composite config dict.

    The vision section is removed from `config`: the whole `multimodal`
    entry when it is truthy, otherwise the `vision_encoder` entry. What
    remains of `config` becomes the text config.

    Args:
        config: Remapped Mistral config; its vision entry is popped in place.

    Returns:
        A new dict with `model_type`, `architectures`, `text_config` and
        `vision_config`, plus `quantization_config` when the input carried a
        truthy one.
    """
    vision_key = "multimodal" if config.get("multimodal") else "vision_encoder"
    vision_config = config.pop(vision_key)

    quant_config = config.get("quantization_config")

    remapped = {
        "model_type": "pixtral",
        "architectures": ["PixtralForConditionalGeneration"],
        "text_config": PretrainedConfig.from_dict(config),
        "vision_config": PretrainedConfig.from_dict(vision_config),
    }
    if quant_config:
        # Keep the quantization settings visible on the composite config.
        remapped["quantization_config"] = quant_config
    return remapped
|
||||
|
||||
|
||||
def _remap_mistral_yarn_args(config: dict) -> dict:
|
||||
# Direct remaps: yarn.X -> rope_scaling.Y
|
||||
# Source keys are from mistral.model.args.YarnArgs
|
||||
_map = {
|
||||
"beta": "beta_fast",
|
||||
"alpha": "beta_slow",
|
||||
}
|
||||
yarn_config = config.get("yarn") or {}
|
||||
renamed_yarn_config = {_map.get(k, k): v for k, v in yarn_config.items()}
|
||||
config["rope_scaling"] = {
|
||||
"rope_type": "yarn",
|
||||
"mscale_all_dim": 1, # We hardcoded this to 1
|
||||
**renamed_yarn_config
|
||||
}
|
||||
return config
|
||||
|
||||
|
||||
def _remap_general_mistral_args(config: dict) -> dict:
|
||||
# Mistral key -> HF key
|
||||
config_mapping = {
|
||||
"dim": "hidden_size",
|
||||
"norm_eps": "rms_norm_eps",
|
||||
"n_kv_heads": "num_key_value_heads",
|
||||
"n_layers": "num_hidden_layers",
|
||||
"n_heads": "num_attention_heads",
|
||||
"hidden_dim": "intermediate_size",
|
||||
}
|
||||
# HF key -> (Mistral key, default value)
|
||||
top_level_mapping_with_default = {
|
||||
"model_type": ("model_type", "transformer"),
|
||||
"hidden_act": ("activation", "silu"),
|
||||
"tie_word_embeddings": ("tied_embeddings", False),
|
||||
"max_seq_len": ("max_seq_len", 128_000),
|
||||
"max_position_embeddings": ("max_position_embeddings", 128_000),
|
||||
}
|
||||
|
||||
for key, new_key in config_mapping.items():
|
||||
if key in config:
|
||||
config[new_key] = config.pop(key)
|
||||
|
||||
for new_key, (key,
|
||||
default_value) in top_level_mapping_with_default.items():
|
||||
config[new_key] = config.pop(key, default_value)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_quantization_args(config: dict) -> dict:
|
||||
quantization = config.get("quantization", {})
|
||||
if quantization.get("qformat_weight") == "fp8_e4m3":
|
||||
# This maps to the FP8 static per-tensor quantization scheme
|
||||
quantization_config = {
|
||||
"quant_method": "fp8",
|
||||
"activation_scheme": "static"
|
||||
}
|
||||
elif quantization.get("quant_method") == "compressed-tensors":
|
||||
# Pass through the quantization config to compressed-tensors
|
||||
quantization_config = quantization
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Found unknown quantization='{quantization}' in config")
|
||||
|
||||
config["quantization_config"] = quantization_config
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _remap_mistral_audio_args(config: dict) -> dict:
    """Wrap a Mistral config into a Voxtral-style composite config dict.

    `multimodal.whisper_model_args` is popped from `config`; its
    `encoder_args` and `downsample_args` populate a `WhisperConfig`. What
    remains of `config` (including the rest of `multimodal`) becomes the
    text config.

    Args:
        config: Remapped Mistral config containing
            `multimodal.whisper_model_args`; modified in place.

    Returns:
        A new dict with `model_type`, `architectures`, `text_config` and
        `audio_config`, plus `quantization_config` when the input carried a
        truthy one.

    Raises:
        KeyError: If any required multimodal/whisper entry is missing.
    """
    whisper_args = config["multimodal"].pop("whisper_model_args")
    encoder_args = whisper_args["encoder_args"]
    downsample_args = whisper_args["downsample_args"]

    quant_config = config.get("quantization_config")

    # The text config is built before the audio config.
    text_config = PretrainedConfig.from_dict(config)

    audio_encoding_args = encoder_args["audio_encoding_args"]
    audio_config = WhisperConfig(
        num_mel_bins=audio_encoding_args["num_mel_bins"],
        window_size=audio_encoding_args["window_size"],
        sampling_rate=audio_encoding_args["sampling_rate"],
        hop_length=audio_encoding_args["hop_length"],
        downsample_factor=downsample_args["downsample_factor"],
        d_model=encoder_args["dim"],
        encoder_layers=encoder_args["n_layers"],
        encoder_ffn_dim=encoder_args["hidden_dim"],
        encoder_attention_heads=encoder_args["n_heads"],
        vocab_size=encoder_args["vocab_size"],
        max_source_positions=encoder_args["max_source_positions"],
        is_encoder_decoder=False,  # Override WhisperConfig default
    )

    remapped = {
        "model_type": "whixtral",
        "architectures": ["VoxtralForConditionalGeneration"],
        "text_config": text_config,
        "audio_config": audio_config,
    }
    if quant_config:
        remapped["quantization_config"] = quant_config
    return remapped
|
||||
68
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
68
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MLPSpeculatorConfig(PretrainedConfig):
    """Configuration for the `mlp_speculator` model type."""

    model_type = "mlp_speculator"

    # Lets `hidden_size` be used as an alias for `emb_dim` through
    # PretrainedConfig's attribute_map mechanism.
    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[list[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Build an MLPSpeculatorConfig.

        Args:
            vocab_size: int
                Size of the model vocabulary.
            emb_dim: int
                Embedding dimension of the model.
            inner_dim: int
                Inner dimension of the model. A value of 0 means "use
                emb_dim".
            n_predict: int
                Number of lookahead steps for the speculator. Also stored as
                `num_lookahead_tokens`.
            top_k_tokens_per_head: list[int]
                How many tokens to take from each head when the candidate
                tree is formed: for every candidate branch, head n adds
                topk[n] sub-branches. Defaults to [5, 4, 3] and must contain
                exactly `n_predict` entries.
                NOTE: This parameter is currently unused.
            n_candidates: int
                Number of child candidates created per sequence.
            tie_weights: bool
                When true, every head/stage after the first shares one set
                of weights. The initial projection from the base model may
                differ in size, so it stays separate.
            scale_input: bool
                When true, the initial hidden states coming from the base
                model are scaled.
            **kwargs:
                Forwarded to `PretrainedConfig.__init__`.
        """
        top_k = ([5, 4, 3]
                 if top_k_tokens_per_head is None else top_k_tokens_per_head)
        assert len(top_k) == n_predict

        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)
|
||||
33
vllm/transformers_utils/configs/moonvit.py
Normal file
33
vllm/transformers_utils/configs/moonvit.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class MoonViTConfig(PretrainedConfig):
    """Configuration for the `moonvit` model type.

    `PretrainedConfig.__init__` is invoked first with the extra keyword
    arguments; afterwards each named argument is stored unchanged under an
    attribute of the same name.
    """

    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.patch_size = patch_size

        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width

        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        # Patch merger config
        self.merge_kernel_size = merge_kernel_size
|
||||
205
vllm/transformers_utils/configs/nemotron.py
Normal file
205
vllm/transformers_utils/configs/nemotron.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Nemotron model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronConfig(PretrainedConfig):
    r"""
    Configuration class holding the settings of a [`NemotronModel`]. It is
    used to instantiate a Nemotron model according to the given arguments,
    which define the model architecture. Instantiating it with the defaults
    yields a configuration similar to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. See the [`PretrainedConfig`] documentation
    for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model, i.e. the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronModel`].
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. When not
            given, the `kv_channels` keyword argument is used if present,
            otherwise `hidden_size // num_attention_heads`.
        num_key_value_heads (`int`, *optional*):
            Number of key_value heads used to implement Grouped Query
            Attention. With `num_key_value_heads=num_attention_heads` the
            model uses Multi Head Attention (MHA); with
            `num_key_value_heads=1` it uses Multi Query Attention (MQA);
            otherwise GQA is used. When converting a multi-head checkpoint
            to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that
            group. For more details see
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). Defaults to
            `num_attention_heads` when not specified.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            When given, must be a dictionary with exactly two fields:
            `type` (`"linear"` or `"dynamic"`) and `factor` (a float
            greater than 1). Anything else raises a `ValueError`.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary
            embedding. A truthy `rope_percent` or `rope_percentage` keyword
            argument takes precedence over this value.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the
            MLP layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # A falsy head_dim falls back to the `kv_channels` kwarg; only when
        # that is missing too is the size derived from the hidden size.
        resolved_head_dim = head_dim or kwargs.get("kv_channels")
        if resolved_head_dim is None:
            resolved_head_dim = hidden_size // num_attention_heads
        self.head_dim = resolved_head_dim

        # for backward compatibility
        self.num_key_value_heads = (num_attention_heads
                                    if num_key_value_heads is None else
                                    num_key_value_heads)
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # for backward compatibility
        legacy_rotary_factor = (kwargs.get("rope_percent")
                                or kwargs.get("rope_percentage"))
        self.partial_rotary_factor = (legacy_rotary_factor
                                      or partial_rotary_factor)
        self._rope_scaling_validation()

        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Check that `rope_scaling` is either None or a well-formed dict.

        Raises:
            ValueError: If `rope_scaling` is not a two-entry dict, if its
                `type` is not 'linear' or 'dynamic', or if its `factor` is
                not a float greater than 1.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        if not (isinstance(scaling, dict) and len(scaling) == 2):
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")

        scaling_type = scaling.get("type", None)
        scaling_factor = scaling.get("factor", None)

        # A missing type is None, which is not in the allowed set either.
        if scaling_type not in ("linear", "dynamic"):
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {scaling_type}")

        # A missing factor is None and therefore fails the float check.
        if not isinstance(scaling_factor, float) or scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{scaling_factor}")
|
||||
259
vllm/transformers_utils/configs/nemotron_h.py
Normal file
259
vllm/transformers_utils/configs/nemotron_h.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""NemotronH model configuration"""
|
||||
|
||||
import regex as re
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronHConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronHModel`]. It is used to instantiate a NemotronH model according
|
||||
to the specified arguments, defining the model architecture. Instantiating
|
||||
a configuration with the defaults will yield a similar configuration to
|
||||
that of the NemotronH-v0.1 model.
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 131072):
|
||||
Vocabulary size of the NemotronH model. Defines the number of
|
||||
different tokens that can be represented by the `inputs_ids`
|
||||
passed when calling [`NemotronHModel`]
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be
|
||||
tied. Note that this is only relevant if the model has an output
|
||||
word embedding layer.
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 21504):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 52):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
hybrid_override_pattern (`str`, *optional*, defaults to
|
||||
`"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
|
||||
The pattern of the hybrid model. The pattern is a string of
|
||||
characters where each character represents
|
||||
M: Mamba2, *: Attention, -: MLP
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer encoder.
|
||||
attention_head_dim (`int`, *optional*, defaults to 128):
|
||||
Dimension of each attention head.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if `num_key_value_heads=1` the model
|
||||
will use Multi Query Attention (MQA) otherwise GQA is used.
|
||||
mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
|
||||
The non-linear activation function in the MLP layers.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in attention layers.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in MLP layers.
|
||||
use_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the model.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
residual_in_fp32 (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not residuals should be in `float32`. If set to `False`
|
||||
residuals will keep the same `dtype` as the rest of the model.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
|
||||
Number of prompt logits to calculate during generation. If `None`,
|
||||
all logits will be calculated. If an integer value, only last
|
||||
`num_logits_to_keep` logits will be calculated.
|
||||
pad_token_id (`int`, *optional*, defaults to 0):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
sliding_window (`int`, *optional*, defaults to None):
|
||||
Sliding window attention window size.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
hidden_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the hidden states.
|
||||
use_mamba_kernels (`bool`, *optional*, defaults to `True`):
|
||||
Flag indicating whether or not to use the fast mamba kernels.
|
||||
These are available only if `mamba-ssm` and `causal-conv1d`
|
||||
are installed, and the mamba modules are running on a CUDA device.
|
||||
ssm_state_size (`int`, *optional*, defaults to 128):
|
||||
The dimension of the mamba state space latents.
|
||||
mamba_num_heads (`int`, *optional*, defaults to 128):
|
||||
Number of heads in Mamba layers.
|
||||
mamba_n_groups (`int`, *optional*, defaults to 8):
|
||||
Number of groups in Mamba layers.
|
||||
mamba_head_dim (`int`, *optional*, defaults to 64):
|
||||
Dimension of each Mamba head.
|
||||
mamba_d_conv (`int`, *optional*, defaults to 4):
|
||||
The size of the mamba convolution kernel.
|
||||
mamba_expand (`int`, *optional*, defaults to 2):
|
||||
Expanding factor used to determine the mamba intermediate size.
|
||||
mamba_hidden_act (`str`, *optional*, defaults to "silu"):
|
||||
The non-linear activation function in the Mamba layers.
|
||||
mamba_dt_min (`float`, *optional*, defaults to 0.001):
|
||||
Minimum value for the time step in Mamba.
|
||||
mamba_dt_max (`float`, *optional*, defaults to 0.1):
|
||||
Maximum value for the time step in Mamba.
|
||||
mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
|
||||
Limits for the time step in Mamba.
|
||||
mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
|
||||
Floor value for time step initialization in Mamba.
|
||||
mamba_conv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use bias in the convolution layer of the mamba mixer
|
||||
block.
|
||||
mamba_proj_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use bias in the input and output projections of the
|
||||
mamba mixer block.
|
||||
mamba_chunk_size (`int`, *optional*, defaults to 256):
|
||||
Size of chunks for Mamba processing.
|
||||
rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the pre-normalization residual connections.
|
||||
"""
|
||||
|
||||
model_type = "nemotron_h"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=131072,
|
||||
tie_word_embeddings=False,
|
||||
hidden_size=4096,
|
||||
intermediate_size=21504,
|
||||
num_hidden_layers=52,
|
||||
hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
|
||||
num_attention_heads=32,
|
||||
head_dim=128,
|
||||
num_key_value_heads=8, # nemo: num_query_groups
|
||||
mlp_hidden_act="relu2",
|
||||
attention_bias=False,
|
||||
mlp_bias=False,
|
||||
use_bias=False,
|
||||
initializer_range=0.02, # nemo: init_method_std
|
||||
layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
|
||||
residual_in_fp32=False, # Megatron Core default value
|
||||
use_cache=True,
|
||||
num_logits_to_keep=1,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
sliding_window=None,
|
||||
max_position_embeddings=4096,
|
||||
attention_dropout=0.0,
|
||||
hidden_dropout=0.0, # * ADDED
|
||||
use_mamba_kernels=True,
|
||||
ssm_state_size=128, # mamba_state_size
|
||||
mamba_num_heads=128,
|
||||
mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads
|
||||
mamba_head_dim=64,
|
||||
mamba_d_conv=4,
|
||||
mamba_expand=2,
|
||||
mamba_hidden_act="silu",
|
||||
mamba_dt_min=0.001,
|
||||
mamba_dt_max=0.1,
|
||||
mamba_dt_limit=(0.0, float("inf")),
|
||||
mamba_dt_init_floor=1e-4,
|
||||
mamba_conv_bias=True,
|
||||
mamba_proj_bias=False,
|
||||
mamba_chunk_size=256,
|
||||
rescale_prenorm_residual=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.tie_word_embeddings = tie_word_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.hybrid_override_pattern = hybrid_override_pattern
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.head_dim = head_dim
|
||||
self.sliding_window = sliding_window
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.attention_dropout = attention_dropout
|
||||
self.hidden_dropout = hidden_dropout
|
||||
|
||||
# Validate hybrid_override_pattern
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
|
||||
"hybrid_override_pattern must have same length as "
|
||||
"num_hidden_layers")
|
||||
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
|
||||
"hybrid_override_pattern must only contain characters "
|
||||
"'M', '*', or '-'")
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.mlp_hidden_act = mlp_hidden_act
|
||||
self.attention_bias = attention_bias
|
||||
self.mlp_bias = mlp_bias
|
||||
self.use_bias = use_bias
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.residual_in_fp32 = residual_in_fp32
|
||||
|
||||
self.use_cache = use_cache
|
||||
self.num_logits_to_keep = num_logits_to_keep
|
||||
|
||||
self.use_mamba_kernels = use_mamba_kernels
|
||||
self.n_groups = mamba_n_groups
|
||||
self.mamba_head_dim = mamba_head_dim
|
||||
self.ssm_state_size = ssm_state_size
|
||||
self.mamba_num_heads = mamba_num_heads
|
||||
self.conv_kernel = mamba_d_conv
|
||||
self.expand = mamba_expand
|
||||
self.mamba_hidden_act = mamba_hidden_act
|
||||
self.time_step_min = mamba_dt_min
|
||||
self.time_step_max = mamba_dt_max
|
||||
self.time_step_limit = mamba_dt_limit
|
||||
self.time_step_floor = mamba_dt_init_floor
|
||||
self.use_conv_bias = mamba_conv_bias
|
||||
self.mamba_proj_bias = mamba_proj_bias
|
||||
self.chunk_size = mamba_chunk_size
|
||||
self.rescale_prenorm_residual = rescale_prenorm_residual
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
def layers_block_type(self):
    """Expand ``hybrid_override_pattern`` into one block name per layer.

    Returns:
        A list with ``num_hidden_layers`` entries. The character ``"M"``
        maps to ``"mamba"``, ``"*"`` maps to ``"attention"`` and every
        other character maps to ``"mlp"``.
    """
    block_types = []
    for layer_idx in range(self.num_hidden_layers):
        symbol = self.hybrid_override_pattern[layer_idx]
        if symbol == "M":
            block_types.append("mamba")
        elif symbol == "*":
            block_types.append("attention")
        else:
            block_types.append("mlp")
    return block_types
|
||||
56
vllm/transformers_utils/configs/nemotron_vl.py
Normal file
56
vllm/transformers_utils/configs/nemotron_vl.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# Adapted from
|
||||
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
|
||||
# --------------------------------------------------------
|
||||
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
|
||||
# LICENSE is in incl_licenses directory.
|
||||
# --------------------------------------------------------
|
||||
|
||||
from transformers import LlamaConfig
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
||||
|
||||
|
||||
class Nemotron_Nano_VL_Config(PretrainedConfig):
    """Composite configuration pairing a vision config with a Llama text config.

    Args:
        vision_config: Optional mapping describing the vision model. When
            given, it must contain ``auto_map["AutoConfig"]``; that
            reference is resolved through ``get_class_from_dynamic_module``
            and the resulting class is instantiated with the whole mapping.
            When omitted, a bare ``PretrainedConfig`` is stored instead.
        llm_config: Optional mapping of ``LlamaConfig`` keyword arguments.
            A default ``LlamaConfig`` is built when omitted. The result is
            stored as ``text_config``.
        force_image_size: Stored unchanged on the instance.
        downsample_ratio: Stored unchanged on the instance.
        template: Stored unchanged on the instance.
        ps_version: Pixel shuffle version; stored unchanged.
        image_tag_type: Stored unchanged on the instance.
        projector_hidden_size: Stored unchanged on the instance.
        vit_hidden_size: Stored unchanged on the instance.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        # --- vision sub-config ---
        if vision_config is None:
            self.vision_config = PretrainedConfig()
        else:
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            auto_config_ref = vision_config["auto_map"]["AutoConfig"]
            # The reference is split on "--" and reversed before being passed
            # positionally (presumably "<repo>--<module>.<Class>" -- confirm
            # against the checkpoints this is used with).
            loader_args = auto_config_ref.split("--")[::-1]
            vision_config_cls = get_class_from_dynamic_module(*loader_args)
            self.vision_config = vision_config_cls(**vision_config)

        # --- text sub-config ---
        if llm_config is not None:
            self.text_config = LlamaConfig(**llm_config)
        else:
            self.text_config = LlamaConfig()

        # --- plain values ---
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
|
||||
80
vllm/transformers_utils/configs/olmo3.py
Normal file
80
vllm/transformers_utils/configs/olmo3.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Olmo3Config(PretrainedConfig):
    """Configuration for Olmo3 checkpoints as consumed by vLLM.

    The ``architectures`` entry is rewritten so that ``Olmo3ForCausalLM``
    becomes ``Olmo2ForCausalLM``, which is the implementation vLLM uses for
    this model.

    Args:
        vocab_size, hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, hidden_act, max_position_embeddings,
        initializer_range, use_cache, rope_theta, rope_scaling,
        attention_bias, attention_dropout, rms_norm_eps, sliding_window:
            Stored unchanged as attributes of the same name.
        num_key_value_heads: Falls back to ``num_attention_heads`` when
            ``None`` (kept for backward compatibility).
        pad_token_id, bos_token_id, eos_token_id, tie_word_embeddings:
            Forwarded to ``PretrainedConfig.__init__``.
        layer_types: Per-layer attention type. When ``None``, every fourth
            layer (1-based) is ``"full_attention"`` and all others are
            ``"sliding_attention"``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``;
            ``architectures`` is remapped as described above.
    """

    model_type = "olmo3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=None,
        eos_token_id=50279,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        rms_norm_eps=1e-5,
        sliding_window=4096,
        layer_types=None,
        **kwargs,
    ):
        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
        # in vLLM.
        architectures = kwargs.get("architectures")
        if architectures is None:
            # Covers both a missing key and an explicit ``None``.
            kwargs["architectures"] = ["Olmo2ForCausalLM"]
        elif "Olmo3ForCausalLM" in architectures:
            # Remap on a copy so the caller's list is never mutated.
            remapped = list(architectures)
            remapped.remove("Olmo3ForCausalLM")
            remapped.append("Olmo2ForCausalLM")
            kwargs["architectures"] = remapped

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.rms_norm_eps = rms_norm_eps

        self.sliding_window = sliding_window
        self.layer_types = layer_types
        if self.layer_types is None:
            # Default pattern: three sliding-attention layers followed by one
            # full-attention layer, repeated.
            self.layer_types = [
                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
|
||||
176
vllm/transformers_utils/configs/ovis.py
Normal file
176
vllm/transformers_utils/configs/ovis.py
Normal file
@@ -0,0 +1,176 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
|
||||
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
|
||||
# Ovis Config with AimV2 config registration removed for Transformers compatibility
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
|
||||
class AIMv2Config(PretrainedConfig):
    """Configuration container for an [`AIMv2Model`].

    The defaults give a configuration similar to
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the SwiGLU representations.
        num_hidden_layers: Number of hidden layers in the Transformer.
        num_attention_heads: Number of attention heads per attention layer.
        num_channels: Number of input channels.
        image_size: Image size.
        patch_size: Patch size.
        rms_norm_eps: Epsilon used by the RMS normalization layer.
        attention_dropout: Dropout ratio for attention probabilities.
        projection_dropout: Dropout ratio for the projection layer that
            follows the attention.
        qkv_bias: Whether queries, keys and values get a bias.
        use_bias: Whether the feed-forward and projection layers get a bias.
        kwargs: Keyword arguments for the [`PretrainedConfig`].
    """

    model_type: str = "aimv2"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        projection_dropout: float = 0.0,
        qkv_bias: bool = False,
        use_bias: bool = False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Regularization and normalization.
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.projection_dropout = projection_dropout

        # Bias switches.
        self.qkv_bias = qkv_bias
        self.use_bias = use_bias
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Visual Tokenizer Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class BaseVisualTokenizerConfig(PretrainedConfig):
    """Shared configuration for the visual tokenizer configs in this module.

    Args:
        vocab_size: Stored unchanged on the instance.
        tokenize_function: Stored unchanged on the instance.
        tau: Stored unchanged on the instance.
        depths: Either a sequence or a ``"|"``-separated string of
            integers; a string is parsed into a list of ``int``.
        drop_cls_token: Stored unchanged on the instance.
        backbone_config: ``None``, a ``PretrainedConfig`` (stored as is), or
            a dict. A dict must contain ``"model_type"``; ``"aimv2"`` is
            built with ``AIMv2Config`` and any other type is resolved
            through ``AutoConfig.for_model``. The dict passed in is not
            modified.
        hidden_stride: Stored unchanged on the instance.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        AssertionError: If ``backbone_config`` is neither ``None``, a
            ``PretrainedConfig`` nor a dict.
        KeyError: If a dict ``backbone_config`` has no ``"model_type"``.
    """

    def __init__(self,
                 vocab_size=16384,
                 tokenize_function="softmax",
                 tau=1.0,
                 depths=None,
                 drop_cls_token=False,
                 backbone_config: Optional[Union[PretrainedConfig,
                                                 dict]] = None,
                 hidden_stride: int = 1,
                 **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.tokenize_function = tokenize_function
        self.tau = tau
        if isinstance(depths, str):
            depths = [int(x) for x in depths.split('|')]
        self.depths = depths
        # Starts empty here; the subclasses in this module add entries to it.
        self.backbone_kwargs = dict[str, Any]()
        self.drop_cls_token = drop_cls_token
        if backbone_config is not None:
            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
            if not isinstance(backbone_config, PretrainedConfig):
                model_type = backbone_config['model_type']
                if model_type != "aimv2":
                    # Drop ``model_type`` from a copy so the caller's dict
                    # is left untouched.
                    backbone_params = dict(backbone_config)
                    backbone_params.pop('model_type')
                    backbone_config = AutoConfig.for_model(
                        model_type, **backbone_params)
                else:
                    # AIMv2 is built directly rather than via AutoConfig.
                    backbone_config = AIMv2Config(**backbone_config)
        self.backbone_config = backbone_config
        self.hidden_stride = hidden_stride
|
||||
|
||||
|
||||
class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Visual tokenizer configuration registered as ``aimv2_visual_tokenizer``.

    After the base initialization, a truthy ``drop_cls_token`` is reset to
    ``False``, and a non-empty ``depths`` (which must hold exactly one
    entry) is copied into ``backbone_kwargs['num_hidden_layers']``.
    """

    model_type = "aimv2_visual_tokenizer"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.drop_cls_token:
            self.drop_cls_token = False

        depths = self.depths
        if not depths:
            return
        assert len(depths) == 1
        self.backbone_kwargs['num_hidden_layers'] = depths[0]
|
||||
|
||||
|
||||
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Visual tokenizer configuration registered as ``siglip_visual_tokenizer``.

    After the base initialization, a truthy ``drop_cls_token`` is reset to
    ``False``, and a non-empty ``depths`` (which must hold exactly one
    entry) is copied into ``backbone_kwargs['num_hidden_layers']``.
    """

    model_type = "siglip_visual_tokenizer"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.drop_cls_token:
            self.drop_cls_token = False

        depths = self.depths
        if not depths:
            return
        assert len(depths) == 1
        self.backbone_kwargs['num_hidden_layers'] = depths[0]
|
||||
|
||||
|
||||
# Register both visual tokenizer configs with AutoConfig at import time so
# that AutoConfig.for_model can resolve these model types (OvisConfig uses
# AutoConfig.for_model for a dict-valued visual_tokenizer_config).
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Ovis Configuration
|
||||
# ----------------------------------------------------------------------
|
||||
class OvisConfig(PretrainedConfig):
    """Top-level Ovis configuration combining an LLM and a visual tokenizer.

    Args:
        llm_config: ``None``, a ``PretrainedConfig``, or a dict containing
            ``"model_type"`` that is resolved via ``AutoConfig.for_model``.
            The result is stored as ``text_config``.
        visual_tokenizer_config: Same accepted forms as ``llm_config``;
            stored as ``visual_tokenizer_config``.
        multimodal_max_length: Stored unchanged on the instance.
        hidden_size: Stored unchanged on the instance.
        conversation_formatter_class: Stored unchanged on the instance.
        llm_attn_implementation: Stored unchanged on the instance.
        disable_tie_weight: Stored unchanged on the instance.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        AssertionError: If a sub-config is neither ``None``, a
            ``PretrainedConfig`` nor a dict.
        KeyError: If a dict sub-config has no ``"model_type"`` entry.
    """

    model_type = "ovis"

    def __init__(self,
                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
                                                         dict]] = None,
                 multimodal_max_length=8192,
                 hidden_size=None,
                 conversation_formatter_class=None,
                 llm_attn_implementation=None,
                 disable_tie_weight=False,
                 **kwargs):
        super().__init__(**kwargs)

        # map llm_config to text_config
        self.text_config = self._resolve_sub_config(llm_config, "llm_config")
        self.visual_tokenizer_config = self._resolve_sub_config(
            visual_tokenizer_config, "visual_tokenizer_config")

        self.multimodal_max_length = multimodal_max_length
        self.hidden_size = hidden_size
        self.conversation_formatter_class = conversation_formatter_class
        self.llm_attn_implementation = llm_attn_implementation
        self.disable_tie_weight = disable_tie_weight

    @staticmethod
    def _resolve_sub_config(sub_config, arg_name):
        """Return ``sub_config`` as a config object, or ``None`` if absent.

        A dict is converted through ``AutoConfig.for_model`` using its
        ``"model_type"`` entry. The conversion works on a shallow copy, so
        the dict supplied by the caller is never modified.
        """
        if sub_config is None:
            return None
        assert isinstance(sub_config, (PretrainedConfig, dict)), \
            f"expect `{arg_name}` to be instance of PretrainedConfig or dict, but got {type(sub_config)} type"
        if isinstance(sub_config, PretrainedConfig):
            return sub_config
        params = dict(sub_config)
        model_type = params.pop('model_type')
        return AutoConfig.for_model(model_type, **params)
|
||||
275
vllm/transformers_utils/configs/qwen3_next.py
Normal file
275
vllm/transformers_utils/configs/qwen3_next.py
Normal file
@@ -0,0 +1,275 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Qwen3-Next model configuration"""
|
||||
|
||||
from transformers.configuration_utils import (PretrainedConfig,
|
||||
layer_type_validation)
|
||||
from transformers.modeling_rope_utils import rope_config_validation
|
||||
from transformers.utils import logging
|
||||
|
||||
# Module-level logger obtained from the transformers logging utilities; it is
# not referenced anywhere else in this module.
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Qwen3NextConfig(PretrainedConfig):
    """Configuration of a [`Qwen3NextModel`].

    The defaults give a configuration similar to
    [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
    Configuration objects inherit from [`PretrainedConfig`]; see its
    documentation for the options shared by all configurations.

    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size, i.e. the number of different tokens that
            `inputs_ids` can represent.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads per attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            Number of key/value heads for Grouped Query Attention. Equal to
            `num_attention_heads` means Multi Head Attention, `1` means
            Multi Query Attention, anything else is GQA. When converting a
            multi-head checkpoint to GQA, each group's key and value head
            should be built by mean-pooling the original heads of that
            group; see [this paper](https://arxiv.org/pdf/2305.13245.pdf).
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            Maximum sequence length the model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer used for
            all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions
            (not used by all models). Only relevant if
            `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether input and output word embeddings are tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Scaling configuration for the RoPE embeddings. If a new rope
            type is applied to reach longer sequences, update
            `max_position_embeddings` accordingly. Expected contents:
                `rope_type` (`str`):
                    One of 'default', 'linear', 'dynamic', 'yarn',
                    'longrope', 'llama3'; 'default' is the original RoPE.
                `factor` (`float`, *optional*):
                    Used by every type except 'default'. In most types a
                    factor of x lets the model handle sequences of length
                    x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The max
                    position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. Scaling applied on the
                    attention computation; when unspecified, the
                    implementation infers it from `factor`.
                `beta_fast` (`float`, *optional*):
                    'yarn' only. Boundary for extrapolation in the linear
                    ramp function; defaults to 32 when unspecified.
                `beta_slow` (`float`, *optional*):
                    'yarn' only. Boundary for interpolation in the linear
                    ramp function; defaults to 1 when unspecified.
                `short_factor` (`List[float]`, *optional*):
                    'longrope' only. Scaling factors for short contexts;
                    length must equal hidden size / attention heads / 2.
                `long_factor` (`List[float]`, *optional*):
                    'longrope' only. Scaling factors for long contexts;
                    length must equal hidden size / attention heads / 2.
                `low_freq_factor` (`float`, *optional*):
                    'llama3' only. Scaling applied to the low frequency
                    components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    'llama3' only. Scaling applied to the high frequency
                    components of the RoPE.
        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
            Share of the query and keys that receive rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether the query, key, value and output projections of
            self-attention use a bias.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads in linear attention layers.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            Frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 10):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 512):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the top-k probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether the model returns the router logits. Enabling this also
            lets the model output the auxiliary loss, including the load
            balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            Aux loss factor for the total loss.
        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
            Indices (0 to num_layers-1) of the layers that use Qwen3NextMLP
            instead of Qwen3NextSparseMoeBlock. When empty,
            `decoder_sparse_step` determines the sparsity.
        layer_types (`list[str]`, *optional*):
            Type of each layer (attention or linear). When omitted, every
            fourth layer is `"full_attention"` and the rest are
            `"linear_attention"`.

    ```python
    >>> from transformers import Qwen3NextModel, Qwen3NextConfig

    >>> # Initializing a Qwen3Next style configuration
    >>> configuration = Qwen3NextConfig()

    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
    >>> model = Qwen3NextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "qwen3_next"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Sharding style per parameter-name pattern.
    base_model_tp_plan = {
        # self-attention projections
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        # routed experts
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        # shared experts
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
        # dense MLP
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Per sub-module: (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        decoder_sparse_step=1,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=10,
        num_experts=512,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        layer_types=None,
        **kwargs,
    ):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # Core transformer settings.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Rotary embedding and attention settings; validated once all the
        # attributes above are in place.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        rope_config_validation(self)

        # Default layout: every fourth layer uses full attention, all other
        # layers use linear attention.
        if layer_types is None:
            layer_types = [
                "full_attention" if (idx + 1) % 4 == 0 else "linear_attention"
                for idx in range(self.num_hidden_layers)
            ]
        self.layer_types = layer_types
        layer_type_validation(self.layer_types)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        # A fresh list per instance avoids a shared mutable default.
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = ["Qwen3NextConfig"]
|
||||
91
vllm/transformers_utils/configs/radio.py
Normal file
91
vllm/transformers_utils/configs/radio.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Radio vision model configuration"""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)

# Architecture dimensions keyed by vision transformer model name. RadioConfig
# unpacks each value as
# (hidden_size, num_hidden_layers, num_attention_heads, intermediate_size).
VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
    "vit_small_patch16_224": (384, 12, 6, 1536),
    "vit_base_patch16_224": (768, 12, 12, 3072),
    "vit_large_patch16_224": (1024, 24, 16, 4096),
    "vit_huge_patch16_224": (1280, 32, 16, 5120),
}

# Default values for RadioConfig's ``norm_mean`` and ``norm_std`` arguments
# (one entry per RGB channel).
OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
|
||||
|
||||
|
||||
class RadioConfig(PretrainedConfig):
    r"""
    Configuration of a Radio vision model. It is used to instantiate a Radio
    model according to the specified arguments, defining the model
    architecture.

    Args:
        model_name: Name of the vision transformer model
            (e.g., "vit_base_patch16_224"). It must be a key of
            `VIT_TIMM_DIM_BY_NAME`, which supplies `hidden_size`,
            `num_hidden_layers`, `num_attention_heads` and
            `intermediate_size`.
        image_size: The size (resolution) of each image.
        patch_size: The size (resolution) of each patch.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        qk_normalization: Whether to apply normalization to queries and keys.
        norm_type: The normalization type to use.
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
        max_img_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073). Tuples and
            lists are stored as a new list.
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711).
            Tuples and lists are stored as a new list.
        reg_tokens: Number of register tokens to use.

    Raises:
        KeyError: If `model_name` is not a key of `VIT_TIMM_DIM_BY_NAME`.

    NOTE(review): `model_name` has no default, so this class cannot be
    instantiated without arguments. Confirm whether any PretrainedConfig
    code path used with this class (e.g. diff-based serialization) needs a
    no-argument constructor.
    """

    model_type = "radio"

    def __init__(
        self,
        model_name: str,
        image_size: int = 224,
        patch_size: int = 16,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        norm_type: str = "layer_norm",
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
        max_img_size: int = 2048,
        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
        reg_tokens: Optional[int] = None,
        **kwargs,
    ):
        self.model_name = model_name

        # Architecture dimensions come from the lookup table, not from
        # individual arguments.
        width, depth, heads, mlp_width = VIT_TIMM_DIM_BY_NAME[model_name]
        self.hidden_size = width
        self.num_hidden_layers = depth
        self.num_attention_heads = heads
        self.intermediate_size = mlp_width

        self.image_size = image_size
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
        self.max_img_size = max_img_size

        # Tuples and lists are copied into a fresh list; anything else is
        # stored as given.
        if isinstance(norm_mean, (tuple, list)):
            norm_mean = list(norm_mean)
        self.norm_mean = norm_mean
        if isinstance(norm_std, (tuple, list)):
            norm_std = list(norm_std)
        self.norm_std = norm_std

        self.reg_tokens = reg_tokens
        super().__init__(**kwargs)
|
||||
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
2
vllm/transformers_utils/configs/speculators/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
32
vllm/transformers_utils/configs/speculators/algos.py
Normal file
32
vllm/transformers_utils/configs/speculators/algos.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Registry mapping a speculator algorithm name to the function registered
# for it through ``register_speculator``.
SUPPORTED_SPECULATORS_TYPES = {}


def register_speculator(name):
    """Return a decorator that registers a function under ``name``.

    The decorated function is stored in ``SUPPORTED_SPECULATORS_TYPES`` and
    returned unchanged. Registering the same name again replaces the
    previous entry.
    """

    def _register(fn):
        SUPPORTED_SPECULATORS_TYPES.update({name: fn})
        return fn

    return _register
|
||||
|
||||
|
||||
@register_speculator("eagle3")
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
    """
    Write the Eagle-3 specific settings into ``vllm_config`` in place.

    Values are read from ``config_dict``:
        - draft_vocab_size: copied as-is (``None`` when the key is absent)
        - target_hidden_size: copied only when present and not ``None``
        - norm_before_residual: copied, defaulting to ``True`` when absent

    ``architectures`` is always set to ``["Eagle3LlamaForCausalLM"]``.

    Args:
        config_dict: Configuration dictionary in speculators format.
        vllm_config: Dictionary being built for vLLM; mutated by this call.
    """
    vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")

    # Leave the key out entirely when the source config does not set it.
    target_hidden_size = config_dict.get("target_hidden_size")
    if target_hidden_size is not None:
        vllm_config["target_hidden_size"] = target_hidden_size

    norm_before_residual = config_dict.get("norm_before_residual", True)
    vllm_config["norm_before_residual"] = norm_before_residual
    vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
|
||||
111
vllm/transformers_utils/configs/speculators/base.py
Normal file
111
vllm/transformers_utils/configs/speculators/base.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Any, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.speculators.algos import (
|
||||
SUPPORTED_SPECULATORS_TYPES)
|
||||
|
||||
__all__ = ["SpeculatorsConfig"]
|
||||
|
||||
|
||||
class SpeculatorsConfig(PretrainedConfig):
    """Config built from a speculators-format checkpoint.

    The speculators config dictionary is validated, flattened into the
    keyword arguments produced by `build_vllm_speculative_config`, adjusted
    by the algorithm-specific updater registered in
    ``SUPPORTED_SPECULATORS_TYPES``, and then passed to the constructor.
    """

    model_type = "speculators"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "SpeculatorsConfig":
        """Load speculators Eagle config and convert to vLLM format."""
        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
                                             **kwargs)

        vllm_config = cls.extract_vllm_speculative_config(config_dict)
        return cls(**vllm_config)

    @classmethod
    def extract_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        """Validate ``config_dict`` and convert it to a vLLM config dict.

        Args:
            config_dict: Configuration dictionary in speculators format.

        Returns:
            Dictionary with vLLM-compatible speculative configuration,
            including the algorithm-specific fields.

        Raises:
            ValueError: If ``speculators_model_type`` is not a registered
                type, or if the config structure is invalid.
            TypeError: If ``transformer_layer_config`` is not a dictionary.
        """
        speculators_model_type = config_dict.get("speculators_model_type")
        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
            # Report only the registered names; interpolating the registry
            # itself would dump function reprs into the message.
            supported = sorted(SUPPORTED_SPECULATORS_TYPES)
            raise ValueError(
                "Unsupported speculators_model_type: "
                f"{speculators_model_type!r}. "
                f"Expected one of: {supported}. "
                "Please ensure you're loading a speculators-format model.")

        # validate fields
        # TODO: @dsikka - use speculators pydantic model to validate
        cls.validate_speculators_config(config_dict=config_dict)
        # Convert from speculators config -> format that can be ingested by vLLM
        vllm_config = cls.build_vllm_speculative_config(
            config_dict=config_dict)
        # Apply anything specific to the supported algorithm
        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
        return vllm_config

    @classmethod
    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
        """Check that the fields read during conversion are present.

        Raises:
            ValueError: If any of the required nested keys cannot be read, or
                if ``transformer_layer_config`` is missing.
            TypeError: If ``transformer_layer_config`` is not a dictionary.
        """
        try:
            # Each lookup below only probes that the key/index exists.
            spec_config = config_dict["speculators_config"]
            methods = spec_config["proposal_methods"]
            first_method = methods[0]
            _ = first_method["speculative_tokens"]
            _ = spec_config["verifier"]["name_or_path"]
            _ = config_dict["speculators_model_type"]
        except (KeyError, IndexError, TypeError) as e:
            raise ValueError("Invalid speculators config structure") from e

        if "transformer_layer_config" not in config_dict:
            raise ValueError("Must provide transformer_layer_config")

        if not isinstance(config_dict["transformer_layer_config"], dict):
            raise TypeError(
                "'transformer_layer_config' must be a dictionary if provided")

    @classmethod
    def build_vllm_speculative_config(
            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
        """
        Build vLLM-compatible speculative configuration from speculators format.

        This method extracts and transforms speculative configuration from the
        speculators format into the structure expected by vLLM.

        Args:
            config_dict: Configuration dictionary in speculators format

        Returns:
            Dictionary with vLLM-compatible speculative configuration

        Raises:
            ValueError: If there are no proposal methods, or the first one
                has no ``speculative_tokens`` value.
        """
        # Extract speculators configuration
        spec_config = config_dict["speculators_config"]

        # Currently we only support one proposal method
        proposal_methods = spec_config.get("proposal_methods")
        if not proposal_methods:
            raise ValueError("No proposal methods found in speculators config")

        first_method = proposal_methods[0]
        num_speculative_tokens = first_method.get("speculative_tokens")

        if num_speculative_tokens is None:
            raise ValueError(
                "Missing 'speculative_tokens' in proposal method. "
                f"Got: {first_method}")

        # Build base vLLM speculative configuration
        vllm_config = {
            "method": config_dict.get("speculators_model_type"),
            "num_speculative_tokens": num_speculative_tokens,
            "target_model": spec_config.get("verifier")["name_or_path"]
        }

        # Merge transformer layer configuration if present; its keys override
        # the base keys above on collision.
        transformer_config = config_dict.get("transformer_layer_config", {})
        vllm_config.update(transformer_config)

        return vllm_config
|
||||
123
vllm/transformers_utils/configs/step3_vl.py
Normal file
123
vllm/transformers_utils/configs/step3_vl.py
Normal file
@@ -0,0 +1,123 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Step3VisionEncoderConfig(PretrainedConfig):
    """Configuration for the Step3 vision encoder.

    Each constructor argument is stored on the instance under the same name.
    Any remaining keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "step3_vision_encoder"

    def __init__(
        self,
        hidden_size: int = 1792,
        intermediate_size: int = 3072,
        output_hidden_size: int = 4096,
        num_hidden_layers: int = 63,
        num_attention_heads: int = 16,
        num_channels: int = 3,
        image_size: int = 728,
        patch_size: int = 14,
        hidden_act: str = "quick_gelu",
        layer_norm_eps: float = 1e-5,
        **kwargs,
    ):
        # Layer sizes and counts.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.output_hidden_size = output_hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input image description.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Normalization epsilon and activation name.
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3TextConfig(PretrainedConfig):
    """Configuration for the Step3 text model.

    Each constructor argument is stored on the instance under the same name.
    Any remaining keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "step3_text"
    architectures = ["Step3TextForCausalLM"]

    def __init__(
        self,
        hidden_size: int = 7168,
        intermediate_size: int = 18432,
        num_attention_heads: int = 64,
        num_attention_groups: int = 1,
        num_hidden_layers: int = 61,
        max_seq_len: int = 65536,
        vocab_size: int = 128815,
        rms_norm_eps: float = 1e-5,
        moe_intermediate_size: int = 5120,
        moe_num_experts: int = 48,
        moe_top_k: int = 3,
        rope_theta: float = 500000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embedding: int = 65536,
        share_expert_dim: int = 5120,
        share_q_dim: int = 2048,
        head_dim: int = 256,
        norm_expert_weight: bool = False,
        # Default is the consecutive layer indices 4 through 59 inclusive.
        moe_layers_enum: tuple[int, ...] = tuple(range(4, 60)),
        **kwargs,
    ) -> None:
        # Dense transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps

        # Mixture-of-experts settings.
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k

        # Rotary position embedding settings.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.max_position_embedding = max_position_embedding

        # Remaining model-specific dimensions and flags.
        self.share_expert_dim = share_expert_dim
        self.share_q_dim = share_q_dim
        self.head_dim = head_dim
        self.norm_expert_weight = norm_expert_weight
        self.moe_layers_enum = moe_layers_enum

        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class Step3VLConfig(PretrainedConfig):
    """Top-level Step3 vision-language configuration.

    Holds a ``Step3VisionEncoderConfig`` and a ``Step3TextConfig``. Each
    sub-config may be given as an instance, as a dict of constructor keyword
    arguments, or omitted to use that class's defaults.
    """

    model_type = "step3_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Step3TextConfig]] = None,
        understand_projector_stride: int = 1,
        projector_bias: bool = True,
        image_token_id: int = 128001,
        **kwargs,
    ) -> None:
        # Normalize the vision sub-config to a config object.
        if isinstance(vision_config, dict):
            vision_config = Step3VisionEncoderConfig(**vision_config)
        elif vision_config is None:
            vision_config = Step3VisionEncoderConfig()
        self.vision_config = vision_config

        # Normalize the text sub-config to a config object.
        if isinstance(text_config, dict):
            text_config = Step3TextConfig(**text_config)
        elif text_config is None:
            text_config = Step3TextConfig()
        self.text_config = text_config

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        # The top-level hidden size mirrors the text model's.
        self.hidden_size = text_config.hidden_size
        self.image_token_id = image_token_id

        super().__init__(**kwargs)
|
||||
116
vllm/transformers_utils/configs/ultravox.py
Normal file
116
vllm/transformers_utils/configs/ultravox.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
||||
from typing import Any, Optional
|
||||
|
||||
import transformers
|
||||
|
||||
|
||||
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        hidden_size (`int`, *optional*, defaults to 4096):
            Stored on the config as `hidden_size`.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    """
    wrapped_model_config: transformers.PretrainedConfig
    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[dict[str, Any]] = None,
        text_config: Optional[dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        projector_ln_mid: bool = False,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid

        # N.B. A non-None text_model_id makes __setattr__ load
        # wrapped_model_config from that model ID; otherwise it is built
        # here from the text_config dict (model type defaults to "llama").
        self.text_model_id = text_model_id
        if text_model_id is None:
            text_config = text_config or {}
            self.wrapped_model_config = transformers.CONFIG_MAPPING[
                text_config.get("model_type", "llama")](**text_config)

        # N.B. A non-None audio_model_id makes __setattr__ load audio_config
        # from that model ID; otherwise it is built here from the
        # audio_config dict (model type defaults to "whisper").
        self.audio_model_id = audio_model_id
        if audio_model_id is None:
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        super().__init__(**kwargs)

    def __setattr__(self, key, value):
        # Since --hf-overrides are applied _after_ the UltravoxConfig is
        # instantiated, load the configs implicitly when assigning text_model_id
        # or audio_model_id. This allows:
        #
        #   --hf-overrides.text_model_id=<quantized variant>
        #
        # to behave as intended.
        if key == "text_model_id" and value is not None:
            # Imported here rather than at module level.
            from vllm.transformers_utils.config import get_config

            self.wrapped_model_config = get_config(value,
                                                   trust_remote_code=False)
        elif key == "audio_model_id" and value is not None:
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(value, trust_remote_code=False)

        return super().__setattr__(key, value)

    @property
    def text_config(self) -> transformers.PretrainedConfig:
        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
        # the full model, but the text config is the text config of the inner
        # model.
        return self.wrapped_model_config.get_text_config()
|
||||
199
vllm/transformers_utils/detokenizer_utils.py
Normal file
199
vllm/transformers_utils/detokenizer_utils.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from .tokenizer import AnyTokenizer
|
||||
|
||||
|
||||
def _replace_none_with_empty(tokens: list[Optional[str]]):
|
||||
for i, token in enumerate(tokens):
|
||||
if token is None:
|
||||
tokens[i] = ""
|
||||
|
||||
|
||||
def _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer: AnyTokenizer,
|
||||
output_tokens: list[str],
|
||||
skip_special_tokens: bool,
|
||||
spaces_between_special_tokens: bool,
|
||||
) -> str:
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
|
||||
# NOTE(woosuk): The following code is slow because it runs a for loop over
|
||||
# the output_tokens. In Python, running a for loop over a list can be slow
|
||||
# even when the loop body is very simple.
|
||||
# Performance improvements: avoid repeated attribute and function lookups;
|
||||
# localize frequently used objects;
|
||||
|
||||
sub_texts: list[str] = []
|
||||
current_sub_text: list[str] = []
|
||||
convert_tokens_to_string = tokenizer.convert_tokens_to_string
|
||||
added_vocab_set = set(tokenizer.get_added_vocab())
|
||||
all_special_tokens = set(
|
||||
tokenizer.all_special_tokens) if skip_special_tokens else ()
|
||||
|
||||
for token in output_tokens:
|
||||
# Use precomputed set for skip-special check
|
||||
if token in all_special_tokens:
|
||||
continue
|
||||
if token in added_vocab_set:
|
||||
if current_sub_text:
|
||||
sub_texts.append(convert_tokens_to_string(current_sub_text))
|
||||
current_sub_text.clear()
|
||||
sub_texts.append(token)
|
||||
else:
|
||||
current_sub_text.append(token)
|
||||
if current_sub_text:
|
||||
sub_texts.append(convert_tokens_to_string(current_sub_text))
|
||||
if spaces_between_special_tokens:
|
||||
return " ".join(sub_texts)
|
||||
return "".join(sub_texts)
|
||||
|
||||
|
||||
# 5 is an arbitrary value that should work for all
|
||||
# tokenizers (bigger = more conservative).
|
||||
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
||||
|
||||
|
||||
def convert_prompt_ids_to_tokens(
    tokenizer: AnyTokenizer,
    prompt_ids: list[int],
    skip_special_tokens: bool = False,
) -> tuple[list[str], int, int]:
    """Convert the tail of the prompt ids to tokens for incremental
    detokenization.

    Only the last ``INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET + 2`` prompt ids
    are converted; earlier ids are not needed to start incremental
    detokenization.

    Returns:
        A tuple ``(new_tokens, prefix_offset, read_offset)``.
        ``read_offset`` is the number of tokens produced and
        ``prefix_offset`` is ``read_offset`` minus the initial offset,
        clamped at zero.
    """
    # Take a couple of ids beyond the offset window in case some of them are
    # special tokens.
    tail_length = INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET + 2
    new_tokens = tokenizer.convert_ids_to_tokens(
        prompt_ids[-tail_length:],
        skip_special_tokens=skip_special_tokens,
    )

    read_offset = len(new_tokens)
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)

    # Out-of-vocab prompt token ids can yield None entries; blank them out.
    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset
|
||||
|
||||
|
||||
def convert_ids_list_to_tokens(
    tokenizer: AnyTokenizer,
    token_ids: list[int],
) -> list[str]:
    """Decode each token id separately into its string form.

    Args:
        tokenizer: tokenizer used by model under test
        token_ids: convert these tokens (Python list form)

    Returns:
        Python list of token string representations, one per input id. An
        id that decodes to ``None`` is represented by ``""``.
    """
    # Every id is decoded on its own, using the tokenizer's default
    # skip_special_tokens setting.
    decoded = (tokenizer.decode([token_id]) for token_id in token_ids)
    return ["" if text is None else text for text in decoded]
|
||||
|
||||
|
||||
# Based on
|
||||
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
|
||||
# under Apache 2.0 license
|
||||
def detokenize_incrementally(
    tokenizer: AnyTokenizer,
    all_input_ids: list[int],
    prev_tokens: Optional[list[str]],
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> tuple[list[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.

    Returns:
        A tuple ``(new_tokens, new_text, prefix_offset, read_offset)``. When
        no text can be emitted yet, ``new_text`` is ``""`` and the offsets
        are returned unchanged; otherwise the offsets advance to the old
        read offset and the total number of tokens respectively.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset,
         read_offset) = convert_prompt_ids_to_tokens(
             tokenizer,
             all_input_ids[:-1],
             skip_special_tokens=skip_special_tokens)
    assert prev_tokens is not None

    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens

    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    # "\ufffd" is U+FFFD REPLACEMENT CHARACTER, written as an escape so the
    # literal survives re-encoding of this file.
    if len(new_text) <= len(prefix_text) or new_text.endswith("\ufffd"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset

    new_text = new_text[len(prefix_text):]
    return new_tokens, new_text, read_offset, len(output_tokens)
|
||||
60
vllm/transformers_utils/dynamic_module.py
Normal file
60
vllm/transformers_utils/dynamic_module.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[type]:
    """
    Best-effort wrapper around
    `transformers.dynamic_module_utils.get_class_from_dynamic_module`.

    All arguments except ``warn_on_fail`` are forwarded unchanged. Any
    exception raised by the underlying call is caught and ``None`` is
    returned instead; when ``warn_on_fail`` is true the failure is logged as
    a warning together with the traceback.
    """
    try:
        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        # Name the hub in the warning according to the ModelScope setting.
        location = "HF Hub"
        if envs.VLLM_USE_MODELSCOPE:
            location = "ModelScope"

        if warn_on_fail:
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                location,
                exc_info=True,
            )

        return None
|
||||
299
vllm/transformers_utils/processor.py
Normal file
299
vllm/transformers_utils/processor.py
Normal file
@@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
from transformers import (AutoFeatureExtractor, AutoImageProcessor,
|
||||
AutoProcessor, AutoVideoProcessor)
|
||||
from transformers.feature_extraction_utils import FeatureExtractionMixin
|
||||
from transformers.image_processing_utils import BaseImageProcessor
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from transformers.video_processing_utils import BaseVideoProcessor
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
from vllm.utils import get_allowed_kwarg_only_overrides
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
|
||||
|
||||
|
||||
class HashableDict(dict):
    """
    A ``dict`` subclass that defines ``__hash__`` so instances can be passed
    as arguments to functions wrapped by ``lru_cache``.
    """

    # NOTE: a plain dict is not hashable, so the hash is defined directly on
    # this subclass for simplicity.
    def __hash__(self) -> int:  # type: ignore[override]
        # Hash an order-independent snapshot of the (key, value) pairs.
        item_set = frozenset(self.items())
        return hash(item_set)
|
||||
|
||||
|
||||
class HashableList(list):
    """
    A ``list`` subclass that defines ``__hash__`` so instances can be passed
    as arguments to functions wrapped by ``lru_cache``.
    """

    def __hash__(self) -> int:  # type: ignore[override]
        # Hash the current contents as a tuple, so order matters.
        snapshot = tuple(self)
        return hash(snapshot)
|
||||
|
||||
|
||||
def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
    """Return the callable used to build a processor of ``processor_cls``.

    A tuple of classes, or ``ProcessorMixin`` itself, maps to
    ``AutoProcessor.from_pretrained``. Any other class maps to its own
    ``from_pretrained`` attribute when it has one, otherwise to the class
    itself.
    """
    use_auto_processor = (isinstance(processor_cls, tuple)
                          or processor_cls == ProcessorMixin)
    if use_auto_processor:
        return AutoProcessor.from_pretrained

    # Fall back to the class itself when it has no from_pretrained.
    return getattr(processor_cls, "from_pretrained", processor_cls)
|
||||
|
||||
|
||||
def _merge_mm_kwargs(
    model_config: "ModelConfig",
    processor_cls: Union[type, tuple[type, ...]],
    /,
    **kwargs,
):
    """Build the keyword arguments to pass to a cached processor loader.

    ``kwargs`` are merged through the model's multimodal config
    (``merge_mm_processor_kwargs``) and then filtered with
    ``get_allowed_kwarg_only_overrides`` against the factory chosen for
    ``processor_cls``. Dict and list values in the result are wrapped in
    ``HashableDict`` / ``HashableList``.
    """
    merged_kwargs = model_config.get_multimodal_config(
    ).merge_mm_processor_kwargs(kwargs)

    allowed_kwargs = get_allowed_kwarg_only_overrides(
        _get_processor_factory_fn(processor_cls),
        merged_kwargs,
        requires_kw_only=False,
        allow_var_kwargs=True,
    )

    # NOTE: plain dicts and lists are unhashable and would make the
    # `lru_cache`-wrapped loaders raise, so wrap them in hashable
    # subclasses. Only existing keys are reassigned, which is safe while
    # iterating.
    for name, value in allowed_kwargs.items():
        if isinstance(value, dict):
            allowed_kwargs[name] = HashableDict(value)
        elif isinstance(value, list):
            allowed_kwargs[name] = HashableList(value)

    return allowed_kwargs
|
||||
|
||||
|
||||
def get_processor(
    processor_name: str,
    *args: Any,
    revision: Optional[str] = None,
    trust_remote_code: bool = False,
    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load a processor for the given model name via HuggingFace.

    ``revision`` defaults to ``"main"`` when not given. A tuple of classes
    or ``ProcessorMixin`` itself is loaded through ``AutoProcessor``; a
    ``ProcessorMixin`` subclass is loaded through its own
    ``from_pretrained``; any other class is instantiated directly with
    ``*args`` and ``**kwargs``.

    Raises:
        RuntimeError: If loading raises ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Re-raised as-is when ``trust_remote_code`` is true.
        TypeError: If the loaded object is not an instance of
            ``processor_cls``.
    """
    if revision is None:
        revision = "main"

    try:
        # Pick the HF class whose from_pretrained should be used, if any.
        if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
            hf_loader = AutoProcessor
        elif issubclass(processor_cls, ProcessorMixin):
            hf_loader = processor_cls
        else:
            hf_loader = None

        if hf_loader is not None:
            processor = hf_loader.from_pretrained(
                processor_name,
                *args,
                revision=revision,
                trust_remote_code=trust_remote_code,
                **kwargs,
            )
        else:
            # Processors that are standalone classes unrelated to HF
            processor = processor_cls(*args, **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        if trust_remote_code:
            raise e

        err_msg = (
            "Failed to load the processor. If the processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    if not isinstance(processor, processor_cls):
        raise TypeError("Invalid type of HuggingFace processor. "
                        f"Expected type: {processor_cls}, but "
                        f"found type: {type(processor)}")

    return processor
|
||||
|
||||
|
||||
cached_get_processor = lru_cache(get_processor)
|
||||
|
||||
|
||||
def cached_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load a processor through ``cached_get_processor`` using the model
    name, revision and ``trust_remote_code`` setting from ``model_config``.

    ``kwargs`` are first passed through ``_merge_mm_kwargs``.
    """
    mm_kwargs = _merge_mm_kwargs(model_config, processor_cls, **kwargs)
    return cached_get_processor(
        model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **mm_kwargs,
    )
|
||||
|
||||
|
||||
def get_feature_extractor(
    processor_name: str,
    *args: Any,
    revision: Optional[str] = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an audio feature extractor for the given model name
    via HuggingFace.

    Raises:
        RuntimeError: If loading raises ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Re-raised as-is when ``trust_remote_code`` is true.
    """
    try:
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the extractor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoFeatureExtractor does not separate such
        # errors
        if trust_remote_code:
            raise e

        err_msg = (
            "Failed to load the feature extractor. If the feature "
            "extractor is a custom extractor not yet available in the "
            "HuggingFace transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e

    return cast(FeatureExtractionMixin, feature_extractor)
|
||||
|
||||
|
||||
cached_get_feature_extractor = lru_cache(get_feature_extractor)
|
||||
|
||||
|
||||
def cached_feature_extractor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Return the cached feature extractor described by ``model_config``.

    Extra keyword arguments come from ``_merge_mm_kwargs`` applied to
    ``model_config``, ``AutoFeatureExtractor`` and ``kwargs``.
    """
    model_name = model_config.model
    revision = model_config.revision
    trust_remote_code = model_config.trust_remote_code
    merged_kwargs = _merge_mm_kwargs(model_config, AutoFeatureExtractor,
                                     **kwargs)

    return cached_get_feature_extractor(
        model_name,
        revision=revision,
        trust_remote_code=trust_remote_code,
        **merged_kwargs,
    )
|
||||
|
||||
|
||||
def get_image_processor(
    processor_name: str,
    *args: Any,
    revision: Optional[str] = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    All positional and keyword arguments are forwarded to
    ``AutoImageProcessor.from_pretrained``.

    Raises:
        RuntimeError: if loading raised ``ValueError`` while
            ``trust_remote_code`` was false; the message suggests enabling it.
        ValueError: re-raised unchanged when ``trust_remote_code`` was
            already true.
    """
    try:
        loaded = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # Remote code was already allowed, so there is no extra hint to give.
        if trust_remote_code:
            raise

        # Unlike AutoTokenizer, AutoImageProcessor does not report a missing
        # or not-yet-imported processor class separately, so suggest the
        # trust-remote-code option for any ValueError.
        raise RuntimeError(
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.") from e

    return cast(BaseImageProcessor, loaded)
|
||||
|
||||
|
||||
# Memoized form of get_image_processor (lru_cache defaults: maxsize 128,
# hashable arguments required).
cached_get_image_processor = lru_cache(get_image_processor)
|
||||
|
||||
|
||||
def cached_image_processor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Return the cached image processor described by ``model_config``.

    Extra keyword arguments come from ``_merge_mm_kwargs`` applied to
    ``model_config``, ``AutoImageProcessor`` and ``kwargs``.
    """
    model_name = model_config.model
    revision = model_config.revision
    trust_remote_code = model_config.trust_remote_code
    merged_kwargs = _merge_mm_kwargs(model_config, AutoImageProcessor,
                                     **kwargs)

    return cached_get_image_processor(
        model_name,
        revision=revision,
        trust_remote_code=trust_remote_code,
        **merged_kwargs,
    )
|
||||
|
||||
|
||||
def get_video_processor(
    processor_name: str,
    *args: Any,
    revision: Optional[str] = None,
    trust_remote_code: bool = False,
    processor_cls_overrides: Optional[type[_V]] = None,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    ``processor_cls_overrides`` is used as the loader class when it is
    truthy; otherwise ``AutoVideoProcessor`` is used. The remaining
    arguments are forwarded to that class's ``from_pretrained``.

    Raises:
        RuntimeError: if loading raised ``ValueError`` while
            ``trust_remote_code`` was false; the message suggests enabling it.
        ValueError: re-raised unchanged when ``trust_remote_code`` was
            already true.
    """
    try:
        loader_cls = processor_cls_overrides or AutoVideoProcessor
        loaded = loader_cls.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # Remote code was already allowed, so there is no extra hint to give.
        if trust_remote_code:
            raise

        # Unlike AutoTokenizer, AutoVideoProcessor does not report a missing
        # or not-yet-imported processor class separately, so suggest the
        # trust-remote-code option for any ValueError.
        raise RuntimeError(
            "Failed to load the video processor. If the video processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.") from e

    return cast(BaseVideoProcessor, loaded)
|
||||
|
||||
|
||||
# Memoized form of get_video_processor (lru_cache defaults: maxsize 128,
# hashable arguments required).
cached_get_video_processor = lru_cache(get_video_processor)
|
||||
|
||||
|
||||
def cached_video_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: Optional[type[_V]] = None,
    **kwargs: Any,
):
    """Return the cached video processor described by ``model_config``.

    ``processor_cls`` is passed on as ``processor_cls_overrides``. Extra
    keyword arguments come from ``_merge_mm_kwargs`` applied to
    ``model_config``, ``AutoVideoProcessor`` and ``kwargs``.
    """
    model_name = model_config.model
    revision = model_config.revision
    trust_remote_code = model_config.trust_remote_code
    merged_kwargs = _merge_mm_kwargs(model_config, AutoVideoProcessor,
                                     **kwargs)

    return cached_get_video_processor(
        model_name,
        revision=revision,
        trust_remote_code=trust_remote_code,
        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
        **merged_kwargs,
    )
|
||||
16
vllm/transformers_utils/processors/__init__.py
Normal file
16
vllm/transformers_utils/processors/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Multi-modal processors may be defined in this directory for the following
|
||||
reasons:
|
||||
|
||||
- There is no processing file defined by HF Hub or Transformers library.
|
||||
- There is a need to override the existing processor to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
||||
DeepseekVLV2Processor)
|
||||
from vllm.transformers_utils.processors.ovis import OvisProcessor
|
||||
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
|
||||
|
||||
__all__ = ["DeepseekVLV2Processor", "OvisProcessor", "Ovis2_5Processor"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
362
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
362
vllm/transformers_utils/processors/deepseek_vl2.py
Normal file
@@ -0,0 +1,362 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
|
||||
# Copyright (c) 2023-2024 DeepSeek.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# this software and associated documentation files (the "Software"), to deal in
|
||||
# the Software without restriction, including without limitation the rights to
|
||||
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
# the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image, ImageOps
|
||||
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
|
||||
class ImageTransform:
    """Callable converting a PIL image to a tensor, optionally normalized.

    The pipeline is ``T.ToTensor()`` followed, when ``normalize`` is true,
    by ``T.Normalize(mean, std)``.
    """

    def __init__(self,
                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
                 normalize: bool = True):
        self.mean = mean
        self.std = std
        self.normalize = normalize

        steps = [T.ToTensor()]
        if normalize:
            steps += [T.Normalize(mean, std)]
        self.transform = T.Compose(steps)

    def __call__(self, pil_img: Image.Image):
        """Apply the composed transform to ``pil_img`` and return the result."""
        return self.transform(pil_img)
|
||||
|
||||
|
||||
class DeepseekVLV2Processor(ProcessorMixin):
|
||||
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
|
||||
attributes = ["tokenizer"]
|
||||
|
||||
def __init__(
    self,
    tokenizer: LlamaTokenizerFast,
    candidate_resolutions: tuple[tuple[int, int]],
    patch_size: int,
    downsample_ratio: int,
    image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
    image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
    normalize: bool = True,
    image_token: str = "<image>",
    pad_token: str = "<|▁pad▁|>",
    add_special_token: bool = False,
    sft_format: str = "deepseek",
    mask_prompt: bool = True,
    ignore_id: int = -100,
    **kwargs,
):
    """Store the image/tokenizer settings and register special tokens.

    The tokenizer is mutated in place: its padding side is set to left,
    a pad token is added if it has none, and the image token, the
    grounding tokens and the SFT role tokens are added as special tokens.
    The order of these additions is kept as is because it determines the
    ids the tokenizer assigns to newly added tokens.

    ``image_size`` is taken from the first entry of
    ``candidate_resolutions`` (its first element). Remaining ``kwargs``
    are forwarded to ``ProcessorMixin.__init__``.
    """

    self.candidate_resolutions = candidate_resolutions
    self.image_size = candidate_resolutions[0][0]
    self.patch_size = patch_size
    self.image_mean = image_mean
    self.image_std = image_std
    self.normalize = normalize
    self.downsample_ratio = downsample_ratio

    self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
    self.tokenizer = tokenizer
    self.tokenizer.padding_side = 'left'  # must be set: the padding side makes a difference in batch inference

    # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
    if tokenizer.pad_token is None:
        self.tokenizer.add_special_tokens({'pad_token': pad_token})

    # add image token (only when the vocabulary does not already contain it)
    image_token_id = self.tokenizer.vocab.get(image_token)
    if image_token_id is None:
        special_tokens = [image_token]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)
    # look the id up again so it is set whether or not the token was just added
    self.image_token_id = self.tokenizer.vocab.get(image_token)

    # add five special tokens for grounding-related tasks
    # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
    special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
    special_tokens_dict = {"additional_special_tokens": special_tokens}
    self.tokenizer.add_special_tokens(special_tokens_dict)

    # add special tokens for SFT data
    special_tokens = ["<|User|>", "<|Assistant|>"]
    special_tokens_dict = {"additional_special_tokens": special_tokens}
    self.tokenizer.add_special_tokens(special_tokens_dict)

    self.image_token = image_token
    self.pad_token = pad_token
    self.add_special_token = add_special_token
    self.sft_format = sft_format
    self.mask_prompt = mask_prompt
    self.ignore_id = ignore_id

    super().__init__(
        tokenizer,
        **kwargs,
    )
|
||||
|
||||
def select_best_resolution(self, image_size):
    """Pick the candidate resolution that best fits ``image_size``.

    ``image_size`` is ``(width, height)``. For every ``(width, height)`` in
    ``self.candidate_resolutions`` the image is scaled to fit inside the
    candidate while keeping its aspect ratio. The candidate with the largest
    effective pixel count (capped at the original pixel count) wins; ties go
    to the candidate wasting the fewest pixels, and then to the earliest
    candidate. Returns ``None`` when there are no candidates. Used for
    cropping.
    """
    src_w, src_h = image_size
    src_area = src_w * src_h

    chosen = None
    # (effective pixels, -wasted pixels): larger tuples are better fits.
    best_key = (0, -float("inf"))

    for cand_w, cand_h in self.candidate_resolutions:
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * ratio) * int(src_h * ratio)
        usable = min(fitted_area, src_area)
        wasted = (cand_w * cand_h) - usable

        key = (usable, -wasted)
        if key > best_key:
            best_key = key
            chosen = (cand_w, cand_h)

    return chosen
|
||||
|
||||
@property
def bos_id(self):
    """Return the tokenizer's ``bos_token_id``."""
    return self.tokenizer.bos_token_id
|
||||
|
||||
@property
def eos_id(self):
    """Return the tokenizer's ``eos_token_id``."""
    return self.tokenizer.eos_token_id
|
||||
|
||||
@property
def pad_id(self):
    """Return the tokenizer's ``pad_token_id``."""
    return self.tokenizer.pad_token_id
|
||||
|
||||
def encode(self, text: str, bos: bool = True, eos: bool = False):
    """Tokenize ``text`` without the tokenizer's own special tokens.

    The BOS id is prepended when ``bos`` is true and the EOS id appended
    when ``eos`` is true.
    """
    token_ids = self.tokenizer.encode(text, add_special_tokens=False)

    if bos:
        token_ids = [self.bos_id, *token_ids]
    if eos:
        token_ids = [*token_ids, self.eos_id]

    return token_ids
|
||||
|
||||
def decode(self, t: list[int], **kwargs) -> str:
    """Decode token ids ``t`` with the tokenizer, forwarding ``kwargs``."""
    return self.tokenizer.decode(t, **kwargs)
|
||||
|
||||
def process_one(
    self,
    prompt: str,
    images: list[Image.Image],
    inference_mode: bool = True,
    **kwargs: Any,
):
    """
    Tokenize one prompt and preprocess its images into a ``BatchFeature``.

    Args:
        prompt (str): the formatted prompt;
        images (list[ImageType]): the list of images;
        inference_mode (bool): if True, then remove the last eos token;
        **kwargs: Additional keyword arguments (accepted but not used here).

    Returns:
        BatchFeature built with ``tensor_type="pt"`` from:
            - input_ids: token ids (text plus image tokens) with a leading
              dimension of size 1 added
            - pixel_values: the image views stacked along dim 0, or a single
              all-zero ``(1, 3, image_size, image_size)`` tensor when there
              are no images
            - images_seq_mask: bool mask marking the image-token positions
            - images_spatial_crop: ``[num_width_tiles, num_height_tiles]``
              per image, or a zero ``(1, 2)`` tensor when there are no images
            - num_image_tokens: the number of image tokens for each image
    """

    assert (prompt is not None and images is not None
            ), "prompt and images must be used at the same time."

    # Cropping into local tiles is only enabled for at most two images.
    (tokenized_str, images_list, images_seq_mask, images_spatial_crop,
     num_image_tokens) = self.tokenize_with_images(
         prompt, images, bos=True, eos=True, cropping=len(images) <= 2)

    assert len(tokenized_str) == len(images_seq_mask), \
        (f"tokenized_str's length {len(tokenized_str)} and "
         f"images_seq_mask's length {len(images_seq_mask)} are not equal")

    input_ids = torch.LongTensor(tokenized_str)
    images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

    # Replace any negative ids with the pad token id.
    input_ids[input_ids < 0] = self.pad_id

    if inference_mode:
        # Remove the ending eos token
        assert input_ids[-1] == self.eos_id
        input_ids = input_ids[:-1]
        images_seq_mask = images_seq_mask[:-1]

    if len(images_list) == 0:
        # No images: emit zero-filled stand-ins.
        pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
        images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
    else:
        pixel_values = torch.stack(images_list, dim=0)
        images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

    input_ids = input_ids.unsqueeze(0)

    return BatchFeature(
        data=dict(
            input_ids=input_ids,
            pixel_values=pixel_values,
            images_seq_mask=images_seq_mask,
            images_spatial_crop=images_spatial_crop,
            num_image_tokens=num_image_tokens,
        ),
        tensor_type="pt",
    )
|
||||
|
||||
def __call__(
    self,
    *,
    text: str,
    images: list[Image.Image],
    inference_mode: bool = True,
    **kwargs: Any,
):
    """
    Process one prompt together with its images.

    Thin keyword-only wrapper around ``process_one``.

    Args:
        text (str): the formatted prompt;
        images (list[ImageType]): the list of images;
        inference_mode (bool): if True, then remove the last eos token;
        **kwargs: accepted but not forwarded to ``process_one``.

    Returns:
        Whatever ``process_one`` returns for these arguments.
    """
    return self.process_one(
        prompt=text,
        images=images,
        inference_mode=inference_mode,
    )
|
||||
|
||||
def tokenize_with_images(
    self,
    conversation: str,
    images: list[Image.Image],
    bos: bool = True,
    eos: bool = True,
    cropping: bool = True,
):
    """Tokenize text with <image> tags.

    ``conversation`` is split on ``self.image_token``. Each text piece is
    encoded, and each image contributes one padded global view plus
    ``image_size``-square local tiles, followed by a run of image-token ids.

    Args:
        conversation: prompt containing one image tag per entry of ``images``.
        images: the images, in the order their tags appear.
        bos: prepend the BOS id to the token sequence.
        eos: append the EOS id to the token sequence.
        cropping: if true, the local-view resolution comes from
            ``select_best_resolution``; otherwise a single
            ``image_size`` x ``image_size`` tile is used.

    Returns:
        Tuple ``(tokenized_str, images_list, images_seq_mask,
        images_spatial_crop, num_image_tokens)``: the token ids, the
        transformed views (global first, then local tiles, per image), a
        per-token flag that is true on image tokens, the
        ``[num_width_tiles, num_height_tiles]`` pair per image, and the
        image-token count per image.
    """
    assert conversation.count(self.image_token) == len(images)
    text_splits = conversation.split(self.image_token)
    images_list, images_seq_mask, images_spatial_crop = [], [], []
    num_image_tokens = []
    tokenized_str = []

    # Loop-invariant values: the padding colour (transform mean scaled to
    # 0-255) and the number of image tokens along one side of a tile.
    pad_color = tuple(int(x * 255) for x in self.image_transform.mean)
    h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)

    for text_sep, image in zip(text_splits, images):
        # encode text_sep
        tokenized_sep = self.encode(text_sep, bos=False, eos=False)
        tokenized_str += tokenized_sep
        images_seq_mask += [False] * len(tokenized_sep)

        # select best resolution for anyres
        if cropping:
            best_width, best_height = self.select_best_resolution(image.size)
        else:
            best_width, best_height = self.image_size, self.image_size

        # process the global view
        global_view = ImageOps.pad(image, (self.image_size, self.image_size),
                                   color=pad_color)
        images_list.append(self.image_transform(global_view))

        # process the local views
        local_view = ImageOps.pad(image, (best_width, best_height),
                                  color=pad_color)
        for i in range(0, best_height, self.image_size):
            for j in range(0, best_width, self.image_size):
                images_list.append(
                    self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))

        # record height / width crop num
        num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
        images_spatial_crop.append([num_width_tiles, num_height_tiles])

        # add image tokens
        # global views tokens h * (w + 1), 1 is for line separator
        tokenized_image = [self.image_token_id] * h * (w + 1)
        # add a separator between global and local views
        tokenized_image += [self.image_token_id]
        # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
        tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)

        tokenized_str += tokenized_image
        images_seq_mask += [True] * len(tokenized_image)
        num_image_tokens.append(len(tokenized_image))

    # process the last text split
    tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
    tokenized_str += tokenized_sep
    images_seq_mask += [False] * len(tokenized_sep)

    # add the bos and eos tokens
    if bos:
        tokenized_str = [self.bos_id] + tokenized_str
        images_seq_mask = [False] + images_seq_mask
    if eos:
        tokenized_str = tokenized_str + [self.eos_id]
        images_seq_mask = images_seq_mask + [False]

    assert len(tokenized_str) == len(
        images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

    return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
|
||||
|
||||
|
||||
# Register the processor with AutoProcessor under the key
# "DeepseekVLV2Processor" (runs at import time).
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
|
||||
420
vllm/transformers_utils/processors/ovis.py
Normal file
420
vllm/transformers_utils/processors/ovis.py
Normal file
@@ -0,0 +1,420 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
|
||||
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from functools import cached_property
|
||||
from typing import Union
|
||||
|
||||
import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
__all__ = ['OvisProcessor']
|
||||
IGNORE_ID = -100
|
||||
|
||||
class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    # Default keyword arguments, grouped by the component they are meant for.
    # The "images_kwargs" entries match the keyword parameters of
    # OvisProcessor.preprocess_image.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "max_partition": 9,
            "covering_threshold": 0.9,
            "convert_to_rgb": True,
            "return_tensors": "pt",
        },
    }
|
||||
|
||||
|
||||
|
||||
class OvisProcessor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
|
||||
[`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
|
||||
[`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
|
||||
Args:
|
||||
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
||||
in a chat into a tokenizable string.
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
|
||||
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
    chat_template=None,
    image_pad_token=None,
    image_segment_len=255,
    **kwargs,
):
    """Store the image-token settings and initialise the base processor.

    ``image_pad_token`` is the token string later looked up in the
    tokenizer vocabulary by ``extra_special_tokens``;
    ``image_segment_len`` is the number of extra pad ids emitted per image
    atom by ``construct_image_placeholders``. ``kwargs`` is accepted but
    not used.
    """
    # Marker string that text is split on to locate image positions.
    self.image_token = "<image>"
    self.image_pad_token = image_pad_token
    self.image_segment_len = image_segment_len
    super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||
|
||||
@cached_property
def extra_special_tokens(self):
    """Map symbolic token names to ids (computed once per instance).

    The image marker and indicator entries are fixed negative ids;
    ``image_pad`` is the tokenizer-vocabulary id of ``self.image_pad_token``.
    """
    vocab = self.tokenizer.get_vocab()
    pad_id = vocab[self.image_pad_token]
    return {
        "image_token": -200,
        "image_atom": -300,
        "image_start": -301,
        "image_prefix": -302,
        "image_col_sep": -303,
        "image_row_sep": -304,
        "image_end": -305,
        "image_pad": pad_id,
    }
|
||||
|
||||
def __call__(
    self,
    images: ImageInput = None,
    text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
    **kwargs: Unpack[OvisProcessorKwargs],
) -> BatchFeature:
    """
    Preprocess ``images`` and tokenize ``text`` for the Ovis model.

    Every image goes through ``preprocess_image``. The text is tokenized with
    ``_tokenize_with_image_symbol`` and each image marker id in it is
    replaced, in order across the whole batch, by the placeholder ids of the
    corresponding image.

    Args:
        images: a single image or a list of images; ``None`` for text only.
        text: a single string or a list of strings; ``None`` for images only.
        **kwargs: merged with ``OvisProcessorKwargs`` defaults through
            ``_merge_kwargs``; the resulting ``images_kwargs`` are passed to
            ``preprocess_image``.

    Returns:
        [`BatchFeature`]: when ``text`` is given, it holds
        - **input_ids** -- the per-sequence token ids stacked into one tensor
          (an empty tensor if there are no sequences);
        - **pixel_values** and **grids** -- the per-image results of
          ``preprocess_image``, present only if at least one image was
          processed.

        When ``text`` is ``None`` it holds only **image_placeholders** (and
        is empty if no image was processed).

    Raises:
        RuntimeError: if the text contains more image markers than images
            were provided.
    """
    output_kwargs = self._merge_kwargs(
        OvisProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )

    # Process all images first
    image_features = {}
    if images is not None:
        processed_images = []
        image_placeholders_list = []
        grids = []

        # Process each image
        for image in images if isinstance(images, list) else [images]:
            pixel_values, image_placeholders, grid = self.preprocess_image(
                image=image, **output_kwargs["images_kwargs"]
            )
            processed_images.append(pixel_values)
            image_placeholders_list.append(image_placeholders)
            grids.append(grid)

        # assign all processed images
        if processed_images:
            image_features["image_placeholders"] = image_placeholders_list

    # If only images were provided
    if text is None:
        return BatchFeature(data=image_features)

    # Process text input
    if not isinstance(text, list):
        text = [text]

    tokenized_batched_text = self._tokenize_with_image_symbol(text)
    image_token_id = self.get_token_value("image_token")
    placeholders = image_features.get("image_placeholders")
    replaced_ids_list = []
    idx = 0  # index of the next unused image, shared across the batch
    for ids_tensor in tokenized_batched_text:
        if placeholders is not None and image_token_id in ids_tensor:
            new_ids = []
            # replace placeholders
            for token_id in ids_tensor.tolist():
                if token_id != image_token_id:
                    new_ids.append(token_id)
                    continue
                # Check per marker so that running out of images is reported
                # as the mismatch error rather than an IndexError.
                if idx >= len(placeholders):
                    raise RuntimeError(
                        'Mismatch between the images you provided and the number of placeholder present in the text')
                new_ids.extend(placeholders[idx])
                idx += 1

            # Converts back to tensors
            ids_tensor = torch.tensor(new_ids, dtype=torch.long)

        replaced_ids_list.append(ids_tensor)

    if replaced_ids_list:
        replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
    else:
        replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)

    # Create the output with text features
    output = BatchFeature(
        data={
            "input_ids": replaced_and_tokenized_ids,
        }
    )

    # Add image features if present
    if image_features:
        output["pixel_values"] = processed_images
        output['grids'] = grids

    return output
|
||||
|
||||
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
    """Tokenize each text, putting the image marker id where ``<image>`` was.

    Each text is split on ``self.image_token``; the pieces are tokenized
    without special tokens and joined with the ``"image_token"`` id. The
    per-text id lists are returned as one ``torch.long`` tensor.
    """
    batch_token_ids = []
    for text in text_list:
        token_ids = []
        for position, segment in enumerate(text.split(self.image_token)):
            # One marker id between consecutive segments.
            if position > 0:
                token_ids.append(self.get_token_value("image_token"))
            token_ids.extend(self.tokenizer(segment, add_special_tokens=False).input_ids)
        batch_token_ids.append(token_ids)
    return torch.tensor(batch_token_ids, dtype=torch.long)
|
||||
|
||||
def get_image_size(self):
    """Return ``(height, width)`` read from ``self.image_processor.size``.

    A ``shortest_edge`` entry takes precedence and is used for both sides;
    otherwise both ``height`` and ``width`` must be present.

    Raises:
        ValueError: if neither form is available.
    """
    size = self.image_processor.size
    if 'shortest_edge' in size:
        edge = size['shortest_edge']
        return edge, edge
    if "height" in size and "width" in size:
        return size['height'], size['width']
    raise ValueError( "Can't parse image size from image_processor config.")
|
||||
|
||||
def get_token_value(self, tok):
    """Return the id stored under ``tok`` in ``self.extra_special_tokens``."""
    return self.extra_special_tokens[tok]
|
||||
|
||||
def construct_image_indicators(self, grid):
    """Build the indicator-id sequence for an image split into ``grid`` tiles.

    The sequence is start, atom, prefix, then (only when ``grid[0] * grid[1]``
    exceeds 1) one atom per tile with a column separator between tiles of a
    row and a row separator between rows, and finally the end id.
    """
    tok = self.get_token_value
    rows, cols = grid[0], grid[1]

    indicators = [tok('image_start'), tok('image_atom'), tok('image_prefix')]
    if rows * cols > 1:
        for r in range(rows):
            if r > 0:
                indicators.append(tok('image_row_sep'))
            for c in range(cols):
                if c > 0:
                    indicators.append(tok('image_col_sep'))
                indicators.append(tok('image_atom'))
    indicators.append(tok('image_end'))
    return indicators
|
||||
|
||||
def construct_image_placeholders(self, grid):
    """Expand the image indicators for ``grid`` into a run of pad ids.

    Every indicator becomes one ``image_pad`` id; an ``image_atom``
    indicator additionally contributes ``self.image_segment_len`` more pad
    ids.
    """
    indicators = self.construct_image_indicators(grid)

    atom_id = self.get_token_value('image_atom')
    # Padding token id taken from the tokenizer vocabulary
    pad_id = self.get_token_value('image_pad')

    single = [pad_id]
    atom_run = [pad_id] + [pad_id] * self.image_segment_len

    padded = []
    for indicator in indicators:
        padded += atom_run if indicator == atom_id else single
    return padded
|
||||
|
||||
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
    """Partition ``image`` into a grid of crops and preprocess each crop.

    Args:
        image: the input picture.
        max_partition: upper bound on ``rows * cols`` of candidate grids.
        covering_threshold: a grid is "good" when its covering ratio is
            strictly greater than this value.
        convert_to_rgb: when truthy the image goes through
            ``convert_image_mode(image, 'RGB')`` first.
        return_tensors: forwarded to ``self.image_processor.preprocess``.

    Returns:
        ``(pixel_values, image_placeholders, grid)`` where ``pixel_values`` is
        the concatenation (dim 0) of the square-padded crops; when there is
        more than one crop the full image is prepended as an extra entry.

    Raises:
        ValueError: if ``get_image_size()`` is not square.
    """

    def _preprocess(img: PIL.Image.Image, side):
        # Step 1: aspect-preserving resize so the longer edge equals `side`.
        src_w, src_h = img.size
        if src_w == src_h:
            target_w = target_h = side
        elif src_w > src_h:
            target_w = side
            target_h = int(src_h / src_w * target_w)
        else:
            target_h = side
            target_w = int(src_w / src_h * target_h)
        processed = self.image_processor.preprocess(
            img,
            size=dict(height=target_h, width=target_w),
            return_tensors=return_tensors)
        pixel_values = processed['pixel_values']

        # Step 2: centre the result on a zero-filled square canvas.
        canvas = torch.zeros([1, 3, side, side],
                             dtype=pixel_values.dtype,
                             device=pixel_values.device)
        out_h, out_w = pixel_values.shape[2:]
        if out_h == out_w:
            canvas[:, :, :, :] = pixel_values
        elif out_h > out_w:
            offset = (side - out_w) // 2
            canvas[:, :, :, offset:offset + out_w] = pixel_values
        else:
            offset = (side - out_h) // 2
            canvas[:, :, offset:offset + out_h, :] = pixel_values
        return canvas

    def _partition(img, grid) -> list[tuple[int, int, int, int]]:
        # Row-major (left, upper, right, lower) boxes; the last row/column
        # absorbs the remainder of the integer division.
        full_w, full_h = img.size
        n_rows, n_cols = grid[0], grid[1]
        cell_h = full_h // n_rows
        cell_w = full_w // n_cols
        return [
            (c * cell_w,
             r * cell_h,
             full_w if c == n_cols - 1 else (c + 1) * cell_w,
             full_h if r == n_rows - 1 else (r + 1) * cell_h)
            for r in range(n_rows)
            for c in range(n_cols)
        ]

    def _covering_area(left, upper, right, lower, side):
        # Area the box keeps once its longer edge is capped at `side`.
        span_x = right - left
        span_y = lower - upper
        long_edge = max(span_x, span_y)
        short_edge = min(span_x, span_y)
        if long_edge > side:
            short_edge = short_edge / long_edge * side
            long_edge = side
        return long_edge * short_edge

    def _get_best_grid(img, side):
        img_area = img.size[0] * img.size[1]
        limit = max_partition + 1
        candidate_grids = [(i, j)
                           for i in range(1, limit)
                           for j in range(1, limit)
                           if i * j <= max_partition]

        all_grids = []
        good_grids = []
        for grid in candidate_grids:
            covered = sum(
                [_covering_area(*box, side) for box in _partition(img, grid)])
            covering_ratio = covered / img_area
            assert covering_ratio <= 1.0
            entry = (grid, covering_ratio)
            all_grids.append(entry)
            if covering_ratio > covering_threshold:
                good_grids.append(entry)

        if good_grids:
            # fewest sub-images first, ties broken by higher covering ratio
            ranked = sorted(good_grids,
                            key=lambda e: (e[0][0] * e[0][1], -e[1]))
        else:
            # highest covering ratio first, ties broken by fewer sub-images
            ranked = sorted(all_grids,
                            key=lambda e: (-e[1], e[0][0] * e[0][1]))
        return ranked[0][0]

    if convert_to_rgb:
        image = convert_image_mode(image, 'RGB')

    sides = self.get_image_size()
    if sides[0] != sides[1]:
        raise ValueError('get_image_size() returns non-square size')
    side = sides[0]

    grid = _get_best_grid(image, side)
    crops = [image.crop(box) for box in _partition(image, grid)]
    if len(crops) > 1:
        # Multi-crop inputs also carry the full image as the first entry.
        crops = [image, *crops]
    pixel_values = torch.cat([_preprocess(crop, side) for crop in crops],
                             dim=0)
    image_placeholders = self.construct_image_placeholders(grid)
    return pixel_values, image_placeholders, grid
|
||||
|
||||
def batch_decode(self, *args, **kwargs):
    """
    Delegate to ``self.tokenizer.batch_decode`` with the arguments unchanged
    and return its result.
    """
    delegate = self.tokenizer.batch_decode
    return delegate(*args, **kwargs)
|
||||
|
||||
def decode(self, *args, **kwargs):
    """
    Delegate to ``self.tokenizer.decode`` with the arguments unchanged and
    return its result.
    """
    delegate = self.tokenizer.decode
    return delegate(*args, **kwargs)
|
||||
|
||||
def post_process_image_text_to_text(self, generated_outputs):
    """
    Decode generated token ids into text.

    Args:
        generated_outputs: passed as-is to ``self.tokenizer.batch_decode``
            (the output of the model's ``generate``).

    Returns:
        Whatever ``batch_decode`` returns when called with
        ``skip_special_tokens=True`` and
        ``clean_up_tokenization_spaces=False``.
    """
    decode_options = dict(skip_special_tokens=True,
                          clean_up_tokenization_spaces=False)
    return self.tokenizer.batch_decode(generated_outputs, **decode_options)
|
||||
|
||||
@property
def model_input_names(self):
    """Input names of the tokenizer and image processor, de-duplicated.

    Tokenizer names come first, first occurrence wins, and
    ``"second_per_grid_ts"`` is always appended at the end.
    """
    combined = (self.tokenizer.model_input_names +
                self.image_processor.model_input_names)
    # dict.fromkeys keeps first-seen order while dropping repeats.
    unique_names = list(dict.fromkeys(combined))
    unique_names.append("second_per_grid_ts")
    return unique_names
|
||||
|
||||
|
||||
AutoProcessor.register("OvisProcessor", OvisProcessor)
|
||||
458
vllm/transformers_utils/processors/ovis2_5.py
Normal file
458
vllm/transformers_utils/processors/ovis2_5.py
Normal file
@@ -0,0 +1,458 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
from functools import cached_property
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
__all__ = ['Ovis2_5Processor']
|
||||
IMAGE_TOKEN = "<image>"
|
||||
VIDEO_TOKEN = "<video>"
|
||||
MIN_PIXELS = 448 * 448
|
||||
MAX_PIXELS = 1792 * 1792
|
||||
|
||||
|
||||
class Ovis2_5ProcessorKwargs(ProcessingKwargs,
                             total=False):  # type: ignore[call-arg]
    # Default keyword arguments, grouped per modality. This class is handed
    # to `_merge_kwargs` by `Ovis2_5Processor.__call__`.
    _defaults = {
        # Tokenizer defaults: no padding.
        "text_kwargs": {
            "padding": False,
        },
        # Image defaults: RGB conversion on, pixel budget from the module
        # constants MIN_PIXELS / MAX_PIXELS.
        "images_kwargs": {
            'convert_to_rgb': True,
            'min_pixels': MIN_PIXELS,
            'max_pixels': MAX_PIXELS,
        },
        # Video defaults mirror the image defaults.
        "videos_kwargs": {
            'convert_to_rgb': True,
            'min_pixels': MIN_PIXELS,
            'max_pixels': MAX_PIXELS,
        }
    }
|
||||
|
||||
|
||||
class Ovis2_5Processor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor
|
||||
and a Qwen2 tokenizer into a single processor.
|
||||
[`OvisProcessor`] offers all the functionalities of
|
||||
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
|
||||
See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
|
||||
for more information.
|
||||
Args:
|
||||
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
||||
The image processor is a required input.
|
||||
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
||||
The tokenizer is a required input.
|
||||
chat_template (`str`, *optional*): A Jinja template which will
|
||||
be used to convert lists of messages in a chat into
|
||||
a tokenizable string.
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer"]
|
||||
valid_kwargs = ["chat_template", "image_pad_token"]
|
||||
|
||||
image_processor_class = "AutoImageProcessor"
|
||||
tokenizer_class = "AutoTokenizer"
|
||||
|
||||
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
    chat_template=None,
    image_pad_token=None,
    patch_size=16,
    hidden_stride=2,
    temporal_patch_size=1,
    **kwargs,
):
    """Store the patching parameters and initialise the processor mixin.

    Args:
        image_processor: forwarded to ``ProcessorMixin.__init__``.
        tokenizer: forwarded to ``ProcessorMixin.__init__``.
        chat_template: forwarded to ``ProcessorMixin.__init__``.
        image_pad_token: accepted but not read in this method.
        patch_size: stored as ``self.patch_size``.
        hidden_stride: stored as ``self.hidden_stride``.
        temporal_patch_size: stored as ``self.temporal_patch_size``.
        **kwargs: accepted and not used here.
    """
    self.image_token = IMAGE_TOKEN
    self.video_token = VIDEO_TOKEN
    # NOTE(review): the `image_pad_token` argument is ignored; the pad token
    # is always this literal. Confirm with callers whether that is intended.
    self.image_pad_token = "<|image_pad|>"

    self.patch_size = patch_size
    self.hidden_stride = hidden_stride
    self.temporal_patch_size = temporal_patch_size
    super().__init__(image_processor,
                     tokenizer,
                     chat_template=chat_template)
|
||||
|
||||
@cached_property
def extra_special_tokens(self):
    """Mapping from symbolic token names to the ids used by this processor.

    The visual marker ids are fixed negative sentinels; ``'image_pad'`` is
    looked up in the tokenizer vocabulary under ``self.image_pad_token``
    (a ``KeyError`` propagates if the vocabulary lacks it).
    """
    vocab = self.tokenizer.get_vocab()
    token_ids = dict(
        image_token=-200,
        video_token=-201,
        visual_atom=-300,
        image_start=-301,
        image_end=-302,
        video_start=-303,
        video_end=-304,
    )
    token_ids['image_pad'] = vocab[self.image_pad_token]
    return token_ids
|
||||
|
||||
def __call__(
    self,
    images: ImageInput = None,
    videos: Union[np.ndarray, list[ImageInput]] = None,
    text: Union[TextInput, PreTokenizedInput, list[TextInput],
                list[PreTokenizedInput]] = None,
    **kwargs: Unpack[Ovis2_5ProcessorKwargs],
) -> BatchFeature:
    """
    Preprocess images and/or videos and splice their placeholder ids into
    the tokenized text.

    Each image (or video) is passed individually to
    `preprocess_multidata` together with the merged `images_kwargs`
    (or `videos_kwargs`). Text is tokenized by
    `_tokenize_with_visual_symbol`; every image/video marker id in it is
    then replaced, in order of appearance across the batch, by the
    placeholder list of the next preprocessed image/video.

    Args:
        images: a single image or a list of images. A non-list value is
            treated as one image.
        videos: a single video or a list of videos. A non-list value is
            treated as one video.
        text: a string or a list of strings. A non-list value is wrapped
            into a one-element list.
        **kwargs: merged with `Ovis2_5ProcessorKwargs` defaults through
            `self._merge_kwargs`.

    Returns:
        [`BatchFeature`]: when `text` is given, a [`BatchFeature`] with:

        - **input_ids** -- stacked token ids after placeholder
          replacement.
        - **pixel_values** / **grids** -- per-image outputs of
          `preprocess_multidata`, set when images were processed.
        - **video_pixel_values** / **video_grids** -- per-video outputs
          of `preprocess_multidata`, set when videos were processed.

        When `text` is `None`, a [`BatchFeature`] built from the collected
        placeholder lists only (`image_placeholders` /
        `video_placeholders`).
    """
    output_kwargs = self._merge_kwargs(
        Ovis2_5ProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )
    # Process all images first
    visual_features = {}
    output = BatchFeature()
    if images is not None:
        processed_images = []
        image_placeholders_list = []
        grids = []
        # Process each image
        for image in images if isinstance(images, list) else [images]:
            pixel_values, image_placeholders, grid = (
                self.preprocess_multidata(
                    images=image, **output_kwargs["images_kwargs"]))
            processed_images.append(pixel_values)
            image_placeholders_list.append(image_placeholders)
            grids.append(grid)

        # assign all processed images
        if processed_images:
            visual_features["image_placeholders"] = image_placeholders_list
            output["pixel_values"] = processed_images
            output["grids"] = grids

    if videos is not None:
        processed_videos = []
        videos_placeholders_list = []
        grids = []
        # Process each video
        for video in videos if isinstance(videos, list) else [videos]:
            pixel_values, video_placeholders, grid = (
                self.preprocess_multidata(
                    video=video, **output_kwargs["videos_kwargs"]))
            processed_videos.append(pixel_values)
            videos_placeholders_list.append(video_placeholders)
            grids.append(grid)
        # assign all processed videos
        if processed_videos:
            visual_features[
                "video_placeholders"] = videos_placeholders_list
            output["video_pixel_values"] = processed_videos
            output["video_grids"] = grids

    # Process text input
    if text is not None:
        if not isinstance(text, list):
            text = [text]
        tokenized_batched_text = self._tokenize_with_visual_symbol(text)
        image_token_id = self.get_token_value("image_token")
        video_token_id = self.get_token_value("video_token")
        replaced_ids_list = []
        # Running cursors into the placeholder lists; they advance across
        # the whole batch, not per row.
        image_idx = 0
        video_idx = 0
        for ids_tensor in tokenized_batched_text:
            has_image_tokens = (image_token_id in ids_tensor
                                and "image_placeholders" in visual_features
                                and image_idx < len(
                                    visual_features["image_placeholders"]))
            has_video_tokens = (video_token_id in ids_tensor
                                and "video_placeholders" in visual_features
                                and video_idx < len(
                                    visual_features["video_placeholders"]))
            if has_image_tokens or has_video_tokens:
                # Convert to list for easier manipulation
                ids_list = ids_tensor.tolist()
                new_ids = []

                # Replace placeholders
                for token_id in ids_list:
                    if token_id == image_token_id:
                        new_ids.extend(
                            visual_features["image_placeholders"]
                            [image_idx])
                        image_idx += 1
                    elif token_id == video_token_id:
                        new_ids.extend(
                            visual_features["video_placeholders"]
                            [video_idx])
                        video_idx += 1
                    else:
                        new_ids.append(token_id)
                # Convert back to tensor
                ids_tensor = torch.tensor(new_ids, dtype=torch.long)
            replaced_ids_list.append(ids_tensor)
        if replaced_ids_list:
            # NOTE(review): torch.stack needs every row to have the same
            # length after replacement.
            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
        else:
            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)
        output["input_ids"] = replaced_and_tokenized_ids

        return output
    # If only images were provided
    # NOTE(review): this returns just the placeholder lists, not `output`
    # (pixel values / grids) -- confirm this is what callers expect.
    return BatchFeature(data=visual_features)
|
||||
|
||||
def _tokenize_with_visual_symbol(self,
|
||||
text_list: list[str]) -> torch.LongTensor:
|
||||
batch_token_ids = []
|
||||
for text in text_list:
|
||||
token_ids = []
|
||||
video_token_id = self.get_token_value("video_token")
|
||||
image_token_id = self.get_token_value("image_token")
|
||||
video_split_texts = text.split(self.video_token)
|
||||
|
||||
for j, video_segment in enumerate(video_split_texts):
|
||||
image_split_texts = video_segment.split(self.image_token)
|
||||
text_chunks = [
|
||||
self.tokenizer(chunk, add_special_tokens=False).input_ids
|
||||
for chunk in image_split_texts
|
||||
]
|
||||
segment_tokens = []
|
||||
for i, chunk in enumerate(text_chunks):
|
||||
segment_tokens.extend(chunk)
|
||||
if i < len(text_chunks) - 1:
|
||||
segment_tokens.append(image_token_id)
|
||||
token_ids.extend(segment_tokens)
|
||||
if j < len(video_split_texts) - 1:
|
||||
token_ids.append(video_token_id)
|
||||
|
||||
batch_token_ids.append(token_ids)
|
||||
return torch.tensor(batch_token_ids, dtype=torch.long)
|
||||
|
||||
# Copied from qwen2_vl
|
||||
def smart_resize(self,
                 height: int,
                 width: int,
                 factor: int = 28,
                 min_pixels: int = MIN_PIXELS,
                 max_pixels: int = MAX_PIXELS):
    """Pick a target size for an image of ``height`` x ``width``.

    The result satisfies:

    1. both dimensions are multiples of ``factor``;
    2. the pixel count is pushed into ``[min_pixels, max_pixels]``;
    3. the aspect ratio is kept as close to the input's as possible.

    Inputs with a side below ``factor`` are first scaled so the shorter
    side equals ``factor``; otherwise an aspect ratio above 200 is capped
    at 200. Both adjustments print a message.

    Returns:
        ``(h_bar, w_bar)``.
    """
    if height < factor or width < factor:
        print(f"height:{height} or width:{width} must be "
              f"larger than factor:{factor}")
        if height < width:
            height, width = factor, round(factor / height * width)
        else:
            height, width = round(factor / width * height), factor
    else:
        aspect = max(height, width) / min(height, width)
        if aspect > 200:
            print(f"absolute aspect ratio must be smaller than 200, "
                  f"got {aspect}")
            if height > width:
                height = 200 * width
            else:
                width = 200 * height

    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    area = h_bar * w_bar
    if area > max_pixels:
        # Too large: shrink uniformly, rounding down to a multiple.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif area < min_pixels:
        # Too small: grow uniformly, rounding up to a multiple.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
|
||||
|
||||
def get_token_value(self, tok):
    """Look up ``tok`` in ``self.extra_special_tokens`` and return its value."""
    token_table = self.extra_special_tokens
    return token_table[tok]
|
||||
|
||||
def construct_visual_indicators(self, grid, is_video: bool = False):
    """Build the indicator-token sequence for an image or a video.

    The sequence is ``start, visual_atom, [visual_atom per grid cell], end``.
    The per-cell atoms are only added when ``grid[0] * grid[1] > 1``; the
    start/end pair is the video one when ``is_video`` is true, otherwise
    the image one.
    """
    kind = 'video' if is_video else 'image'
    start_token = self.get_token_value(f'{kind}_start')
    end_token = self.get_token_value(f'{kind}_end')

    indicators = [start_token, self.get_token_value('visual_atom')]
    if grid[0] * grid[1] > 1:
        indicators.extend(
            self.get_token_value('visual_atom')
            for _row in range(grid[0])
            for _col in range(grid[1]))
    indicators.append(end_token)
    return indicators
|
||||
|
||||
def construct_visual_placeholders(self, grid, is_video: bool = False):
    """Return the run of ``image_pad`` ids standing in for one visual input.

    The indicator sequence for a ``(1, 1)`` grid is expanded: the
    ``visual_atom`` entry becomes
    ``grid[0] * grid[1] * grid[2] // hidden_stride**2 // temporal_patch_size``
    pad ids, and every other entry becomes a single pad id.
    """
    indicators = self.construct_visual_indicators((1, 1), is_video)
    atom_id = self.get_token_value('visual_atom')
    pad_id = self.get_token_value('image_pad')

    atom_count = grid[0] * grid[1] * grid[2]
    atom_count = atom_count // self.hidden_stride**2
    atom_count = atom_count // self.temporal_patch_size

    padded = []
    for indicator in indicators:
        repeat = atom_count if indicator == atom_id else 1
        padded.extend([pad_id] * repeat)
    return padded
|
||||
|
||||
def preprocess_multidata(
    self,
    images: Optional[Union[PIL.Image.Image, list[PIL.Image.Image]]] = None,
    video: Optional[Union[list[PIL.Image.Image], np.ndarray]] = None,
    convert_to_rgb: Optional[bool] = True,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
    return_tensors: Optional[str] = 'pt',
):
    """Turn one image (or one video) into flattened patches.

    Args:
        images: a single image or a list of images. Takes precedence over
            ``video`` when both are given.
        video: frames as a list of images, or an ``np.ndarray`` whose first
            axis indexes frames (each frame is cast to ``uint8``).
        convert_to_rgb: convert frames whose mode is not ``'RGB'``.
        min_pixels: lower pixel budget; ``None`` falls back to
            ``MIN_PIXELS``. It is clamped so it never exceeds ``max_pixels``.
        max_pixels: upper pixel budget; ``None`` falls back to
            ``MAX_PIXELS``.
        return_tensors: accepted for interface compatibility; the method
            always preprocesses with NumPy and returns torch tensors.

    Returns:
        ``(flatten_patches, visual_placeholders, grid)`` where ``grid`` is a
        ``1 x 3`` tensor ``[[grid_t, grid_h, grid_w]]``.

    Raises:
        ValueError: if neither ``images`` nor ``video`` is given.
        TypeError: if ``video`` is neither a list nor an ``np.ndarray``.
    """
    is_video = False
    if images is not None:
        if not isinstance(images, list):
            images = [images]
    elif video is not None:
        is_video = True
        # dummy multimodal data supplies the video as an np.ndarray
        if isinstance(video, np.ndarray):
            images = [
                PIL.Image.fromarray(video[i].astype(np.uint8))
                for i in range(video.shape[0])
            ]
        elif isinstance(video, list):
            images = video
        else:
            raise TypeError(
                "`video` must be a list of images or an np.ndarray, "
                f"got {type(video).__name__}")
    else:
        raise ValueError("Either `images` or `video` must be provided.")

    # Resolve both budgets before use: previously only `min_pixels` got the
    # None fallback, so `max_pixels=None` reached smart_resize() and failed
    # on the comparison there.
    if max_pixels is None:
        max_pixels = MAX_PIXELS
    if min_pixels is None:
        min_pixels = MIN_PIXELS
    min_pixels = min(max_pixels, min_pixels)

    images = [
        image.convert("RGB")
        if convert_to_rgb and image.mode != 'RGB' else image
        for image in images
    ]

    # Every frame is resized to the target derived from the FIRST frame, so
    # the target is computed once instead of once per frame.
    width, height = images[0].size
    resized_height, resized_width = self.smart_resize(
        height,
        width,
        factor=self.patch_size * self.hidden_stride,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )
    processed_images = [
        self.image_processor.preprocess(
            image,
            size=dict(height=resized_height, width=resized_width),
            return_tensors="np")['pixel_values'][0]
        for image in images
    ]

    patches = np.array(processed_images)
    if patches.shape[0] % self.temporal_patch_size != 0:
        # Repeat the last frame until the frame count is a multiple of the
        # temporal patch size.
        num_to_pad = self.temporal_patch_size - (patches.shape[0] %
                                                 self.temporal_patch_size)
        repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
        patches = np.concatenate([patches, repeats], axis=0)
    channel = patches.shape[1]
    grid_t = patches.shape[0] // self.temporal_patch_size
    grid_h = resized_height // self.patch_size
    grid_w = resized_width // self.patch_size

    # Split time / height / width into (block, within-block) axes, then
    # move the block axes to the front so each row is one patch.
    patches = patches.reshape(
        grid_t,
        self.temporal_patch_size,
        channel,
        grid_h // self.hidden_stride,
        self.hidden_stride,
        self.patch_size,
        grid_w // self.hidden_stride,
        self.hidden_stride,
        self.patch_size,
    )
    patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
    flatten_patches = patches.reshape(
        grid_t * grid_h * grid_w, channel * self.temporal_patch_size *
        self.patch_size * self.patch_size)

    visual_placeholders = self.construct_visual_placeholders(
        [grid_t, grid_h, grid_w], is_video)
    return torch.tensor(
        flatten_patches), visual_placeholders, torch.tensor(
            [[grid_t, grid_h, grid_w]])
|
||||
|
||||
|
||||
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
|
||||
104
vllm/transformers_utils/runai_utils.py
Normal file
104
vllm/transformers_utils/runai_utils.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
import signal
|
||||
from typing import Optional
|
||||
|
||||
from vllm import envs
|
||||
from vllm.assets.base import get_cache_dir
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
SUPPORTED_SCHEMES = ['s3://', 'gs://']
|
||||
|
||||
try:
|
||||
from runai_model_streamer import list_safetensors as runai_list_safetensors
|
||||
from runai_model_streamer import pull_files as runai_pull_files
|
||||
except (ImportError, OSError):
|
||||
# see https://github.com/run-ai/runai-model-streamer/issues/26
|
||||
# OSError will be raised on arm64 platform
|
||||
runai_model_streamer = PlaceholderModule(
|
||||
"runai_model_streamer") # type: ignore[assignment]
|
||||
runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
|
||||
runai_list_safetensors = runai_model_streamer.placeholder_attr(
|
||||
"list_safetensors")
|
||||
|
||||
|
||||
def list_safetensors(path: str = "") -> list[str]:
    """
    List safetensors files under an object storage path.

    Thin wrapper that delegates to ``runai_list_safetensors``.

    Args:
        path: The object storage path to list from.

    Returns:
        list[str]: Whatever ``runai_list_safetensors(path)`` returns.
    """
    listing = runai_list_safetensors(path)
    return listing
|
||||
|
||||
|
||||
def is_runai_obj_uri(model_or_path: str) -> bool:
    """Return True when the argument starts with one of SUPPORTED_SCHEMES.

    The comparison is done on the lower-cased string.
    """
    lowered = model_or_path.lower()
    return any(lowered.startswith(scheme) for scheme in SUPPORTED_SCHEMES)
|
||||
|
||||
|
||||
class ObjectStorageModel:
    """
    A class representing an ObjectStorage model mirrored into a
    temporary directory.

    Attributes:
        dir: The temporary created directory.

    Methods:
        pull_files(): Pull model from object storage to the temporary directory.
    """

    def __init__(self, url: str) -> None:
        """Create a fresh, empty cache directory derived from ``url``.

        The directory lives under ``get_cache_dir()/model_streamer`` and is
        named after the first 8 hex digits of the SHA-256 of ``url``; an
        existing directory of that name is removed first. When
        ``envs.VLLM_ASSETS_CACHE_MODEL_CLEAN`` is set, SIGINT/SIGTERM
        handlers are installed that delete the directory.
        """
        dir_name = os.path.join(
            get_cache_dir(), "model_streamer",
            hashlib.sha256(str(url).encode()).hexdigest()[:8])
        # Set before the handlers are installed so a signal that arrives
        # during initialisation finds the attribute `_close` reads.
        self.dir = dir_name

        if envs.VLLM_ASSETS_CACHE_MODEL_CLEAN:
            for sig in (signal.SIGINT, signal.SIGTERM):
                existing_handler = signal.getsignal(sig)
                signal.signal(sig, self._close_by_signal(existing_handler))

        if os.path.exists(dir_name):
            shutil.rmtree(dir_name)
        os.makedirs(dir_name)
        logger.debug("Init object storage, model cache path is: %s", dir_name)

    def _close(self) -> None:
        """Delete the cache directory if it exists."""
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)

    def _close_by_signal(self, existing_handler=None):
        """Return a signal handler that cleans up, then chains to the old one.

        Args:
            existing_handler: the value previously returned by
                ``signal.getsignal``. It is only invoked when it is callable.
        """

        def new_handler(signum, frame):
            self._close()
            # signal.getsignal() may return SIG_IGN / SIG_DFL (or None).
            # SIG_IGN is truthy but not callable, so a plain truthiness
            # test would raise TypeError inside the handler.
            # NOTE(review): with SIG_DFL the default action is not
            # re-raised, so the signal is swallowed after cleanup.
            if callable(existing_handler):
                existing_handler(signum, frame)

        return new_handler

    def pull_files(self,
                   model_path: str = "",
                   allow_pattern: Optional[list[str]] = None,
                   ignore_pattern: Optional[list[str]] = None) -> None:
        """
        Pull files from object storage into the temporary directory.

        Args:
            model_path: The object storage path of the model. A trailing
                "/" is appended when missing.
            allow_pattern: A list of patterns of which files to pull.
            ignore_pattern: A list of patterns of which files not to pull.

        """
        if not model_path.endswith("/"):
            model_path = model_path + "/"
        runai_pull_files(model_path, self.dir, allow_pattern, ignore_pattern)
|
||||
93
vllm/transformers_utils/s3_utils.py
Normal file
93
vllm/transformers_utils/s3_utils.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import fnmatch
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from botocore.client import BaseClient
|
||||
|
||||
try:
|
||||
import boto3
|
||||
except ImportError:
|
||||
boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
|
||||
|
||||
|
||||
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths if any(
|
||||
fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths
|
||||
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def glob(s3: Optional["BaseClient"] = None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
    List full ``s3://`` object names under a path, filtered by pattern.

    Args:
        s3: S3 client to use; a new ``boto3.client("s3")`` is created
            when omitted.
        path: The S3 path to list from. A trailing "/" is appended when
            missing.
        allow_pattern: A list of patterns of which files to keep.

    Returns:
        list[str]: ``s3://<bucket>/<key>`` for every key ``list_files``
        returns.
    """
    client = boto3.client("s3") if s3 is None else s3
    dir_path = path if path.endswith("/") else path + "/"
    bucket_name, _, keys = list_files(client,
                                      path=dir_path,
                                      allow_pattern=allow_pattern)
    return [f"s3://{bucket_name}/{key}" for key in keys]
|
||||
|
||||
|
||||
def list_files(
        s3: "BaseClient",
        path: str,
        allow_pattern: Optional[list[str]] = None,
        ignore_pattern: Optional[list[str]] = None
) -> tuple[str, str, list[str]]:
    """
    List files from S3 path and filter by pattern.

    All result pages are fetched: ``list_objects_v2`` is called repeatedly
    with the continuation token until the response is no longer marked as
    truncated.

    Args:
        s3: S3 client to use.
        path: The S3 path to list from, with or without the ``s3://``
            prefix.
        allow_pattern: A list of patterns of which files to pull.
        ignore_pattern: A list of patterns of which files not to pull.

    Returns:
        tuple[str, str, list[str]]: A tuple where:
            - The first element is the bucket name
            - The second element is the key prefix inside the bucket
            - The third element is the list of object keys left after
              filtering (keys ending in "/" are always dropped)
    """
    parts = path.removeprefix('s3://').split('/')
    prefix = '/'.join(parts[1:])
    bucket_name = parts[0]

    # A single list_objects_v2 call returns only one page of keys, so follow
    # the continuation token instead of silently truncating the listing.
    paths: list[str] = []
    request: dict = {"Bucket": bucket_name, "Prefix": prefix}
    while True:
        response = s3.list_objects_v2(**request)
        paths.extend(obj['Key'] for obj in response.get('Contents', []))
        next_token = response.get('NextContinuationToken')
        if not response.get('IsTruncated') or not next_token:
            break
        request["ContinuationToken"] = next_token

    # Drop directory marker keys.
    paths = _filter_ignore(paths, ["*/"])
    if allow_pattern is not None:
        paths = _filter_allow(paths, allow_pattern)

    if ignore_pattern is not None:
        paths = _filter_ignore(paths, ignore_pattern)

    return bucket_name, prefix, paths
|
||||
292
vllm/transformers_utils/tokenizer.py
Normal file
292
vllm/transformers_utils/tokenizer.py
Normal file
@@ -0,0 +1,292 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
import os
|
||||
import warnings
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import (
|
||||
get_sentence_transformer_tokenizer_config)
|
||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerBase
|
||||
else:
|
||||
ModelConfig = Any
|
||||
LoRARequest = Any
|
||||
TokenizerBase = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||
TokenizerBase]
|
||||
|
||||
|
||||
def decode_tokens(
    tokenizer: AnyTokenizer,
    token_ids: list[int],
    *,
    skip_special_tokens: Optional[bool] = None,
) -> str:
    """
    Backend-agnostic equivalent of HF's
    `tokenizer.decode(token_ids, ...)`.

    When `skip_special_tokens` is `None` the keyword is not passed at all,
    so the backend's own default applies.
    """
    if skip_special_tokens is None:
        return tokenizer.decode(token_ids)

    return tokenizer.decode(token_ids,
                            skip_special_tokens=skip_special_tokens)
|
||||
|
||||
|
||||
def encode_tokens(
    tokenizer: AnyTokenizer,
    text: str,
    *,
    truncation: Optional[bool] = None,
    max_length: Optional[int] = None,
    add_special_tokens: Optional[bool] = None,
) -> list[int]:
    """
    Encode `text` with any supported tokenizer backend, mirroring HF's
    `tokenizer.encode(text, ...)`.

    Every option left at `None` is omitted from the call, so the backend
    applies its own default for it (in particular for
    `add_special_tokens`).
    """
    # Keep the forwarding order stable: max_length, truncation,
    # add_special_tokens.
    candidate_options = (
        ("max_length", max_length),
        ("truncation", truncation),
        ("add_special_tokens", add_special_tokens),
    )
    encode_kwargs: dict[str, Any] = {
        option: value
        for option, value in candidate_options if value is not None
    }

    return tokenizer.encode(text, **encode_kwargs)
|
||||
|
||||
|
||||
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
    """
    Return a shallow copy of `tokenizer` whose frequently used lookups
    are served from values captured once at wrap time.

    The special-token lists, the vocabulary, the length and a derived
    `max_token_id` are read from `tokenizer` a single time; the returned
    object is re-classed to a dynamically created subclass that answers
    those lookups from the captured values.
    """
    proxy = copy.copy(tokenizer)

    # Snapshot everything the proxy will serve.
    special_ids = tokenizer.all_special_ids
    special_tokens = tokenizer.all_special_tokens
    special_tokens_extended = tokenizer.all_special_tokens_extended
    vocab = tokenizer.get_vocab()
    length = len(tokenizer)

    highest_id = max(vocab.values())
    # Some tokenizers (e.g., QwenTokenizer) count special tokens in
    # `vocab_size` without listing them in get_vocab(); when `vocab_size`
    # is implemented, take whichever value is larger.
    if hasattr(tokenizer, "vocab_size"):
        try:
            highest_id = max(highest_id, tokenizer.vocab_size)
        except NotImplementedError:
            pass

    class CachedTokenizer(tokenizer.__class__):  # type: ignore

        @property
        def all_special_ids(self) -> list[int]:
            return special_ids

        @property
        def all_special_tokens(self) -> list[str]:
            return special_tokens

        @property
        def all_special_tokens_extended(self) -> list[str]:
            return special_tokens_extended

        @property
        def max_token_id(self) -> int:
            return highest_id

        def get_vocab(self) -> dict[str, int]:
            return vocab

        def __len__(self) -> int:
            return length

        def __reduce__(self):
            # Pickle support: rebuild the proxy by wrapping the original
            # tokenizer again on load.
            return get_cached_tokenizer, (tokenizer, )

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

    proxy.__class__ = CachedTokenizer
    return proxy
|
||||
|
||||
|
||||
def get_tokenizer(
    tokenizer_name: Union[str, Path],
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> AnyTokenizer:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.

    `tokenizer_mode` picks the backend: "mistral" loads a
    `MistralTokenizer`, "custom" goes through `TokenizerRegistry`, and any
    other value uses `AutoTokenizer` ("slow" additionally forces
    `use_fast=False`). Only tokenizers loaded through `AutoTokenizer` are
    wrapped with `get_cached_tokenizer`.
    """
    if envs.VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # avoid circular import
        from vllm.model_executor.model_loader.weight_utils import get_lock

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            # The file lock keeps several processes from downloading the
            # same files at the same time.
            with get_lock(tokenizer_name, download_dir):
                tokenizer_name = snapshot_download(
                    model_id=tokenizer_name,
                    cache_dir=download_dir,
                    revision=revision,
                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                    # Ignore weights - we only need the tokenizer.
                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    kwargs.setdefault("truncation_side", "left")

    # Separate model folder from file path for GGUF models
    if check_gguf_file(tokenizer_name):
        gguf_path = Path(tokenizer_name)
        kwargs["gguf_file"] = gguf_path.name
        tokenizer_name = gguf_path.parent

    # if tokenizer is from official mistral org
    org_name = str(tokenizer_name).split("/")[0]
    if org_name == "mistralai" and tokenizer_mode != "mistral":
        warnings.warn(
            'It is strongly recommended to run mistral models with '
            '`--tokenizer-mode "mistral"` to ensure correct '
            'encoding and decoding.',
            FutureWarning,
            stacklevel=2)

    tokenizer: AnyTokenizer
    if tokenizer_mode == "mistral":
        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                     revision=revision)
    elif tokenizer_mode == "custom":
        from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
        tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
                                                    *args,
                                                    revision=revision,
                                                    download_dir=download_dir,
                                                    **kwargs)
    else:
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs,
            )
        except ValueError as e:
            # When the failure is about a tokenizer class that does not
            # exist / is not imported, or that needs remote code, point the
            # user at the --trust-remote-code flag.
            message = str(e)
            needs_remote_code = (
                "does not exist or is not currently imported." in message
                or "requires you to execute the tokenizer file" in message)
            if trust_remote_code or not needs_remote_code:
                raise

            err_msg = ("Failed to load the tokenizer. If the tokenizer "
                       "is a custom tokenizer not yet available in the "
                       "HuggingFace transformers library, consider "
                       "setting `trust_remote_code=True` in LLM or using "
                       "the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e

        # The special_tokens in tokenizer should also be
        # controlled by do_lower_case in encoder_config
        encoder_config = get_sentence_transformer_tokenizer_config(
            tokenizer_name, revision)
        lower_case = (isinstance(encoder_config, dict)
                      and encoder_config.get("do_lower_case", False))
        if lower_case:
            lowered_special_tokens = {
                key: value.lower()
                for key, value in tokenizer.special_tokens_map.items()
            }
            tokenizer.add_special_tokens(lowered_special_tokens)

        if not isinstance(tokenizer, PreTrainedTokenizerFast):
            logger.warning(
                "Using a slow tokenizer. This might cause a significant "
                "slowdown. Consider using a fast tokenizer instead.")
        tokenizer = get_cached_tokenizer(tokenizer)

    return tokenizer
|
||||
|
||||
|
||||
# Memoized variant of `get_tokenizer`: calls with identical arguments reuse
# the previously built tokenizer. `functools.lru_cache` requires every
# argument to be hashable.
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
|
||||
def cached_tokenizer_from_config(
    model_config: ModelConfig,
    **kwargs: Any,
):
    """Fetch the tokenizer described by `model_config` via the memoized
    `cached_get_tokenizer`.

    The tokenizer name, mode, revision and `trust_remote_code` flag come
    from `model_config`; extra keyword arguments are forwarded unchanged.
    """
    name = model_config.tokenizer
    mode = model_config.tokenizer_mode
    tokenizer_revision = model_config.tokenizer_revision
    allow_remote_code = model_config.trust_remote_code

    # Keyword order is kept stable because it is part of the cache key.
    return cached_get_tokenizer(
        name,
        tokenizer_mode=mode,
        revision=tokenizer_revision,
        trust_remote_code=allow_remote_code,
        **kwargs,
    )
|
||||
|
||||
|
||||
def init_tokenizer_from_configs(model_config: ModelConfig):
    """Build a (non-memoized) tokenizer for `model_config`.

    The truncation side follows the runner type: "left" for the
    "generate" and "draft" runners, "right" for the "pooling" runner.
    """
    runner_type = model_config.runner_type
    if runner_type == "pooling":
        truncation_side = "right"
    elif runner_type == "generate" or runner_type == "draft":
        truncation_side = "left"
    else:
        # Exhaustiveness guard for the runner-type values.
        assert_never(runner_type)

    return get_tokenizer(
        model_config.tokenizer,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
        revision=model_config.tokenizer_revision,
        truncation_side=truncation_side,
    )
|
||||
154
vllm/transformers_utils/tokenizer_base.py
Normal file
154
vllm/transformers_utils/tokenizer_base.py
Normal file
@@ -0,0 +1,154 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
|
||||
|
||||
class TokenizerBase(ABC):
    """Abstract interface that every tokenizer backend must implement.

    All members except `__len__` are abstract; `__len__` is defined in
    terms of `vocab_size`.
    """

    # ---- special-token metadata -------------------------------------

    @property
    @abstractmethod
    def all_special_tokens_extended(self) -> list[str]:
        """Extended list of special tokens."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_tokens(self) -> list[str]:
        """List of special tokens."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_special_ids(self) -> list[int]:
        """Ids of the special tokens."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def bos_token_id(self) -> int:
        """Id of the beginning-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def eos_token_id(self) -> int:
        """Id of the end-of-sequence token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def sep_token(self) -> str:
        """Separator token."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def pad_token(self) -> str:
        """Padding token."""
        raise NotImplementedError()

    # ---- general properties -----------------------------------------

    @property
    @abstractmethod
    def is_fast(self) -> bool:
        """Whether the backend is a fast tokenizer."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_token_id(self) -> int:
        """Largest token id."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def truncation_side(self) -> str:
        """Side on which sequences are truncated."""
        raise NotImplementedError()

    def __len__(self) -> int:
        """Length of the tokenizer, defined as `vocab_size`."""
        return self.vocab_size

    # ---- encoding ----------------------------------------------------

    @abstractmethod
    def __call__(
        self,
        text: Union[str, list[str], list[int]],
        text_pair: Optional[str] = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        """Tokenize `text` (a prompt, a list of prompts, or token ids)."""
        raise NotImplementedError()

    @abstractmethod
    def get_vocab(self) -> dict[str, int]:
        """Mapping from token string to token id."""
        raise NotImplementedError()

    @abstractmethod
    def get_added_vocab(self) -> dict[str, int]:
        """Mapping from added token string to token id."""
        raise NotImplementedError()

    @abstractmethod
    def encode_one(
        self,
        text: str,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ) -> list[int]:
        """Encode a single prompt into token ids."""
        raise NotImplementedError()

    @abstractmethod
    def encode(self,
               text: str,
               truncation: Optional[bool] = None,
               max_length: Optional[int] = None,
               add_special_tokens: Optional[bool] = None) -> list[int]:
        """Encode `text` into token ids."""
        raise NotImplementedError()

    @abstractmethod
    def apply_chat_template(self,
                            messages: list["ChatCompletionMessageParam"],
                            tools: Optional[list[dict[str, Any]]] = None,
                            **kwargs) -> list[int]:
        """Turn chat `messages` (and optional `tools`) into token ids."""
        raise NotImplementedError()

    # ---- decoding ----------------------------------------------------

    @abstractmethod
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join token strings back into text."""
        raise NotImplementedError()

    @abstractmethod
    def decode(self,
               ids: Union[list[int], int],
               skip_special_tokens: bool = True) -> str:
        """Decode one id or a list of ids into text."""
        raise NotImplementedError()

    @abstractmethod
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = True,
    ) -> list[str]:
        """Map token ids to their token strings."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class TokenizerRegistry:
    """Name-based registry of lazily imported tokenizer classes."""

    # Tokenizer name -> (tokenizer module, tokenizer class)
    REGISTRY: dict[str, tuple[str, str]] = {}

    @staticmethod
    def register(name: str, module: str, class_name: str) -> None:
        """Record under `name` where the tokenizer class can be imported
        from; an existing entry with the same name is replaced."""
        entry = (module, class_name)
        TokenizerRegistry.REGISTRY[name] = entry

    @staticmethod
    def get_tokenizer(
        tokenizer_name: str,
        *args,
        **kwargs,
    ) -> TokenizerBase:
        """Import the class registered as `tokenizer_name` and return
        `cls.from_pretrained(*args, **kwargs)`.

        `tokenizer_name` is only the registry key; it is not passed on to
        `from_pretrained`.

        Raises:
            ValueError: if nothing is registered under `tokenizer_name`.
        """
        location = TokenizerRegistry.REGISTRY.get(tokenizer_name)
        if location is None:
            raise ValueError(f"Tokenizer {tokenizer_name} not found.")

        module_path, class_name = location[0], location[1]
        tokenizer_class = getattr(importlib.import_module(module_path),
                                  class_name)
        return tokenizer_class.from_pretrained(*args, **kwargs)
|
||||
10
vllm/transformers_utils/tokenizers/__init__.py
Normal file
10
vllm/transformers_utils/tokenizers/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
|
||||
truncate_tool_call_ids, validate_request_params)
|
||||
|
||||
__all__ = [
|
||||
"MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
|
||||
"validate_request_params"
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
521
vllm/transformers_utils/tokenizers/mistral.py
Normal file
521
vllm/transformers_utils/tokenizers/mistral.py
Normal file
@@ -0,0 +1,521 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union, cast
|
||||
|
||||
import huggingface_hub
|
||||
import regex as re
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
from transformers.tokenization_utils_base import BatchEncoding
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer_base import TokenizerBase
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# make sure `mistral_common` is lazy imported,
|
||||
# so that users who only use non-mistral models
|
||||
# will not be bothered by the dependency.
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import (
|
||||
MistralTokenizer as PublicMistralTokenizer)
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
    """Materialize `tool_calls` of every assistant message into a list.

    Each assistant message in `request.messages` gets its `tool_calls`
    entry replaced, in place, by a plain list of its items. A missing (or
    `None`) entry becomes an empty list. Other messages are untouched.
    """
    # SEE: https://github.com/vllm-project/vllm/pull/9951
    # Credits go to: @gcalmettes
    # NOTE: There is currently a bug in pydantic where attributes
    # declared as iterables are replaced in the instances by
    # pydantic-core ValidatorIterator instance. In particular, this
    # affects tool_calls defined in ChatCompletionAssistantMessageParam
    # model:
    # see:
    #   - https://github.com/pydantic/pydantic/issues/9467
    # As a result, tool_calls from assistant messages are never
    # deserialized in the request object if the tool_calls iterator is
    # not consumed. This affects messages passed to the MistralTokenizer
    # since no chat template is applied and therefore the tool_calls
    # iterator is not directly consumed.
    # Issue is tracked on Pydantic side, with resolution planned for
    # v2.11 release. In the meantime, the official workaround is to
    # consume the iterator so the tool_calls are correctly deserialized
    # in the OpenAI ChatCompletionAssistantMessageParam object
    # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
    # Official Pydantic Issues:
    #   - https://github.com/pydantic/pydantic/issues/9541
    # TODO: remove when pydantic v2.11 is released
    for message in request.messages:
        if message.get("role") != 'assistant':
            continue

        tool_calls = message.get("tool_calls")
        if tool_calls is None:
            tool_calls = ()

        # `list()` consumes one-shot iterators and also accepts values that
        # are already plain lists; calling `next()` directly on a list
        # would raise TypeError.
        message["tool_calls"] = list(tool_calls)
|
||||
|
||||
|
||||
def truncate_tool_call_ids(request: "ChatCompletionRequest"):
    """Truncates tool call IDs for Mistral's ID requirements.

    IDs longer than 9 characters are cut down to their last 9 characters
    (with a warning), both on the tool calls of assistant messages and on
    the `tool_call_id` of "tool" / "tool_results" messages. Messages are
    modified in place.
    """
    for index, message in enumerate(request.messages):
        role = message.get("role")

        if role == 'assistant':
            calls = message.get("tool_calls", [])
            for call in calls:
                call_id = call["id"]
                if len(call_id) <= 9:
                    continue
                shortened = call_id[-9:]
                logger.warning(
                    "Truncating tool call ID: %s to %s",
                    call_id,
                    shortened,
                )
                call["id"] = shortened

            # Written back even when the key was missing, so assistant
            # messages always end up with a `tool_calls` entry.
            request.messages[index]["tool_calls"] = calls

        elif role in {"tool_results", "tool"} and "tool_call_id" in message:
            result_id = message["tool_call_id"]
            if len(result_id) > 9:
                shortened = result_id[-9:]
                logger.warning(
                    "Truncating tool_call_id: %s to %s",
                    result_id,
                    shortened,
                )
                result_id = shortened
            request.messages[index]["tool_call_id"] = result_id
|
||||
|
||||
|
||||
def validate_request_params(request: "ChatCompletionRequest"):
    """Reject request options that Mistral tokenizers cannot honor.

    Raises:
        ValueError: if `request.skip_special_tokens` is set to a falsy
            value other than `None`.
    """
    skip_special_tokens = request.skip_special_tokens
    if skip_special_tokens is None or skip_special_tokens:
        return

    raise ValueError("skip_special_tokens=False is not supported "
                     "for Mistral tokenizers.")
|
||||
|
||||
|
||||
def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
    """List the files of a repo snapshot found in the local HF hub cache.

    Args:
        repo_id: Hub repo id, e.g. "org/name".
        revision: Snapshot directory name to look up. When `None`, the
            revision recorded in the cached `refs/main` file is used, if
            that file exists.

    Returns:
        The names of the entries in the snapshot directory, or an empty
        list when no revision could be resolved or the snapshot directory
        does not exist.
    """
    repo_cache = os.path.join(
        huggingface_hub.constants.HF_HUB_CACHE,
        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
            ["models", *repo_id.split("/")]))

    if revision is None:
        revision_file = os.path.join(repo_cache, "refs", "main")
        if os.path.isfile(revision_file):
            with open(revision_file, encoding="utf-8") as file:
                # Strip surrounding whitespace: a trailing newline in the
                # ref file would otherwise end up in the snapshot path and
                # make the directory lookup below miss.
                revision = file.read().strip()

    if revision:
        revision_dir = os.path.join(repo_cache, "snapshots", revision)
        if os.path.isdir(revision_dir):
            return os.listdir(revision_dir)

    return []
|
||||
|
||||
|
||||
def find_tokenizer_file(files: list[str]):
    """Return the single Mistral tokenizer file name contained in `files`.

    A name qualifies when it is `tekken.json` or starts with
    `tokenizer.model.v` / `tokenizer.mm.model.v`.

    Raises:
        OSError: if no name, or more than one name, qualifies.
    """
    file_pattern = re.compile(
        r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$")

    matched_files = list(filter(file_pattern.match, files))
    match_count = len(matched_files)

    if match_count == 0:
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
            f"tokenizer is present in {files}.")
    if match_count > 1:
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral "
            f"tokenizer is present in {files}.")

    return matched_files[0]
|
||||
|
||||
|
||||
def _aggregate_content(content: list) -> list[dict[str, Any]]:
|
||||
aggregated_content: list[dict[str, Any]] = []
|
||||
for chunk in content:
|
||||
if chunk.get("type"
|
||||
) == "text" and aggregated_content and aggregated_content[
|
||||
-1].get("type") == "text":
|
||||
aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
|
||||
else:
|
||||
aggregated_content.append(chunk)
|
||||
if len(aggregated_content) == 1 and aggregated_content[0].get(
|
||||
"type") == "text":
|
||||
content = aggregated_content[0]["text"]
|
||||
return content
|
||||
|
||||
|
||||
def make_mistral_chat_completion_request(
        messages: list["ChatCompletionMessageParam"],
        tools: Optional[list[dict[str,
                                  Any]]] = None) -> "ChatCompletionRequest":
    """Build a mistral-common `ChatCompletionRequest` from OpenAI-style
    `messages` and `tools`.

    `messages` and `tools` are normalized in place before the request is
    created: a trailing assistant message is flagged with `prefix=True`,
    `reasoning_content` is dropped, list content of assistant/tool
    messages is aggregated, and function tools get empty defaults for
    `parameters` and `description`.
    """
    final_message = cast(dict[str, Any], messages[-1])
    if final_message["role"] == "assistant":
        final_message["prefix"] = True

    # mistral-common requires AssistantMessage content to be string [1].
    #
    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
    for message in messages:
        # reasoning_content is not supported by Mistral, so drop it.
        message.pop("reasoning_content", None)  # type: ignore

        if message.get("role") not in ("assistant", "tool"):
            continue

        # Turn list-of-chunks content into a string where possible.
        content: Any = message.get("content")
        if isinstance(content, list):
            content = _aggregate_content(content)
        message["content"] = content

    # Unlike the OpenAI client, the Mistral client needs the "parameters"
    # dict and the "description" string to be present even when empty.
    if tools:
        function_specs = [
            tool["function"] for tool in tools if tool["type"] == "function"
        ]
        for spec in function_specs:
            if spec.get("parameters") is None:
                spec["parameters"] = {}
            if spec.get("description") is None:
                spec["description"] = ""

    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    return ChatCompletionRequest(messages=messages,
                                 tools=tools)  # type: ignore[type-var]
|
||||
|
||||
|
||||
class MistralTokenizer(TokenizerBase):
|
||||
|
||||
def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
|
||||
self.mistral = tokenizer
|
||||
self.instruct = tokenizer.instruct_tokenizer
|
||||
_mistral_version_str = self.instruct.tokenizer.version.value
|
||||
self.version: int = int(_mistral_version_str.split("v")[-1])
|
||||
|
||||
tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
||||
|
||||
self.is_tekken = isinstance(tokenizer_, Tekkenizer)
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer)
|
||||
self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
|
||||
self._special_token_policy = (SpecialTokenPolicy.IGNORE
|
||||
if self.is_tekken else None)
|
||||
if not (self.is_tekken or self.is_spm):
|
||||
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
|
||||
|
||||
self._vocab = tokenizer_.vocab()
|
||||
# Convert to a dict[str, int] to match protocol, but this is a lossy
|
||||
# conversion. There may be multiple token ids that decode to the same
|
||||
# string due to partial UTF-8 byte sequences being converted to <20>
|
||||
self._vocab_dict = {
|
||||
token: idx
|
||||
for idx, token in enumerate(self._vocab)
|
||||
}
|
||||
self.tokenizer = tokenizer_
|
||||
self._max_token_id = self.vocab_size - 1
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
path_or_repo_id: str,
|
||||
*,
|
||||
revision: Optional[str] = None) -> "MistralTokenizer":
|
||||
if not Path(path_or_repo_id).exists():
|
||||
assert len(path_or_repo_id.split("/")) == 2, (
|
||||
"You have either provided a non-existent path: "
|
||||
"{path_or_repo_id} or an invalid HF Hub repo id.")
|
||||
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
|
||||
path_or_repo_id, revision)
|
||||
elif Path(path_or_repo_id).is_dir():
|
||||
tokenizer_file_name = find_tokenizer_file(
|
||||
os.listdir(path_or_repo_id))
|
||||
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
|
||||
else:
|
||||
assert Path(
|
||||
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
|
||||
tokenizer_file = str(Path(path_or_repo_id))
|
||||
|
||||
from mistral_common.tokens.tokenizers.mistral import (
|
||||
MistralTokenizer as PublicMistralTokenizer)
|
||||
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
|
||||
return cls(mistral_tokenizer)
|
||||
|
||||
@staticmethod
|
||||
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
|
||||
revision: Optional[str]) -> str:
|
||||
try:
|
||||
hf_api = HfApi()
|
||||
files = hf_api.list_repo_files(repo_id=tokenizer_name,
|
||||
revision=revision)
|
||||
except ConnectionError as exc:
|
||||
files = list_local_repo_files(repo_id=tokenizer_name,
|
||||
revision=revision)
|
||||
|
||||
if len(files) == 0:
|
||||
raise exc
|
||||
|
||||
filename = find_tokenizer_file(files)
|
||||
|
||||
tokenizer_file = hf_hub_download(tokenizer_name,
|
||||
filename=filename,
|
||||
revision=revision)
|
||||
return tokenizer_file
|
||||
|
||||
# the following attributes are set to fit vLLM's design and are used
|
||||
# by the structured output backends.
|
||||
@property
|
||||
def all_special_tokens_extended(self) -> list[str]:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
|
||||
# tekken defines its own extended special tokens list
|
||||
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
|
||||
special_tokens = self.tokenizer.SPECIAL_TOKENS
|
||||
else:
|
||||
special_tokens = list(SpecialTokens)
|
||||
return [
|
||||
s.value if isinstance(s, SpecialTokens) else s
|
||||
for s in special_tokens
|
||||
]
|
||||
|
||||
@property
|
||||
def all_special_tokens(self) -> list[str]:
|
||||
return self.all_special_tokens_extended
|
||||
|
||||
@property
|
||||
def all_special_ids(self) -> list[int]:
|
||||
return [
|
||||
self.all_special_tokens.index(t) for t in self.all_special_tokens
|
||||
]
|
||||
|
||||
@property
|
||||
def bos_token_id(self) -> int:
|
||||
return self.tokenizer.bos_id
|
||||
|
||||
@property
|
||||
def eos_token_id(self) -> int:
|
||||
return self.tokenizer.eos_id
|
||||
|
||||
@property
|
||||
def sep_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def pad_token(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self._vocab)
|
||||
|
||||
@property
|
||||
def max_token_id(self) -> int:
|
||||
return self._max_token_id
|
||||
|
||||
@property
|
||||
def truncation_side(self) -> str:
|
||||
raise NotImplementedError()
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.vocab_size
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: Union[str, list[str], list[int]],
|
||||
text_pair: Optional[str] = None,
|
||||
add_special_tokens: bool = False,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
):
|
||||
input_ids: Union[list[int], list[list[int]]]
|
||||
# For list[str], original prompt text
|
||||
if is_list_of(text, str):
|
||||
input_ids_: list[list[int]] = []
|
||||
for p in text:
|
||||
each_input_ids = self.encode_one(p, truncation, max_length)
|
||||
input_ids_.append(each_input_ids)
|
||||
input_ids = input_ids_
|
||||
# For list[int], apply chat template output, already tokens.
|
||||
elif is_list_of(text, int):
|
||||
input_ids = text
|
||||
# For str, single prompt text
|
||||
else:
|
||||
input_ids = self.encode_one(text, truncation, max_length)
|
||||
return BatchEncoding({"input_ids": input_ids})
|
||||
|
||||
def get_vocab(self) -> dict[str, int]:
|
||||
# NB: the dictionary form of the vocabulary collapses token ids that map
|
||||
# to the same string but have different bytes
|
||||
return self._vocab_dict
|
||||
|
||||
def get_added_vocab(self) -> dict[str, int]:
|
||||
# Mistral tokenizers have no added vocabulary
|
||||
return {}
|
||||
|
||||
def encode_one(
|
||||
self,
|
||||
text: str,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
) -> list[int]:
|
||||
# Mistral Tokenizers should not add special tokens
|
||||
input_ids = self.encode(text)
|
||||
|
||||
if truncation:
|
||||
input_ids = input_ids[:max_length]
|
||||
return input_ids
|
||||
|
||||
def encode(self,
|
||||
text: str,
|
||||
truncation: Optional[bool] = None,
|
||||
max_length: Optional[int] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> list[int]:
|
||||
# `encode` should only be used for prompt completion
|
||||
# it should never be used for chat_completion.
|
||||
# For chat completion use `apply_chat_template`
|
||||
if add_special_tokens is not None:
|
||||
return self.tokenizer.encode(text,
|
||||
bos=add_special_tokens,
|
||||
eos=add_special_tokens)
|
||||
else:
|
||||
return self.tokenizer.encode(text, bos=True, eos=False)
|
||||
|
||||
def apply_chat_template(self,
|
||||
messages: list["ChatCompletionMessageParam"],
|
||||
tools: Optional[list[dict[str, Any]]] = None,
|
||||
**kwargs) -> list[int]:
|
||||
|
||||
request = make_mistral_chat_completion_request(messages, tools)
|
||||
encoded = self.mistral.encode_chat_completion(request)
|
||||
|
||||
# encode-decode to get clean prompt
|
||||
return encoded.tokens
|
||||
|
||||
def convert_tokens_to_string(self, tokens: list[str]) -> str:
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokens
|
||||
if self.is_tekken:
|
||||
tokens = [
|
||||
t for t in tokens
|
||||
if (t is SpecialTokens.tool_calls
|
||||
or t not in self.tokenizer._all_special_tokens)
|
||||
]
|
||||
|
||||
if any(isinstance(t, bytes) for t in tokens):
|
||||
# we need to encode and decode all tokens again
|
||||
shift = self.tokenizer.num_special_tokens
|
||||
|
||||
def _token_to_id(t: str):
|
||||
t_bytes = t.encode("utf-8") \
|
||||
if not isinstance(t, bytes) else t
|
||||
try:
|
||||
return shift + \
|
||||
self.tokenizer._tekken_token2id_nospecial[t_bytes]
|
||||
except KeyError:
|
||||
logger.warning(
|
||||
"Failed to convert token %s to id,"
|
||||
" replacing with <unk>", t_bytes)
|
||||
return self.tokenizer.unk_id
|
||||
|
||||
ids = [_token_to_id(t) for t in tokens]
|
||||
decoded = self.tokenizer.decode(ids,
|
||||
self._special_token_policy)
|
||||
else:
|
||||
decoded = "".join(tokens)
|
||||
else:
|
||||
# make sure certain special tokens like Tool calls are
|
||||
# not decoded
|
||||
special_tokens = {SpecialTokens.tool_calls}
|
||||
regular_tokens: list[str] = []
|
||||
decoded_list = []
|
||||
|
||||
for token in tokens:
|
||||
if token in special_tokens:
|
||||
if regular_tokens:
|
||||
decoded_list.append(
|
||||
self.tokenizer.decode(regular_tokens,
|
||||
self._special_token_policy))
|
||||
regular_tokens = []
|
||||
decoded_list.append(token)
|
||||
else:
|
||||
regular_tokens.append(token)
|
||||
|
||||
if regular_tokens:
|
||||
decoded_list.append(
|
||||
self.tokenizer.decode(regular_tokens,
|
||||
self._special_token_policy))
|
||||
|
||||
decoded = ''.join(decoded_list)
|
||||
|
||||
return decoded
|
||||
|
||||
def decode(self,
|
||||
ids: Union[list[int], int],
|
||||
skip_special_tokens: bool = True) -> str:
|
||||
assert (
|
||||
skip_special_tokens
|
||||
), "skip_special_tokens=False is not supported for Mistral tokenizers."
|
||||
|
||||
if isinstance(ids, int):
|
||||
ids = [ids]
|
||||
return self.tokenizer.decode(ids, self._special_token_policy)
|
||||
|
||||
def convert_ids_to_tokens(
    self,
    ids: list[int],
    skip_special_tokens: bool = True,
) -> list[str]:
    """Convert token ids into their individual token pieces.

    For tekken tokenizers, special tokens are dropped first, except the
    tool-call control token and (for ``InstructTokenizerV13``) the
    begin/end think tokens. If any resulting piece contains the Unicode
    replacement character, all pieces are re-read as byte pieces instead.

    :param ids: token ids to convert
    :param skip_special_tokens: must be truthy; a falsy value trips the
        assertion below
    :return: one piece per kept id (byte pieces in the tekken fallback path)
    """
    from mistral_common.tokens.tokenizers.base import SpecialTokens
    from mistral_common.tokens.tokenizers.instruct import (
        InstructTokenizerV13)

    # TODO(Patrick) - potentially allow special tokens to not be skipped
    assert (
        skip_special_tokens
    ), "skip_special_tokens=False is not supported for Mistral tokenizers."

    assert self.is_tekken or self.is_spm, type(self.tokenizer)

    if self.is_tekken:
        # skip special tokens except tool call and think tokens
        non_skip_special_tokens = {
            self.tokenizer.get_control_token(SpecialTokens.tool_calls)
        }
        if isinstance(self.instruct, InstructTokenizerV13):
            if self.instruct.BEGIN_THINK:
                non_skip_special_tokens.add(self.instruct.BEGIN_THINK)
            if self.instruct.END_THINK:
                non_skip_special_tokens.add(self.instruct.END_THINK)
        # NOTE(review): `>` also filters out the id that is exactly equal to
        # num_special_tokens; confirm that id is really a special token,
        # otherwise this comparison should be `>=`.
        ids = [
            i for i in ids if i > self.tokenizer.num_special_tokens
            or i in non_skip_special_tokens
        ]

    tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids]

    # "\ufffd" is the Unicode replacement character. It is written as an
    # escape so that the literal cannot be mangled by a re-encoding of this
    # file.
    if self.is_tekken and any("\ufffd" in t for t in tokens):
        # if a decoded token contains the replacement character, then the
        # token has an incomplete UTF-8 character so we must use bytes
        # See: https://github.com/vllm-project/vllm/pull/8640
        #      https://github.com/vllm-project/vllm/pull/9625
        # if the underlying tokenizer is sentencepiece, we just keep the
        # replacement character
        tokens = [
            self.tokenizer.id_to_byte_piece(token_id,
                                            self._special_token_policy)
            for token_id in ids
        ]

    return tokens
|
||||
108
vllm/transformers_utils/utils.py
Normal file
108
vllm/transformers_utils/utils.py
Normal file
@@ -0,0 +1,108 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import struct
|
||||
from functools import cache
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from vllm.envs import VLLM_MODEL_REDIRECT_PATH
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def is_s3(model_or_path: str) -> bool:
    """Return True if ``model_or_path`` starts with ``s3://`` (any case)."""
    lowered = model_or_path.lower()
    return lowered.startswith('s3://')
|
||||
|
||||
|
||||
def check_gguf_file(model: Union[str, PathLike]) -> bool:
    """Report whether ``model`` points at a GGUF model file.

    Anything that is not an existing regular file is rejected. A ``.gguf``
    suffix is accepted without opening the file; otherwise the first four
    bytes must equal ``b"GGUF"``. Errors while reading are logged at debug
    level and reported as ``False``.
    """
    model = Path(model)
    if not model.is_file():
        return False
    if model.suffix == ".gguf":
        return True

    try:
        with model.open("rb") as stream:
            magic = stream.read(4)
    except Exception as e:
        logger.debug("Error reading file %s: %s", model, e)
        return False
    return magic == b"GGUF"
|
||||
|
||||
|
||||
def modelscope_list_repo_files(
    repo_id: str,
    revision: Optional[str] = None,
    token: Union[str, bool, None] = None,
) -> list[str]:
    """List files in a modelscope repo.

    Logs in through ``HubApi`` with ``token``, fetches the recursive file
    listing for ``repo_id`` at ``revision`` and keeps the ``Path`` of every
    entry whose ``Type`` is ``'blob'``.
    """
    from modelscope.hub.api import HubApi

    hub = HubApi()
    hub.login(token)
    entries = hub.get_model_files(model_id=repo_id,
                                  revision=revision,
                                  recursive=True)
    # same as huggingface_hub.list_repo_files
    return [entry['Path'] for entry in entries if entry['Type'] == 'blob']
|
||||
|
||||
|
||||
def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
    """Best-effort load of a JSON object from ``path``.

    :param path: file to read
    :return: the parsed JSON object, or an empty dict when the content
        cannot be read or parsed as JSON, or when its top-level value is
        not a JSON object
    :raises OSError: if the file cannot be opened
    """
    with open(path) as f:
        try:
            parsed = json.loads(f.read())
        except Exception:
            # Deliberately best-effort: the file may simply be in another
            # format, so a failed parse means "no JSON mapping here".
            return {}

    # Valid JSON is not necessarily an object (e.g. a list, string or
    # number). Only a dict honours the declared return type.
    if not isinstance(parsed, dict):
        return {}
    return parsed
|
||||
|
||||
|
||||
def _maybe_space_split_dict(path: Union[str, PathLike]) -> dict[str, str]:
    """Parse ``path`` as lines of whitespace-separated ``name redirect`` pairs.

    Lines that do not split into exactly two fields are ignored. When a name
    appears more than once, the last occurrence wins.
    """
    redirects: dict[str, str] = {}
    with open(path) as f:
        for raw_line in f:
            fields = raw_line.split()
            if len(fields) != 2:
                # blank or malformed line: not a mapping, skip it
                continue
            source_name, target_name = fields
            redirects[source_name] = target_name
    return redirects
|
||||
|
||||
|
||||
@cache
def maybe_model_redirect(model: str) -> str:
    """
    Map a model name to a local folder through the model redirect file.

    The file named by ``VLLM_MODEL_REDIRECT_PATH`` is read as JSON first;
    if that yields nothing it is read as whitespace-separated pairs.

    :param model: hf model name
    :return: the redirect target when a non-empty one is configured for
        ``model``, otherwise ``model`` unchanged
    """
    redirect_file = VLLM_MODEL_REDIRECT_PATH

    # No redirect file configured, or it is missing: nothing to redirect.
    if not redirect_file or not Path(redirect_file).exists():
        return model

    mapping = _maybe_json_dict(redirect_file)
    if not mapping:
        mapping = _maybe_space_split_dict(redirect_file)

    target = mapping.get(model)
    if not target:
        return model

    logger.info("model redirect: [ %s ] -> [ %s ]", model, target)
    return target
|
||||
|
||||
|
||||
def parse_safetensors_file_metadata(
        path: Union[str, PathLike]) -> dict[str, Any]:
    """Read and parse the JSON header at the start of the file at ``path``.

    The first 8 bytes are unpacked as an unsigned little-endian 64-bit
    integer giving the byte length of the UTF-8 JSON document that follows.
    """
    with open(path, "rb") as f:
        (header_size, ) = struct.unpack('<Q', f.read(8))
        header_bytes = f.read(header_size)
    return json.loads(header_bytes.decode('utf-8'))
|
||||
Reference in New Issue
Block a user