First commit
This commit is contained in:
17
vllm/transformers_utils/__init__.py
Normal file
17
vllm/transformers_utils/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from vllm.envs import VLLM_USE_MODELSCOPE
|
||||
|
||||
if VLLM_USE_MODELSCOPE:
|
||||
# Patch here, before each import happens
|
||||
import modelscope
|
||||
from packaging import version
|
||||
|
||||
# patch_hub begins from modelscope>=1.18.1
|
||||
if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
|
||||
raise ImportError(
|
||||
'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
|
||||
'install by `pip install modelscope>=1.18.1`')
|
||||
|
||||
from modelscope.utils.hf_util import patch_hub
|
||||
|
||||
# Patch hub to download models from modelscope to speed up.
|
||||
patch_hub()
|
||||
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/config.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/config.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/processor.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/processor.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm/transformers_utils/__pycache__/utils.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/__pycache__/utils.cpython-310.pyc
Normal file
Binary file not shown.
360
vllm/transformers_utils/config.py
Normal file
360
vllm/transformers_utils/config.py
Normal file
@@ -0,0 +1,360 @@
|
||||
import enum
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Type, Union
|
||||
|
||||
import huggingface_hub
|
||||
from huggingface_hub import (file_exists, hf_hub_download,
|
||||
try_to_load_from_cache)
|
||||
from transformers import GenerationConfig, PretrainedConfig
|
||||
from transformers.models.auto.image_processing_auto import (
|
||||
get_image_processor_config)
|
||||
from transformers.models.auto.modeling_auto import (
|
||||
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
|
||||
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
|
||||
|
||||
from vllm.envs import VLLM_USE_MODELSCOPE
|
||||
from vllm.logger import init_logger
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
|
||||
EAGLEConfig, ExaoneConfig,
|
||||
InternVLChatConfig, JAISConfig,
|
||||
MedusaConfig, MllamaConfig,
|
||||
MLPSpeculatorConfig, MPTConfig,
|
||||
NemotronConfig, NVLM_D_Config,
|
||||
Qwen2VLConfig, RWConfig,
|
||||
SolarConfig, UltravoxConfig)
|
||||
# yapf: enable
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
|
||||
if VLLM_USE_MODELSCOPE:
|
||||
from modelscope import AutoConfig
|
||||
else:
|
||||
from transformers import AutoConfig
|
||||
|
||||
MISTRAL_CONFIG_NAME = "params.json"
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
|
||||
"mllama": MllamaConfig
|
||||
}
|
||||
|
||||
_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
|
||||
"chatglm": ChatGLMConfig,
|
||||
"dbrx": DbrxConfig,
|
||||
"mpt": MPTConfig,
|
||||
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
|
||||
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
|
||||
"jais": JAISConfig,
|
||||
"mlp_speculator": MLPSpeculatorConfig,
|
||||
"medusa": MedusaConfig,
|
||||
"eagle": EAGLEConfig,
|
||||
"exaone": ExaoneConfig,
|
||||
"internvl_chat": InternVLChatConfig,
|
||||
"nemotron": NemotronConfig,
|
||||
"NVLM_D": NVLM_D_Config,
|
||||
"solar": SolarConfig,
|
||||
"ultravox": UltravoxConfig,
|
||||
"qwen2_vl": Qwen2VLConfig,
|
||||
**_CONFIG_REGISTRY_OVERRIDE_HF
|
||||
}
|
||||
|
||||
|
||||
class ConfigFormat(str, enum.Enum):
|
||||
AUTO = "auto"
|
||||
HF = "hf"
|
||||
MISTRAL = "mistral"
|
||||
|
||||
|
||||
def file_or_path_exists(model: Union[str, Path], config_name, revision,
|
||||
token) -> bool:
|
||||
if Path(model).exists():
|
||||
return (Path(model) / config_name).is_file()
|
||||
|
||||
# Offline mode support: Check if config file is cached already
|
||||
cached_filepath = try_to_load_from_cache(repo_id=model,
|
||||
filename=config_name,
|
||||
revision=revision)
|
||||
if isinstance(cached_filepath, str):
|
||||
# The config file exists in cache- we can continue trying to load
|
||||
return True
|
||||
|
||||
# NB: file_exists will only check for the existence of the config file on
|
||||
# hf_hub. This will fail in offline mode.
|
||||
try:
|
||||
return file_exists(model, config_name, revision=revision, token=token)
|
||||
except huggingface_hub.errors.OfflineModeIsEnabled:
|
||||
# Don't raise in offline mode, all we know is that we don't have this
|
||||
# file cached.
|
||||
return False
|
||||
|
||||
|
||||
def get_config(
|
||||
model: Union[str, Path],
|
||||
trust_remote_code: bool,
|
||||
revision: Optional[str] = None,
|
||||
code_revision: Optional[str] = None,
|
||||
rope_scaling: Optional[dict] = None,
|
||||
rope_theta: Optional[float] = None,
|
||||
config_format: ConfigFormat = ConfigFormat.AUTO,
|
||||
**kwargs,
|
||||
) -> PretrainedConfig:
|
||||
# Separate model folder from file path for GGUF models
|
||||
|
||||
is_gguf = check_gguf_file(model)
|
||||
if is_gguf:
|
||||
kwargs["gguf_file"] = Path(model).name
|
||||
model = Path(model).parent
|
||||
|
||||
if config_format == ConfigFormat.AUTO:
|
||||
if is_gguf or file_or_path_exists(model,
|
||||
HF_CONFIG_NAME,
|
||||
revision=revision,
|
||||
token=kwargs.get("token")):
|
||||
config_format = ConfigFormat.HF
|
||||
elif file_or_path_exists(model,
|
||||
MISTRAL_CONFIG_NAME,
|
||||
revision=revision,
|
||||
token=kwargs.get("token")):
|
||||
config_format = ConfigFormat.MISTRAL
|
||||
else:
|
||||
# If we're in offline mode and found no valid config format, then
|
||||
# raise an offline mode error to indicate to the user that they
|
||||
# don't have files cached and may need to go online.
|
||||
# This is conveniently triggered by calling file_exists().
|
||||
file_exists(model,
|
||||
HF_CONFIG_NAME,
|
||||
revision=revision,
|
||||
token=kwargs.get("token"))
|
||||
|
||||
raise ValueError(f"No supported config format found in {model}")
|
||||
|
||||
if config_format == ConfigFormat.HF:
|
||||
config_dict, _ = PretrainedConfig.get_config_dict(
|
||||
model, revision=revision, code_revision=code_revision, **kwargs)
|
||||
|
||||
# Use custom model class if it's in our registry
|
||||
model_type = config_dict.get("model_type")
|
||||
if model_type in _CONFIG_REGISTRY:
|
||||
config_class = _CONFIG_REGISTRY[model_type]
|
||||
config = config_class.from_pretrained(model,
|
||||
revision=revision,
|
||||
code_revision=code_revision)
|
||||
else:
|
||||
try:
|
||||
config = AutoConfig.from_pretrained(
|
||||
model,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
code_revision=code_revision,
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
if (not trust_remote_code
|
||||
and "requires you to execute the configuration file"
|
||||
in str(e)):
|
||||
err_msg = (
|
||||
"Failed to load the model config. If the model "
|
||||
"is a custom model not yet available in the "
|
||||
"HuggingFace transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
elif config_format == ConfigFormat.MISTRAL:
|
||||
config = load_params_config(model, revision)
|
||||
else:
|
||||
raise ValueError(f"Unsupported config format: {config_format}")
|
||||
|
||||
# Special architecture mapping check for GGUF models
|
||||
if is_gguf:
|
||||
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
|
||||
raise RuntimeError(
|
||||
f"Can't get gguf config for {config.model_type}.")
|
||||
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
|
||||
config.update({"architectures": [model_type]})
|
||||
|
||||
for key, value in [
|
||||
("rope_scaling", rope_scaling),
|
||||
("rope_theta", rope_theta),
|
||||
]:
|
||||
if value is not None:
|
||||
logger.info(
|
||||
"Updating %s from %r to %r",
|
||||
key,
|
||||
getattr(config, key, None),
|
||||
value,
|
||||
)
|
||||
config.update({key: value})
|
||||
|
||||
return config
|
||||
def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None:
|
||||
"""Try to register HF model configuration class to serialize by value
|
||||
With trust_remote_code, the config class is typically an instance of a
|
||||
custom class imported from the HF modules cache. The class will not be
|
||||
importable in spawned workers by default (and won't exist at all on
|
||||
other nodes), which breaks serialization of the config.
|
||||
In this function we tell the cloudpickle serialization library to pass
|
||||
instances of these generated classes by value instead of by reference,
|
||||
i.e. the class definition is serialized along with its data so that the
|
||||
class module does not need to be importable on the receiving end. This
|
||||
registration only works if the modules cache has already been
|
||||
initialized.
|
||||
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
|
||||
"""
|
||||
if not trust_remote_code:
|
||||
return
|
||||
|
||||
try:
|
||||
import transformers_modules
|
||||
except ImportError:
|
||||
logger.debug("Could not import transformers_modules used for remote"
|
||||
" code. If remote code is not needed remove"
|
||||
" `--trust-remote-code`.")
|
||||
return
|
||||
|
||||
try:
|
||||
import cloudpickle
|
||||
cloudpickle.register_pickle_by_value(transformers_modules)
|
||||
|
||||
# ray vendors its own version of cloudpickle
|
||||
from vllm.executor.ray_utils import ray
|
||||
if ray:
|
||||
ray.cloudpickle.register_pickle_by_value(transformers_modules)
|
||||
|
||||
# multiprocessing uses pickle to serialize arguments when using spawn
|
||||
# Here we get pickle to use cloudpickle to serialize ModelConfig objects
|
||||
# that contain instances of the custom config class to avoid
|
||||
# serialization problems if the generated module (and model) has a `.`
|
||||
# in its name
|
||||
import multiprocessing
|
||||
import pickle
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
def _reduce_modelconfig(mc: ModelConfig):
|
||||
return (pickle.loads, (cloudpickle.dumps(mc), ))
|
||||
|
||||
multiprocessing.reducer.register(ModelConfig, _reduce_modelconfig)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Unable to register remote classes used by"
|
||||
" trust_remote_code with by-value serialization. This may"
|
||||
" lead to a later error. If remote code is not needed"
|
||||
" remove `--trust-remote-code`",
|
||||
exc_info=e)
|
||||
|
||||
def load_params_config(model, revision) -> PretrainedConfig:
|
||||
# This function loads a params.json config which
|
||||
# should be used when loading models in mistral format
|
||||
|
||||
config_file_name = "params.json"
|
||||
|
||||
config_path = Path(model) / config_file_name
|
||||
|
||||
if not config_path.is_file():
|
||||
config_path = Path(
|
||||
hf_hub_download(model, config_file_name, revision=revision))
|
||||
|
||||
with open(config_path, "r") as file:
|
||||
config_dict = json.load(file)
|
||||
|
||||
config_mapping = {
|
||||
"dim": "hidden_size",
|
||||
"norm_eps": "rms_norm_eps",
|
||||
"n_kv_heads": "num_key_value_heads",
|
||||
"n_layers": "num_hidden_layers",
|
||||
"n_heads": "num_attention_heads",
|
||||
"hidden_dim": "intermediate_size",
|
||||
}
|
||||
|
||||
def recurse_elems(elem: Any):
|
||||
if isinstance(elem, dict):
|
||||
config_dict = {}
|
||||
for key, value in elem.items():
|
||||
key = config_mapping.get(key, key)
|
||||
config_dict[key] = recurse_elems(value)
|
||||
return PretrainedConfig(**config_dict)
|
||||
else:
|
||||
return elem
|
||||
|
||||
config_dict["model_type"] = config_dict.get("model_type", "transformer")
|
||||
config_dict["hidden_act"] = config_dict.get("activation", "silu")
|
||||
config_dict["tie_word_embeddings"] = config_dict.get(
|
||||
"tie_embeddings", False)
|
||||
config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
|
||||
config_dict["max_position_embeddings"] = config_dict.get(
|
||||
"max_position_embeddings", 128_000)
|
||||
|
||||
if config_dict.get("moe") is not None:
|
||||
config_dict["architectures"] = ["MixtralForCausalLM"]
|
||||
else:
|
||||
config_dict["architectures"] = ["MistralForCausalLM"]
|
||||
|
||||
if config_dict.get("vision_encoder") is not None:
|
||||
multimodal_config = config_dict.pop("vision_encoder")
|
||||
|
||||
config_dict = {
|
||||
"text_config": config_dict,
|
||||
"vision_config": multimodal_config
|
||||
}
|
||||
config_dict["architectures"] = ["PixtralForConditionalGeneration"]
|
||||
config_dict["model_type"] = "pixtral"
|
||||
|
||||
config = recurse_elems(config_dict)
|
||||
return config
|
||||
|
||||
|
||||
def get_hf_image_processor_config(
|
||||
model: Union[str, Path],
|
||||
revision: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
# ModelScope does not provide an interface for image_processor
|
||||
if VLLM_USE_MODELSCOPE:
|
||||
return dict()
|
||||
# Separate model folder from file path for GGUF models
|
||||
if check_gguf_file(model):
|
||||
model = Path(model).parent
|
||||
return get_image_processor_config(model, revision=revision, **kwargs)
|
||||
|
||||
|
||||
def get_hf_text_config(config: PretrainedConfig):
|
||||
"""Get the "sub" config relevant to llm for multi modal models.
|
||||
No op for pure text models.
|
||||
"""
|
||||
if hasattr(config, "text_config"):
|
||||
# The code operates under the assumption that text_config should have
|
||||
# `num_attention_heads` (among others). Assert here to fail early
|
||||
# if transformers config doesn't align with this assumption.
|
||||
assert hasattr(config.text_config, "num_attention_heads")
|
||||
return config.text_config
|
||||
else:
|
||||
return config
|
||||
|
||||
|
||||
def try_get_generation_config(
|
||||
model: str,
|
||||
trust_remote_code: bool,
|
||||
revision: Optional[str] = None,
|
||||
) -> Optional[GenerationConfig]:
|
||||
try:
|
||||
return GenerationConfig.from_pretrained(
|
||||
model,
|
||||
revision=revision,
|
||||
)
|
||||
except OSError: # Not found
|
||||
try:
|
||||
config = get_config(
|
||||
model,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
)
|
||||
return GenerationConfig.from_model_config(config)
|
||||
except OSError: # Not found
|
||||
return None
|
||||
40
vllm/transformers_utils/configs/__init__.py
Normal file
40
vllm/transformers_utils/configs/__init__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
|
||||
from vllm.transformers_utils.configs.dbrx import DbrxConfig
|
||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
from vllm.transformers_utils.configs.exaone import ExaoneConfig
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.internvl import InternVLChatConfig
|
||||
from vllm.transformers_utils.configs.jais import JAISConfig
|
||||
from vllm.transformers_utils.configs.medusa import MedusaConfig
|
||||
from vllm.transformers_utils.configs.mllama import MllamaConfig
|
||||
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
|
||||
from vllm.transformers_utils.configs.mpt import MPTConfig
|
||||
from vllm.transformers_utils.configs.nemotron import NemotronConfig
|
||||
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
|
||||
from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig,
|
||||
Qwen2VLVisionConfig)
|
||||
from vllm.transformers_utils.configs.solar import SolarConfig
|
||||
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
|
||||
|
||||
__all__ = [
|
||||
"ChatGLMConfig",
|
||||
"DbrxConfig",
|
||||
"MPTConfig",
|
||||
"RWConfig",
|
||||
"InternVLChatConfig",
|
||||
"JAISConfig",
|
||||
"MedusaConfig",
|
||||
"EAGLEConfig",
|
||||
"ExaoneConfig",
|
||||
"MllamaConfig",
|
||||
"MLPSpeculatorConfig",
|
||||
"NemotronConfig",
|
||||
"NVLM_D_Config",
|
||||
"SolarConfig",
|
||||
"UltravoxConfig",
|
||||
"Qwen2VLConfig",
|
||||
"Qwen2VLVisionConfig",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc
Normal file
BIN
vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
204
vllm/transformers_utils/configs/arctic.py
Normal file
204
vllm/transformers_utils/configs/arctic.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
|
||||
""" Arctic model configuration"""
|
||||
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Dict
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticLoraConfig:
|
||||
lora_r: int = 64
|
||||
lora_alpha: float = 16
|
||||
shard_base_weights: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class ArcticQuantizationConfig:
|
||||
q_bits: int = 8
|
||||
rounding: str = "nearest"
|
||||
mantissa_bits: int = 3
|
||||
group_size: int = 128
|
||||
|
||||
|
||||
class ArcticConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
|
||||
Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..
|
||||
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`ArcticModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 14336):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 8):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
|
||||
The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
|
||||
allows sequence of up to 4096*32 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
The id of the padding token.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
The id of the "end-of-sequence" token.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether the model's input and output word embeddings should be tied.
|
||||
rope_theta (`float`, *optional*, defaults to 1000000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
sliding_window (`int`, *optional*):
|
||||
Sliding window attention window size. If not specified, will default to `4096`.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
num_experts_per_tok (`int`, *optional*, defaults to 2):
|
||||
The number of experts to root per-token, can be also interpreted as the `top-p` routing
|
||||
parameter
|
||||
num_local_experts (`int`, *optional*, defaults to 8):
|
||||
Number of experts per Sparse MLP layer.
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
```python
|
||||
>>> from transformers import ArcticModel, ArcticConfig
|
||||
|
||||
>>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
|
||||
>>> configuration = ArcticConfig()
|
||||
|
||||
>>> # Initializing a model from the Arctic 7B style configuration
|
||||
>>> model = ArcticModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "arctic"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=14336,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=1e6,
|
||||
sliding_window=None,
|
||||
attention_dropout=0.0,
|
||||
num_experts_per_tok=1,
|
||||
num_local_experts=8,
|
||||
router_aux_loss_coef=0.001,
|
||||
moe_layer_frequency=2,
|
||||
parallel_attn_mlp_res=False,
|
||||
moe_train_capacity_factor=1,
|
||||
moe_eval_capacity_factor=1,
|
||||
enable_expert_tensor_parallelism=False,
|
||||
moe_min_capacity=0,
|
||||
moe_token_dropping=True,
|
||||
quantization=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.sliding_window = sliding_window
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.num_experts_per_tok = num_experts_per_tok
|
||||
self.num_local_experts = num_local_experts
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
self.moe_layer_frequency = moe_layer_frequency
|
||||
self.moe_train_capacity_factor = moe_train_capacity_factor
|
||||
self.moe_eval_capacity_factor = moe_eval_capacity_factor
|
||||
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
|
||||
self.moe_min_capacity = moe_min_capacity
|
||||
self.moe_token_dropping = moe_token_dropping
|
||||
self.parallel_attn_mlp_res = parallel_attn_mlp_res
|
||||
if isinstance(quantization, dict):
|
||||
self.quantization = ArcticQuantizationConfig(**quantization)
|
||||
else:
|
||||
self.quantization = quantization
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
|
||||
result = super().from_dict(config_dict, **kwargs)
|
||||
config = result[0] if isinstance(result, tuple) else result
|
||||
if isinstance(config.quantization, dict):
|
||||
config.quantization = ArcticQuantizationConfig(**config.quantization)
|
||||
return result
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
ret = super().to_dict()
|
||||
if isinstance(ret["quantization"], ArcticQuantizationConfig):
|
||||
ret["quantization"] = asdict(ret["quantization"])
|
||||
return ret
|
||||
70
vllm/transformers_utils/configs/chatglm.py
Normal file
70
vllm/transformers_utils/configs/chatglm.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# coding=utf-8
|
||||
# Adapted from
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ChatGLMConfig(PretrainedConfig):
|
||||
model_type = "chatglm"
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "num_layers",
|
||||
"n_head_kv": "multi_query_group_num",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
num_layers=28,
|
||||
padded_vocab_size=65024,
|
||||
hidden_size=4096,
|
||||
ffn_hidden_size=13696,
|
||||
kv_channels=128,
|
||||
num_attention_heads=32,
|
||||
seq_length=2048,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layernorm_epsilon=1e-5,
|
||||
rmsnorm=True,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
post_layer_norm=True,
|
||||
add_bias_linear=False,
|
||||
add_qkv_bias=False,
|
||||
interleaved_qkv=False,
|
||||
bias_dropout_fusion=True,
|
||||
multi_query_attention=False,
|
||||
multi_query_group_num=1,
|
||||
apply_query_key_layer_scaling=True,
|
||||
attention_softmax_in_fp32=True,
|
||||
fp32_residual_connection=False,
|
||||
quantization_bit=0,
|
||||
pre_seq_len=None,
|
||||
prefix_projection=False,
|
||||
**kwargs):
|
||||
self.num_layers = num_layers
|
||||
self.vocab_size = padded_vocab_size
|
||||
self.padded_vocab_size = padded_vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.kv_channels = kv_channels
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.seq_length = seq_length
|
||||
# It is to be compatible with long lora.
|
||||
self.max_position_embeddings = seq_length
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layernorm_epsilon = layernorm_epsilon
|
||||
self.rmsnorm = rmsnorm
|
||||
self.apply_residual_connection_post_layernorm = (
|
||||
apply_residual_connection_post_layernorm)
|
||||
self.post_layer_norm = post_layer_norm
|
||||
self.add_bias_linear = add_bias_linear
|
||||
self.add_qkv_bias = add_qkv_bias
|
||||
self.bias_dropout_fusion = bias_dropout_fusion
|
||||
self.multi_query_attention = multi_query_attention
|
||||
self.multi_query_group_num = multi_query_group_num
|
||||
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
|
||||
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
|
||||
self.fp32_residual_connection = fp32_residual_connection
|
||||
self.quantization_bit = quantization_bit
|
||||
self.pre_seq_len = pre_seq_len
|
||||
self.prefix_projection = prefix_projection
|
||||
self.interleaved_qkv = interleaved_qkv
|
||||
super().__init__(**kwargs)
|
||||
278
vllm/transformers_utils/configs/dbrx.py
Normal file
278
vllm/transformers_utils/configs/dbrx.py
Normal file
@@ -0,0 +1,278 @@
|
||||
# yapf: disable
|
||||
# ruff: noqa: E501
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
|
||||
"""Dbrx configuration."""
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore
|
||||
|
||||
|
||||
class DbrxAttentionConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx Attention.
|
||||
|
||||
[`DbrxAttention`] class. It is used to instantiate attention layers
|
||||
according to the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the attention layers.
|
||||
clip_qkv (`float`, *optional*, defaults to None):
|
||||
If not `None`, clip the queries, keys, and values in the attention layer to this value.
|
||||
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
|
||||
rope_theta (float): The base frequency for rope.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attn_pdrop: float = 0,
|
||||
clip_qkv: Optional[float] = None,
|
||||
kv_n_heads: int = 1,
|
||||
rope_theta: float = 10000.0,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.clip_qkv = clip_qkv
|
||||
self.kv_n_heads = kv_n_heads
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["attn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all configurations of "
|
||||
"models and can yield errors.",
|
||||
config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxFFNConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx FFN.
|
||||
|
||||
[`DbrxFFN`] class. It is used to instantiate feedforward layers according to
|
||||
the specified arguments, defining the layers architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
|
||||
The dict should have a key 'name' with the value being the name of
|
||||
the activation function along with any additional keyword arguments.
|
||||
ffn_hidden_size (int, optional): The hidden size of the feedforward network.
|
||||
moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
|
||||
moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
|
||||
moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
|
||||
moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
|
||||
moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
|
||||
uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
|
||||
This should only be used for benchmarking purposes.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ffn_act_fn: Optional[dict] = None,
|
||||
ffn_hidden_size: int = 3584,
|
||||
moe_num_experts: int = 4,
|
||||
moe_top_k: int = 1,
|
||||
moe_jitter_eps: Optional[float] = None,
|
||||
moe_loss_weight: float = 0.01,
|
||||
moe_normalize_expert_weights: Optional[float] = 1,
|
||||
uniform_expert_assignment: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
super().__init__()
|
||||
if ffn_act_fn is None:
|
||||
ffn_act_fn = {"name": "silu"}
|
||||
self.ffn_act_fn = ffn_act_fn
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.moe_num_experts = moe_num_experts
|
||||
self.moe_top_k = moe_top_k
|
||||
self.moe_jitter_eps = moe_jitter_eps
|
||||
self.moe_loss_weight = moe_loss_weight
|
||||
self.moe_normalize_expert_weights = moe_normalize_expert_weights
|
||||
self.uniform_expert_assignment = uniform_expert_assignment
|
||||
|
||||
for k in ["model_type"]:
|
||||
if k in kwargs:
|
||||
kwargs.pop(k)
|
||||
if len(kwargs) != 0:
|
||||
raise ValueError(f"Found unknown {kwargs=}")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path: str, **kwargs: Any
|
||||
) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
|
||||
if config_dict.get("model_type") == "dbrx":
|
||||
config_dict = config_dict["ffn_config"]
|
||||
|
||||
if (
|
||||
"model_type" in config_dict
|
||||
and hasattr(cls, "model_type")
|
||||
and config_dict["model_type"] != cls.model_type
|
||||
):
|
||||
logger.warning(
|
||||
"You are using a model of type %s to instantiate a model of "
|
||||
"type %s. This is not supported for all "
|
||||
"configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class DbrxConfig(PretrainedConfig):
|
||||
"""Configuration class for Dbrx.
|
||||
|
||||
[`DbrxModel`]. It is used to instantiate a Dbrx model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
d_model (`int`, *optional*, defaults to 6144):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
n_layers (`int`, *optional*, defaults to 40):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
max_seq_len (`int`, *optional*, defaults to 32768):
|
||||
The maximum sequence length of the model.
|
||||
vocab_size (`int`, *optional*, defaults to 100352):
|
||||
Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
|
||||
the `inputs_ids` passed when calling [`DbrxModel`].
|
||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability applied to the attention output before combining with residual.
|
||||
emb_pdrop (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for the embedding layer.
|
||||
attn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's attention module.
|
||||
ffn_config (`dict`, *optional*):
|
||||
A dictionary used to configure the model's FFN module.
|
||||
use_cache (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
output_router_logits (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the router logits should be returned by the model. Enabling this will also
|
||||
allow the model to output the auxiliary loss. See [here]() for more details
|
||||
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
||||
The aux loss factor for the total loss.
|
||||
|
||||
|
||||
Example:
|
||||
```python
|
||||
>>> from transformers import DbrxConfig, DbrxModel
|
||||
|
||||
>>> # Initializing a Dbrx configuration
|
||||
>>> configuration = DbrxConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the configuration
|
||||
>>> model = DbrxModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```
|
||||
"""
|
||||
|
||||
model_type = "dbrx"
|
||||
attribute_map = {
|
||||
"num_attention_heads": "n_heads",
|
||||
"hidden_size": "d_model",
|
||||
"num_hidden_layers": "n_layers",
|
||||
"max_position_embeddings": "max_seq_len",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 32000,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
attn_config: Optional[DbrxAttentionConfig] = None,
|
||||
ffn_config: Optional[DbrxFFNConfig] = None,
|
||||
use_cache: bool = True,
|
||||
initializer_range: float = 0.02,
|
||||
output_router_logits: bool = False,
|
||||
router_aux_loss_coef: float = 0.05,
|
||||
**kwargs: Any,
|
||||
):
|
||||
if attn_config is None:
|
||||
self.attn_config = DbrxAttentionConfig()
|
||||
elif isinstance(attn_config, dict):
|
||||
self.attn_config = DbrxAttentionConfig(**attn_config)
|
||||
else:
|
||||
self.attn_config = attn_config
|
||||
|
||||
if ffn_config is None:
|
||||
self.ffn_config = DbrxFFNConfig()
|
||||
elif isinstance(ffn_config, dict):
|
||||
self.ffn_config = DbrxFFNConfig(**ffn_config)
|
||||
else:
|
||||
self.ffn_config = ffn_config
|
||||
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.use_cache = use_cache
|
||||
self.initializer_range = initializer_range
|
||||
self.output_router_logits = output_router_logits
|
||||
self.router_aux_loss_coef = router_aux_loss_coef
|
||||
|
||||
tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
|
||||
if tie_word_embeddings:
|
||||
raise ValueError(
|
||||
"tie_word_embeddings is not supported for Dbrx models."
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
49
vllm/transformers_utils/configs/eagle.py
Normal file
49
vllm/transformers_utils/configs/eagle.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
|
||||
class EAGLEConfig(PretrainedConfig):
|
||||
model_type = "eagle"
|
||||
|
||||
def __init__(self,
|
||||
model: Union[PretrainedConfig, dict, None] = None,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
**kwargs):
|
||||
|
||||
model_config = None if model is None else (AutoConfig.for_model(
|
||||
**model) if isinstance(model, dict) else model)
|
||||
|
||||
for k, v in kwargs.items():
|
||||
if k != "architectures" and k != "model_type" and hasattr(
|
||||
model_config, k):
|
||||
setattr(model_config, k, v)
|
||||
|
||||
self.model = model_config
|
||||
|
||||
if self.model is None:
|
||||
self.truncated_vocab_size = None
|
||||
else:
|
||||
self.truncated_vocab_size = self.model.vocab_size if \
|
||||
truncated_vocab_size is None else truncated_vocab_size
|
||||
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["EAGLEModel"]
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if self.model is not None:
|
||||
for k, v in self.model.to_dict().items():
|
||||
if not hasattr(self, k):
|
||||
setattr(self, k, v)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
**kwargs,
|
||||
) -> "EAGLEConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
190
vllm/transformers_utils/configs/exaone.py
Normal file
190
vllm/transformers_utils/configs/exaone.py
Normal file
@@ -0,0 +1,190 @@
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
|
||||
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Exaone model configuration"""
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}
|
||||
|
||||
|
||||
class ExaoneConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:
|
||||
`~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
|
||||
according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar
|
||||
configuration to that of the Exaone
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
|
||||
and can be used to control the model outputs. Read the documentation from :
|
||||
class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50257):
|
||||
Vocabulary size of the GPT Lingvo model. Defines the number of
|
||||
different tokens that can be represented by the :obj:`inputs_ids`
|
||||
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
|
||||
size of the model.
|
||||
Defines the different tokens that can be represented by the
|
||||
`inputs_ids` passed to the forward method of :class:
|
||||
`~transformers.EXAONEModel`.
|
||||
hidden_size (:obj:`int`, `optional`, defaults to 2048):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_layers (:obj:`int`, `optional`, defaults to 24):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi
|
||||
Head Attention (MHA), if `num_key_value_heads=1 the model will use
|
||||
Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint,
|
||||
each group key and value head should be constructed by meanpooling
|
||||
all the original heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
|
||||
specified, will default to `num_attention_heads`.
|
||||
rotary_pct (`float`, *optional*, defaults to 0.25):
|
||||
percentage of hidden dimensions to allocate to rotary embeddings
|
||||
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
|
||||
the Transformer encoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`,
|
||||
defaults to :obj:`"gelu_new"`):
|
||||
The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
|
||||
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
|
||||
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout probabilitiy for all fully connected layers in the
|
||||
embeddings, encoder, and pooler.
|
||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
|
||||
The vocabulary size of the :obj:`token_type_ids` passed when calling
|
||||
:class:`~transformers.EXAONEModel`.
|
||||
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models).
|
||||
Only relevant if ``config.is_decoder=True``.
|
||||
gradient_checkpointing (:obj:`bool`, `optional`,
|
||||
defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense
|
||||
of slower backward pass.
|
||||
Example::
|
||||
|
||||
>>> from transformers import ExoneModel, ExaoneConfig
|
||||
|
||||
>>> # Initializing a EXAONE configuration
|
||||
>>> configuration = ExaoneConfig()
|
||||
|
||||
>>> # Initializing a model from configuration
|
||||
>>> model = ExoneModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
|
||||
model_type = "exaone"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_hidden_layers": "num_layers"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=102400,
|
||||
max_position_embeddings=2048,
|
||||
hidden_size=2048,
|
||||
num_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
intermediate_size=None,
|
||||
activation_function="silu",
|
||||
rotary_pct=0.25,
|
||||
resid_dropout=0.0,
|
||||
embed_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layer_norm_epsilon=1e-6,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.num_layers = num_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_hidden_layers = num_layers
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
if intermediate_size:
|
||||
self.intermediate_size = intermediate_size
|
||||
else:
|
||||
self.intermediate_size = hidden_size * 4
|
||||
self.activation_function = activation_function
|
||||
self.resid_dropout = resid_dropout
|
||||
self.embed_dropout = embed_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.rotary_pct = rotary_pct
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
self.use_logit_cap = kwargs.pop("use_logit_cap", False)
|
||||
self.ln_no_scale = kwargs.pop("ln_no_scale", False)
|
||||
self.use_gated = kwargs.pop("use_gated", False)
|
||||
self.use_emb_norm = kwargs.pop("use_emb_norm", False)
|
||||
self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
|
||||
self.rotary_type = kwargs.pop("rotary_type", None)
|
||||
self.scaling_factor = kwargs.pop("scaling_factor", 1)
|
||||
self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
|
||||
self.use_extra_logit = kwargs.pop("use_extra_logit", True)
|
||||
self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
|
||||
self.rotary_base = kwargs.pop("rotary_base", 10000.0)
|
||||
self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
|
||||
self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
|
||||
(rotary_pct == 0.25))
|
||||
if self.use_rotary_pos:
|
||||
self.use_absolute_pos = False
|
||||
89
vllm/transformers_utils/configs/falcon.py
Normal file
89
vllm/transformers_utils/configs/falcon.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# Adapted from
|
||||
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Falcon configuration"""
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class RWConfig(PretrainedConfig):
|
||||
model_type = "falcon"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "n_layer",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_kv_heads": "n_head_kv",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=250880,
|
||||
hidden_size=64,
|
||||
n_layer=2,
|
||||
n_head=8,
|
||||
num_ln_in_parallel_attn=None,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
multi_query=True,
|
||||
n_head_kv=None,
|
||||
alibi=False,
|
||||
bias=False,
|
||||
parallel_attn=False,
|
||||
new_decoder_architecture=False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.vocab_size = vocab_size
|
||||
# Backward compatibility with n_embed kwarg
|
||||
n_embed = kwargs.pop("n_embed", None)
|
||||
self.hidden_size = hidden_size if n_embed is None else n_embed
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
self.multi_query = multi_query
|
||||
self.n_head_kv = 1 if n_head_kv is None else n_head_kv
|
||||
self.alibi = alibi
|
||||
self.bias = bias
|
||||
self.parallel_attn = parallel_attn
|
||||
self.new_decoder_architecture = new_decoder_architecture
|
||||
|
||||
if self.hidden_size == 8192:
|
||||
# Hack for falcon-40b
|
||||
self.new_decoder_architecture = True
|
||||
self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs)
|
||||
|
||||
@property
|
||||
def head_dim(self):
|
||||
return self.hidden_size // self.n_head
|
||||
|
||||
@property
|
||||
def rotary(self):
|
||||
return not self.alibi
|
||||
51
vllm/transformers_utils/configs/internvl.py
Normal file
51
vllm/transformers_utils/configs/internvl.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# Adapted from
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
|
||||
# --------------------------------------------------------
|
||||
# InternVL
|
||||
# Copyright (c) 2024 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class InternVLChatConfig(PretrainedConfig):
|
||||
model_type = 'internvl_chat'
|
||||
is_composition = True
|
||||
|
||||
def __init__(self,
|
||||
vision_config=None,
|
||||
llm_config=None,
|
||||
use_backbone_lora=0,
|
||||
use_llm_lora=0,
|
||||
select_layer=-1,
|
||||
force_image_size=None,
|
||||
downsample_ratio=0.5,
|
||||
template=None,
|
||||
dynamic_image_size=False,
|
||||
use_thumbnail=False,
|
||||
ps_version='v1',
|
||||
min_dynamic_patch=1,
|
||||
max_dynamic_patch=6,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if vision_config is None:
|
||||
vision_config = {}
|
||||
|
||||
if llm_config is None:
|
||||
llm_config = {}
|
||||
|
||||
self.vision_config = PretrainedConfig(**vision_config)
|
||||
self.text_config = PretrainedConfig(**llm_config)
|
||||
|
||||
self.use_backbone_lora = use_backbone_lora
|
||||
self.use_llm_lora = use_llm_lora
|
||||
self.select_layer = select_layer
|
||||
self.force_image_size = force_image_size
|
||||
self.downsample_ratio = downsample_ratio
|
||||
self.template = template
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail = use_thumbnail
|
||||
self.ps_version = ps_version # pixel shuffle version
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
236
vllm/transformers_utils/configs/jais.py
Normal file
236
vllm/transformers_utils/configs/jais.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2023 Cerebras Systems.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""JAIS configuration"""
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class JAISConfig(PretrainedConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`JAISModel`]. It is used to instantiate a JAIS model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used
|
||||
to control the model outputs. Read the documentation from
|
||||
[`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 50257):
|
||||
Vocabulary size of the JAIS model. Defines the number of different
|
||||
tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`JAISModel`].
|
||||
n_positions (`int`, *optional*, defaults to 1024):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
n_embd (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the embeddings and hidden states.
|
||||
n_layer (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer encoder.
|
||||
n_inner (`int`, *optional*, defaults to None):
|
||||
Dimensionality of the inner feed-forward layers. `None` will set
|
||||
it to 4 times n_embd
|
||||
activation_function (`str`, *optional*, defaults to `"gelu"`):
|
||||
Activation function, to be selected in the list
|
||||
`["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
|
||||
resid_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in
|
||||
the embeddings, encoder, and pooler.
|
||||
embd_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon to use in the layer normalization layers.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
scale_attn_weights (`bool`, *optional*, defaults to `True`):
|
||||
Scale attention weights by dividing by sqrt(hidden_size)..
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models).
|
||||
scale_attn_by_inverse_layer_idx (`bool`, *optional*,
|
||||
defaults to `False`):
|
||||
Whether to additionally scale attention weights by
|
||||
`1 / layer_idx + 1`.
|
||||
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
|
||||
Whether to scale keys (K) prior to computing attention
|
||||
(dot-product)
|
||||
and upcast attention dot-product/softmax to float() when training
|
||||
with mixed precision.
|
||||
position_embedding_type (`str`, *optional*, defaults to `"learned"`):
|
||||
Positional embedding can be either `"alibi"` or `"learned"`.
|
||||
mup_width_scale (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale learning rate and initializers. Calculated
|
||||
as (`d_model,0 / d_model`), where
|
||||
`d_model` is the model's width and `d_model,0` is the proxy
|
||||
model's width.
|
||||
mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale token and position embeddings.
|
||||
mup_output_alpha (`float`, *optional*, defaults to 1.0):
|
||||
muP parameter to scale output logits
|
||||
(`output_logits_scale = mup_output_alpha * mup_width_scale`).
|
||||
mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
|
||||
Scale attention weights by dividing by hidden_size instead of
|
||||
sqrt(hidden_size). Need to set scale_attn_weights to `True` as
|
||||
well.
|
||||
alibi_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for ALiBi
|
||||
embeddings. Currently only supports linear
|
||||
scaling strategy. Can specify either the scaling `factor` (must be
|
||||
a float greater than 1) for fixed scaling
|
||||
or `train_seq_len` for dynamic scaling on input samples with
|
||||
sequence length > `train_seq_len`. The expected
|
||||
formats are `{"type": strategy name, "factor": scaling factor}` or
|
||||
`{"type": strategy name,
|
||||
"train_seq_len": training sequence length}`.
|
||||
architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
|
||||
architecture names for Jais.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import JAISConfig, JAISModel
|
||||
|
||||
>>> # Initializing a JAIS configuration
|
||||
>>> configuration = JAISConfig()
|
||||
|
||||
>>> # Initializing a model (with random weights) from the configuration
|
||||
>>> model = JAISModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "jais"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"hidden_size": "n_embd",
|
||||
"max_position_embeddings": "n_positions",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50257,
|
||||
n_positions=1024,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
n_inner=None,
|
||||
activation_function="gelu_new",
|
||||
resid_pdrop=0.1,
|
||||
embd_pdrop=0.1,
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
scale_attn_weights=True,
|
||||
use_cache=True,
|
||||
bos_token_id=50256,
|
||||
eos_token_id=50256,
|
||||
scale_attn_by_inverse_layer_idx=False,
|
||||
reorder_and_upcast_attn=False,
|
||||
position_embedding_type="learned",
|
||||
mup_width_scale=1.0,
|
||||
mup_embeddings_scale=1.0,
|
||||
mup_output_alpha=1.0,
|
||||
mup_scale_qk_dot_by_d=False,
|
||||
alibi_scaling=None,
|
||||
architectures=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.n_inner = n_inner
|
||||
self.activation_function = activation_function
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.scale_attn_weights = scale_attn_weights
|
||||
self.use_cache = use_cache
|
||||
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
|
||||
self.reorder_and_upcast_attn = reorder_and_upcast_attn
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.mup_width_scale = mup_width_scale
|
||||
self.mup_embeddings_scale = mup_embeddings_scale
|
||||
self.mup_output_alpha = mup_output_alpha
|
||||
self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d
|
||||
|
||||
self.alibi_scaling = alibi_scaling
|
||||
self._alibi_scaling_validation()
|
||||
if architectures is None:
|
||||
architectures = ["JAISLMHeadModel"]
|
||||
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
architectures=architectures,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _alibi_scaling_validation(self):
|
||||
"""
|
||||
Validate the `alibi_scaling` configuration.
|
||||
"""
|
||||
if self.alibi_scaling is None:
|
||||
return
|
||||
|
||||
if (not isinstance(self.alibi_scaling, dict)
|
||||
or len(self.alibi_scaling) != 2):
|
||||
raise ValueError(
|
||||
"`alibi_scaling` must be a dictionary with two fields,"
|
||||
"`type` and `factor` or `type` and `train_seq_len`, "
|
||||
f"got {self.alibi_scaling}")
|
||||
alibi_scaling_type = self.alibi_scaling.get("type", None)
|
||||
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
|
||||
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
|
||||
if alibi_scaling_type is None or alibi_scaling_type != "linear":
|
||||
raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
|
||||
f"got {alibi_scaling_type}")
|
||||
if (alibi_scaling_factor is not None
|
||||
and not isinstance(alibi_scaling_factor, float)
|
||||
or (alibi_scaling_factor is not None
|
||||
and alibi_scaling_factor <= 1.0)):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s factor field must be a float > 1.0,"
|
||||
f"got {alibi_scaling_factor}")
|
||||
if (alibi_dynamic_scaling is not None
|
||||
and not isinstance(alibi_dynamic_scaling, int)
|
||||
or (alibi_dynamic_scaling is not None
|
||||
and alibi_dynamic_scaling <= 1)):
|
||||
raise ValueError(
|
||||
f"`alibi_scaling`'s `train_seq_len` field must be an"
|
||||
f"integer > 1, got {alibi_dynamic_scaling}")
|
||||
60
vllm/transformers_utils/configs/medusa.py
Normal file
60
vllm/transformers_utils/configs/medusa.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MedusaConfig(PretrainedConfig):
|
||||
model_type = "medusa"
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int = 4096,
|
||||
vocab_size: int = 32001,
|
||||
num_heads: int = 5,
|
||||
num_hidden_layers: int = 1,
|
||||
max_paths: int = 64,
|
||||
topk: int = 10,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
**kwargs):
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.vocab_size = vocab_size
|
||||
self.num_heads = num_heads
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_paths = max_paths
|
||||
self.topk = topk
|
||||
self.max_seq_len = int(2**20)
|
||||
self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
|
||||
else truncated_vocab_size
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["MedusaModel"]
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls,
|
||||
pretrained_model_name_or_path: Union[str, os.PathLike],
|
||||
**kwargs,
|
||||
) -> "MedusaConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
for k in list(config_dict.keys()):
|
||||
if 'num' in k:
|
||||
if 'heads' in k:
|
||||
config_dict["num_heads"] = config_dict.pop(k)
|
||||
elif 'layers' in k:
|
||||
config_dict["num_hidden_layers"] = config_dict.pop(k)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
@property
|
||||
def num_attention_heads(self):
|
||||
return 0
|
||||
|
||||
@property
|
||||
def num_lookahead_tokens(self):
|
||||
return self.num_heads
|
||||
|
||||
@num_lookahead_tokens.setter
|
||||
def num_lookahead_tokens(self, num_lookahead_tokens: int):
|
||||
self.num_heads = num_lookahead_tokens
|
||||
28
vllm/transformers_utils/configs/mllama.py
Normal file
28
vllm/transformers_utils/configs/mllama.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from transformers.models.mllama import configuration_mllama as mllama_hf_config
|
||||
|
||||
|
||||
class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
|
||||
'''
|
||||
Use this class to override is_encoder_decoder:
|
||||
- transformers regards mllama as is_encoder_decoder=False
|
||||
- vllm needs is_encoder_decoder=True to enable cross-attention
|
||||
'''
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.is_encoder_decoder = True
|
||||
|
||||
|
||||
class MllamaConfig(mllama_hf_config.MllamaConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text_config=None,
|
||||
**kwargs,
|
||||
):
|
||||
if isinstance(text_config, dict):
|
||||
text_config = MllamaTextConfig(**text_config)
|
||||
super().__init__(text_config=text_config, **kwargs)
|
||||
65
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
65
vllm/transformers_utils/configs/mlp_speculator.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MLPSpeculatorConfig(PretrainedConfig):
|
||||
model_type = "mlp_speculator"
|
||||
|
||||
attribute_map = {
|
||||
"hidden_size": "emb_dim",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 32000,
|
||||
emb_dim: int = 4096,
|
||||
inner_dim: int = 0,
|
||||
n_predict: int = 3,
|
||||
top_k_tokens_per_head: Optional[List[int]] = None,
|
||||
n_candidates: int = 5,
|
||||
tie_weights: bool = False,
|
||||
scale_input: bool = False,
|
||||
**kwargs):
|
||||
"""
|
||||
Initialize an MLPSpeculatorConfig
|
||||
|
||||
Args:
|
||||
vocab_size: int
|
||||
the model vocab size
|
||||
emb_dim: int
|
||||
the model embedding dimension
|
||||
inner_dim: int
|
||||
the inner dimension of the model. If 0, will be the emb_dim.
|
||||
n_predict: int
|
||||
the number of lookaheads for the speculator
|
||||
top_k_tokens_per_head: List[int]
|
||||
Number of tokens to consider from each head when forming the
|
||||
candidate tree.
|
||||
For each candidate branch in the tree, head n produces topk[n]
|
||||
additional sub-branches.
|
||||
NOTE: This parameter is currently unused.
|
||||
n_candidates: int
|
||||
number of child candidates to create per sequence
|
||||
tie_weights: bool
|
||||
If true, use a single set of weights for every model
|
||||
head/stage after the first. The initial projection
|
||||
from the base model may have a different size, so that
|
||||
stays separate.
|
||||
scale_input: bool
|
||||
if True, will scale the initial hidden states from
|
||||
the base model.
|
||||
"""
|
||||
if top_k_tokens_per_head is None:
|
||||
top_k_tokens_per_head = [5, 4, 3]
|
||||
assert len(top_k_tokens_per_head) == n_predict
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.inner_dim = inner_dim
|
||||
self.n_predict = n_predict
|
||||
self.top_k_tokens_per_head = top_k_tokens_per_head
|
||||
self.n_candidates = n_candidates
|
||||
self.num_lookahead_tokens = n_predict
|
||||
self.tie_weights = tie_weights
|
||||
self.scale_input = scale_input
|
||||
|
||||
super().__init__(**kwargs)
|
||||
178
vllm/transformers_utils/configs/mpt.py
Normal file
178
vllm/transformers_utils/configs/mpt.py
Normal file
@@ -0,0 +1,178 @@
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
|
||||
"""A HuggingFace-style model configuration."""
|
||||
import warnings
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
attn_config_defaults: Dict = {
|
||||
'attn_type': 'multihead_attention',
|
||||
'attn_pdrop': 0.0,
|
||||
'attn_impl': 'triton',
|
||||
'qk_ln': False,
|
||||
'clip_qkv': None,
|
||||
'softmax_scale': None,
|
||||
'prefix_lm': False,
|
||||
'attn_uses_sequence_id': False,
|
||||
'alibi': False,
|
||||
'alibi_bias_max': 8
|
||||
}
|
||||
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
|
||||
init_config_defaults: Dict = {
|
||||
'name': 'kaiming_normal_',
|
||||
'fan_mode': 'fan_in',
|
||||
'init_nonlinearity': 'relu',
|
||||
'init_div_is_residual': True,
|
||||
'emb_init_std': None,
|
||||
'emb_init_uniform_lim': None,
|
||||
'init_std': None,
|
||||
'init_gain': 0.0
|
||||
}
|
||||
|
||||
|
||||
class MPTConfig(PretrainedConfig):
|
||||
model_type = 'mpt'
|
||||
attribute_map = {
|
||||
'num_attention_heads': 'n_heads',
|
||||
'hidden_size': 'd_model',
|
||||
'num_hidden_layers': 'n_layers',
|
||||
}
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
expansion_ratio: int = 4,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 50368,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
learned_pos_emb: bool = True,
|
||||
attn_config: Dict = attn_config_defaults,
|
||||
ffn_config: Dict = ffn_config_defaults,
|
||||
init_device: str = 'cpu',
|
||||
logit_scale: Optional[Union[float, str]] = None,
|
||||
no_bias: bool = False,
|
||||
embedding_fraction: float = 1.0,
|
||||
norm_type: str = 'low_precision_layernorm',
|
||||
use_cache: bool = False,
|
||||
init_config: Dict = init_config_defaults,
|
||||
fc_type: str = 'torch',
|
||||
verbose: Optional[int] = None,
|
||||
**kwargs: Any):
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.expansion_ratio = expansion_ratio
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.learned_pos_emb = learned_pos_emb
|
||||
self.attn_config = attn_config
|
||||
self.ffn_config = ffn_config
|
||||
self.init_device = init_device
|
||||
self.logit_scale = logit_scale
|
||||
self.no_bias = no_bias
|
||||
self.embedding_fraction = embedding_fraction
|
||||
self.norm_type = norm_type
|
||||
self.use_cache = use_cache
|
||||
self.init_config = init_config
|
||||
self.fc_type = fc_type
|
||||
if verbose is not None:
|
||||
warnings.warn(DeprecationWarning(
|
||||
'verbose argument for MPTConfig is now ignored and '
|
||||
'will be removed. Use python_log_level instead.'),
|
||||
stacklevel=2)
|
||||
if 'name' in kwargs:
|
||||
del kwargs['name']
|
||||
if 'loss_fn' in kwargs:
|
||||
del kwargs['loss_fn']
|
||||
if self.attn_config.get('alibi', False):
|
||||
self.learned_pos_emb = False
|
||||
warnings.warn(
|
||||
f'alibi is turned on, setting `learned_pos_emb` '
|
||||
f'to {self.learned_pos_emb}`',
|
||||
stacklevel=2)
|
||||
super().__init__(**kwargs)
|
||||
self._validate_config()
|
||||
|
||||
def _set_config_defaults(
|
||||
self, config: Dict[str, Any],
|
||||
config_defaults: Dict[str, Any]) -> Dict[str, Any]:
|
||||
for (k, v) in config_defaults.items():
|
||||
if k not in config:
|
||||
config[k] = v
|
||||
return config
|
||||
|
||||
def _validate_config(self) -> None:
|
||||
self.attn_config = self._set_config_defaults(self.attn_config,
|
||||
attn_config_defaults)
|
||||
self.ffn_config = self._set_config_defaults(self.ffn_config,
|
||||
ffn_config_defaults)
|
||||
self.init_config = self._set_config_defaults(self.init_config,
|
||||
init_config_defaults)
|
||||
if self.d_model % self.n_heads != 0:
|
||||
raise ValueError('d_model must be divisible by n_heads')
|
||||
if any((
|
||||
prob < 0 or prob > 1 for prob in
|
||||
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
|
||||
)):
|
||||
raise ValueError(
|
||||
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
|
||||
"probabilities and must be between 0 and 1")
|
||||
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
|
||||
raise ValueError(
|
||||
f"Unknown attn_impl={self.attn_config['attn_impl']}")
|
||||
if self.attn_config['prefix_lm'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'prefix_lm only implemented with torch and triton attention.')
|
||||
if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
|
||||
'torch', 'triton'
|
||||
]:
|
||||
raise NotImplementedError(
|
||||
'alibi only implemented with torch and triton attention.')
|
||||
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'attn_uses_sequence_id only implemented with torch '
|
||||
'and triton attention.')
|
||||
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
|
||||
raise ValueError(
|
||||
'model.embedding_fraction must be between 0 (exclusive) '
|
||||
'and 1 (inclusive)!')
|
||||
if isinstance(self.logit_scale,
|
||||
str) and self.logit_scale != 'inv_sqrt_d_model':
|
||||
raise ValueError(
|
||||
f"self.logit_scale={self.logit_scale!r} is not recognized as "
|
||||
"an option; use numeric value or 'inv_sqrt_d_model'.")
|
||||
if self.init_config.get('name', None) is None:
|
||||
raise ValueError(
|
||||
f"self.init_config={self.init_config!r} 'name' needs to be set."
|
||||
)
|
||||
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
||||
warnings.warn(
|
||||
'Positional information not being provided to the model.',
|
||||
stacklevel=2)
|
||||
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
import transformer_engine.pytorch as te
|
||||
del te
|
||||
except Exception as exc:
|
||||
raise ImportError(
|
||||
'TransformerEngine import fail. `fc_type: te` requires '
|
||||
'TransformerEngine be installed. '
|
||||
'The required version of transformer_engine also requires '
|
||||
'FlashAttention v1.0.6 is installed:\n'
|
||||
'pip install flash-attn==1.0.6 --no-build-isolation \n'
|
||||
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
|
||||
) from exc
|
||||
if self.ffn_config['ffn_type'] == 'mptmlp':
|
||||
self.ffn_config['fc_type'] = self.fc_type
|
||||
elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
self.ffn_config['bias'] = not self.no_bias
|
||||
203
vllm/transformers_utils/configs/nemotron.py
Normal file
203
vllm/transformers_utils/configs/nemotron.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Nemotron model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class NemotronConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`NemotronModel`]. It is used to instantiate an Nemotron model
|
||||
according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar
|
||||
configuration to that of the Nemotron-8B.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be
|
||||
used to control the model outputs. Read the documentation from
|
||||
[`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 256000):
|
||||
Vocabulary size of the Nemotron model. Defines the number of
|
||||
different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`NemotronModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 6144):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 24576):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 48):
|
||||
Number of attention heads for each attention layer in the
|
||||
Transformer decoder.
|
||||
head_dim (`int`, *optional*):
|
||||
Projection weights dimension in multi-head attention. Set to
|
||||
hidden_size // num_attention_heads if None
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to
|
||||
implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use
|
||||
Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention
|
||||
(MQA) otherwise GQA is used. When converting a multi-head
|
||||
checkpoint to a GQA checkpoint, each group key and value
|
||||
head should be constructed by meanpooling all the original
|
||||
heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
|
||||
is not specified, will default to `num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
The non-linear activation function (function or string) in the
|
||||
decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used
|
||||
with.
|
||||
initializer_range (`float`, *optional*, defaults to 0.0134):
|
||||
The standard deviation of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values
|
||||
attentions (not used by all models). Only relevant if
|
||||
`config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 2):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 3):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
|
||||
Percentage of the query and keys which will have rotary embedding.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output
|
||||
projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj and down_proj layers in the MLP
|
||||
layers.
|
||||
|
||||
```python
|
||||
>>> from transformers import NemotronModel, NemotronConfig
|
||||
>>> # Initializing a Nemotron nemotron-15b style configuration
|
||||
>>> configuration = NemotronConfig()
|
||||
>>> # Initializing a model from the nemotron-15b style configuration
|
||||
>>> model = NemotronModel(configuration)
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "nemotron"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=256000,
|
||||
hidden_size=6144,
|
||||
intermediate_size=24576,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=48,
|
||||
head_dim=None,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="relu2",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.0134,
|
||||
norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=2,
|
||||
eos_token_id=3,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
partial_rotary_factor=0.5,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
head_dim = head_dim or kwargs.get("kv_channels", None)
|
||||
self.head_dim = head_dim if head_dim is not None else (
|
||||
hidden_size // num_attention_heads)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.norm_eps = norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
# for backward compatibility
|
||||
partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get(
|
||||
"rope_percentage", None) or partial_rotary_factor
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling,
|
||||
dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields, "
|
||||
f"`type` and `factor`, got {self.rope_scaling}")
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in [
|
||||
"linear", "dynamic"
|
||||
]:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s type field must be one of ['linear', "
|
||||
f"'dynamic'], got {rope_scaling_type}")
|
||||
if rope_scaling_factor is None or not isinstance(
|
||||
rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor field must be a float > 1, got "
|
||||
f"{rope_scaling_factor}")
|
||||
12
vllm/transformers_utils/configs/nvlm_d.py
Normal file
12
vllm/transformers_utils/configs/nvlm_d.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# Adapted from
|
||||
# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
|
||||
# --------------------------------------------------------
|
||||
# NVLM-D
|
||||
# Copyright (c) 2024 NVIDIA
|
||||
# Licensed under Apache 2.0 License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from .internvl import InternVLChatConfig
|
||||
|
||||
|
||||
class NVLM_D_Config(InternVLChatConfig):
|
||||
model_type = 'NVLM_D'
|
||||
131
vllm/transformers_utils/configs/qwen2vl.py
Normal file
131
vllm/transformers_utils/configs/qwen2vl.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Qwen2VL model configuration"""
|
||||
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class Qwen2VLVisionConfig(PretrainedConfig):
|
||||
model_type = "qwen2_vl"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
depth=32,
|
||||
embed_dim=1280,
|
||||
hidden_size=3584,
|
||||
hidden_act="quick_gelu",
|
||||
mlp_ratio=4,
|
||||
num_heads=16,
|
||||
in_channels=3,
|
||||
patch_size=14,
|
||||
spatial_merge_size=2,
|
||||
temporal_patch_size=2,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.depth = depth
|
||||
self.embed_dim = embed_dim
|
||||
self.hidden_size = hidden_size
|
||||
self.hidden_act = hidden_act
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.num_heads = num_heads
|
||||
self.in_channels = in_channels
|
||||
self.patch_size = patch_size
|
||||
self.spatial_merge_size = spatial_merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||
os.PathLike],
|
||||
**kwargs) -> "PretrainedConfig":
|
||||
cls._set_token_in_kwargs(kwargs)
|
||||
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
if config_dict.get("model_type") == "qwen2_vl":
|
||||
config_dict = config_dict["vision_config"]
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class Qwen2VLConfig(PretrainedConfig):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=152064,
|
||||
hidden_size=8192,
|
||||
intermediate_size=29568,
|
||||
num_hidden_layers=80,
|
||||
num_attention_heads=64,
|
||||
num_key_value_heads=8,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=32768,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-05,
|
||||
use_cache=True,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=1000000.0,
|
||||
use_sliding_window=False,
|
||||
sliding_window=4096,
|
||||
max_window_layers=80,
|
||||
attention_dropout=0.0,
|
||||
vision_config=None,
|
||||
rope_scaling=None,
|
||||
**kwargs,
|
||||
):
|
||||
if isinstance(vision_config, dict):
|
||||
self.vision_config = Qwen2VLVisionConfig(**vision_config)
|
||||
elif vision_config is None:
|
||||
self.vision_config = Qwen2VLVisionConfig()
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.use_sliding_window = use_sliding_window
|
||||
self.sliding_window = sliding_window
|
||||
self.max_window_layers = max_window_layers
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.attention_dropout = attention_dropout
|
||||
self.rope_scaling = rope_scaling
|
||||
|
||||
# NOTE: the following section from original transformers config
|
||||
# for Qwen2-VL is commented out to address rope config loading issue
|
||||
#
|
||||
# if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
# if self.rope_scaling["type"] == "mrope":
|
||||
# self.rope_scaling["type"] = "default"
|
||||
# self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
# rope_config_validation(self)
|
||||
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
245
vllm/transformers_utils/configs/solar.py
Normal file
245
vllm/transformers_utils/configs/solar.py
Normal file
@@ -0,0 +1,245 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Solar model configuration"""
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class SolarConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store
|
||||
the configuration of a [`SolarModel`].
|
||||
It is used to instantiate an LLaMA model
|
||||
according to the specified arguments,
|
||||
defining the model architecture.
|
||||
Instantiating a configuration with the
|
||||
defaults will yield a similar
|
||||
configuration to that of the LLaMA-7B.
|
||||
Configuration objects inherit from [`PretrainedConfig`]
|
||||
and can be used to control the model outputs.
|
||||
Read the documentation from [`PretrainedConfig`] for more information.
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the LLaMA model.
|
||||
Defines the number of different tokens
|
||||
that can be represented by the `inputs_ids`
|
||||
passed when calling [`SolarModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 4096):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 11008):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer
|
||||
in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that
|
||||
should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`,
|
||||
the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model
|
||||
will use Multi Query Attention (MQA)
|
||||
otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint,
|
||||
each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group.
|
||||
For more details checkout [this paper]
|
||||
(https://arxiv.org/pdf/2305.13245.pdf).
|
||||
If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||
The non-linear activation function (function or string)
|
||||
in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
||||
The maximum sequence length that this model might ever be used with.
|
||||
Solar 1 supports up to 2048 tokens,
|
||||
Solar 2 up to 4096, CodeSolar up to 16384.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of
|
||||
the truncated_normal_initializer for initializing
|
||||
all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return
|
||||
the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 1):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 2):
|
||||
End of stream token id.
|
||||
pretraining_tp (`int`, *optional*, defaults to 1):
|
||||
Experimental feature. Tensor parallelism rank
|
||||
used during pretraining.
|
||||
Please refer to [this
|
||||
document](https://huggingface.co/docs/
|
||||
transformers/main/
|
||||
perf_train_gpu_many#tensor-parallelism)
|
||||
to understand more about it. This value is
|
||||
necessary to ensure exact reproducibility
|
||||
of the pretraining results.
|
||||
Please refer to [this
|
||||
issue](https://github.com/pytorch/pytorch/issues/76232).
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for
|
||||
the RoPE embeddings.
|
||||
Currently supports two scaling
|
||||
strategies: linear and dynamic.
|
||||
Their scaling factor must be a float greater than 1.
|
||||
The expected format is
|
||||
`{"type": strategy name, "factor": scaling factor}`.
|
||||
When using this flag, don't update
|
||||
`max_position_embeddings` to the expected new maximum.
|
||||
See the following thread for more information on how
|
||||
these scaling strategies behave:
|
||||
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/
|
||||
dynamically_scaled_rope_further_increases/. This is an
|
||||
experimental feature, subject to breaking
|
||||
API changes in future versions.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value
|
||||
and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj, down_proj and gate_proj
|
||||
layers in the MLP layers.
|
||||
sliding_window (`int`, *optional*, defaults to 2047):
|
||||
Sliding window attention window size. If not specified,
|
||||
will default to `2047`.
|
||||
```python
|
||||
>>> from transformers import SolarModel, SolarConfig
|
||||
>>> # Initializing a Solar-pro style configuration
|
||||
>>> configuration = SolarConfig()
|
||||
>>> # Initializing a model from the Solar-pro style configuration
|
||||
>>> model = SolarModel(configuration)
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "solar"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="silu",
|
||||
max_position_embeddings=2048,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-6,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
pretraining_tp=1,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
sliding_window=2047,
|
||||
bskcn_1=None,
|
||||
bskcn_2=None,
|
||||
bskcn_3=None,
|
||||
bskcn_4=None,
|
||||
bskcn_tv=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.pretraining_tp = pretraining_tp
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.sliding_window = sliding_window
|
||||
self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
|
||||
self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
|
||||
self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
|
||||
self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
|
||||
self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]
|
||||
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _rope_scaling_validation(self):
|
||||
"""
|
||||
Validate the `rope_scaling` configuration.
|
||||
"""
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if (not isinstance(self.rope_scaling, dict)
|
||||
or len(self.rope_scaling) != 2):
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields,"
|
||||
" `type` and `factor`, "
|
||||
f"got {self.rope_scaling}")
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in [
|
||||
"linear",
|
||||
"dynamic",
|
||||
]:
|
||||
raise ValueError(f"`rope_scaling`'s type field must be one of "
|
||||
f"['linear', 'dynamic'], got {rope_scaling_type}")
|
||||
if (rope_scaling_factor is None
|
||||
or not isinstance(rope_scaling_factor, float)
|
||||
or rope_scaling_factor <= 1.0):
|
||||
raise ValueError(
|
||||
f"`rope_scaling`'s factor field must be a float > 1,"
|
||||
f" got {rope_scaling_factor}")
|
||||
99
vllm/transformers_utils/configs/ultravox.py
Normal file
99
vllm/transformers_utils/configs/ultravox.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import transformers
|
||||
|
||||
|
||||
class UltravoxConfig(transformers.PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
[`UltravoxForConditionalGeneration`]. It is used to instantiate an
|
||||
Ultravox model according to the specified arguments, defining the model
|
||||
architecture.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to
|
||||
control the model outputs. Read the documentation from [`PretrainedConfig`]
|
||||
for more information.
|
||||
|
||||
Args:
|
||||
audio_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
Custom audio config or dict
|
||||
text_config (`Union[AutoConfig, dict]`, *optional*):
|
||||
The config object of the text backbone. Can be any of `LlamaConfig`
|
||||
or `MistralConfig`.
|
||||
ignore_index (`int`, *optional*, defaults to -100):
|
||||
The ignore index for the loss function.
|
||||
audio_token_index (`int`, *optional*, defaults to 32000):
|
||||
The audio token index to encode the audio prompt.
|
||||
stack_factor (`int`, *optional*, defaults to 8):
|
||||
Audio downsampling factor for the multimodal projector.
|
||||
norm_init (`float`, *optional*, defaults to 0.4):
|
||||
The initialization value for the layer normalization.
|
||||
projector_act (`str`, *optional*, defaults to `"swiglu"`):
|
||||
The activation function used by the multimodal projector.
|
||||
text_model_lora_config (`LoraConfigSimplified`, *optional*):
|
||||
The LoRA configuration for finetuning the text model.
|
||||
audio_model_lora_config (`LoraConfigSimplified`, *optional*):
|
||||
The LoRA configuration for finetuning the audio model.
|
||||
"""
|
||||
|
||||
model_type = "ultravox"
|
||||
is_composition = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_config: Optional[Dict[str, Any]] = None,
|
||||
text_config: Optional[Dict[str, Any]] = None,
|
||||
audio_model_id: Optional[str] = None,
|
||||
text_model_id: Optional[str] = None,
|
||||
ignore_index: int = -100,
|
||||
audio_token_index: int = 32000,
|
||||
hidden_size: int = 4096,
|
||||
stack_factor: int = 8,
|
||||
norm_init: float = 0.4,
|
||||
projector_act: str = "swiglu",
|
||||
text_model_lora_config: Optional[Dict[str, Any]] = None,
|
||||
audio_model_lora_config: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.ignore_index = ignore_index
|
||||
|
||||
self.audio_model_id = audio_model_id
|
||||
self.text_model_id = text_model_id
|
||||
self.audio_token_index = audio_token_index
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.stack_factor = stack_factor
|
||||
self.norm_init = norm_init
|
||||
self.projector_act = projector_act
|
||||
|
||||
if text_model_id is not None:
|
||||
# Avoid circular import
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.text_config = get_config(text_model_id,
|
||||
trust_remote_code=False)
|
||||
else:
|
||||
text_config = text_config or {}
|
||||
self.text_config = transformers.CONFIG_MAPPING[text_config.get(
|
||||
"model_type", "llama")](**text_config)
|
||||
|
||||
if audio_model_id is not None:
|
||||
# Avoid circular import
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.audio_config = get_config(audio_model_id,
|
||||
trust_remote_code=False)
|
||||
else:
|
||||
audio_config = audio_config or {}
|
||||
self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
|
||||
"model_type", "whisper")](**audio_config)
|
||||
|
||||
self.text_model_lora_config = text_model_lora_config or {}
|
||||
self.audio_model_lora_config = audio_model_lora_config or {}
|
||||
|
||||
self.vocab_size = self.text_config.vocab_size
|
||||
|
||||
self.initializer_range = self.text_config.initializer_range
|
||||
|
||||
super().__init__(**kwargs)
|
||||
327
vllm/transformers_utils/detokenizer.py
Normal file
327
vllm/transformers_utils/detokenizer.py
Normal file
@@ -0,0 +1,327 @@
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
|
||||
Sequence, SequenceGroup)
|
||||
|
||||
from .tokenizer import AnyTokenizer
|
||||
from .tokenizer_group import BaseTokenizerGroup
|
||||
|
||||
|
||||
class Detokenizer:
|
||||
"""Provides methods to decode the output of a model into text."""
|
||||
|
||||
def __init__(self, tokenizer_group: BaseTokenizerGroup):
|
||||
self.tokenizer_group = tokenizer_group
|
||||
|
||||
def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
|
||||
"""Returns the HF tokenizer to use for a given sequence."""
|
||||
return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
|
||||
|
||||
def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
|
||||
prompt_logprobs: List[Optional[Dict[
|
||||
int, Logprob]]],
|
||||
position_offset: int) -> None:
|
||||
"""Decodes the logprobs for the prompt of a sequence group.
|
||||
|
||||
Args:
|
||||
seq_group: The sequence group to decode.
|
||||
prompt_logprobs: The logprobs to decode.
|
||||
position_offset: Offset of the first index of the logprobs
|
||||
relative to the start of the sequence (for chunked prefill).
|
||||
|
||||
Returns:
|
||||
The prompt logprobs with the decoded tokens.
|
||||
"""
|
||||
prms = seq_group.sampling_params
|
||||
assert prms is not None
|
||||
|
||||
# We can pick any sequence for the prompt.
|
||||
seq = seq_group.get_seqs()[0]
|
||||
# Only prompt, without the generated token.
|
||||
all_token_ids = seq.get_token_ids()
|
||||
prompt_token_ids = all_token_ids[:-1]
|
||||
tokenizer = self.get_tokenizer_for_seq(seq)
|
||||
prefix_offset = 0
|
||||
read_offset = 0
|
||||
next_iter_prefix_offset = 0
|
||||
next_iter_read_offset = 0
|
||||
next_iter_tokens: List[str] = []
|
||||
prev_tokens = None
|
||||
|
||||
for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
|
||||
prompt_logprobs):
|
||||
|
||||
# Absolute token position equals the index in the logprobs
|
||||
# list plus the offset of the entire logprobs list relative
|
||||
# to the start of the sequence.
|
||||
token_position = token_position_in_logprob + position_offset
|
||||
if not prompt_logprobs_for_token:
|
||||
continue
|
||||
for token_id, sample_logprob in prompt_logprobs_for_token.items():
|
||||
if (sample_logprob.decoded_token is None
|
||||
and token_id != VLLM_INVALID_TOKEN_ID):
|
||||
prompt_token_ids_with_token = (
|
||||
prompt_token_ids[:token_position] + [token_id])
|
||||
(new_tokens, new_text, new_prefix_offset,
|
||||
new_read_offset) = detokenize_incrementally(
|
||||
tokenizer=tokenizer,
|
||||
all_input_ids=prompt_token_ids_with_token,
|
||||
prev_tokens=prev_tokens,
|
||||
prefix_offset=prefix_offset,
|
||||
read_offset=read_offset,
|
||||
skip_special_tokens=prms.skip_special_tokens,
|
||||
spaces_between_special_tokens=prms.
|
||||
spaces_between_special_tokens,
|
||||
)
|
||||
|
||||
sample_logprob.decoded_token = new_text
|
||||
|
||||
# Use the offsets & prev tokens corresponding to
|
||||
# real tokens to ensure detokenization is consistent
|
||||
# actual with prompt.
|
||||
if token_id == all_token_ids[token_position]:
|
||||
next_iter_prefix_offset = new_prefix_offset
|
||||
next_iter_read_offset = new_read_offset
|
||||
next_iter_tokens = new_tokens
|
||||
|
||||
# Advance to the next token position.
|
||||
prefix_offset = next_iter_prefix_offset
|
||||
read_offset = next_iter_read_offset
|
||||
if prev_tokens is None:
|
||||
prev_tokens = next_iter_tokens
|
||||
else:
|
||||
prev_tokens.extend(next_iter_tokens)
|
||||
|
||||
def decode_sequence_inplace(self, seq: Sequence,
|
||||
prms: SamplingParams) -> int:
|
||||
"""Decodes the new token for a sequence. In-place operation.
|
||||
|
||||
Args:
|
||||
seq: The sequence to decode.
|
||||
prms: The sampling parameters used to generate the sequence.
|
||||
|
||||
Returns:
|
||||
The number of characters added to the output text.
|
||||
"""
|
||||
all_input_ids = seq.get_token_ids()
|
||||
token_id_generated_this_iteration = all_input_ids[-1]
|
||||
tokenizer = self.get_tokenizer_for_seq(seq)
|
||||
|
||||
# Convert prompt token IDs to tokens if necessary.
|
||||
# Do it here so that we don't have to repeat this
|
||||
# computation for each logprob.
|
||||
if seq.tokens is None:
|
||||
(seq.tokens, seq.prefix_offset,
|
||||
seq.read_offset) = convert_prompt_ids_to_tokens(
|
||||
tokenizer=tokenizer,
|
||||
prompt_ids=all_input_ids[:-1],
|
||||
skip_special_tokens=prms.skip_special_tokens,
|
||||
)
|
||||
|
||||
(new_tokens, new_decoded_token_text, prefix_offset,
|
||||
read_offset) = detokenize_incrementally(
|
||||
tokenizer=tokenizer,
|
||||
all_input_ids=all_input_ids,
|
||||
prev_tokens=seq.tokens,
|
||||
prefix_offset=seq.prefix_offset,
|
||||
read_offset=seq.read_offset,
|
||||
skip_special_tokens=prms.skip_special_tokens,
|
||||
spaces_between_special_tokens=prms.spaces_between_special_tokens,
|
||||
)
|
||||
|
||||
# Decode logprobs
|
||||
logprobs = seq.output_logprobs[-1]
|
||||
if logprobs:
|
||||
previous_tokens = all_input_ids[:-1]
|
||||
for token_id, sample_logprob in logprobs.items():
|
||||
# If the token was generated this iteration,
|
||||
# use the provided text.
|
||||
if token_id == token_id_generated_this_iteration:
|
||||
sample_logprob.decoded_token = new_decoded_token_text
|
||||
continue
|
||||
|
||||
if (sample_logprob.decoded_token is None
|
||||
and token_id != VLLM_INVALID_TOKEN_ID):
|
||||
all_input_ids_with_logprob = previous_tokens + [token_id]
|
||||
(_, new_text, _, _) = detokenize_incrementally(
|
||||
tokenizer=tokenizer,
|
||||
all_input_ids=all_input_ids_with_logprob,
|
||||
prev_tokens=seq.tokens,
|
||||
prefix_offset=seq.prefix_offset,
|
||||
read_offset=seq.read_offset,
|
||||
skip_special_tokens=prms.skip_special_tokens,
|
||||
spaces_between_special_tokens=prms.
|
||||
spaces_between_special_tokens,
|
||||
)
|
||||
sample_logprob.decoded_token = new_text
|
||||
|
||||
seq.tokens.extend(new_tokens)
|
||||
seq.prefix_offset = prefix_offset
|
||||
seq.read_offset = read_offset
|
||||
seq.output_text += new_decoded_token_text
|
||||
|
||||
return len(new_decoded_token_text)
|
||||
|
||||
|
||||
def _replace_none_with_empty(tokens: List[Optional[str]]):
|
||||
for i, token in enumerate(tokens):
|
||||
if token is None:
|
||||
tokens[i] = ""
|
||||
|
||||
|
||||
def _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer: AnyTokenizer,
|
||||
output_tokens: List[str],
|
||||
skip_special_tokens: bool,
|
||||
spaces_between_special_tokens: bool,
|
||||
) -> str:
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
|
||||
# NOTE(woosuk): The following code is slow because it runs a for loop over
|
||||
# the output_tokens. In Python, running a for loop over a list can be slow
|
||||
# even when the loop body is very simple.
|
||||
sub_texts: List[str] = []
|
||||
current_sub_text: List[str] = []
|
||||
all_special_tokens = set(tokenizer.all_special_tokens)
|
||||
for token in output_tokens:
|
||||
if skip_special_tokens and token in all_special_tokens:
|
||||
continue
|
||||
if token in tokenizer.get_added_vocab():
|
||||
if current_sub_text:
|
||||
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
||||
sub_texts.append(sub_text)
|
||||
current_sub_text = []
|
||||
sub_texts.append(token)
|
||||
else:
|
||||
current_sub_text.append(token)
|
||||
if current_sub_text:
|
||||
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
||||
sub_texts.append(sub_text)
|
||||
if spaces_between_special_tokens:
|
||||
return " ".join(sub_texts)
|
||||
else:
|
||||
return "".join(sub_texts)
|
||||
|
||||
|
||||
# 5 is an arbitrary value that should work for all
|
||||
# tokenizers (bigger = more conservative).
|
||||
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
|
||||
|
||||
|
||||
def convert_prompt_ids_to_tokens(
|
||||
tokenizer: AnyTokenizer,
|
||||
prompt_ids: List[int],
|
||||
skip_special_tokens: bool = False,
|
||||
) -> Tuple[List[str], int, int]:
|
||||
"""Converts the prompt ids to tokens and returns the tokens and offsets
|
||||
for incremental detokenization.
|
||||
|
||||
Note that not all tokens are converted to strings. Only the tokens that
|
||||
are necessary for incremental detokenization are converted to strings.
|
||||
"""
|
||||
# We do not need to convert the whole prompt to tokens.
|
||||
# Offset a little more in case we have special tokens.
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
read_offset = len(new_tokens)
|
||||
prefix_offset = max(
|
||||
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
|
||||
# This is required to guard against out-of-vocab prompt token ids
|
||||
_replace_none_with_empty(new_tokens) # type: ignore[arg-type]
|
||||
return new_tokens, prefix_offset, read_offset
|
||||
|
||||
|
||||
# Based on
|
||||
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
|
||||
# under Apache 2.0 license
|
||||
def detokenize_incrementally(
|
||||
tokenizer: AnyTokenizer,
|
||||
all_input_ids: List[int],
|
||||
prev_tokens: Optional[List[str]],
|
||||
prefix_offset: int,
|
||||
read_offset: int,
|
||||
skip_special_tokens: bool = False,
|
||||
spaces_between_special_tokens: bool = True,
|
||||
) -> Tuple[List[str], str, int, int]:
|
||||
"""Detokenizes the input ids incrementally and returns the new tokens
|
||||
and the new text.
|
||||
|
||||
If `prev_tokens` is None, this function will convert the input ids to
|
||||
tokens and return the tokens and the new text. Otherwise, it will return the
|
||||
new tokens and the new text.
|
||||
|
||||
This function will also return the new prefix offset and the new read
|
||||
offset to be used in the next iteration.
|
||||
|
||||
The offsets are necessary to defeat cleanup algorithms in the decode which
|
||||
decide to add a space or not depending on the surrounding ids.
|
||||
|
||||
Args:
|
||||
tokenizer: The tokenizer to use.
|
||||
all_input_ids: The input ids. The last id is the new token id.
|
||||
prev_tokens: The previous tokens. If None, this function will convert
|
||||
the input ids to tokens and return the tokens and the new text.
|
||||
prefix_offset: The prefix offset.
|
||||
read_offset: The read offset.
|
||||
skip_special_tokens: Whether to skip special tokens.
|
||||
spaces_between_special_tokens: Whether to add spaces between special
|
||||
tokens.
|
||||
"""
|
||||
new_token_id = all_input_ids[-1]
|
||||
# This is the first iteration for this sequence
|
||||
is_first_iter = prev_tokens is None
|
||||
if is_first_iter:
|
||||
(prev_tokens, prefix_offset,
|
||||
read_offset) = convert_prompt_ids_to_tokens(
|
||||
tokenizer,
|
||||
all_input_ids[:-1],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
assert prev_tokens is not None
|
||||
|
||||
# If the new token id is out of bounds, return an empty string.
|
||||
if 0 <= new_token_id < len(tokenizer):
|
||||
# Put new_token_id in a list so skip_special_tokens is respected
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
[new_token_id], skip_special_tokens=skip_special_tokens)
|
||||
if isinstance(new_tokens, str):
|
||||
new_tokens = [new_tokens]
|
||||
else:
|
||||
new_tokens = [""]
|
||||
output_tokens = prev_tokens + new_tokens
|
||||
|
||||
# If this is the first iteration, return all tokens.
|
||||
if is_first_iter:
|
||||
new_tokens = output_tokens
|
||||
|
||||
# The prefix text is necessary only to defeat cleanup algorithms in
|
||||
# the decode which decide to add a space or not depending on the
|
||||
# surrounding ids.
|
||||
if tokenizer.is_fast or not tokenizer.get_added_vocab():
|
||||
prefix_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:read_offset])
|
||||
new_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:])
|
||||
else:
|
||||
prefix_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:read_offset],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
new_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
|
||||
if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
|
||||
# utf-8 char at the end means it's a potential unfinished byte sequence
|
||||
# from byte fallback tokenization.
|
||||
# If it's in the middle, it's probably a real invalid id generated
|
||||
# by the model
|
||||
return new_tokens, "", prefix_offset, read_offset
|
||||
|
||||
new_text = new_text[len(prefix_text):]
|
||||
return new_tokens, new_text, read_offset, len(output_tokens)
|
||||
94
vllm/transformers_utils/processor.py
Normal file
94
vllm/transformers_utils/processor.py
Normal file
@@ -0,0 +1,94 @@
|
||||
from typing import Any, cast
|
||||
|
||||
|
||||
def get_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load a processor for the given model name via HuggingFace."""
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoProcessor
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
try:
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the processor. If the processor is "
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return cast(ProcessorMixin, processor)
|
||||
|
||||
|
||||
def get_image_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load an image processor for the given model name via HuggingFace."""
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoImageProcessor
|
||||
from transformers.image_processing_utils import BaseImageProcessor
|
||||
|
||||
try:
|
||||
processor = AutoImageProcessor.from_pretrained(
|
||||
processor_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
# Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
|
||||
if not trust_remote_code:
|
||||
err_msg = (
|
||||
"Failed to load the image processor. If the image processor is "
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return cast(BaseImageProcessor, processor)
|
||||
|
||||
|
||||
def get_video_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load a video processor for the given model name via HuggingFace."""
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers.image_processing_utils import BaseImageProcessor
|
||||
|
||||
processor = get_processor(
|
||||
processor_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return cast(BaseImageProcessor, processor.video_processor)
|
||||
193
vllm/transformers_utils/tokenizer.py
Normal file
193
vllm/transformers_utils/tokenizer.py
Normal file
@@ -0,0 +1,193 @@
|
||||
import os
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from types import MethodType
|
||||
from typing import Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
|
||||
from vllm.envs import VLLM_USE_MODELSCOPE
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.utils import make_async
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||
MistralTokenizer]
|
||||
|
||||
|
||||
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
||||
"""Get tokenizer with cached properties.
|
||||
|
||||
This will patch the tokenizer object in place.
|
||||
|
||||
By default, transformers will recompute multiple tokenizer properties
|
||||
each time they are called, leading to a significant slowdown. This
|
||||
function caches these properties for faster access."""
|
||||
|
||||
tokenizer_all_special_ids = set(tokenizer.all_special_ids)
|
||||
tokenizer_all_special_tokens_extended = (
|
||||
tokenizer.all_special_tokens_extended)
|
||||
tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
|
||||
tokenizer_len = len(tokenizer)
|
||||
|
||||
class CachedTokenizer(tokenizer.__class__): # type: ignore
|
||||
|
||||
@property
|
||||
def all_special_ids(self):
|
||||
return tokenizer_all_special_ids
|
||||
|
||||
@property
|
||||
def all_special_tokens(self):
|
||||
return tokenizer_all_special_tokens
|
||||
|
||||
@property
|
||||
def all_special_tokens_extended(self):
|
||||
return tokenizer_all_special_tokens_extended
|
||||
|
||||
def __len__(self):
|
||||
return tokenizer_len
|
||||
|
||||
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
|
||||
|
||||
tokenizer.__class__ = CachedTokenizer
|
||||
return tokenizer
|
||||
|
||||
|
||||
def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
|
||||
"""Patch _pad method to accept `padding_side` for older tokenizers."""
|
||||
orig_pad = tokenizer._pad
|
||||
|
||||
def _pad(
|
||||
self: PreTrainedTokenizer,
|
||||
*args,
|
||||
padding_side: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
if padding_side is not None and padding_side != self.padding_side:
|
||||
msg = ("`padding_side` argument is not supported by "
|
||||
f"{type(tokenizer).__name__} and will be ignored.")
|
||||
warnings.warn(msg, stacklevel=2)
|
||||
|
||||
return orig_pad(*args, **kwargs)
|
||||
|
||||
tokenizer._pad = MethodType(_pad, tokenizer)
|
||||
|
||||
|
||||
def get_tokenizer(
|
||||
tokenizer_name: Union[str, Path],
|
||||
*args,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
revision: Optional[str] = None,
|
||||
download_dir: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> AnyTokenizer:
|
||||
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope.
|
||||
"""
|
||||
if VLLM_USE_MODELSCOPE:
|
||||
# download model from ModelScope hub,
|
||||
# lazy import so that modelscope is not required for normal use.
|
||||
# pylint: disable=C.
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
# Only set the tokenizer here, model will be downloaded on the workers.
|
||||
if not os.path.exists(tokenizer_name):
|
||||
tokenizer_path = snapshot_download(
|
||||
model_id=tokenizer_name,
|
||||
cache_dir=download_dir,
|
||||
revision=revision,
|
||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||
# Ignore weights - we only need the tokenizer.
|
||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
||||
tokenizer_name = tokenizer_path
|
||||
|
||||
if tokenizer_mode == "slow":
|
||||
if kwargs.get("use_fast", False):
|
||||
raise ValueError(
|
||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||
kwargs["use_fast"] = False
|
||||
|
||||
if "truncation_side" not in kwargs:
|
||||
kwargs["truncation_side"] = "left"
|
||||
|
||||
# Separate model folder from file path for GGUF models
|
||||
is_gguf = check_gguf_file(tokenizer_name)
|
||||
if is_gguf:
|
||||
kwargs["gguf_file"] = Path(tokenizer_name).name
|
||||
tokenizer_name = Path(tokenizer_name).parent
|
||||
|
||||
# if tokenizer is from official mistral org
|
||||
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
|
||||
if is_from_mistral_org and tokenizer_mode != "mistral":
|
||||
warnings.warn(
|
||||
'It is strongly recommended to run mistral models with '
|
||||
'`--tokenizer_mode "mistral"` to ensure correct '
|
||||
'encoding and decoding.',
|
||||
FutureWarning,
|
||||
stacklevel=2)
|
||||
if tokenizer_mode == "mistral":
|
||||
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
|
||||
revision=revision)
|
||||
else:
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
tokenizer_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the tokenizer class not existing or not
|
||||
# currently being imported,
|
||||
# suggest using the --trust-remote-code flag.
|
||||
if not trust_remote_code and (
|
||||
"does not exist or is not currently imported." in str(e)
|
||||
or "requires you to execute the tokenizer file" in str(e)):
|
||||
err_msg = ("Failed to load the tokenizer. If the tokenizer "
|
||||
"is a custom tokenizer not yet available in the "
|
||||
"HuggingFace transformers library, consider "
|
||||
"setting `trust_remote_code=True` in LLM or using "
|
||||
"the `--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
# NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
|
||||
if type(tokenizer).__name__ in ("ChatGLMTokenizer",
|
||||
"ChatGLM4Tokenizer"):
|
||||
assert isinstance(tokenizer, PreTrainedTokenizer)
|
||||
patch_padding_side(tokenizer)
|
||||
|
||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
logger.warning(
|
||||
"Using a slow tokenizer. This might cause a significant "
|
||||
"slowdown. Consider using a fast tokenizer instead.")
|
||||
tokenizer = get_cached_tokenizer(tokenizer)
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
def get_lora_tokenizer(lora_request: LoRARequest, *args,
|
||||
**kwargs) -> Optional[AnyTokenizer]:
|
||||
if lora_request is None:
|
||||
return None
|
||||
try:
|
||||
tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
|
||||
except Exception as e:
|
||||
# No tokenizer was found in the LoRA folder,
|
||||
# use base model tokenizer
|
||||
logger.warning(
|
||||
"No tokenizer found in %s, using base model tokenizer instead. "
|
||||
"(Exception: %s)", lora_request.lora_path, e)
|
||||
tokenizer = None
|
||||
return tokenizer
|
||||
|
||||
|
||||
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
|
||||
52
vllm/transformers_utils/tokenizer_group/__init__.py
Normal file
52
vllm/transformers_utils/tokenizer_group/__init__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from typing import Optional, Type
|
||||
|
||||
from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
|
||||
TokenizerPoolConfig)
|
||||
from vllm.executor.ray_utils import ray
|
||||
|
||||
from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
|
||||
from .tokenizer_group import TokenizerGroup
|
||||
|
||||
if ray:
|
||||
from .ray_tokenizer_group import RayTokenizerGroupPool
|
||||
else:
|
||||
RayTokenizerGroupPool = None # type: ignore
|
||||
|
||||
|
||||
def init_tokenizer_from_configs(model_config: ModelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
parallel_config: ParallelConfig,
|
||||
enable_lora: bool):
|
||||
init_kwargs = dict(tokenizer_id=model_config.tokenizer,
|
||||
enable_lora=enable_lora,
|
||||
max_num_seqs=scheduler_config.max_num_seqs,
|
||||
max_input_length=None,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
revision=model_config.tokenizer_revision)
|
||||
|
||||
return get_tokenizer_group(parallel_config.tokenizer_pool_config,
|
||||
**init_kwargs)
|
||||
|
||||
|
||||
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
||||
**init_kwargs) -> BaseTokenizerGroup:
|
||||
tokenizer_cls: Type[BaseTokenizerGroup]
|
||||
if tokenizer_pool_config is None:
|
||||
tokenizer_cls = TokenizerGroup
|
||||
elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass(
|
||||
tokenizer_pool_config.pool_type, BaseTokenizerGroup):
|
||||
tokenizer_cls = tokenizer_pool_config.pool_type
|
||||
elif tokenizer_pool_config.pool_type == "ray":
|
||||
if RayTokenizerGroupPool is None:
|
||||
raise ImportError(
|
||||
"RayTokenizerGroupPool is not available. Please install "
|
||||
"the ray package to use the Ray tokenizer group pool.")
|
||||
tokenizer_cls = RayTokenizerGroupPool
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unknown pool type: {tokenizer_pool_config.pool_type}")
|
||||
return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs)
|
||||
|
||||
|
||||
__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,66 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
|
||||
from vllm.config import TokenizerPoolConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
|
||||
class BaseTokenizerGroup(ABC):
|
||||
"""A group of tokenizers that can be used for LoRA adapters."""
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
||||
**init_kwargs) -> "BaseTokenizerGroup":
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def ping(self) -> bool:
|
||||
"""Check if the tokenizer group is alive."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_max_input_len(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> Optional[int]:
|
||||
"""Get the maximum input length for the LoRA request."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_lora_tokenizer(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
"""Get a tokenizer for a LoRA request."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def get_lora_tokenizer_async(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
"""Get a tokenizer for a LoRA request."""
|
||||
pass
|
||||
|
||||
def check_health(self):
|
||||
"""Raise exception if the tokenizer group is unhealthy."""
|
||||
return
|
||||
240
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
Normal file
240
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
Normal file
@@ -0,0 +1,240 @@
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from ray.exceptions import ActorDiedError # type: ignore
|
||||
except ImportError:
|
||||
# For older versions of Ray
|
||||
from ray.exceptions import RayActorError as ActorDiedError # type: ignore
|
||||
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
||||
|
||||
from vllm.config import TokenizerPoolConfig
|
||||
from vllm.executor.ray_utils import ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .base_tokenizer_group import BaseTokenizerGroup
|
||||
from .tokenizer_group import TokenizerGroup
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
"""A Ray-based pool of TokenizerGroups for async tokenization."""
|
||||
|
||||
# Class to use for workers making up the pool.
|
||||
_worker_cls = TokenizerGroup
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
||||
**init_kwargs) -> "RayTokenizerGroupPool":
|
||||
if not tokenizer_pool_config:
|
||||
raise ValueError("tokenizer_pool_config must not be None.")
|
||||
ray_actor_options = (tokenizer_pool_config.extra_config or {
|
||||
"num_cpus": 0
|
||||
})
|
||||
ray_actor_options.setdefault(
|
||||
"scheduling_strategy",
|
||||
NodeAffinitySchedulingStrategy(
|
||||
node_id=ray.get_runtime_context().get_node_id(), soft=True))
|
||||
|
||||
# Carry over the env vars to the actors.
|
||||
# This is necessary for API keys and such.
|
||||
ray_actor_options.setdefault("runtime_env", {})
|
||||
_carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
|
||||
|
||||
init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
|
||||
init_kwargs["ray_actor_options"] = ray_actor_options
|
||||
|
||||
return cls(**init_kwargs)
|
||||
|
||||
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
||||
max_input_length: Optional[int], num_actors: int,
|
||||
ray_actor_options: dict, **tokenizer_config):
|
||||
# Store a local copy of the TokenizerGroup for quick access
|
||||
# to underlying HF tokenizers.
|
||||
self._tokenizer_config = {
|
||||
"tokenizer_id": tokenizer_id,
|
||||
"enable_lora": enable_lora,
|
||||
"max_num_seqs": max_num_seqs,
|
||||
"max_input_length": max_input_length,
|
||||
**tokenizer_config
|
||||
}
|
||||
self._local_tokenizer_group = self._worker_cls(
|
||||
**self._tokenizer_config, )
|
||||
|
||||
self._ray_tokenizer_group_cls = ray.remote(
|
||||
self._worker_cls).options(**ray_actor_options) # type: ignore
|
||||
self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
|
||||
self._idle_actors: Optional[asyncio.Queue] = None
|
||||
|
||||
# If set, actor is unhealthy. Will reraise on the next
|
||||
# check_health call.
|
||||
self._exception: Optional[ActorDiedError] = None
|
||||
|
||||
def _init_actor(self) -> ray.ObjectRef:
|
||||
return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)
|
||||
|
||||
@property
|
||||
def pool_size(self) -> int:
|
||||
return len(self.tokenizer_actors)
|
||||
|
||||
def ping(self):
|
||||
return ray.get([
|
||||
actor.ping.remote() # type: ignore
|
||||
for actor in self.tokenizer_actors
|
||||
])
|
||||
|
||||
def _ensure_queue_initialized(self):
|
||||
if self._idle_actors is None:
|
||||
self._idle_actors = asyncio.Queue()
|
||||
for actor in self.tokenizer_actors:
|
||||
self._idle_actors.put_nowait(actor)
|
||||
|
||||
def _finalize_encode(self, actor: ray.ObjectRef,
|
||||
original_actor: ray.ObjectRef, actor_is_alive: bool):
|
||||
assert self._idle_actors is not None
|
||||
# Cleanup the dead actor.
|
||||
if not actor_is_alive or original_actor is not actor:
|
||||
self.tokenizer_actors.remove(original_actor)
|
||||
if actor_is_alive:
|
||||
# Put the actor back in the queue.
|
||||
# This is done in a finally block to ensure that the actor is
|
||||
# always put back in the queue, even if an exception/cancellation
|
||||
# is raised.
|
||||
self._idle_actors.put_nowait(actor)
|
||||
# Add back the new actor.
|
||||
if original_actor is not actor:
|
||||
self.tokenizer_actors.append(actor)
|
||||
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group.
|
||||
|
||||
We pick an idle actor and use it to encode the prompt.
|
||||
The actor is then put back in the queue for future use.
|
||||
This is blocking.
|
||||
"""
|
||||
self.check_health()
|
||||
self._ensure_queue_initialized()
|
||||
assert self._idle_actors is not None
|
||||
|
||||
if self._idle_actors.empty():
|
||||
raise RuntimeError("No idle actors available.")
|
||||
actor = self._idle_actors.get_nowait()
|
||||
actor_is_alive = True
|
||||
original_actor = actor
|
||||
try:
|
||||
ret = ray.get(
|
||||
actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request))
|
||||
except ActorDiedError as e:
|
||||
# If the actor is dead, we first try to reinitialize it.
|
||||
logger.warning("%s died with ActorDiedError, reinitializing.",
|
||||
actor,
|
||||
exc_info=e)
|
||||
actor = self._init_actor()
|
||||
try:
|
||||
ret = ray.get(
|
||||
actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request))
|
||||
except ActorDiedError as e:
|
||||
logger.error(
|
||||
"%s died for second time in a row, marking "
|
||||
"RayTokenizerGroupPool as unhealthy.", actor)
|
||||
actor_is_alive = False
|
||||
if not self._exception:
|
||||
self._exception = e
|
||||
self.check_health()
|
||||
finally:
|
||||
self._finalize_encode(actor, original_actor, actor_is_alive)
|
||||
return ret
|
||||
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group.
|
||||
|
||||
We pick an idle actor and use it to encode the prompt.
|
||||
If there are no idle actors, we wait until one becomes
|
||||
available.
|
||||
The actor is then put back in the queue for future use.
|
||||
This is non-blocking.
|
||||
"""
|
||||
self.check_health()
|
||||
self._ensure_queue_initialized()
|
||||
assert self._idle_actors is not None
|
||||
|
||||
actor = await self._idle_actors.get()
|
||||
actor_is_alive = True
|
||||
original_actor = actor
|
||||
try:
|
||||
ret = await actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
except ActorDiedError as e:
|
||||
# If the actor is dead, we first try to reinitialize it.
|
||||
logger.warning("%s died with ActorDiedError, reinitializing.",
|
||||
actor,
|
||||
exc_info=e)
|
||||
actor = self._init_actor()
|
||||
try:
|
||||
ret = await actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
except ActorDiedError as e:
|
||||
logger.error(
|
||||
"%s died for second time in a row, marking "
|
||||
"RayTokenizerGroupPool as unhealthy.", actor)
|
||||
actor_is_alive = False
|
||||
if not self._exception:
|
||||
self._exception = e
|
||||
self.check_health()
|
||||
finally:
|
||||
self._finalize_encode(actor, original_actor, actor_is_alive)
|
||||
return ret
|
||||
|
||||
def get_max_input_len(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
"""Get the maximum input length for the LoRA request."""
|
||||
return self._local_tokenizer_group.get_max_input_len(lora_request)
|
||||
|
||||
def get_lora_tokenizer(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
|
||||
|
||||
async def get_lora_tokenizer_async(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
return await self._local_tokenizer_group.get_lora_tokenizer_async(
|
||||
lora_request)
|
||||
|
||||
def check_health(self):
|
||||
if self._exception:
|
||||
raise RuntimeError(
|
||||
"TokenizerGroupPool is unhealthy.") from self._exception
|
||||
|
||||
|
||||
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
|
||||
"""Copy over all current process environment variables to the runtime_env.
|
||||
|
||||
The variables in runtime_env will take precedence over the current process
|
||||
environment variables.
|
||||
|
||||
runtime_env will be modified in place."""
|
||||
env_vars = os.environ.copy()
|
||||
runtime_env.setdefault("env_vars", {})
|
||||
env_vars.update(runtime_env["env_vars"])
|
||||
runtime_env["env_vars"] = env_vars
|
||||
99
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
Normal file
99
vllm/transformers_utils/tokenizer_group/tokenizer_group.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from vllm.config import TokenizerPoolConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
get_lora_tokenizer,
|
||||
get_lora_tokenizer_async,
|
||||
get_tokenizer)
|
||||
from vllm.utils import LRUCache
|
||||
|
||||
from .base_tokenizer_group import BaseTokenizerGroup
|
||||
|
||||
|
||||
class TokenizerGroup(BaseTokenizerGroup):
|
||||
"""A group of tokenizers that can be used for LoRA adapters."""
|
||||
|
||||
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
||||
max_input_length: Optional[int], **tokenizer_config):
|
||||
self.tokenizer_id = tokenizer_id
|
||||
self.tokenizer_config = tokenizer_config
|
||||
self.enable_lora = enable_lora
|
||||
self.max_input_length = max_input_length
|
||||
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
|
||||
self.lora_tokenizers = LRUCache[AnyTokenizer](
|
||||
capacity=max_num_seqs if enable_lora else 0)
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
|
||||
**init_kwargs) -> "TokenizerGroup":
|
||||
return cls(**init_kwargs)
|
||||
|
||||
def ping(self) -> bool:
|
||||
"""Check if the tokenizer group is alive."""
|
||||
return True
|
||||
|
||||
def get_max_input_len(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
"""Get the maximum input length for the LoRA request."""
|
||||
return self.max_input_length
|
||||
|
||||
def _raise_if_input_too_long(self,
|
||||
encoded_tokens: List[int],
|
||||
lora_request: Optional[LoRARequest] = None):
|
||||
input_length = len(encoded_tokens)
|
||||
if lora_request:
|
||||
max_input_length = (lora_request.long_lora_max_len
|
||||
or self.max_input_length)
|
||||
else:
|
||||
max_input_length = self.max_input_length
|
||||
if max_input_length is not None and input_length > max_input_length:
|
||||
raise ValueError("Input too long.", input_length, max_input_length)
|
||||
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
tokenizer = self.get_lora_tokenizer(lora_request)
|
||||
ret = tokenizer.encode(prompt)
|
||||
self._raise_if_input_too_long(ret, lora_request)
|
||||
return ret
|
||||
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
tokenizer = await self.get_lora_tokenizer_async(lora_request)
|
||||
ret = tokenizer.encode(prompt)
|
||||
self._raise_if_input_too_long(ret, lora_request)
|
||||
return ret
|
||||
|
||||
def get_lora_tokenizer(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
if not lora_request or not self.enable_lora:
|
||||
return self.tokenizer
|
||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
||||
tokenizer = (get_lora_tokenizer(
|
||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
||||
return tokenizer
|
||||
else:
|
||||
return self.lora_tokenizers[lora_request.lora_int_id]
|
||||
|
||||
async def get_lora_tokenizer_async(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
if not lora_request or not self.enable_lora:
|
||||
return self.tokenizer
|
||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
||||
tokenizer = (await get_lora_tokenizer_async(
|
||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
||||
return tokenizer
|
||||
else:
|
||||
return self.lora_tokenizers[lora_request.lora_int_id]
|
||||
3
vllm/transformers_utils/tokenizers/__init__.py
Normal file
3
vllm/transformers_utils/tokenizers/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .mistral import MistralTokenizer
|
||||
|
||||
__all__ = ["MistralTokenizer"]
|
||||
Binary file not shown.
Binary file not shown.
229
vllm/transformers_utils/tokenizers/mistral.py
Normal file
229
vllm/transformers_utils/tokenizers/mistral.py
Normal file
@@ -0,0 +1,229 @@
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
from huggingface_hub import HfApi, hf_hub_download
|
||||
# yapf: disable
|
||||
from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import (
|
||||
MistralTokenizer as PublicMistralTokenizer)
|
||||
# yapf: enable
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import (
|
||||
SentencePieceTokenizer)
|
||||
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
|
||||
Tekkenizer)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
|
||||
|
||||
@dataclass
|
||||
class Encoding:
|
||||
input_ids: List[int]
|
||||
|
||||
|
||||
def find_tokenizer_file(files: List[str]):
|
||||
file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
|
||||
|
||||
matched_files = [file for file in files if file_pattern.match(file)]
|
||||
if len(matched_files) > 1:
|
||||
raise OSError(f"Found {len(matched_files)} files matching the "
|
||||
f"pattern: {file_pattern}. Make sure only one Mistral "
|
||||
f"tokenizer is present in {files}.")
|
||||
elif len(matched_files) == 0:
|
||||
raise OSError(f"Found {len(matched_files)} files matching the "
|
||||
f"pattern: {file_pattern}. Make sure that a Mistral "
|
||||
f"tokenizer is present in {files}.")
|
||||
|
||||
return matched_files[0]
|
||||
|
||||
|
||||
class MistralTokenizer:
|
||||
|
||||
def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
|
||||
self.mistral = tokenizer
|
||||
self.instruct = tokenizer.instruct_tokenizer
|
||||
|
||||
tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
|
||||
if isinstance(tokenizer_, Tekkenizer):
|
||||
# Make sure special tokens will not raise
|
||||
tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
|
||||
|
||||
self._vocab = {
|
||||
token: idx
|
||||
for idx, token in enumerate(tokenizer_.vocab())
|
||||
}
|
||||
elif isinstance(tokenizer_, SentencePieceTokenizer):
|
||||
self._vocab = {
|
||||
token: idx
|
||||
for idx, token in enumerate(tokenizer_.vocab())
|
||||
}
|
||||
else:
|
||||
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
|
||||
|
||||
self.tokenizer = tokenizer_
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
path_or_repo_id: str,
|
||||
*,
|
||||
revision: Optional[str] = None) -> "MistralTokenizer":
|
||||
if not Path(path_or_repo_id).exists():
|
||||
assert len(path_or_repo_id.split("/")) == 2, (
|
||||
"You have either provided a non-existent path: "
|
||||
"{path_or_repo_id} or an invalid HF Hub repo id.")
|
||||
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
|
||||
path_or_repo_id, revision)
|
||||
elif Path(path_or_repo_id).is_dir():
|
||||
tokenizer_file_name = find_tokenizer_file(
|
||||
os.listdir(path_or_repo_id))
|
||||
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
|
||||
else:
|
||||
assert Path(
|
||||
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
|
||||
|
||||
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
|
||||
return cls(mistral_tokenizer)
|
||||
|
||||
@staticmethod
|
||||
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
|
||||
revision: Optional[str]) -> str:
|
||||
api = HfApi()
|
||||
repo_info = api.model_info(tokenizer_name)
|
||||
files = [s.rfilename for s in repo_info.siblings]
|
||||
|
||||
filename = find_tokenizer_file(files)
|
||||
|
||||
tokenizer_file = hf_hub_download(tokenizer_name,
|
||||
filename=filename,
|
||||
revision=revision)
|
||||
return tokenizer_file
|
||||
|
||||
# the following attributes are set to fit VLLM's design
|
||||
@property
|
||||
def all_special_tokens_extended(self) -> List[str]:
|
||||
return []
|
||||
|
||||
@property
|
||||
def all_special_tokens(self) -> List[str]:
|
||||
return []
|
||||
|
||||
@property
|
||||
def all_special_ids(self) -> List[int]:
|
||||
return []
|
||||
|
||||
@property
|
||||
def bos_token_id(self) -> int:
|
||||
return self.tokenizer.bos_id
|
||||
|
||||
@property
|
||||
def eos_token_id(self) -> int:
|
||||
return self.tokenizer.eos_id
|
||||
|
||||
@property
|
||||
def is_fast(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self._vocab)
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.vocab_size
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
prompt: str,
|
||||
add_special_tokens: bool = False,
|
||||
truncation: bool = False,
|
||||
max_length: Optional[int] = None,
|
||||
):
|
||||
# Mistral Tokenizers should not add special tokens
|
||||
input_ids = self.encode(prompt)
|
||||
|
||||
if truncation:
|
||||
input_ids = input_ids[:max_length]
|
||||
|
||||
return Encoding(input_ids=input_ids)
|
||||
|
||||
def get_vocab(self) -> Dict[str, int]:
|
||||
return self._vocab
|
||||
|
||||
def get_added_vocab(self) -> Dict[str, int]:
|
||||
# Mistral tokenizers have no added vocabulary
|
||||
return {}
|
||||
|
||||
def encode(self, prompt: str) -> List[int]:
|
||||
# `encode` should only be used for prompt completion
|
||||
# it should never be used for chat_completion.
|
||||
# For chat completion use `apply_chat_template`
|
||||
return self.tokenizer.encode(prompt, bos=True, eos=False)
|
||||
|
||||
def apply_chat_template(self,
|
||||
messages: List["ChatCompletionMessageParam"],
|
||||
tools: Optional[Dict[str, Any]] = None,
|
||||
**kwargs) -> List[int]:
|
||||
|
||||
request = ChatCompletionRequest(messages=messages,
|
||||
tools=tools) # type: ignore[type-var]
|
||||
encoded = self.mistral.encode_chat_completion(request)
|
||||
|
||||
# encode-decode to get clean prompt
|
||||
return encoded.tokens
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
||||
if isinstance(self.tokenizer, Tekkenizer):
|
||||
tokens = [
|
||||
t for t in tokens
|
||||
if t not in self.tokenizer._all_special_tokens
|
||||
]
|
||||
|
||||
if any(isinstance(t, bytes) for t in tokens):
|
||||
# we need to encode and decode all tokens again
|
||||
shift = self.tokenizer.num_special_tokens
|
||||
byte_tokens = [
|
||||
t.encode("utf-8") if not isinstance(t, bytes) else t
|
||||
for t in tokens
|
||||
]
|
||||
ids = [
|
||||
self.tokenizer._tekken_token2id_nospecial[t] + shift
|
||||
for t in byte_tokens
|
||||
]
|
||||
decoded = self.tokenizer.decode(ids)
|
||||
else:
|
||||
decoded = "".join(tokens)
|
||||
else:
|
||||
decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type]
|
||||
|
||||
return decoded
|
||||
|
||||
def decode(self, ids: Union[List[int], int]) -> str:
|
||||
if isinstance(ids, int):
|
||||
ids = [ids]
|
||||
return self.tokenizer.decode(ids)
|
||||
|
||||
def convert_ids_to_tokens(
|
||||
self,
|
||||
ids: List[int],
|
||||
skip_special_tokens: bool = True,
|
||||
) -> List[str]:
|
||||
# TODO(Patrick) - potentially allow special tokens to not be skipped
|
||||
assert (
|
||||
skip_special_tokens
|
||||
), "Skipping special tokens is not supported for Mistral tokenizers."
|
||||
|
||||
assert isinstance(self.tokenizer,
|
||||
(Tekkenizer, SentencePieceTokenizer)), type(
|
||||
self.tokenizer)
|
||||
|
||||
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
|
||||
|
||||
if any(t.strip() == "<EFBFBD>" for t in tokens):
|
||||
# if any stripped decoded token is undefined
|
||||
# because it's invalid unicode then pass bytes
|
||||
# See: https://github.com/vllm-project/vllm/pull/8640
|
||||
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
|
||||
|
||||
return tokens
|
||||
16
vllm/transformers_utils/utils.py
Normal file
16
vllm/transformers_utils/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
|
||||
def check_gguf_file(model: Union[str, PathLike]) -> bool:
|
||||
"""Check if the file is a GGUF model."""
|
||||
model = Path(model)
|
||||
if not model.is_file():
|
||||
return False
|
||||
elif model.suffix == ".gguf":
|
||||
return True
|
||||
|
||||
with open(model, "rb") as f:
|
||||
header = f.read(4)
|
||||
return header == b"GGUF"
|
||||
Reference in New Issue
Block a user