init
This commit is contained in:
0
vllm/transformers_utils/__init__.py
Normal file
0
vllm/transformers_utils/__init__.py
Normal file
52
vllm/transformers_utils/config.py
Normal file
52
vllm/transformers_utils/config.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoConfig, PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs import *
|
||||
|
||||
_CONFIG_REGISTRY = {
|
||||
"chatglm": ChatGLMConfig,
|
||||
"mpt": MPTConfig,
|
||||
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
|
||||
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
|
||||
"starcoder2": Starcoder2Config,
|
||||
"minicpm": CPMDragonflyConfig,
|
||||
}
|
||||
|
||||
|
||||
def get_config(model: str,
|
||||
trust_remote_code: bool,
|
||||
revision: Optional[str] = None,
|
||||
code_revision: Optional[str] = None) -> PretrainedConfig:
|
||||
# FIXME(woosuk): This is a temporary fix for StarCoder2.
|
||||
# Remove this when the model is supported by HuggingFace transformers.
|
||||
if "bigcode" in model and "starcoder2" in model:
|
||||
config_class = _CONFIG_REGISTRY["starcoder2"]
|
||||
config = config_class.from_pretrained(model,
|
||||
revision=revision,
|
||||
code_revision=code_revision)
|
||||
return config
|
||||
|
||||
try:
|
||||
config = AutoConfig.from_pretrained(
|
||||
model,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
code_revision=code_revision)
|
||||
except ValueError as e:
|
||||
if (not trust_remote_code and
|
||||
"requires you to execute the configuration file" in str(e)):
|
||||
err_msg = (
|
||||
"Failed to load the model config. If the model is a custom "
|
||||
"model not yet available in the HuggingFace transformers "
|
||||
"library, consider setting `trust_remote_code=True` in LLM "
|
||||
"or using the `--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
if config.model_type in _CONFIG_REGISTRY:
|
||||
config_class = _CONFIG_REGISTRY[config.model_type]
|
||||
config = config_class.from_pretrained(model,
|
||||
revision=revision,
|
||||
code_revision=code_revision)
|
||||
return config
|
||||
16
vllm/transformers_utils/configs/__init__.py
Normal file
16
vllm/transformers_utils/configs/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
|
||||
from vllm.transformers_utils.configs.mpt import MPTConfig
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
from vllm.transformers_utils.configs.falcon import RWConfig
|
||||
from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config
|
||||
from vllm.transformers_utils.configs.cpm import CPMDragonflyConfig
|
||||
|
||||
__all__ = [
|
||||
"ChatGLMConfig",
|
||||
"MPTConfig",
|
||||
"RWConfig",
|
||||
"Starcoder2Config",
|
||||
"CPMDragonflyConfig",
|
||||
]
|
||||
68
vllm/transformers_utils/configs/chatglm.py
Normal file
68
vllm/transformers_utils/configs/chatglm.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# coding=utf-8
|
||||
# Adapted from
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class ChatGLMConfig(PretrainedConfig):
|
||||
model_type = "chatglm"
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "num_layers",
|
||||
"n_head_kv": "multi_query_group_num",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
num_layers=28,
|
||||
padded_vocab_size=65024,
|
||||
hidden_size=4096,
|
||||
ffn_hidden_size=13696,
|
||||
kv_channels=128,
|
||||
num_attention_heads=32,
|
||||
seq_length=2048,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layernorm_epsilon=1e-5,
|
||||
rmsnorm=True,
|
||||
apply_residual_connection_post_layernorm=False,
|
||||
post_layer_norm=True,
|
||||
add_bias_linear=False,
|
||||
add_qkv_bias=False,
|
||||
interleaved_qkv=False,
|
||||
bias_dropout_fusion=True,
|
||||
multi_query_attention=False,
|
||||
multi_query_group_num=1,
|
||||
apply_query_key_layer_scaling=True,
|
||||
attention_softmax_in_fp32=True,
|
||||
fp32_residual_connection=False,
|
||||
quantization_bit=0,
|
||||
pre_seq_len=None,
|
||||
prefix_projection=False,
|
||||
**kwargs):
|
||||
self.num_layers = num_layers
|
||||
self.vocab_size = padded_vocab_size
|
||||
self.padded_vocab_size = padded_vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.ffn_hidden_size = ffn_hidden_size
|
||||
self.kv_channels = kv_channels
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.seq_length = seq_length
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layernorm_epsilon = layernorm_epsilon
|
||||
self.rmsnorm = rmsnorm
|
||||
self.apply_residual_connection_post_layernorm = (
|
||||
apply_residual_connection_post_layernorm)
|
||||
self.post_layer_norm = post_layer_norm
|
||||
self.add_bias_linear = add_bias_linear
|
||||
self.add_qkv_bias = add_qkv_bias
|
||||
self.bias_dropout_fusion = bias_dropout_fusion
|
||||
self.multi_query_attention = multi_query_attention
|
||||
self.multi_query_group_num = multi_query_group_num
|
||||
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
|
||||
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
|
||||
self.fp32_residual_connection = fp32_residual_connection
|
||||
self.quantization_bit = quantization_bit
|
||||
self.pre_seq_len = pre_seq_len
|
||||
self.prefix_projection = prefix_projection
|
||||
self.interleaved_qkv = interleaved_qkv
|
||||
super().__init__(**kwargs)
|
||||
113
vllm/transformers_utils/configs/cpm.py
Normal file
113
vllm/transformers_utils/configs/cpm.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The OpenBMB team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
import math
|
||||
|
||||
|
||||
class CPMDragonflyConfig(PretrainedConfig):
|
||||
model_type = "cpm_dragonfly"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"scale_emb": "scale_emb",
|
||||
"scale_depth": "scale_depth",
|
||||
"scale": "scale",
|
||||
"attention_scale": "attention_scale",
|
||||
"qk_norm": "qk_norm",
|
||||
"ffn_gated": "ffn_gated",
|
||||
} # model specific to common
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=4096,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=32,
|
||||
dim_head=128,
|
||||
intermediate_size=11008,
|
||||
num_hidden_layers=32,
|
||||
dropout_p=0.0,
|
||||
hidden_act="silu",
|
||||
scale=True,
|
||||
scale_emb: float=1.,
|
||||
scale_depth: float=-1,
|
||||
dim_model_base:int=None,
|
||||
rms_norm_eps=1e-5,
|
||||
init_std=0.02,
|
||||
half: bool = True,
|
||||
half_type = 'bf16',
|
||||
mask_modules: Optional[List[Tuple[bool, bool]]] = None,
|
||||
use_flash_attn: bool = True,
|
||||
flash_attn_mask_shape="1d",
|
||||
flash_impl="cuda",
|
||||
base=10000,
|
||||
non_checkpointing_layers_num:int = 0,
|
||||
attention_scale=1,
|
||||
qk_norm=False,
|
||||
ffn_gated=True,
|
||||
tie_lm_head=False,
|
||||
max_position_embeddings=2048,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.dim_head = dim_head
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.dropout_p = dropout_p
|
||||
self.hidden_act = hidden_act
|
||||
self.scale = scale
|
||||
self.scale_emb = scale_emb
|
||||
self.half = half
|
||||
self.half_type = half_type
|
||||
self.dim_model_base = dim_model_base
|
||||
self.scale_depth = scale_depth
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.init_std = init_std
|
||||
self.flash_impl = flash_impl
|
||||
self.mask_modules = mask_modules
|
||||
self.use_flash_attn = use_flash_attn
|
||||
self.flash_attn_mask_shape = flash_attn_mask_shape
|
||||
self.base = base
|
||||
self.attention_scale=attention_scale
|
||||
self.qk_norm = qk_norm
|
||||
self.ffn_gated = ffn_gated
|
||||
self.non_checkpointing_layers_num = non_checkpointing_layers_num
|
||||
self.tie_lm_head = tie_lm_head
|
||||
self.use_bfloat16 = True if self.half_type == 'bf16' else False
|
||||
super().__init__(architectures=["CPMDragonflyForCausalLM"])
|
||||
|
||||
@property
|
||||
def scale_width(self,):
|
||||
if self.scale:
|
||||
return self.hidden_size / self.dim_model_base
|
||||
else:
|
||||
return 1.
|
||||
|
||||
@property
|
||||
def scale_states(self,):
|
||||
if self.scale:
|
||||
return self.scale_depth / math.sqrt(self.num_hidden_layers)
|
||||
else:
|
||||
return 1.
|
||||
87
vllm/transformers_utils/configs/falcon.py
Normal file
87
vllm/transformers_utils/configs/falcon.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# Adapted from
|
||||
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Falcon configuration"""
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class RWConfig(PretrainedConfig):
|
||||
model_type = "falcon"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {
|
||||
"num_hidden_layers": "n_layer",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_kv_heads": "n_head_kv",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=250880,
|
||||
hidden_size=64,
|
||||
n_layer=2,
|
||||
n_head=8,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
hidden_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
multi_query=True,
|
||||
n_head_kv=None,
|
||||
alibi=False,
|
||||
bias=False,
|
||||
parallel_attn=False,
|
||||
new_decoder_architecture=False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.vocab_size = vocab_size
|
||||
# Backward compatibility with n_embed kwarg
|
||||
n_embed = kwargs.pop("n_embed", None)
|
||||
self.hidden_size = hidden_size if n_embed is None else n_embed
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.hidden_dropout = hidden_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
self.multi_query = multi_query
|
||||
self.n_head_kv = 1 if n_head_kv is None else n_head_kv
|
||||
self.alibi = alibi
|
||||
self.bias = bias
|
||||
self.parallel_attn = parallel_attn
|
||||
self.new_decoder_architecture = new_decoder_architecture
|
||||
|
||||
if self.hidden_size == 8192:
|
||||
# Hack for falcon-40b
|
||||
self.new_decoder_architecture = True
|
||||
|
||||
super().__init__(bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs)
|
||||
|
||||
@property
|
||||
def head_dim(self):
|
||||
return self.hidden_size // self.n_head
|
||||
|
||||
@property
|
||||
def rotary(self):
|
||||
return not self.alibi
|
||||
232
vllm/transformers_utils/configs/mpt.py
Normal file
232
vllm/transformers_utils/configs/mpt.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# coding=utf-8
|
||||
# Copied from
|
||||
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
|
||||
"""A HuggingFace-style model configuration."""
|
||||
import warnings
|
||||
from typing import Any, Dict, Optional, Union
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
attn_config_defaults: Dict = {
|
||||
'attn_type': 'multihead_attention',
|
||||
'attn_pdrop': 0.0,
|
||||
'attn_impl': 'triton',
|
||||
'qk_ln': False,
|
||||
'clip_qkv': None,
|
||||
'softmax_scale': None,
|
||||
'prefix_lm': False,
|
||||
'attn_uses_sequence_id': False,
|
||||
'alibi': False,
|
||||
'alibi_bias_max': 8
|
||||
}
|
||||
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
|
||||
init_config_defaults: Dict = {
|
||||
'name': 'kaiming_normal_',
|
||||
'fan_mode': 'fan_in',
|
||||
'init_nonlinearity': 'relu',
|
||||
'init_div_is_residual': True,
|
||||
'emb_init_std': None,
|
||||
'emb_init_uniform_lim': None,
|
||||
'init_std': None,
|
||||
'init_gain': 0.0
|
||||
}
|
||||
|
||||
|
||||
class MPTConfig(PretrainedConfig):
|
||||
model_type = 'mpt'
|
||||
attribute_map = {
|
||||
'num_attention_heads': 'n_heads',
|
||||
'hidden_size': 'd_model',
|
||||
'num_hidden_layers': 'n_layers',
|
||||
}
|
||||
|
||||
# pylint: disable=dangerous-default-value
|
||||
def __init__(self,
|
||||
d_model: int = 2048,
|
||||
n_heads: int = 16,
|
||||
n_layers: int = 24,
|
||||
expansion_ratio: int = 4,
|
||||
max_seq_len: int = 2048,
|
||||
vocab_size: int = 50368,
|
||||
resid_pdrop: float = 0.0,
|
||||
emb_pdrop: float = 0.0,
|
||||
learned_pos_emb: bool = True,
|
||||
attn_config: Dict = attn_config_defaults,
|
||||
ffn_config: Dict = ffn_config_defaults,
|
||||
init_device: str = 'cpu',
|
||||
logit_scale: Optional[Union[float, str]] = None,
|
||||
no_bias: bool = False,
|
||||
embedding_fraction: float = 1.0,
|
||||
norm_type: str = 'low_precision_layernorm',
|
||||
use_cache: bool = False,
|
||||
init_config: Dict = init_config_defaults,
|
||||
fc_type: str = 'torch',
|
||||
verbose: Optional[int] = None,
|
||||
**kwargs: Any):
|
||||
"""The MPT configuration class.
|
||||
Args:
|
||||
d_model (int): The size of the embedding dimension of the model.
|
||||
n_heads (int): The number of attention heads.
|
||||
n_layers (int): The number of layers in the model.
|
||||
expansion_ratio (int): The ratio of the up/down scale in the ffn.
|
||||
max_seq_len (int): The maximum sequence length of the model.
|
||||
vocab_size (int): The size of the vocabulary.
|
||||
resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
|
||||
emb_pdrop (float): The dropout probability for the embedding layer.
|
||||
learned_pos_emb (bool): Whether to use learned positional embeddings
|
||||
attn_config (Dict): A dictionary used to configure the model's attention module:
|
||||
attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
|
||||
attn_pdrop (float): The dropout probability for the attention layers.
|
||||
attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
|
||||
qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
|
||||
clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
|
||||
this value.
|
||||
softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
|
||||
use the default scale of ``1/sqrt(d_keys)``.
|
||||
prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
|
||||
extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
|
||||
can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
|
||||
attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
|
||||
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
|
||||
which sub-sequence each token belongs to.
|
||||
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
|
||||
alibi (bool): Whether to use the alibi bias instead of position embeddings.
|
||||
alibi_bias_max (int): The maximum value of the alibi bias.
|
||||
kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
|
||||
ffn_config (Dict): A dictionary used to configure the model's ffn module:
|
||||
ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
|
||||
init_device (str): The device to use for parameter initialization.
|
||||
logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
|
||||
no_bias (bool): Whether to use bias in all layers.
|
||||
verbose (int): The verbosity level. 0 is silent.
|
||||
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
|
||||
norm_type (str): choose type of norm to use
|
||||
use_cache (bool): Whether or not the model should return the last key/values attentions
|
||||
init_config (Dict): A dictionary used to configure the model initialization:
|
||||
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
|
||||
'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
|
||||
'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
|
||||
init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
|
||||
emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
|
||||
emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
|
||||
used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
|
||||
init_std (float): The standard deviation of the normal distribution used to initialize the model,
|
||||
if using the baseline_ parameter initialization scheme.
|
||||
init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
|
||||
fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
|
||||
init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
|
||||
---
|
||||
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
|
||||
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
|
||||
"""
|
||||
self.d_model = d_model
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.expansion_ratio = expansion_ratio
|
||||
self.max_seq_len = max_seq_len
|
||||
self.vocab_size = vocab_size
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.emb_pdrop = emb_pdrop
|
||||
self.learned_pos_emb = learned_pos_emb
|
||||
self.attn_config = attn_config
|
||||
self.ffn_config = ffn_config
|
||||
self.init_device = init_device
|
||||
self.logit_scale = logit_scale
|
||||
self.no_bias = no_bias
|
||||
self.embedding_fraction = embedding_fraction
|
||||
self.norm_type = norm_type
|
||||
self.use_cache = use_cache
|
||||
self.init_config = init_config
|
||||
self.fc_type = fc_type
|
||||
if verbose is not None:
|
||||
warnings.warn(DeprecationWarning(
|
||||
'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
|
||||
),
|
||||
stacklevel=2)
|
||||
if 'name' in kwargs:
|
||||
del kwargs['name']
|
||||
if 'loss_fn' in kwargs:
|
||||
del kwargs['loss_fn']
|
||||
if self.attn_config.get('alibi', False):
|
||||
self.learned_pos_emb = False
|
||||
warnings.warn(
|
||||
f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`',
|
||||
stacklevel=2)
|
||||
super().__init__(**kwargs)
|
||||
self._validate_config()
|
||||
|
||||
def _set_config_defaults(
|
||||
self, config: Dict[str, Any],
|
||||
config_defaults: Dict[str, Any]) -> Dict[str, Any]:
|
||||
for (k, v) in config_defaults.items():
|
||||
if k not in config:
|
||||
config[k] = v
|
||||
return config
|
||||
|
||||
def _validate_config(self) -> None:
|
||||
self.attn_config = self._set_config_defaults(self.attn_config,
|
||||
attn_config_defaults)
|
||||
self.ffn_config = self._set_config_defaults(self.ffn_config,
|
||||
ffn_config_defaults)
|
||||
self.init_config = self._set_config_defaults(self.init_config,
|
||||
init_config_defaults)
|
||||
if self.d_model % self.n_heads != 0:
|
||||
raise ValueError('d_model must be divisible by n_heads')
|
||||
if any((
|
||||
prob < 0 or prob > 1 for prob in
|
||||
[self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]
|
||||
)):
|
||||
raise ValueError(
|
||||
"self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long
|
||||
)
|
||||
if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
|
||||
raise ValueError(
|
||||
f"Unknown attn_impl={self.attn_config['attn_impl']}")
|
||||
if self.attn_config['prefix_lm'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'prefix_lm only implemented with torch and triton attention.')
|
||||
if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
|
||||
'torch', 'triton'
|
||||
]:
|
||||
raise NotImplementedError(
|
||||
'alibi only implemented with torch and triton attention.')
|
||||
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
|
||||
'attn_impl'] not in ['torch', 'triton']:
|
||||
raise NotImplementedError(
|
||||
'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long
|
||||
)
|
||||
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
|
||||
raise ValueError(
|
||||
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long
|
||||
)
|
||||
if isinstance(self.logit_scale,
|
||||
str) and self.logit_scale != 'inv_sqrt_d_model':
|
||||
raise ValueError(
|
||||
f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long
|
||||
)
|
||||
if self.init_config.get('name', None) is None:
|
||||
raise ValueError(
|
||||
f"self.init_config={self.init_config!r} 'name' needs to be set."
|
||||
)
|
||||
if not self.learned_pos_emb and (not self.attn_config['alibi']):
|
||||
warnings.warn(
|
||||
'Positional information not being provided to the model.',
|
||||
stacklevel=2)
|
||||
if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
import transformer_engine.pytorch as te
|
||||
del te
|
||||
except Exception as exc:
|
||||
raise ImportError(
|
||||
# pylint: disable=line-too-long
|
||||
'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. '
|
||||
+
|
||||
'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n'
|
||||
+ 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
|
||||
'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
|
||||
) from exc
|
||||
if self.ffn_config['ffn_type'] == 'mptmlp':
|
||||
self.ffn_config['fc_type'] = self.fc_type
|
||||
elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
|
||||
self.ffn_config['bias'] = not self.no_bias
|
||||
127
vllm/transformers_utils/configs/starcoder2.py
Normal file
127
vllm/transformers_utils/configs/starcoder2.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class Starcoder2Config(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
|
||||
Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.
|
||||
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 49152):
|
||||
Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`Starcoder2Model`]
|
||||
hidden_size (`int`, *optional*, defaults to 3072):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 12288):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 30):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 24):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_key_value_heads (`int`, *optional*, defaults to 2):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
|
||||
allows sequence of up to 4096*32 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
norm_epsilon (`float`, *optional*, defaults to 1e-05):
|
||||
Epsilon value for the layer norm
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
bos_token_id (`int`, *optional*, defaults to 50256):
|
||||
The id of the "beginning-of-sequence" token.
|
||||
eos_token_id (`int`, *optional*, defaults to 50256):
|
||||
The id of the "end-of-sequence" token.
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
sliding_window (`int`, *optional*):
|
||||
Sliding window attention window size. If not specified, will default to `None` (no sliding window).
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
residual_dropout (`float`, *optional*, defaults to 0.0):
|
||||
Residual connection dropout value.
|
||||
embedding_dropout (`float`, *optional*, defaults to 0.0):
|
||||
Embedding dropout.
|
||||
use_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use bias term on linear layers of the model.
|
||||
|
||||
|
||||
```python
|
||||
>>> from transformers import Starcoder2Model, Starcoder2Config
|
||||
|
||||
>>> # Initializing a Starcoder2 7B style configuration
|
||||
>>> configuration = Starcoder2Config()
|
||||
|
||||
>>> # Initializing a model from the Starcoder2 7B style configuration
|
||||
>>> model = Starcoder2Model(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "starcoder2"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=49152,
|
||||
hidden_size=3072,
|
||||
intermediate_size=12288,
|
||||
num_hidden_layers=30,
|
||||
num_attention_heads=24,
|
||||
num_key_value_heads=2,
|
||||
hidden_act="gelu_pytorch_tanh",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.018042,
|
||||
norm_epsilon=1e-5,
|
||||
use_cache=True,
|
||||
bos_token_id=50256,
|
||||
eos_token_id=50256,
|
||||
rope_theta=10000.0,
|
||||
sliding_window=None,
|
||||
attention_dropout=0.0,
|
||||
residual_dropout=0.0,
|
||||
embedding_dropout=0.0,
|
||||
use_bias=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.sliding_window = sliding_window
|
||||
self.use_bias = use_bias
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.norm_epsilon = norm_epsilon
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.attention_dropout = attention_dropout
|
||||
self.residual_dropout = residual_dropout
|
||||
self.embedding_dropout = embedding_dropout
|
||||
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
if self.architectures is None:
|
||||
self.architectures = ['Starcoder2ForCausalLM']
|
||||
245
vllm/transformers_utils/tokenizer.py
Normal file
245
vllm/transformers_utils/tokenizer.py
Normal file
@@ -0,0 +1,245 @@
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import make_async, LRUCache
|
||||
from vllm.transformers_utils.tokenizers import *
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def get_tokenizer(
|
||||
tokenizer_name: str,
|
||||
*args,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_revision: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||
"""Gets a tokenizer for the given model name via Huggingface."""
|
||||
if tokenizer_mode == "slow":
|
||||
if kwargs.get("use_fast", False):
|
||||
raise ValueError(
|
||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||
kwargs["use_fast"] = False
|
||||
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
tokenizer_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_revision=tokenizer_revision,
|
||||
**kwargs)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the tokenizer class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
if (not trust_remote_code and
|
||||
("does not exist or is not currently imported." in str(e)
|
||||
or "requires you to execute the tokenizer file" in str(e))):
|
||||
err_msg = (
|
||||
"Failed to load the tokenizer. If the tokenizer is a custom "
|
||||
"tokenizer not yet available in the HuggingFace transformers "
|
||||
"library, consider setting `trust_remote_code=True` in LLM "
|
||||
"or using the `--trust-remote-code` flag in the CLI.")
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
except AttributeError as e:
|
||||
if "BaichuanTokenizer" in str(e):
|
||||
# This is for the error "'BaichuanTokenizer' object has no
|
||||
# attribute 'sp_model'".
|
||||
tokenizer = BaichuanTokenizer.from_pretrained(
|
||||
tokenizer_name,
|
||||
*args,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_revision=tokenizer_revision,
|
||||
**kwargs)
|
||||
else:
|
||||
raise e
|
||||
|
||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
logger.warning(
|
||||
"Using a slow tokenizer. This might cause a significant "
|
||||
"slowdown. Consider using a fast tokenizer instead.")
|
||||
return tokenizer
|
||||
|
||||
|
||||
def get_lora_tokenizer(lora_request: LoRARequest, *args,
|
||||
**kwargs) -> Optional[PreTrainedTokenizer]:
|
||||
if lora_request is None:
|
||||
return None
|
||||
try:
|
||||
tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
|
||||
**kwargs)
|
||||
except OSError as e:
|
||||
# No tokenizer was found in the LoRA folder,
|
||||
# use base model tokenizer
|
||||
logger.warning(
|
||||
f"No tokenizer found in {lora_request.lora_local_path}, "
|
||||
"using base model tokenizer instead. "
|
||||
f"(Exception: {str(e)})")
|
||||
tokenizer = None
|
||||
return tokenizer
|
||||
|
||||
|
||||
get_lora_tokenizer_async = make_async(get_lora_tokenizer)
|
||||
|
||||
|
||||
class TokenizerGroup:
|
||||
"""A group of tokenizers that can be used for LoRA adapters."""
|
||||
|
||||
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
|
||||
max_input_length: Optional[int], **tokenizer_config):
|
||||
self.tokenizer_id = tokenizer_id
|
||||
self.tokenizer_config = tokenizer_config
|
||||
self.enable_lora = enable_lora
|
||||
self.max_input_length = max_input_length
|
||||
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
|
||||
if enable_lora:
|
||||
self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
|
||||
else:
|
||||
self.lora_tokenizers = None
|
||||
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
tokenizer = self.get_lora_tokenizer(lora_request)
|
||||
return tokenizer.encode(prompt)
|
||||
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None) -> List[int]:
|
||||
tokenizer = await self.get_lora_tokenizer_async(lora_request)
|
||||
return tokenizer.encode(prompt)
|
||||
|
||||
def get_lora_tokenizer(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
|
||||
if not lora_request or not self.enable_lora:
|
||||
return self.tokenizer
|
||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
||||
tokenizer = (get_lora_tokenizer(
|
||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
||||
return tokenizer
|
||||
else:
|
||||
return self.lora_tokenizers.get(lora_request.lora_int_id)
|
||||
|
||||
async def get_lora_tokenizer_async(
|
||||
self,
|
||||
lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
|
||||
if not lora_request or not self.enable_lora:
|
||||
return self.tokenizer
|
||||
if lora_request.lora_int_id not in self.lora_tokenizers:
|
||||
tokenizer = (await get_lora_tokenizer_async(
|
||||
lora_request, **self.tokenizer_config) or self.tokenizer)
|
||||
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
|
||||
return tokenizer
|
||||
else:
|
||||
return self.lora_tokenizers.get(lora_request.lora_int_id)
|
||||
|
||||
|
||||
def _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||
output_tokens: List[str],
|
||||
skip_special_tokens: bool,
|
||||
spaces_between_special_tokens: bool,
|
||||
) -> str:
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
|
||||
# NOTE(woosuk): The following code is slow because it runs a for loop over
|
||||
# the output_tokens. In Python, running a for loop over a list can be slow
|
||||
# even when the loop body is very simple.
|
||||
sub_texts = []
|
||||
current_sub_text = []
|
||||
all_special_tokens = set(tokenizer.all_special_tokens)
|
||||
for token in output_tokens:
|
||||
if skip_special_tokens and token in all_special_tokens:
|
||||
continue
|
||||
if token in tokenizer.get_added_vocab():
|
||||
if current_sub_text:
|
||||
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
||||
sub_texts.append(sub_text)
|
||||
current_sub_text = []
|
||||
sub_texts.append(token)
|
||||
else:
|
||||
current_sub_text.append(token)
|
||||
if current_sub_text:
|
||||
sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
|
||||
sub_texts.append(sub_text)
|
||||
if spaces_between_special_tokens:
|
||||
return " ".join(sub_texts)
|
||||
else:
|
||||
return "".join(sub_texts)
|
||||
|
||||
|
||||
# Based on
|
||||
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
|
||||
# under Apache 2.0 license
|
||||
def detokenize_incrementally(
|
||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||
all_input_ids: List[int],
|
||||
prev_tokens: Optional[List[str]],
|
||||
prefix_offset: int = 0,
|
||||
read_offset: int = 0,
|
||||
skip_special_tokens: bool = False,
|
||||
spaces_between_special_tokens: bool = True,
|
||||
) -> Tuple[List[str], str, int, int]:
|
||||
new_token_id = all_input_ids[-1]
|
||||
# This is the first iteration for this sequence
|
||||
if prev_tokens is None:
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
all_input_ids, skip_special_tokens=skip_special_tokens)
|
||||
output_tokens = new_tokens
|
||||
# 5 is an arbitrary value that should work for all
|
||||
# tokenizers (bigger = more conservative).
|
||||
# Subtract 1 extra to account for the generated token.
|
||||
prefix_offset = max(len(output_tokens) - 6, 0)
|
||||
# If the first new token is a special token, we can't skip 1 extra token
|
||||
if skip_special_tokens and new_token_id in tokenizer.all_special_ids:
|
||||
read_offset = max(len(output_tokens), 0)
|
||||
else:
|
||||
read_offset = max(len(output_tokens) - 1, 0)
|
||||
else:
|
||||
# Put new_token_id in a list so skip_special_tokens is respected
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
[new_token_id], skip_special_tokens=skip_special_tokens)
|
||||
output_tokens = prev_tokens + new_tokens
|
||||
|
||||
# The prefix text is necessary only to defeat cleanup algorithms in
|
||||
# the decode which decide to add a space or not depending on the
|
||||
# surrounding ids.
|
||||
if tokenizer.is_fast or not tokenizer.get_added_vocab():
|
||||
prefix_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:read_offset])
|
||||
new_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:])
|
||||
else:
|
||||
prefix_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:read_offset],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
new_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
output_tokens[prefix_offset:],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
)
|
||||
|
||||
if len(new_text) > len(prefix_text) and not new_text.endswith("<EFBFBD>"):
|
||||
# utf-8 char at the end means it's a potential unfinished byte sequence
|
||||
# from byte fallback tokenization.
|
||||
# If it's in the middle, it's probably a real invalid id generated
|
||||
# by the model
|
||||
new_text = new_text[len(prefix_text):]
|
||||
return new_tokens, new_text, read_offset, len(output_tokens)
|
||||
else:
|
||||
return new_tokens, "", prefix_offset, read_offset
|
||||
5
vllm/transformers_utils/tokenizers/__init__.py
Normal file
5
vllm/transformers_utils/tokenizers/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
|
||||
|
||||
__all__ = [
|
||||
"BaichuanTokenizer",
|
||||
]
|
||||
263
vllm/transformers_utils/tokenizers/baichuan.py
Normal file
263
vllm/transformers_utils/tokenizers/baichuan.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# yapf: disable
|
||||
# Adapted from
|
||||
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
|
||||
# This includes a fix suggested in
|
||||
# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
|
||||
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
|
||||
|
||||
import os
|
||||
from shutil import copyfile
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {},
|
||||
"tokenizer_file": {},
|
||||
}
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
|
||||
|
||||
|
||||
class BaichuanTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
|
||||
|
||||
Args:
|
||||
vocab_file (`str`):
|
||||
Path to the vocabulary file.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["input_ids", "attention_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
unk_token="<unk>",
|
||||
bos_token="<s>",
|
||||
eos_token="</s>",
|
||||
pad_token=None,
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
add_bos_token=True,
|
||||
add_eos_token=False,
|
||||
clean_up_tokenization_spaces=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
bos_token = (
|
||||
AddedToken(bos_token, lstrip=False, rstrip=False)
|
||||
if isinstance(bos_token, str)
|
||||
else bos_token
|
||||
)
|
||||
eos_token = (
|
||||
AddedToken(eos_token, lstrip=False, rstrip=False)
|
||||
if isinstance(eos_token, str)
|
||||
else eos_token
|
||||
)
|
||||
unk_token = (
|
||||
AddedToken(unk_token, lstrip=False, rstrip=False)
|
||||
if isinstance(unk_token, str)
|
||||
else unk_token
|
||||
)
|
||||
pad_token = (
|
||||
AddedToken(pad_token, lstrip=False, rstrip=False)
|
||||
if isinstance(pad_token, str)
|
||||
else pad_token
|
||||
)
|
||||
self.vocab_file = vocab_file
|
||||
self.add_bos_token = add_bos_token
|
||||
self.add_eos_token = add_eos_token
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
add_bos_token=add_bos_token,
|
||||
add_eos_token=add_eos_token,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
return state
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__dict__ = d
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(self.vocab_file)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
"""Returns vocab size"""
|
||||
return self.sp_model.get_piece_size()
|
||||
|
||||
def get_vocab(self):
|
||||
"""Returns vocab as a dict"""
|
||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def _tokenize(self, text):
|
||||
"""Returns a tokenized string."""
|
||||
return self.sp_model.encode(text, out_type=str)
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
"""Converts a token (str) in an id using the vocab."""
|
||||
return self.sp_model.piece_to_id(token)
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
token = self.sp_model.IdToPiece(index)
|
||||
return token
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
"""Converts a sequence of tokens (string) in a single string."""
|
||||
current_sub_tokens = []
|
||||
out_string = ""
|
||||
prev_is_special = False
|
||||
for i, token in enumerate(tokens):
|
||||
# make sure that special tokens are not decoded using sentencepiece model
|
||||
if token in self.all_special_tokens:
|
||||
if not prev_is_special and i != 0:
|
||||
out_string += " "
|
||||
out_string += self.sp_model.decode(current_sub_tokens) + token
|
||||
prev_is_special = True
|
||||
current_sub_tokens = []
|
||||
else:
|
||||
current_sub_tokens.append(token)
|
||||
prev_is_special = False
|
||||
out_string += self.sp_model.decode(current_sub_tokens)
|
||||
return out_string
|
||||
|
||||
def save_vocabulary(
|
||||
self, save_directory, filename_prefix: Optional[str] = None
|
||||
) -> Tuple[str]:
|
||||
"""
|
||||
Save the vocabulary and special tokens file to a directory.
|
||||
|
||||
Args:
|
||||
save_directory (`str`):
|
||||
The directory in which to save the vocabulary.
|
||||
|
||||
Returns:
|
||||
`Tuple(str)`: Paths to the files saved.
|
||||
"""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
||||
return
|
||||
out_vocab_file = os.path.join(
|
||||
save_directory,
|
||||
(filename_prefix + "-" if filename_prefix else "")
|
||||
+ VOCAB_FILES_NAMES["vocab_file"],
|
||||
)
|
||||
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(
|
||||
out_vocab_file
|
||||
) and os.path.isfile(self.vocab_file):
|
||||
copyfile(self.vocab_file, out_vocab_file)
|
||||
elif not os.path.isfile(self.vocab_file):
|
||||
with open(out_vocab_file, "wb") as fi:
|
||||
content_spiece_model = self.sp_model.serialized_model_proto()
|
||||
fi.write(content_spiece_model)
|
||||
|
||||
return (out_vocab_file,)
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
||||
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
||||
|
||||
output = bos_token_id + token_ids_0 + eos_token_id
|
||||
|
||||
if token_ids_1 is not None:
|
||||
output = output + bos_token_id + token_ids_1 + eos_token_id
|
||||
|
||||
return output
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self,
|
||||
token_ids_0: List[int],
|
||||
token_ids_1: Optional[List[int]] = None,
|
||||
already_has_special_tokens: bool = False,
|
||||
) -> List[int]:
|
||||
"""
|
||||
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer `prepare_for_model` method.
|
||||
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
|
||||
Returns:
|
||||
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
"""
|
||||
if already_has_special_tokens:
|
||||
return super().get_special_tokens_mask(
|
||||
token_ids_0=token_ids_0,
|
||||
token_ids_1=token_ids_1,
|
||||
already_has_special_tokens=True,
|
||||
)
|
||||
|
||||
bos_token_id = [1] if self.add_bos_token else []
|
||||
eos_token_id = [1] if self.add_eos_token else []
|
||||
|
||||
if token_ids_1 is None:
|
||||
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
|
||||
return (
|
||||
bos_token_id
|
||||
+ ([0] * len(token_ids_0))
|
||||
+ eos_token_id
|
||||
+ bos_token_id
|
||||
+ ([0] * len(token_ids_1))
|
||||
+ eos_token_id
|
||||
)
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
|
||||
sequence pair mask has the following format:
|
||||
|
||||
```
|
||||
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||
| first sequence | second sequence |
|
||||
```
|
||||
|
||||
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
||||
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of ids.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
|
||||
Returns:
|
||||
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
||||
"""
|
||||
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
||||
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
||||
|
||||
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
|
||||
|
||||
if token_ids_1 is not None:
|
||||
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
|
||||
|
||||
return output
|
||||
Reference in New Issue
Block a user