forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
17  vllm-v0.6.2/vllm/transformers_utils/__init__.py  Normal file
@@ -0,0 +1,17 @@
from vllm.envs import VLLM_USE_MODELSCOPE

if VLLM_USE_MODELSCOPE:
    # Patch here, before each import happens
    import modelscope
    from packaging import version

    # patch_hub begins from modelscope>=1.18.1
    if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
        raise ImportError(
            'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
            'install by `pip install modelscope -U`')

    from modelscope.utils.hf_util import patch_hub

    # Patch hub to download models from modelscope to speed up.
    patch_hub()
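The patch above is gated by the VLLM_USE_MODELSCOPE environment variable, which vllm/envs.py reads at first import, so it must be set before vllm is imported. A minimal opt-in sketch (the model id is illustrative):

    import os
    os.environ["VLLM_USE_MODELSCOPE"] = "True"  # read once, at first vllm import

    from vllm import LLM  # importing vllm then applies patch_hub()

    # Model files are now resolved against ModelScope instead of the HF Hub.
    llm = LLM(model="qwen/Qwen2.5-7B-Instruct")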
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
570  vllm-v0.6.2/vllm/transformers_utils/config.py  Normal file
@@ -0,0 +1,570 @@
import enum
import json
from pathlib import Path
from typing import Any, Dict, Optional, Type, Union

import huggingface_hub
from huggingface_hub import (file_exists, hf_hub_download,
                             try_to_load_from_cache)
from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError,
                                   RepositoryNotFoundError,
                                   RevisionNotFoundError)
from transformers import GenerationConfig, PretrainedConfig
from transformers.models.auto.image_processing_auto import (
    get_image_processor_config)
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME

from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
# yapf conflicts with isort for this block
# yapf: disable
from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                             EAGLEConfig, ExaoneConfig,
                                             H2OVLChatConfig,
                                             InternVLChatConfig, JAISConfig,
                                             MedusaConfig, MllamaConfig,
                                             MLPSpeculatorConfig, MPTConfig,
                                             NemotronConfig, NVLM_D_Config,
                                             RWConfig, SolarConfig,
                                             UltravoxConfig)
# yapf: enable
from vllm.transformers_utils.utils import check_gguf_file

if VLLM_USE_MODELSCOPE:
    from modelscope import AutoConfig
else:
    from transformers import AutoConfig

MISTRAL_CONFIG_NAME = "params.json"

logger = init_logger(__name__)

_CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = {
    "mllama": MllamaConfig
}

_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    "chatglm": ChatGLMConfig,
    "dbrx": DbrxConfig,
    "mpt": MPTConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
    "jais": JAISConfig,
    "mlp_speculator": MLPSpeculatorConfig,
    "medusa": MedusaConfig,
    "eagle": EAGLEConfig,
    "exaone": ExaoneConfig,
    "h2ovl_chat": H2OVLChatConfig,
    "internvl_chat": InternVLChatConfig,
    "nemotron": NemotronConfig,
    "NVLM_D": NVLM_D_Config,
    "solar": SolarConfig,
    "ultravox": UltravoxConfig,
    **_CONFIG_REGISTRY_OVERRIDE_HF
}


class ConfigFormat(str, enum.Enum):
    AUTO = "auto"
    HF = "hf"
    MISTRAL = "mistral"


def file_or_path_exists(model: Union[str, Path], config_name, revision,
                        token) -> bool:
    if Path(model).exists():
        return (Path(model) / config_name).is_file()

    # Offline mode support: Check if config file is cached already
    cached_filepath = try_to_load_from_cache(repo_id=model,
                                             filename=config_name,
                                             revision=revision)
    if isinstance(cached_filepath, str):
        # The config file exists in cache - we can continue trying to load
        return True

    # NB: file_exists will only check for the existence of the config file on
    # hf_hub. This will fail in offline mode.
    try:
        return file_exists(model, config_name, revision=revision, token=token)
    except huggingface_hub.errors.OfflineModeIsEnabled:
        # Don't raise in offline mode, all we know is that we don't have this
        # file cached.
        return False


def patch_rope_scaling(config: PretrainedConfig) -> None:
    """Provide backwards compatibility for RoPE."""
    text_config = getattr(config, "text_config", None)
    if text_config is not None:
        patch_rope_scaling(text_config)

    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is not None:
        patch_rope_scaling_dict(rope_scaling)


def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None:
    if "rope_type" not in rope_scaling and "type" in rope_scaling:
        rope_scaling["rope_type"] = rope_scaling["type"]
        logger.info("Replacing legacy 'type' key with 'rope_type'")

    if "rope_type" not in rope_scaling:
        raise ValueError("rope_scaling should have a 'rope_type' key")

    if rope_scaling["rope_type"] == "su":
        rope_scaling["rope_type"] = "longrope"
        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
    elif rope_scaling["rope_type"] == "mrope":
        assert "mrope_section" in rope_scaling
        rope_scaling["rope_type"] = "default"
        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")


def uses_mrope(config: PretrainedConfig) -> bool:
    """Detect if the model with this config uses M-ROPE."""
    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is None:
        return False

    return "mrope_section" in rope_scaling


def is_encoder_decoder(config: PretrainedConfig) -> bool:
    """Detect if the model with this config is used as an encoder/decoder."""
    text_config = getattr(config, "text_config", None)
    if text_config is not None:
        return is_encoder_decoder(text_config)

    return getattr(config, "is_encoder_decoder", False)


def get_config(
    model: Union[str, Path],
    trust_remote_code: bool,
    revision: Optional[str] = None,
    code_revision: Optional[str] = None,
    config_format: ConfigFormat = ConfigFormat.AUTO,
    token: Optional[str] = None,
    **kwargs,
) -> PretrainedConfig:
    # Separate model folder from file path for GGUF models
    is_gguf = check_gguf_file(model)
    if is_gguf:
        kwargs["gguf_file"] = Path(model).name
        model = Path(model).parent

    if config_format == ConfigFormat.AUTO:
        if is_gguf or file_or_path_exists(
                model, HF_CONFIG_NAME, revision=revision, token=token):
            config_format = ConfigFormat.HF
        elif file_or_path_exists(model,
                                 MISTRAL_CONFIG_NAME,
                                 revision=revision,
                                 token=token):
            config_format = ConfigFormat.MISTRAL
        else:
            # If we're in offline mode and found no valid config format, then
            # raise an offline mode error to indicate to the user that they
            # don't have files cached and may need to go online.
            # This is conveniently triggered by calling file_exists().
            file_exists(model, HF_CONFIG_NAME, revision=revision, token=token)

            raise ValueError(f"No supported config format found in {model}")

    if config_format == ConfigFormat.HF:
        config_dict, _ = PretrainedConfig.get_config_dict(
            model,
            revision=revision,
            code_revision=code_revision,
            token=token,
            **kwargs,
        )

        # Use custom model class if it's in our registry
        model_type = config_dict.get("model_type")
        if model_type in _CONFIG_REGISTRY:
            config_class = _CONFIG_REGISTRY[model_type]
            config = config_class.from_pretrained(
                model,
                revision=revision,
                code_revision=code_revision,
                token=token,
                **kwargs,
            )
        else:
            try:
                config = AutoConfig.from_pretrained(
                    model,
                    trust_remote_code=trust_remote_code,
                    revision=revision,
                    code_revision=code_revision,
                    token=token,
                    **kwargs,
                )
            except ValueError as e:
                if (not trust_remote_code
                        and "requires you to execute the configuration file"
                        in str(e)):
                    err_msg = (
                        "Failed to load the model config. If the model "
                        "is a custom model not yet available in the "
                        "HuggingFace transformers library, consider setting "
                        "`trust_remote_code=True` in LLM or using the "
                        "`--trust-remote-code` flag in the CLI.")
                    raise RuntimeError(err_msg) from e
                else:
                    raise e

    elif config_format == ConfigFormat.MISTRAL:
        config = load_params_config(model, revision, token=token, **kwargs)
    else:
        raise ValueError(f"Unsupported config format: {config_format}")

    # Special architecture mapping check for GGUF models
    if is_gguf:
        if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
            raise RuntimeError(
                f"Can't get gguf config for {config.model_type}.")
        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
        config.update({"architectures": [model_type]})

    patch_rope_scaling(config)

    if trust_remote_code:
        maybe_register_config_serialize_by_value()

    return config


def get_hf_file_to_dict(file_name: str,
                        model: Union[str, Path],
                        revision: Optional[str] = 'main',
                        token: Optional[str] = None):
    """
    Downloads a file from the Hugging Face Hub and returns
    its contents as a dictionary.

    Parameters:
    - file_name (str): The name of the file to download.
    - model (str): The name of the model on the Hugging Face Hub.
    - revision (str): The specific version of the model.
    - token (str): The Hugging Face authentication token.

    Returns:
    - config_dict (dict): A dictionary containing
      the contents of the downloaded file.
    """
    file_path = Path(model) / file_name

    if file_or_path_exists(model=model,
                           config_name=file_name,
                           revision=revision,
                           token=token):

        if not file_path.is_file():
            try:
                hf_hub_file = hf_hub_download(model,
                                              file_name,
                                              revision=revision)
            except (RepositoryNotFoundError, RevisionNotFoundError,
                    EntryNotFoundError, LocalEntryNotFoundError) as e:
                logger.debug(
                    "File or repository not found in hf_hub_download: %s", e)
                return None
            file_path = Path(hf_hub_file)

        with open(file_path) as file:
            return json.load(file)
    return None


def get_pooling_config(model: str,
                       revision: Optional[str] = 'main',
                       token: Optional[str] = None):
    """
    This function gets the pooling and normalize
    config from the model - only applies to
    sentence-transformers models.

    Args:
        model (str): The name of the Hugging Face model.
        revision (str, optional): The specific version
        of the model to use. Defaults to 'main'.

    Returns:
        dict: A dictionary containing the pooling
        type and whether normalization is used.
    """

    modules_file_name = "modules.json"
    modules_dict = get_hf_file_to_dict(modules_file_name, model, revision,
                                       token)

    if modules_dict is None:
        return None

    pooling = next((item for item in modules_dict
                    if item["type"] == "sentence_transformers.models.Pooling"),
                   None)
    normalize = bool(
        next((item for item in modules_dict
              if item["type"] == "sentence_transformers.models.Normalize"),
             False))

    if pooling:

        pooling_file_name = "{}/config.json".format(pooling["path"])
        pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision,
                                           token)
        pooling_type_name = next(
            (item for item, val in pooling_dict.items() if val is True), None)

        if pooling_type_name is not None:
            pooling_type_name = get_pooling_config_name(pooling_type_name)

        return {"pooling_type": pooling_type_name, "normalize": normalize}

    return None


def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
    if "pooling_mode_" in pooling_name:
        pooling_name = pooling_name.replace("pooling_mode_", "")

    if "_" in pooling_name:
        pooling_name = pooling_name.split("_")[0]

    if "lasttoken" in pooling_name:
        pooling_name = "last"

    supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
    pooling_type_name = pooling_name.upper()

    try:
        if pooling_type_name in supported_pooling_types:
            return pooling_type_name
    except NotImplementedError as e:
        logger.debug("Pooling type not supported: %s", e)
        return None
    return None


def get_sentence_transformer_tokenizer_config(model: str,
                                              revision: Optional[str] = 'main',
                                              token: Optional[str] = None):
    """
    Returns the tokenization configuration dictionary for a
    given Sentence Transformer BERT model.

    Parameters:
    - model (str): The name of the Sentence Transformer
      BERT model.
    - revision (str, optional): The revision of the model
      to use. Defaults to 'main'.
    - token (str): A Hugging Face access token.

    Returns:
    - dict: A dictionary containing the configuration parameters
      for the Sentence Transformer BERT model.
    """
    for config_name in [
            "sentence_bert_config.json",
            "sentence_roberta_config.json",
            "sentence_distilbert_config.json",
            "sentence_camembert_config.json",
            "sentence_albert_config.json",
            "sentence_xlm-roberta_config.json",
            "sentence_xlnet_config.json",
    ]:
        encoder_dict = get_hf_file_to_dict(config_name, model, revision, token)
        if encoder_dict:
            break

    if not encoder_dict:
        return None

    if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")):
        return encoder_dict
    return None


def maybe_register_config_serialize_by_value() -> None:
    """Try to register HF model configuration class to serialize by value

    If trust_remote_code is set, and the model's config file specifies an
    `AutoConfig` class, then the config class is typically an instance of
    a custom class imported from the HF modules cache.

    Examples:

    >>> from transformers import AutoConfig
    >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
    >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
    >>> import transformers_modules # error, not initialized
    >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
    >>> import transformers_modules # success, initialized
    >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config

    In the DeepSeek example, the config class is an instance of a custom
    class that is not serializable by default. This class will not be
    importable in spawned workers, and won't exist at all on
    other nodes, which breaks serialization of the config.

    In this function we tell the cloudpickle serialization library to pass
    instances of these generated classes by value instead of by reference,
    i.e. the class definition is serialized along with its data so that the
    class module does not need to be importable on the receiving end.

    See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
    """ # noqa
    try:
        import transformers_modules
    except ImportError:
        # the config does not need trust_remote_code
        return

    try:
        import cloudpickle
        cloudpickle.register_pickle_by_value(transformers_modules)

        # ray vendors its own version of cloudpickle
        from vllm.executor.ray_utils import ray
        if ray:
            ray.cloudpickle.register_pickle_by_value(transformers_modules)

        # multiprocessing uses pickle to serialize arguments when using spawn
        # Here we get pickle to use cloudpickle to serialize config objects
        # that contain instances of the custom config class to avoid
        # serialization problems if the generated module (and model) has a `.`
        # in its name
        import multiprocessing
        import pickle

        from vllm.config import VllmConfig

        def _reduce_config(config: VllmConfig):
            return (pickle.loads, (cloudpickle.dumps(config), ))

        multiprocessing.reducer.register(VllmConfig, _reduce_config)

    except Exception as e:
        logger.warning(
            "Unable to register remote classes used by"
            " trust_remote_code with by-value serialization. This may"
            " lead to a later error. If remote code is not needed"
            " remove `--trust-remote-code`",
            exc_info=e)


def load_params_config(model: Union[str, Path],
                       revision: Optional[str],
                       token: Optional[str] = None,
                       **kwargs) -> PretrainedConfig:
    # This function loads a params.json config which
    # should be used when loading models in mistral format

    config_file_name = "params.json"

    config_dict = get_hf_file_to_dict(config_file_name, model, revision, token)
    assert isinstance(config_dict, dict)

    config_mapping = {
        "dim": "hidden_size",
        "norm_eps": "rms_norm_eps",
        "n_kv_heads": "num_key_value_heads",
        "n_layers": "num_hidden_layers",
        "n_heads": "num_attention_heads",
        "hidden_dim": "intermediate_size",
    }

    def recurse_elems(elem: Any):
        if isinstance(elem, dict):
            config_dict = {}
            for key, value in elem.items():
                key = config_mapping.get(key, key)
                config_dict[key] = recurse_elems(value)
            return PretrainedConfig(**config_dict)
        else:
            return elem

    config_dict["model_type"] = config_dict.get("model_type", "transformer")
    config_dict["hidden_act"] = config_dict.get("activation", "silu")
    config_dict["tie_word_embeddings"] = config_dict.get(
        "tie_embeddings", False)
    config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
    config_dict["max_position_embeddings"] = config_dict.get(
        "max_position_embeddings", 128_000)

    if config_dict.get("moe") is not None:
        config_dict["architectures"] = ["MixtralForCausalLM"]
    else:
        config_dict["architectures"] = ["MistralForCausalLM"]

    if config_dict.get("vision_encoder") is not None:
        multimodal_config = config_dict.pop("vision_encoder")

        config_dict = {
            "text_config": config_dict,
            "vision_config": multimodal_config
        }
        config_dict["architectures"] = ["PixtralForConditionalGeneration"]
        config_dict["model_type"] = "pixtral"

    config_dict.update(kwargs)

    config = recurse_elems(config_dict)
    return config


def get_hf_image_processor_config(
    model: Union[str, Path],
    revision: Optional[str] = None,
    **kwargs,
) -> Dict[str, Any]:
    # ModelScope does not provide an interface for image_processor
    if VLLM_USE_MODELSCOPE:
        return dict()
    # Separate model folder from file path for GGUF models
    if check_gguf_file(model):
        model = Path(model).parent
    return get_image_processor_config(model, revision=revision, **kwargs)


def get_hf_text_config(config: PretrainedConfig):
    """Get the "sub" config relevant to llm for multi modal models.
    No op for pure text models.
    """
    if hasattr(config, "text_config"):
        # The code operates under the assumption that text_config should have
        # `num_attention_heads` (among others). Assert here to fail early
        # if transformers config doesn't align with this assumption.
        assert hasattr(config.text_config, "num_attention_heads")
        return config.text_config
    else:
        return config


def try_get_generation_config(
    model: str,
    trust_remote_code: bool,
    revision: Optional[str] = None,
) -> Optional[GenerationConfig]:
    try:
        return GenerationConfig.from_pretrained(
            model,
            revision=revision,
        )
    except OSError:  # Not found
        try:
            config = get_config(
                model,
                trust_remote_code=trust_remote_code,
                revision=revision,
            )
            return GenerationConfig.from_model_config(config)
        except OSError:  # Not found
            return None
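Taken together, get_config() resolves the on-disk format, dispatches either to a _CONFIG_REGISTRY class or to AutoConfig, and then normalizes legacy rope_scaling keys via patch_rope_scaling(). A minimal usage sketch (the model id is illustrative):

    from vllm.transformers_utils.config import ConfigFormat, get_config

    config = get_config("Qwen/Qwen2.5-7B-Instruct",
                        trust_remote_code=False,
                        config_format=ConfigFormat.AUTO)
    print(config.model_type, config.num_hidden_layers)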
38  vllm-v0.6.2/vllm/transformers_utils/configs/__init__.py  Normal file
@@ -0,0 +1,38 @@
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
from vllm.transformers_utils.configs.exaone import ExaoneConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig
from vllm.transformers_utils.configs.internvl import InternVLChatConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.medusa import MedusaConfig
from vllm.transformers_utils.configs.mllama import MllamaConfig
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
from vllm.transformers_utils.configs.mpt import MPTConfig
from vllm.transformers_utils.configs.nemotron import NemotronConfig
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
from vllm.transformers_utils.configs.solar import SolarConfig
from vllm.transformers_utils.configs.ultravox import UltravoxConfig

__all__ = [
    "ChatGLMConfig",
    "DbrxConfig",
    "MPTConfig",
    "RWConfig",
    "H2OVLChatConfig",
    "InternVLChatConfig",
    "JAISConfig",
    "MedusaConfig",
    "EAGLEConfig",
    "ExaoneConfig",
    "MllamaConfig",
    "MLPSpeculatorConfig",
    "NemotronConfig",
    "NVLM_D_Config",
    "SolarConfig",
    "UltravoxConfig",
]
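Each class exported here declares the model_type string that _CONFIG_REGISTRY in transformers_utils/config.py uses as its lookup key; a quick sanity check of that correspondence:

    from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

    # The registry keys in config.py ("chatglm", "dbrx", ...) match these:
    assert ChatGLMConfig.model_type == "chatglm"
    assert DbrxConfig.model_type == "dbrx"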
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
204  vllm-v0.6.2/vllm/transformers_utils/configs/arctic.py  Normal file
@@ -0,0 +1,204 @@
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
""" Arctic model configuration"""

from dataclasses import asdict, dataclass
from typing import Any, Dict

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

ARCTIC_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "arctic": "https://huggingface.co/Snowflake/snowflake-arctic-instruct/tree/main/config.json",
}


@dataclass
class ArcticLoraConfig:
    lora_r: int = 64
    lora_alpha: float = 16
    shard_base_weights: bool = False


@dataclass
class ArcticQuantizationConfig:
    q_bits: int = 8
    rounding: str = "nearest"
    mantissa_bits: int = 3
    group_size: int = 128


class ArcticConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ArcticModel`]. It is used to instantiate an
    Arctic model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the #TODO(rsamdani): add what model has the default config..

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Arctic model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ArcticModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Arctic's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import ArcticModel, ArcticConfig

    >>> # Initializing a Arctic 7B style configuration TODO(rsamdani): verify which model does the default configuration correspond to.
    >>> configuration = ArcticConfig()

    >>> # Initializing a model from the Arctic 7B style configuration
    >>> model = ArcticModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "arctic"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=1,
        num_local_experts=8,
        router_aux_loss_coef=0.001,
        moe_layer_frequency=2,
        parallel_attn_mlp_res=False,
        moe_train_capacity_factor=1,
        moe_eval_capacity_factor=1,
        enable_expert_tensor_parallelism=False,
        moe_min_capacity=0,
        moe_token_dropping=True,
        quantization=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.moe_layer_frequency = moe_layer_frequency
        self.moe_train_capacity_factor = moe_train_capacity_factor
        self.moe_eval_capacity_factor = moe_eval_capacity_factor
        self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
        self.moe_min_capacity = moe_min_capacity
        self.moe_token_dropping = moe_token_dropping
        self.parallel_attn_mlp_res = parallel_attn_mlp_res
        if isinstance(quantization, dict):
            self.quantization = ArcticQuantizationConfig(**quantization)
        else:
            self.quantization = quantization

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "ArcticConfig":
        result = super().from_dict(config_dict, **kwargs)
        config = result[0] if isinstance(result, tuple) else result
        if isinstance(config.quantization, dict):
            config.quantization = ArcticQuantizationConfig(**config.quantization)
        return result

    def to_dict(self) -> Dict[str, Any]:
        ret = super().to_dict()
        if isinstance(ret["quantization"], ArcticQuantizationConfig):
            ret["quantization"] = asdict(ret["quantization"])
        return ret
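The from_dict/to_dict overrides above exist so the nested quantization entry round-trips as a typed dataclass rather than a plain dict; a short check of that behavior:

    from vllm.transformers_utils.configs.arctic import (
        ArcticConfig, ArcticQuantizationConfig)

    cfg = ArcticConfig.from_dict({"quantization": {"q_bits": 6}})
    assert isinstance(cfg.quantization, ArcticQuantizationConfig)
    assert cfg.quantization.q_bits == 6
    assert cfg.to_dict()["quantization"]["rounding"] == "nearest"  # dataclass default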
69  vllm-v0.6.2/vllm/transformers_utils/configs/chatglm.py  Normal file
@@ -0,0 +1,69 @@
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
from transformers import PretrainedConfig


class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        # It is to be compatible with long lora.
        self.max_position_embeddings = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
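The attribute_map above lets vLLM read this config through the standard transformers names while the native ChatGLM names stay authoritative; with the defaults defined here:

    from vllm.transformers_utils.configs import ChatGLMConfig

    cfg = ChatGLMConfig()
    assert cfg.num_hidden_layers == cfg.num_layers == 28        # aliased
    assert cfg.n_head_kv == cfg.multi_query_group_num == 1      # aliased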
278  vllm-v0.6.2/vllm/transformers_utils/configs/dbrx.py  Normal file
@@ -0,0 +1,278 @@
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
"""Dbrx configuration."""

from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # type: ignore


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to None):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (float): The base frequency for rope.
    """

    def __init__(
        self,
        attn_pdrop: float = 0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["attn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all configurations of "
                "models and can yield errors.",
                config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)


class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
    """

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["ffn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all "
                "configurations of models and can yield errors.",
                config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)


class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 6144):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 32768):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 100352):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dbrx"
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Dbrx models."
            )

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
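DbrxConfig coerces plain dicts into the typed sub-configs above, which is how the nested attn_config/ffn_config blocks of a dbrx config.json are absorbed; a short check (values illustrative):

    from vllm.transformers_utils.configs.dbrx import (DbrxAttentionConfig,
                                                      DbrxConfig)

    cfg = DbrxConfig(attn_config={"kv_n_heads": 8, "rope_theta": 500000.0})
    assert isinstance(cfg.attn_config, DbrxAttentionConfig)
    assert cfg.attn_config.kv_n_heads == 8
    assert cfg.hidden_size == cfg.d_model  # attribute_map alias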
49  vllm-v0.6.2/vllm/transformers_utils/configs/eagle.py  Normal file
@@ -0,0 +1,49 @@
import os
from typing import Optional, Union

from transformers import AutoConfig, PretrainedConfig


class EAGLEConfig(PretrainedConfig):
    model_type = "eagle"

    def __init__(self,
                 model: Union[PretrainedConfig, dict, None] = None,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        model_config = None if model is None else (AutoConfig.for_model(
            **model) if isinstance(model, dict) else model)

        for k, v in kwargs.items():
            if k != "architectures" and k != "model_type" and hasattr(
                    model_config, k):
                setattr(model_config, k, v)

        self.model = model_config

        if self.model is None:
            self.truncated_vocab_size = None
        else:
            self.truncated_vocab_size = self.model.vocab_size if \
                truncated_vocab_size is None else truncated_vocab_size

        if "architectures" not in kwargs:
            kwargs["architectures"] = ["EAGLEModel"]

        super().__init__(**kwargs)

        if self.model is not None:
            for k, v in self.model.to_dict().items():
                if not hasattr(self, k):
                    setattr(self, k, v)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "EAGLEConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)
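EAGLEConfig wraps another model's config, inherits its vocab size when truncated_vocab_size is not given, and mirrors the wrapped fields onto itself; a sketch with an inline dict (values illustrative):

    from vllm.transformers_utils.configs.eagle import EAGLEConfig

    cfg = EAGLEConfig(model={"model_type": "llama", "vocab_size": 32000})
    assert cfg.truncated_vocab_size == 32000      # taken from the wrapped config
    assert cfg.architectures == ["EAGLEModel"]    # default architecture
    assert cfg.model.model_type == "llama"        # wrapped LlamaConfig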
189  vllm-v0.6.2/vllm/transformers_utils/configs/exaone.py  Normal file
@@ -0,0 +1,189 @@
# Copied from
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exaone model configuration"""

from typing import Dict

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, str] = {}


class ExaoneConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:
    `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Exaone

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from :
    class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 50257):
            Vocabulary size of the GPT Lingvo model. Defines the number of
            different tokens that can be represented by the :obj:`inputs_ids`
            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
            size of the model.
            Defines the different tokens that can be represented by the
            `inputs_ids` passed to the forward method of :class:
            `~transformers.EXAONEModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the encoder layers and the pooler layer.
        num_layers (:obj:`int`, `optional`, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi
            Head Attention (MHA), if `num_key_value_heads=1` the model will use
            Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint,
            each group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details checkout
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
            specified, will default to `num_attention_heads`.
        rotary_pct (`float`, *optional*, defaults to 0.25):
            percentage of hidden dimensions to allocate to rotary embeddings
        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in
            the Transformer encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`,
        defaults to :obj:`"gelu_new"`):
            The non-linear activation function (function or string) in the
            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling
            :class:`~transformers.EXAONEModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
            Only relevant if ``config.is_decoder=True``.
        gradient_checkpointing (:obj:`bool`, `optional`,
        defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense
            of slower backward pass.
    Example::

        >>> from transformers import ExaoneModel, ExaoneConfig

        >>> # Initializing a EXAONE configuration
        >>> configuration = ExaoneConfig()

        >>> # Initializing a model from configuration
        >>> model = ExaoneModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
|
||||
model_type = "exaone"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
attribute_map = {"num_hidden_layers": "num_layers"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=102400,
|
||||
max_position_embeddings=2048,
|
||||
hidden_size=2048,
|
||||
num_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
intermediate_size=None,
|
||||
activation_function="silu",
|
||||
rotary_pct=0.25,
|
||||
resid_dropout=0.0,
|
||||
embed_dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
layer_norm_epsilon=1e-6,
|
||||
initializer_range=0.02,
|
||||
use_cache=True,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
tie_word_embeddings=True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.num_layers = num_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_hidden_layers = num_layers
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
if intermediate_size:
|
||||
self.intermediate_size = intermediate_size
|
||||
else:
|
||||
self.intermediate_size = hidden_size * 4
|
||||
self.activation_function = activation_function
|
||||
self.resid_dropout = resid_dropout
|
||||
self.embed_dropout = embed_dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.use_cache = use_cache
|
||||
self.rotary_pct = rotary_pct
|
||||
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
self.use_logit_cap = kwargs.pop("use_logit_cap", False)
|
||||
self.ln_no_scale = kwargs.pop("ln_no_scale", False)
|
||||
self.use_gated = kwargs.pop("use_gated", False)
|
||||
self.use_emb_norm = kwargs.pop("use_emb_norm", False)
|
||||
self.use_rotary_pos = kwargs.pop("use_rotary_pos", False)
|
||||
self.rotary_type = kwargs.pop("rotary_type", None)
|
||||
self.scaling_factor = kwargs.pop("scaling_factor", 1)
|
||||
self.use_absolute_pos = kwargs.pop("use_absolute_pos", True)
|
||||
self.use_extra_logit = kwargs.pop("use_extra_logit", True)
|
||||
self.rotary_expand_length = kwargs.pop("rotary_expand_length", None)
|
||||
self.rotary_base = kwargs.pop("rotary_base", 10000.0)
|
||||
self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False)
|
||||
self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head",
|
||||
(rotary_pct == 0.25))
|
||||
if self.use_rotary_pos:
|
||||
self.use_absolute_pos = False
|
||||
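A quick sanity check (not part of the committed diff) of how `attribute_map` and the `intermediate_size` fallback behave; a minimal sketch, assuming the `vllm.transformers_utils.configs` import path from this repo's layout:

from vllm.transformers_utils.configs import ExaoneConfig

cfg = ExaoneConfig(num_layers=24, num_attention_heads=16)
# attribute_map makes the HF-standard name an alias for num_layers.
assert cfg.num_hidden_layers == 24
# intermediate_size defaults to 4 * hidden_size when not given.
assert cfg.intermediate_size == 4 * cfg.hidden_size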
87
vllm-v0.6.2/vllm/transformers_utils/configs/falcon.py
Normal file
@@ -0,0 +1,87 @@
# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig


class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi
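A brief illustration (not from the diff) of the derived properties; a minimal sketch, assuming the falcon-7b-like shape values below purely for demonstration:

from vllm.transformers_utils.configs import RWConfig

cfg = RWConfig(hidden_size=4544, n_head=71, alibi=False)
assert cfg.head_dim == 4544 // 71  # 64
assert cfg.rotary is True          # rotary embeddings unless ALiBi is on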
13
vllm-v0.6.2/vllm/transformers_utils/configs/h2ovl.py
Normal file
@@ -0,0 +1,13 @@
# Adapted from
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------

from .internvl import InternVLChatConfig


class H2OVLChatConfig(InternVLChatConfig):
    model_type = "h2ovl_chat"
51
vllm-v0.6.2/vllm/transformers_utils/configs/internvl.py
Normal file
@@ -0,0 +1,51 @@
# Adapted from
# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2024 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from transformers.configuration_utils import PretrainedConfig


class InternVLChatConfig(PretrainedConfig):
    model_type = 'internvl_chat'
    is_composition = True

    def __init__(self,
                 vision_config=None,
                 llm_config=None,
                 use_backbone_lora=0,
                 use_llm_lora=0,
                 select_layer=-1,
                 force_image_size=None,
                 downsample_ratio=0.5,
                 template=None,
                 dynamic_image_size=False,
                 use_thumbnail=False,
                 ps_version='v1',
                 min_dynamic_patch=1,
                 max_dynamic_patch=6,
                 **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}

        if llm_config is None:
            llm_config = {}

        self.vision_config = PretrainedConfig(**vision_config)
        self.text_config = PretrainedConfig(**llm_config)

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
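One point worth illustrating (a sketch, not from the diff): the nested dicts become generic `PretrainedConfig` objects, so sub-config fields remain accessible as attributes. The field values here are made up for demonstration:

from vllm.transformers_utils.configs import InternVLChatConfig

cfg = InternVLChatConfig(vision_config={"hidden_size": 1024},
                         llm_config={"vocab_size": 92544})
assert cfg.vision_config.hidden_size == 1024
assert cfg.text_config.vocab_size == 92544
assert cfg.is_composition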
235
vllm-v0.6.2/vllm/transformers_utils/configs/jais.py
Normal file
@@ -0,0 +1,235 @@
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright 2023 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""JAIS configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class JAISConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times `n_embd`.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
            defaults to `False`):
            Whether to additionally scale attention weights by
            `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product) and upcast attention dot-product/softmax to float()
            when training with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Needs scale_attn_weights to be set to `True`
            as well.
        alibi_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports the linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`List`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "jais"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        position_embedding_type="learned",
        mup_width_scale=1.0,
        mup_embeddings_scale=1.0,
        mup_output_alpha=1.0,
        mup_scale_qk_dot_by_d=False,
        alibi_scaling=None,
        architectures=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        self.position_embedding_type = position_embedding_type
        self.mup_width_scale = mup_width_scale
        self.mup_embeddings_scale = mup_embeddings_scale
        self.mup_output_alpha = mup_output_alpha
        self.mup_scale_qk_dot_by_d = mup_scale_qk_dot_by_d

        self.alibi_scaling = alibi_scaling
        self._alibi_scaling_validation()
        if architectures is None:
            architectures = ["JAISLMHeadModel"]

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            architectures=architectures,
            **kwargs,
        )

    def _alibi_scaling_validation(self):
        """
        Validate the `alibi_scaling` configuration.
        """
        if self.alibi_scaling is None:
            return

        if (not isinstance(self.alibi_scaling, dict)
                or len(self.alibi_scaling) != 2):
            raise ValueError(
                "`alibi_scaling` must be a dictionary with two fields, "
                "`type` and `factor` or `type` and `train_seq_len`, "
                f"got {self.alibi_scaling}")
        alibi_scaling_type = self.alibi_scaling.get("type", None)
        alibi_scaling_factor = self.alibi_scaling.get("factor", None)
        alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
        if alibi_scaling_type is None or alibi_scaling_type != "linear":
            raise ValueError("`alibi_scaling`'s type field must be 'linear', "
                             f"got {alibi_scaling_type}")
        if (alibi_scaling_factor is not None
                and not isinstance(alibi_scaling_factor, float)
                or (alibi_scaling_factor is not None
                    and alibi_scaling_factor <= 1.0)):
            raise ValueError(
                "`alibi_scaling`'s factor field must be a float > 1.0, "
                f"got {alibi_scaling_factor}")
        if (alibi_dynamic_scaling is not None
                and not isinstance(alibi_dynamic_scaling, int)
                or (alibi_dynamic_scaling is not None
                    and alibi_dynamic_scaling <= 1)):
            raise ValueError(
                "`alibi_scaling`'s `train_seq_len` field must be an "
                f"integer > 1, got {alibi_dynamic_scaling}")
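A minimal sketch (not part of the commit) of the two `alibi_scaling` shapes the validator above accepts, and one it rejects:

from vllm.transformers_utils.configs import JAISConfig

ok_fixed = JAISConfig(alibi_scaling={"type": "linear", "factor": 2.0})
ok_dynamic = JAISConfig(alibi_scaling={"type": "linear",
                                       "train_seq_len": 2048})
try:
    JAISConfig(alibi_scaling={"type": "linear", "factor": 0.5})
except ValueError as e:
    print(e)  # factor field must be a float > 1.0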
60
vllm-v0.6.2/vllm/transformers_utils/configs/medusa.py
Normal file
@@ -0,0 +1,60 @@
import os
from typing import Optional, Union

from transformers import PretrainedConfig


class MedusaConfig(PretrainedConfig):
    model_type = "medusa"

    def __init__(self,
                 hidden_size: int = 4096,
                 vocab_size: int = 32001,
                 num_heads: int = 5,
                 num_hidden_layers: int = 1,
                 max_paths: int = 64,
                 topk: int = 10,
                 truncated_vocab_size: Optional[int] = None,
                 **kwargs):

        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_paths = max_paths
        self.topk = topk
        self.max_seq_len = int(2**20)
        self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
            else truncated_vocab_size
        if "architectures" not in kwargs:
            kwargs["architectures"] = ["MedusaModel"]

        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "MedusaConfig":
        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)
        for k in list(config_dict.keys()):
            if 'num' in k:
                if 'heads' in k:
                    config_dict["num_heads"] = config_dict.pop(k)
                elif 'layers' in k:
                    config_dict["num_hidden_layers"] = config_dict.pop(k)
        return cls.from_dict(config_dict, **kwargs)

    @property
    def num_attention_heads(self):
        return 0

    @property
    def num_lookahead_tokens(self):
        return self.num_heads

    @num_lookahead_tokens.setter
    def num_lookahead_tokens(self, num_lookahead_tokens: int):
        self.num_heads = num_lookahead_tokens
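A short sketch (not from the diff) of the lookahead-token alias; `from_pretrained` additionally renames checkpoint keys containing "num"+"heads"/"layers" to the canonical names, which matters for loading real Medusa checkpoints:

from vllm.transformers_utils.configs import MedusaConfig

cfg = MedusaConfig(num_heads=4)
assert cfg.num_lookahead_tokens == 4
cfg.num_lookahead_tokens = 6  # the setter writes through to num_heads
assert cfg.num_heads == 6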
28
vllm-v0.6.2/vllm/transformers_utils/configs/mllama.py
Normal file
@@ -0,0 +1,28 @@
from transformers.models.mllama import configuration_mllama as mllama_hf_config


class MllamaTextConfig(mllama_hf_config.MllamaTextConfig):
    '''
    Use this class to override is_encoder_decoder:
    - transformers regards mllama as is_encoder_decoder=False
    - vllm needs is_encoder_decoder=True to enable cross-attention
    '''

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.is_encoder_decoder = True


class MllamaConfig(mllama_hf_config.MllamaConfig):

    def __init__(
        self,
        text_config=None,
        **kwargs,
    ):
        if isinstance(text_config, dict):
            text_config = MllamaTextConfig(**text_config)
        super().__init__(text_config=text_config, **kwargs)
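A minimal sketch (not from the diff, and assuming a transformers version that ships the mllama configs) showing the override in effect:

from vllm.transformers_utils.configs import MllamaConfig

cfg = MllamaConfig(text_config={})
# Stock transformers leaves this False; the vLLM subclass flips it on.
assert cfg.text_config.is_encoder_decoder is True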
65
vllm-v0.6.2/vllm/transformers_utils/configs/mlp_speculator.py
Normal file
@@ -0,0 +1,65 @@
from typing import List, Optional

from transformers import PretrainedConfig


class MLPSpeculatorConfig(PretrainedConfig):
    model_type = "mlp_speculator"

    attribute_map = {
        "hidden_size": "emb_dim",
    }

    def __init__(self,
                 vocab_size: int = 32000,
                 emb_dim: int = 4096,
                 inner_dim: int = 0,
                 n_predict: int = 3,
                 top_k_tokens_per_head: Optional[List[int]] = None,
                 n_candidates: int = 5,
                 tie_weights: bool = False,
                 scale_input: bool = False,
                 **kwargs):
        """
        Initialize an MLPSpeculatorConfig

        Args:
            vocab_size: int
                the model vocab size
            emb_dim: int
                the model embedding dimension
            inner_dim: int
                the inner dimension of the model. If 0, will be the emb_dim.
            n_predict: int
                the number of lookaheads for the speculator
            top_k_tokens_per_head: List[int]
                Number of tokens to consider from each head when forming the
                candidate tree.
                For each candidate branch in the tree, head n produces topk[n]
                additional sub-branches.
                NOTE: This parameter is currently unused.
            n_candidates: int
                number of child candidates to create per sequence
            tie_weights: bool
                If true, use a single set of weights for every model
                head/stage after the first. The initial projection
                from the base model may have a different size, so that
                stays separate.
            scale_input: bool
                If true, will scale the initial hidden states from
                the base model.
        """
        if top_k_tokens_per_head is None:
            top_k_tokens_per_head = [5, 4, 3]
        assert len(top_k_tokens_per_head) == n_predict
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.inner_dim = inner_dim
        self.n_predict = n_predict
        self.top_k_tokens_per_head = top_k_tokens_per_head
        self.n_candidates = n_candidates
        self.num_lookahead_tokens = n_predict
        self.tie_weights = tie_weights
        self.scale_input = scale_input

        super().__init__(**kwargs)
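A quick sketch (not part of the commit): `top_k_tokens_per_head` must supply one top-k value per speculator head, i.e. its length must equal `n_predict`:

from vllm.transformers_utils.configs import MLPSpeculatorConfig

cfg = MLPSpeculatorConfig(n_predict=2, top_k_tokens_per_head=[5, 3])
assert cfg.num_lookahead_tokens == 2
# hidden_size is an alias for emb_dim via attribute_map.
assert cfg.hidden_size == cfg.emb_dim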
177
vllm-v0.6.2/vllm/transformers_utils/configs/mpt.py
Normal file
@@ -0,0 +1,177 @@
# Copied from
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
"""A HuggingFace-style model configuration."""
import warnings
from typing import Any, Dict, Optional, Union

from transformers import PretrainedConfig

attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}


class MPTConfig(PretrainedConfig):
    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Dict = attn_config_defaults,
                 ffn_config: Dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.ffn_config = ffn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(DeprecationWarning(
                'verbose argument for MPTConfig is now ignored and '
                'will be removed. Use python_log_level instead.'),
                          stacklevel=2)
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` '
                f'to {self.learned_pos_emb}',
                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self) -> None:
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any(
                prob < 0 or prob > 1 for prob in
            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop
             ]):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are "
                "probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch '
                'and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) '
                'and 1 (inclusive)!')
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as "
                "an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.',
                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    'TransformerEngine import failed. `fc_type: te` requires '
                    'TransformerEngine to be installed. '
                    'The required version of transformer_engine also requires '
                    'FlashAttention v1.0.6 to be installed:\n'
                    'pip install flash-attn==1.0.6 --no-build-isolation \n'
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
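A brief sketch (not from the diff) of how `_set_config_defaults` and `_validate_config` interact: partial nested configs get backfilled with the module-level defaults, and inconsistent shapes are rejected at construction time:

from vllm.transformers_utils.configs import MPTConfig

cfg = MPTConfig(attn_config={'attn_impl': 'torch'})
assert cfg.attn_config['alibi'] is False  # filled from attn_config_defaults
try:
    MPTConfig(d_model=100, n_heads=7)  # 100 % 7 != 0
except ValueError as e:
    print(e)  # d_model must be divisible by n_heads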
202
vllm-v0.6.2/vllm/transformers_utils/configs/nemotron.py
Normal file
@@ -0,0 +1,202 @@
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Nemotron model configuration"""

from transformers import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class NemotronConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`NemotronModel`]. It is used to instantiate a Nemotron model
    according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the Nemotron-8B.

    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Nemotron model. Defines the number of
            different tokens that can be represented by the
            `inputs_ids` passed when calling [`NemotronModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the
            Transformer decoder.
        head_dim (`int`, *optional*):
            Projection weights dimension in multi-head attention. Set to
            hidden_size // num_attention_heads if None.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention
            (MQA), otherwise GQA is used. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value
            head should be constructed by meanpooling all the original
            heads within that group. For more details, check out
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
            is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the
            decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        initializer_range (`float`, *optional*, defaults to 0.0134):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 3):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj and down_proj layers in the MLP
            layers.

    ```python
    >>> from transformers import NemotronModel, NemotronConfig
    >>> # Initializing a Nemotron nemotron-15b style configuration
    >>> configuration = NemotronConfig()
    >>> # Initializing a model from the nemotron-15b style configuration
    >>> model = NemotronModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "nemotron"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=6144,
        intermediate_size=24576,
        num_hidden_layers=32,
        num_attention_heads=48,
        head_dim=None,
        num_key_value_heads=None,
        hidden_act="relu2",
        max_position_embeddings=4096,
        initializer_range=0.0134,
        norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=2,
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        head_dim = head_dim or kwargs.get("kv_channels")
        self.head_dim = head_dim if head_dim is not None else (
            hidden_size // num_attention_heads)

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # for backward compatibility
        partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
            "rope_percentage") or partial_rotary_factor
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling,
                          dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear", "dynamic"
        ]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', "
                f"'dynamic'], got {rope_scaling_type}")
        if rope_scaling_factor is None or not isinstance(
                rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, got "
                f"{rope_scaling_factor}")
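A minimal sketch (not part of the commit) of the `rope_scaling` contract the validator above enforces: `{"type": "linear"|"dynamic", "factor": float > 1}`; anything else raises at construction time:

from vllm.transformers_utils.configs import NemotronConfig

cfg = NemotronConfig(rope_scaling={"type": "dynamic", "factor": 2.0})
try:
    NemotronConfig(rope_scaling={"type": "yarn", "factor": 2.0})
except ValueError as e:
    print(e)  # type field must be one of ['linear', 'dynamic']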
12
vllm-v0.6.2/vllm/transformers_utils/configs/nvlm_d.py
Normal file
@@ -0,0 +1,12 @@
# Adapted from
# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py
# --------------------------------------------------------
# NVLM-D
# Copyright (c) 2024 NVIDIA
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from .internvl import InternVLChatConfig


class NVLM_D_Config(InternVLChatConfig):
    model_type = 'NVLM_D'
244
vllm-v0.6.2/vllm/transformers_utils/configs/solar.py
Normal file
@@ -0,0 +1,244 @@
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Solar model configuration"""

from transformers import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class SolarConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`SolarModel`]. It is used to instantiate an LLaMA model according to
    the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar
    configuration to that of the LLaMA-7B.
    Configuration objects inherit from [`PretrainedConfig`] and can be
    used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`SolarModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer
            in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the
            model will use Multi Query Attention (MQA), otherwise GQA is
            used. When converting a multi-head checkpoint to a GQA
            checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group.
            For more details, check out
            [this paper](https://arxiv.org/pdf/2305.13245.pdf).
            If it is not specified, will default to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string)
            in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used
            with. Solar 1 supports up to 2048 tokens,
            Solar 2 up to 4096, CodeSolar up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during
            pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
            to understand more about it. This value is necessary to ensure
            exact reproducibility of the pretraining results.
            Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE
            embeddings. Currently supports two scaling
            strategies: linear and dynamic.
            Their scaling factor must be a float greater than 1.
            The expected format is
            `{"type": strategy name, "factor": scaling factor}`.
            When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
            See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
            This is an experimental feature, subject to breaking
            API changes in future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value
            and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj
            layers in the MLP layers.
        sliding_window (`int`, *optional*, defaults to 2047):
            Sliding window attention window size. If not specified,
            will default to `2047`.
    ```python
    >>> from transformers import SolarModel, SolarConfig
    >>> # Initializing a Solar-pro style configuration
    >>> configuration = SolarConfig()
    >>> # Initializing a model from the Solar-pro style configuration
    >>> model = SolarModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "solar"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        sliding_window=2047,
        bskcn_1=None,
        bskcn_2=None,
        bskcn_3=None,
        bskcn_4=None,
        bskcn_tv=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.sliding_window = sliding_window
        self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44]
        self.bskcn_2 = bskcn_2 if bskcn_2 is not None else [20, 32]
        self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48]
        self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40]
        self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if (not isinstance(self.rope_scaling, dict)
                or len(self.rope_scaling) != 2):
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, "
                f"`type` and `factor`, got {self.rope_scaling}")
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in [
                "linear",
                "dynamic",
        ]:
            raise ValueError("`rope_scaling`'s type field must be one of "
                             f"['linear', 'dynamic'], got {rope_scaling_type}")
        if (rope_scaling_factor is None
                or not isinstance(rope_scaling_factor, float)
                or rope_scaling_factor <= 1.0):
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1, "
                f"got {rope_scaling_factor}")
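A quick sketch (not from the diff) of the Solar-specific defaults; the `bskcn_*` lists fall back to the values baked into `__init__` when not provided:

from vllm.transformers_utils.configs import SolarConfig

cfg = SolarConfig()
assert cfg.bskcn_1 == [12, 20, 32, 44]
assert cfg.sliding_window == 2047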
99
vllm-v0.6.2/vllm/transformers_utils/configs/ultravox.py
Normal file
@@ -0,0 +1,99 @@
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
from typing import Any, Dict, Optional

import transformers


class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            Custom audio config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of `LlamaConfig`
            or `MistralConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the text model.
        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the audio model.
    """

    model_type = "ultravox"
    is_composition = False

    def __init__(
        self,
        audio_config: Optional[Dict[str, Any]] = None,
        text_config: Optional[Dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[Dict[str, Any]] = None,
        audio_model_lora_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act

        if text_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.text_config = get_config(text_model_id,
                                          trust_remote_code=False)
        else:
            text_config = text_config or {}
            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
                "model_type", "llama")](**text_config)

        if audio_model_id is not None:
            # Avoid circular import
            from vllm.transformers_utils.config import get_config

            self.audio_config = get_config(audio_model_id,
                                           trust_remote_code=False)
        else:
            audio_config = audio_config or {}
            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                "model_type", "whisper")](**audio_config)

        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        self.vocab_size = self.text_config.vocab_size

        self.initializer_range = self.text_config.initializer_range

        super().__init__(**kwargs)
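A small sketch (not from the diff): with no model ids or sub-configs given, the text and audio sub-configs are built from transformers' `CONFIG_MAPPING` defaults (llama and whisper respectively):

from vllm.transformers_utils.configs import UltravoxConfig

cfg = UltravoxConfig()
assert cfg.text_config.model_type == "llama"
assert cfg.audio_config.model_type == "whisper"
assert cfg.vocab_size == cfg.text_config.vocab_size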
165
vllm-v0.6.2/vllm/transformers_utils/detokenizer.py
Normal file
@@ -0,0 +1,165 @@
from typing import Dict, List, Optional

from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
                           Sequence, SequenceGroup)

from .detokenizer_utils import (convert_prompt_ids_to_tokens,
                                detokenize_incrementally)
from .tokenizer import AnyTokenizer
from .tokenizer_group import BaseTokenizerGroup


class Detokenizer:
    """Provides methods to decode the output of a model into text."""

    def __init__(self, tokenizer_group: BaseTokenizerGroup):
        self.tokenizer_group = tokenizer_group

    def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer:
        """Returns the HF tokenizer to use for a given sequence."""
        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)

    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
                                       prompt_logprobs: List[Optional[Dict[
                                           int, Logprob]]],
                                       position_offset: int) -> None:
        """Decodes the logprobs for the prompt of a sequence group.

        Args:
            seq_group: The sequence group to decode.
            prompt_logprobs: The logprobs to decode.
            position_offset: Offset of the first index of the logprobs
                relative to the start of the sequence (for chunked prefill).

        Returns:
            The prompt logprobs with the decoded tokens.
        """
        prms = seq_group.sampling_params
        assert prms is not None

        # We can pick any sequence for the prompt.
        seq = seq_group.get_seqs()[0]
        # Only prompt, without the generated token.
        all_token_ids = seq.get_token_ids()
        prompt_token_ids = all_token_ids[:-1]
        tokenizer = self.get_tokenizer_for_seq(seq)
        prefix_offset = 0
        read_offset = 0
        next_iter_prefix_offset = 0
        next_iter_read_offset = 0
        next_iter_tokens: List[str] = []
        prev_tokens = None

        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
                prompt_logprobs):

            # Absolute token position equals the index in the logprobs
            # list plus the offset of the entire logprobs list relative
            # to the start of the sequence.
            token_position = token_position_in_logprob + position_offset
            if not prompt_logprobs_for_token:
                continue
            for token_id, sample_logprob in prompt_logprobs_for_token.items():
                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    prompt_token_ids_with_token = (
                        prompt_token_ids[:token_position] + [token_id])
                    (new_tokens, new_text, new_prefix_offset,
                     new_read_offset) = detokenize_incrementally(
                         tokenizer=tokenizer,
                         all_input_ids=prompt_token_ids_with_token,
                         prev_tokens=prev_tokens,
                         prefix_offset=prefix_offset,
                         read_offset=read_offset,
                         skip_special_tokens=prms.skip_special_tokens,
                         spaces_between_special_tokens=prms.
                         spaces_between_special_tokens,
                     )

                    sample_logprob.decoded_token = new_text

                    # Use the offsets & prev tokens corresponding to
                    # real tokens to ensure detokenization is consistent
                    # with the actual prompt.
                    if token_id == all_token_ids[token_position]:
                        next_iter_prefix_offset = new_prefix_offset
                        next_iter_read_offset = new_read_offset
                        next_iter_tokens = new_tokens

            # Advance to the next token position.
            prefix_offset = next_iter_prefix_offset
            read_offset = next_iter_read_offset
            if prev_tokens is None:
                prev_tokens = next_iter_tokens.copy()
            else:
                prev_tokens.extend(next_iter_tokens)

    def decode_sequence_inplace(self, seq: Sequence,
                                prms: SamplingParams) -> int:
        """Decodes the new token for a sequence. In-place operation.

        Args:
            seq: The sequence to decode.
            prms: The sampling parameters used to generate the sequence.

        Returns:
            The number of characters added to the output text.
        """
        all_input_ids = seq.get_token_ids()
        token_id_generated_this_iteration = all_input_ids[-1]
        tokenizer = self.get_tokenizer_for_seq(seq)

        # Convert prompt token IDs to tokens if necessary.
        # Do it here so that we don't have to repeat this
        # computation for each logprob.
        if seq.tokens is None:
            (seq.tokens, seq.prefix_offset,
             seq.read_offset) = convert_prompt_ids_to_tokens(
                 tokenizer=tokenizer,
                 prompt_ids=all_input_ids[:-1],
                 skip_special_tokens=prms.skip_special_tokens,
             )

        (new_tokens, new_decoded_token_text, prefix_offset,
         read_offset) = detokenize_incrementally(
             tokenizer=tokenizer,
             all_input_ids=all_input_ids,
             prev_tokens=seq.tokens,
             prefix_offset=seq.prefix_offset,
             read_offset=seq.read_offset,
             skip_special_tokens=prms.skip_special_tokens,
             spaces_between_special_tokens=prms.spaces_between_special_tokens,
         )

        # Decode logprobs
        logprobs = seq.output_logprobs[-1]
        if logprobs:
            previous_tokens = all_input_ids[:-1]
            for token_id, sample_logprob in logprobs.items():
                # If the token was generated this iteration,
                # use the provided text.
                if token_id == token_id_generated_this_iteration:
                    sample_logprob.decoded_token = new_decoded_token_text
                    continue

                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    all_input_ids_with_logprob = previous_tokens + [token_id]
                    (_, new_text, _, _) = detokenize_incrementally(
                        tokenizer=tokenizer,
                        all_input_ids=all_input_ids_with_logprob,
                        prev_tokens=seq.tokens,
                        prefix_offset=seq.prefix_offset,
                        read_offset=seq.read_offset,
                        skip_special_tokens=prms.skip_special_tokens,
                        spaces_between_special_tokens=prms.
                        spaces_between_special_tokens,
                    )
                    sample_logprob.decoded_token = new_text

        seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_decoded_token_text

        return len(new_decoded_token_text)
167
vllm-v0.6.2/vllm/transformers_utils/detokenizer_utils.py
Normal file
@@ -0,0 +1,167 @@
from typing import List, Optional, Tuple

from .tokenizer import AnyTokenizer


def _replace_none_with_empty(tokens: List[Optional[str]]):
    for i, token in enumerate(tokens):
        if token is None:
            tokens[i] = ""


def _convert_tokens_to_string_with_added_encoders(
    tokenizer: AnyTokenizer,
    output_tokens: List[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    # Adapted from
    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    # NOTE(woosuk): The following code is slow because it runs a for loop over
    # the output_tokens. In Python, running a for loop over a list can be slow
    # even when the loop body is very simple.
    sub_texts: List[str] = []
    current_sub_text: List[str] = []
    all_special_tokens = set(tokenizer.all_special_tokens)
    for token in output_tokens:
        if skip_special_tokens and token in all_special_tokens:
            continue
        if token in tokenizer.get_added_vocab():
            if current_sub_text:
                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
                sub_texts.append(sub_text)
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
        sub_texts.append(sub_text)
    if spaces_between_special_tokens:
        return " ".join(sub_texts)
    else:
        return "".join(sub_texts)


# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5


def convert_prompt_ids_to_tokens(
    tokenizer: AnyTokenizer,
    prompt_ids: List[int],
    skip_special_tokens: bool = False,
) -> Tuple[List[str], int, int]:
    """Converts the prompt ids to tokens and returns the tokens and offsets
    for incremental detokenization.

    Note that not all tokens are converted to strings. Only the tokens that
    are necessary for incremental detokenization are converted to strings.
    """
    # We do not need to convert the whole prompt to tokens.
    # Offset a little more in case we have special tokens.
    new_tokens = tokenizer.convert_ids_to_tokens(
        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
        skip_special_tokens=skip_special_tokens)
    read_offset = len(new_tokens)
    prefix_offset = max(
        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # This is required to guard against out-of-vocab prompt token ids
    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset


# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: AnyTokenizer,
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> Tuple[List[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function first converts the input ids to
    tokens; in that case, all tokens (rather than only the new ones) are
    returned together with the new text.

    This function also returns the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return all tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset,
         read_offset) = convert_prompt_ids_to_tokens(
             tokenizer,
             all_input_ids[:-1],
             skip_special_tokens=skip_special_tokens)
    assert prev_tokens is not None

    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens

    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

    if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
        # A replacement character (�) at the end means the text ends with a
        # potentially unfinished byte sequence from byte-fallback
        # tokenization. If it appears in the middle, it is probably a real
        # invalid id generated by the model.
        return new_tokens, "", prefix_offset, read_offset

    new_text = new_text[len(prefix_text):]
    return new_tokens, new_text, read_offset, len(output_tokens)
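

# Illustrative sketch (not part of the original file): how the two helpers
# above chain together. The offsets are seeded from the prompt once, then
# every newly sampled id is decoded incrementally, exactly as Detokenizer
# does. The "gpt2" tokenizer and the fake "sampled" ids are assumptions for
# illustration only.

def _demo_detokenize_incrementally():
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    prompt_ids = tok.encode("Hello, my name")
    # Pretend the model sampled this continuation one id at a time.
    sampled_ids = tok.encode(" is vLLM and I stream text.")

    tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens(
        tok, prompt_ids)
    all_ids = list(prompt_ids)
    stream = []
    for new_id in sampled_ids:
        all_ids.append(new_id)
        new_tokens, new_text, prefix_offset, read_offset = (
            detokenize_incrementally(tok, all_ids, tokens, prefix_offset,
                                     read_offset))
        tokens.extend(new_tokens)
        stream.append(new_text)
    # The concatenated pieces should reproduce the sampled continuation.
    print("".join(stream))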
98
vllm-v0.6.2/vllm/transformers_utils/processor.py
Normal file
@@ -0,0 +1,98 @@
from functools import lru_cache
from typing import Any, cast


def get_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoProcessor
    from transformers.processing_utils import ProcessorMixin

    try:
        processor = AutoProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(ProcessorMixin, processor)


cached_get_processor = lru_cache(get_processor)


def get_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers import AutoImageProcessor
    from transformers.image_processing_utils import BaseImageProcessor

    try:
        processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the image processor. If the image processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(BaseImageProcessor, processor)


def get_video_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace."""
    # don't put this import at the top level
    # it will call torch.cuda.device_count()
    from transformers.image_processing_utils import BaseImageProcessor

    processor = get_processor(
        processor_name,
        *args,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )

    return cast(BaseImageProcessor, processor.video_processor)
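

# Illustrative note (not part of the original file): cached_get_processor
# above is a plain functools.lru_cache wrapper, so repeated calls with the
# same hashable arguments return the identical processor object instead of
# re-reading the config from disk or the network. The model id below is an
# assumption for illustration only.
#
# p1 = cached_get_processor("llava-hf/llava-1.5-7b-hf")
# p2 = cached_get_processor("llava-hf/llava-1.5-7b-hf")
# assert p1 is p2  # second call is served from the cache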
199
vllm-v0.6.2/vllm/transformers_utils/tokenizer.py
Normal file
@@ -0,0 +1,199 @@
import os
import warnings
from pathlib import Path
from types import MethodType
from typing import Optional, Union

import huggingface_hub
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async

logger = init_logger(__name__)

AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
                     MistralTokenizer]


def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
    """Get tokenizer with cached properties.

    This will patch the tokenizer object in place.

    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown. This
    function caches these properties for faster access."""

    tokenizer_all_special_ids = set(tokenizer.all_special_ids)
    # Fallback for older transformers versions that don't have this attribute
    tokenizer_all_special_tokens_extended = getattr(
        tokenizer, 'all_special_tokens_extended', tokenizer.all_special_tokens)
    tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
    tokenizer_len = len(tokenizer)
    max_token_id = max(tokenizer.get_vocab().values())

    class CachedTokenizer(tokenizer.__class__):  # type: ignore

        @property
        def all_special_ids(self):
            return tokenizer_all_special_ids

        @property
        def all_special_tokens(self):
            return tokenizer_all_special_tokens

        @property
        def all_special_tokens_extended(self):
            return tokenizer_all_special_tokens_extended

        @property
        def max_token_id(self):
            return max_token_id

        def __len__(self):
            return tokenizer_len

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

    tokenizer.__class__ = CachedTokenizer
    return tokenizer


def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None:
    """Patch _pad method to accept `padding_side` for older tokenizers."""
    orig_pad = tokenizer._pad

    def _pad(
        self: PreTrainedTokenizer,
        *args,
        padding_side: Optional[str] = None,
        **kwargs,
    ):
        if padding_side is not None and padding_side != self.padding_side:
            msg = ("`padding_side` argument is not supported by "
                   f"{type(tokenizer).__name__} and will be ignored.")
            warnings.warn(msg, stacklevel=2)

        return orig_pad(*args, **kwargs)

    tokenizer._pad = MethodType(_pad, tokenizer)


def get_tokenizer(
    tokenizer_name: Union[str, Path],
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> AnyTokenizer:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
    """
    if VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            tokenizer_path = snapshot_download(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    if "truncation_side" not in kwargs:
        kwargs["truncation_side"] = "left"

    # Separate model folder from file path for GGUF models
    is_gguf = check_gguf_file(tokenizer_name)
    if is_gguf:
        kwargs["gguf_file"] = Path(tokenizer_name).name
        tokenizer_name = Path(tokenizer_name).parent

    # if tokenizer is from official mistral org
    is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
    if is_from_mistral_org and tokenizer_mode != "mistral":
        warnings.warn(
            'It is strongly recommended to run mistral models with '
            '`--tokenizer_mode "mistral"` to ensure correct '
            'encoding and decoding.',
            FutureWarning,
            stacklevel=2)

    if tokenizer_mode == "mistral":
        tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                     revision=revision)
    else:
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                    "does not exist or is not currently imported." in str(e)
                    or "requires you to execute the tokenizer file" in str(e)):
                err_msg = ("Failed to load the tokenizer. If the tokenizer "
                           "is a custom tokenizer not yet available in the "
                           "HuggingFace transformers library, consider "
                           "setting `trust_remote_code=True` in LLM or using "
                           "the `--trust-remote-code` flag in the CLI.")
                raise RuntimeError(err_msg) from e
            else:
                raise e

        # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
        if type(tokenizer).__name__ in ("ChatGLMTokenizer",
                                        "ChatGLM4Tokenizer"):
            assert isinstance(tokenizer, PreTrainedTokenizer)
            patch_padding_side(tokenizer)

        if not isinstance(tokenizer, PreTrainedTokenizerFast):
            logger.warning(
                "Using a slow tokenizer. This might cause a significant "
                "slowdown. Consider using a fast tokenizer instead.")
        tokenizer = get_cached_tokenizer(tokenizer)

    return tokenizer


def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[AnyTokenizer]:
    if lora_request is None:
        return None
    try:
        tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
    except Exception as e:
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
            "No tokenizer found in %s, using base model tokenizer instead. "
            "(Exception: %s)", lora_request.lora_path, e)
        tokenizer = None
    return tokenizer


get_lora_tokenizer_async = make_async(get_lora_tokenizer)
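

# Illustrative sketch (not part of the original file): the class-swap trick
# used by get_cached_tokenizer in miniature. Reassigning __class__ keeps the
# object's state but serves precomputed values from closure variables, so hot
# properties become cheap reads. All names here are made up for illustration.

def _demo_class_swap():

    class Slow:

        @property
        def expensive(self):
            return sum(range(1_000_000))  # recomputed on every access

    obj = Slow()
    cached_value = obj.expensive

    class Cached(Slow):

        @property
        def expensive(self):
            return cached_value  # served from the closure, no recompute

    obj.__class__ = Cached
    assert obj.expensive == cached_value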
@@ -0,0 +1,57 @@
from typing import Optional, Type

from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
                         TokenizerPoolConfig)
from vllm.executor.ray_utils import ray

from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
from .tokenizer_group import TokenizerGroup

if ray:
    from .ray_tokenizer_group import RayTokenizerGroupPool
else:
    RayTokenizerGroupPool = None  # type: ignore


def init_tokenizer_from_configs(model_config: ModelConfig,
                                scheduler_config: SchedulerConfig,
                                parallel_config: ParallelConfig,
                                enable_lora: bool):
    init_kwargs = dict(tokenizer_id=model_config.tokenizer,
                       enable_lora=enable_lora,
                       max_num_seqs=scheduler_config.max_num_seqs,
                       max_input_length=None,
                       tokenizer_mode=model_config.tokenizer_mode,
                       trust_remote_code=model_config.trust_remote_code,
                       revision=model_config.tokenizer_revision)

    if (model_config.encoder_config is not None
            and "do_lower_case" in model_config.encoder_config):
        init_kwargs["do_lower_case"] = model_config.encoder_config[
            "do_lower_case"]

    return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                               **init_kwargs)


def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
                        **init_kwargs) -> BaseTokenizerGroup:
    tokenizer_cls: Type[BaseTokenizerGroup]
    if tokenizer_pool_config is None:
        tokenizer_cls = TokenizerGroup
    elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass(
            tokenizer_pool_config.pool_type, BaseTokenizerGroup):
        tokenizer_cls = tokenizer_pool_config.pool_type
    elif tokenizer_pool_config.pool_type == "ray":
        if RayTokenizerGroupPool is None:
            raise ImportError(
                "RayTokenizerGroupPool is not available. Please install "
                "the ray package to use the Ray tokenizer group pool.")
        tokenizer_cls = RayTokenizerGroupPool
    else:
        raise ValueError(
            f"Unknown pool type: {tokenizer_pool_config.pool_type}")
    return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs)


__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"]
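

# Illustrative note (not part of the original file): the dispatch above in
# brief. tokenizer_pool_config=None selects the in-process TokenizerGroup, a
# pool_type of "ray" selects RayTokenizerGroupPool, and a BaseTokenizerGroup
# subclass passed as pool_type is used directly. The tokenizer id is an
# assumption for illustration only.
#
# group = get_tokenizer_group(None,
#                             tokenizer_id="gpt2",
#                             enable_lora=False,
#                             max_num_seqs=8,
#                             max_input_length=None)
# token_ids = group.encode("hello world")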
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,66 @@
from abc import ABC, abstractmethod
from typing import List, Optional

from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import AnyTokenizer


class BaseTokenizerGroup(ABC):
    """A group of tokenizers that can be used for LoRA adapters."""

    @classmethod
    @abstractmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "BaseTokenizerGroup":
        pass

    @abstractmethod
    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        pass

    @abstractmethod
    def get_max_input_len(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        pass

    @abstractmethod
    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group."""
        pass

    @abstractmethod
    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request."""
        pass

    @abstractmethod
    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        """Get a tokenizer for a LoRA request."""
        pass

    def check_health(self):
        """Raise exception if the tokenizer group is unhealthy."""
        return
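

# Illustrative sketch (not part of the original file): the smallest concrete
# subclass the ABC above allows, wrapping a single pre-built tokenizer. The
# real in-process implementation is TokenizerGroup; this stub only shows the
# contract a custom pool_type class must satisfy.

class _SingleTokenizerGroup(BaseTokenizerGroup):

    def __init__(self, tokenizer: AnyTokenizer):
        self.tokenizer = tokenizer

    @classmethod
    def from_config(cls, tokenizer_pool_config, **init_kwargs):
        return cls(**init_kwargs)

    def ping(self) -> bool:
        return True

    def get_max_input_len(self, lora_request=None):
        return None  # no length limit enforced

    def encode(self, prompt, request_id=None, lora_request=None):
        return self.tokenizer.encode(prompt)

    async def encode_async(self, prompt, request_id=None, lora_request=None):
        return self.encode(prompt, request_id, lora_request)

    def get_lora_tokenizer(self, lora_request=None):
        return self.tokenizer  # all LoRA adapters share the base tokenizer

    async def get_lora_tokenizer_async(self, lora_request=None):
        return self.tokenizer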
@@ -0,0 +1,240 @@
import asyncio
import os
from typing import List, Optional

try:
    from ray.exceptions import ActorDiedError  # type: ignore
except ImportError:
    # For older versions of Ray
    from ray.exceptions import RayActorError as ActorDiedError  # type: ignore
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

from vllm.config import TokenizerPoolConfig
from vllm.executor.ray_utils import ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .base_tokenizer_group import BaseTokenizerGroup
from .tokenizer_group import TokenizerGroup

logger = init_logger(__name__)


class RayTokenizerGroupPool(BaseTokenizerGroup):
    """A Ray-based pool of TokenizerGroups for async tokenization."""

    # Class to use for workers making up the pool.
    _worker_cls = TokenizerGroup

    @classmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "RayTokenizerGroupPool":
        if not tokenizer_pool_config:
            raise ValueError("tokenizer_pool_config must not be None.")
        ray_actor_options = (tokenizer_pool_config.extra_config or {
            "num_cpus": 0
        })
        ray_actor_options.setdefault(
            "scheduling_strategy",
            NodeAffinitySchedulingStrategy(
                node_id=ray.get_runtime_context().get_node_id(), soft=True))

        # Carry over the env vars to the actors.
        # This is necessary for API keys and such.
        ray_actor_options.setdefault("runtime_env", {})
        _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])

        init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
        init_kwargs["ray_actor_options"] = ray_actor_options

        return cls(**init_kwargs)

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], num_actors: int,
                 ray_actor_options: dict, **tokenizer_config):
        # Store a local copy of the TokenizerGroup for quick access
        # to underlying HF tokenizers.
        self._tokenizer_config = {
            "tokenizer_id": tokenizer_id,
            "enable_lora": enable_lora,
            "max_num_seqs": max_num_seqs,
            "max_input_length": max_input_length,
            **tokenizer_config
        }
        self._local_tokenizer_group = self._worker_cls(
            **self._tokenizer_config, )

        self._ray_tokenizer_group_cls = ray.remote(
            self._worker_cls).options(**ray_actor_options)  # type: ignore
        self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)]
        self._idle_actors: Optional[asyncio.Queue] = None

        # If set, actor is unhealthy. Will reraise on the next
        # check_health call.
        self._exception: Optional[ActorDiedError] = None

    def _init_actor(self) -> ray.ObjectRef:
        return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config)

    @property
    def pool_size(self) -> int:
        return len(self.tokenizer_actors)

    def ping(self):
        return ray.get([
            actor.ping.remote()  # type: ignore
            for actor in self.tokenizer_actors
        ])

    def _ensure_queue_initialized(self):
        if self._idle_actors is None:
            self._idle_actors = asyncio.Queue()
            for actor in self.tokenizer_actors:
                self._idle_actors.put_nowait(actor)

    def _finalize_encode(self, actor: ray.ObjectRef,
                         original_actor: ray.ObjectRef, actor_is_alive: bool):
        assert self._idle_actors is not None
        # Cleanup the dead actor.
        if not actor_is_alive or original_actor is not actor:
            self.tokenizer_actors.remove(original_actor)
        if actor_is_alive:
            # Put the actor back in the queue.
            # This is done in a finally block to ensure that the actor is
            # always put back in the queue, even if an exception/cancellation
            # is raised.
            self._idle_actors.put_nowait(actor)
            # Add back the new actor.
            if original_actor is not actor:
                self.tokenizer_actors.append(actor)

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        The actor is then put back in the queue for future use.
        This is blocking.
        """
        self.check_health()
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        if self._idle_actors.empty():
            raise RuntimeError("No idle actors available.")
        actor = self._idle_actors.get_nowait()
        actor_is_alive = True
        original_actor = actor
        try:
            ret = ray.get(
                actor.encode.remote(request_id=request_id,
                                    prompt=prompt,
                                    lora_request=lora_request))
        except ActorDiedError as e:
            # If the actor is dead, we first try to reinitialize it.
            logger.warning("%s died with ActorDiedError, reinitializing.",
                           actor,
                           exc_info=e)
            actor = self._init_actor()
            try:
                ret = ray.get(
                    actor.encode.remote(request_id=request_id,
                                        prompt=prompt,
                                        lora_request=lora_request))
            except ActorDiedError as e:
                logger.error(
                    "%s died for the second time in a row, marking "
                    "RayTokenizerGroupPool as unhealthy.", actor)
                actor_is_alive = False
                if not self._exception:
                    self._exception = e
                self.check_health()
        finally:
            self._finalize_encode(actor, original_actor, actor_is_alive)
        return ret

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Encode a prompt using the tokenizer group.

        We pick an idle actor and use it to encode the prompt.
        If there are no idle actors, we wait until one becomes
        available.
        The actor is then put back in the queue for future use.
        This is non-blocking.
        """
        self.check_health()
        self._ensure_queue_initialized()
        assert self._idle_actors is not None

        actor = await self._idle_actors.get()
        actor_is_alive = True
        original_actor = actor
        try:
            ret = await actor.encode.remote(request_id=request_id,
                                            prompt=prompt,
                                            lora_request=lora_request)
        except ActorDiedError as e:
            # If the actor is dead, we first try to reinitialize it.
            logger.warning("%s died with ActorDiedError, reinitializing.",
                           actor,
                           exc_info=e)
            actor = self._init_actor()
            try:
                ret = await actor.encode.remote(request_id=request_id,
                                                prompt=prompt,
                                                lora_request=lora_request)
            except ActorDiedError as e:
                logger.error(
                    "%s died for the second time in a row, marking "
                    "RayTokenizerGroupPool as unhealthy.", actor)
                actor_is_alive = False
                if not self._exception:
                    self._exception = e
                self.check_health()
        finally:
            self._finalize_encode(actor, original_actor, actor_is_alive)
        return ret

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self._local_tokenizer_group.get_max_input_len(lora_request)

    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return self._local_tokenizer_group.get_lora_tokenizer(lora_request)

    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return await self._local_tokenizer_group.get_lora_tokenizer_async(
            lora_request)

    def check_health(self):
        if self._exception:
            raise RuntimeError(
                "TokenizerGroupPool is unhealthy.") from self._exception


def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
    """Copy over all current process environment variables to the runtime_env.

    The variables in runtime_env will take precedence over the current process
    environment variables.

    runtime_env will be modified in place."""
    env_vars = os.environ.copy()
    runtime_env.setdefault("env_vars", {})
    env_vars.update(runtime_env["env_vars"])
    runtime_env["env_vars"] = env_vars
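

# Illustrative sketch (not part of the original file): the precedence rule of
# _carry_over_env_vars_to_runtime_env in isolation. Because the existing
# runtime_env["env_vars"] entries are update()-ed onto the os.environ copy,
# they win on key collisions. Pure-dict demo, no Ray required.

def _demo_env_var_precedence():
    os.environ["SHARED_KEY"] = "from-process"
    runtime_env = {"env_vars": {"SHARED_KEY": "from-runtime-env"}}
    _carry_over_env_vars_to_runtime_env(runtime_env)
    # The explicit runtime_env value overrides the inherited process value.
    assert runtime_env["env_vars"]["SHARED_KEY"] == "from-runtime-env"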
@@ -0,0 +1,99 @@
from typing import List, Optional

from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
                                               get_lora_tokenizer,
                                               get_lora_tokenizer_async,
                                               get_tokenizer)
from vllm.utils import LRUCache

from .base_tokenizer_group import BaseTokenizerGroup


class TokenizerGroup(BaseTokenizerGroup):
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int], **tokenizer_config):
        self.tokenizer_id = tokenizer_id
        self.tokenizer_config = tokenizer_config
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
        self.lora_tokenizers = LRUCache[AnyTokenizer](
            capacity=max_num_seqs if enable_lora else 0)

    @classmethod
    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
                    **init_kwargs) -> "TokenizerGroup":
        return cls(**init_kwargs)

    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        return True

    def get_max_input_len(self,
                          lora_request: Optional[LoRARequest] = None
                          ) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self.max_input_length

    def _raise_if_input_too_long(self,
                                 encoded_tokens: List[int],
                                 lora_request: Optional[LoRARequest] = None):
        input_length = len(encoded_tokens)
        if lora_request:
            max_input_length = (lora_request.long_lora_max_len
                                or self.max_input_length)
        else:
            max_input_length = self.max_input_length
        if max_input_length is not None and input_length > max_input_length:
            raise ValueError("Input too long.", input_length, max_input_length)

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        ret = tokenizer.encode(prompt)
        self._raise_if_input_too_long(ret, lora_request)
        return ret

    async def encode_async(
            self,
            prompt: str,
            request_id: Optional[str] = None,
            lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        ret = tokenizer.encode(prompt)
        self._raise_if_input_too_long(ret, lora_request)
        return ret

    def get_lora_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (get_lora_tokenizer(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers[lora_request.lora_int_id]

    async def get_lora_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            tokenizer = (await get_lora_tokenizer_async(
                lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers[lora_request.lora_int_id]
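

# Illustrative note (not part of the original file): max_input_length
# enforcement in brief. With enable_lora=False every request shares the base
# tokenizer, and encode() raises once the encoded prompt exceeds the limit.
# The tokenizer id and limit are assumptions for illustration only.
#
# group = TokenizerGroup(tokenizer_id="gpt2",
#                        enable_lora=False,
#                        max_num_seqs=8,
#                        max_input_length=4)
# group.encode("hi")        # fine: well under the limit
# group.encode("a " * 100)  # raises ValueError("Input too long.", ...)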
@@ -0,0 +1,3 @@
from .mistral import MistralTokenizer, maybe_serialize_tool_calls

__all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"]
Binary file not shown.
Binary file not shown.
363
vllm-v0.6.2/vllm/transformers_utils/tokenizers/mistral.py
Normal file
@@ -0,0 +1,363 @@
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast

import huggingface_hub
from huggingface_hub import HfApi, hf_hub_download
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.base import SpecialTokens
# yapf: disable
from mistral_common.tokens.tokenizers.mistral import (
    MistralTokenizer as PublicMistralTokenizer)
# yapf: enable
from mistral_common.tokens.tokenizers.sentencepiece import (
    SentencePieceTokenizer)
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
                                                     Tekkenizer)

from vllm.logger import init_logger

if TYPE_CHECKING:
    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

logger = init_logger(__name__)


@dataclass
class Encoding:
    input_ids: List[int]


def maybe_serialize_tool_calls(request: ChatCompletionRequest):
    # SEE: https://github.com/vllm-project/vllm/pull/9951
    # Credits go to: @gcalmettes
    # NOTE: There is currently a bug in pydantic where attributes
    # declared as iterables are replaced in the instances by
    # pydantic-core ValidatorIterator instance. In particular, this
    # affects tool_calls defined in ChatCompletionAssistantMessageParam
    # model:
    # see:
    # - https://github.com/pydantic/pydantic/issues/9467
    # As a result, tool_calls from assistant messages are never
    # deserialized in the request object if the tool_calls iterator is
    # not consumed. This affects messages passed to the MistralTokenizer
    # since no chat template is applied and therefore the tool_calls
    # iterator is not directly consumed.
    # Issue is tracked on Pydantic side, with resolution planned for
    # v2.11 release. In the meantime, the official workaround is to
    # consume the iterator so the tool_calls are correctly deserialized
    # in the OpenAI ChatCompletionAssistantMessageParam object
    # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
    # Official Pydantic Issues:
    # - https://github.com/pydantic/pydantic/issues/9541
    # TODO: remove when pydantic v2.11 is released
    for i, message in enumerate(request.messages):
        if message.get("role") == 'assistant':
            tool_calls_validator = message.get("tool_calls", ().__iter__())
            validated_tool_calls = []
            while True:
                try:
                    tool_call = next(tool_calls_validator)  # type: ignore
                    validated_tool_calls.append(tool_call)
                except StopIteration:
                    break

            request.messages[i]["tool_calls"] = validated_tool_calls


def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]:
    repo_cache = os.path.join(
        huggingface_hub.constants.HF_HUB_CACHE,
        huggingface_hub.constants.REPO_ID_SEPARATOR.join(
            ["models", *repo_id.split("/")]))

    if revision is None:
        revision_file = os.path.join(repo_cache, "refs", "main")
        if os.path.isfile(revision_file):
            with open(revision_file) as file:
                revision = file.read()

    if revision:
        revision_dir = os.path.join(repo_cache, "snapshots", revision)
        if os.path.isdir(revision_dir):
            return os.listdir(revision_dir)

    return []


def find_tokenizer_file(files: List[str]):
    file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")

    matched_files = [file for file in files if file_pattern.match(file)]
    if len(matched_files) > 1:
        raise OSError(f"Found {len(matched_files)} files matching the "
                      f"pattern: {file_pattern}. Make sure only one Mistral "
                      f"tokenizer is present in {files}.")
    elif len(matched_files) == 0:
        raise OSError(f"Found {len(matched_files)} files matching the "
                      f"pattern: {file_pattern}. Make sure that a Mistral "
                      f"tokenizer is present in {files}.")

    return matched_files[0]


class MistralTokenizer:

    def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
        self.mistral = tokenizer
        self.instruct = tokenizer.instruct_tokenizer

        tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
        self.is_tekken = isinstance(tokenizer_, Tekkenizer)
        self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
        if self.is_tekken:
            # Make sure special tokens will not raise
            tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
        elif self.is_spm:
            pass
        else:
            raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")

        self._vocab = tokenizer_.vocab()
        # Convert to a Dict[str, int] to match protocol, but this is a lossy
        # conversion. There may be multiple token ids that decode to the same
        # string due to partial UTF-8 byte sequences being converted to �
        self._vocab_dict = {
            token: idx
            for idx, token in enumerate(self._vocab)
        }
        self.tokenizer = tokenizer_
        self._max_token_id = self.vocab_size - 1

    @classmethod
    def from_pretrained(cls,
                        path_or_repo_id: str,
                        *,
                        revision: Optional[str] = None) -> "MistralTokenizer":
        if not Path(path_or_repo_id).exists():
            assert len(path_or_repo_id.split("/")) == 2, (
                "You have either provided a non-existent path: "
                f"{path_or_repo_id} or an invalid HF Hub repo id.")
            tokenizer_file = cls._download_mistral_tokenizer_from_hf(
                path_or_repo_id, revision)
        elif Path(path_or_repo_id).is_dir():
            tokenizer_file_name = find_tokenizer_file(
                os.listdir(path_or_repo_id))
            tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
        else:
            assert Path(
                path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
            # The path points directly at a tokenizer file.
            tokenizer_file = str(path_or_repo_id)

        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
        return cls(mistral_tokenizer)

    @staticmethod
    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                            revision: Optional[str]) -> str:
        try:
            hf_api = HfApi()
            files = hf_api.list_repo_files(repo_id=tokenizer_name,
                                           revision=revision)
        except ConnectionError as exc:
            files = list_local_repo_files(repo_id=tokenizer_name,
                                          revision=revision)

            if len(files) == 0:
                raise exc

        filename = find_tokenizer_file(files)

        tokenizer_file = hf_hub_download(tokenizer_name,
                                         filename=filename,
                                         revision=revision)
        return tokenizer_file

    # the following attributes are set to fit vLLM's design and are used
    # by the guided structured output backends.
    @property
    def all_special_tokens_extended(self) -> List[str]:
        # tekken defines its own extended special tokens list
        if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
            special_tokens = self.tokenizer.SPECIAL_TOKENS
        else:
            special_tokens = list(SpecialTokens)
        return [
            s.value if isinstance(s, SpecialTokens) else s
            for s in special_tokens
        ]

    @property
    def all_special_tokens(self) -> List[str]:
        return self.all_special_tokens_extended

    @property
    def all_special_ids(self) -> List[int]:
        return [
            self.all_special_tokens.index(t) for t in self.all_special_tokens
        ]

    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_id

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_id

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)

    @property
    def max_token_id(self) -> int:
        return self._max_token_id

    def __len__(self) -> int:
        return self.vocab_size

    def __call__(
        self,
        prompt: str,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        # Mistral Tokenizers should not add special tokens
        input_ids = self.encode(prompt)

        if truncation:
            input_ids = input_ids[:max_length]

        return Encoding(input_ids=input_ids)

    def get_vocab(self) -> Dict[str, int]:
        # NB: the dictionary form of the vocabulary collapses token ids that
        # map to the same string but have different bytes
        return self._vocab_dict

    def get_added_vocab(self) -> Dict[str, int]:
        # Mistral tokenizers have no added vocabulary
        return {}

    def encode(self, prompt: str) -> List[int]:
        # `encode` should only be used for prompt completion;
        # it should never be used for chat completion.
        # For chat completion use `apply_chat_template`.
        return self.tokenizer.encode(prompt, bos=True, eos=False)

    def apply_chat_template(self,
                            messages: List["ChatCompletionMessageParam"],
                            tools: Optional[Dict[str, Any]] = None,
                            **kwargs) -> List[int]:

        last_message = cast(Dict[str, Any], messages[-1])
        if last_message["role"] == "assistant":
            last_message["prefix"] = True

        request = ChatCompletionRequest(messages=messages,
                                        tools=tools)  # type: ignore[type-var]
        encoded = self.mistral.encode_chat_completion(request)

        # encode-decode to get clean prompt
        return encoded.tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        if self.is_tekken:
            tokens = [
                t for t in tokens
                if (t is SpecialTokens.tool_calls
                    or t not in self.tokenizer._all_special_tokens)
            ]

            if any(isinstance(t, bytes) for t in tokens):
                # we need to encode and decode all tokens again
                shift = self.tokenizer.num_special_tokens

                def _token_to_id(t: str):
                    t_bytes = t.encode("utf-8") \
                        if not isinstance(t, bytes) else t
                    try:
                        return shift + \
                            self.tokenizer._tekken_token2id_nospecial[t_bytes]
                    except KeyError:
                        logger.warning(
                            "Failed to convert token %s to id,"
                            " replacing with <unk>", t_bytes)
                        return self.tokenizer.unk_id

                ids = [_token_to_id(t) for t in tokens]
                decoded = self.tokenizer.decode(ids)
            else:
                decoded = "".join(tokens)
        else:
            # make sure certain special tokens like Tool calls are
            # not decoded
            special_tokens = {SpecialTokens.tool_calls}
            regular_tokens: List[str] = []
            decoded_list = []

            for token in tokens:
                if token in special_tokens:
                    if regular_tokens:
                        decoded_list.append(
                            self.tokenizer.decode(regular_tokens))
                        regular_tokens = []
                    decoded_list.append(token)
                else:
                    regular_tokens.append(token)

            if regular_tokens:
                decoded_list.append(
                    self.decode(regular_tokens))  # type: ignore

            decoded = ''.join(decoded_list)

        return decoded

    def decode(self,
               ids: Union[List[int], int],
               skip_special_tokens: bool = True) -> str:
        assert (
            skip_special_tokens
        ), "skip_special_tokens=False is not supported for Mistral tokenizers."

        if isinstance(ids, int):
            ids = [ids]
        return self.tokenizer.decode(ids)

    def convert_ids_to_tokens(
        self,
        ids: List[int],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        # TODO(Patrick) - potentially allow special tokens to not be skipped
        assert (
            skip_special_tokens
        ), "skip_special_tokens=False is not supported for Mistral tokenizers."

        assert self.is_tekken or self.is_spm, type(self.tokenizer)

        if self.is_tekken:
            # skip special tokens except tool call
            ids = [
                i for i in ids if i > self.tokenizer.num_special_tokens or i ==
                self.tokenizer.get_control_token(SpecialTokens.tool_calls)
            ]

        tokens = [self.tokenizer.id_to_piece(id) for id in ids]

        if any("�" in t for t in tokens) and self.is_tekken:
            # if a decoded token contains the replacement character, then the
            # token has an incomplete UTF-8 character so we must use bytes
            # See: https://github.com/vllm-project/vllm/pull/8640
            #      https://github.com/vllm-project/vllm/pull/9625
            # if the underlying tokenizer is sentencepiece, we just keep "�"
            tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]

        return tokens
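

# Illustrative sketch (not part of the original file): what
# find_tokenizer_file above accepts. Exactly one file must match either the
# tokenizer.model.v* pattern or tekken.json; the filenames are made up.

def _demo_find_tokenizer_file():
    assert find_tokenizer_file(["config.json", "tekken.json"]) == "tekken.json"
    assert find_tokenizer_file(["tokenizer.model.v3"]) == "tokenizer.model.v3"
    try:
        find_tokenizer_file(["tekken.json", "tokenizer.model.v3"])
    except OSError:
        pass  # two matches -> ambiguous, rejected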
16
vllm-v0.6.2/vllm/transformers_utils/utils.py
Normal file
@@ -0,0 +1,16 @@
from os import PathLike
from pathlib import Path
from typing import Union


def check_gguf_file(model: Union[str, PathLike]) -> bool:
    """Check if the file is a GGUF model."""
    model = Path(model)
    if not model.is_file():
        return False
    elif model.suffix == ".gguf":
        return True

    with open(model, "rb") as f:
        header = f.read(4)
        return header == b"GGUF"
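

# Illustrative sketch (not part of the original file): check_gguf_file falls
# back to the 4-byte "GGUF" magic when the extension is missing, so a renamed
# GGUF file is still detected. Uses only a throwaway temp file.

def _demo_check_gguf_file():
    import tempfile
    with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
        f.write(b"GGUF" + b"\x00" * 16)  # magic bytes + dummy payload
        path = f.name
    assert check_gguf_file(path) is True          # detected via magic
    assert check_gguf_file("missing.gguf") is False  # not a file on disk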