Sync from v0.13
This commit is contained in:
108
vllm/config/__init__.py
Normal file
108
vllm/config/__init__.py
Normal file
@@ -0,0 +1,108 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config.attention import AttentionConfig
|
||||
from vllm.config.cache import CacheConfig
|
||||
from vllm.config.compilation import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
CUDAGraphMode,
|
||||
PassConfig,
|
||||
)
|
||||
from vllm.config.device import DeviceConfig
|
||||
from vllm.config.ec_transfer import ECTransferConfig
|
||||
from vllm.config.kv_events import KVEventsConfig
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.config.model import (
|
||||
ModelConfig,
|
||||
iter_architecture_defaults,
|
||||
try_match_architecture_defaults,
|
||||
)
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.config.observability import ObservabilityConfig
|
||||
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.config.profiler import ProfilerConfig
|
||||
from vllm.config.scheduler import SchedulerConfig
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||
from vllm.config.structured_outputs import StructuredOutputsConfig
|
||||
from vllm.config.utils import (
|
||||
ConfigType,
|
||||
SupportsMetricsInfo,
|
||||
config,
|
||||
get_attr_docs,
|
||||
is_init_field,
|
||||
update_config,
|
||||
)
|
||||
from vllm.config.vllm import (
|
||||
VllmConfig,
|
||||
get_cached_compilation_config,
|
||||
get_current_vllm_config,
|
||||
get_layers_from_vllm_config,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
|
||||
# __all__ should only contain classes and functions.
# Types and globals should be imported from their respective modules.
# NOTE: sections are kept in the same order as the imports above;
# the profiler section previously sat out of order after structured_outputs.
__all__ = [
    # From vllm.config.attention
    "AttentionConfig",
    # From vllm.config.cache
    "CacheConfig",
    # From vllm.config.compilation
    "CompilationConfig",
    "CompilationMode",
    "CUDAGraphMode",
    "PassConfig",
    # From vllm.config.device
    "DeviceConfig",
    # From vllm.config.ec_transfer
    "ECTransferConfig",
    # From vllm.config.kv_events
    "KVEventsConfig",
    # From vllm.config.kv_transfer
    "KVTransferConfig",
    # From vllm.config.load
    "LoadConfig",
    # From vllm.config.lora
    "LoRAConfig",
    # From vllm.config.model
    "ModelConfig",
    "iter_architecture_defaults",
    "try_match_architecture_defaults",
    # From vllm.config.multimodal
    "MultiModalConfig",
    # From vllm.config.observability
    "ObservabilityConfig",
    # From vllm.config.parallel
    "EPLBConfig",
    "ParallelConfig",
    # From vllm.config.pooler
    "PoolerConfig",
    # From vllm.config.profiler
    "ProfilerConfig",
    # From vllm.config.scheduler
    "SchedulerConfig",
    # From vllm.config.speculative
    "SpeculativeConfig",
    # From vllm.config.speech_to_text
    "SpeechToTextConfig",
    # From vllm.config.structured_outputs
    "StructuredOutputsConfig",
    # From vllm.config.utils
    "ConfigType",
    "SupportsMetricsInfo",
    "config",
    "get_attr_docs",
    "is_init_field",
    "update_config",
    # From vllm.config.vllm
    "VllmConfig",
    "get_cached_compilation_config",
    "get_current_vllm_config",
    "set_current_vllm_config",
    "get_layers_from_vllm_config",
]
|
||||
114
vllm/config/attention.py
Normal file
114
vllm/config/attention.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@config
@dataclass
class AttentionConfig:
    """Configuration for attention mechanisms in vLLM."""

    backend: AttentionBackendEnum | None = None
    """Attention backend to use. If None, will be selected automatically."""

    flash_attn_version: Literal[2, 3] | None = None
    """Force vllm to use a specific flash-attention version (2 or 3).
    Only valid when using the flash-attention backend."""

    use_prefill_decode_attention: bool = False
    """Use separate prefill and decode kernels for attention instead of
    the unified triton kernel."""

    flash_attn_max_num_splits_for_cuda_graph: int = 32
    """Flash Attention max number splits for cuda graph decode."""

    use_cudnn_prefill: bool = False
    """Whether to use cudnn prefill."""

    use_trtllm_ragged_deepseek_prefill: bool = False
    """Whether to use TRTLLM ragged deepseek prefill."""

    use_trtllm_attention: bool | None = None
    """If set to True/False, use or don't use the TRTLLM attention backend
    in flashinfer. If None, auto-detect the attention backend in flashinfer."""

    disable_flashinfer_prefill: bool = False
    """Whether to disable flashinfer prefill."""

    disable_flashinfer_q_quantization: bool = False
    """If set, when using fp8 kv, do not quantize Q to fp8."""

    def compute_hash(self) -> str:
        """
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        # No attention field is excluded from the hash.
        ignored_factors: list[str] = []
        return hash_factors(get_hash_factors(self, ignored_factors))

    @field_validator("backend", mode="before")
    @classmethod
    def validate_backend_before(cls, value: Any) -> Any:
        """Enable parsing of the `backend` enum type from string."""
        if not isinstance(value, str):
            return value
        return AttentionBackendEnum[value.upper()]

    def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
        """Set field from env var if set, with deprecation warning."""
        from vllm import envs

        if not envs.is_set(env_var_name):
            return
        value = getattr(envs, env_var_name)
        if field_name == "backend":
            # Env var carries the backend name as a string; convert to enum.
            value = self.validate_backend_before(value)
        setattr(self, field_name, value)
        logger.warning_once(
            "Using %s environment variable is deprecated and will be removed in "
            "v0.14.0 or v1.0.0, whichever is soonest. Please use "
            "--attention-config.%s command line argument or "
            "AttentionConfig(%s=...) config field instead.",
            env_var_name,
            field_name,
            field_name,
        )

    def __post_init__(self) -> None:
        # Legacy environment variables still override the config fields,
        # each triggering a deprecation warning in _set_from_env_if_set.
        env_overrides = (
            ("backend", "VLLM_ATTENTION_BACKEND"),
            ("flash_attn_version", "VLLM_FLASH_ATTN_VERSION"),
            (
                "use_prefill_decode_attention",
                "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
            ),
            (
                "flash_attn_max_num_splits_for_cuda_graph",
                "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
            ),
            ("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL"),
            (
                "use_trtllm_ragged_deepseek_prefill",
                "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
            ),
            ("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION"),
            ("disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL"),
            (
                "disable_flashinfer_q_quantization",
                "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
            ),
        )
        for field_name, env_var_name in env_overrides:
            self._set_from_env_if_set(field_name, env_var_name)
|
||||
232
vllm/config/cache.py
Normal file
232
vllm/config/cache.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import field
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
from pydantic import Field, SkipValidation, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import get_cpu_memory
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
else:
|
||||
ParallelConfig = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
|
||||
CacheDType = Literal[
|
||||
"auto",
|
||||
"bfloat16",
|
||||
"fp8",
|
||||
"fp8_e4m3",
|
||||
"fp8_e5m2",
|
||||
"fp8_inc",
|
||||
"fp8_ds_mla",
|
||||
]
|
||||
MambaDType = Literal["auto", "float32", "float16"]
|
||||
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
|
||||
KVOffloadingBackend = Literal["native", "lmcache"]
|
||||
|
||||
|
||||
@config
@dataclass
class CacheConfig:
    """Configuration for the KV cache.

    `num_gpu_blocks` / `num_cpu_blocks` are derived after profiling; all
    other fields are user-facing knobs.
    """

    block_size: SkipValidation[BlockSize] = None  # type: ignore
    """Size of a contiguous cache block in number of tokens. On CUDA devices,
    only block sizes up to 32 are supported.

    This config has no static default. If left unspecified by the user, it will
    be set in `Platform.check_and_update_config()` based on the current
    platform."""
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
    utilization. If unspecified, will use the default value of 0.9. This is a
    per-instance limit, and only applies to the current vLLM instance. It does
    not matter if you have another vLLM instance running on the same GPU. For
    example, if you have two vLLM instances running on the same GPU, you can
    set the GPU memory utilization to 0.5 for each instance."""
    swap_space: float = Field(default=4, ge=0)
    """Size of the CPU swap space per GPU (in GiB)."""
    cache_dtype: CacheDType = "auto"
    """Data type for kv cache storage. If "auto", will use model data type.
    CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
    fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).
    Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use
    bfloat16 instead, this is an invalid option for models that do not default
    to fp8.
    """
    is_attention_free: bool = False
    """Whether the model is attention-free. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    num_gpu_blocks_override: int | None = None
    """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
    if specified. Does nothing if `None`. Used for testing preemption."""
    sliding_window: int | None = None
    """Sliding window size for the KV cache. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
    """Set the hash algorithm for prefix caching:\n
    - "sha256" uses Pickle for object serialization before hashing. This is the
    current default, as SHA256 is the most secure choice to avoid potential
    hash collisions.\n
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
    serializes objects using canonical CBOR and hashes them with SHA-256.\n
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
    non-cryptographic hashing. Requires the optional ``xxhash`` package.
    IMPORTANT: Use of a hashing algorithm that is not considered
    cryptographically secure theoretically increases the risk of hash collisions,
    which can cause undefined behavior or even leak private information in
    multi-tenant environments. Even if collisions are still very unlikely, it is
    important to consider your security risk tolerance against the performance
    benefits before turning this on.\n
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
    reproducible hashing. Requires the optional ``xxhash`` package."""
    cpu_offload_gb: float = Field(default=0, ge=0)
    """The space in GiB to offload to CPU, per GPU. Default is 0, which means
    no offloading. Intuitively, this argument can be seen as a virtual way to
    increase the GPU memory size. For example, if you have one 24 GB GPU and
    set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
    load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
    Note that this requires fast CPU-GPU interconnect, as part of the model is
    loaded from CPU memory to GPU memory on the fly in each model forward pass.
    """
    calculate_kv_scales: bool = False
    """This enables dynamic calculation of `k_scale` and `v_scale` when
    kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
    checkpoint if available. Otherwise, the scales will default to 1.0."""
    cpu_kvcache_space_bytes: int | None = None
    """(CPU backend only) CPU key-value cache space."""
    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
    # NOTE(review): only `gt=0` is validated here; the "multiple of 8" and
    # "only with prefix caching" constraints below are presumably enforced
    # elsewhere — confirm before relying on them.
    mamba_block_size: int | None = Field(default=None, gt=0)
    """Size of a contiguous cache block in number of tokens for mamba cache.
    Can be set only when prefix caching is enabled.
    Value must be a multiple of 8 to align with causal_conv1d kernel."""
    mamba_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (both the conv as well as the
    ssm state). If set to 'auto', the data type will be inferred from the model
    config."""
    mamba_ssm_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (ssm state only, conv state will
    still be controlled by mamba_cache_dtype). If set to 'auto', the data type
    for the ssm state will be determined by mamba_cache_dtype."""

    # Will be set after profiling.
    num_gpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for GPU memory."""
    num_cpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for CPU memory."""

    kv_sharing_fast_prefill: bool = False
    """This feature is work in progress and no prefill optimization takes place
    with this flag enabled currently.

    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """

    kv_cache_memory_bytes: int | None = None
    """Size of KV Cache per GPU in bytes. By default, this is set to None
    and vllm can automatically infer the kv cache size based on
    gpu_memory_utilization. However, users may want to manually specify
    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
    control of how much memory gets used when compared with using
    gpu_memory_utilization. Note that kv_cache_memory_bytes
    (when not-None) ignores gpu_memory_utilization"""

    kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
    to None, which means no KV offloading is enabled. When set with
    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""

    kv_offloading_backend: KVOffloadingBackend | None = None
    """The backend to use for KV cache offloading. Supported backends include
    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
    together with kv_offloading_size."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # Fields that do not change the compiled graph are excluded, so two
        # deployments differing only in these knobs share the same hash.
        ignored_factors = {
            # Runtime/derived knobs that don't affect compiled graph shape
            "gpu_memory_utilization",
            "swap_space",
            "is_attention_free",
            "num_gpu_blocks_override",
            "enable_prefix_caching",
            "prefix_caching_hash_algo",
            "cpu_kvcache_space_bytes",
            "mamba_page_size_padded",
            # Post-init/derived counters
            "num_gpu_blocks",
            "num_cpu_blocks",
            # WIP feature toggle not impacting compiled graph shape
            "kv_sharing_fast_prefill",
        }

        # Imported locally to avoid a module-level import cycle with
        # vllm.config.utils.
        from vllm.config.utils import get_hash_factors, hash_factors

        factors = get_hash_factors(self, ignored_factors)
        return hash_factors(factors)

    def metrics_info(self):
        """Return this config as a flat `str -> str` mapping for metrics."""
        # convert cache_config to dict(key: str, value: str) for prometheus
        # metrics info
        return {key: str(value) for key, value in self.__dict__.items()}

    @field_validator("cache_dtype", mode="after")
    @classmethod
    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
        """Log an informational note for any of the fp8* cache dtypes."""
        # Covers "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc" and "fp8_ds_mla".
        if cache_dtype.startswith("fp8"):
            logger.info(
                "Using fp8 data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "Meanwhile, it may cause accuracy drop without a proper "
                "scaling factor."
            )
        return cache_dtype

    def verify_with_parallel_config(
        self,
        parallel_config: ParallelConfig,
    ) -> None:
        """Sanity-check the total swap space against available CPU memory.

        Raises:
            ValueError: if swap usage would exceed 70% of total CPU memory.
        """
        swap_space_bytes = self.swap_space * GiB_bytes
        total_cpu_memory = get_cpu_memory()
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = swap_space_bytes * num_gpus_per_node

        msg = (
            f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
            f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
            "is allocated for the swap space."
        )
        # >70% of host RAM is rejected outright; >40% only warns.
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. %s", msg)
|
||||
1140
vllm/config/compilation.py
Normal file
1140
vllm/config/compilation.py
Normal file
File diff suppressed because it is too large
Load Diff
75
vllm/config/device.py
Normal file
75
vllm/config/device.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, SkipValidation
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
|
||||
|
||||
|
||||
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Device | torch.device | None] = "auto"
    """Device type for vLLM execution.
    This parameter is deprecated and will be
    removed in a future release.
    It will now be set automatically based
    on the current platform."""
    device_type: str = field(init=False)
    """Device type from the current platform. This is set in
    `__post_init__`."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # the device/platform information will be summarized
        # by torch/vllm automatically.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self):
        # BUGFIX: `device` is annotated as possibly-None, but `None` used to
        # fall through both branches below, leaving `device_type` unset and
        # raising AttributeError on the "tpu" check. Treat None like "auto".
        if self.device == "auto" or self.device is None:
            # Automated device type detection
            from vllm.platforms import current_platform

            self.device_type = current_platform.device_type
            if not self.device_type:
                raise RuntimeError(
                    "Failed to infer device type, please set "
                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
                    "to turn on verbose logging to help debug the issue."
                )
        else:
            # Device type is assigned explicitly
            if isinstance(self.device, str):
                self.device_type = self.device
            elif isinstance(self.device, torch.device):
                self.device_type = self.device.type

        # Some device types require processing inputs on CPU
        if self.device_type in ["tpu"]:
            self.device = None
        else:
            # Set device with device type
            self.device = torch.device(self.device_type)
|
||||
110
vllm/config/ec_transfer.py
Normal file
110
vllm/config/ec_transfer.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import hashlib
|
||||
import uuid
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, get_args
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
|
||||
ECProducer = Literal["ec_producer"]
|
||||
ECConsumer = Literal["ec_consumer"]
|
||||
ECRole = Literal[ECProducer, ECConsumer]
|
||||
|
||||
|
||||
@config
@dataclass
class ECTransferConfig:
    """Configuration for distributed EC cache transfer."""

    ec_connector: str | None = None
    """The EC connector for vLLM to transmit EC caches between vLLM instances.
    """

    engine_id: str | None = None
    """The engine id for EC transfers."""

    ec_buffer_device: str | None = "cuda"
    """The device used by ec connector to buffer the EC cache.
    Currently only support 'cuda'."""

    ec_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""

    ec_role: ECRole | None = None
    """Whether this vLLM instance produces, consumes EC cache, or both. Choices
    are 'ec_producer', 'ec_consumer'."""

    ec_rank: int | None = None
    """The rank of this vLLM instance in the EC cache transfer. Typical value:
    0 for encoder, 1 for pd instance.
    Currently only 1P1D is supported."""

    ec_parallel_size: int = 1
    """The number of parallel instances for EC cache transfer. For
    PyNcclConnector, this should be 2."""

    ec_ip: str = "127.0.0.1"
    """The EC connector ip, used to build distributed connection."""

    ec_port: int = 14579
    """The EC connector port, used to build distributed connection."""

    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""

    ec_connector_module_path: str | None = None
    """The Python module path to dynamically load the EC connector from.
    Only supported in V1."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # Use safe_hash for consistency with the other config classes
        # (e.g. DeviceConfig, KVTransferConfig) instead of calling
        # hashlib.md5 directly.
        from vllm.utils.hashing import safe_hash

        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self) -> None:
        # Generate a unique engine id when none was provided.
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        if self.ec_role is not None and self.ec_role not in get_args(ECRole):
            raise ValueError(
                f"Unsupported ec_role: {self.ec_role}. "
                f"Supported roles are {get_args(ECRole)}"
            )

        # A connector without a role is ambiguous; require both together.
        if self.ec_connector is not None and self.ec_role is None:
            raise ValueError(
                "Please specify ec_role when ec_connector "
                f"is set, supported roles are {get_args(ECRole)}"
            )

    @property
    def is_ec_transfer_instance(self) -> bool:
        """True when a connector is configured with any valid EC role."""
        return self.ec_connector is not None and self.ec_role in get_args(ECRole)

    @property
    def is_ec_producer(self) -> bool:
        """True when this instance produces EC cache."""
        return self.ec_connector is not None and self.ec_role in get_args(ECProducer)

    @property
    def is_ec_consumer(self) -> bool:
        """True when this instance consumes EC cache."""
        return self.ec_connector is not None and self.ec_role in get_args(ECConsumer)

    def get_from_extra_config(self, key, default) -> Any:
        """Look up `key` in `ec_connector_extra_config`, else `default`."""
        return self.ec_connector_extra_config.get(key, default)
|
||||
56
vllm/config/kv_events.py
Normal file
56
vllm/config/kv_events.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
|
||||
|
||||
@config
@dataclass
class KVEventsConfig:
    """Configuration for KV event publishing."""

    enable_kv_cache_events: bool = False
    """If True, enable KV cache events for tracking block storage and removal.
    Events can be published externally by zmq using the event publisher config.
    """

    # BUGFIX: the annotation previously omitted `None` even though the default
    # is None (resolved in `__post_init__`); an explicit `publisher=None`
    # would have failed validation against the bare Literal.
    publisher: Literal["null", "zmq"] | None = Field(default=None)
    """The publisher to use for publishing kv events. Can be "null", "zmq".
    If None, resolved in `__post_init__`: "zmq" when kv cache events are
    enabled, otherwise "null".
    """

    endpoint: str = "tcp://*:5557"
    """The zmq endpoint to use for publishing kv events.
    """

    replay_endpoint: str | None = None
    """The zmq endpoint to use for replaying kv events.
    """

    buffer_steps: int = 10_000
    """The number of steps to cache for replay endpoint. Will only save
    events from the last N steps for the replay endpoint.
    """

    hwm: int = 100_000
    """The zmq high water mark for the event publisher. After queueing N events,
    events will start dropping if the consumer is not keeping up.
    """

    max_queue_size: int = 100_000
    """The maximum number of events to queue while waiting for publishing.
    """

    topic: str = ""
    """The topic to use for the event publisher. Consumers can subscribe to
    this topic to receive events.
    """

    def __post_init__(self):
        # Resolve the publisher sentinel from enable_kv_cache_events.
        if self.publisher is None:
            self.publisher = "zmq" if self.enable_kv_cache_events else "null"
|
||||
119
vllm/config/kv_transfer.py
Normal file
119
vllm/config/kv_transfer.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import uuid
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, get_args
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
KVProducer = Literal["kv_producer", "kv_both"]
|
||||
KVConsumer = Literal["kv_consumer", "kv_both"]
|
||||
KVRole = Literal[KVProducer, KVConsumer]
|
||||
|
||||
|
||||
@config
@dataclass
class KVTransferConfig:
    """Configuration for distributed KV cache transfer."""

    kv_connector: str | None = None
    """The KV connector for vLLM to transmit KV caches between vLLM instances.
    """

    engine_id: str | None = None
    """The engine id for KV transfers."""

    kv_buffer_device: str = "cuda"
    """The device used by kv connector to buffer the KV cache. Choices are
    'cuda' and 'cpu'."""

    kv_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""

    kv_role: KVRole | None = None
    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
    are 'kv_producer', 'kv_consumer', and 'kv_both'."""

    kv_rank: int | None = None
    """The rank of this vLLM instance in the KV cache transfer. Typical value:
    0 for prefill instance, 1 for decode instance.
    Currently only 1P1D is supported."""

    kv_parallel_size: int = 1
    """The number of parallel instances for KV cache transfer. For
    P2pNcclConnector, this should be 2."""

    kv_ip: str = "127.0.0.1"
    """The KV connector ip, used to build distributed connection."""

    kv_port: int = 14579
    """The KV connector port, used to build distributed connection."""

    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""

    kv_connector_module_path: str | None = None
    """The Python module path to dynamically load the KV connector from.
    Only supported in V1."""

    enable_permute_local_kv: bool = False
    """Experiment feature flag to enable HND to NHD KV Transfer"""

    kv_load_failure_policy: Literal["recompute", "fail"] = "recompute"
    """Policy for handling KV cache load failures.
    'recompute': reschedule the request to recompute failed blocks (default)
    'fail': immediately fail the request with an error finish reason"""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self) -> None:
        # Generate a unique engine id when none was provided.
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        # Reject role strings outside the KVRole Literal.
        if self.kv_role is not None and self.kv_role not in get_args(KVRole):
            raise ValueError(
                f"Unsupported kv_role: {self.kv_role}. "
                f"Supported roles are {get_args(KVRole)}"
            )

        # A connector without a role is ambiguous; require both together.
        if self.kv_connector is not None and self.kv_role is None:
            raise ValueError(
                "Please specify kv_role when kv_connector "
                f"is set, supported roles are {get_args(KVRole)}"
            )

    @property
    def is_kv_transfer_instance(self) -> bool:
        # True when a connector is configured with any valid role.
        return self.kv_connector is not None and self.kv_role in get_args(KVRole)

    @property
    def is_kv_producer(self) -> bool:
        # "kv_both" counts as a producer (see KVProducer Literal).
        return self.kv_connector is not None and self.kv_role in get_args(KVProducer)

    @property
    def is_kv_consumer(self) -> bool:
        # "kv_both" counts as a consumer (see KVConsumer Literal).
        return self.kv_connector is not None and self.kv_role in get_args(KVConsumer)

    def get_from_extra_config(self, key, default) -> Any:
        """Look up `key` in `kv_connector_extra_config`, else `default`."""
        return self.kv_connector_extra_config.get(key, default)
|
||||
124
vllm/config/load.py
Normal file
124
vllm/config/load.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.model_executor.model_loader import LoadFormats
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
else:
|
||||
LoadFormats = Any
|
||||
TensorizerConfig = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@config
@dataclass
class LoadConfig:
    """Configuration for loading the model weights."""

    # NOTE: the bare strings after each field are attribute docstrings that
    # vLLM parses at runtime for CLI/help text — do not remove.
    load_format: str | LoadFormats = "auto"
    """The format of the model weights to load:\n
    - "auto" will try to load the weights in the safetensors format and fall
    back to the pytorch bin format if safetensors format is not available.\n
    - "pt" will load the weights in the pytorch bin format.\n
    - "safetensors" will load the weights in the safetensors format.\n
    - "npcache" will load the weights in pytorch format and store a numpy cache
    to speed up the loading.\n
    - "dummy" will initialize the weights with random values, which is mainly
    for profiling.\n
    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
    loading. See the Tensorize vLLM Model script in the Examples section for
    more information.\n
    - "runai_streamer" will load the Safetensors weights using Run:ai Model
    Streamer.\n
    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
    files using Run:ai Model Streamer.\n
    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
    - "sharded_state" will load weights from pre-sharded checkpoint files,
    supporting efficient loading of tensor-parallel models.\n
    - "gguf" will load weights from GGUF format files (details specified in
    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
    - "mistral" will load weights from consolidated safetensors files used by
    Mistral models.
    - Other custom values can be supported via plugins."""
    download_dir: str | None = None
    """Directory to download and load the weights, default to the default
    cache directory of Hugging Face."""
    safetensors_load_strategy: str = "lazy"
    """Specifies the loading strategy for safetensors weights.
    - "lazy" (default): Weights are memory-mapped from the file. This enables
    on-demand loading and is highly efficient for models on local storage.
    - "eager": The entire file is read into CPU memory upfront before loading.
    This is recommended for models on network filesystems (e.g., Lustre, NFS)
    as it avoids inefficient random reads, significantly speeding up model
    initialization. However, it uses more CPU RAM.
    - "torchao": Weights are loaded in upfront and then reconstructed
    into torchao tensor subclasses. This is used when the checkpoint
    was quantized using torchao and saved using safetensors.
    Needs torchao >= 0.14.0
    """
    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
    """Extra config for model loader. This will be passed to the model loader
    corresponding to the chosen load_format."""
    device: str | None = None
    """Device to which model weights will be loaded, default to
    device_config.device"""
    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
    """The list of patterns to ignore when loading the model. Default to
    "original/**/*" to avoid repeated loading of llama's checkpoints."""
    use_tqdm_on_load: bool = True
    """Whether to enable tqdm for showing progress bar when loading model
    weights."""
    pt_load_map_location: str | dict[str, str] = "cpu"
    """
    pt_load_map_location: the map location for loading pytorch checkpoint, to
    support loading checkpoints can only be loaded on certain devices like
    "cuda", this is equivalent to {"": "cuda"}. Another supported format is
    mapping from different devices like from GPU 1 to GPU 0:
    {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
    in dictionary needs to be double quoted for json parsing. For more details,
    see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    # `@classmethod` added for consistency with the other pydantic validators
    # in this package (pydantic wraps plain validators implicitly, but the
    # explicit form is the documented convention).
    @field_validator("load_format", mode="after")
    @classmethod
    def _lowercase_load_format(cls, load_format: str) -> str:
        """Normalize the load format so comparisons are case-insensitive."""
        return load_format.lower()

    @field_validator("ignore_patterns", mode="after")
    @classmethod
    def _validate_ignore_patterns(
        cls, ignore_patterns: list[str] | str
    ) -> list[str] | str:
        """Log non-default, non-empty ignore patterns for visibility."""
        if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
            logger.info(
                "Ignoring the following patterns when downloading weights: %s",
                ignore_patterns,
            )

        return ignore_patterns
|
||||
96
vllm/config/lora.py
Normal file
96
vllm/config/lora.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict, Field, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.cache import CacheConfig
|
||||
else:
|
||||
ModelConfig = Any
|
||||
CacheConfig = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
LoRADType = Literal["auto", "float16", "bfloat16"]
|
||||
MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
|
||||
LoRAExtraVocabSize = Literal[256, 512]
|
||||
|
||||
|
||||
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
    """Configuration for LoRA."""

    # NOTE: the bare strings after each field are attribute docstrings that
    # vLLM parses at runtime for CLI/help text — do not remove.
    max_lora_rank: MaxLoRARanks = 16
    """Max LoRA rank."""
    max_loras: int = Field(default=1, ge=1)
    """Max number of LoRAs in a single batch."""
    fully_sharded_loras: bool = False
    """By default, only half of the LoRA computation is sharded with tensor
    parallelism. Enabling this will use the fully sharded layers. At high
    sequence length, max rank or tensor parallel size, this is likely faster.
    """
    max_cpu_loras: int | None = None
    """Maximum number of LoRAs to store in CPU memory. Must be >= than
    `max_loras`."""
    lora_dtype: torch.dtype | LoRADType = "auto"
    """Data type for LoRA. If auto, will default to base model dtype."""
    default_mm_loras: dict[str, str] | None = None
    """Dictionary mapping specific modalities to LoRA model paths; this field
    is only applicable to multimodal models and should be leveraged when a
    model always expects a LoRA to be active when a given modality is present.
    Note that currently, if a request provides multiple additional
    modalities, each of which have their own LoRA, we do NOT apply
    default_mm_loras because we currently only support one lora adapter
    per prompt. When run in offline mode, the lora IDs for n modalities
    will be automatically assigned to 1-n with the names of the modalities
    in alphabetic order."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # These fields determine the shapes/dtypes of the LoRA layers, so
        # they are included as graph factors.
        factors: list[Any] = []
        factors.append(self.max_lora_rank)
        factors.append(self.max_loras)
        factors.append(self.fully_sharded_loras)
        factors.append(self.lora_dtype)

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @model_validator(mode="after")
    def _validate_lora_config(self) -> Self:
        """Default the CPU LoRA cache to `max_loras` and enforce the
        `max_cpu_loras >= max_loras` invariant."""
        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
        elif self.max_cpu_loras < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})"
            )

        return self

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve `lora_dtype` to a concrete torch dtype.

        "auto" (or None) inherits the base model dtype; any other string
        (e.g. "bfloat16") is resolved to the torch dtype of that name.
        """
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            self.lora_dtype = getattr(torch, self.lora_dtype)
|
||||
2190
vllm/config/model.py
Normal file
2190
vllm/config/model.py
Normal file
File diff suppressed because it is too large
Load Diff
247
vllm/config/multimodal.py
Normal file
247
vllm/config/multimodal.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Literal, TypeAlias
|
||||
|
||||
from pydantic import ConfigDict, Field, field_validator, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
|
||||
@dataclass
class BaseDummyOptions:
    """Base options for generating dummy data during profiling."""

    # Maximum number of dummy items per prompt; 999 matches the default
    # modality limit used by MultiModalConfig.get_limit_per_prompt.
    count: int = Field(999, ge=0)
|
||||
|
||||
|
||||
# extra="forbid" rejects unknown keys so typos in user-supplied option
# dicts fail loudly instead of being silently ignored.
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
    """Options for generating dummy video data during profiling."""

    # All dimensions are optional overrides; None leaves the profiler default.
    num_frames: int | None = Field(None, gt=0)
    width: int | None = Field(None, gt=0)
    height: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
    """Options for generating dummy image data during profiling."""

    # Optional dimension overrides; None leaves the profiler default.
    width: int | None = Field(None, gt=0)
    height: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
    """Options for generating dummy audio data during profiling."""

    # Optional override for the audio length; None leaves the profiler
    # default. Unit not stated here — presumably samples; confirm at call site.
    length: int | None = Field(None, gt=0)
|
||||
|
||||
|
||||
# TP strategies for the multi-modal encoder: shard the weights across ranks
# ("weights") or replicate weights and shard the batched input ("data").
MMEncoderTPMode = Literal["weights", "data"]
# Cache backends for the multi-modal processor: shared-memory FIFO or
# mirrored LRU.
MMCacheType = Literal["shm", "lru"]
# Value type of `limit_per_prompt` entries after validator normalization.
DummyOptions: TypeAlias = (
    BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
)
|
||||
|
||||
|
||||
@config
@dataclass
class MultiModalConfig:
    """Controls the behavior of multimodal models."""

    # NOTE: the bare strings after each field are attribute docstrings that
    # vLLM parses at runtime for CLI/help text — do not remove.
    limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
    """The maximum number of input items and options allowed per
    prompt for each modality.
    Defaults to 999 for each modality.

    Legacy format (count only):
    {"image": 16, "video": 2}

    Configurable format (with options):
    {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
    "image": {"count": 5, "width": 512, "height": 512}}

    Mixed format (combining both):
    {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
    "height": 512}}
    """
    enable_mm_embeds: bool = False
    """If `True`, enables passing multimodal embeddings:
    for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
    for the OpenAI-compatible server, this refers to chat messages with content
    `"type": "*_embeds"`.

    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
    Only enable this flag for trusted users!"""
    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
    mm_processor_kwargs: dict[str, object] | None = None
    """Arguments to be forwarded to the model's processor for multi-modal data,
    e.g., image processor. Overrides for the multi-modal processor obtained
    from `transformers.AutoProcessor.from_pretrained`.

    The available overrides depend on the model that is being run.

    For example, for Phi-3-Vision:
    `{"num_crops": 4}`."""
    mm_processor_cache_gb: float = Field(default=4, ge=0)
    """The size (in GiB) of the multi-modal processor cache, which is used to
    avoid re-processing past multi-modal inputs.

    This cache is duplicated for each API process and engine core process,
    resulting in a total memory usage of
    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.

    Set to `0` to disable this cache completely (not recommended)."""
    mm_processor_cache_type: MMCacheType = "lru"
    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
    mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
    """Size limit (in MiB) for each object stored in the multi-modal processor
    shared memory cache. Only effective when `mm_processor_cache_type` is
    `"shm"`."""
    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
    """Indicates how to optimize multi-modal encoder inference using tensor
    parallelism (TP).

    - `"weights"`: Within the same vLLM engine, split the weights of
        each layer across TP ranks. (default TP behavior)\n
    - `"data"`: Within the same vLLM engine, split the batched input data
        across TP ranks to process the data in parallel, while hosting
        the full weights on each TP rank.
        This batch-level DP is not to be confused with API request-level
        DP (which is controlled by `--data-parallel-size`).
        This is only supported on a per-model basis and falls back to
        `"weights"` if the encoder does not support DP."""
    mm_encoder_attn_backend: AttentionBackendEnum | None = None
    """Optional override for the multi-modal encoder attention backend when
    using vision transformers. Accepts any value from
    `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
    interleave_mm_strings: bool = False
    """Enable fully interleaved support for multimodal prompts, while using
    --chat-template-content-format=string."""
    skip_mm_profiling: bool = False
    """When enabled, skips multimodal memory profiling and only profiles with
    language backbone model during engine initialization.

    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
    video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
    """

    @field_validator("limit_per_prompt", mode="before")
    @classmethod
    def _validate_limit_per_prompt(
        cls, value: dict[str, int | dict[str, int]]
    ) -> dict[str, DummyOptions]:
        """Normalize each entry to a DummyOptions instance (mutates in place)."""
        for k, v in value.items():
            # Handle legacy format where only count is specified
            if isinstance(v, int):
                v = {"count": v}
            # Convert to the appropriate DummyOptions subclass
            if k == "video":
                value[k] = VideoDummyOptions(**v)
            elif k == "image":
                value[k] = ImageDummyOptions(**v)
            elif k == "audio":
                value[k] = AudioDummyOptions(**v)
            else:
                value[k] = BaseDummyOptions(**v)
        return value

    @field_validator("mm_encoder_attn_backend", mode="before")
    @classmethod
    def _validate_mm_encoder_attn_backend(
        cls, value: str | AttentionBackendEnum | None
    ) -> AttentionBackendEnum | None:
        """Coerce a string backend name to AttentionBackendEnum, rejecting the
        removed XFORMERS backend explicitly."""
        if isinstance(value, str) and value.upper() == "XFORMERS":
            raise ValueError(
                "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
                "details). Please select a supported attention backend."
            )

        if value is None or isinstance(value, AttentionBackendEnum):
            return value

        assert isinstance(value, str), (
            "mm_encoder_attn_backend must be a string or an AttentionBackendEnum."
        )
        return AttentionBackendEnum[value.upper()]

    @model_validator(mode="after")
    def _validate_multimodal_config(self):
        """Reject shm-only options when the shm cache is not selected."""
        if self.mm_processor_cache_type != "shm" and (
            self.mm_shm_cache_max_object_size_mb
            != MultiModalConfig.mm_shm_cache_max_object_size_mb
        ):
            raise ValueError(
                "'mm_shm_cache_max_object_size_mb' should only be set when "
                "'mm_processor_cache_type' is 'shm'."
            )
        return self

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # Only the encoder attention backend affects the compiled graph.
        factors: list[Any] = [
            self.mm_encoder_attn_backend.name
            if self.mm_encoder_attn_backend is not None
            else None
        ]
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def get_limit_per_prompt(self, modality: str) -> int:
        """
        Get the maximum number of input items allowed per prompt
        for the given modality (backward compatible).
        """
        limit_data = self.limit_per_prompt.get(modality)

        if limit_data is None:
            # Unspecified modality is set to 999 by default
            return 999
        return limit_data.count

    def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
        """
        Get the configurable dummy data options for a modality.
        Returns None if no options are configured for this modality.
        """
        # All values are now DummyOptions after normalization
        return self.limit_per_prompt.get(modality)

    def merge_mm_processor_kwargs(
        self,
        inference_kwargs: Mapping[str, object],
    ) -> dict[str, object]:
        """
        Get the keyword arguments to pass to the multi-modal processor
        according to the extra arguments passed during inference.
        """
        # Inference-time kwargs take precedence over the configured defaults.
        kwargs = self.mm_processor_kwargs or {}
        return kwargs | dict(inference_kwargs)

    def is_multimodal_pruning_enabled(self):
        """Whether video pruning is active (rate set and strictly positive)."""
        return self.video_pruning_rate is not None and self.video_pruning_rate > 0
|
||||
140
vllm/config/observability.py
Normal file
140
vllm/config/observability.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from functools import cached_property
|
||||
from typing import Any, Literal, cast
|
||||
|
||||
from packaging.version import parse
|
||||
from pydantic import Field, field_validator, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm import version
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
DetailedTraceModules = Literal["model", "worker", "all"]
|
||||
|
||||
|
||||
@config
@dataclass
class ObservabilityConfig:
    """Configuration for observability - metrics and tracing."""

    # NOTE: the bare strings after each field are attribute docstrings that
    # vLLM parses at runtime for CLI/help text — do not remove.
    show_hidden_metrics_for_version: str | None = None
    """Enable deprecated Prometheus metrics that have been hidden since the
    specified version. For example, if a previously deprecated metric has been
    hidden since the v0.7.0 release, you use
    `--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
    you migrate to new metrics. The metric is likely to be removed completely
    in an upcoming release."""

    @cached_property
    def show_hidden_metrics(self) -> bool:
        """Check if the hidden metrics should be shown."""
        if self.show_hidden_metrics_for_version is None:
            return False
        return version._prev_minor_version_was(self.show_hidden_metrics_for_version)

    otlp_traces_endpoint: str | None = None
    """Target URL to which OpenTelemetry traces will be sent."""

    collect_detailed_traces: list[DetailedTraceModules] | None = None
    """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
    set, it will collect detailed traces for the specified modules. This
    involves use of possibly costly and or blocking operations and hence might
    have a performance impact.

    Note that collecting detailed timing information for each request can be
    expensive."""

    kv_cache_metrics: bool = False
    """Enable KV cache residency metrics (lifetime, idle time, reuse gaps).
    Uses sampling to minimize overhead.
    Requires log stats to be enabled (i.e., --disable-log-stats not set)."""

    kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
    """Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""

    cudagraph_metrics: bool = False
    """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
    dispatch modes, and their observed frequencies at every logging interval)."""

    enable_layerwise_nvtx_tracing: bool = False
    """Enable layerwise NVTX tracing. This traces the execution of each layer or
    module in the model and attach informations such as input/output shapes to
    nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""

    @cached_property
    def collect_model_forward_time(self) -> bool:
        """Whether to collect model forward time for the request."""
        return self.collect_detailed_traces is not None and (
            "model" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    @cached_property
    def collect_model_execute_time(self) -> bool:
        """Whether to collect model execute time for the request."""
        return self.collect_detailed_traces is not None and (
            "worker" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("show_hidden_metrics_for_version")
    @classmethod
    def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
        """Reject strings that are not valid version numbers."""
        if value is not None:
            # Raises an exception if the string is not a valid version.
            parse(value)
        return value

    @field_validator("otlp_traces_endpoint")
    @classmethod
    def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
        """Ensure OpenTelemetry is importable before accepting an endpoint."""
        if value is not None:
            from vllm.tracing import is_otel_available, otel_import_error_traceback

            if not is_otel_available():
                raise ValueError(
                    "OpenTelemetry is not available. Unable to configure "
                    "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                    f"installed. Original error:\n{otel_import_error_traceback}"
                )
        return value

    @field_validator("collect_detailed_traces")
    @classmethod
    def _validate_collect_detailed_traces(
        cls, value: list[DetailedTraceModules] | None
    ) -> list[DetailedTraceModules] | None:
        """Handle the legacy case where users might provide a comma-separated
        string instead of a list of strings."""
        if value is not None and len(value) == 1 and "," in value[0]:
            value = cast(list[DetailedTraceModules], value[0].split(","))
        return value

    @model_validator(mode="after")
    def _validate_tracing_config(self):
        """Detailed tracing is only meaningful with an OTLP endpoint."""
        if self.collect_detailed_traces and not self.otlp_traces_endpoint:
            raise ValueError(
                "collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
            )
        return self
|
||||
660
vllm/config/parallel.py
Normal file
660
vllm/config/parallel.py
Normal file
@@ -0,0 +1,660 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import torch
|
||||
from pydantic import Field, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from torch.distributed import ProcessGroup, ReduceOp
|
||||
from typing_extensions import Self
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.batch_invariant import (
|
||||
vllm_is_batch_invariant,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.network_utils import get_open_ports_list
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.runtime_env import RuntimeEnv
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
from vllm.v1.executor import Executor
|
||||
else:
|
||||
RuntimeEnv = Any
|
||||
PlacementGroup = Any
|
||||
Executor = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
ExpertPlacementStrategy = Literal["linear", "round_robin"]
|
||||
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
|
||||
DataParallelBackend = Literal["ray", "mp"]
|
||||
EPLBPolicyOption = Literal["default"]
|
||||
|
||||
|
||||
@config
@dataclass
class EPLBConfig:
    """Configuration for Expert Parallel Load Balancing (EP)."""

    # NOTE: the bare strings after each field are attribute docstrings that
    # vLLM parses at runtime for CLI/help text — do not remove.
    window_size: int = 1000
    """Window size for expert load recording."""
    step_interval: int = 3000
    """
    Interval for rearranging experts in expert parallelism.

    Note that if this is greater than the EPLB window size, only the metrics
    of the last `lb_window_size` steps will be used for rearranging experts.
    """

    num_redundant_experts: int = Field(default=0, ge=0)
    """Number of redundant experts to use for expert parallelism."""

    log_balancedness: bool = False
    """
    Log the balancedness each step of expert parallelism.
    This is turned off by default since it will cause communication overhead.
    """
    use_async: bool = False
    """
    Whether to use non-blocking EPLB.
    """

    policy: EPLBPolicyOption = "default"
    """The policy type for expert parallel load balancing (EPLB)."""
|
||||
|
||||
|
||||
@config
@dataclass
class ParallelConfig:
    """Configuration for the distributed execution."""

    pipeline_parallel_size: int = 1
    """Number of pipeline parallel groups."""
    tensor_parallel_size: int = 1
    """Number of tensor parallel groups."""
    prefill_context_parallel_size: int = 1
    """Number of prefill context parallel groups."""
    data_parallel_size: int = 1
    """Number of data parallel groups. MoE layers will be sharded according to
    the product of the tensor parallel size and data parallel size."""
    data_parallel_size_local: int = 1
    """Number of local data parallel groups."""
    data_parallel_rank: int = 0
    """Rank of the data parallel group."""
    data_parallel_rank_local: int | None = None
    """Local rank of the data parallel group,
    set only in SPMD mode."""
    data_parallel_master_ip: str = "127.0.0.1"
    """IP of the data parallel master."""
    data_parallel_rpc_port: int = 29550
    """Port for data parallel messaging."""
    data_parallel_master_port: int = 29500
    """Port of the data parallel master."""
    data_parallel_backend: DataParallelBackend = "mp"
    """Backend to use for data parallel, either "mp" or "ray"."""
    data_parallel_external_lb: bool = False
    """Whether to use "external" DP LB mode. Applies only to online serving
    and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
    wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
    is provided explicitly to vllm serve."""
    data_parallel_hybrid_lb: bool = False
    """Whether to use "hybrid" DP LB mode. Applies only to online serving
    and when data_parallel_size > 0. Enables running an AsyncLLM
    and API server on a "per-node" basis where vLLM load balances
    between local data parallel ranks, but an external LB balances
    between vLLM nodes/replicas. Set explicitly in conjunction with
    --data-parallel-start-rank."""
    enable_expert_parallel: bool = False
    """Use expert parallelism instead of tensor parallelism for MoE layers."""
    enable_eplb: bool = False
    """Enable expert parallelism load balancing for MoE layers."""
    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
    """Expert parallelism configuration."""
    expert_placement_strategy: ExpertPlacementStrategy = "linear"
    """The expert placement strategy for MoE layers:\n
    - "linear": Experts are placed in a contiguous manner. For example, with 4
    experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
    experts [2, 3].\n
    - "round_robin": Experts are placed in a round-robin manner. For example,
    with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
    will have experts [1, 3]. This strategy can help improve load balancing
    for grouped expert models with no redundant experts."""
    all2all_backend: (
        Literal[
            "naive",
            "pplx",
            "deepep_high_throughput",
            "deepep_low_latency",
            "allgather_reducescatter",
            "flashinfer_all2allv",
        ]
        | None
    ) = None
    """All2All backend for MoE expert parallel communication. If not set, uses
    the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
    - "naive": Naive all2all implementation using broadcasts
    - "allgather_reducescatter": All2all based on allgather and reducescatter
    - "pplx": Use pplx kernels
    - "deepep_high_throughput": Use deepep high-throughput kernels
    - "deepep_low_latency": Use deepep low-latency kernels
    - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""

    max_parallel_loading_workers: int | None = None
    """Maximum number of parallel loading workers when loading model
    sequentially in multiple batches. To avoid RAM OOM when using tensor
    parallel and large models."""

    disable_custom_all_reduce: bool = False
    """Disable the custom all-reduce kernel and fall back to NCCL."""

    enable_dbo: bool = False
    """Enable dual batch overlap for the model executor."""

    dbo_decode_token_threshold: int = 32
    """The threshold for dual batch overlap for batches only containing decodes.
    If the number of tokens in the request is greater than this threshold,
    microbatching will be used. Otherwise, the request will be processed in a
    single batch."""
    dbo_prefill_token_threshold: int = 512  # TODO(lucas): tune
    """The threshold for dual batch overlap for batches that contain one or more
    prefills. If the number of tokens in the request is greater than this
    threshold, microbatching will be used. Otherwise, the request will be
    processed in a single batch."""

    disable_nccl_for_dp_synchronization: bool = False
    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
    to use Gloo instead of NCCL for its all reduce"""

    ray_workers_use_nsight: bool = False
    """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

    ray_runtime_env: RuntimeEnv | None = None
    """Ray runtime environment to pass to distributed workers."""

    placement_group: PlacementGroup | None = None
    """ray distributed model workers placement group."""

    distributed_executor_backend: (
        str | DistributedExecutorBackend | type[Executor] | None
    ) = None
    """Backend to use for distributed model workers, either "ray" or "mp"
    (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
    is less than or equal to the number of GPUs available, "mp" will be used to
    keep processing on a single host. Otherwise, an error will be raised. To use "mp"
    you must also set nnodes, and to use "ray" you must manually set
    distributed_executor_backend to "ray".

    Note that tpu only support Ray for distributed inference."""

    worker_cls: str = "auto"
    """The full name of the worker class to use. If "auto", the worker class
    will be determined based on the platform."""
    sd_worker_cls: str = "auto"
    """The full name of the worker class to use for speculative decoding.
    If "auto", the worker class will be determined based on the platform."""
    worker_extension_cls: str = ""
    """The full name of the worker extension class to use. The worker extension
    class is dynamically inherited by the worker class. This is used to inject
    new attributes and methods to the worker class for use in collective_rpc
    calls."""
    master_addr: str = "127.0.0.1"
    """distributed master address for multi-node distributed
    inference when distributed_executor_backend is mp."""
    master_port: int = 29501
    """distributed master port for multi-node distributed
    inference when distributed_executor_backend is mp."""
    node_rank: int = 0
    """distributed node rank for multi-node distributed
    inference when distributed_executor_backend is mp."""
    nnodes: int = 1
    """num of nodes for multi-node distributed
    inference when distributed_executor_backend is mp."""

    world_size: int = Field(init=False)
    """world_size is TPxPP, it affects the number of workers we create.
    Computed in `__post_init__`; not user-settable (init=False)."""

    rank: int = 0
    """Global rank in distributed setup."""

    _data_parallel_master_port_list: list[int] = Field(default_factory=list)
    """List of open port auto-queried for data parallel messaging.
    Set to be private as it's not intended to be configured by users.
    """

    decode_context_parallel_size: int = 1
    """Number of decode context parallel groups, because the world size does
    not change by dcp, it simply reuse the GPUs of TP group, and tp_size
    needs to be divisible by dcp_size."""

    dcp_kv_cache_interleave_size: int = 1
    """
    Interleave size of kv_cache storage while using DCP.
    dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
    and will be deprecated when PCP is fully supported.

    """
    cp_kv_cache_interleave_size: int = 1
    """Interleave size of kv_cache storage while using DCP or PCP.
    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
    and `total_cp_world_size = pcp_world_size * dcp_world_size`.
    store interleave_size tokens on total_cp_rank i,
    then store next interleave_size tokens on total_cp_rank i+1.
    Interleave_size=1: token-level alignment, where token `i` is stored on
    total_cp_rank `i % total_cp_world_size`.
    Interleave_size=block_size: block-level alignment, where tokens are
    first populated to the preceding ranks. Tokens are then stored
    in (rank i+1, block j) only after (rank i, block j) is fully occupied.
    Block_size should be greater than or equal to cp_kv_cache_interleave_size.
    Block_size should be divisible by cp_kv_cache_interleave_size.
    """

    _api_process_count: int = Field(default=1, gt=0)
    """
    The number of API processes initialized.

    Note:
        This is an internal config that is only valid for and
        should only be set by API server scale-out.
    """

    _api_process_rank: int = Field(default=0, ge=-1)
    """
    The rank of this API process, or `-1` for engine core processes
    under API server scale-out.

    Note:
        This is an internal config that is only valid for and
        should only be set by API server scale-out.
    """
|
||||
    @model_validator(mode="after")
    def _validate_parallel_config(self) -> Self:
        """Validate cross-field constraints: API scale-out ranks, DP sizing,
        external LB preconditions, and EPLB platform/parallelism requirements.

        Raises:
            ValueError: if any constraint is violated.
        """
        if self._api_process_rank >= self._api_process_count:
            raise ValueError(
                "Invalid value of `_api_process_rank`. "
                f"Expected to be `-1` or `[0, {self._api_process_count})`, "
                f"but found: {self._api_process_rank}"
            )

        if self.data_parallel_size_local > self.data_parallel_size:
            raise ValueError(
                f"data_parallel_size_local ({self.data_parallel_size_local}) "
                f"must be <= data_parallel_size ({self.data_parallel_size})"
            )

        if self.data_parallel_size <= 1 and self.data_parallel_external_lb:
            raise ValueError(
                "data_parallel_external_lb can only be set when data_parallel_size > 1"
            )

        if self.enable_eplb:
            if not current_platform.is_cuda_alike():
                raise ValueError(
                    "Expert parallelism load balancing is only supported on "
                    "CUDA devices or ROCm devices now."
                )
            if not self.enable_expert_parallel:
                raise ValueError("enable_expert_parallel must be True to use EPLB.")
            if self.tensor_parallel_size * self.data_parallel_size <= 1:
                raise ValueError(
                    "EPLB requires tensor_parallel_size or data_parallel_size "
                    f"to be greater than 1, but got "
                    f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
                )
        else:
            # EPLB disabled: a non-zero redundant-expert count would silently
            # be ignored, so treat it as a configuration error.
            if self.eplb_config.num_redundant_experts != 0:
                raise ValueError(
                    "num_redundant_experts is set to "
                    f"{self.eplb_config.num_redundant_experts} but EPLB is not "
                    "enabled. Either enable EPLB or unset "
                    "num_redundant_experts."
                )

        return self
|
||||
|
||||
    @property
    def world_size_across_dp(self) -> int:
        """world_size_across_dp is TPxPPxDP, it is the size of the world
        including data parallelism."""
        return self.world_size * self.data_parallel_size
|
||||
|
||||
def get_next_dp_init_port(self) -> int:
|
||||
"""
|
||||
We might need to initialize process groups in multiple
|
||||
processes that is related to data parallelism,
|
||||
e.g. both in the worker and in the engine, which
|
||||
can live in different processes. To avoid port conflicts, we
|
||||
pop a new port from the prepared port list each time we need to
|
||||
initialize a new process group related to data parallelism.
|
||||
"""
|
||||
if self._data_parallel_master_port_list:
|
||||
answer = self._data_parallel_master_port_list.pop()
|
||||
else:
|
||||
answer = self.data_parallel_master_port
|
||||
self.data_parallel_master_port += 1
|
||||
|
||||
return answer
|
||||
|
||||
    def stateless_init_dp_group(self) -> ProcessGroup:
        """Create the data-parallel process group, retrying on port races.

        Returns:
            The initialized `ProcessGroup` for the DP ranks.

        Raises:
            DistNetworkError: if all retries fail with EADDRINUSE, or
                immediately for any other network error.
        """
        # NOTE: In high-concurrency scenarios multiple processes
        # can pick the same (currently free) port through a race
        # condition when calling `get_open_port()`. When the first
        # process binds the port the others will subsequently fail
        # with `torch.distributed.DistNetworkError: EADDRINUSE`.
        # To make the initialization more robust we retry a few times
        # with a fresh port whenever this specific error is observed.
        from torch.distributed import DistNetworkError

        from vllm.distributed.utils import (
            stateless_init_torch_distributed_process_group,
        )

        max_retries = 5
        last_exc: Exception | None = None
        for _ in range(max_retries):
            try:
                # use gloo since the engine process might not have cuda device
                return stateless_init_torch_distributed_process_group(
                    self.data_parallel_master_ip,
                    self.get_next_dp_init_port(),
                    self.data_parallel_rank,
                    self.data_parallel_size,
                    backend=current_platform.dist_backend,
                )
            except DistNetworkError as e:
                # We only want to retry when the root cause is EADDRINUSE.
                if "EADDRINUSE" in str(e):
                    logger.warning("Address already in use. Retrying with a new port.")
                    last_exc = e
                    continue  # try again with a new port
                raise e

        # If we get here all retries have failed.
        assert last_exc is not None
        raise last_exc
|
||||
|
||||
# The all_reduce at the end of attention (during o_proj) means that
|
||||
# inputs are replicated across each rank of the tensor parallel group.
|
||||
# If using expert-parallelism with DeepEP All2All ops, replicated
|
||||
# tokens results in useless duplicate computation and communication.
|
||||
#
|
||||
# In this case, ensure the input to the experts is sequence parallel
|
||||
# to avoid the excess work.
|
||||
#
|
||||
# Not needed for pplx-kernels as it can handle duplicate input tokens.
|
||||
@property
|
||||
def use_sequence_parallel_moe(self) -> bool:
|
||||
return (
|
||||
self.all2all_backend
|
||||
in (
|
||||
"allgather_reducescatter",
|
||||
"naive",
|
||||
"deepep_high_throughput",
|
||||
"deepep_low_latency",
|
||||
)
|
||||
and self.enable_expert_parallel
|
||||
and self.tensor_parallel_size > 1
|
||||
and self.data_parallel_size > 1
|
||||
)
|
||||
|
||||
    @property
    def node_rank_within_dp(self) -> int:
        """This node's rank within the set of nodes of its DP group."""
        return self.node_rank % self.nnodes_within_dp
|
||||
|
||||
@property
|
||||
def nnodes_within_dp(self) -> int:
|
||||
if self.nnodes == 1:
|
||||
return 1
|
||||
data_parallel_node_size = (
|
||||
self.data_parallel_size // self.data_parallel_size_local
|
||||
)
|
||||
return self.nnodes // data_parallel_node_size
|
||||
|
||||
    @property
    def local_world_size(self) -> int:
        """Number of workers (TPxPP ranks) hosted on this node."""
        return self.world_size // self.nnodes_within_dp
|
||||
|
||||
    @staticmethod
    def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
        """All-reduce (logical OR) `has_unfinished` across the DP group.

        Collective call: every DP rank must enter this function.
        """
        tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
        # dp rank 0: has_unfinished_seqs=True
        # dp rank 1: has_unfinished_seqs=False
        # aggregated: has_unfinished_seqs=True
        # so this is an OR operation, i.e. MAX in integers
        torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group)
        aggregated_has_unfinished = bool(tensor.item())
        return aggregated_has_unfinished
|
||||
|
||||
    @staticmethod
    def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
        """All-reduce (MIN) the KV cache memory size across the DP group.

        A value of -1 means "no local constraint" and is mapped to int64 max
        so it never wins the MIN. Collective call: every DP rank must enter.
        """
        if kv_cache_memory == -1:
            kv_cache_memory = torch.iinfo(torch.int64).max
        tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu")
        # we cannot use broadcast for stateless dp group since it depends
        # on global rank
        torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
        return tensor.item()
|
||||
|
||||
    def compute_hash(self):
        """
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.

        This hash is also used for DP worker configuration validation
        to prevent hangs from mismatched collective communication patterns.
        """
        # Fields that only describe runtime topology, networking, or launch
        # details; they do not change the computation graph and are excluded.
        ignored_factors = {
            # Derived/runtime topology, networking, or launch details
            "data_parallel_rank",
            "data_parallel_rank_local",
            "data_parallel_backend",
            "data_parallel_external_lb",
            "data_parallel_hybrid_lb",
            "data_parallel_master_ip",
            "data_parallel_master_port",
            "_data_parallel_master_port_list",
            "data_parallel_rpc_port",
            "rank",
            "master_addr",
            "master_port",
            "node_rank",
            "nnodes",
            "max_parallel_loading_workers",
            "disable_custom_all_reduce",
            "ray_workers_use_nsight",
            "ray_runtime_env",
            "placement_group",
            "distributed_executor_backend",
            "worker_cls",
            "sd_worker_cls",
            "worker_extension_cls",
            "_api_process_count",
            "_api_process_rank",
        }

        from vllm.config.utils import get_hash_factors, hash_factors

        factors = get_hash_factors(self, ignored_factors)
        # Explicitly include backend affecting env factor as before
        factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND)
        return hash_factors(factors)
|
||||
|
||||
    def __post_init__(self) -> None:
        """Finalize derived state: resolve all2all backend, compute
        world_size, derive DP rank/ports, and pick the executor backend.

        NOTE: statement order matters here — e.g. `world_size` must be set
        before the external-launcher DP-rank derivation below uses it.
        """
        # Set all2all_backend from env var if not specified, with deprecation warning
        if self.all2all_backend is None:
            self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
            if envs.is_set("VLLM_ALL2ALL_BACKEND"):
                logger.warning_once(
                    "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
                    "will be removed in a future release. Please use the "
                    "--all2all-backend command-line argument instead."
                )

        # Continue with the rest of the initialization
        self.world_size = (
            self.pipeline_parallel_size
            * self.tensor_parallel_size
            * self.prefill_context_parallel_size
        )

        if self.distributed_executor_backend == "external_launcher":
            logger.info("Using external launcher for distributed inference.")
            self.world_size *= self.data_parallel_size

        if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
            # Data parallel was specified in the engine args.
            if self.distributed_executor_backend == "external_launcher":
                # For external launcher,
                # we need to set the data parallel rank automatically
                self.data_parallel_rank = int(os.environ["RANK"]) // (
                    self.world_size // self.data_parallel_size
                )
                logger.info(
                    "Set data_parallel_rank to %d automatically.",
                    self.data_parallel_rank,
                )
            if not self._data_parallel_master_port_list:
                self._data_parallel_master_port_list = get_open_ports_list(5)
            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()

            if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                raise ValueError(
                    f"data_parallel_rank ({self.data_parallel_rank})"
                    f" must be in the range [0, {self.data_parallel_size})"
                )
        else:
            # Otherwise fall back to env vars (e.g. for offline SPMD case).
            self.data_parallel_size = envs.VLLM_DP_SIZE
            self.data_parallel_rank = envs.VLLM_DP_RANK
            self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
            self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
            self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

        if self.distributed_executor_backend == "external_launcher":
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

        if self.distributed_executor_backend is None and self.world_size > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.

            from vllm.v1.executor import ray_utils

            backend: DistributedExecutorBackend = "mp"
            ray_found = ray_utils.ray_is_available()
            if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                backend = "uni"
            elif current_platform.is_cuda() and self.nnodes > 1:
                # Multi-node CUDA keeps "mp"; this explicit branch exists so
                # the single-node GPU-count check below is skipped.
                backend = "mp"
            elif (
                current_platform.is_cuda()
                and cuda_device_count_stateless() < self.world_size
            ):
                gpu_count = cuda_device_count_stateless()
                raise ValueError(
                    f"World size ({self.world_size}) is larger than the number of "
                    f"available GPUs ({gpu_count}) in this node. If this is "
                    "intentional and you are using:\n"
                    "- ray, set '--distributed-executor-backend ray'.\n"
                    "- multiprocessing, set '--nnodes' appropriately."
                )
            elif self.data_parallel_backend == "ray":
                logger.info(
                    "Using ray distributed inference because "
                    "data_parallel_backend is ray"
                )
                backend = "ray"
            elif ray_found:
                if self.placement_group:
                    backend = "ray"
                else:
                    from ray import is_initialized as ray_is_initialized

                    if ray_is_initialized():
                        from ray.util import get_current_placement_group

                        if get_current_placement_group():
                            backend = "ray"
            self.distributed_executor_backend = backend
            logger.debug("Defaulting to use %s for distributed inference", backend)

        if self.distributed_executor_backend is None and self.world_size == 1:
            self.distributed_executor_backend = "uni"

        if self.max_parallel_loading_workers is not None:
            logger.warning(
                "max_parallel_loading_workers is currently "
                "not supported and will be ignored."
            )
        allowed_backends = ("mp", "uni", "external_launcher")
        if (
            self.distributed_executor_backend not in allowed_backends
            and self.nnodes > 1
        ):
            raise ValueError(
                "nnodes > 1 can only be set when distributed executor "
                "backend is mp, uni or external_launcher."
            )
|
||||
|
||||
@property
|
||||
def use_ray(self) -> bool:
|
||||
return self.distributed_executor_backend == "ray" or (
|
||||
isinstance(self.distributed_executor_backend, type)
|
||||
and getattr(self.distributed_executor_backend, "uses_ray", False)
|
||||
)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _verify_args(self) -> Self:
|
||||
# Lazy import to avoid circular import
|
||||
from vllm.v1.executor import Executor
|
||||
|
||||
# Enable batch invariance settings if requested
|
||||
if vllm_is_batch_invariant():
|
||||
self.disable_custom_all_reduce = True
|
||||
|
||||
if (
|
||||
self.distributed_executor_backend is not None
|
||||
and not isinstance(self.distributed_executor_backend, str)
|
||||
and not (
|
||||
isinstance(self.distributed_executor_backend, type)
|
||||
and issubclass(self.distributed_executor_backend, Executor)
|
||||
)
|
||||
):
|
||||
raise ValueError(
|
||||
"Unrecognized distributed executor backend "
|
||||
f"{self.distributed_executor_backend}. Supported "
|
||||
"values are 'ray', 'mp' 'uni', 'external_launcher', "
|
||||
" custom Executor subclass or its import path."
|
||||
)
|
||||
if self.use_ray:
|
||||
from vllm.v1.executor import ray_utils
|
||||
|
||||
ray_utils.assert_ray_available()
|
||||
|
||||
if not current_platform.use_custom_allreduce():
|
||||
self.disable_custom_all_reduce = True
|
||||
logger.debug(
|
||||
"Disabled the custom all-reduce kernel because it is not "
|
||||
"supported on current platform."
|
||||
)
|
||||
if self.nnodes > 1:
|
||||
self.disable_custom_all_reduce = True
|
||||
logger.debug(
|
||||
"Disabled the custom all-reduce since we are running on multi-node."
|
||||
)
|
||||
if self.ray_workers_use_nsight and not self.use_ray:
|
||||
raise ValueError(
|
||||
"Unable to use nsight profiling unless workers run with Ray."
|
||||
)
|
||||
|
||||
return self
|
||||
126
vllm/config/pooler.py
Normal file
126
vllm/config/pooler.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"]
|
||||
|
||||
|
||||
@config
@dataclass
class PoolerConfig:
    """Controls the behavior of output pooling in pooling models."""

    pooling_type: PoolingTypeStr | None = None
    """
    The pooling method of the pooling model. This should be a key in
    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """

    ## for embeddings models
    normalize: bool | None = None
    """
    Whether to normalize the embeddings outputs. Defaults to True.
    """
    dimensions: int | None = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
    enable_chunked_processing: bool | None = None
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
    chunks, processed separately, and then aggregated using weighted averaging.
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
    max_embed_len: int | None = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
    When an input exceeds max_embed_len, it will be handled according to
    the original max_model_len validation logic.
    Defaults to None (i.e. set to max_model_len).
    """

    ## for classification models
    # NOTE(review): `softmax` and `activation` are annotated `float | None`
    # although they hold bool-like flags — confirm before tightening, since
    # pydantic validates against these annotations.
    softmax: float | None = None
    """
    softmax will be deprecated, please use use_activation instead.
    """
    activation: float | None = None
    """
    activation will be deprecated, please use use_activation instead.
    """
    use_activation: bool | None = None
    """
    Whether to apply activation function to the classification outputs.
    Defaults to True.
    """
    logit_bias: float | None = None
    """
    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
    step_tag_id: int | None = None
    """
    If set, only the score corresponding to the `step_tag_id` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
    returned_token_ids: list[int] | None = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of `good_token` and `bad_token` in the
    `math-shepherd-mistral-7b-prm` model.
    """

    def __post_init__(self):
        # raise deprecated warning for softmax and activation
        self.use_activation = get_use_activation(self)

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str
|
||||
|
||||
|
||||
def get_use_activation(o: object):
    """Resolve the effective `use_activation` value for a pooler config.

    Prefers the deprecated `softmax` and `activation` attributes (emitting a
    deprecation warning) before falling back to `use_activation`.

    BUGFIX: the walrus expressions were previously unparenthesized
    (`if softmax := getattr(o, "softmax", None) is not None:`), so `:=` bound
    the *boolean* result of `... is not None` — the function returned `True`
    instead of the attribute's actual value (even when the attribute was
    `False`). Parentheses restore the intended binding.
    """
    if (softmax := getattr(o, "softmax", None)) is not None:
        logger.warning_once(
            "softmax will be deprecated and will be removed in v0.15. "
            "Please use use_activation instead."
        )
        return softmax

    if (activation := getattr(o, "activation", None)) is not None:
        logger.warning_once(
            "activation will be deprecated and will be removed in v0.15. "
            "Please use use_activation instead."
        )
        return activation

    return getattr(o, "use_activation", None)
|
||||
199
vllm/config/profiler.py
Normal file
199
vllm/config/profiler.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import Field, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
ProfilerKind = Literal["torch", "cuda"]
|
||||
|
||||
|
||||
@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

    - 'torch': Use PyTorch profiler.\n
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    worker's traces (CPU & GPU) will be saved under this directory. Note that
    it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
    since the front-end profiling does not track iterations and will capture the
    entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after starting profiling.
    Defaults to 0, meaning no limit.
    """
|
||||
|
||||
def compute_hash(self) -> str:
    """Return a stable hash of the factors that shape the compute graph.

    WARNING: Whenever a new field is added to this config, ensure that it
    is included in the factors list if it affects the computation graph.

    The hash uniquely identifies every config that influences the graph
    from input ids/embeddings through to the final hidden states; anything
    outside that span is deliberately excluded.
    """
    # Profiling settings never change the traced computation graph, so
    # the factor list is intentionally empty.
    graph_factors: list[Any] = []
    return safe_hash(str(graph_factors).encode(), usedforsecurity=False).hexdigest()
|
||||
|
||||
def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> Any:
    """Return the value of ``env_var_name`` if it is set, else ``None``.

    Emits a one-time deprecation warning steering users towards the
    equivalent `ProfilerConfig` field / CLI flag.

    Note: the return annotation was previously ``-> None`` even though the
    method returns the env value; fixed to ``Any`` (the type depends on how
    ``envs`` parses the variable).
    """
    if envs.is_set(env_var_name):
        value = getattr(envs, env_var_name)
        logger.warning_once(
            "Using %s environment variable is deprecated and will be removed in "
            "v0.14.0 or v1.0.0, whichever is soonest. Please use "
            "--profiler-config.%s command line argument or "
            "ProfilerConfig(%s=...) config field instead.",
            env_var_name,
            field_name,
            field_name,
        )
        return value
    return None
|
||||
|
||||
def _set_from_env_if_set(
    self,
    field_name: str,
    env_var_name: str,
    to_bool: bool = True,
    to_int: bool = False,
) -> None:
    """Copy a deprecated env var into ``field_name`` when it is set.

    Coercions applied to the raw env value: with ``to_bool`` the field
    becomes the result of comparing the value against ``"1"``; with
    ``to_int`` it is passed through ``int()``. When the env var is unset
    the field is left untouched.
    """
    raw = self._get_from_env_if_set(field_name, env_var_name)
    if raw is None:
        return
    if to_bool:
        raw = raw == "1"
    if to_int:
        raw = int(raw)
    setattr(self, field_name, raw)
|
||||
|
||||
@model_validator(mode="after")
def _validate_profiler_config(self) -> Self:
    """Apply deprecated env-var fallbacks, then validate the final config."""
    # VLLM_TORCH_CUDA_PROFILE takes precedence: when it is set, the
    # torch-profiler env vars are not consulted at all.
    maybe_use_cuda_profiler = self._get_from_env_if_set(
        "profiler", "VLLM_TORCH_CUDA_PROFILE"
    )
    if maybe_use_cuda_profiler is not None:
        # NOTE(review): assumes the env value is the raw string "1"/"0" —
        # confirm `envs` does not pre-parse this variable to a bool.
        self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
    else:
        self._set_from_env_if_set(
            "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
        )
        # Setting a trace dir via env var implies the torch profiler.
        if self.torch_profiler_dir:
            self.profiler = "torch"
        self._set_from_env_if_set(
            "torch_profiler_record_shapes",
            "VLLM_TORCH_PROFILER_RECORD_SHAPES",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_memory",
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_stack",
            "VLLM_TORCH_PROFILER_WITH_STACK",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_flops",
            "VLLM_TORCH_PROFILER_WITH_FLOPS",
        )
        self._set_from_env_if_set(
            "ignore_frontend",
            "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
        )
        self._set_from_env_if_set(
            "torch_profiler_use_gzip",
            "VLLM_TORCH_PROFILER_USE_GZIP",
        )
        self._set_from_env_if_set(
            "torch_profiler_dump_cuda_time_total",
            "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
        )

    # Delay/limit fallbacks apply regardless of which profiler was chosen.
    self._set_from_env_if_set(
        "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
    )
    self._set_from_env_if_set(
        "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
    )

    # Front-end (AsyncLLM) profiling does not track iterations, so combining
    # it with iteration-based delay/limit captures the whole range.
    has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
    if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
        logger.warning_once(
            "Using 'torch' profiler with delay_iterations or max_iterations "
            "while ignore_frontend is False may result in high overhead."
        )

    # torch_profiler_dir and profiler == "torch" must be set together.
    profiler_dir = self.torch_profiler_dir
    if profiler_dir and self.profiler != "torch":
        raise ValueError(
            "torch_profiler_dir is only applicable when profiler is set to 'torch'"
        )
    if self.profiler == "torch" and not profiler_dir:
        raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

    if profiler_dir:
        # Leave well-formed gs:// URIs untouched ("gs://" followed by a
        # non-empty bucket segment); everything else is normalized to an
        # absolute local path.
        is_gs_path = (
            profiler_dir.startswith("gs://")
            and profiler_dir[5:]
            and profiler_dir[5] != "/"
        )
        if not is_gs_path:
            self.torch_profiler_dir = os.path.abspath(
                os.path.expanduser(profiler_dir)
            )

    return self
|
||||
299
vllm/config/scheduler.py
Normal file
299
vllm/config/scheduler.py
Normal file
@@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
from dataclasses import InitVar
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.utils.import_utils import resolve_obj_by_qualname
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Which engine runner the model executes under.
RunnerType = Literal["generate", "pooling", "draft"]
# Request-ordering policy accepted by SchedulerConfig.policy.
SchedulerPolicy = Literal["fcfs", "priority"]
||||
|
||||
|
||||
@config
@dataclass
class SchedulerConfig:
    """Scheduler configuration.

    Controls batching limits (tokens/sequences per iteration), chunked
    prefill behavior, the scheduling policy, and which scheduler class the
    engine instantiates.
    """

    max_model_len: InitVar[int]
    """Maximum length of a sequence (including prompt and generated text).

    Note: This is stored in the ModelConfig, and is used only here to
    provide fallbacks and validate other attributes."""

    is_encoder_decoder: InitVar[bool]
    """True if the model is an encoder-decoder model.

    Note: This is stored in the ModelConfig, and is used only here to
    disable chunked prefill and prefix caching for encoder-decoder models.
    """

    DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
    DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

    runner_type: RunnerType = "generate"
    """The runner type to launch for the model."""

    max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1)
    """Maximum number of tokens to be processed in a single iteration.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1)
    """Maximum number of sequences to be processed in a single iteration.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of sequences that can be
    partially prefilled concurrently."""

    max_long_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of prompts longer than
    long_prefill_token_threshold that will be prefilled concurrently. Setting
    this less than max_num_partial_prefills will allow shorter prompts to jump
    the queue in front of longer prompts in some cases, improving latency."""

    long_prefill_token_threshold: int = 0
    """For chunked prefill, a request is considered long if the prompt is
    longer than this number of tokens."""

    enable_chunked_prefill: bool = True
    """If True, prefill requests can be chunked based
    on the remaining `max_num_batched_tokens`.

    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    is_multimodal_model: bool = False
    """True if the model is multimodal."""

    # TODO (ywang96): Make this configurable.
    max_num_encoder_input_tokens: int = Field(init=False)
    """Multimodal encoder compute budget, only used in V1.

    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    # TODO (ywang96): Make this configurable.
    encoder_cache_size: int = Field(init=False)
    """Multimodal encoder cache size, only used in V1.

    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    policy: SchedulerPolicy = "fcfs"
    """The scheduling policy to use:\n
    - "fcfs" means first come first served, i.e. requests are handled in order
    of arrival.\n
    - "priority" means requests are handled based on given priority (lower
    value means earlier handling) and time of arrival deciding any ties)."""

    disable_chunked_mm_input: bool = False
    """If set to true and chunked prefill is enabled, we do not want to
    partially schedule a multimodal item. Only used in V1
    This ensures that if a request has a mixed prompt
    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
    # NOTE(review): the annotation does not include None although the default
    # is None; _skip_none_validation below bypasses validation for that case.
    scheduler_cls: str | type[object] = Field(default=None)
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""

    disable_hybrid_kv_cache_manager: bool | None = None
    """If set to True, KV cache manager will allocate the same size of KV cache
    for all attention layers even if there are multiple type of attention layers
    like full attention and sliding window attention.
    If set to None, the default value will be determined based on the environment
    and starting configuration.
    """

    async_scheduling: bool = False
    """If set to True, perform async scheduling. This helps to avoid gaps in
    GPU utilization, leading to better latency and throughput.
    Async scheduling is currently not supported with some features such as
    speculative decoding and pipeline parallelism.
    """

    stream_interval: int = Field(default=1, ge=1)
    """The interval (or buffer size) for streaming in terms of token length.
    A smaller value (1) makes streaming smoother by sending each token immediately,
    while a larger value (e.g., 10) reduces host overhead and may increase throughput
    by batching multiple tokens before sending."""

    @staticmethod
    def default_factory(**kwargs):
        """
        Factory method to create `SchedulerConfig` with default values for `InitVar`s.
        """
        if "max_model_len" not in kwargs:
            kwargs["max_model_len"] = 8192
        if "is_encoder_decoder" not in kwargs:
            kwargs["is_encoder_decoder"] = False
        return SchedulerConfig(**kwargs)

    def get_scheduler_cls(self) -> type["SchedulerInterface"]:
        """Resolve the scheduler class to instantiate.

        Defaults to AsyncScheduler/Scheduler depending on `async_scheduling`;
        a custom `scheduler_cls` (class or dotted path) overrides both.
        """
        if self.scheduler_cls is None:
            if self.async_scheduling:
                from vllm.v1.core.sched.async_scheduler import AsyncScheduler

                return AsyncScheduler
            from vllm.v1.core.sched.scheduler import Scheduler

            return Scheduler

        # This warning can be removed once the Scheduler interface is
        # finalized and we can maintain support for scheduler classes that
        # implement it
        logger.warning_once(
            "Using custom scheduler class %s. This scheduler interface is "
            "not public and compatibility may not be maintained.",
            self.scheduler_cls,
        )
        if not isinstance(self.scheduler_cls, str):
            return cast(type["SchedulerInterface"], self.scheduler_cls)
        return resolve_obj_by_qualname(self.scheduler_cls)

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        factors: list[Any] = []

        # max_num_batched_tokens need to be included in the hash due
        # to two reasons:
        # 1. LoRA creates static buffers based on max_num_batched_tokens.
        #    The tensor sizes and strides get captured in the torch.compile
        #    graph explicitly.
        # 2. Inductor decides whether using 32-bit or 64-bit indexing integer
        #    based on the data sizes. `max_num_batched_tokens` has an
        #    impact on that. For more details, please check
        #    https://github.com/vllm-project/vllm/issues/29585
        factors.append(self.max_num_batched_tokens)

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("scheduler_cls", "async_scheduling", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialisation is delayed."""
        if value is None:
            return value
        return handler(value)

    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
        """Derive dependent fields from the InitVars and validate limits."""
        if is_encoder_decoder:
            # Chunked prefill should be disabled for encoder-decoder models.
            self.disable_chunked_mm_input = True
            self.enable_chunked_prefill = False
            self.long_prefill_token_threshold = 0
            logger.info(
                "Encoder-decoder models do not support chunked prefill nor"
                " prefix caching; disabling both."
            )

        # Encoder budget/cache both track the token budget (see field notes).
        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
        self.encoder_cache_size = self.max_num_batched_tokens

        if self.enable_chunked_prefill:
            logger.info(
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                self.max_num_batched_tokens,
            )

        if self.max_num_partial_prefills > 1:
            if self.long_prefill_token_threshold == 0:
                # Heuristic: 4% of the context length counts as "long".
                self.long_prefill_token_threshold = int(max_model_len * 0.04)

            logger.info(
                "Concurrent partial prefills enabled with "
                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
                "long_prefill_token_threshold=%d",
                self.max_num_partial_prefills,
                self.max_long_partial_prefills,
                self.long_prefill_token_threshold,
            )

        self.verify_max_model_len(max_model_len)

    def verify_max_model_len(self, max_model_len: int) -> Self:
        """Validate batching limits against the model context length.

        Raises ValueError on inconsistent settings; returns self for chaining.
        """
        if (
            self.max_num_batched_tokens < max_model_len
            and not self.enable_chunked_prefill
        ):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len."
            )

        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs})."
            )

        if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
            logger.warning(
                "max_num_batched_tokens (%d) exceeds max_num_seqs "
                "* max_model_len (%d). This may lead to unexpected behavior.",
                self.max_num_batched_tokens,
                self.max_num_seqs * max_model_len,
            )

        if self.max_num_partial_prefills > 1:
            if not self.enable_chunked_prefill:
                raise ValueError(
                    "Chunked prefill must be enabled to set "
                    "max_num_partial_prefills > 1."
                )

            if self.long_prefill_token_threshold > max_model_len:
                raise ValueError(
                    "long_prefill_token_threshold "
                    f"({self.long_prefill_token_threshold}) cannot be greater "
                    f"than the max_model_len ({max_model_len})."
                )

        if self.max_long_partial_prefills > self.max_num_partial_prefills:
            raise ValueError(
                f"{self.max_long_partial_prefills=} must be less than or equal to "
                f"{self.max_num_partial_prefills=}."
            )

        return self
|
||||
644
vllm/config/speculative.py
Normal file
644
vllm/config/speculative.py
Normal file
@@ -0,0 +1,644 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import ast
|
||||
from typing import TYPE_CHECKING, Any, Literal, get_args
|
||||
|
||||
from pydantic import Field, SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.hashing import safe_hash
|
||||
from vllm.utils.import_utils import LazyLoader, has_arctic_inference
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
import vllm.model_executor.layers.quantization as me_quant
|
||||
else:
|
||||
PretrainedConfig = Any
|
||||
|
||||
me_quant = LazyLoader(
|
||||
"model_executor", globals(), "vllm.model_executor.layers.quantization"
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Multi-token-prediction (MTP) model types; SpeculativeConfig unifies the
# deprecated per-model variants to plain "mtp".
MTPModelTypes = Literal[
    "deepseek_mtp",
    "mimo_mtp",
    "glm4_moe_mtp",
    "ernie_mtp",
    "qwen3_next_mtp",
    "longcat_flash_mtp",
    "mtp",
    "pangu_ultra_moe_mtp",
]
# EAGLE-family methods (EAGLE heads plus all MTP variants).
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
# Every speculative decoding method accepted by SpeculativeConfig.method.
SpeculativeMethod = Literal[
    "ngram",
    "medusa",
    "mlp_speculator",
    "draft_model",
    "suffix",
    EagleModelTypes,
]
|
||||
|
||||
|
||||
@config
@dataclass
class SpeculativeConfig:
    """Configuration for speculative decoding."""

    enforce_eager: bool | None = None
    """Override the default enforce_eager from model_config"""
    # General speculative decoding control
    # NOTE(review): annotated int but defaults to None; pydantic's gt=0 only
    # applies when a value is provided — confirm None is intentional here.
    num_speculative_tokens: int = Field(default=None, gt=0)
    """The number of speculative tokens, if provided. It will default to the
    number in the draft model config if present, otherwise, it is required."""
    model: str | None = None
    """The name of the draft model, eagle head, or additional weights, if
    provided."""
    method: SpeculativeMethod | None = None
    """The name of the speculative method to use. If users provide and set the
    `model` param, the speculative method type will be detected automatically
    if possible, if `model` param is not provided, the method name must be
    provided.

    If using `ngram` method, the related configuration `prompt_lookup_max` and
    `prompt_lookup_min` should be considered."""
    draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
    """The degree of the tensor parallelism for the draft model. Can only be 1
    or the same as the target model's tensor parallel size."""

    # Draft model configuration
    quantization: me_quant.QuantizationMethods | None = None
    """Quantization method that was used to quantize the draft model weights.
    If `None`, we assume the model weights are not quantized. Note that it only
    takes effect when using the draft model-based speculative method."""
    max_model_len: int | None = Field(default=None, ge=1)
    """The maximum model length of the draft model. Used when testing the
    ability to skip speculation for some sequences."""
    revision: str | None = None
    """The specific model version to use for the draft model. It can be a
    branch name, a tag name, or a commit id. If unspecified, will use the
    default version."""
    code_revision: str | None = None
    """The specific revision to use for the draft model code on Hugging Face
    Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
    will use the default version."""

    # Advanced control
    disable_by_batch_size: int | None = Field(default=None, ge=2)
    """Disable speculative decoding for new incoming requests when the number
    of enqueued requests is larger than this value, if provided."""
    disable_padded_drafter_batch: bool = False
    """Disable input padding for speculative decoding. If set to True,
    speculative input batches can contain sequences of different lengths,
    which may only be supported by certain attention backends. This currently
    only affects the EAGLE method of speculation."""

    # Ngram proposer configuration
    prompt_lookup_max: int | None = Field(default=None, ge=1)
    """Maximum size of ngram token window when using Ngram proposer, required
    when method is set to ngram."""
    prompt_lookup_min: int | None = Field(default=None, ge=1)
    """Minimum size of ngram token window when using Ngram proposer, if
    provided. Defaults to 1."""

    speculative_token_tree: str | None = None
    """Specifies the tree structure for speculative token generation.
    """
    # required configuration params passed from engine
    target_model_config: SkipValidation[ModelConfig] = None  # type: ignore
    """The configuration of the target model."""
    target_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
    """The parallel configuration for the target model."""

    # params generated in the post-init stage
    draft_model_config: SkipValidation[ModelConfig] = None  # type: ignore
    """The configuration of the draft model initialized internal."""
    draft_parallel_config: SkipValidation[ParallelConfig] = None  # type: ignore
    """The parallel configuration for the draft model initialized internal."""

    # Suffix decoding configuration
    suffix_decoding_max_tree_depth: int = 24
    """The maximum depth of the suffix decoding global and prompt trees. The
    tree depth limits the sum of the prefix match and speculation lengths."""

    suffix_decoding_max_cached_requests: int = 10000
    """The maximum number of requests to cache in the global suffix tree. If
    exceeded, will trigger eviction in FIFO order. If set to 0, the global
    suffix tree is disabled and past responses are not cached (prompt trees
    are still used)."""

    suffix_decoding_max_spec_factor: float = 1.0
    """The maximum spec factor for suffix decoding. The spec factor controls
    speculation lengths based on the prefix match length: max_spec_tokens =
    max_spec_factor * prefix_match_length."""

    suffix_decoding_min_token_prob: float = 0.1
    """The minimum token probability for suffix decoding. Will only speculate
    tokens with estimated probability (based on frequency counts) greater than
    or equal to this value."""
|
||||
def compute_hash(self) -> str:
    """Return a stable hash of the factors that shape the compute graph.

    WARNING: Whenever a new field is added to this config, ensure that it
    is included in the factors list if it affects the computation graph.

    The hash uniquely identifies every config that influences the graph
    from input ids/embeddings through to the final hidden states; anything
    outside that span is deliberately excluded.
    """
    # Eagle3 is the only method here that alters the computation graph:
    # it additionally returns intermediate hidden states.
    graph_factors: list[Any] = [self.method == "eagle3"]
    return safe_hash(str(graph_factors).encode(), usedforsecurity=False).hexdigest()
|
||||
|
||||
@staticmethod
|
||||
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
initial_architecture = hf_config.architectures[0]
|
||||
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
|
||||
hf_config.model_type = "deepseek_mtp"
|
||||
if hf_config.model_type == "deepseek_mtp":
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
|
||||
)
|
||||
if hf_config.model_type in ("pangu_ultra_moe"):
|
||||
hf_config.model_type = "pangu_ultra_moe_mtp"
|
||||
if hf_config.model_type == "pangu_ultra_moe_mtp":
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
|
||||
)
|
||||
|
||||
if hf_config.architectures[0] == "MiMoForCausalLM":
|
||||
hf_config.model_type = "mimo_mtp"
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{
|
||||
"num_hidden_layers": 0,
|
||||
"n_predict": n_predict,
|
||||
"architectures": ["MiMoMTPModel"],
|
||||
}
|
||||
)
|
||||
|
||||
if hf_config.architectures[0] == "Glm4MoeForCausalLM":
|
||||
hf_config.model_type = "glm4_moe_mtp"
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{
|
||||
"num_hidden_layers": 0,
|
||||
"n_predict": n_predict,
|
||||
"architectures": ["Glm4MoeMTPModel"],
|
||||
}
|
||||
)
|
||||
|
||||
if hf_config.model_type == "ernie4_5_moe":
|
||||
hf_config.model_type = "ernie_mtp"
|
||||
if hf_config.model_type == "ernie_mtp":
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
|
||||
)
|
||||
|
||||
if hf_config.model_type == "qwen3_next":
|
||||
hf_config.model_type = "qwen3_next_mtp"
|
||||
if hf_config.model_type == "qwen3_next_mtp":
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
hf_config.update(
|
||||
{"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
|
||||
)
|
||||
if hf_config.model_type == "longcat_flash":
|
||||
hf_config.model_type = "longcat_flash_mtp"
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
|
||||
hf_config.update(
|
||||
{"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
|
||||
)
|
||||
|
||||
if initial_architecture == "MistralLarge3ForCausalLM":
|
||||
hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]})
|
||||
|
||||
return hf_config
|
||||
|
||||
def __post_init__(self):
|
||||
# Note: "method" is a new parameter that helps to extend the
|
||||
# configuration of non-model-based proposers, and the "model" parameter
|
||||
# will be used to set the draft model, eagle head, or additional weight
|
||||
# when needed. If users do not specify "method", the speculative method
|
||||
# will be detected automatically if possible. If the speculative method
|
||||
# can not be detected, it will be considered as the "draft_model" by
|
||||
# default.
|
||||
|
||||
if self.method in get_args(MTPModelTypes) and self.method != "mtp":
|
||||
logger.warning(
|
||||
"method `%s` is deprecated and replaced with mtp.", self.method
|
||||
)
|
||||
self.method = "mtp"
|
||||
|
||||
if self.model is None and self.num_speculative_tokens is not None:
|
||||
if self.method == "mtp":
|
||||
if self.target_model_config is None:
|
||||
raise ValueError("target_model_config must be present for mtp")
|
||||
if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
|
||||
# FIXME(luccafong): cudgraph with v32 MTP is not supported,
|
||||
# remove this when the issue is fixed.
|
||||
self.enforce_eager = True
|
||||
# use the draft model from the same model:
|
||||
self.model = self.target_model_config.model
|
||||
# Align the quantization of draft model for cases such as
|
||||
# --quantization fp8 with a bf16 checkpoint.
|
||||
if not self.quantization:
|
||||
self.quantization = self.target_model_config.quantization
|
||||
elif self.method in ("ngram", "[ngram]"):
|
||||
self.model = "ngram"
|
||||
elif self.method == "suffix":
|
||||
self.model = "suffix"
|
||||
else:
|
||||
raise ValueError(
|
||||
"num_speculative_tokens was provided but without speculative model."
|
||||
)
|
||||
|
||||
# Automatically configure the method for ngram when "model" is used
|
||||
# instead of "method"
|
||||
if self.method is None and (
|
||||
self.model is not None and self.model in ("ngram", "[ngram]")
|
||||
):
|
||||
self.method = "ngram"
|
||||
|
||||
if self.method in ("ngram", "[ngram]"):
|
||||
# Unified to "ngram" internally
|
||||
self.method = "ngram"
|
||||
# Set default values if not provided
|
||||
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
|
||||
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
|
||||
self.prompt_lookup_min = 5
|
||||
self.prompt_lookup_max = 5
|
||||
elif self.prompt_lookup_min is None:
|
||||
if self.prompt_lookup_max is None:
|
||||
raise ValueError(
|
||||
"Either prompt_lookup_max or prompt_lookup_min must be "
|
||||
"provided when using the ngram method."
|
||||
)
|
||||
self.prompt_lookup_min = self.prompt_lookup_max
|
||||
elif self.prompt_lookup_max is None:
|
||||
if self.prompt_lookup_min is None:
|
||||
raise ValueError(
|
||||
"Either prompt_lookup_max or prompt_lookup_min must be "
|
||||
"provided when using the ngram method."
|
||||
)
|
||||
self.prompt_lookup_max = self.prompt_lookup_min
|
||||
|
||||
# Validate values
|
||||
if self.prompt_lookup_min > self.prompt_lookup_max:
|
||||
raise ValueError(
|
||||
f"prompt_lookup_min={self.prompt_lookup_min} must "
|
||||
f"be <= prompt_lookup_max={self.prompt_lookup_max}"
|
||||
)
|
||||
|
||||
# TODO: current we still need extract vocab_size from target model
|
||||
# config, in future, we may try refactor it out, and set
|
||||
# draft related config as None here.
|
||||
self.draft_model_config = self.target_model_config
|
||||
self.draft_parallel_config = self.target_parallel_config
|
||||
elif self.method == "suffix":
|
||||
self._validate_suffix_decoding()
|
||||
else:
|
||||
self.prompt_lookup_max = 0
|
||||
self.prompt_lookup_min = 0
|
||||
|
||||
if self.model is not None:
|
||||
self.draft_model_config = ModelConfig(
|
||||
model=self.model,
|
||||
runner="draft",
|
||||
tokenizer=self.target_model_config.tokenizer,
|
||||
tokenizer_mode=self.target_model_config.tokenizer_mode,
|
||||
trust_remote_code=self.target_model_config.trust_remote_code,
|
||||
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
|
||||
allowed_media_domains=self.target_model_config.allowed_media_domains,
|
||||
dtype=self.target_model_config.dtype,
|
||||
seed=self.target_model_config.seed,
|
||||
revision=self.revision,
|
||||
code_revision=self.code_revision,
|
||||
tokenizer_revision=self.target_model_config.tokenizer_revision,
|
||||
spec_target_max_model_len=self.target_model_config.max_model_len,
|
||||
quantization=self.quantization,
|
||||
enforce_eager=self.target_model_config.enforce_eager,
|
||||
max_logprobs=self.target_model_config.max_logprobs,
|
||||
hf_overrides=SpeculativeConfig.hf_config_override,
|
||||
config_format=self.target_model_config.config_format,
|
||||
)
|
||||
|
||||
# Automatically detect the method
|
||||
if self.method in ("eagle", "eagle3"):
|
||||
pass
|
||||
# examples:
|
||||
# yuhuili/EAGLE-LLaMA3-Instruct-8B
|
||||
# yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
|
||||
# AngelSlim/Qwen3-8B_eagle3
|
||||
elif "eagle-" in self.draft_model_config.model.lower():
|
||||
self.method = "eagle"
|
||||
elif "eagle3" in self.draft_model_config.model.lower():
|
||||
self.method = "eagle3"
|
||||
elif self.draft_model_config.hf_config.model_type == "medusa":
|
||||
self.method = "medusa"
|
||||
elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
|
||||
self.method = "mlp_speculator"
|
||||
elif self.draft_model_config.hf_config.model_type in get_args(
|
||||
MTPModelTypes
|
||||
):
|
||||
self.method = "mtp"
|
||||
if self.num_speculative_tokens > 1:
|
||||
logger.warning(
|
||||
"Enabling num_speculative_tokens > 1 will run"
|
||||
"multiple times of forward on same MTP layer"
|
||||
",which may result in lower acceptance rate"
|
||||
)
|
||||
elif self.draft_model_config.hf_config.model_type in (
|
||||
"longcat_flash_mtp"
|
||||
):
|
||||
self.method = "longcat_flash_mtp"
|
||||
if self.num_speculative_tokens > 1:
|
||||
logger.warning(
|
||||
"LongCat MTP models only have "
|
||||
"one layer. Might need some code changes "
|
||||
"to support multiple layers."
|
||||
)
|
||||
else:
|
||||
self.method = "draft_model"
|
||||
raise NotImplementedError(
|
||||
"Speculative decoding with draft model is not "
|
||||
"supported yet. Please consider using other "
|
||||
"speculative decoding methods such as ngram, medusa, "
|
||||
"eagle, or mtp."
|
||||
)
|
||||
|
||||
# Replace hf_config for EAGLE draft_model
|
||||
if self.method in ("eagle", "eagle3"):
|
||||
from vllm.transformers_utils.configs import SpeculatorsConfig
|
||||
from vllm.transformers_utils.configs.eagle import EAGLEConfig
|
||||
|
||||
if isinstance(
|
||||
self.draft_model_config.hf_config,
|
||||
(EAGLEConfig, SpeculatorsConfig),
|
||||
):
|
||||
pass
|
||||
else:
|
||||
eagle_config = EAGLEConfig(
|
||||
self.draft_model_config.hf_config,
|
||||
method=self.method,
|
||||
model_type="eagle",
|
||||
)
|
||||
self.draft_model_config.hf_config = eagle_config
|
||||
|
||||
if self.num_speculative_tokens is not None and hasattr(
|
||||
self.draft_model_config.hf_config, "num_lookahead_tokens"
|
||||
):
|
||||
self.draft_model_config.hf_config.num_lookahead_tokens = (
|
||||
self.num_speculative_tokens
|
||||
)
|
||||
|
||||
n_predict = getattr(
|
||||
self.draft_model_config.hf_config, "n_predict", None
|
||||
)
|
||||
if n_predict is not None:
|
||||
if self.num_speculative_tokens is None:
|
||||
# Default to max value defined in draft model config.
|
||||
self.num_speculative_tokens = n_predict
|
||||
elif (
|
||||
self.num_speculative_tokens > n_predict
|
||||
and self.num_speculative_tokens % n_predict != 0
|
||||
):
|
||||
# Ensure divisibility for MTP module reuse.
|
||||
raise ValueError(
|
||||
f"num_speculative_tokens:{self.num_speculative_tokens}"
|
||||
f" must be divisible by {n_predict=}"
|
||||
)
|
||||
|
||||
if self.speculative_token_tree is None:
|
||||
# Generate chain of tokens.
|
||||
self.speculative_token_tree = str(
|
||||
[(i + 1) * (0,) for i in range(self.num_speculative_tokens)]
|
||||
)
|
||||
else:
|
||||
# Sort the token tree breadth-first.
|
||||
tree_choices = ast.literal_eval(self.speculative_token_tree)
|
||||
self.speculative_token_tree = str(
|
||||
sorted(tree_choices, key=lambda t: (len(t), t))
|
||||
)
|
||||
|
||||
self.draft_tensor_parallel_size = (
|
||||
SpeculativeConfig._verify_and_get_draft_tp(
|
||||
self.target_parallel_config,
|
||||
self.draft_tensor_parallel_size,
|
||||
self.draft_model_config.hf_config,
|
||||
)
|
||||
)
|
||||
|
||||
self.draft_model_config.max_model_len = (
|
||||
SpeculativeConfig._maybe_override_draft_max_model_len(
|
||||
self.max_model_len,
|
||||
self.draft_model_config.max_model_len,
|
||||
self.target_model_config.max_model_len,
|
||||
)
|
||||
)
|
||||
|
||||
self.draft_parallel_config = (
|
||||
SpeculativeConfig.create_draft_parallel_config(
|
||||
self.target_parallel_config, self.draft_tensor_parallel_size
|
||||
)
|
||||
)
|
||||
return self
|
||||
|
||||
def _validate_suffix_decoding(self):
|
||||
if not has_arctic_inference():
|
||||
raise ImportError(
|
||||
"Arctic Inference is required for suffix decoding. "
|
||||
"Install via `pip install arctic-inference==0.1.1`."
|
||||
)
|
||||
if self.num_speculative_tokens is None:
|
||||
# Suffix decoding decides the actual number of speculative tokens
|
||||
# dynamically and treats num_speculative_tokens as a maximum limit.
|
||||
self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
|
||||
logger.warning(
|
||||
"Defaulted num_speculative_tokens to %s for suffix decoding.",
|
||||
self.num_speculative_tokens,
|
||||
)
|
||||
# Validate values
|
||||
if self.suffix_decoding_max_tree_depth < 1:
|
||||
raise ValueError(
|
||||
f"suffix_decoding_max_tree_depth="
|
||||
f"{self.suffix_decoding_max_tree_depth} must be >= 1"
|
||||
)
|
||||
if self.suffix_decoding_max_cached_requests < 0:
|
||||
raise ValueError(
|
||||
f"suffix_decoding_max_cached_requests="
|
||||
f"{self.suffix_decoding_max_cached_requests} must be >= 0"
|
||||
)
|
||||
if self.suffix_decoding_max_spec_factor < 0:
|
||||
raise ValueError(
|
||||
f"suffix_decoding_max_spec_factor="
|
||||
f"{self.suffix_decoding_max_spec_factor} must be >= 0"
|
||||
)
|
||||
if not 0 <= self.suffix_decoding_min_token_prob <= 1:
|
||||
raise ValueError(
|
||||
f"suffix_decoding_min_token_prob="
|
||||
f"{self.suffix_decoding_min_token_prob} must be in [0, 1]"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _maybe_override_draft_max_model_len(
|
||||
speculative_max_model_len: int | None,
|
||||
draft_max_model_len: int,
|
||||
target_max_model_len: int,
|
||||
) -> int:
|
||||
"""Determine the max sequence len for the draft model. This is usually
|
||||
the draft_max_model_len, but may be the target_max_model_len if it is
|
||||
less than the draft_max_model_len, or may be speculative_max_model_len
|
||||
if it is specified.
|
||||
|
||||
This is necessary so that sequences do not exceed the capacity of the
|
||||
draft model or the target model.
|
||||
|
||||
speculative_max_model_len is mainly used for testing that sequences can
|
||||
skip speculation.
|
||||
"""
|
||||
|
||||
if speculative_max_model_len is not None:
|
||||
if speculative_max_model_len > draft_max_model_len:
|
||||
raise ValueError(
|
||||
f"{speculative_max_model_len=} cannot be "
|
||||
f"larger than {draft_max_model_len=}"
|
||||
)
|
||||
|
||||
if speculative_max_model_len > target_max_model_len:
|
||||
raise ValueError(
|
||||
f"{speculative_max_model_len=} cannot be "
|
||||
f"larger than {target_max_model_len=}"
|
||||
)
|
||||
|
||||
return speculative_max_model_len
|
||||
|
||||
return min(
|
||||
draft_max_model_len,
|
||||
target_max_model_len,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _verify_and_get_draft_tp(
|
||||
target_parallel_config: ParallelConfig,
|
||||
speculative_draft_tensor_parallel_size: int | None,
|
||||
draft_hf_config: PretrainedConfig,
|
||||
) -> int:
|
||||
"""
|
||||
Verifies and adjusts the tensor parallel size for a draft model
|
||||
specified using speculative_draft_tensor_parallel_size.
|
||||
"""
|
||||
# If speculative_draft_tensor_parallel_size is unset then set it
|
||||
# appropriately else verify that it is set correctly.
|
||||
if speculative_draft_tensor_parallel_size is None:
|
||||
if draft_hf_config.model_type == "mlp_speculator":
|
||||
speculative_draft_tensor_parallel_size = 1
|
||||
if target_parallel_config.tensor_parallel_size > 1:
|
||||
logger.warning(
|
||||
"%s cannot currently be run with tp>1; "
|
||||
"setting speculative_draft_tensor_parallel_size=1",
|
||||
draft_hf_config.model_type,
|
||||
)
|
||||
else:
|
||||
speculative_draft_tensor_parallel_size = (
|
||||
target_parallel_config.tensor_parallel_size
|
||||
)
|
||||
elif speculative_draft_tensor_parallel_size not in (
|
||||
1,
|
||||
target_parallel_config.tensor_parallel_size,
|
||||
):
|
||||
raise ValueError(
|
||||
f"{speculative_draft_tensor_parallel_size=} cannot be "
|
||||
f"other value than 1 or target model tensor_parallel_size"
|
||||
)
|
||||
return speculative_draft_tensor_parallel_size
|
||||
|
||||
@staticmethod
|
||||
def create_draft_parallel_config(
|
||||
target_parallel_config: ParallelConfig,
|
||||
speculative_draft_tensor_parallel_size: int,
|
||||
) -> ParallelConfig:
|
||||
"""Create a parallel config for use by the draft worker.
|
||||
|
||||
This is mostly a copy of the target parallel config, except the tp_size.
|
||||
"""
|
||||
draft_parallel_config = ParallelConfig(
|
||||
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
|
||||
tensor_parallel_size=speculative_draft_tensor_parallel_size,
|
||||
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
|
||||
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
|
||||
disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
|
||||
ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
|
||||
placement_group=target_parallel_config.placement_group,
|
||||
)
|
||||
|
||||
return draft_parallel_config
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _verify_args(self) -> Self:
|
||||
if self.num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
"num_speculative_tokens must be provided with "
|
||||
"speculative model unless the draft model config contains an "
|
||||
"n_predict parameter."
|
||||
)
|
||||
|
||||
if self.num_speculative_tokens <= 0:
|
||||
raise ValueError(
|
||||
"Expected num_speculative_tokens to be greater "
|
||||
f"than zero ({self.num_speculative_tokens})."
|
||||
)
|
||||
|
||||
if self.draft_model_config:
|
||||
self.draft_model_config.verify_with_parallel_config(
|
||||
self.draft_parallel_config
|
||||
)
|
||||
|
||||
if self.disable_by_batch_size is not None and self.disable_by_batch_size < 2:
|
||||
raise ValueError(
|
||||
"Expect the batch size threshold of disabling "
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{self.disable_by_batch_size=}"
|
||||
)
|
||||
|
||||
eagle3_target_supported = ["llama", "qwen", "minicpm", "gpt_oss"]
|
||||
if (
|
||||
self.method == "eagle3"
|
||||
and self.target_model_config
|
||||
and not any(
|
||||
supported_model in self.target_model_config.hf_text_config.model_type
|
||||
for supported_model in eagle3_target_supported
|
||||
)
|
||||
):
|
||||
raise ValueError(
|
||||
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
|
||||
f"Got {self.target_model_config.hf_text_config.model_type=}"
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def use_eagle(self) -> bool:
|
||||
return self.method in ("eagle", "eagle3", "mtp")
|
||||
|
||||
def __repr__(self) -> str:
|
||||
method = self.method
|
||||
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
|
||||
num_spec_tokens = self.num_speculative_tokens
|
||||
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
||||
38
vllm/config/speech_to_text.py
Normal file
38
vllm/config/speech_to_text.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.utils import config
|
||||
|
||||
|
||||
@config
@dataclass
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    # NOTE(review): the bare-string docstrings under each field appear to be
    # consumed programmatically (see get_attr_docs in vllm.config.utils) to
    # build help text — keep them directly beneath their fields.
    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        # Chunking is enabled iff a split window size is configured.
        return self.min_energy_split_window_size is not None
|
||||
78
vllm/config/structured_outputs.py
Normal file
78
vllm/config/structured_outputs.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
# Allowed values for StructuredOutputsConfig.backend; "auto" lets vLLM pick a
# backend per request based on what the libraries currently support.
StructuredOutputsBackend = Literal[
    "auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
]
|
||||
|
||||
|
||||
@config
@dataclass
class StructuredOutputsConfig:
    """Dataclass which contains structured outputs config for the engine."""

    backend: StructuredOutputsBackend = "auto"
    """Which engine will be used for structured outputs (e.g. JSON schema,
    regex, etc) by default. With "auto", we will make opinionated choices
    based on request contents and what the backend libraries currently support,
    so the behavior is subject to change in each release."""
    disable_fallback: bool = False
    """If `True`, vLLM will not fallback to a different backend on error."""
    disable_any_whitespace: bool = False
    """If `True`, json output will always be compact without any whitespace.
    If `False`, the model may generate whitespace between JSON fields,
    which is still valid JSON. This is only supported for xgrammar
    and guidance backends."""
    disable_additional_properties: bool = False
    """If `True`, the `guidance` backend will not use `additionalProperties`
    in the JSON schema. This is only supported for the `guidance` backend and
    is used to better align its behaviour with `outlines` and `xgrammar`."""
    reasoning_parser: str = ""
    """Select the reasoning parser depending on the model that you're using.
    This is used to parse the reasoning content into OpenAI API format."""
    reasoning_parser_plugin: str = ""
    """Path to a dynamically reasoning parser plugin that can be dynamically
    loaded and registered."""
    enable_in_reasoning: bool = False
    """Whether to use structured input for reasoning."""

    def compute_hash(self) -> str:
        """Hash the factors of this config that shape the computation graph.

        Structured-outputs options only constrain sampling; none of them
        affect the computation graph from input ids/embeddings to the final
        hidden states, so the factor list is empty and the digest constant.
        """
        factors: list[Any] = []
        return safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()

    @model_validator(mode="after")
    def _validate_structured_output_config(self) -> Self:
        """Reject option combinations the selected backend cannot honor."""
        whitespace_backends = ("xgrammar", "guidance")
        if self.disable_any_whitespace and self.backend not in whitespace_backends:
            raise ValueError(
                "disable_any_whitespace is only supported for "
                "xgrammar and guidance backends."
            )
        if self.disable_additional_properties and self.backend != "guidance":
            raise ValueError(
                "disable_additional_properties is only supported "
                "for the guidance backend."
            )
        return self
|
||||
370
vllm/config/utils.py
Normal file
370
vllm/config/utils.py
Normal file
@@ -0,0 +1,370 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Utility functions for vLLM config dataclasses."""
|
||||
|
||||
import ast
|
||||
import enum
|
||||
import hashlib
|
||||
import inspect
|
||||
import json
|
||||
import pathlib
|
||||
import textwrap
|
||||
from collections.abc import Iterable, Mapping, Sequence, Set
|
||||
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
|
||||
from itertools import pairwise
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from pydantic.fields import FieldInfo
|
||||
from typing_extensions import runtime_checkable
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
# Module-level logger for the config utilities.
logger = init_logger(__name__)

# DataclassInstance only exists in typeshed stubs, so it is importable for
# type checkers only; fall back to Any at runtime.
if TYPE_CHECKING:
    from _typeshed import DataclassInstance
else:
    DataclassInstance = Any

# A dataclass *type* (not an instance); ConfigT is the matching TypeVar used
# by the generic helpers below.
ConfigType = type[DataclassInstance]
ConfigT = TypeVar("ConfigT", bound=ConfigType)
|
||||
|
||||
|
||||
def config(cls: ConfigT) -> ConfigT:
    """Marker decorator for vLLM config dataclasses.

    The class is returned unchanged at runtime; the decorator exists so that
    tooling can recognize config classes:

    - When a `ConfigT` is itself used as a CLI argument, the `type` keyword
      argument provided by `get_kwargs` is
      `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)`, i.e. the CLI
      value is treated as a JSON string and validated by `pydantic`.
    - tools/pre_commit/validate_config.py (run during the pre-commit checks)
      validates that every field has a default value and a docstring.
    """
    return cls
|
||||
|
||||
|
||||
def get_field(cls: ConfigType, name: str) -> Field:
    """Return a fresh `dataclasses.field` replicating `cls.<name>`'s default.

    Used for getting default factory fields in `EngineArgs` without sharing
    mutable defaults between dataclasses.

    Raises:
        TypeError: if `cls` is not a dataclass.
        ValueError: if `name` is not a field of `cls`, or the field has
            neither a default value nor a default factory.
    """
    if not is_dataclass(cls):
        raise TypeError("The given class is not a dataclass.")

    by_name = {f.name: f for f in fields(cls)}
    if name not in by_name:
        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")

    target: Field = by_name[name]
    if target.default_factory is not MISSING:
        return field(default_factory=target.default_factory)

    default = target.default
    if default is not MISSING:
        # pydantic.Field defaults wrap the real default/factory in a FieldInfo.
        if isinstance(default, FieldInfo):
            if default.default_factory is not None:
                return field(default_factory=default.default_factory)
            default = default.default
        return field(default=default)

    raise ValueError(
        f"{cls.__name__}.{name} must have a default value or default factory."
    )
|
||||
|
||||
|
||||
def getattr_iter(
    object: object, names: Iterable[str], default: Any, warn: bool = False
) -> Any:
    """
    A helper function that retrieves an attribute from an object which may
    have multiple possible names. This is useful when fetching attributes from
    arbitrary `transformers.PretrainedConfig` instances.

    In the case where the first name in `names` is the preferred name, and
    any other names are deprecated aliases, setting `warn=True` will log a
    warning when a deprecated name is used.

    Args:
        object: The object to read the attribute from.
        names: Candidate attribute names, preferred name first.
        default: Returned when none of the names is present.
        warn: Log a deprecation warning when a non-first name matches.
    """
    # Materialize `names` so that one-shot iterables (e.g. generators) work
    # and `names[0]` below is always indexable; previously a non-indexable
    # iterable combined with warn=True raised TypeError.
    names = tuple(names)
    for i, name in enumerate(names):
        if hasattr(object, name):
            if warn and i > 0:
                logger.warning_once(
                    "%s contains a deprecated attribute name '%s'. "
                    "Please use the preferred attribute name '%s' instead.",
                    type(object).__name__,
                    name,
                    names[0],
                )
            return getattr(object, name)
    return default
|
||||
|
||||
|
||||
def contains_object_print(text: str) -> bool:
    """
    Detect whether `text` contains a printed Python object, i.e. any
    substring matching "at 0x<hex>>" where the hex id has 2-16 digits
    (16 being the maximum on a 64-bit system).

    Args:
        text (str): The text to check

    Returns:
        result (bool): `True` if a match is found, `False` otherwise.
    """
    return re.search(r"at 0x[a-fA-F0-9]{2,16}>", text) is not None
|
||||
|
||||
|
||||
def assert_hashable(text: str) -> bool:
    """Assert that `text` is safe to hash (no printed object ids inside).

    Returns True when clean; raises AssertionError when the text contains a
    default object repr, which would make the hash unstable across runs.
    """
    if contains_object_print(text):
        raise AssertionError(
            f"vLLM tried to hash some configs that may have Python objects ids "
            f"in them. This is a bug, please file an issue. "
            f"Text being hashed: {text}"
        )
    return True
|
||||
|
||||
|
||||
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
|
||||
"""
|
||||
Get any docstrings placed after attribute assignments in a class body.
|
||||
|
||||
https://davidism.com/mit-license/
|
||||
"""
|
||||
|
||||
cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
|
||||
|
||||
if not isinstance(cls_node, ast.ClassDef):
|
||||
raise TypeError("Given object was not a class.")
|
||||
|
||||
out = {}
|
||||
|
||||
# Consider each pair of nodes.
|
||||
for a, b in pairwise(cls_node.body):
|
||||
# Must be an assignment then a constant string.
|
||||
if (
|
||||
not isinstance(a, (ast.Assign, ast.AnnAssign))
|
||||
or not isinstance(b, ast.Expr)
|
||||
or not isinstance(b.value, ast.Constant)
|
||||
or not isinstance(b.value.value, str)
|
||||
):
|
||||
continue
|
||||
|
||||
doc = inspect.cleandoc(b.value.value)
|
||||
|
||||
# An assignment can have multiple targets (a = b = v), but an
|
||||
# annotated assignment only has one target.
|
||||
targets = a.targets if isinstance(a, ast.Assign) else [a.target]
|
||||
|
||||
for target in targets:
|
||||
# Must be assigning to a plain name.
|
||||
if not isinstance(target, ast.Name):
|
||||
continue
|
||||
|
||||
out[target.id] = doc
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def is_init_field(cls: ConfigType, name: str) -> bool:
    """Whether field `name` of dataclass `cls` is part of `__init__`."""
    return next(f.init for f in fields(cls) if f.name == name)
|
||||
|
||||
|
||||
@runtime_checkable
class SupportsHash(Protocol):
    """Structural type for objects that expose a `compute_hash()` digest."""

    def compute_hash(self) -> str: ...
|
||||
|
||||
|
||||
class SupportsMetricsInfo(Protocol):
    """Structural type for objects that expose `metrics_info()` key/values."""

    def metrics_info(self) -> dict[str, str]: ...
|
||||
|
||||
|
||||
def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
    """Return a copy of `config` with `overrides` applied.

    A dict value targeting a dataclass-typed field is applied recursively,
    so nested configs can be overridden field-by-field.
    """
    processed: dict[str, Any] = {}
    for field_name, value in overrides.items():
        assert hasattr(config, field_name), (
            f"{type(config)} has no field `{field_name}`"
        )
        current = getattr(config, field_name)
        if is_dataclass(current) and not is_dataclass(value):
            assert isinstance(value, dict), (
                f"Overrides to {type(config)}.{field_name} must be a dict"
                f" or {type(current)}, but got {type(value)}"
            )
            value = update_config(
                current,  # type: ignore[type-var]
                value,
            )
        processed[field_name] = value
    return replace(config, **processed)
|
||||
|
||||
|
||||
def normalize_value(x):
    """Return a stable, JSON-serializable canonical form for hashing.
    Order: primitives, special types (Enum, callable, torch.dtype, Path), then
    generic containers (Mapping/Set/Sequence) with recursion.

    Raises:
        TypeError: for functions, callable instances without `uuid()`, and
            any other type without a stable encoding (to avoid under-hashing).
    """
    # Fast path
    # NOTE(review): IntEnum/StrEnum members satisfy this isinstance check and
    # are returned untagged, bypassing the Enum FQN tagging below — confirm
    # that is intended.
    if x is None or isinstance(x, (bool, int, float, str)):
        return x

    # Enums: tag with FQN to avoid primitive collisions.
    # Ex: Enum(1) vs int(1) -> ("module.QualName", value).
    if isinstance(x, enum.Enum):
        enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
        return (enum_type, normalize_value(x.value))

    # Classes (types) are accepted and canonicalized by their fully-qualified
    # name (module.qualname) for a stable identifier.
    # Instances are only accepted if they expose uuid(); otherwise they are
    # rejected to avoid under-hashing object state.

    # Callables: accept classes only; reject funcs/lambdas/methods.
    # Used by LogitsProcessor types and ModelConfig.hf_overrides.
    if isinstance(x, type):
        module = getattr(x, "__module__", "")
        qual = getattr(x, "__qualname__", getattr(x, "__name__", ""))
        return ".".join([p for p in (module, qual) if p]) or repr(x)

    # Prefer stable uuid identifiers for objects that provide them, even if
    # they are callable instances (e.g., InductorPass wrappers).
    if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)):
        return x.uuid()

    if callable(x):
        raise TypeError("normalize_value: function or callable instance unsupported")

    # Torch dtype: stringify (torch.float64 -> "torch.float64").
    # We rely on the string form here; dtype-bearing fields that need additional
    # disambiguation should encode that at the config layer.
    if isinstance(x, torch.dtype):
        return str(x)

    # Bytes
    if isinstance(x, (bytes, bytearray)):
        return x.hex()

    # Paths (canonicalize)
    if isinstance(x, pathlib.Path):
        try:
            return str(x.expanduser().resolve())
        except Exception:
            return str(x)

    # Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability.
    if is_dataclass(x):
        type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
        items = tuple(
            (f.name, normalize_value(getattr(x, f.name)))
            for f in sorted(fields(x), key=lambda f: f.name)
        )
        return (type_fqn, items)

    # Containers (generic)
    # Mapping keys are stringified before sorting, so non-string keys are
    # canonicalized by their str() form.
    if isinstance(x, Mapping):
        return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items()))
    if isinstance(x, Set):
        return tuple(sorted(repr(normalize_value(v)) for v in x))
    if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
        return tuple(normalize_value(v) for v in x)

    # PretrainedConfig
    if hasattr(x, "to_json_string") and callable(x.to_json_string):
        return x.to_json_string()

    # Unsupported type: e.g., modules, generators, open files, or objects
    # without a stable JSON/UUID representation. Hard-error to avoid
    # under-hashing.
    # If you hit this, either reshape your config to use supported primitives
    # and containers, or extend normalize_value to provide a stable encoding
    # (e.g., via uuid() or to_json_string()) for this type.
    raise TypeError(
        f"normalize_value: unsupported type '{type(x).__name__}'. "
        "Ensure config values use supported primitives/containers or add a "
        "stable representation for this type."
    )
|
||||
|
||||
|
||||
def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]:
    """Collect the factors used for hashing a config instance.

    Every dataclass field not listed in `ignored_factors` is normalized via
    `normalize_value`; a non-normalizable value is a hard error.
    """
    factors: dict[str, object] = {}
    for name in (f.name for f in fields(config)):
        if name in ignored_factors:
            continue
        raw = getattr(config, name, None)
        try:
            factors[name] = normalize_value(raw)
        except TypeError as e:
            raise TypeError(
                f"get_hash_factors: unsupported type for key '{name}' "
                f"({type(raw).__name__})"
            ) from e
    return factors
|
||||
|
||||
|
||||
def hash_factors(items: dict[str, object]) -> str:
    """Return a SHA-256 hex digest of the canonical items structure."""
    canonical = json.dumps(items, sort_keys=True).encode()
    return hashlib.sha256(canonical).hexdigest()
|
||||
|
||||
|
||||
def handle_deprecated(
|
||||
config: ConfigT,
|
||||
old_name: str,
|
||||
new_name_or_names: str | list[str],
|
||||
removal_version: str,
|
||||
) -> None:
|
||||
old_val = getattr(config, old_name)
|
||||
if old_val is None:
|
||||
return
|
||||
|
||||
if isinstance(new_name_or_names, str):
|
||||
new_names = [new_name_or_names]
|
||||
else:
|
||||
new_names = new_name_or_names
|
||||
|
||||
msg = (
|
||||
f"{old_name} is deprecated and will be removed in {removal_version}. "
|
||||
f"Use {', '.join(new_names)} instead."
|
||||
)
|
||||
logger.warning(msg)
|
||||
|
||||
for new_name in new_names:
|
||||
setattr(config, new_name, old_val)
|
||||
|
||||
|
||||
@dataclass
class Range:
    """An inclusive integer interval [start, end]."""

    start: int
    end: int

    def is_single_size(self) -> bool:
        """True when the interval covers exactly one value."""
        return self.start == self.end

    def __contains__(self, size: int) -> bool:
        # Inclusive at both ends.
        return self.start <= size <= self.end

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Range):
            return (self.start, self.end) == (other.start, other.end)
        return False

    def __hash__(self) -> int:
        return hash((self.start, self.end))

    def __str__(self) -> str:
        return f"({self.start}, {self.end})"

    def __repr__(self) -> str:
        return str(self)
|
||||
1434
vllm/config/vllm.py
Normal file
1434
vllm/config/vllm.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user