Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

108
vllm/config/__init__.py Normal file
View File

@@ -0,0 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.config.attention import AttentionConfig
from vllm.config.cache import CacheConfig
from vllm.config.compilation import (
CompilationConfig,
CompilationMode,
CUDAGraphMode,
PassConfig,
)
from vllm.config.device import DeviceConfig
from vllm.config.ec_transfer import ECTransferConfig
from vllm.config.kv_events import KVEventsConfig
from vllm.config.kv_transfer import KVTransferConfig
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.config.model import (
ModelConfig,
iter_architecture_defaults,
try_match_architecture_defaults,
)
from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.profiler import ProfilerConfig
from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
from vllm.config.structured_outputs import StructuredOutputsConfig
from vllm.config.utils import (
ConfigType,
SupportsMetricsInfo,
config,
get_attr_docs,
is_init_field,
update_config,
)
from vllm.config.vllm import (
VllmConfig,
get_cached_compilation_config,
get_current_vllm_config,
get_layers_from_vllm_config,
set_current_vllm_config,
)
# Public names re-exported by this package; every entry below is imported
# at the top of this module.
# __all__ should only contain classes and functions.
# Types and globals should be imported from their respective modules.
__all__ = [
    # From vllm.config.attention
    "AttentionConfig",
    # From vllm.config.cache
    "CacheConfig",
    # From vllm.config.compilation
    "CompilationConfig",
    "CompilationMode",
    "CUDAGraphMode",
    "PassConfig",
    # From vllm.config.device
    "DeviceConfig",
    # From vllm.config.ec_transfer
    "ECTransferConfig",
    # From vllm.config.kv_events
    "KVEventsConfig",
    # From vllm.config.kv_transfer
    "KVTransferConfig",
    # From vllm.config.load
    "LoadConfig",
    # From vllm.config.lora
    "LoRAConfig",
    # From vllm.config.model
    "ModelConfig",
    "iter_architecture_defaults",
    "try_match_architecture_defaults",
    # From vllm.config.multimodal
    "MultiModalConfig",
    # From vllm.config.observability
    "ObservabilityConfig",
    # From vllm.config.parallel
    "EPLBConfig",
    "ParallelConfig",
    # From vllm.config.pooler
    "PoolerConfig",
    # From vllm.config.scheduler
    "SchedulerConfig",
    # From vllm.config.speculative
    "SpeculativeConfig",
    # From vllm.config.speech_to_text
    "SpeechToTextConfig",
    # From vllm.config.structured_outputs
    "StructuredOutputsConfig",
    # From vllm.config.profiler
    "ProfilerConfig",
    # From vllm.config.utils
    "ConfigType",
    "SupportsMetricsInfo",
    "config",
    "get_attr_docs",
    "is_init_field",
    "update_config",
    # From vllm.config.vllm
    "VllmConfig",
    "get_cached_compilation_config",
    "get_current_vllm_config",
    "set_current_vllm_config",
    "get_layers_from_vllm_config",
]

114
vllm/config/attention.py Normal file
View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import field_validator
from pydantic.dataclasses import dataclass
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config.utils import config
from vllm.logger import init_logger
# Module-level logger, created for this module's `__name__`; used by
# `AttentionConfig` to emit the env-var deprecation warning.
logger = init_logger(__name__)
@config
@dataclass
class AttentionConfig:
    """Configuration for attention mechanisms in vLLM."""

    backend: AttentionBackendEnum | None = None
    """Attention backend to use. If None, will be selected automatically."""
    flash_attn_version: Literal[2, 3] | None = None
    """Force vllm to use a specific flash-attention version (2 or 3).
    Only valid when using the flash-attention backend."""
    use_prefill_decode_attention: bool = False
    """Use separate prefill and decode kernels for attention instead of
    the unified triton kernel."""
    flash_attn_max_num_splits_for_cuda_graph: int = 32
    """Flash Attention max number splits for cuda graph decode."""
    use_cudnn_prefill: bool = False
    """Whether to use cudnn prefill."""
    use_trtllm_ragged_deepseek_prefill: bool = False
    """Whether to use TRTLLM ragged deepseek prefill."""
    use_trtllm_attention: bool | None = None
    """If set to True/False, use or don't use the TRTLLM attention backend
    in flashinfer. If None, auto-detect the attention backend in flashinfer."""
    disable_flashinfer_prefill: bool = False
    """Whether to disable flashinfer prefill."""
    disable_flashinfer_q_quantization: bool = False
    """If set, when using fp8 kv, do not quantize Q to fp8."""

    def compute_hash(self) -> str:
        """
        Return a hash identifying the settings of this config that affect
        the structure of the computation graph (from input ids/embeddings
        to the final hidden states, excluding anything before or after).

        No field is excluded from the hash factors.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded: list[str] = []
        return hash_factors(get_hash_factors(self, excluded))

    @field_validator("backend", mode="before")
    @classmethod
    def validate_backend_before(cls, value: Any) -> Any:
        """Look up a string `backend` by its upper-cased enum member name.

        Non-string values are passed through untouched.
        """
        if not isinstance(value, str):
            return value
        return AttentionBackendEnum[value.upper()]

    def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None:
        """Overwrite `field_name` from `env_var_name` when that variable is set.

        A deprecation warning is logged whenever the override is applied.
        """
        from vllm import envs

        if not envs.is_set(env_var_name):
            return

        env_value = getattr(envs, env_var_name)
        # `backend` goes through the same string-to-enum conversion that the
        # pydantic "before" validator applies.
        if field_name == "backend":
            env_value = self.validate_backend_before(env_value)
        setattr(self, field_name, env_value)
        logger.warning_once(
            "Using %s environment variable is deprecated and will be removed in "
            "v0.14.0 or v1.0.0, whichever is soonest. Please use "
            "--attention-config.%s command line argument or "
            "AttentionConfig(%s=...) config field instead.",
            env_var_name,
            field_name,
            field_name,
        )

    def __post_init__(self) -> None:
        """Apply the deprecated environment-variable overrides, field by field."""
        env_overrides = (
            ("backend", "VLLM_ATTENTION_BACKEND"),
            ("flash_attn_version", "VLLM_FLASH_ATTN_VERSION"),
            ("use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION"),
            (
                "flash_attn_max_num_splits_for_cuda_graph",
                "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
            ),
            ("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL"),
            (
                "use_trtllm_ragged_deepseek_prefill",
                "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
            ),
            ("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION"),
            ("disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL"),
            (
                "disable_flashinfer_q_quantization",
                "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
            ),
        )
        for field_name, env_var_name in env_overrides:
            self._set_from_env_if_set(field_name, env_var_name)

232
vllm/config/cache.py Normal file
View File

@@ -0,0 +1,232 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
from pydantic import Field, SkipValidation, field_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import get_cpu_memory
if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
else:
ParallelConfig = Any
# Module-level logger, created for this module's `__name__`.
logger = init_logger(__name__)
# Values named by the `CacheConfig.block_size` annotation.
BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
# Values accepted by `CacheConfig.cache_dtype`.
CacheDType = Literal[
    "auto",
    "bfloat16",
    "fp8",
    "fp8_e4m3",
    "fp8_e5m2",
    "fp8_inc",
    "fp8_ds_mla",
]
# Values accepted by `CacheConfig.mamba_cache_dtype` and
# `CacheConfig.mamba_ssm_cache_dtype`.
MambaDType = Literal["auto", "float32", "float16"]
# Values accepted by `CacheConfig.prefix_caching_hash_algo`.
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
# Values accepted by `CacheConfig.kv_offloading_backend`.
KVOffloadingBackend = Literal["native", "lmcache"]
@config
@dataclass
class CacheConfig:
    """Configuration for the KV cache."""

    block_size: SkipValidation[BlockSize] = None  # type: ignore
    """Size of a contiguous cache block in number of tokens. On CUDA devices,
    only block sizes up to 32 are supported.
    This config has no static default. If left unspecified by the user, it will
    be set in `Platform.check_and_update_config()` based on the current
    platform."""
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
    utilization. If unspecified, will use the default value of 0.9. This is a
    per-instance limit, and only applies to the current vLLM instance. It does
    not matter if you have another vLLM instance running on the same GPU. For
    example, if you have two vLLM instances running on the same GPU, you can
    set the GPU memory utilization to 0.5 for each instance."""
    swap_space: float = Field(default=4, ge=0)
    """Size of the CPU swap space per GPU (in GiB)."""
    cache_dtype: CacheDType = "auto"
    """Data type for kv cache storage. If "auto", will use model data type.
    CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
    fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).
    Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use
    bfloat16 instead, this is an invalid option for models that do not default
    to fp8.
    """
    is_attention_free: bool = False
    """Whether the model is attention-free. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    num_gpu_blocks_override: int | None = None
    """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
    if specified. Does nothing if `None`. Used for testing preemption."""
    sliding_window: int | None = None
    """Sliding window size for the KV cache. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
    """Set the hash algorithm for prefix caching:\n
    - "sha256" uses Pickle for object serialization before hashing. This is the
    current default, as SHA256 is the most secure choice to avoid potential
    hash collisions.\n
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
    serializes objects using canonical CBOR and hashes them with SHA-256.\n
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
    non-cryptographic hashing. Requires the optional ``xxhash`` package.
    IMPORTANT: Use of a hashing algorithm that is not considered
    cryptographically secure theoretically increases the risk of hash collisions,
    which can cause undefined behavior or even leak private information in
    multi-tenant environments. Even if collisions are still very unlikely, it is
    important to consider your security risk tolerance against the performance
    benefits before turning this on.\n
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
    reproducible hashing. Requires the optional ``xxhash`` package."""
    cpu_offload_gb: float = Field(default=0, ge=0)
    """The space in GiB to offload to CPU, per GPU. Default is 0, which means
    no offloading. Intuitively, this argument can be seen as a virtual way to
    increase the GPU memory size. For example, if you have one 24 GB GPU and
    set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
    load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
    Note that this requires fast CPU-GPU interconnect, as part of the model is
    loaded from CPU memory to GPU memory on the fly in each model forward pass.
    """
    calculate_kv_scales: bool = False
    """This enables dynamic calculation of `k_scale` and `v_scale` when
    kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
    checkpoint if available. Otherwise, the scales will default to 1.0."""
    cpu_kvcache_space_bytes: int | None = None
    """(CPU backend only) CPU key-value cache space."""
    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
    mamba_block_size: int | None = Field(default=None, gt=0)
    """Size of a contiguous cache block in number of tokens for mamba cache.
    Can be set only when prefix caching is enabled.
    Value must be a multiple of 8 to align with causal_conv1d kernel."""
    mamba_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (both the conv as well as the
    ssm state). If set to 'auto', the data type will be inferred from the model
    config."""
    mamba_ssm_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (ssm state only, conv state will
    still be controlled by mamba_cache_dtype). If set to 'auto', the data type
    for the ssm state will be determined by mamba_cache_dtype."""
    # Will be set after profiling.
    num_gpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for GPU memory."""
    num_cpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for CPU memory."""
    kv_sharing_fast_prefill: bool = False
    """This feature is work in progress and no prefill optimization takes place
    with this flag enabled currently.
    In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
    some layers can skip tokens corresponding to prefill. This flag enables
    attention metadata for eligible layers to be overridden with metadata
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """
    kv_cache_memory_bytes: int | None = None
    """Size of KV Cache per GPU in bytes. By default, this is set to None
    and vllm can automatically infer the kv cache size based on
    gpu_memory_utilization. However, users may want to manually specify
    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
    control of how much memory gets used when compared with using
    gpu_memory_utilization. Note that kv_cache_memory_bytes
    (when not-None) ignores gpu_memory_utilization"""
    kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
    to None, which means no KV offloading is enabled. When set with
    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
    kv_offloading_backend: KVOffloadingBackend | None = None
    """The backend to use for KV cache offloading. Supported backends include
    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
    together with kv_offloading_size."""

    def compute_hash(self) -> str:
        """
        Return a hash identifying the settings of this config that affect
        the structure of the computation graph (from input ids/embeddings
        to the final hidden states, excluding anything before or after).

        WARNING: when adding a field to this config, list it in the
        excluded names below unless it affects the computation graph.
        """
        from vllm.config.utils import get_hash_factors, hash_factors

        excluded_names = {
            # Runtime/derived knobs that don't affect compiled graph shape
            "gpu_memory_utilization",
            "swap_space",
            "is_attention_free",
            "num_gpu_blocks_override",
            "enable_prefix_caching",
            "prefix_caching_hash_algo",
            "cpu_kvcache_space_bytes",
            "mamba_page_size_padded",
            # Post-init/derived counters
            "num_gpu_blocks",
            "num_cpu_blocks",
            # WIP feature toggle not impacting compiled graph shape
            "kv_sharing_fast_prefill",
        }
        return hash_factors(get_hash_factors(self, excluded_names))

    def metrics_info(self):
        """Return every instance attribute as a `str -> str` mapping.

        The stringified form is what the prometheus metrics info expects.
        """
        info = {}
        for attr_name, attr_value in self.__dict__.items():
            info[attr_name] = str(attr_value)
        return info

    @field_validator("cache_dtype", mode="after")
    @classmethod
    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
        """Log an informational note for fp8 dtypes; the value is unchanged."""
        uses_fp8 = cache_dtype.startswith("fp8")
        if uses_fp8:
            logger.info(
                "Using fp8 data type to store kv cache. It reduces the GPU "
                "memory footprint and boosts the performance. "
                "Meanwhile, it may cause accuracy drop without a proper "
                "scaling factor."
            )
        return cache_dtype

    def verify_with_parallel_config(
        self,
        parallel_config: ParallelConfig,
    ) -> None:
        """Check the total swap space against the available CPU memory.

        Raises `ValueError` when the swap space summed over the tensor
        parallel group exceeds 70% of CPU memory, and logs a warning when it
        exceeds 40%.
        """
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        per_gpu_swap_bytes = self.swap_space * GiB_bytes
        total_cpu_memory = get_cpu_memory()
        cpu_memory_usage = per_gpu_swap_bytes * num_gpus_per_node

        msg = (
            f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
            f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
            "is allocated for the swap space."
        )
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        if cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. %s", msg)

1140
vllm/config/compilation.py Normal file

File diff suppressed because it is too large Load Diff

75
vllm/config/device.py Normal file
View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import field
from typing import Any, Literal
import torch
from pydantic import ConfigDict, SkipValidation
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
# String device names that appear in the `DeviceConfig.device` annotation
# (which additionally allows `torch.device` objects and `None`).
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Device | torch.device | None] = "auto"
    """Device type for vLLM execution.
    This parameter is deprecated and will be
    removed in a future release.
    It will now be set automatically based
    on the current platform."""
    device_type: str = field(init=False)
    """Device type from the current platform. This is set in
    `__post_init__`."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # the device/platform information will be summarized
        # by torch/vllm automatically.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self):
        """Resolve `device_type` and normalize `device`.

        `device="auto"` and `device=None` both take the device type from the
        current platform; a string or `torch.device` is used as given.
        Afterwards `device` becomes `None` for "tpu" and a `torch.device`
        built from `device_type` otherwise.

        Raises:
            RuntimeError: the current platform reports no device type.
            TypeError: `device` is not a str, `torch.device` or None.
        """
        # `None` is allowed by the annotation (and is what this method itself
        # stores for "tpu"), so it is resolved like "auto" instead of leaving
        # `device_type` unset.
        if self.device is None or self.device == "auto":
            # Automated device type detection
            from vllm.platforms import current_platform

            self.device_type = current_platform.device_type
            if not self.device_type:
                raise RuntimeError(
                    "Failed to infer device type, please set "
                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
                    "to turn on verbose logging to help debug the issue."
                )
        elif isinstance(self.device, str):
            # Device type is assigned explicitly
            self.device_type = self.device
        elif isinstance(self.device, torch.device):
            self.device_type = self.device.type
        else:
            # Without this branch `device_type` would stay unset and the
            # check below would fail with an opaque AttributeError.
            raise TypeError(
                "device must be a str, torch.device or None, "
                f"got {type(self.device).__name__}"
            )

        # Some device types require processing inputs on CPU
        if self.device_type in ["tpu"]:
            self.device = None
        else:
            # Set device with device type
            self.device = torch.device(self.device_type)

110
vllm/config/ec_transfer.py Normal file
View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import uuid
from dataclasses import field
from typing import Any, Literal, get_args
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
# Role value checked by `ECTransferConfig.is_ec_producer`.
ECProducer = Literal["ec_producer"]
# Role value checked by `ECTransferConfig.is_ec_consumer`.
ECConsumer = Literal["ec_consumer"]
# All values accepted for `ECTransferConfig.ec_role`.
ECRole = Literal[ECProducer, ECConsumer]
@config
@dataclass
class ECTransferConfig:
    """Configuration for distributed EC cache transfer."""

    ec_connector: str | None = None
    """The EC connector for vLLM to transmit EC caches between vLLM instances.
    """
    engine_id: str | None = None
    """The engine id for EC transfers."""
    ec_buffer_device: str | None = "cuda"
    """The device used by ec connector to buffer the EC cache.
    Currently only support 'cuda'."""
    ec_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""
    ec_role: ECRole | None = None
    """Whether this vLLM instance produces, consumes EC cache, or both. Choices
    are 'ec_producer', 'ec_consumer'."""
    ec_rank: int | None = None
    """The rank of this vLLM instance in the EC cache transfer. Typical value:
    0 for encoder, 1 for pd instance.
    Currently only 1P1D is supported."""
    ec_parallel_size: int = 1
    """The number of parallel instances for EC cache transfer. For
    PyNcclConnector, this should be 2."""
    ec_ip: str = "127.0.0.1"
    """The EC connector ip, used to build distributed connection."""
    ec_port: int = 14579
    """The EC connector port, used to build distributed connection."""
    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""
    ec_connector_module_path: str | None = None
    """The Python module path to dynamically load the EC connector from.
    Only supported in V1."""

    def compute_hash(self) -> str:
        """
        Return a hash identifying the settings of this config that affect
        the structure of the computation graph (from input ids/embeddings
        to the final hidden states, excluding anything before or after).

        WARNING: when adding a field to this config, include it in the
        factors list if it affects the computation graph.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        encoded_factors = str(factors).encode()
        return hashlib.md5(encoded_factors, usedforsecurity=False).hexdigest()

    def __post_init__(self) -> None:
        """Fill in a random `engine_id` if missing and validate `ec_role`.

        Raises `ValueError` for an unknown role, or when a connector is
        configured without any role.
        """
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        if self.ec_role is None:
            # A connector is only usable once a role has been chosen.
            if self.ec_connector is not None:
                raise ValueError(
                    "Please specify ec_role when ec_connector "
                    f"is set, supported roles are {get_args(ECRole)}"
                )
        elif self.ec_role not in get_args(ECRole):
            raise ValueError(
                f"Unsupported ec_role: {self.ec_role}. "
                f"Supported roles are {get_args(ECRole)}"
            )

    @property
    def is_ec_transfer_instance(self) -> bool:
        """True when a connector is set and `ec_role` is any valid role."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECRole)

    @property
    def is_ec_producer(self) -> bool:
        """True when a connector is set and `ec_role` is a producer role."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECProducer)

    @property
    def is_ec_consumer(self) -> bool:
        """True when a connector is set and `ec_role` is a consumer role."""
        if self.ec_connector is None:
            return False
        return self.ec_role in get_args(ECConsumer)

    def get_from_extra_config(self, key, default) -> Any:
        """Look up `key` in `ec_connector_extra_config`, else return `default`."""
        extra_config = self.ec_connector_extra_config
        return extra_config.get(key, default)

56
vllm/config/kv_events.py Normal file
View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Literal
from pydantic import Field
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
@config
@dataclass
class KVEventsConfig:
    """Configuration for KV event publishing."""

    enable_kv_cache_events: bool = False
    """If True, enable KV cache events for tracking block storage and removal.
    Events can be published externally by zmq using the event publisher config.
    """
    publisher: Literal["null", "zmq"] = Field(default=None)
    """The publisher to use for publishing kv events. Can be "null", "zmq".
    """
    endpoint: str = "tcp://*:5557"
    """The zmq endpoint to use for publishing kv events.
    """
    replay_endpoint: str | None = None
    """The zmq endpoint to use for replaying kv events.
    """
    buffer_steps: int = 10_000
    """The number of steps to cache for replay endpoint. Will only save
    events from the last N steps for the replay endpoint.
    """
    hwm: int = 100_000
    """The zmq high water mark for the event publisher. After queueing N events,
    events will start dropping if the consumer is not keeping up.
    """
    max_queue_size: int = 100_000
    """The maximum number of events to queue while waiting for publishing.
    """
    topic: str = ""
    """The topic to use for the event publisher. Consumers can subscribe to
    this topic to receive events.
    """

    def __post_init__(self):
        """Choose a publisher when none was given.

        An unset publisher becomes "zmq" if KV cache events are enabled and
        "null" otherwise; an explicit publisher is left alone.
        """
        if self.publisher is not None:
            return
        if self.enable_kv_cache_events:
            self.publisher = "zmq"
        else:
            self.publisher = "null"

119
vllm/config/kv_transfer.py Normal file
View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import uuid
from dataclasses import field
from typing import Any, Literal, get_args
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
# Role values checked by `KVTransferConfig.is_kv_producer`; "kv_both" is
# listed here and in `KVConsumer`, so it satisfies both checks.
KVProducer = Literal["kv_producer", "kv_both"]
# Role values checked by `KVTransferConfig.is_kv_consumer`.
KVConsumer = Literal["kv_consumer", "kv_both"]
# All values accepted for `KVTransferConfig.kv_role`.
KVRole = Literal[KVProducer, KVConsumer]
@config
@dataclass
class KVTransferConfig:
    """Configuration for distributed KV cache transfer."""

    kv_connector: str | None = None
    """The KV connector for vLLM to transmit KV caches between vLLM instances.
    """
    engine_id: str | None = None
    """The engine id for KV transfers."""
    kv_buffer_device: str = "cuda"
    """The device used by kv connector to buffer the KV cache. Choices are
    'cuda' and 'cpu'."""
    kv_buffer_size: float = 1e9
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""
    kv_role: KVRole | None = None
    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
    are 'kv_producer', 'kv_consumer', and 'kv_both'."""
    kv_rank: int | None = None
    """The rank of this vLLM instance in the KV cache transfer. Typical value:
    0 for prefill instance, 1 for decode instance.
    Currently only 1P1D is supported."""
    kv_parallel_size: int = 1
    """The number of parallel instances for KV cache transfer. For
    P2pNcclConnector, this should be 2."""
    kv_ip: str = "127.0.0.1"
    """The KV connector ip, used to build distributed connection."""
    kv_port: int = 14579
    """The KV connector port, used to build distributed connection."""
    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""
    kv_connector_module_path: str | None = None
    """The Python module path to dynamically load the KV connector from.
    Only supported in V1."""
    enable_permute_local_kv: bool = False
    """Experiment feature flag to enable HND to NHD KV Transfer"""
    kv_load_failure_policy: Literal["recompute", "fail"] = "recompute"
    """Policy for handling KV cache load failures.
    'recompute': reschedule the request to recompute failed blocks (default)
    'fail': immediately fail the request with an error finish reason"""

    def compute_hash(self) -> str:
        """
        Return a hash identifying the settings of this config that affect
        the structure of the computation graph (from input ids/embeddings
        to the final hidden states, excluding anything before or after).

        WARNING: when adding a field to this config, include it in the
        factors list if it affects the computation graph.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        encoded_factors = str(factors).encode()
        return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

    def __post_init__(self) -> None:
        """Fill in a random `engine_id` if missing and validate `kv_role`.

        Raises `ValueError` for an unknown role, or when a connector is
        configured without any role.
        """
        if self.engine_id is None:
            self.engine_id = str(uuid.uuid4())

        if self.kv_role is None:
            # A connector is only usable once a role has been chosen.
            if self.kv_connector is not None:
                raise ValueError(
                    "Please specify kv_role when kv_connector "
                    f"is set, supported roles are {get_args(KVRole)}"
                )
        elif self.kv_role not in get_args(KVRole):
            raise ValueError(
                f"Unsupported kv_role: {self.kv_role}. "
                f"Supported roles are {get_args(KVRole)}"
            )

    @property
    def is_kv_transfer_instance(self) -> bool:
        """True when a connector is set and `kv_role` is any valid role."""
        if self.kv_connector is None:
            return False
        return self.kv_role in get_args(KVRole)

    @property
    def is_kv_producer(self) -> bool:
        """True when a connector is set and `kv_role` is a producer role."""
        if self.kv_connector is None:
            return False
        return self.kv_role in get_args(KVProducer)

    @property
    def is_kv_consumer(self) -> bool:
        """True when a connector is set and `kv_role` is a consumer role."""
        if self.kv_connector is None:
            return False
        return self.kv_role in get_args(KVConsumer)

    def get_from_extra_config(self, key, default) -> Any:
        """Look up `key` in `kv_connector_extra_config`, else return `default`."""
        extra_config = self.kv_connector_extra_config
        return extra_config.get(key, default)

124
vllm/config/load.py Normal file
View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any
from pydantic import Field, field_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.model_executor.model_loader import LoadFormats
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
else:
LoadFormats = Any
TensorizerConfig = Any
# Module-level logger, created for this module's `__name__`; used by
# `LoadConfig` to report non-default ignore patterns.
logger = init_logger(__name__)
@config
@dataclass
class LoadConfig:
"""Configuration for loading the model weights."""
load_format: str | LoadFormats = "auto"
"""The format of the model weights to load:\n
- "auto" will try to load the weights in the safetensors format and fall
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
for profiling.\n
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
loading. See the Tensorize vLLM Model script in the Examples section for
more information.\n
- "runai_streamer" will load the Safetensors weights using Run:ai Model
Streamer.\n
- "runai_streamer_sharded" will load weights from pre-sharded checkpoint
files using Run:ai Model Streamer.\n
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
- "sharded_state" will load weights from pre-sharded checkpoint files,
supporting efficient loading of tensor-parallel models.\n
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
- Other custom values can be supported via plugins."""
download_dir: str | None = None
"""Directory to download and load the weights, default to the default
cache directory of Hugging Face."""
safetensors_load_strategy: str = "lazy"
"""Specifies the loading strategy for safetensors weights.
- "lazy" (default): Weights are memory-mapped from the file. This enables
on-demand loading and is highly efficient for models on local storage.
- "eager": The entire file is read into CPU memory upfront before loading.
This is recommended for models on network filesystems (e.g., Lustre, NFS)
as it avoids inefficient random reads, significantly speeding up model
initialization. However, it uses more CPU RAM.
- "torchao": Weights are loaded in upfront and then reconstructed
into torchao tensor subclasses. This is used when the checkpoint
was quantized using torchao and saved using safetensors.
Needs torchao >= 0.14.0
"""
model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format."""
device: str | None = None
"""Device to which model weights will be loaded, default to
device_config.device"""
ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
use_tqdm_on_load: bool = True
"""Whether to enable tqdm for showing progress bar when loading model
weights."""
pt_load_map_location: str | dict[str, str] = "cpu"
"""
pt_load_map_location: the map location for loading pytorch checkpoint, to
support loading checkpoints can only be loaded on certain devices like
"cuda", this is equivalent to {"": "cuda"}. Another supported format is
mapping from different devices like from GPU 1 to GPU 0:
{"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
in dictionary needs to be double quoted for json parsing. For more details,
see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
"""
def compute_hash(self) -> str:
    """
    Hash the settings of this config that shape the computation graph.

    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    The hash is meant to uniquely identify every setting that affects
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.
    """
    # Nothing in this config influences the computation graph, so the
    # list of factors is intentionally empty.
    graph_factors: list[Any] = []
    encoded = str(graph_factors).encode()
    return safe_hash(encoded, usedforsecurity=False).hexdigest()
@field_validator("load_format", mode="after")
@classmethod
def _lowercase_load_format(cls, load_format: str) -> str:
    """Normalize `load_format` to lower case.

    Declared as an explicit classmethod, matching how pydantic documents
    field validators and how the other validators in this package are
    written.

    Args:
        load_format: The already-validated load format string.

    Returns:
        The same string, lower-cased.
    """
    return load_format.lower()
@field_validator("ignore_patterns", mode="after")
@classmethod
def _validate_ignore_patterns(
    cls, ignore_patterns: list[str] | str
) -> list[str] | str:
    """Log the ignore patterns when they differ from the default.

    Nothing is logged for an empty value or for the default
    `["original/**/*"]`; the value itself is never modified.

    Args:
        ignore_patterns: The already-validated pattern or list of patterns.

    Returns:
        `ignore_patterns`, unchanged.
    """
    # Truthiness covers both an empty list and an empty string.
    if ignore_patterns and ignore_patterns != ["original/**/*"]:
        logger.info(
            "Ignoring the following patterns when downloading weights: %s",
            ignore_patterns,
        )
    return ignore_patterns

96
vllm/config/lora.py Normal file
View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Literal
import torch
from pydantic import ConfigDict, Field, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.cache import CacheConfig
else:
ModelConfig = Any
CacheConfig = Any
logger = init_logger(__name__)
# String spellings accepted by `LoRAConfig.lora_dtype` (a `torch.dtype` is
# accepted there as well).
LoRADType = Literal["auto", "float16", "bfloat16"]
# Values accepted by `LoRAConfig.max_lora_rank`.
MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
# NOTE(review): not referenced by anything visible in this module; it may be
# imported from elsewhere -- confirm before removing.
LoRAExtraVocabSize = Literal[256, 512]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
    """Configuration for LoRA."""

    max_lora_rank: MaxLoRARanks = 16
    """Max LoRA rank."""
    max_loras: int = Field(default=1, ge=1)
    """Max number of LoRAs in a single batch."""
    fully_sharded_loras: bool = False
    """By default, only half of the LoRA computation is sharded with tensor
    parallelism. Enabling this will use the fully sharded layers. At high
    sequence length, max rank or tensor parallel size, this is likely faster.
    """
    max_cpu_loras: int | None = None
    """Maximum number of LoRAs to store in CPU memory. Must be >= than
    `max_loras`."""
    lora_dtype: torch.dtype | LoRADType = "auto"
    """Data type for LoRA. If auto, will default to base model dtype."""
    default_mm_loras: dict[str, str] | None = None
    """Dictionary mapping specific modalities to LoRA model paths; this field
    is only applicable to multimodal models and should be leveraged when a
    model always expects a LoRA to be active when a given modality is present.
    Note that currently, if a request provides multiple additional
    modalities, each of which have their own LoRA, we do NOT apply
    default_mm_loras because we currently only support one lora adapter
    per prompt. When run in offline mode, the lora IDs for n modalities
    will be automatically assigned to 1-n with the names of the modalities
    in alphabetic order."""

    def compute_hash(self) -> str:
        """
        Hash the LoRA settings that shape the computation graph.

        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        The hash is meant to uniquely identify every setting that affects
        the structure of the computation graph from input ids/embeddings
        to the final hidden states, excluding anything before input
        ids/embeddings and after the final hidden states.
        """
        # Order matters: the list is stringified before hashing.
        graph_factors: list[Any] = [
            self.max_lora_rank,
            self.max_loras,
            self.fully_sharded_loras,
            self.lora_dtype,
        ]
        digest = safe_hash(str(graph_factors).encode(), usedforsecurity=False)
        return digest.hexdigest()

    @model_validator(mode="after")
    def _validate_lora_config(self) -> Self:
        """Default `max_cpu_loras` to `max_loras` and enforce the lower bound.

        Raises:
            ValueError: If `max_cpu_loras` is set but smaller than
                `max_loras`.
        """
        cpu_slots = self.max_cpu_loras
        if cpu_slots is None:
            # Unset: keep as many adapters on CPU as fit in one batch.
            self.max_cpu_loras = self.max_loras
            return self
        if cpu_slots < self.max_loras:
            raise ValueError(
                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                f"max_loras ({self.max_loras})"
            )
        return self

    def verify_with_model_config(self, model_config: ModelConfig):
        """Resolve `lora_dtype` into a concrete dtype.

        `None`/`"auto"` take `model_config.dtype`; any other string is
        looked up as an attribute of `torch`. Values that are neither are
        left untouched.
        """
        requested = self.lora_dtype
        if requested in (None, "auto"):
            self.lora_dtype = model_config.dtype
            return
        if isinstance(requested, str):
            self.lora_dtype = getattr(torch, requested)

2190
vllm/config/model.py Normal file

File diff suppressed because it is too large Load Diff

247
vllm/config/multimodal.py Normal file
View File

@@ -0,0 +1,247 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping
from typing import Any, Literal, TypeAlias
from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
@dataclass
class BaseDummyOptions:
    """Modality-independent options for the dummy data used in profiling."""

    # Number of dummy items; validated to be non-negative.
    count: int = Field(default=999, ge=0)
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
    """Dummy-data options for the video modality during profiling."""

    # Each dimension is optional; when given it must be strictly positive.
    num_frames: int | None = Field(default=None, gt=0)
    width: int | None = Field(default=None, gt=0)
    height: int | None = Field(default=None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
    """Dummy-data options for the image modality during profiling."""

    # Each dimension is optional; when given it must be strictly positive.
    width: int | None = Field(default=None, gt=0)
    height: int | None = Field(default=None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
    """Dummy-data options for the audio modality during profiling."""

    # Optional; when given it must be strictly positive.
    length: int | None = Field(default=None, gt=0)
# Values accepted by `MultiModalConfig.mm_encoder_tp_mode`.
MMEncoderTPMode = Literal["weights", "data"]
# Values accepted by `MultiModalConfig.mm_processor_cache_type`.
MMCacheType = Literal["shm", "lru"]
# Union of every per-modality options class; this is the value type of
# `MultiModalConfig.limit_per_prompt`.
DummyOptions: TypeAlias = (
BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
)
@config
@dataclass
class MultiModalConfig:
    """Controls the behavior of multimodal models."""

    limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
    """The maximum number of input items and options allowed per
    prompt for each modality.
    Defaults to 999 for each modality.
    Legacy format (count only):
    {"image": 16, "video": 2}
    Configurable format (with options):
    {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
    "image": {"count": 5, "width": 512, "height": 512}}
    Mixed format (combining both):
    {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
    "height": 512}}
    """
    enable_mm_embeds: bool = False
    """If `True`, enables passing multimodal embeddings:
    for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
    for the OpenAI-compatible server, this refers to chat messages with content
    `"type": "*_embeds"`.
    WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
    Only enable this flag for trusted users!"""
    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
    mm_processor_kwargs: dict[str, object] | None = None
    """Arguments to be forwarded to the model's processor for multi-modal data,
    e.g., image processor. Overrides for the multi-modal processor obtained
    from `transformers.AutoProcessor.from_pretrained`.
    The available overrides depend on the model that is being run.
    For example, for Phi-3-Vision:
    `{"num_crops": 4}`."""
    mm_processor_cache_gb: float = Field(default=4, ge=0)
    """The size (in GiB) of the multi-modal processor cache, which is used to
    avoid re-processing past multi-modal inputs.
    This cache is duplicated for each API process and engine core process,
    resulting in a total memory usage of
    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
    Set to `0` to disable this cache completely (not recommended)."""
    mm_processor_cache_type: MMCacheType = "lru"
    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
    mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
    """Size limit (in MiB) for each object stored in the multi-modal processor
    shared memory cache. Only effective when `mm_processor_cache_type` is
    `"shm"`."""
    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
    """Indicates how to optimize multi-modal encoder inference using tensor
    parallelism (TP).
    - `"weights"`: Within the same vLLM engine, split the weights of
    each layer across TP ranks. (default TP behavior)\n
    - `"data"`: Within the same vLLM engine, split the batched input data
    across TP ranks to process the data in parallel, while hosting
    the full weights on each TP rank.
    This batch-level DP is not to be confused with API request-level
    DP (which is controlled by `--data-parallel-size`).
    This is only supported on a per-model basis and falls back to
    `"weights"` if the encoder does not support DP."""
    mm_encoder_attn_backend: AttentionBackendEnum | None = None
    """Optional override for the multi-modal encoder attention backend when
    using vision transformers. Accepts any value from
    `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
    interleave_mm_strings: bool = False
    """Enable fully interleaved support for multimodal prompts, while using
    --chat-template-content-format=string."""
    skip_mm_profiling: bool = False
    """When enabled, skips multimodal memory profiling and only profiles with
    language backbone model during engine initialization.
    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
    video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
    """

    @field_validator("limit_per_prompt", mode="before")
    @classmethod
    def _validate_limit_per_prompt(
        cls, value: dict[str, int | dict[str, int]]
    ) -> dict[str, DummyOptions]:
        """Normalize every per-modality limit into a `DummyOptions` object.

        Accepts the legacy `{"image": 16}` form, the option-dict form, and
        values that are already option objects (e.g. when a config is
        rebuilt from the field values of an existing one).

        Args:
            value: Raw mapping from modality name to a count, an options
                dict, or an existing options object.

        Returns:
            A new mapping from modality name to options object. The input
            mapping is left unmodified.
        """
        options_by_modality: dict[str, type[BaseDummyOptions]] = {
            "video": VideoDummyOptions,
            "image": ImageDummyOptions,
            "audio": AudioDummyOptions,
        }
        # Build a fresh dict so the caller's mapping is never rewritten.
        normalized: dict[str, DummyOptions] = {}
        for modality, spec in value.items():
            if isinstance(spec, BaseDummyOptions):
                # Already normalized; `**spec` would fail on a dataclass.
                normalized[modality] = spec
                continue
            # Handle legacy format where only count is specified
            if isinstance(spec, int):
                spec = {"count": spec}
            # Unknown modalities fall back to the count-only base options.
            options_cls = options_by_modality.get(modality, BaseDummyOptions)
            normalized[modality] = options_cls(**spec)
        return normalized

    @field_validator("mm_encoder_attn_backend", mode="before")
    @classmethod
    def _validate_mm_encoder_attn_backend(
        cls, value: str | AttentionBackendEnum | None
    ) -> AttentionBackendEnum | None:
        """Convert a backend name into its `AttentionBackendEnum` member.

        Args:
            value: `None`, an enum member, or a case-insensitive member name.

        Returns:
            `None` or the matching enum member.

        Raises:
            ValueError: If the removed 'XFORMERS' backend is requested, or
                if `value` is neither a string, an enum member, nor `None`.
            KeyError: If the string does not name an enum member.
        """
        if isinstance(value, str) and value.upper() == "XFORMERS":
            raise ValueError(
                "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
                "details). Please select a supported attention backend."
            )
        if value is None or isinstance(value, AttentionBackendEnum):
            return value
        # Explicit raise instead of `assert`, so the check is not stripped
        # when Python runs with -O.
        if not isinstance(value, str):
            raise ValueError(
                "mm_encoder_attn_backend must be a string or an AttentionBackendEnum."
            )
        return AttentionBackendEnum[value.upper()]

    @model_validator(mode="after")
    def _validate_multimodal_config(self):
        """Reject a custom shm object-size limit when the cache is not `shm`.

        Raises:
            ValueError: If `mm_shm_cache_max_object_size_mb` differs from the
                class-level value while `mm_processor_cache_type` is not
                `"shm"`.
        """
        if self.mm_processor_cache_type != "shm" and (
            self.mm_shm_cache_max_object_size_mb
            != MultiModalConfig.mm_shm_cache_max_object_size_mb
        ):
            raise ValueError(
                "'mm_shm_cache_max_object_size_mb' should only be set when "
                "'mm_processor_cache_type' is 'shm'."
            )
        return self

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # Only the encoder attention backend (by name) is hashed.
        factors: list[Any] = [
            self.mm_encoder_attn_backend.name
            if self.mm_encoder_attn_backend is not None
            else None
        ]
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def get_limit_per_prompt(self, modality: str) -> int:
        """
        Get the maximum number of input items allowed per prompt
        for the given modality (backward compatible).
        """
        limit_data = self.limit_per_prompt.get(modality)
        if limit_data is None:
            # Unspecified modality is set to 999 by default
            return 999
        return limit_data.count

    def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
        """
        Get the configurable dummy data options for a modality.
        Returns None if no options are configured for this modality.
        """
        # All values are now DummyOptions after normalization
        return self.limit_per_prompt.get(modality)

    def merge_mm_processor_kwargs(
        self,
        inference_kwargs: Mapping[str, object],
    ) -> dict[str, object]:
        """
        Get the keyword arguments to pass to the multi-modal processor
        according to the extra arguments passed during inference.

        Keys in `inference_kwargs` override those in `mm_processor_kwargs`;
        a new dict is returned.
        """
        kwargs = self.mm_processor_kwargs or {}
        return kwargs | dict(inference_kwargs)

    def is_multimodal_pruning_enabled(self):
        """Return True when `video_pruning_rate` is set and greater than 0."""
        return self.video_pruning_rate is not None and self.video_pruning_rate > 0

View File

@@ -0,0 +1,140 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from functools import cached_property
from typing import Any, Literal, cast
from packaging.version import parse
from pydantic import Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from vllm import version
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
# Module names accepted in `ObservabilityConfig.collect_detailed_traces`.
DetailedTraceModules = Literal["model", "worker", "all"]
@config
@dataclass
class ObservabilityConfig:
    """Configuration for observability - metrics and tracing."""

    show_hidden_metrics_for_version: str | None = None
    """Enable deprecated Prometheus metrics that have been hidden since the
    specified version. For example, if a previously deprecated metric has been
    hidden since the v0.7.0 release, you use
    `--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
    you migrate to new metrics. The metric is likely to be removed completely
    in an upcoming release."""

    @cached_property
    def show_hidden_metrics(self) -> bool:
        """Check if the hidden metrics should be shown."""
        if self.show_hidden_metrics_for_version is None:
            return False
        return version._prev_minor_version_was(self.show_hidden_metrics_for_version)

    otlp_traces_endpoint: str | None = None
    """Target URL to which OpenTelemetry traces will be sent."""
    collect_detailed_traces: list[DetailedTraceModules] | None = None
    """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
    set, it will collect detailed traces for the specified modules. This
    involves use of possibly costly and or blocking operations and hence might
    have a performance impact.
    Note that collecting detailed timing information for each request can be
    expensive."""
    kv_cache_metrics: bool = False
    """Enable KV cache residency metrics (lifetime, idle time, reuse gaps).
    Uses sampling to minimize overhead.
    Requires log stats to be enabled (i.e., --disable-log-stats not set)."""
    kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
    """Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
    cudagraph_metrics: bool = False
    """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
    dispatch modes, and their observed frequencies at every logging interval)."""
    enable_layerwise_nvtx_tracing: bool = False
    """Enable layerwise NVTX tracing. This traces the execution of each layer or
    module in the model and attaches information such as input/output shapes to
    nvtx range markers. Note that this doesn't work with CUDA graphs enabled."""

    @cached_property
    def collect_model_forward_time(self) -> bool:
        """Whether to collect model forward time for the request."""
        return self.collect_detailed_traces is not None and (
            "model" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    @cached_property
    def collect_model_execute_time(self) -> bool:
        """Whether to collect model execute time for the request."""
        return self.collect_detailed_traces is not None and (
            "worker" in self.collect_detailed_traces
            or "all" in self.collect_detailed_traces
        )

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("show_hidden_metrics_for_version")
    @classmethod
    def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
        """Ensure the value, when given, parses as a version string."""
        if value is not None:
            # Raises an exception if the string is not a valid version.
            parse(value)
        return value

    @field_validator("otlp_traces_endpoint")
    @classmethod
    def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
        """Require OpenTelemetry to be importable when an endpoint is set.

        Raises:
            ValueError: If an endpoint is given but `is_otel_available()`
                reports that OpenTelemetry is unavailable.
        """
        if value is not None:
            # Imported lazily, only when tracing is actually requested.
            from vllm.tracing import is_otel_available, otel_import_error_traceback

            if not is_otel_available():
                raise ValueError(
                    "OpenTelemetry is not available. Unable to configure "
                    "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                    f"installed. Original error:\n{otel_import_error_traceback}"
                )
        return value

    @field_validator("collect_detailed_traces", mode="before")
    @classmethod
    def _validate_collect_detailed_traces(
        cls, value: list[DetailedTraceModules] | str | None
    ) -> list[DetailedTraceModules] | None:
        """Handle the legacy case where users might provide a comma-separated
        string instead of a list of strings.

        This must run in "before" mode: in "after" mode the raw value
        (e.g. `["model,worker"]`) is checked against the allowed module
        names first and rejected, so the split below would never run.
        The returned list is still validated by pydantic afterwards.
        """
        if isinstance(value, str):
            # A bare string is treated like a one-element list.
            value = cast(list[DetailedTraceModules], [value])
        if (
            isinstance(value, (list, tuple))
            and len(value) == 1
            and isinstance(value[0], str)
            and "," in value[0]
        ):
            value = cast(list[DetailedTraceModules], value[0].split(","))
        return value

    @model_validator(mode="after")
    def _validate_tracing_config(self):
        """Require an OTLP endpoint whenever detailed traces are requested.

        Raises:
            ValueError: If `collect_detailed_traces` is non-empty while
                `otlp_traces_endpoint` is unset or empty.
        """
        if self.collect_detailed_traces and not self.otlp_traces_endpoint:
            raise ValueError(
                "collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
            )
        return self

660
vllm/config/parallel.py Normal file
View File

@@ -0,0 +1,660 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import TYPE_CHECKING, Any, Literal
import torch
from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import (
vllm_is_batch_invariant,
)
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_ports_list
from vllm.utils.torch_utils import cuda_device_count_stateless
if TYPE_CHECKING:
from ray.runtime_env import RuntimeEnv
from ray.util.placement_group import PlacementGroup
from vllm.v1.executor import Executor
else:
RuntimeEnv = Any
PlacementGroup = Any
Executor = Any
logger = init_logger(__name__)
# Values accepted by `ParallelConfig.expert_placement_strategy`.
ExpertPlacementStrategy = Literal["linear", "round_robin"]
# Named string options for `ParallelConfig.distributed_executor_backend`
# (that field additionally accepts any str, an Executor subclass, or None).
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
# Values accepted by `ParallelConfig.data_parallel_backend`.
DataParallelBackend = Literal["ray", "mp"]
# Values accepted by `EPLBConfig.policy`; "default" is the only option.
EPLBPolicyOption = Literal["default"]
# Expert-parallel load-balancing settings; `ParallelConfig` holds an instance
# in its `eplb_config` field.
# NOTE(review): the class docstring abbreviates the feature as "(EP)";
# presumably "EPLB" is meant -- confirm.
@config
@dataclass
class EPLBConfig:
"""Configuration for Expert Parallel Load Balancing (EP)."""
window_size: int = 1000
"""Window size for expert load recording."""
# NOTE(review): the docstring below mentions `lb_window_size`, which is not a
# field of this class; it presumably refers to `window_size` -- confirm.
step_interval: int = 3000
"""
Interval for rearranging experts in expert parallelism.
Note that if this is greater than the EPLB window size, only the metrics
of the last `lb_window_size` steps will be used for rearranging experts.
"""
# Validated to be >= 0. `ParallelConfig._validate_parallel_config` rejects a
# non-zero value while `enable_eplb` is False.
num_redundant_experts: int = Field(default=0, ge=0)
"""Number of redundant experts to use for expert parallelism."""
log_balancedness: bool = False
"""
Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead.
"""
use_async: bool = False
"""
Whether to use non-blocking EPLB.
"""
# Restricted to the values of `EPLBPolicyOption`.
policy: EPLBPolicyOption = "default"
"""The policy type for expert parallel load balancing (EPLB)."""
@config
@dataclass
class ParallelConfig:
"""Configuration for the distributed execution."""
pipeline_parallel_size: int = 1
"""Number of pipeline parallel groups."""
tensor_parallel_size: int = 1
"""Number of tensor parallel groups."""
prefill_context_parallel_size: int = 1
"""Number of prefill context parallel groups."""
data_parallel_size: int = 1
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
data_parallel_size_local: int = 1
"""Number of local data parallel groups."""
data_parallel_rank: int = 0
"""Rank of the data parallel group."""
data_parallel_rank_local: int | None = None
"""Local rank of the data parallel group,
set only in SPMD mode."""
data_parallel_master_ip: str = "127.0.0.1"
"""IP of the data parallel master."""
data_parallel_rpc_port: int = 29550
"""Port for data parallel messaging."""
data_parallel_master_port: int = 29500
"""Port of the data parallel master."""
data_parallel_backend: DataParallelBackend = "mp"
"""Backend to use for data parallel, either "mp" or "ray"."""
data_parallel_external_lb: bool = False
"""Whether to use "external" DP LB mode. Applies only to online serving
and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
is provided explicitly to vllm serve."""
data_parallel_hybrid_lb: bool = False
"""Whether to use "hybrid" DP LB mode. Applies only to online serving
and when data_parallel_size > 0. Enables running an AsyncLLM
and API server on a "per-node" basis where vLLM load balances
between local data parallel ranks, but an external LB balances
between vLLM nodes/replicas. Set explicitly in conjunction with
--data-parallel-start-rank."""
enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
enable_eplb: bool = False
"""Enable expert parallelism load balancing for MoE layers."""
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
"""Expert parallelism configuration."""
expert_placement_strategy: ExpertPlacementStrategy = "linear"
"""The expert placement strategy for MoE layers:\n
- "linear": Experts are placed in a contiguous manner. For example, with 4
experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
experts [2, 3].\n
- "round_robin": Experts are placed in a round-robin manner. For example,
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
will have experts [1, 3]. This strategy can help improve load balancing
for grouped expert models with no redundant experts."""
all2all_backend: (
Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
"flashinfer_all2allv",
]
| None
) = None
"""All2All backend for MoE expert parallel communication. If not set, uses
the value from VLLM_ALL2ALL_BACKEND environment variable. Available options:
- "naive": Naive all2all implementation using broadcasts
- "allgather_reducescatter": All2all based on allgather and reducescatter
- "pplx": Use pplx kernels
- "deepep_high_throughput": Use deepep high-throughput kernels
- "deepep_low_latency": Use deepep low-latency kernels
- "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
max_parallel_loading_workers: int | None = None
"""Maximum number of parallel loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models."""
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_dbo: bool = False
"""Enable dual batch overlap for the model executor."""
dbo_decode_token_threshold: int = 32
"""The threshold for dual batch overlap for batches only containing decodes.
If the number of tokens in the request is greater than this threshold,
microbatching will be used. Otherwise, the request will be processed in a
single batch."""
dbo_prefill_token_threshold: int = 512 # TODO(lucas): tune
"""The threshold for dual batch overlap for batches that contain one or more
prefills. If the number of tokens in the request is greater than this
threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch."""
disable_nccl_for_dp_synchronization: bool = False
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce"""
ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
ray_runtime_env: RuntimeEnv | None = None
"""Ray runtime environment to pass to distributed workers."""
placement_group: PlacementGroup | None = None
"""ray distributed model workers placement group."""
distributed_executor_backend: (
str | DistributedExecutorBackend | type[Executor] | None
) = None
"""Backend to use for distributed model workers, either "ray" or "mp"
(multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
is less than or equal to the number of GPUs available, "mp" will be used to
keep processing on a single host. Otherwise, an error will be raised. To use "mp"
you must also set nnodes, and to use "ray" you must manually set
distributed_executor_backend to "ray".
Note that tpu only support Ray for distributed inference."""
worker_cls: str = "auto"
"""The full name of the worker class to use. If "auto", the worker class
will be determined based on the platform."""
sd_worker_cls: str = "auto"
"""The full name of the worker class to use for speculative decoding.
If "auto", the worker class will be determined based on the platform."""
worker_extension_cls: str = ""
"""The full name of the worker extension class to use. The worker extension
class is dynamically inherited by the worker class. This is used to inject
new attributes and methods to the worker class for use in collective_rpc
calls."""
master_addr: str = "127.0.0.1"
"""distributed master address for multi-node distributed
inference when distributed_executor_backend is mp."""
master_port: int = 29501
"""distributed master port for multi-node distributed
inference when distributed_executor_backend is mp."""
node_rank: int = 0
"""distributed node rank for multi-node distributed
inference when distributed_executor_backend is mp."""
nnodes: int = 1
"""num of nodes for multi-node distributed
inference when distributed_executor_backend is mp."""
world_size: int = Field(init=False)
"""world_size is TPxPP, it affects the number of workers we create."""
rank: int = 0
"""Global rank in distributed setup."""
_data_parallel_master_port_list: list[int] = Field(default_factory=list)
"""List of open port auto-queried for data parallel messaging.
Set to be private as it's not intended to be configured by users.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
needs to be divisible by dcp_size."""
dcp_kv_cache_interleave_size: int = 1
"""
Interleave size of kv_cache storage while using DCP.
dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
and will be deprecated when PCP is fully supported.
"""
cp_kv_cache_interleave_size: int = 1
"""Interleave size of kv_cache storage while using DCP or PCP.
For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
and `total_cp_world_size = pcp_world_size * dcp_world_size`.
store interleave_size tokens on total_cp_rank i,
then store next interleave_size tokens on total_cp_rank i+1.
Interleave_size=1: token-level alignment, where token `i` is stored on
total_cp_rank `i % total_cp_world_size`.
Interleave_size=block_size: block-level alignment, where tokens are
first populated to the preceding ranks. Tokens are then stored
in (rank i+1, block j) only after (rank i, block j) is fully occupied.
Block_size should be greater than or equal to cp_kv_cache_interleave_size.
Block_size should be divisible by cp_kv_cache_interleave_size.
"""
_api_process_count: int = Field(default=1, gt=0)
"""
The number of API processes initialized.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
_api_process_rank: int = Field(default=0, ge=-1)
"""
The rank of this API process, or `-1` for engine core processes
under API server scale-out.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
@model_validator(mode="after")
def _validate_parallel_config(self) -> Self:
    """Cross-field consistency checks, run after field validation.

    Checks, in order: the API process rank against the process count, the
    local DP size against the global DP size, the external-LB flag against
    the DP size, and finally the EPLB prerequisites.

    Raises:
        ValueError: On the first check that fails.
    """
    api_rank = self._api_process_rank
    api_count = self._api_process_count
    if api_rank >= api_count:
        raise ValueError(
            "Invalid value of `_api_process_rank`. "
            f"Expected to be `-1` or `[0, {self._api_process_count})`, "
            f"but found: {self._api_process_rank}"
        )

    dp_size = self.data_parallel_size
    if self.data_parallel_size_local > dp_size:
        raise ValueError(
            f"data_parallel_size_local ({self.data_parallel_size_local}) "
            f"must be <= data_parallel_size ({self.data_parallel_size})"
        )

    if dp_size <= 1 and self.data_parallel_external_lb:
        raise ValueError(
            "data_parallel_external_lb can only be set when data_parallel_size > 1"
        )

    if not self.enable_eplb:
        # Redundant experts only make sense together with EPLB.
        if self.eplb_config.num_redundant_experts != 0:
            raise ValueError(
                "num_redundant_experts is set to "
                f"{self.eplb_config.num_redundant_experts} but EPLB is not "
                "enabled. Either enable EPLB or unset "
                "num_redundant_experts."
            )
        return self

    # From here on EPLB is enabled; check its prerequisites.
    if not current_platform.is_cuda_alike():
        raise ValueError(
            "Expert parallelism load balancing is only supported on "
            "CUDA devices or ROCm devices now."
        )
    if not self.enable_expert_parallel:
        raise ValueError("enable_expert_parallel must be True to use EPLB.")
    if self.tensor_parallel_size * self.data_parallel_size <= 1:
        raise ValueError(
            "EPLB requires tensor_parallel_size or data_parallel_size "
            f"to be greater than 1, but got "
            f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
        )
    return self
@property
def world_size_across_dp(self) -> int:
    """Total world size including data parallelism (TPxPPxDP).

    Computed as `world_size` multiplied by `data_parallel_size`.
    """
    per_replica = self.world_size
    replicas = self.data_parallel_size
    return per_replica * replicas
def get_next_dp_init_port(self) -> int:
    """
    Hand out a port for initializing a data-parallel process group.

    Process groups related to data parallelism may have to be created in
    several processes (e.g. both the worker and the engine). To avoid
    port conflicts, each call yields a different port: one is popped from
    the prepared port list while it has entries; once it is empty,
    `data_parallel_master_port` is returned and then incremented.
    """
    prepared_ports = self._data_parallel_master_port_list
    if not prepared_ports:
        port = self.data_parallel_master_port
        # Advance the fallback so the next call gets a fresh port.
        self.data_parallel_master_port = port + 1
        return port
    return prepared_ports.pop()
def stateless_init_dp_group(self) -> ProcessGroup:
    """Create the data-parallel process group, retrying on port clashes.

    Up to five attempts are made, each with a port obtained from
    `get_next_dp_init_port()`. Only a `DistNetworkError` whose message
    contains "EADDRINUSE" triggers a retry; any other such error is
    re-raised immediately, and the last EADDRINUSE error is raised once
    the attempts are exhausted.
    """
    # NOTE: In high-concurrency scenarios several processes can pick the
    # same (currently free) port. The first one to bind it wins and the
    # others fail with `torch.distributed.DistNetworkError: EADDRINUSE`,
    # which is why that specific failure is retried with a fresh port.
    from torch.distributed import DistNetworkError

    from vllm.distributed.utils import (
        stateless_init_torch_distributed_process_group,
    )

    max_retries = 5
    last_exc: Exception | None = None
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            # The backend is whatever the current platform reports as its
            # distributed backend.
            return stateless_init_torch_distributed_process_group(
                self.data_parallel_master_ip,
                self.get_next_dp_init_port(),
                self.data_parallel_rank,
                self.data_parallel_size,
                backend=current_platform.dist_backend,
            )
        except DistNetworkError as err:
            # Anything other than EADDRINUSE is not retried.
            if "EADDRINUSE" not in str(err):
                raise
            logger.warning("Address already in use. Retrying with a new port.")
            last_exc = err

    # Reaching this point means every attempt hit EADDRINUSE.
    assert last_exc is not None
    raise last_exc
@property
def use_sequence_parallel_moe(self) -> bool:
    """Whether the input to the MoE experts should be sequence parallel.

    The all_reduce at the end of attention (during o_proj) means that
    inputs are replicated across each rank of the tensor parallel group.
    If using expert parallelism with DeepEP All2All ops, replicated tokens
    result in useless duplicate computation and communication, so in that
    case the input to the experts is made sequence parallel to avoid the
    excess work. This is not needed for pplx-kernels, which can handle
    duplicate input tokens.

    The property holds only when the all2all backend is one of the
    backends listed below, expert parallelism is enabled, and both the
    tensor-parallel and data-parallel sizes are greater than 1.
    """
    sequence_parallel_backends = (
        "allgather_reducescatter",
        "naive",
        "deepep_high_throughput",
        "deepep_low_latency",
    )
    backend_qualifies = self.all2all_backend in sequence_parallel_backends
    return (
        backend_qualifies
        and self.enable_expert_parallel
        and self.tensor_parallel_size > 1
        and self.data_parallel_size > 1
    )
@property
def node_rank_within_dp(self) -> int:
    """Return `node_rank` reduced modulo `nnodes_within_dp`."""
    nodes_per_group = self.nnodes_within_dp
    return self.node_rank % nodes_per_group
@property
def nnodes_within_dp(self) -> int:
    """Return `nnodes` divided by the number of data-parallel node groups.

    With a single node the answer is always 1. Otherwise the number of
    node groups is `data_parallel_size // data_parallel_size_local`, and
    `nnodes` is floor-divided by that count.
    """
    total_nodes = self.nnodes
    if total_nodes == 1:
        return 1

    # NOTE(review): both divisions raise ZeroDivisionError if
    # data_parallel_size_local is 0 or exceeds data_parallel_size while
    # nnodes > 1 -- confirm callers never reach this state.
    dp_node_groups = self.data_parallel_size // self.data_parallel_size_local
    return total_nodes // dp_node_groups
@property
def local_world_size(self) -> int:
    """Return `world_size` floor-divided by `nnodes_within_dp`."""
    nodes_per_group = self.nnodes_within_dp
    return self.world_size // nodes_per_group
@staticmethod
def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
    """Report whether any rank of `dp_group` still has unfinished work.

    Each rank contributes its own flag as a one-element int32 CPU tensor,
    and the flags are combined with a MAX all-reduce. On 0/1 integers MAX
    is a logical OR, e.g.:

        dp rank 0: has_unfinished=True
        dp rank 1: has_unfinished=False
        aggregated: True

    Args:
        dp_group: Process group to reduce over.
        has_unfinished: This rank's own flag.

    Returns:
        The reduced flag converted back to `bool`.
    """
    flag = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
    torch.distributed.all_reduce(flag, op=ReduceOp.MAX, group=dp_group)
    return bool(flag.item())
@staticmethod
def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
    """Agree on the smallest KV cache memory size across `dp_group`.

    A value of -1 is replaced by the int64 maximum before the reduction,
    so it can never be selected by MIN unless every rank passed -1 (in
    which case the int64 maximum itself is returned).

    Args:
        dp_group: Process group to reduce over.
        kv_cache_memory: This rank's value, or -1 (presumably "not set";
            confirm with callers).

    Returns:
        The minimum over all ranks.
    """
    local_value = (
        torch.iinfo(torch.int64).max if kv_cache_memory == -1 else kv_cache_memory
    )
    buffer = torch.tensor([local_value], dtype=torch.int64, device="cpu")
    # A broadcast cannot be used for the stateless dp group because it
    # depends on the global rank, hence the MIN all-reduce.
    torch.distributed.all_reduce(buffer, op=ReduceOp.MIN, group=dp_group)
    return buffer.item()
def compute_hash(self):
    """Hash the settings that shape the computation graph.

    The hash is meant to uniquely identify all the configs that affect
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.

    It is also used for DP worker configuration validation, to prevent
    hangs from mismatched collective communication patterns.
    """
    from vllm.config.utils import get_hash_factors, hash_factors

    # Fields deliberately left out of the hash: derived/runtime topology,
    # networking, or launch details.
    excluded_fields = {
        # Data-parallel placement and rendezvous
        "data_parallel_rank",
        "data_parallel_rank_local",
        "data_parallel_backend",
        "data_parallel_external_lb",
        "data_parallel_hybrid_lb",
        "data_parallel_master_ip",
        "data_parallel_master_port",
        "_data_parallel_master_port_list",
        "data_parallel_rpc_port",
        # Node / process identity
        "rank",
        "master_addr",
        "master_port",
        "node_rank",
        "nnodes",
        # Loading, executor and worker plumbing
        "max_parallel_loading_workers",
        "disable_custom_all_reduce",
        "ray_workers_use_nsight",
        "ray_runtime_env",
        "placement_group",
        "distributed_executor_backend",
        "worker_cls",
        "sd_worker_cls",
        "worker_extension_cls",
        "_api_process_count",
        "_api_process_rank",
    }

    hash_inputs = get_hash_factors(self, excluded_fields)
    # The backend-selecting env var is still hashed explicitly, as before.
    hash_inputs["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND)
    return hash_factors(hash_inputs)
def __post_init__(self) -> None:
    """Derive dependent fields and pick a default executor backend.

    In order, this method:
      1. fills `all2all_backend` from the deprecated env var if unset;
      2. computes `world_size` (PP x TP x PCP, times DP for the external
         launcher);
      3. resolves the data-parallel rank/ports, either from the fields
         already set or from the `VLLM_DP_*` env vars;
      4. chooses `distributed_executor_backend` when none was given;
      5. rejects multi-node setups with an unsupported backend.

    Raises:
        ValueError: if `data_parallel_rank` is out of range, if the world
            size exceeds the number of visible CUDA devices on a
            single-node CUDA setup, or if `nnodes > 1` is combined with a
            backend other than mp, uni or external_launcher.
    """
    # Set all2all_backend from env var if not specified, with deprecation warning
    if self.all2all_backend is None:
        self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
        if envs.is_set("VLLM_ALL2ALL_BACKEND"):
            logger.warning_once(
                "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
                "will be removed in a future release. Please use the "
                "--all2all-backend command-line argument instead."
            )

    # Continue with the rest of the initialization
    self.world_size = (
        self.pipeline_parallel_size
        * self.tensor_parallel_size
        * self.prefill_context_parallel_size
    )

    if self.distributed_executor_backend == "external_launcher":
        logger.info("Using external launcher for distributed inference.")
        # With the external launcher, world_size also counts the DP replicas.
        self.world_size *= self.data_parallel_size

    if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
        # Data parallel was specified in the engine args.
        if self.distributed_executor_backend == "external_launcher":
            # For external launcher,
            # we need to set the data parallel rank automatically
            # (world_size was already multiplied by data_parallel_size
            # above, so the divisor is the per-replica world size).
            self.data_parallel_rank = int(os.environ["RANK"]) // (
                self.world_size // self.data_parallel_size
            )
            logger.info(
                "Set data_parallel_rank to %d automatically.",
                self.data_parallel_rank,
            )
        # Prepare a small pool of open ports unless one was supplied; the
        # master port is taken from it here, the remaining ports are
        # handed out by get_next_dp_init_port().
        if not self._data_parallel_master_port_list:
            self._data_parallel_master_port_list = get_open_ports_list(5)
        self.data_parallel_master_port = self._data_parallel_master_port_list.pop()

        if not (0 <= self.data_parallel_rank < self.data_parallel_size):
            raise ValueError(
                f"data_parallel_rank ({self.data_parallel_rank})"
                f" must be in the range [0, {self.data_parallel_size})"
            )
    else:
        # Otherwise fall back to env vars (e.g. for offline SPMD case).
        self.data_parallel_size = envs.VLLM_DP_SIZE
        self.data_parallel_rank = envs.VLLM_DP_RANK
        self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
        self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
        self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

    if self.distributed_executor_backend == "external_launcher":
        # Side effect on the process environment, not just on this config.
        os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
        logger.info("Disabling V1 multiprocessing for external launcher.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.v1.executor import ray_utils

        backend: DistributedExecutorBackend = "mp"
        ray_found = ray_utils.ray_is_available()
        if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
            backend = "uni"
        elif current_platform.is_cuda() and self.nnodes > 1:
            # Multi-node CUDA keeps "mp"; this branch also bypasses the
            # local GPU-count check below.
            backend = "mp"
        elif (
            current_platform.is_cuda()
            and cuda_device_count_stateless() < self.world_size
        ):
            gpu_count = cuda_device_count_stateless()
            raise ValueError(
                f"World size ({self.world_size}) is larger than the number of "
                f"available GPUs ({gpu_count}) in this node. If this is "
                "intentional and you are using:\n"
                "- ray, set '--distributed-executor-backend ray'.\n"
                "- multiprocessing, set '--nnodes' appropriately."
            )
        elif self.data_parallel_backend == "ray":
            logger.info(
                "Using ray distributed inference because "
                "data_parallel_backend is ray"
            )
            backend = "ray"
        elif ray_found:
            # Ray is importable: use it only when a placement group was
            # given or one is active in an initialized Ray session.
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized

                if ray_is_initialized():
                    from ray.util import get_current_placement_group

                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.debug("Defaulting to use %s for distributed inference", backend)

    if self.distributed_executor_backend is None and self.world_size == 1:
        self.distributed_executor_backend = "uni"

    if self.max_parallel_loading_workers is not None:
        logger.warning(
            "max_parallel_loading_workers is currently "
            "not supported and will be ignored."
        )
    allowed_backends = ("mp", "uni", "external_launcher")
    if (
        self.distributed_executor_backend not in allowed_backends
        and self.nnodes > 1
    ):
        raise ValueError(
            "nnodes > 1 can only be set when distributed executor "
            "backend is mp, uni or external_launcher."
        )
@property
def use_ray(self) -> bool:
    """Whether the configured executor backend runs on Ray.

    True when the backend is the string "ray", or when it is a class
    whose `uses_ray` attribute is truthy (a missing attribute counts as
    False).
    """
    backend = self.distributed_executor_backend
    if backend == "ray":
        return True
    return isinstance(backend, type) and getattr(backend, "uses_ray", False)
@model_validator(mode="after")
def _verify_args(self) -> Self:
    """Validate the executor backend and settle the custom all-reduce flag.

    `disable_custom_all_reduce` is forced to True when batch-invariant
    mode is on, when the platform does not support the custom all-reduce
    kernel, or when running on more than one node.

    Returns:
        This config instance.

    Raises:
        ValueError: if the executor backend is neither None, a string,
            nor an `Executor` subclass, or if nsight profiling of Ray
            workers is requested without a Ray backend.
    """
    # Lazy import to avoid circular import
    from vllm.v1.executor import Executor

    # Enable batch invariance settings if requested
    if vllm_is_batch_invariant():
        self.disable_custom_all_reduce = True

    backend = self.distributed_executor_backend
    backend_is_executor_cls = isinstance(backend, type) and issubclass(
        backend, Executor
    )
    if not (backend is None or isinstance(backend, str) or backend_is_executor_cls):
        raise ValueError(
            "Unrecognized distributed executor backend "
            f"{self.distributed_executor_backend}. Supported "
            "values are 'ray', 'mp' 'uni', 'external_launcher', "
            " custom Executor subclass or its import path."
        )

    if self.use_ray:
        from vllm.v1.executor import ray_utils

        ray_utils.assert_ray_available()

    platform_supports_kernel = current_platform.use_custom_allreduce()
    if not platform_supports_kernel:
        self.disable_custom_all_reduce = True
        logger.debug(
            "Disabled the custom all-reduce kernel because it is not "
            "supported on current platform."
        )

    multi_node = self.nnodes > 1
    if multi_node:
        self.disable_custom_all_reduce = True
        logger.debug(
            "Disabled the custom all-reduce since we are running on multi-node."
        )

    # Nsight profiling of workers only makes sense with a Ray backend.
    if self.ray_workers_use_nsight and not self.use_ray:
        raise ValueError(
            "Unable to use nsight profiling unless workers run with Ray."
        )

    return self

126
vllm/config/pooler.py Normal file
View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
# Module-level logger; used by get_use_activation() for deprecation warnings.
logger = init_logger(__name__)

# String names accepted for `PoolerConfig.pooling_type`.
PoolingTypeStr = Literal["LAST", "ALL", "CLS", "STEP", "MEAN"]
@config
@dataclass
class PoolerConfig:
    """Controls the behavior of output pooling in pooling models."""

    # Every field defaults to None ("not specified"). The string literal
    # after each field is its attribute documentation and is kept verbatim.

    pooling_type: PoolingTypeStr | None = None
    """
    The pooling method of the pooling model. This should be a key in
    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """

    ## for embeddings models
    normalize: bool | None = None
    """
    Whether to normalize the embeddings outputs. Defaults to True.
    """
    dimensions: int | None = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
    enable_chunked_processing: bool | None = None
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
    chunks, processed separately, and then aggregated using weighted averaging.
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
    # NOTE(review): the description below says inputs "longer than
    # max_embed_len" are accepted; it presumably means longer than
    # max_model_len -- confirm and reword the help text if so.
    max_embed_len: int | None = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
    When an input exceeds max_embed_len, it will be handled according to
    the original max_model_len validation logic.
    Defaults to None (i.e. set to max_model_len).
    """

    ## for classification models
    # `softmax` and `activation` are deprecated aliases of `use_activation`;
    # __post_init__ folds them into `use_activation`.
    softmax: float | None = None
    """
    softmax will be deprecated, please use use_activation instead.
    """
    activation: float | None = None
    """
    activation will be deprecated, please use use_activation instead.
    """
    use_activation: bool | None = None
    """
    Whether to apply activation function to the classification outputs.
    Defaults to True.
    """
    logit_bias: float | None = None
    """
    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
    step_tag_id: int | None = None
    """
    If set, only the score corresponding to the `step_tag_id` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
    returned_token_ids: list[int] | None = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of `good_token` and `bad_token` in the
    `math-shepherd-mistral-7b-prm` model.
    """

    def __post_init__(self) -> None:
        """Resolve `use_activation` from itself or its deprecated aliases.

        `get_use_activation` checks `softmax`, then `activation`, then
        `use_activation`, and logs a deprecation warning when one of the
        two aliases is set. Its result overwrites `use_activation`.
        """
        # raise deprecated warning for softmax and activation
        self.use_activation = get_use_activation(self)

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        # The result is therefore the hash of the text "[]", i.e. the same
        # for every PoolerConfig instance.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str
def get_use_activation(o: object):
if softmax := getattr(o, "softmax", None) is not None:
logger.warning_once(
"softmax will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return softmax
if activation := getattr(o, "activation", None) is not None:
logger.warning_once(
"activation will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return activation
return getattr(o, "use_activation", None)

199
vllm/config/profiler.py Normal file
View File

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Any, Literal
from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
# Module-level logger; used for deprecation and overhead warnings below.
logger = init_logger(__name__)

# Profiler backends accepted for `ProfilerConfig.profiler`.
ProfilerKind = Literal["torch", "cuda"]
@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    # The string literal after each field is its attribute documentation
    # and is kept verbatim.

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:
    - 'torch': Use PyTorch profiler.\n
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    worker's traces (CPU & GPU) will be saved under this directory. Note that
    it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default"""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using delay/limit options,
    since the front-end profiling does not track iterations and will capture the
    entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after starting profiling.
    Defaults to 0, meaning no limit.
    """

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        # The result is therefore the hash of the text "[]", i.e. the same
        # for every ProfilerConfig instance.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> Any | None:
        """Get field from env var if set, with deprecation warning.

        Args:
            field_name: Config field the env var maps to; only used in the
                warning text.
            env_var_name: Attribute name to look up on `vllm.envs`.

        Returns:
            `getattr(envs, env_var_name)` when `envs.is_set` reports the
            variable as set, otherwise `None`. Nothing is assigned here.
        """
        if envs.is_set(env_var_name):
            value = getattr(envs, env_var_name)
            logger.warning_once(
                "Using %s environment variable is deprecated and will be removed in "
                "v0.14.0 or v1.0.0, whichever is soonest. Please use "
                "--profiler-config.%s command line argument or "
                "ProfilerConfig(%s=...) config field instead.",
                env_var_name,
                field_name,
                field_name,
            )
            return value
        return None

    def _set_from_env_if_set(
        self,
        field_name: str,
        env_var_name: str,
        to_bool: bool = True,
        to_int: bool = False,
    ) -> None:
        """Set field from env var if set, with deprecation warning.

        Args:
            field_name: Attribute of this config to overwrite.
            env_var_name: Attribute name to look up on `vllm.envs`.
            to_bool: Convert the value with `value == "1"` (the default).
            to_int: Convert the value with `int(value)`.

        The conversions are applied one after the other, so callers that
        want an integer or the raw value must pass `to_bool=False`.
        """
        value = self._get_from_env_if_set(field_name, env_var_name)
        if value is not None:
            if to_bool:
                # NOTE(review): assumes the env value is the raw string -- confirm.
                value = value == "1"
            if to_int:
                value = int(value)
            setattr(self, field_name, value)

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        """Apply deprecated env-var overrides, then validate the result.

        Returns:
            This config instance, with `torch_profiler_dir` made absolute
            (user-expanded) unless it is a `gs://` path.

        Raises:
            ValueError: if `torch_profiler_dir` is set while the profiler
                is not 'torch', or the profiler is 'torch' without a
                directory.
        """
        maybe_use_cuda_profiler = self._get_from_env_if_set(
            "profiler", "VLLM_TORCH_CUDA_PROFILE"
        )
        if maybe_use_cuda_profiler is not None:
            # The CUDA env var, when set, decides the profiler outright
            # (any value other than "1" resets it to None); the torch
            # directory env var is not consulted in this case.
            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
        else:
            self._set_from_env_if_set(
                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
            )
            # A non-empty directory (from the env var or the field itself)
            # selects the torch profiler.
            if self.torch_profiler_dir:
                self.profiler = "torch"
        # Remaining env overrides; each replaces the field only when its
        # variable is set.
        self._set_from_env_if_set(
            "torch_profiler_record_shapes",
            "VLLM_TORCH_PROFILER_RECORD_SHAPES",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_memory",
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_stack",
            "VLLM_TORCH_PROFILER_WITH_STACK",
        )
        self._set_from_env_if_set(
            "torch_profiler_with_flops",
            "VLLM_TORCH_PROFILER_WITH_FLOPS",
        )
        self._set_from_env_if_set(
            "ignore_frontend",
            "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
        )
        self._set_from_env_if_set(
            "torch_profiler_use_gzip",
            "VLLM_TORCH_PROFILER_USE_GZIP",
        )
        self._set_from_env_if_set(
            "torch_profiler_dump_cuda_time_total",
            "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
        )

        self._set_from_env_if_set(
            "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
        )
        self._set_from_env_if_set(
            "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
        )

        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
        if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
            logger.warning_once(
                "Using 'torch' profiler with delay_iterations or max_iterations "
                "while ignore_frontend is False may result in high overhead."
            )

        profiler_dir = self.torch_profiler_dir
        if profiler_dir and self.profiler != "torch":
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
            )
        if self.profiler == "torch" and not profiler_dir:
            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

        if profiler_dir:
            # Truthy only for "gs://" followed by at least one character
            # that is not "/"; such paths are left untouched.
            is_gs_path = (
                profiler_dir.startswith("gs://")
                and profiler_dir[5:]
                and profiler_dir[5] != "/"
            )
            if not is_gs_path:
                self.torch_profiler_dir = os.path.abspath(
                    os.path.expanduser(profiler_dir)
                )

        return self

299
vllm/config/scheduler.py Normal file
View File

@@ -0,0 +1,299 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from dataclasses import InitVar
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
from pydantic import Field, field_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import resolve_obj_by_qualname
if TYPE_CHECKING:
from vllm.v1.core.sched.interface import SchedulerInterface
# Module-level logger for scheduler configuration messages.
logger = init_logger(__name__)

# Values accepted for `SchedulerConfig.runner_type`.
RunnerType = Literal["generate", "pooling", "draft"]
# Values accepted for `SchedulerConfig.policy`.
SchedulerPolicy = Literal["fcfs", "priority"]
@config
@dataclass
class SchedulerConfig:
    """Scheduler configuration."""

    # The string literal after each field is its attribute documentation
    # and is kept verbatim. The two InitVar entries are constructor-only
    # arguments: they are passed to __post_init__ and not stored.

    max_model_len: InitVar[int]
    """Maximum length of a sequence (including prompt and generated text).
    Note: This is stored in the ModelConfig, and is used only here to
    provide fallbacks and validate other attributes."""

    is_encoder_decoder: InitVar[bool]
    """True if the model is an encoder-decoder model.
    Note: This is stored in the ModelConfig, and is used only here to
    disable chunked prefill and prefix caching for encoder-decoder models.
    """

    # Class-level defaults referenced by the Field() declarations below.
    DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
    DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128

    runner_type: RunnerType = "generate"
    """The runner type to launch for the model."""

    max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1)
    """Maximum number of tokens to be processed in a single iteration.
    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1)
    """Maximum number of sequences to be processed in a single iteration.
    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    max_num_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of sequences that can be
    partially prefilled concurrently."""

    max_long_partial_prefills: int = Field(default=1, ge=1)
    """For chunked prefill, the maximum number of prompts longer than
    long_prefill_token_threshold that will be prefilled concurrently. Setting
    this less than max_num_partial_prefills will allow shorter prompts to jump
    the queue in front of longer prompts in some cases, improving latency."""

    long_prefill_token_threshold: int = 0
    """For chunked prefill, a request is considered long if the prompt is
    longer than this number of tokens."""

    enable_chunked_prefill: bool = True
    """If True, prefill requests can be chunked based
    on the remaining `max_num_batched_tokens`.
    The default value here is mainly for convenience when testing.
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

    is_multimodal_model: bool = False
    """True if the model is multimodal."""

    # TODO (ywang96): Make this configurable.
    # Not a constructor argument (init=False); assigned in __post_init__.
    max_num_encoder_input_tokens: int = Field(init=False)
    """Multimodal encoder compute budget, only used in V1.
    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    # TODO (ywang96): Make this configurable.
    # Not a constructor argument (init=False); assigned in __post_init__.
    encoder_cache_size: int = Field(init=False)
    """Multimodal encoder cache size, only used in V1.
    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    policy: SchedulerPolicy = "fcfs"
    """The scheduling policy to use:\n
    - "fcfs" means first come first served, i.e. requests are handled in order
    of arrival.\n
    - "priority" means requests are handled based on given priority (lower
    value means earlier handling) and time of arrival deciding any ties)."""

    disable_chunked_mm_input: bool = False
    """If set to true and chunked prefill is enabled, we do not want to
    partially schedule a multimodal item. Only used in V1
    This ensures that if a request has a mixed prompt
    (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
    some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
    it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
    # The default is None even though the annotation does not list it;
    # _skip_none_validation below lets None through, and
    # get_scheduler_cls() resolves None to a built-in scheduler.
    scheduler_cls: str | type[object] = Field(default=None)
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""

    disable_hybrid_kv_cache_manager: bool | None = None
    """If set to True, KV cache manager will allocate the same size of KV cache
    for all attention layers even if there are multiple type of attention layers
    like full attention and sliding window attention.
    If set to None, the default value will be determined based on the environment
    and starting configuration.
    """

    async_scheduling: bool = False
    """If set to True, perform async scheduling. This helps to avoid gaps in
    GPU utilization, leading to better latency and throughput.
    Async scheduling is currently not supported with some features such as
    speculative decoding and pipeline parallelism.
    """

    stream_interval: int = Field(default=1, ge=1)
    """The interval (or buffer size) for streaming in terms of token length.
    A smaller value (1) makes streaming smoother by sending each token immediately,
    while a larger value (e.g., 10) reduces host overhead and may increase throughput
    by batching multiple tokens before sending."""

    @staticmethod
    def default_factory(**kwargs):
        """
        Factory method to create `SchedulerConfig` with default values for `InitVar`s.

        `max_model_len` defaults to 8192 and `is_encoder_decoder` to False
        when the caller does not supply them; all other keyword arguments
        are forwarded unchanged.
        """
        if "max_model_len" not in kwargs:
            kwargs["max_model_len"] = 8192
        if "is_encoder_decoder" not in kwargs:
            kwargs["is_encoder_decoder"] = False
        return SchedulerConfig(**kwargs)

    def get_scheduler_cls(self) -> type["SchedulerInterface"]:
        """Return the scheduler class selected by this config.

        With `scheduler_cls` unset, this is `AsyncScheduler` when
        `async_scheduling` is on and `Scheduler` otherwise. A custom
        `scheduler_cls` triggers a one-time warning; a class is returned
        as given, a string is resolved with `resolve_obj_by_qualname`.
        """
        if self.scheduler_cls is None:
            # Built-in schedulers are imported lazily, only when needed.
            if self.async_scheduling:
                from vllm.v1.core.sched.async_scheduler import AsyncScheduler

                return AsyncScheduler
            from vllm.v1.core.sched.scheduler import Scheduler

            return Scheduler

        # This warning can be removed once the Scheduler interface is
        # finalized and we can maintain support for scheduler classes that
        # implement it
        logger.warning_once(
            "Using custom scheduler class %s. This scheduler interface is "
            "not public and compatibility may not be maintained.",
            self.scheduler_cls,
        )
        if not isinstance(self.scheduler_cls, str):
            return cast(type["SchedulerInterface"], self.scheduler_cls)
        return resolve_obj_by_qualname(self.scheduler_cls)

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.
        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        factors: list[Any] = []

        # max_num_batched_tokens need to be included in the hash due
        # to two reasons:
        # 1. LoRA creates static buffers based on max_num_batched_tokens.
        #   The tensor sizes and strides get captured in the torch.compile
        #   graph explicitly.
        # 2. Inductor decides whether using 32-bit or 64-bit indexing integer
        #   based on the data sizes. `max_num_batched_tokens` has an
        #   impact on that. For more details, please check
        #   https://github.com/vllm-project/vllm/issues/29585
        factors.append(self.max_num_batched_tokens)

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("scheduler_cls", "async_scheduling", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialisation is delayed."""
        # Any non-None value goes through the regular validation handler.
        if value is None:
            return value
        return handler(value)

    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
        """Finish initialisation using the two constructor-only arguments.

        Disables chunked prefill for encoder-decoder models, sets both
        encoder budgets to `max_num_batched_tokens`, derives
        `long_prefill_token_threshold` when concurrent partial prefills
        are enabled without one, and finally runs `verify_max_model_len`.

        Raises:
            ValueError: propagated from `verify_max_model_len`.
        """
        if is_encoder_decoder:
            # Chunked prefill should be disabled for encoder-decoder models.
            self.disable_chunked_mm_input = True
            self.enable_chunked_prefill = False
            self.long_prefill_token_threshold = 0
            # NOTE(review): the message mentions prefix caching, but no
            # prefix-caching setting is changed in this method -- confirm
            # where that happens.
            logger.info(
                "Encoder-decoder models do not support chunked prefill nor"
                " prefix caching; disabling both."
            )

        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
        self.encoder_cache_size = self.max_num_batched_tokens

        if self.enable_chunked_prefill:
            logger.info(
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                self.max_num_batched_tokens,
            )

        if self.max_num_partial_prefills > 1:
            # An unset threshold (0) defaults to 4% of max_model_len,
            # truncated to an integer.
            if self.long_prefill_token_threshold == 0:
                self.long_prefill_token_threshold = int(max_model_len * 0.04)

            logger.info(
                "Concurrent partial prefills enabled with "
                "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
                "long_prefill_token_threshold=%d",
                self.max_num_partial_prefills,
                self.max_long_partial_prefills,
                self.long_prefill_token_threshold,
            )

        self.verify_max_model_len(max_model_len)

    def verify_max_model_len(self, max_model_len: int) -> Self:
        """Check the batching limits against `max_model_len`.

        Args:
            max_model_len: Maximum sequence length to validate against.

        Returns:
            This config instance.

        Raises:
            ValueError: if `max_num_batched_tokens` is below
                `max_model_len` without chunked prefill, or below
                `max_num_seqs`; if concurrent partial prefills are
                requested without chunked prefill or with a long-prefill
                threshold above `max_model_len`; or if
                `max_long_partial_prefills` exceeds
                `max_num_partial_prefills`.
        """
        if (
            self.max_num_batched_tokens < max_model_len
            and not self.enable_chunked_prefill
        ):
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len."
            )

        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs})."
            )

        # Only a warning: a token budget above what max_num_seqs sequences
        # of max_model_len tokens could hold is allowed.
        if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
            logger.warning(
                "max_num_batched_tokens (%d) exceeds max_num_seqs "
                "* max_model_len (%d). This may lead to unexpected behavior.",
                self.max_num_batched_tokens,
                self.max_num_seqs * max_model_len,
            )

        if self.max_num_partial_prefills > 1:
            if not self.enable_chunked_prefill:
                raise ValueError(
                    "Chunked prefill must be enabled to set "
                    "max_num_partial_prefills > 1."
                )

            if self.long_prefill_token_threshold > max_model_len:
                raise ValueError(
                    "long_prefill_token_threshold "
                    f"({self.long_prefill_token_threshold}) cannot be greater "
                    f"than the max_model_len ({max_model_len})."
                )

        if self.max_long_partial_prefills > self.max_num_partial_prefills:
            raise ValueError(
                f"{self.max_long_partial_prefills=} must be less than or equal to "
                f"{self.max_num_partial_prefills=}."
            )

        return self

644
vllm/config/speculative.py Normal file
View File

@@ -0,0 +1,644 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import LazyLoader, has_arctic_inference
if TYPE_CHECKING:
from transformers import PretrainedConfig
import vllm.model_executor.layers.quantization as me_quant
else:
PretrainedConfig = Any
me_quant = LazyLoader(
"model_executor", globals(), "vllm.model_executor.layers.quantization"
)
# Module-level logger for speculative decoding configuration messages.
logger = init_logger(__name__)

# Model-type names of the multi-token-prediction (MTP) draft models.
MTPModelTypes = Literal[
    "deepseek_mtp",
    "mimo_mtp",
    "glm4_moe_mtp",
    "ernie_mtp",
    "qwen3_next_mtp",
    "longcat_flash_mtp",
    "mtp",
    "pangu_ultra_moe_mtp",
]
# "eagle", "eagle3", plus every MTP model type (the nested Literal is
# flattened by `typing`).
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
# Every speculative method name; includes all of EagleModelTypes.
SpeculativeMethod = Literal[
    "ngram",
    "medusa",
    "mlp_speculator",
    "draft_model",
    "suffix",
    EagleModelTypes,
]
# Field section of SpeculativeConfig. Each field is followed by a string
# literal describing it; keep those literals directly after the assignment
# they document (presumably they are harvested from the class source by
# get_attr_docs in vllm/config/utils.py -- confirm before reordering).
@config
@dataclass
class SpeculativeConfig:
"""Configuration for speculative decoding."""
enforce_eager: bool | None = None
"""Override the default enforce_eager from model_config"""
# General speculative decoding control
# NOTE(review): annotated as `int` but defaults to None; __post_init__ and
# _verify_args both treat None as "not provided yet".
num_speculative_tokens: int = Field(default=None, gt=0)
"""The number of speculative tokens, if provided. It will default to the
number in the draft model config if present, otherwise, it is required."""
model: str | None = None
"""The name of the draft model, eagle head, or additional weights, if
provided."""
method: SpeculativeMethod | None = None
"""The name of the speculative method to use. If users provide and set the
`model` param, the speculative method type will be detected automatically
if possible, if `model` param is not provided, the method name must be
provided.
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
"""Quantization method that was used to quantize the draft model weights.
If `None`, we assume the model weights are not quantized. Note that it only
takes effect when using the draft model-based speculative method."""
max_model_len: int | None = Field(default=None, ge=1)
"""The maximum model length of the draft model. Used when testing the
ability to skip speculation for some sequences."""
revision: str | None = None
"""The specific model version to use for the draft model. It can be a
branch name, a tag name, or a commit id. If unspecified, will use the
default version."""
code_revision: str | None = None
"""The specific revision to use for the draft model code on Hugging Face
Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
will use the default version."""
# Advanced control
disable_by_batch_size: int | None = Field(default=None, ge=2)
"""Disable speculative decoding for new incoming requests when the number
of enqueued requests is larger than this value, if provided."""
disable_padded_drafter_batch: bool = False
"""Disable input padding for speculative decoding. If set to True,
speculative input batches can contain sequences of different lengths,
which may only be supported by certain attention backends. This currently
only affects the EAGLE method of speculation."""
# Ngram proposer configuration
# When the method is ngram and both bounds are None, __post_init__ sets both
# to 5; for non-ngram, non-suffix methods it resets both to 0.
prompt_lookup_max: int | None = Field(default=None, ge=1)
"""Maximum size of ngram token window when using Ngram proposer, required
when method is set to ngram."""
prompt_lookup_min: int | None = Field(default=None, ge=1)
"""Minimum size of ngram token window when using Ngram proposer, if
provided. Defaults to 1."""
# Stored as the string form of a list of tuples; __post_init__ parses it with
# ast.literal_eval and re-serialises it sorted by (length, value).
speculative_token_tree: str | None = None
"""Specifies the tree structure for speculative token generation.
"""
# required configuration params passed from engine
target_model_config: SkipValidation[ModelConfig] = None # type: ignore
"""The configuration of the target model."""
target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
"""The parallel configuration for the target model."""
# params generated in the post-init stage
draft_model_config: SkipValidation[ModelConfig] = None # type: ignore
"""The configuration of the draft model initialized internal."""
draft_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
"""The parallel configuration for the draft model initialized internal."""
# Suffix decoding configuration
# Range checks for the four suffix_* fields live in _validate_suffix_decoding.
suffix_decoding_max_tree_depth: int = 24
"""The maximum depth of the suffix decoding global and prompt trees. The
tree depth limits the sum of the prefix match and speculation lengths."""
suffix_decoding_max_cached_requests: int = 10000
"""The maximum number of requests to cache in the global suffix tree. If
exceeded, will trigger eviction in FIFO order. If set to 0, the global
suffix tree is disabled and past responses are not cached (prompt trees
are still used)."""
suffix_decoding_max_spec_factor: float = 1.0
"""The maximum spec factor for suffix decoding. The spec factor controls
speculation lengths based on the prefix match length: max_spec_tokens =
max_spec_factor * prefix_match_length."""
suffix_decoding_min_token_prob: float = 0.1
"""The minimum token probability for suffix decoding. Will only speculate
tokens with estimated probability (based on frequency counts) greater than
or equal to this value."""
def compute_hash(self) -> str:
    """Return a hex digest of the settings that change the computation graph.

    WARNING: whenever a new field is added to this config, add it to the
    factor list below if it affects the computation graph.

    The digest must uniquely identify everything that shapes the graph
    from input ids/embeddings to the final hidden states, and nothing
    before the input ids/embeddings or after the final hidden states.
    """
    # Eagle3 returns intermediate hidden states in addition to the final
    # hidden state, so it is the only graph-relevant factor here.
    uses_eagle3 = self.method == "eagle3"
    factors: list[Any] = [uses_eagle3]
    digest = safe_hash(str(factors).encode(), usedforsecurity=False)
    return digest.hexdigest()
@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
initial_architecture = hf_config.architectures[0]
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
)
if hf_config.model_type in ("pangu_ultra_moe"):
hf_config.model_type = "pangu_ultra_moe_mtp"
if hf_config.model_type == "pangu_ultra_moe_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
)
if hf_config.architectures[0] == "MiMoForCausalLM":
hf_config.model_type = "mimo_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"num_hidden_layers": 0,
"n_predict": n_predict,
"architectures": ["MiMoMTPModel"],
}
)
if hf_config.architectures[0] == "Glm4MoeForCausalLM":
hf_config.model_type = "glm4_moe_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"num_hidden_layers": 0,
"n_predict": n_predict,
"architectures": ["Glm4MoeMTPModel"],
}
)
if hf_config.model_type == "ernie4_5_moe":
hf_config.model_type = "ernie_mtp"
if hf_config.model_type == "ernie_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
)
if hf_config.model_type == "qwen3_next":
hf_config.model_type = "qwen3_next_mtp"
if hf_config.model_type == "qwen3_next_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
)
if hf_config.model_type == "longcat_flash":
hf_config.model_type = "longcat_flash_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
hf_config.update(
{"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
)
if initial_architecture == "MistralLarge3ForCausalLM":
hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]})
return hf_config
def __post_init__(self):
    """Resolve the speculative method and build the draft-side configs.

    Steps, in order:
      1. Collapse deprecated MTP method names into ``"mtp"``.
      2. When no ``model`` was given, derive it from ``method`` (mtp reuses
         the target model; ngram/suffix use a sentinel name).
      3. For ngram: fill in / validate the lookup window and reuse the
         target configs as the draft configs.
      4. For suffix: delegate to ``_validate_suffix_decoding``.
      5. Otherwise: build the draft ``ModelConfig``, auto-detect the
         method, default ``num_speculative_tokens`` from ``n_predict``,
         normalise the token tree and derive the draft TP size, max model
         length and parallel config.

    Raises:
        ValueError: on inconsistent or missing settings.
        NotImplementedError: when the draft model resolves to the plain
            ``draft_model`` method.
    """
    # Note: "method" is a new parameter that helps to extend the
    # configuration of non-model-based proposers, and the "model" parameter
    # will be used to set the draft model, eagle head, or additional weight
    # when needed. If users do not specify "method", the speculative method
    # will be detected automatically if possible. If the speculative method
    # can not be detected, it will be considered as the "draft_model" by
    # default.
    if self.method in get_args(MTPModelTypes) and self.method != "mtp":
        logger.warning(
            "method `%s` is deprecated and replaced with mtp.", self.method
        )
        self.method = "mtp"

    if self.model is None and self.num_speculative_tokens is not None:
        if self.method == "mtp":
            if self.target_model_config is None:
                raise ValueError("target_model_config must be present for mtp")
            if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
                # FIXME(luccafong): cudgraph with v32 MTP is not supported,
                # remove this when the issue is fixed.
                self.enforce_eager = True
            # use the draft model from the same model:
            self.model = self.target_model_config.model
            # Align the quantization of draft model for cases such as
            # --quantization fp8 with a bf16 checkpoint.
            if not self.quantization:
                self.quantization = self.target_model_config.quantization
        elif self.method in ("ngram", "[ngram]"):
            self.model = "ngram"
        elif self.method == "suffix":
            self.model = "suffix"
        else:
            raise ValueError(
                "num_speculative_tokens was provided but without speculative model."
            )

    # Automatically configure the method for ngram when "model" is used
    # instead of "method"
    if self.method is None and (
        self.model is not None and self.model in ("ngram", "[ngram]")
    ):
        self.method = "ngram"

    if self.method in ("ngram", "[ngram]"):
        # Unified to "ngram" internally
        self.method = "ngram"
        # Set default values if not provided. A bound that is missing
        # mirrors the one that was given.
        if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
            self.prompt_lookup_min = 5
            self.prompt_lookup_max = 5
        elif self.prompt_lookup_min is None:
            self.prompt_lookup_min = self.prompt_lookup_max
        elif self.prompt_lookup_max is None:
            self.prompt_lookup_max = self.prompt_lookup_min

        # Validate values
        if self.prompt_lookup_min > self.prompt_lookup_max:
            raise ValueError(
                f"prompt_lookup_min={self.prompt_lookup_min} must "
                f"be <= prompt_lookup_max={self.prompt_lookup_max}"
            )

        # TODO: current we still need extract vocab_size from target model
        # config, in future, we may try refactor it out, and set
        # draft related config as None here.
        self.draft_model_config = self.target_model_config
        self.draft_parallel_config = self.target_parallel_config
    elif self.method == "suffix":
        self._validate_suffix_decoding()
    else:
        self.prompt_lookup_max = 0
        self.prompt_lookup_min = 0

        if self.model is not None:
            self.draft_model_config = ModelConfig(
                model=self.model,
                runner="draft",
                tokenizer=self.target_model_config.tokenizer,
                tokenizer_mode=self.target_model_config.tokenizer_mode,
                trust_remote_code=self.target_model_config.trust_remote_code,
                allowed_local_media_path=self.target_model_config.allowed_local_media_path,
                allowed_media_domains=self.target_model_config.allowed_media_domains,
                dtype=self.target_model_config.dtype,
                seed=self.target_model_config.seed,
                revision=self.revision,
                code_revision=self.code_revision,
                tokenizer_revision=self.target_model_config.tokenizer_revision,
                spec_target_max_model_len=self.target_model_config.max_model_len,
                quantization=self.quantization,
                enforce_eager=self.target_model_config.enforce_eager,
                max_logprobs=self.target_model_config.max_logprobs,
                hf_overrides=SpeculativeConfig.hf_config_override,
                config_format=self.target_model_config.config_format,
            )

            # Automatically detect the method
            if self.method in ("eagle", "eagle3"):
                pass
            # examples:
            # yuhuili/EAGLE-LLaMA3-Instruct-8B
            # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
            # AngelSlim/Qwen3-8B_eagle3
            elif "eagle-" in self.draft_model_config.model.lower():
                self.method = "eagle"
            elif "eagle3" in self.draft_model_config.model.lower():
                self.method = "eagle3"
            elif self.draft_model_config.hf_config.model_type == "medusa":
                self.method = "medusa"
            elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                self.method = "mlp_speculator"
            elif self.draft_model_config.hf_config.model_type in get_args(
                MTPModelTypes
            ):
                # Covers every MTP family, including "longcat_flash_mtp".
                # A former dedicated LongCat branch after this one could
                # never match the exact name and, being written as
                # `in ("longcat_flash_mtp")`, only caught accidental
                # substrings; it has been removed.
                self.method = "mtp"
                # num_speculative_tokens may still be None here; it is
                # defaulted from n_predict further down.
                if (
                    self.num_speculative_tokens is not None
                    and self.num_speculative_tokens > 1
                ):
                    logger.warning(
                        "Enabling num_speculative_tokens > 1 will run "
                        "multiple times of forward on same MTP layer, "
                        "which may result in lower acceptance rate"
                    )
            else:
                self.method = "draft_model"
                raise NotImplementedError(
                    "Speculative decoding with draft model is not "
                    "supported yet. Please consider using other "
                    "speculative decoding methods such as ngram, medusa, "
                    "eagle, or mtp."
                )

            # Replace hf_config for EAGLE draft_model
            if self.method in ("eagle", "eagle3"):
                from vllm.transformers_utils.configs import SpeculatorsConfig
                from vllm.transformers_utils.configs.eagle import EAGLEConfig

                if isinstance(
                    self.draft_model_config.hf_config,
                    (EAGLEConfig, SpeculatorsConfig),
                ):
                    pass
                else:
                    eagle_config = EAGLEConfig(
                        self.draft_model_config.hf_config,
                        method=self.method,
                        model_type="eagle",
                    )
                    self.draft_model_config.hf_config = eagle_config

            if self.num_speculative_tokens is not None and hasattr(
                self.draft_model_config.hf_config, "num_lookahead_tokens"
            ):
                self.draft_model_config.hf_config.num_lookahead_tokens = (
                    self.num_speculative_tokens
                )

            n_predict = getattr(
                self.draft_model_config.hf_config, "n_predict", None
            )
            if n_predict is not None:
                if self.num_speculative_tokens is None:
                    # Default to max value defined in draft model config.
                    self.num_speculative_tokens = n_predict
                elif (
                    self.num_speculative_tokens > n_predict
                    and self.num_speculative_tokens % n_predict != 0
                ):
                    # Ensure divisibility for MTP module reuse.
                    raise ValueError(
                        f"num_speculative_tokens:{self.num_speculative_tokens}"
                        f" must be divisible by {n_predict=}"
                    )

            # Fail with a clear message instead of the TypeError that
            # range(None) would raise while building the default tree.
            if self.num_speculative_tokens is None:
                raise ValueError(
                    "num_speculative_tokens must be provided with "
                    "speculative model unless the draft model config contains an "
                    "n_predict parameter."
                )

            if self.speculative_token_tree is None:
                # Generate chain of tokens.
                self.speculative_token_tree = str(
                    [(i + 1) * (0,) for i in range(self.num_speculative_tokens)]
                )
            else:
                # Sort the token tree breadth-first.
                tree_choices = ast.literal_eval(self.speculative_token_tree)
                self.speculative_token_tree = str(
                    sorted(tree_choices, key=lambda t: (len(t), t))
                )

            self.draft_tensor_parallel_size = (
                SpeculativeConfig._verify_and_get_draft_tp(
                    self.target_parallel_config,
                    self.draft_tensor_parallel_size,
                    self.draft_model_config.hf_config,
                )
            )

            self.draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    self.max_model_len,
                    self.draft_model_config.max_model_len,
                    self.target_model_config.max_model_len,
                )
            )

            self.draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
                    self.target_parallel_config, self.draft_tensor_parallel_size
                )
            )
    return self
def _validate_suffix_decoding(self):
    """Check prerequisites and value ranges for the suffix decoding method.

    Defaults ``num_speculative_tokens`` to ``suffix_decoding_max_tree_depth``
    when it was not provided.

    Raises:
        ImportError: if Arctic Inference is not installed.
        ValueError: if any ``suffix_decoding_*`` setting is out of range.
    """
    arctic_available = has_arctic_inference()
    if not arctic_available:
        raise ImportError(
            "Arctic Inference is required for suffix decoding. "
            "Install via `pip install arctic-inference==0.1.1`."
        )

    # Suffix decoding decides the actual number of speculative tokens
    # dynamically and treats num_speculative_tokens as a maximum limit.
    tokens_unset = self.num_speculative_tokens is None
    if tokens_unset:
        self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
        logger.warning(
            "Defaulted num_speculative_tokens to %s for suffix decoding.",
            self.num_speculative_tokens,
        )

    # Range checks, reported in field-declaration order.
    depth_too_small = self.suffix_decoding_max_tree_depth < 1
    if depth_too_small:
        raise ValueError(
            f"suffix_decoding_max_tree_depth="
            f"{self.suffix_decoding_max_tree_depth} must be >= 1"
        )

    negative_cache_size = self.suffix_decoding_max_cached_requests < 0
    if negative_cache_size:
        raise ValueError(
            f"suffix_decoding_max_cached_requests="
            f"{self.suffix_decoding_max_cached_requests} must be >= 0"
        )

    negative_spec_factor = self.suffix_decoding_max_spec_factor < 0
    if negative_spec_factor:
        raise ValueError(
            f"suffix_decoding_max_spec_factor="
            f"{self.suffix_decoding_max_spec_factor} must be >= 0"
        )

    prob_in_unit_interval = 0 <= self.suffix_decoding_min_token_prob <= 1
    if not prob_in_unit_interval:
        raise ValueError(
            f"suffix_decoding_min_token_prob="
            f"{self.suffix_decoding_min_token_prob} must be in [0, 1]"
        )
@staticmethod
def _maybe_override_draft_max_model_len(
speculative_max_model_len: int | None,
draft_max_model_len: int,
target_max_model_len: int,
) -> int:
"""Determine the max sequence len for the draft model. This is usually
the draft_max_model_len, but may be the target_max_model_len if it is
less than the draft_max_model_len, or may be speculative_max_model_len
if it is specified.
This is necessary so that sequences do not exceed the capacity of the
draft model or the target model.
speculative_max_model_len is mainly used for testing that sequences can
skip speculation.
"""
if speculative_max_model_len is not None:
if speculative_max_model_len > draft_max_model_len:
raise ValueError(
f"{speculative_max_model_len=} cannot be "
f"larger than {draft_max_model_len=}"
)
if speculative_max_model_len > target_max_model_len:
raise ValueError(
f"{speculative_max_model_len=} cannot be "
f"larger than {target_max_model_len=}"
)
return speculative_max_model_len
return min(
draft_max_model_len,
target_max_model_len,
)
@staticmethod
def _verify_and_get_draft_tp(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int | None,
draft_hf_config: PretrainedConfig,
) -> int:
"""
Verifies and adjusts the tensor parallel size for a draft model
specified using speculative_draft_tensor_parallel_size.
"""
# If speculative_draft_tensor_parallel_size is unset then set it
# appropriately else verify that it is set correctly.
if speculative_draft_tensor_parallel_size is None:
if draft_hf_config.model_type == "mlp_speculator":
speculative_draft_tensor_parallel_size = 1
if target_parallel_config.tensor_parallel_size > 1:
logger.warning(
"%s cannot currently be run with tp>1; "
"setting speculative_draft_tensor_parallel_size=1",
draft_hf_config.model_type,
)
else:
speculative_draft_tensor_parallel_size = (
target_parallel_config.tensor_parallel_size
)
elif speculative_draft_tensor_parallel_size not in (
1,
target_parallel_config.tensor_parallel_size,
):
raise ValueError(
f"{speculative_draft_tensor_parallel_size=} cannot be "
f"other value than 1 or target model tensor_parallel_size"
)
return speculative_draft_tensor_parallel_size
@staticmethod
def create_draft_parallel_config(
    target_parallel_config: ParallelConfig,
    speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
    """Build the parallel config used by the draft worker.

    The listed settings are copied from the target parallel config; only
    the tensor-parallel size is replaced by the draft-specific value.
    """
    target = target_parallel_config
    return ParallelConfig(
        pipeline_parallel_size=target.pipeline_parallel_size,
        tensor_parallel_size=speculative_draft_tensor_parallel_size,
        distributed_executor_backend=target.distributed_executor_backend,
        max_parallel_loading_workers=target.max_parallel_loading_workers,
        disable_custom_all_reduce=target.disable_custom_all_reduce,
        ray_workers_use_nsight=target.ray_workers_use_nsight,
        placement_group=target.placement_group,
    )
@model_validator(mode="after")
def _verify_args(self) -> Self:
if self.num_speculative_tokens is None:
raise ValueError(
"num_speculative_tokens must be provided with "
"speculative model unless the draft model config contains an "
"n_predict parameter."
)
if self.num_speculative_tokens <= 0:
raise ValueError(
"Expected num_speculative_tokens to be greater "
f"than zero ({self.num_speculative_tokens})."
)
if self.draft_model_config:
self.draft_model_config.verify_with_parallel_config(
self.draft_parallel_config
)
if self.disable_by_batch_size is not None and self.disable_by_batch_size < 2:
raise ValueError(
"Expect the batch size threshold of disabling "
"speculative decoding is > 1, but got "
f"{self.disable_by_batch_size=}"
)
eagle3_target_supported = ["llama", "qwen", "minicpm", "gpt_oss"]
if (
self.method == "eagle3"
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
)
return self
def use_eagle(self) -> bool:
return self.method in ("eagle", "eagle3", "mtp")
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
# Settings for audio pre-processing in speech-to-text models. The string
# literal after each field documents that field; keep each one directly
# after the assignment it describes.
@config
@dataclass
class SpeechToTextConfig:
"""Configuration for speech-to-text models."""
sample_rate: float = 16_000
"""Sample rate (Hz) to resample input audio to. Most speech models expect
16kHz audio input. The input audio will be automatically resampled to this
rate before processing."""
max_audio_clip_s: int = 30
"""Maximum duration in seconds for a single audio clip without chunking.
Audio longer than this will be split into smaller chunks if
`allow_audio_chunking` evaluates to True, otherwise it will be rejected."""
overlap_chunk_second: int = 1
"""Overlap duration in seconds between consecutive audio chunks when
splitting long audio. This helps maintain context across chunk boundaries
and improves transcription quality at split points."""
# None is the switch that disables chunking (see allow_audio_chunking below).
min_energy_split_window_size: int | None = 1600
"""Window size in samples for finding low-energy (quiet) regions to split
audio chunks. The algorithm looks for the quietest moment within this
window to minimize cutting through speech. Default 1600 samples ≈ 100ms
at 16kHz. If None, no chunking will be done."""
# Derived flag: chunking is allowed exactly when a split window size is set.
@property
def allow_audio_chunking(self) -> bool:
return self.min_energy_split_window_size is not None

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
# Names accepted for StructuredOutputsConfig.backend. "auto" is the default.
StructuredOutputsBackend = Literal[
"auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
]
@config
@dataclass
class StructuredOutputsConfig:
    """Dataclass which contains structured outputs config for the engine."""

    backend: StructuredOutputsBackend = "auto"
    """Which engine will be used for structured outputs (e.g. JSON schema,
    regex, etc) by default. With "auto", we will make opinionated choices
    based on request contents and what the backend libraries currently support,
    so the behavior is subject to change in each release."""
    disable_fallback: bool = False
    """If `True`, vLLM will not fallback to a different backend on error."""
    disable_any_whitespace: bool = False
    """If `True`, json output will always be compact without any whitespace.
    If `False`, the model may generate whitespace between JSON fields,
    which is still valid JSON. This is only supported for xgrammar
    and guidance backends."""
    disable_additional_properties: bool = False
    """If `True`, the `guidance` backend will not use `additionalProperties`
    in the JSON schema. This is only supported for the `guidance` backend and
    is used to better align its behaviour with `outlines` and `xgrammar`."""
    reasoning_parser: str = ""
    """Select the reasoning parser depending on the model that you're using.
    This is used to parse the reasoning content into OpenAI API format."""
    reasoning_parser_plugin: str = ""
    """Path to a dynamically reasoning parser plugin that can be dynamically
    loaded and registered."""
    enable_in_reasoning: bool = False
    """Whether to use structured input for reasoning."""

    def compute_hash(self) -> str:
        """Return a hex digest of the settings that change the computation graph.

        WARNING: whenever a new field is added to this config, add it to
        the factor list below if it affects the computation graph.

        The digest must uniquely identify everything that shapes the graph
        from input ids/embeddings to the final hidden states, and nothing
        before the input ids/embeddings or after the final hidden states.
        """
        # Nothing in this config affects the computation graph, so the
        # factor list is intentionally empty.
        factors: list[Any] = []
        encoded_factors = str(factors).encode()
        return safe_hash(encoded_factors, usedforsecurity=False).hexdigest()

    @model_validator(mode="after")
    def _validate_structured_output_config(self) -> Self:
        """Reject option/backend combinations that are not supported.

        Raises:
            ValueError: if a backend-specific flag is set for a backend
                that does not support it.
        """
        if self.disable_any_whitespace:
            whitespace_capable = ("xgrammar", "guidance")
            if self.backend not in whitespace_capable:
                raise ValueError(
                    "disable_any_whitespace is only supported for "
                    "xgrammar and guidance backends."
                )
        if self.disable_additional_properties:
            if self.backend != "guidance":
                raise ValueError(
                    "disable_additional_properties is only supported "
                    "for the guidance backend."
                )
        return self

370
vllm/config/utils.py Normal file
View File

@@ -0,0 +1,370 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for vLLM config dataclasses."""
import ast
import enum
import hashlib
import inspect
import json
import pathlib
import textwrap
from collections.abc import Iterable, Mapping, Sequence, Set
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
import regex as re
import torch
from pydantic.fields import FieldInfo
from typing_extensions import runtime_checkable
from vllm.logger import init_logger
logger = init_logger(__name__)
# DataclassInstance is imported only under TYPE_CHECKING; at runtime the name
# is aliased to Any so the annotations below still evaluate.
if TYPE_CHECKING:
from _typeshed import DataclassInstance
else:
DataclassInstance = Any
# ConfigType is the *class* of a dataclass, not an instance of one.
ConfigType = type[DataclassInstance]
# Type variable bound to ConfigType, used by helpers in this module.
ConfigT = TypeVar("ConfigT", bound=ConfigType)
# Marker decorator: at runtime it returns `cls` unchanged. The guarantees
# described in the docstring are not enforced here; per the docstring they
# are checked by tools/pre_commit/validate_config.py during pre-commit.
def config(cls: ConfigT) -> ConfigT:
"""
A decorator that ensures all fields in a dataclass have default values
and that each field has a docstring.
If a `ConfigT` is used as a CLI argument itself, the `type` keyword argument
provided by `get_kwargs` will be
`pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the
`cli_arg` as a JSON string which gets validated by `pydantic`.
Config validation is performed by the tools/pre_commit/validate_config.py
script, which is invoked during the pre-commit checks.
"""
# Identity: the class is handed back untouched.
return cls
def get_field(cls: ConfigType, name: str) -> Field:
    """Return a fresh dataclass field carrying the default of ``cls.name``.

    Used for getting default factory fields in `EngineArgs`. A default
    factory takes precedence over a plain default. A ``pydantic``
    ``FieldInfo`` default is unwrapped to its own factory or default.

    Raises:
        TypeError: if ``cls`` is not a dataclass.
        ValueError: if the field does not exist, or has neither a default
            value nor a default factory.
    """
    if not is_dataclass(cls):
        raise TypeError("The given class is not a dataclass.")

    named_field = next((f for f in fields(cls) if f.name == name), None)
    if named_field is None:
        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")

    factory = named_field.default_factory
    if factory is not MISSING:
        return field(default_factory=factory)

    value = named_field.default
    if value is MISSING:
        raise ValueError(
            f"{cls.__name__}.{name} must have a default value or default factory."
        )

    if isinstance(value, FieldInfo):
        # Handle pydantic.Field defaults
        if value.default_factory is not None:
            return field(default_factory=value.default_factory)
        value = value.default
    return field(default=value)
def getattr_iter(
    object: object, names: Iterable[str], default: Any, warn: bool = False
) -> Any:
    """Return the first attribute of ``object`` found among ``names``.

    This is useful when fetching attributes from arbitrary
    `transformers.PretrainedConfig` instances, where one setting may be
    exposed under several names.

    Args:
        object: Object to read the attribute from.
        names: Candidate attribute names, in priority order. Any iterable
            is accepted, including generators and sets.
        default: Value returned when none of the names exist.
        warn: If True, the first name is treated as the preferred one and
            a warning is logged once when a later (deprecated) name is the
            one that matched.

    Returns:
        The value of the first existing attribute, or ``default``.
    """
    # Remember the preferred name while iterating rather than indexing
    # `names[0]`, which fails for non-subscriptable iterables.
    preferred_name: str | None = None
    for i, name in enumerate(names):
        if i == 0:
            preferred_name = name
        if hasattr(object, name):
            if warn and i > 0:
                logger.warning_once(
                    "%s contains a deprecated attribute name '%s'. "
                    "Please use the preferred attribute name '%s' instead.",
                    type(object).__name__,
                    name,
                    preferred_name,
                )
            return getattr(object, name)
    return default
def contains_object_print(text: str) -> bool:
    """
    Tell whether the text looks like it embeds a printed Python object,
    i.e. contains a substring of the form "at 0xFFFFFFF>".

    The address part is 0x followed by 2-16 hex chars (there's a max of 16
    on a 64-bit system).

    Args:
        text (str): The text to check

    Returns:
        result (bool): `True` if a match is found, `False` otherwise.
    """
    return re.search(r"at 0x[a-fA-F0-9]{2,16}>", text) is not None
def assert_hashable(text: str) -> bool:
    """Return True if ``text`` is safe to hash.

    Raises:
        AssertionError: if ``contains_object_print`` finds something that
            looks like a printed Python object (an ``at 0x...>`` address),
            since such text may embed Python object ids.
    """
    looks_like_object_repr = contains_object_print(text)
    if looks_like_object_repr:
        raise AssertionError(
            f"vLLM tried to hash some configs that may have Python objects ids "
            f"in them. This is a bug, please file an issue. "
            f"Text being hashed: {text}"
        )
    return True
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
"""
Get any docstrings placed after attribute assignments in a class body.
https://davidism.com/mit-license/
"""
cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
if not isinstance(cls_node, ast.ClassDef):
raise TypeError("Given object was not a class.")
out = {}
# Consider each pair of nodes.
for a, b in pairwise(cls_node.body):
# Must be an assignment then a constant string.
if (
not isinstance(a, (ast.Assign, ast.AnnAssign))
or not isinstance(b, ast.Expr)
or not isinstance(b.value, ast.Constant)
or not isinstance(b.value.value, str)
):
continue
doc = inspect.cleandoc(b.value.value)
# An assignment can have multiple targets (a = b = v), but an
# annotated assignment only has one target.
targets = a.targets if isinstance(a, ast.Assign) else [a.target]
for target in targets:
# Must be assigning to a plain name.
if not isinstance(target, ast.Name):
continue
out[target.id] = doc
return out
def is_init_field(cls: ConfigType, name: str) -> bool:
    """Return the ``init`` flag of the dataclass field called ``name``.

    Raises:
        StopIteration: if ``cls`` has no field with that name.
    """
    for candidate in fields(cls):
        if candidate.name == name:
            return candidate.init
    raise StopIteration
# Structural interface for objects that expose compute_hash() -> str.
# Decorated with runtime_checkable, so isinstance() checks against it work.
@runtime_checkable
class SupportsHash(Protocol):
def compute_hash(self) -> str: ...
# Structural interface for objects that expose metrics_info() -> dict[str, str].
# Not runtime_checkable: intended for static typing only, isinstance() checks
# against it are not supported.
class SupportsMetricsInfo(Protocol):
def metrics_info(self) -> dict[str, str]: ...
def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
    """Return a copy of ``config`` with ``overrides`` applied.

    When the current value of a field is a dataclass and the override is
    not, the override must be a dict and is merged into that nested
    dataclass recursively. The input ``config`` is not modified; the
    result is built with ``dataclasses.replace``.

    Raises:
        AssertionError: if a key is not an attribute of ``config``, or a
            nested dataclass is overridden with a non-dict, non-dataclass
            value.
    """
    merged = {}
    for field_name, value in overrides.items():
        assert hasattr(config, field_name), (
            f"{type(config)} has no field `{field_name}`"
        )
        current_value = getattr(config, field_name)
        merge_into_nested = is_dataclass(current_value) and not is_dataclass(value)
        if merge_into_nested:
            assert isinstance(value, dict), (
                f"Overrides to {type(config)}.{field_name} must be a dict"
                f" or {type(current_value)}, but got {type(value)}"
            )
            value = update_config(
                current_value,  # type: ignore[type-var]
                value,
            )
        merged[field_name] = value
    return replace(config, **merged)
def normalize_value(x):
    """Convert ``x`` into a stable, JSON-serializable form for hashing.

    Checks run in a fixed order and the first match wins: primitives, enums,
    classes, objects exposing ``uuid()``, other callables (rejected),
    ``torch.dtype``, bytes, paths, dataclasses, generic containers
    (Mapping / Set / Sequence, handled recursively), and finally objects
    exposing ``to_json_string()``.

    Raises:
        TypeError: If ``x`` is a function / callable instance without
            ``uuid()``, or any other type without a supported representation.
    """
    # Primitives pass through unchanged (bool is already an int subclass).
    if x is None or isinstance(x, (str, int, float)):
        return x

    # Enum members are tagged with their fully-qualified class name so that
    # e.g. an enum whose value is 1 does not collide with the plain int 1.
    if isinstance(x, enum.Enum):
        enum_cls = x.__class__
        tag = f"{enum_cls.__module__}.{enum_cls.__qualname__}"
        return (tag, normalize_value(x.value))

    # Classes are identified by "module.qualname". This is the only kind of
    # callable accepted without a uuid(); it is used by LogitsProcessor types
    # and ModelConfig.hf_overrides.
    if isinstance(x, type):
        module_name = getattr(x, "__module__", "")
        qual_name = getattr(x, "__qualname__", getattr(x, "__name__", ""))
        dotted = ".".join(part for part in (module_name, qual_name) if part)
        return dotted if dotted else repr(x)

    # Objects that provide uuid() are represented by it, even when they are
    # callable instances (e.g., InductorPass wrappers).
    uuid_fn = getattr(x, "uuid", None)
    if callable(uuid_fn):
        return uuid_fn()

    # Remaining callables (functions, lambdas, methods, callable instances)
    # have no stable identity and are rejected.
    if callable(x):
        raise TypeError("normalize_value: function or callable instance unsupported")

    # torch.dtype is represented by its string form
    # (torch.float64 -> "torch.float64").
    if isinstance(x, torch.dtype):
        return str(x)

    # Raw bytes are hex-encoded.
    if isinstance(x, (bytes, bytearray)):
        return x.hex()

    # Paths are canonicalized when possible; otherwise used verbatim.
    if isinstance(x, pathlib.Path):
        try:
            canonical = x.expanduser().resolve()
        except Exception:
            return str(x)
        return str(canonical)

    # Dataclass instances become (fully-qualified name, fields sorted by name).
    if is_dataclass(x):
        dc_cls = x.__class__
        type_fqn = f"{dc_cls.__module__}.{dc_cls.__qualname__}"
        ordered_fields = sorted(fields(x), key=lambda f: f.name)
        members = tuple(
            (f.name, normalize_value(getattr(x, f.name))) for f in ordered_fields
        )
        return (type_fqn, members)

    # Generic containers, normalized recursively.
    if isinstance(x, Mapping):
        entries = [(str(key), normalize_value(val)) for key, val in x.items()]
        entries.sort()
        return tuple(entries)
    if isinstance(x, Set):
        # Elements are compared through their repr so mixed types still sort.
        rendered = [repr(normalize_value(member)) for member in x]
        rendered.sort()
        return tuple(rendered)
    if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
        return tuple(normalize_value(member) for member in x)

    # PretrainedConfig-style objects: use their JSON dump.
    to_json = getattr(x, "to_json_string", None)
    if callable(to_json):
        return to_json()

    # Anything else (modules, generators, open files, ...) has no stable
    # representation. Fail loudly rather than under-hash; to support a new
    # type, give it uuid() or to_json_string(), or extend this function.
    raise TypeError(
        f"normalize_value: unsupported type '{type(x).__name__}'. "
        "Ensure config values use supported primitives/containers or add a "
        "stable representation for this type."
    )
def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]:
    """Collect the normalized field values of ``config`` used for hashing.

    Every dataclass field whose name is not in ``ignored_factors`` is read
    from ``config`` (a missing attribute is read as ``None``) and passed
    through ``normalize_value``.

    Raises:
        TypeError: If a field value cannot be normalized; the message names
            the offending field and the original error is chained.
    """
    selected_names = [
        dc_field.name
        for dc_field in fields(config)
        if dc_field.name not in ignored_factors
    ]

    factors: dict[str, object] = {}
    for factor in selected_names:
        value = getattr(config, factor, None)
        try:
            normalized = normalize_value(value)
        except TypeError as e:
            raise TypeError(
                f"get_hash_factors: unsupported type for key '{factor}' "
                f"({type(value).__name__})"
            ) from e
        factors[factor] = normalized
    return factors
def hash_factors(items: dict[str, object]) -> str:
    """Return the SHA-256 hex digest of ``items`` serialized as JSON.

    Keys are sorted during serialization, so the digest does not depend on
    the insertion order of the dict.
    """
    canonical_json = json.dumps(items, sort_keys=True)
    digest = hashlib.sha256(canonical_json.encode())
    return digest.hexdigest()
def handle_deprecated(
    config: ConfigT,
    old_name: str,
    new_name_or_names: str | list[str],
    removal_version: str,
) -> None:
    """Forward a deprecated attribute's value to its replacement(s).

    If ``config.<old_name>`` is ``None`` nothing happens. Otherwise a
    deprecation warning is logged and the old value is assigned to every
    attribute named in ``new_name_or_names`` (a single name or a list).
    """
    deprecated_value = getattr(config, old_name)
    if deprecated_value is None:
        return

    # Accept either one replacement name or several.
    replacement_names = (
        [new_name_or_names]
        if isinstance(new_name_or_names, str)
        else new_name_or_names
    )

    logger.warning(
        f"{old_name} is deprecated and will be removed in {removal_version}. "
        f"Use {', '.join(replacement_names)} instead."
    )

    for replacement in replacement_names:
        setattr(config, replacement, deprecated_value)
@dataclass
class Range:
    """
    A closed interval of numbers.

    Both ``start`` and ``end`` belong to the range.
    """

    start: int
    end: int

    def is_single_size(self) -> bool:
        """Return True when the range covers exactly one value."""
        return self.start == self.end

    def __contains__(self, size: int) -> bool:
        """Support ``size in range``; both endpoints are included."""
        return self.start <= size and size <= self.end

    def __eq__(self, other: object) -> bool:
        """Two ranges are equal when both endpoints match.

        Any object that is not a ``Range`` compares unequal.
        """
        if isinstance(other, Range):
            return self.start == other.start and self.end == other.end
        return False

    def __hash__(self) -> int:
        """Hash on the ``(start, end)`` pair, consistent with ``__eq__``."""
        return hash((self.start, self.end))

    def __str__(self) -> str:
        """Render as ``(start, end)``."""
        return f"({self.start}, {self.end})"

    def __repr__(self) -> str:
        """Use the same text as ``__str__``."""
        return str(self)

1434
vllm/config/vllm.py Normal file

File diff suppressed because it is too large Load Diff