Add minimal vLLM 0.16.1 build repo for BI-V150

commit d69657327e
Date: 2026-04-18 10:56:22 +08:00
1895 changed files with 615301 additions and 0 deletions

vllm/config/__init__.py (new file, 130 lines)

@@ -0,0 +1,130 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.config.attention import AttentionConfig
from vllm.config.cache import CacheConfig
from vllm.config.compilation import (
CompilationConfig,
CompilationMode,
CUDAGraphMode,
PassConfig,
)
from vllm.config.device import DeviceConfig
from vllm.config.ec_transfer import ECTransferConfig
from vllm.config.kernel import KernelConfig
from vllm.config.kv_events import KVEventsConfig
from vllm.config.kv_transfer import KVTransferConfig
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.config.model import (
ModelConfig,
iter_architecture_defaults,
str_dtype_to_torch_dtype,
try_match_architecture_defaults,
)
from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig
from vllm.config.offload import (
OffloadBackend,
OffloadConfig,
PrefetchOffloadConfig,
UVAOffloadConfig,
)
from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.profiler import ProfilerConfig
from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
from vllm.config.structured_outputs import StructuredOutputsConfig
from vllm.config.utils import (
ConfigType,
SupportsMetricsInfo,
config,
get_attr_docs,
is_init_field,
replace,
update_config,
)
from vllm.config.vllm import (
VllmConfig,
get_cached_compilation_config,
get_current_vllm_config,
get_current_vllm_config_or_none,
get_layers_from_vllm_config,
set_current_vllm_config,
)
from vllm.config.weight_transfer import WeightTransferConfig
# __all__ should only contain classes and functions.
# Types and globals should be imported from their respective modules.
__all__ = [
# From vllm.config.attention
"AttentionConfig",
# From vllm.config.cache
"CacheConfig",
# From vllm.config.compilation
"CompilationConfig",
"CompilationMode",
"CUDAGraphMode",
"PassConfig",
# From vllm.config.device
"DeviceConfig",
# From vllm.config.ec_transfer
"ECTransferConfig",
# From vllm.config.kernel
"KernelConfig",
# From vllm.config.kv_events
"KVEventsConfig",
# From vllm.config.kv_transfer
"KVTransferConfig",
# From vllm.config.load
"LoadConfig",
# From vllm.config.lora
"LoRAConfig",
# From vllm.config.model
"ModelConfig",
"iter_architecture_defaults",
"str_dtype_to_torch_dtype",
"try_match_architecture_defaults",
# From vllm.config.multimodal
"MultiModalConfig",
# From vllm.config.observability
"ObservabilityConfig",
# From vllm.config.offload
"OffloadBackend",
"OffloadConfig",
"PrefetchOffloadConfig",
"UVAOffloadConfig",
# From vllm.config.parallel
"EPLBConfig",
"ParallelConfig",
# From vllm.config.pooler
"PoolerConfig",
# From vllm.config.scheduler
"SchedulerConfig",
# From vllm.config.speculative
"SpeculativeConfig",
# From vllm.config.speech_to_text
"SpeechToTextConfig",
# From vllm.config.structured_outputs
"StructuredOutputsConfig",
# From vllm.config.profiler
"ProfilerConfig",
# From vllm.config.utils
"ConfigType",
"SupportsMetricsInfo",
"config",
"get_attr_docs",
"is_init_field",
"replace",
"update_config",
# From vllm.config.vllm
"VllmConfig",
"get_cached_compilation_config",
"get_current_vllm_config",
"get_current_vllm_config_or_none",
"set_current_vllm_config",
"get_layers_from_vllm_config",
"WeightTransferConfig",
]
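A minimal import sketch (assuming the package is installed as built from this repo): everything listed in `__all__` above is re-exported from the package root, so downstream code can import the config classes directly.

from vllm.config import CacheConfig, ModelConfig, VllmConfig  # re-exported names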

vllm/config/attention.py (new file, 69 lines)

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import field_validator
from vllm.config.utils import config
from vllm.v1.attention.backends.registry import AttentionBackendEnum
@config
class AttentionConfig:
"""Configuration for attention mechanisms in vLLM."""
backend: AttentionBackendEnum | None = None
"""Attention backend to use. If None, will be selected automatically."""
flash_attn_version: Literal[2, 3] | None = None
"""Force vllm to use a specific flash-attention version (2 or 3).
Only valid when using the flash-attention backend."""
use_prefill_decode_attention: bool = False
"""Use separate prefill and decode kernels for attention instead of
the unified triton kernel."""
flash_attn_max_num_splits_for_cuda_graph: int = 32
"""Flash Attention max number splits for cuda graph decode."""
use_cudnn_prefill: bool = False
"""Whether to use cudnn prefill."""
use_trtllm_ragged_deepseek_prefill: bool = True
"""Whether to use TRTLLM ragged deepseek prefill."""
use_trtllm_attention: bool | None = None
"""If set to True/False, use or don't use the TRTLLM attention backend
in flashinfer. If None, auto-detect the attention backend in flashinfer."""
disable_flashinfer_prefill: bool = False
"""Whether to disable flashinfer prefill."""
disable_flashinfer_q_quantization: bool = False
"""If set, when using fp8 kv, do not quantize Q to fp8."""
use_prefill_query_quantization: bool = False
"""If set, quantize query for attention in prefill."""
def compute_hash(self) -> str:
"""
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
from vllm.config.utils import get_hash_factors, hash_factors
ignored_factors: list[str] = []
factors = get_hash_factors(self, ignored_factors)
return hash_factors(factors)
@field_validator("backend", mode="before")
@classmethod
def validate_backend_before(cls, value: Any) -> Any:
"""Enable parsing of the `backend` enum type from string."""
if isinstance(value, str):
return AttentionBackendEnum[value.upper()]
return value
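A small usage sketch for the backend validator above, assuming the attention registry defines a FLASH_ATTN member (adjust to a backend available on your platform):

from vllm.config import AttentionConfig

cfg = AttentionConfig(backend="flash_attn")   # validator upper-cases and resolves the enum
print(cfg.backend)                            # AttentionBackendEnum.FLASH_ATTN
print(cfg.compute_hash())                     # stable hash over the non-ignored fields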

vllm/config/cache.py (new file, 250 lines)

@@ -0,0 +1,250 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal
from pydantic import Field, SkipValidation, field_validator
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import format_gib, get_cpu_memory
if TYPE_CHECKING:
from vllm.config.parallel import ParallelConfig
else:
ParallelConfig = Any
logger = init_logger(__name__)
BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
CacheDType = Literal[
"auto",
"bfloat16",
"fp8",
"fp8_e4m3",
"fp8_e5m2",
"fp8_inc",
"fp8_ds_mla",
]
MambaDType = Literal["auto", "float32", "float16"]
MambaCacheMode = Literal["all", "align", "none"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
KVOffloadingBackend = Literal["native", "lmcache"]
@config
class CacheConfig:
"""Configuration for the KV cache."""
block_size: SkipValidation[BlockSize] = None # type: ignore[assignment]
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.
This config has no static default. If left unspecified by the user, it will
be set in `Platform.check_and_update_config()` based on the current
platform."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
utilization. If unspecified, will use the default value of 0.9. This is a
per-instance limit, and only applies to the current vLLM instance. It does
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
swap_space: float = Field(default=4, ge=0)
"""Size of the CPU swap space per GPU (in GiB)."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).
Some models (namely DeepSeekV3.2) default to fp8; set this to bfloat16 to use
bfloat16 instead. bfloat16 is an invalid option for models that do not
default to fp8.
"""
is_attention_free: bool = False
"""Whether the model is attention-free. This is primarily set in
`ModelConfig` and that value should be manually duplicated here."""
num_gpu_blocks_override: int | None = None
"""Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
if specified. Does nothing if `None`. Used for testing preemption."""
sliding_window: int | None = None
"""Sliding window size for the KV cache. This is primarily set in
`ModelConfig` and that value should be manually duplicated here."""
enable_prefix_caching: bool = True
"""Whether to enable prefix caching."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n
- "sha256" uses Pickle for object serialization before hashing. This is the
current default, as SHA256 is the most secure choice to avoid potential
hash collisions.\n
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256.\n
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered
cryptographically secure theoretically increases the risk of hash collisions,
which can cause undefined behavior or even leak private information in
multi-tenant environments. Even if collisions are still very unlikely, it is
important to consider your security risk tolerance against the performance
benefits before turning this on.\n
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
reproducible hashing. Requires the optional ``xxhash`` package."""
cpu_offload_gb: float = Field(default=0, ge=0)
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to
increase the GPU memory size. For example, if you have one 24 GB GPU and
set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
load a 13B model with BF16 weight, which requires at least 26GB GPU memory.
Note that this requires fast CPU-GPU interconnect, as part of the model is
loaded from CPU memory to GPU memory on the fly in each model forward pass.
DEPRECATED: This field is deprecated and will be removed in v0.16.
Please use OffloadConfig.uva.cpu_offload_gb instead.
"""
cpu_offload_params: set[str] = Field(default_factory=set)
"""The set of parameter name segments to target for CPU offloading.
DEPRECATED: This field is deprecated and will be removed in v0.16.
Please use OffloadConfig.uva.cpu_offload_params instead.
"""
calculate_kv_scales: bool = False
"""This enables dynamic calculation of `k_scale` and `v_scale` when
kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
checkpoint if available. Otherwise, the scales will default to 1.0."""
cpu_kvcache_space_bytes: int | None = None
"""(CPU backend only) CPU key-value cache space."""
mamba_page_size_padded: int | None = None
""" Optional override for mamba page size; used by hybrid mamba/attention
models to ensure exact alignment with attention page size."""
mamba_block_size: int | None = Field(default=None, gt=0)
"""Size of a contiguous cache block in number of tokens for mamba cache.
Can be set only when prefix caching is enabled.
Value must be a multiple of 8 to align with causal_conv1d kernel."""
mamba_cache_dtype: MambaDType = "auto"
"""The data type to use for the Mamba cache (both the conv as well as the
ssm state). If set to 'auto', the data type will be inferred from the model
config."""
mamba_ssm_cache_dtype: MambaDType = "auto"
"""The data type to use for the Mamba cache (ssm state only, conv state will
still be controlled by mamba_cache_dtype). If set to 'auto', the data type
for the ssm state will be determined by mamba_cache_dtype."""
mamba_cache_mode: MambaCacheMode = "none"
"""The cache strategy for Mamba layers.
- "none": set when prefix caching is disabled.
- "all": cache the mamba state of all tokens at position i * block_size. This is
the default behavior (for models that support it) when prefix caching is
enabled.
- "align": only cache the mamba state of the last token of each scheduler step and
when the token is at position i * block_size.
"""
# Will be set after profiling.
num_gpu_blocks: int | None = field(default=None, init=False)
"""The number of blocks to allocate for GPU memory."""
num_cpu_blocks: int | None = field(default=None, init=False)
"""The number of blocks to allocate for CPU memory."""
kv_sharing_fast_prefill: bool = False
"""This feature is work in progress and no prefill optimization takes place
with this flag enabled currently.
In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
some layers can skip tokens corresponding to prefill. This flag enables
attention metadata for eligible layers to be overridden with metadata
necessary for implementing this optimization in some models (e.g. Gemma3n)
"""
kv_cache_memory_bytes: int | None = None
"""Size of KV Cache per GPU in bytes. By default, this is set to None
and vllm can automatically infer the kv cache size based on
gpu_memory_utilization. However, users may want to manually specify
the kv cache memory size. kv_cache_memory_bytes allows finer-grained
control of how much memory gets used compared with
gpu_memory_utilization. Note that kv_cache_memory_bytes
(when not None) takes precedence over gpu_memory_utilization."""
kv_offloading_size: float | None = None
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is
the total buffer size summed across all TP ranks. By default, this is set
to None, which means no KV offloading is enabled. When set, vLLM will
enable KV cache offloading to CPU using the kv_offloading_backend."""
kv_offloading_backend: KVOffloadingBackend = "native"
"""The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache'.
KV offloading is only activated when kv_offloading_size is set."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
ignored_factors = {
# Runtime/derived knobs that don't affect compiled graph shape
"gpu_memory_utilization",
"swap_space",
"is_attention_free",
"num_gpu_blocks_override",
"enable_prefix_caching",
"prefix_caching_hash_algo",
"cpu_kvcache_space_bytes",
"mamba_page_size_padded",
# Post-init/derived counters
"num_gpu_blocks",
"num_cpu_blocks",
# WIP feature toggle not impacting compiled graph shape
"kv_sharing_fast_prefill",
}
from vllm.config.utils import get_hash_factors, hash_factors
factors = get_hash_factors(self, ignored_factors)
return hash_factors(factors)
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}
@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
if cache_dtype.startswith("fp8"):
logger.info(
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
"Meanwhile, it may cause accuracy drop without a proper "
"scaling factor."
)
return cache_dtype
def verify_with_parallel_config(
self,
parallel_config: ParallelConfig,
) -> None:
swap_space_bytes = math.ceil(self.swap_space * GiB_bytes)
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = swap_space_bytes * num_gpus_per_node
msg = (
f"{format_gib(cpu_memory_usage)} GiB out of the "
f"{format_gib(total_cpu_memory)} GiB total CPU memory "
"is allocated for the swap space."
)
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warning("Possibly too large swap space. %s", msg)
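To make the swap-space check in verify_with_parallel_config concrete, a worked example with illustrative numbers (4 GiB swap per GPU, tensor parallel size 8, 64 GiB of host memory):

swap_space_gib = 4            # CacheConfig.swap_space
tensor_parallel_size = 8      # ParallelConfig.tensor_parallel_size
total_cpu_memory_gib = 64     # what get_cpu_memory() might report

cpu_memory_usage = swap_space_gib * tensor_parallel_size   # 32 GiB
assert cpu_memory_usage <= 0.7 * total_cpu_memory_gib      # 44.8 GiB, so no ValueError
assert cpu_memory_usage > 0.4 * total_cpu_memory_gib       # 25.6 GiB, so a warning is logged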

vllm/config/compilation.py (new file, 1196 lines)

File diff suppressed because it is too large.

vllm/config/device.py (new file, 73 lines)

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import field
from typing import Any, Literal
import torch
from pydantic import ConfigDict, SkipValidation
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
"""Configuration for the device to use for vLLM execution."""
device: SkipValidation[Device | torch.device | None] = "auto"
"""Device type for vLLM execution.
This parameter is deprecated and will be
removed in a future release.
It will now be set automatically based
on the current platform."""
device_type: str = field(init=False)
"""Device type from the current platform. This is set in
`__post_init__`."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# the device/platform information will be summarized
# by torch/vllm automatically.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
if self.device == "auto":
# Automated device type detection
from vllm.platforms import current_platform
self.device_type = current_platform.device_type
if not self.device_type:
raise RuntimeError(
"Failed to infer device type, please set "
"the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
"to turn on verbose logging to help debug the issue."
)
else:
# Device type is assigned explicitly
if isinstance(self.device, str):
self.device_type = self.device
elif isinstance(self.device, torch.device):
self.device_type = self.device.type
# Some device types require processing inputs on CPU
if self.device_type in ["tpu"]:
self.device = None
else:
# Set device with device type
self.device = torch.device(self.device_type)
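A sketch of explicit device selection: when a string is given, __post_init__ skips platform auto-detection and builds a torch.device from it (assumes torch with CUDA support is importable).

import torch
from vllm.config import DeviceConfig

cfg = DeviceConfig(device="cuda")
print(cfg.device_type)                     # "cuda"
assert cfg.device == torch.device("cuda")  # set from the device type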

vllm/config/ec_transfer.py (new file, 107 lines)

@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import uuid
from dataclasses import field
from typing import Any, Literal, get_args
from vllm.config.utils import config
ECProducer = Literal["ec_producer", "ec_both"]
ECConsumer = Literal["ec_consumer", "ec_both"]
ECRole = Literal[ECProducer, ECConsumer]
@config
class ECTransferConfig:
"""Configuration for distributed EC cache transfer."""
ec_connector: str | None = None
"""The EC connector for vLLM to transmit EC caches between vLLM instances.
"""
engine_id: str | None = None
"""The engine id for EC transfers."""
ec_buffer_device: str | None = "cuda"
"""The device used by ec connector to buffer the EC cache.
Currently only support 'cuda'."""
ec_buffer_size: float = 1e9
"""The buffer size for TorchDistributedConnector. Measured in number of
bytes. Recommended value: 1e9 (about 1GB)."""
ec_role: ECRole | None = None
"""Whether this vLLM instance produces, consumes EC cache, or both. Choices
are 'ec_producer', 'ec_consumer', 'ec_both'."""
ec_rank: int | None = None
"""The rank of this vLLM instance in the EC cache transfer. Typical value:
0 for encoder, 1 for pd instance.
Currently only 1P1D is supported."""
ec_parallel_size: int = 1
"""The number of parallel instances for EC cache transfer. For
PyNcclConnector, this should be 2."""
ec_ip: str = "127.0.0.1"
"""The EC connector ip, used to build distributed connection."""
ec_port: int = 14579
"""The EC connector port, used to build distributed connection."""
ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
"""any extra config that the connector may need."""
ec_connector_module_path: str | None = None
"""The Python module path to dynamically load the EC connector from.
Only supported in V1."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:
if self.engine_id is None:
self.engine_id = str(uuid.uuid4())
if self.ec_role is not None and self.ec_role not in get_args(ECRole):
raise ValueError(
f"Unsupported ec_role: {self.ec_role}. "
f"Supported roles are {get_args(ECRole)}"
)
if self.ec_connector is not None and self.ec_role is None:
raise ValueError(
"Please specify ec_role when ec_connector "
f"is set, supported roles are {get_args(ECRole)}"
)
@property
def is_ec_transfer_instance(self) -> bool:
return self.ec_connector is not None and self.ec_role in get_args(ECRole)
@property
def is_ec_producer(self) -> bool:
return self.ec_connector is not None and self.ec_role in get_args(ECProducer)
@property
def is_ec_consumer(self) -> bool:
return self.ec_connector is not None and self.ec_role in get_args(ECConsumer)
def get_from_extra_config(self, key, default) -> Any:
return self.ec_connector_extra_config.get(key, default)
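A hypothetical encoder plus prefill/decode pairing to illustrate the role checks; the connector name here is illustrative, not a real connector:

from vllm.config import ECTransferConfig

encoder = ECTransferConfig(ec_connector="SomeECConnector", ec_role="ec_producer", ec_rank=0)
pd = ECTransferConfig(ec_connector="SomeECConnector", ec_role="ec_consumer", ec_rank=1)
assert encoder.is_ec_producer and pd.is_ec_consumer
# Setting ec_connector without ec_role would raise ValueError in __post_init__.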

vllm/config/kernel.py (new file, 76 lines)

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from typing import Any, Literal
from pydantic import Field, field_validator
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
MoEBackend = Literal[
"auto",
"triton",
"deep_gemm",
"cutlass",
"flashinfer_trtllm",
"flashinfer_cutlass",
"flashinfer_cutedsl",
"marlin",
"aiter",
]
@config
class KernelConfig:
"""Configuration for kernel selection and warmup behavior."""
enable_flashinfer_autotune: bool = Field(default=None)
"""If True, run FlashInfer autotuning during kernel warmup."""
moe_backend: MoEBackend = "auto"
"""Backend for MoE expert computation kernels. Available options:
- "auto": Automatically select the best backend based on model and hardware\n
- "triton": Use Triton-based fused MoE kernels\n
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
- "cutlass": Use vLLM CUTLASS kernels\n
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
- "marlin": Use Marlin kernels (weight-only quantization)\n
- "aiter": Use AMD AITer kernels (ROCm only)"""
@field_validator("moe_backend", mode="before")
@classmethod
def _normalize_moe_backend(cls, value: Any) -> Any:
if isinstance(value, str):
return value.lower().replace("-", "_")
return value
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("enable_flashinfer_autotune", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialization is delayed."""
if value is None:
return value
return handler(value)
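A short sketch of the moe_backend normalization above: the before-mode validator lower-cases and replaces dashes, so mixed-case CLI input still satisfies the Literal.

from vllm.config import KernelConfig

cfg = KernelConfig(moe_backend="FlashInfer-TRTLLM")
print(cfg.moe_backend)   # "flashinfer_trtllm"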

vllm/config/kv_events.py (new file, 54 lines)

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Literal
from pydantic import Field
from vllm.config.utils import config
@config
class KVEventsConfig:
"""Configuration for KV event publishing."""
enable_kv_cache_events: bool = False
"""If True, enable KV cache events for tracking block storage and removal.
Events can be published externally by zmq using the event publisher config.
"""
publisher: Literal["null", "zmq"] = Field(default=None)
"""The publisher to use for publishing kv events. Can be "null", "zmq".
"""
endpoint: str = "tcp://*:5557"
"""The zmq endpoint to use for publishing kv events.
"""
replay_endpoint: str | None = None
"""The zmq endpoint to use for replaying kv events.
"""
buffer_steps: int = 10_000
"""The number of steps to cache for replay endpoint. Will only save
events from the last N steps for the replay endpoint.
"""
hwm: int = 100_000
"""The zmq high water mark for the event publisher. After queueing N events,
events will start dropping if the consumer is not keeping up.
"""
max_queue_size: int = 100_000
"""The maximum number of events to queue while waiting for publishing.
"""
topic: str = ""
"""The topic to use for the event publisher. Consumers can subscribe to
this topic to receive events.
"""
def __post_init__(self):
if self.publisher is None:
self.publisher = "zmq" if self.enable_kv_cache_events else "null"
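A sketch of the publisher defaulting performed in __post_init__ when the field is left unset:

from vllm.config import KVEventsConfig

print(KVEventsConfig(enable_kv_cache_events=True).publisher)   # "zmq"
print(KVEventsConfig().publisher)                              # "null"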

vllm/config/kv_transfer.py (new file, 116 lines)

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import uuid
from dataclasses import field
from typing import Any, Literal, get_args
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
KVProducer = Literal["kv_producer", "kv_both"]
KVConsumer = Literal["kv_consumer", "kv_both"]
KVRole = Literal[KVProducer, KVConsumer]
@config
class KVTransferConfig:
"""Configuration for distributed KV cache transfer."""
kv_connector: str | None = None
"""The KV connector for vLLM to transmit KV caches between vLLM instances.
"""
engine_id: str | None = None
"""The engine id for KV transfers."""
kv_buffer_device: str = "cuda"
"""The device used by kv connector to buffer the KV cache. Choices are
'cuda' and 'cpu'."""
kv_buffer_size: float = 1e9
"""The buffer size for TorchDistributedConnector. Measured in number of
bytes. Recommended value: 1e9 (about 1GB)."""
kv_role: KVRole | None = None
"""Whether this vLLM instance produces, consumes KV cache, or both. Choices
are 'kv_producer', 'kv_consumer', and 'kv_both'."""
kv_rank: int | None = None
"""The rank of this vLLM instance in the KV cache transfer. Typical value:
0 for prefill instance, 1 for decode instance.
Currently only 1P1D is supported."""
kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For
P2pNcclConnector, this should be 2."""
kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection."""
kv_port: int = 14579
"""The KV connector port, used to build distributed connection."""
kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
"""any extra config that the connector may need."""
kv_connector_module_path: str | None = None
"""The Python module path to dynamically load the KV connector from.
Only supported in V1."""
enable_permute_local_kv: bool = False
"""Experiment feature flag to enable HND to NHD KV Transfer"""
kv_load_failure_policy: Literal["recompute", "fail"] = "fail"
"""Policy for handling KV cache load failures.
'recompute': reschedule the request to recompute failed blocks
'fail': immediately fail the request with an error finish reason (default)"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:
if self.engine_id is None:
self.engine_id = str(uuid.uuid4())
if self.kv_role is not None and self.kv_role not in get_args(KVRole):
raise ValueError(
f"Unsupported kv_role: {self.kv_role}. "
f"Supported roles are {get_args(KVRole)}"
)
if self.kv_connector is not None and self.kv_role is None:
raise ValueError(
"Please specify kv_role when kv_connector "
f"is set, supported roles are {get_args(KVRole)}"
)
@property
def is_kv_transfer_instance(self) -> bool:
return self.kv_connector is not None and self.kv_role in get_args(KVRole)
@property
def is_kv_producer(self) -> bool:
return self.kv_connector is not None and self.kv_role in get_args(KVProducer)
@property
def is_kv_consumer(self) -> bool:
return self.kv_connector is not None and self.kv_role in get_args(KVConsumer)
def get_from_extra_config(self, key, default) -> Any:
return self.kv_connector_extra_config.get(key, default)
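A hypothetical 1P1D disaggregated-prefill pairing using the P2pNcclConnector mentioned above (values are illustrative, not a tested deployment):

from vllm.config import KVTransferConfig

prefill = KVTransferConfig(kv_connector="P2pNcclConnector", kv_role="kv_producer",
                           kv_rank=0, kv_parallel_size=2)
decode = KVTransferConfig(kv_connector="P2pNcclConnector", kv_role="kv_consumer",
                          kv_rank=1, kv_parallel_size=2)
assert prefill.is_kv_producer and decode.is_kv_consumer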

vllm/config/load.py (new file, 122 lines)

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any
from pydantic import Field, field_validator
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.model_executor.model_loader import LoadFormats
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
else:
LoadFormats = Any
TensorizerConfig = Any
logger = init_logger(__name__)
@config
class LoadConfig:
"""Configuration for loading the model weights."""
load_format: str | LoadFormats = "auto"
"""The format of the model weights to load:\n
- "auto" will try to load the weights in the safetensors format and fall
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
for profiling.\n
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
loading. See the Tensorize vLLM Model script in the Examples section for
more information.\n
- "runai_streamer" will load the Safetensors weights using Run:ai Model
Streamer.\n
- "runai_streamer_sharded" will load weights from pre-sharded checkpoint
files using Run:ai Model Streamer.\n
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
- "sharded_state" will load weights from pre-sharded checkpoint files,
supporting efficient loading of tensor-parallel models.\n
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
- Other custom values can be supported via plugins."""
download_dir: str | None = None
"""Directory to download and load the weights, default to the default
cache directory of Hugging Face."""
safetensors_load_strategy: str = "lazy"
"""Specifies the loading strategy for safetensors weights.
- "lazy" (default): Weights are memory-mapped from the file. This enables
on-demand loading and is highly efficient for models on local storage.
- "eager": The entire file is read into CPU memory upfront before loading.
This is recommended for models on network filesystems (e.g., Lustre, NFS)
as it avoids inefficient random reads, significantly speeding up model
initialization. However, it uses more CPU RAM.
- "torchao": Weights are loaded in upfront and then reconstructed
into torchao tensor subclasses. This is used when the checkpoint
was quantized using torchao and saved using safetensors.
Needs torchao >= 0.14.0
"""
model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format."""
device: str | None = None
"""Device to which model weights will be loaded, default to
device_config.device"""
ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
use_tqdm_on_load: bool = True
"""Whether to enable tqdm for showing progress bar when loading model
weights."""
pt_load_map_location: str | dict[str, str] = "cpu"
"""
The map location for loading a PyTorch checkpoint. To support checkpoints
that can only be loaded on certain devices (e.g. "cuda"), this is equivalent
to {"": "cuda"}. Another supported format is mapping between devices, e.g.
from GPU 1 to GPU 0: {"cuda:1": "cuda:0"}. Note that when passed from the
command line, the strings in the dictionary need to be double quoted for
JSON parsing. For more details, see the original doc for `map_location` in
https://pytorch.org/docs/stable/generated/torch.load.html
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("load_format", mode="after")
def _lowercase_load_format(cls, load_format: str) -> str:
return load_format.lower()
@field_validator("ignore_patterns", mode="after")
def _validate_ignore_patterns(
cls, ignore_patterns: list[str] | str
) -> list[str] | str:
if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
logger.info(
"Ignoring the following patterns when downloading weights: %s",
ignore_patterns,
)
return ignore_patterns
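A small sketch of the load_format lower-casing validator and the device-remapping form of pt_load_map_location:

from vllm.config import LoadConfig

cfg = LoadConfig(load_format="SafeTensors",
                 pt_load_map_location={"cuda:1": "cuda:0"})
print(cfg.load_format)   # "safetensors"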

vllm/config/lora.py (new file, 107 lines)

@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Literal
import torch
from pydantic import ConfigDict, Field, model_validator
from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.cache import CacheConfig
else:
ModelConfig = Any
CacheConfig = Any
logger = init_logger(__name__)
LoRADType = Literal["auto", "float16", "bfloat16"]
MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
LoRAExtraVocabSize = Literal[256, 512]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: MaxLoRARanks = 16
"""Max LoRA rank."""
max_loras: int = Field(default=1, ge=1)
"""Max number of LoRAs in a single batch."""
fully_sharded_loras: bool = False
"""By default, only half of the LoRA computation is sharded with tensor
parallelism. Enabling this will use the fully sharded layers. At high
sequence length, max rank or tensor parallel size, this is likely faster.
"""
max_cpu_loras: int | None = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
lora_dtype: torch.dtype | LoRADType = "auto"
"""Data type for LoRA. If auto, will default to base model dtype."""
default_mm_loras: dict[str, str] | None = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
model always expects a LoRA to be active when a given modality is present.
Note that currently, if a request provides multiple additional
modalities, each of which have their own LoRA, we do NOT apply
default_mm_loras because we currently only support one lora adapter
per prompt. When run in offline mode, the lora IDs for n modalities
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
enable_tower_connector_lora: bool = False
"""If `True`, LoRA support for the tower (vision encoder) and connector
of multimodal models will be enabled. This is an experimental feature and
currently only supports some MM models such as the Qwen VL series. The default
is False."""
specialize_active_lora: bool = False
"""Whether to construct lora kernel grid by the number of active LoRA adapters.
When set to True, separate cuda graphs will be captured for different counts
of active LoRAs (powers of 2 up to max_loras), which can improve performance
for variable LoRA usage patterns at the cost of increased startup time and
memory usage. Only takes effect when cudagraph_specialize_lora is True.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.enable_tower_connector_lora)
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@model_validator(mode="after")
def _validate_lora_config(self) -> Self:
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})."
)
return self
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
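A sketch of the max_cpu_loras fallback enforced by the model validator above:

from vllm.config import LoRAConfig

cfg = LoRAConfig(max_lora_rank=32, max_loras=4)
print(cfg.max_cpu_loras)   # 4 (defaults to max_loras)
# LoRAConfig(max_loras=4, max_cpu_loras=2) would raise ValueError.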

vllm/config/model.py (new file, 2056 lines)

File diff suppressed because it is too large.

vllm/config/model_arch.py (new file, 57 lines)

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
from vllm.logger import init_logger
logger = init_logger(__name__)
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class ModelArchitectureConfig:
"""
Configuration for the model architecture that is required by the vLLM runtime.
"""
architectures: list[str] | None
"""List of model architecture class names (e.g., ['LlamaForCausalLM']).
It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
model_type: str
"""Model type identifier (e.g., 'llama', 'gpt_oss')."""
text_model_type: str | None
"""Text model type identifier (e.g., 'llama4_text')."""
hidden_size: int
"""Hidden size of the model."""
total_num_hidden_layers: int
"""Number of hidden layers in the model."""
total_num_attention_heads: int
"""Number of attention heads in the model."""
head_size: int
"""Head dimension of the model."""
vocab_size: int
"""Vocabulary size of the model."""
total_num_kv_heads: int
"""Number of key value heads in the model."""
num_experts: int
"""Number of experts in the model."""
quantization_config: dict[str, Any] | None
"""Quantization configuration dictionary containing quantization parameters."""
is_deepseek_mla: bool
"""Whether the model is a DeepSeek MLA model."""
derived_max_model_len_and_key: tuple[float, str | None]
"""Derived maximum model length and key from the hf config."""

vllm/config/multimodal.py (new file, 281 lines)

@@ -0,0 +1,281 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping
from typing import Any, Literal, TypeAlias, TypedDict, final
from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
from vllm.v1.attention.backends.registry import AttentionBackendEnum
@dataclass
class BaseDummyOptions:
"""Base options for generating dummy data during profiling."""
count: int = Field(999, ge=0)
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
"""Options for generating dummy video data during profiling."""
num_frames: int | None = Field(None, gt=0)
width: int | None = Field(None, gt=0)
height: int | None = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
"""Options for generating dummy image data during profiling."""
width: int | None = Field(None, gt=0)
height: int | None = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
"""Options for generating dummy audio data during profiling."""
length: int | None = Field(None, gt=0)
@final
class MultiModalDummyOptionsBuiltins(TypedDict, total=False):
"""Type annotations for modality types predefined by vLLM."""
image: ImageDummyOptions
"""Options for dummy images."""
video: VideoDummyOptions
"""Options for dummy videos."""
audio: AudioDummyOptions
"""Options for dummy audios."""
MMEncoderTPMode = Literal["weights", "data"]
MMCacheType = Literal["shm", "lru"]
MMDummyOptions: TypeAlias = dict[str, BaseDummyOptions]
"""
A dictionary containing an entry for each modality type of dummy data.
The built-in modalities are defined by
[`MultiModalDummyOptionsBuiltins`][vllm.config.multimodal.MultiModalDummyOptionsBuiltins].
"""
@config
class MultiModalConfig:
"""Controls the behavior of multimodal models."""
language_model_only: bool = False
"""If True, disables all multimodal inputs by setting all modality limits to 0.
Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality."""
limit_per_prompt: MMDummyOptions = Field(default_factory=dict)
"""The maximum number of input items and options allowed per
prompt for each modality.
Defaults to 999 for each modality.
Legacy format (count only):
{"image": 16, "video": 2}
Configurable format (with options):
{"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
"image": {"count": 5, "width": 512, "height": 512}}
Mixed format (combining both):
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}}
"""
enable_mm_embeds: bool = False
"""If `True`, enables passing multimodal embeddings:
for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
for the OpenAI-compatible server, this refers to chat messages with content
`"type": "*_embeds"`.
When enabled with `--limit-mm-per-prompt` set to 0 for a modality,
precomputed embeddings skip count validation for that modality,
saving memory by not loading encoder modules while still enabling
embeddings as an input. Limits greater than 0 still apply to embeddings.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs: dict[str, object] | None = None
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained
from `transformers.AutoProcessor.from_pretrained`.
The available overrides depend on the model that is being run.
For example, for Phi-3-Vision:
`{"num_crops": 4}`."""
mm_processor_cache_gb: float = Field(default=4, ge=0)
"""The size (in GiB) of the multi-modal processor cache, which is used to
avoid re-processing past multi-modal inputs.
This cache is duplicated for each API process and engine core process,
resulting in a total memory usage of
`mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
Set to `0` to disable this cache completely (not recommended)."""
mm_processor_cache_type: MMCacheType = "lru"
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0)
"""Size limit (in MiB) for each object stored in the multi-modal processor
shared memory cache. Only effective when `mm_processor_cache_type` is
`"shm"`."""
mm_encoder_only: bool = False
"""
When enabled, skips the language component of the model.
This is usually only valid in disaggregated Encoder process.
"""
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
"""Indicates how to optimize multi-modal encoder inference using tensor
parallelism (TP).
- `"weights"`: Within the same vLLM engine, split the weights of
each layer across TP ranks. (default TP behavior)\n
- `"data"`: Within the same vLLM engine, split the batched input data
across TP ranks to process the data in parallel, while hosting
the full weights on each TP rank.
This batch-level DP is not to be confused with API request-level
DP (which is controlled by `--data-parallel-size`).
This is only supported on a per-model basis and falls back to
`"weights"` if the encoder does not support DP."""
mm_encoder_attn_backend: AttentionBackendEnum | None = None
"""Optional override for the multi-modal encoder attention backend when
using vision transformers. Accepts any value from
`vllm.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
interleave_mm_strings: bool = False
"""Enable fully interleaved support for multimodal prompts, while using
--chat-template-content-format=string."""
skip_mm_profiling: bool = False
"""When enabled, skips multimodal memory profiling and only profiles with
language backbone model during engine initialization.
This reduces engine startup time but shifts the responsibility to users for
estimating the peak memory usage of the activation of multimodal encoder and
embedding cache."""
video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0)
"""Sets pruning rate for video pruning via Efficient Video Sampling.
Value sits in range [0;1) and determines fraction of media tokens
from each video to be pruned.
"""
@field_validator("limit_per_prompt", mode="before")
@classmethod
def _validate_limit_per_prompt(
cls,
value: dict[str, int | dict[str, int]],
) -> MMDummyOptions:
out: MMDummyOptions = {}
for k, v in value.items():
# Handle legacy format where only count is specified
if isinstance(v, int):
v = {"count": v}
# Convert to the appropriate DummyOptions subclass
if k == "video":
out[k] = VideoDummyOptions(**v)
elif k == "image":
out[k] = ImageDummyOptions(**v)
elif k == "audio":
out[k] = AudioDummyOptions(**v)
else:
out[k] = BaseDummyOptions(**v)
return out
@field_validator("mm_encoder_attn_backend", mode="before")
@classmethod
def _validate_mm_encoder_attn_backend(
cls, value: str | AttentionBackendEnum | None
) -> AttentionBackendEnum | None:
if isinstance(value, str) and value.upper() == "XFORMERS":
raise ValueError(
"Attention backend 'XFORMERS' has been removed (See PR #29262 for "
"details). Please select a supported attention backend."
)
if value is None or isinstance(value, AttentionBackendEnum):
return value
assert isinstance(value, str), (
"mm_encoder_attn_backend must be a string or an AttentionBackendEnum."
)
return AttentionBackendEnum[value.upper()]
@model_validator(mode="after")
def _validate_multimodal_config(self):
if self.mm_processor_cache_type != "shm" and (
self.mm_shm_cache_max_object_size_mb
!= MultiModalConfig.mm_shm_cache_max_object_size_mb
):
raise ValueError(
"'mm_shm_cache_max_object_size_mb' should only be set when "
"'mm_processor_cache_type' is 'shm'."
)
return self
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = [
self.mm_encoder_attn_backend.name
if self.mm_encoder_attn_backend is not None
else None,
self.mm_encoder_tp_mode,
]
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def get_limit_per_prompt(self, modality: str) -> int:
"""
Get the maximum number of input items allowed per prompt
for the given modality (backward compatible).
"""
if self.language_model_only:
return 0
limit_data = self.limit_per_prompt.get(modality)
if limit_data is None:
# Unspecified modality is set to 999 by default
return 999
return limit_data.count
def merge_mm_processor_kwargs(
self,
inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
"""
Get the keyword arguments to pass to the multi-modal processor
according to the extra arguments passed during inference.
"""
kwargs = self.mm_processor_kwargs or {}
return kwargs | dict(inference_kwargs)
def is_multimodal_pruning_enabled(self):
return self.video_pruning_rate is not None and self.video_pruning_rate > 0
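A sketch of the two accepted limit_per_prompt formats and the 999 default for unspecified modalities:

from vllm.config import MultiModalConfig

cfg = MultiModalConfig(limit_per_prompt={
    "image": 16,                                                           # legacy count-only form
    "video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},  # option-dict form
})
print(cfg.get_limit_per_prompt("image"))   # 16
print(cfg.get_limit_per_prompt("audio"))   # 999 (unspecified default)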

vllm/config/observability.py (new file, 152 lines)

@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from functools import cached_property
from typing import Any, Literal, cast
from packaging.version import parse
from pydantic import Field, field_validator, model_validator
from vllm import version
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
DetailedTraceModules = Literal["model", "worker", "all"]
@config
class ObservabilityConfig:
"""Configuration for observability - metrics and tracing."""
show_hidden_metrics_for_version: str | None = None
"""Enable deprecated Prometheus metrics that have been hidden since the
specified version. For example, if a previously deprecated metric has been
hidden since the v0.7.0 release, you use
`--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
you migrate to new metrics. The metric is likely to be removed completely
in an upcoming release."""
@cached_property
def show_hidden_metrics(self) -> bool:
"""Check if the hidden metrics should be shown."""
if self.show_hidden_metrics_for_version is None:
return False
return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
otlp_traces_endpoint: str | None = None
"""Target URL to which OpenTelemetry traces will be sent."""
collect_detailed_traces: list[DetailedTraceModules] | None = None
"""It makes sense to set this only if `--otlp-traces-endpoint` is set. If
set, it will collect detailed traces for the specified modules. This
involves the use of possibly costly and/or blocking operations and hence might
have a performance impact.
Note that collecting detailed timing information for each request can be
expensive."""
kv_cache_metrics: bool = False
"""Enable KV cache residency metrics (lifetime, idle time, reuse gaps).
Uses sampling to minimize overhead.
Requires log stats to be enabled (i.e., --disable-log-stats not set)."""
kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1)
"""Sampling rate for KV cache metrics (0.0, 1.0]. Default 0.01 = 1% of blocks."""
cudagraph_metrics: bool = False
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
dispatch modes, and their observed frequencies at every logging interval)."""
enable_layerwise_nvtx_tracing: bool = False
"""Enable layerwise NVTX tracing. This traces the execution of each layer or
module in the model and attaches information such as input/output shapes to
NVTX range markers. Note that this doesn't work with CUDA graphs enabled."""
enable_mfu_metrics: bool = False
"""Enable Model FLOPs Utilization (MFU) metrics."""
enable_mm_processor_stats: bool = False
"""Enable collection of timing statistics for multimodal processor operations.
This is for internal use only (e.g., benchmarks) and is not exposed as a CLI
argument."""
enable_logging_iteration_details: bool = False
"""Enable detailed logging of iteration details.
If set, the vLLM EngineCore will log iteration details, including the number
of context/generation requests and tokens and the elapsed CPU time for the
iteration."""
@cached_property
def collect_model_forward_time(self) -> bool:
"""Whether to collect model forward time for the request."""
return self.collect_detailed_traces is not None and (
"model" in self.collect_detailed_traces
or "all" in self.collect_detailed_traces
)
@cached_property
def collect_model_execute_time(self) -> bool:
"""Whether to collect model execute time for the request."""
return self.collect_detailed_traces is not None and (
"worker" in self.collect_detailed_traces
or "all" in self.collect_detailed_traces
)
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("show_hidden_metrics_for_version")
@classmethod
def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
if value is not None:
# Raises an exception if the string is not a valid version.
parse(value)
return value
@field_validator("otlp_traces_endpoint")
@classmethod
def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
if value is not None:
from vllm.tracing import is_tracing_available, otel_import_error_traceback
if not is_tracing_available():
raise ValueError(
"OpenTelemetry is not available. Unable to configure "
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f"installed. Original error:\n{otel_import_error_traceback}"
)
return value
@field_validator("collect_detailed_traces")
@classmethod
def _validate_collect_detailed_traces(
cls, value: list[DetailedTraceModules] | None
) -> list[DetailedTraceModules] | None:
"""Handle the legacy case where users might provide a comma-separated
string instead of a list of strings."""
if value is not None and len(value) == 1 and "," in value[0]:
value = cast(list[DetailedTraceModules], value[0].split(","))
return value
@model_validator(mode="after")
def _validate_tracing_config(self):
if self.collect_detailed_traces and not self.otlp_traces_endpoint:
raise ValueError(
"collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
)
return self
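A sketch of the legacy comma-separated tracing-modules form and the endpoint requirement; the endpoint URL is illustrative and OpenTelemetry packages are assumed to be installed:

from vllm.config import ObservabilityConfig

cfg = ObservabilityConfig(
    otlp_traces_endpoint="http://localhost:4318/v1/traces",
    collect_detailed_traces=["model,worker"],
)
print(cfg.collect_detailed_traces)      # ["model", "worker"]
print(cfg.collect_model_forward_time)   # True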

vllm/config/offload.py (new file, 153 lines)

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Configuration for model weight offloading."""
import warnings
from typing import Literal
from pydantic import Field, model_validator
from vllm.config.utils import config
OffloadBackend = Literal["auto", "uva", "prefetch"]
@config
class UVAOffloadConfig:
"""Configuration for UVA (Unified Virtual Addressing) CPU offloading.
Uses zero-copy access from CPU-pinned memory. Simple but requires
fast CPU-GPU interconnect.
"""
cpu_offload_gb: float = Field(default=0, ge=0)
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to
increase the GPU memory size. For example, if you have one 24 GB GPU and
set this to 10, virtually you can think of it as a 34 GB GPU. Then you can
load a 13B model with BF16 weights, which requires at least 26 GB of GPU memory.
Note that this requires fast CPU-GPU interconnect, as part of the model is
loaded from CPU memory to GPU memory on the fly in each model forward pass.
This uses UVA (Unified Virtual Addressing) for zero-copy access.
"""
cpu_offload_params: set[str] = Field(default_factory=set)
"""The set of parameter name segments to target for CPU offloading.
Unmatched parameters are not offloaded. If this set is empty, parameters
are offloaded non-selectively until the memory limit defined by
`cpu_offload_gb` is reached.
Examples:
- For parameter name "mlp.experts.w2_weight":
- "experts" or "experts.w2_weight" will match.
- "expert" or "w2" will NOT match (must be exact segments).
This allows distinguishing parameters like "w2_weight" and "w2_weight_scale".
"""
@config
class PrefetchOffloadConfig:
"""Configuration for prefetch-based CPU offloading.
Groups layers and uses async H2D prefetch to hide transfer latency.
"""
offload_group_size: int = Field(default=0, ge=0)
"""Group every N layers together. Offload last `offload_num_in_group`
layers of each group. Default is 0 (disabled).
Example: group_size=8, num_in_group=2 offloads layers 6,7,14,15,22,23,...
Unlike cpu_offload_gb, this uses explicit async prefetching to hide transfer
latency.
"""
offload_num_in_group: int = Field(default=1, ge=1)
"""Number of layers to offload per group.
Must be <= offload_group_size. Default is 1."""
offload_prefetch_step: int = Field(default=1, ge=0)
"""Number of layers to prefetch ahead.
Higher values hide more latency but use more GPU memory. Default is 1."""
offload_params: set[str] = Field(default_factory=set)
"""The set of parameter name segments to target for prefetch offloading.
Unmatched parameters are not offloaded. If this set is empty, ALL
parameters of each offloaded layer are offloaded.
Uses segment matching: "w13_weight" matches "mlp.experts.w13_weight"
but not "mlp.experts.w13_weight_scale".
"""
@config
class OffloadConfig:
"""Configuration for model weight offloading to reduce GPU memory usage."""
offload_backend: OffloadBackend = "auto"
"""The backend for weight offloading. Options:
- "auto": Selects based on which sub-config has non-default values
(prefetch if offload_group_size > 0, uva if cpu_offload_gb > 0).
- "uva": UVA (Unified Virtual Addressing) zero-copy offloading.
- "prefetch": Async prefetch with group-based layer offloading.
"""
uva: UVAOffloadConfig = Field(default_factory=UVAOffloadConfig)
"""Parameters for UVA offloading backend."""
prefetch: PrefetchOffloadConfig = Field(default_factory=PrefetchOffloadConfig)
"""Parameters for prefetch offloading backend."""
@model_validator(mode="after")
def validate_offload_config(self) -> "OffloadConfig":
"""Validate offload configuration constraints."""
if self.offload_backend == "prefetch" or self.prefetch.offload_group_size > 0:
if self.prefetch.offload_num_in_group > self.prefetch.offload_group_size:
raise ValueError(
f"offload_num_in_group ({self.prefetch.offload_num_in_group})"
f" must be <= offload_group_size"
f" ({self.prefetch.offload_group_size})"
)
if self.prefetch.offload_prefetch_step < 1:
raise ValueError(
f"offload_prefetch_step"
f" ({self.prefetch.offload_prefetch_step})"
f" must be >= 1 when prefetch offloading is enabled"
f" (offload_group_size > 0)"
)
# Warn if both backends have non-default values
uva_active = self.uva.cpu_offload_gb > 0
prefetch_active = self.prefetch.offload_group_size > 0
if self.offload_backend == "uva" and prefetch_active:
warnings.warn(
"Prefetch offload fields are set but offload_backend='uva'. "
"Prefetch settings will be ignored.",
stacklevel=2,
)
elif self.offload_backend == "prefetch" and uva_active:
warnings.warn(
"UVA offload fields are set but offload_backend='prefetch'. "
"UVA settings will be ignored.",
stacklevel=2,
)
elif self.offload_backend == "auto" and uva_active and prefetch_active:
warnings.warn(
"Both UVA and prefetch offload fields are set with "
"offload_backend='auto'. Prefetch backend will be selected. "
"Set offload_backend explicitly to suppress this warning.",
stacklevel=2,
)
return self
def compute_hash(self) -> str:
"""
Provide a hash that uniquely identifies all the offload configs.
All fields are included because PrefetchOffloader patches module
forwards and inserts custom ops (wait_prefetch, start_prefetch)
into the computation graph. Changing any offload setting can
alter which layers are hooked and how prefetch indices are
computed, so the compilation cache must distinguish them.
"""
from vllm.config.utils import get_hash_factors, hash_factors
factors = get_hash_factors(self, ignored_factors=set())
hash_str = hash_factors(factors)
return hash_str
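# Usage sketch (illustrative): a minimal example of configuring the prefetch
# backend described above. With group_size=8 and num_in_group=2, layers
# 6,7,14,15,22,23,... would be offloaded; the values below are arbitrary
# examples, not tuned recommendations.
if __name__ == "__main__":
    cfg = OffloadConfig(
        offload_backend="prefetch",
        prefetch=PrefetchOffloadConfig(
            offload_group_size=8,    # group every 8 layers
            offload_num_in_group=2,  # offload the last 2 layers of each group
            offload_prefetch_step=1, # prefetch 1 layer ahead
        ),
    )
    # With offload_backend="auto", the prefetch backend would also be chosen
    # here, since offload_group_size > 0 while uva.cpu_offload_gb == 0.
    print(cfg.offload_backend, cfg.prefetch.offload_group_size)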

713
vllm/config/parallel.py Normal file
View File

@@ -0,0 +1,713 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Literal
import torch
from pydantic import Field, field_validator, model_validator
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import (
vllm_is_batch_invariant,
)
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_ports_list
from vllm.utils.torch_utils import cuda_device_count_stateless
if TYPE_CHECKING:
from ray.runtime_env import RuntimeEnv
from ray.util.placement_group import PlacementGroup
from vllm.v1.executor import Executor
else:
RuntimeEnv = Any
PlacementGroup = Any
Executor = Any
logger = init_logger(__name__)
ExpertPlacementStrategy = Literal["linear", "round_robin"]
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
DataParallelBackend = Literal["ray", "mp"]
EPLBPolicyOption = Literal["default"]
All2AllBackend = Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"mori",
"allgather_reducescatter",
"flashinfer_all2allv",
]
@config
class EPLBConfig:
"""Configuration for Expert Parallel Load Balancing (EP)."""
window_size: int = 1000
"""Window size for expert load recording."""
step_interval: int = 3000
"""
Interval for rearranging experts in expert parallelism.
Note that if this is greater than the EPLB window size, only the metrics
of the last `window_size` steps will be used for rearranging experts.
"""
num_redundant_experts: int = Field(default=0, ge=0)
"""Number of redundant experts to use for expert parallelism."""
log_balancedness: bool = False
"""
Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead.
"""
log_balancedness_interval: int = 1
"""
Interval for logging the balancedness.
"""
use_async: bool = False
"""
Whether to use non-blocking EPLB.
"""
policy: EPLBPolicyOption = "default"
"""The policy type for expert parallel load balancing (EPLB)."""
@model_validator(mode="after")
def _validate_eplb_config(self) -> Self:
if self.use_async and self.policy != "default":
raise ValueError("Async EPLB is only supported with the default policy.")
if self.log_balancedness and self.log_balancedness_interval <= 0:
raise ValueError("log_balancedness_interval must be greater than 0.")
return self
@config
class ParallelConfig:
"""Configuration for the distributed execution."""
pipeline_parallel_size: int = 1
"""Number of pipeline parallel groups."""
tensor_parallel_size: int = 1
"""Number of tensor parallel groups."""
prefill_context_parallel_size: int = 1
"""Number of prefill context parallel groups."""
data_parallel_size: int = 1
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
data_parallel_size_local: int = 1
"""Number of local data parallel groups."""
data_parallel_rank: int = 0
"""Rank of the data parallel group."""
data_parallel_rank_local: int | None = None
"""Local rank of the data parallel group,
set only in SPMD mode."""
data_parallel_master_ip: str = "127.0.0.1"
"""IP of the data parallel master."""
data_parallel_rpc_port: int = 29550
"""Port for data parallel messaging."""
data_parallel_master_port: int = 29500
"""Port of the data parallel master."""
data_parallel_backend: DataParallelBackend = "mp"
"""Backend to use for data parallel, either "mp" or "ray"."""
data_parallel_external_lb: bool = False
"""Whether to use "external" DP LB mode. Applies only to online serving
and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
is provided explicitly to vllm serve."""
data_parallel_hybrid_lb: bool = False
"""Whether to use "hybrid" DP LB mode. Applies only to online serving
and when data_parallel_size > 0. Enables running an AsyncLLM
and API server on a "per-node" basis where vLLM load balances
between local data parallel ranks, but an external LB balances
between vLLM nodes/replicas. Set explicitly in conjunction with
--data-parallel-start-rank."""
is_moe_model: bool | None = None
"""Whether the deployed model is MoE (if known)."""
enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
enable_eplb: bool = False
"""Enable expert parallelism load balancing for MoE layers."""
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
"""Expert parallelism configuration."""
expert_placement_strategy: ExpertPlacementStrategy = "linear"
"""The expert placement strategy for MoE layers:\n
- "linear": Experts are placed in a contiguous manner. For example, with 4
experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
experts [2, 3].\n
- "round_robin": Experts are placed in a round-robin manner. For example,
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
will have experts [1, 3]. This strategy can help improve load balancing
for grouped expert models with no redundant experts."""
all2all_backend: All2AllBackend = "allgather_reducescatter"
"""All2All backend for MoE expert parallel communication. Available options:
- "naive": Naive all2all implementation using broadcasts\n
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
- "pplx": Use pplx kernels\n
- "deepep_high_throughput": Use deepep high-throughput kernels\n
- "deepep_low_latency": Use deepep low-latency kernels\n
- "mori": Use mori kernels\n
- "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
max_parallel_loading_workers: int | None = None
"""Maximum number of parallel loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models."""
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_dbo: bool = False
"""Enable dual batch overlap for the model executor."""
ubatch_size: int = 0
"""Number of ubatch size."""
dbo_decode_token_threshold: int = 32
"""The threshold for dual batch overlap for batches only containing decodes.
If the number of tokens in the request is greater than this threshold,
microbatching will be used. Otherwise, the request will be processed in a
single batch."""
dbo_prefill_token_threshold: int = 512 # TODO(lucas): tune
"""The threshold for dual batch overlap for batches that contain one or more
prefills. If the number of tokens in the request is greater than this
threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch."""
disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce.
Defaults to True when async scheduling is enabled, False otherwise.
"""
ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
ray_runtime_env: RuntimeEnv | None = None
"""Ray runtime environment to pass to distributed workers."""
placement_group: PlacementGroup | None = None
"""ray distributed model workers placement group."""
distributed_executor_backend: (
str | DistributedExecutorBackend | type[Executor] | None
) = None
"""Backend to use for distributed model workers, either "ray" or "mp"
(multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
is less than or equal to the number of GPUs available, "mp" will be used to
keep processing on a single host. Otherwise, an error will be raised. To use "mp"
you must also set nnodes, and to use "ray" you must manually set
distributed_executor_backend to "ray".
Note that tpu only support Ray for distributed inference."""
worker_cls: str = "auto"
"""The full name of the worker class to use. If "auto", the worker class
will be determined based on the platform."""
sd_worker_cls: str = "auto"
"""The full name of the worker class to use for speculative decoding.
If "auto", the worker class will be determined based on the platform."""
worker_extension_cls: str = ""
"""The full name of the worker extension class to use. The worker extension
class is dynamically inherited by the worker class. This is used to inject
new attributes and methods to the worker class for use in collective_rpc
calls."""
master_addr: str = "127.0.0.1"
"""distributed master address for multi-node distributed
inference when distributed_executor_backend is mp."""
master_port: int = 29501
"""distributed master port for multi-node distributed
inference when distributed_executor_backend is mp."""
node_rank: int = 0
"""distributed node rank for multi-node distributed
inference when distributed_executor_backend is mp."""
nnodes: int = 1
"""num of nodes for multi-node distributed
inference when distributed_executor_backend is mp."""
world_size: int = Field(init=False)
"""world_size is TPxPP, it affects the number of workers we create."""
rank: int = 0
"""Global rank in distributed setup."""
_data_parallel_master_port_list: list[int] = Field(default_factory=list)
"""List of open port auto-queried for data parallel messaging.
Set to be private as it's not intended to be configured by users.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
needs to be divisible by dcp_size."""
dcp_kv_cache_interleave_size: int = 1
"""
Interleave size of kv_cache storage while using DCP.
dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
and will be deprecated when PCP is fully supported.
"""
cp_kv_cache_interleave_size: int = 1
"""Interleave size of kv_cache storage while using DCP or PCP.
For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
and `total_cp_world_size = pcp_world_size * dcp_world_size`.
store interleave_size tokens on total_cp_rank i,
then store next interleave_size tokens on total_cp_rank i+1.
Interleave_size=1: token-level alignment, where token `i` is stored on
total_cp_rank `i % total_cp_world_size`.
Interleave_size=block_size: block-level alignment, where tokens are
first populated to the preceding ranks. Tokens are then stored
in (rank i+1, block j) only after (rank i, block j) is fully occupied.
Block_size should be greater than or equal to cp_kv_cache_interleave_size.
Block_size should be divisible by cp_kv_cache_interleave_size.
"""
data_parallel_index: int = Field(init=False)
"""Equal to the data parallel rank but not used for torch process groups
and not overridden for dense models."""
_api_process_count: int = Field(default=1, gt=0)
"""
The number of API processes initialized.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
_api_process_rank: int = Field(default=0, ge=-1)
"""
The rank of this API process, or `-1` for engine core processes
under API server scale-out.
Note:
This is an internal config that is only valid for and
should only be set by API server scale-out.
"""
@field_validator("disable_nccl_for_dp_synchronization", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialisation is delayed."""
return None if value is None else handler(value)
@model_validator(mode="after")
def _validate_parallel_config(self) -> Self:
if self._api_process_rank >= self._api_process_count:
raise ValueError(
"Invalid value of `_api_process_rank`. "
f"Expected to be `-1` or `[0, {self._api_process_count})`, "
f"but found: {self._api_process_rank}"
)
if self.data_parallel_size_local > self.data_parallel_size:
raise ValueError(
f"data_parallel_size_local ({self.data_parallel_size_local}) "
f"must be <= data_parallel_size ({self.data_parallel_size})"
)
if self.data_parallel_size <= 1 and self.data_parallel_external_lb:
raise ValueError(
"data_parallel_external_lb can only be set when data_parallel_size > 1"
)
if self.enable_eplb:
if not current_platform.is_cuda_alike():
raise ValueError(
"Expert parallelism load balancing is only supported on "
"CUDA devices or ROCm devices now."
)
if not self.enable_expert_parallel:
raise ValueError("enable_expert_parallel must be True to use EPLB.")
if self.tensor_parallel_size * self.data_parallel_size <= 1:
raise ValueError(
"EPLB requires tensor_parallel_size or data_parallel_size "
f"to be greater than 1, but got "
f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
)
else:
if self.eplb_config.num_redundant_experts != 0:
raise ValueError(
"num_redundant_experts is set to "
f"{self.eplb_config.num_redundant_experts} but EPLB is not "
"enabled. Either enable EPLB or unset "
"num_redundant_experts."
)
# Note(hc): In the current implementation of decode context
# parallel(DCP), tp_size needs to be divisible by dcp_size,
# because the world size does not change by dcp, it simply
# reuses the GPUs of TP group, and split one TP group into
# tp_size//dcp_size DCP groups.
if self.tensor_parallel_size % self.decode_context_parallel_size != 0:
raise ValueError(
f"tp_size={self.tensor_parallel_size} must be divisible by"
f"dcp_size={self.decode_context_parallel_size}."
)
return self
@property
def world_size_across_dp(self) -> int:
"""world_size_across_dp is TPxPPxDP, it is the size of the world
including data parallelism."""
return self.world_size * self.data_parallel_size
@property
def use_ubatching(self) -> bool:
return self.enable_dbo or self.ubatch_size > 1
@property
def num_ubatches(self) -> int:
return 2 if self.enable_dbo else self.ubatch_size
@property
def local_engines_only(self) -> bool:
"""
Client manages local+remote EngineCores in pure internal LB case.
Client manages local EngineCores in hybrid and external LB case.
"""
return self.data_parallel_external_lb or self.data_parallel_hybrid_lb
def get_next_dp_init_port(self) -> int:
"""
We might need to initialize process groups in multiple
processes that is related to data parallelism,
e.g. both in the worker and in the engine, which
can live in different processes. To avoid port conflicts, we
pop a new port from the prepared port list each time we need to
initialize a new process group related to data parallelism.
"""
if self._data_parallel_master_port_list:
answer = self._data_parallel_master_port_list.pop()
else:
answer = self.data_parallel_master_port
self.data_parallel_master_port += 1
return answer
def stateless_init_dp_group(self) -> ProcessGroup:
# NOTE: In high-concurrency scenarios multiple processes
# can pick the same (currently free) port through a race
# condition when calling `get_open_port()`. When the first
# process binds the port the others will subsequently fail
# with `torch.distributed.DistNetworkError: EADDRINUSE`.
# To make the initialization more robust we retry a few times
# with a fresh port whenever this specific error is observed.
from torch.distributed import DistNetworkError
from vllm.distributed.utils import (
stateless_init_torch_distributed_process_group,
)
max_retries = 5
last_exc: Exception | None = None
for _ in range(max_retries):
try:
# use gloo since the engine process might not have cuda device
return stateless_init_torch_distributed_process_group(
self.data_parallel_master_ip,
self.get_next_dp_init_port(),
self.data_parallel_rank,
self.data_parallel_size,
backend=current_platform.dist_backend,
)
except DistNetworkError as e:
# We only want to retry when the root cause is EADDRINUSE.
if "EADDRINUSE" in str(e):
logger.warning("Address already in use. Retrying with a new port.")
last_exc = e
continue # try again with a new port
raise e
# If we get here all retries have failed.
assert last_exc is not None
raise last_exc
# The all_reduce at the end of attention (during o_proj) means that
# inputs are replicated across each rank of the tensor parallel group.
# If using expert-parallelism with DeepEP All2All ops, replicated
# tokens results in useless duplicate computation and communication.
#
# In this case, ensure the input to the experts is sequence parallel
# to avoid the excess work.
#
# Not needed for pplx-kernels as it can handle duplicate input tokens.
@property
def use_sequence_parallel_moe(self) -> bool:
return (
self.all2all_backend
in (
"allgather_reducescatter",
"naive",
"deepep_high_throughput",
"deepep_low_latency",
"mori",
)
and self.enable_expert_parallel
and self.tensor_parallel_size > 1
and self.data_parallel_size > 1
)
@property
def node_rank_within_dp(self) -> int:
return self.node_rank % self.nnodes_within_dp
@property
def nnodes_within_dp(self) -> int:
if self.nnodes == 1:
return 1
data_parallel_node_size = (
self.data_parallel_size // self.data_parallel_size_local
)
return self.nnodes // data_parallel_node_size
@property
def local_world_size(self) -> int:
return self.world_size // self.nnodes_within_dp
@staticmethod
def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
# dp rank 0: has_unfinished_seqs=True
# dp rank 1: has_unfinished_seqs=False
# aggregated: has_unfinished_seqs=True
# so this is an OR operation, i.e. MAX in integers
torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group)
aggregated_has_unfinished = bool(tensor.item())
return aggregated_has_unfinished
@staticmethod
def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
if kv_cache_memory == -1:
kv_cache_memory = torch.iinfo(torch.int64).max
tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu")
# we cannot use broadcast for stateless dp group since it depends
# on global rank
torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
return tensor.item()
def compute_hash(self):
"""
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
This hash is also used for DP worker configuration validation
to prevent hangs from mismatched collective communication patterns.
"""
ignored_factors = {
# Derived/runtime topology, networking, or launch details
"data_parallel_rank",
"data_parallel_rank_local",
"data_parallel_size_local",
"data_parallel_index",
"data_parallel_backend",
"data_parallel_external_lb",
"data_parallel_hybrid_lb",
"data_parallel_master_ip",
"data_parallel_master_port",
"_data_parallel_master_port_list",
"data_parallel_rpc_port",
"rank",
"master_addr",
"master_port",
"node_rank",
"nnodes",
"max_parallel_loading_workers",
"disable_custom_all_reduce",
"ray_workers_use_nsight",
"ray_runtime_env",
"placement_group",
"distributed_executor_backend",
"worker_cls",
"sd_worker_cls",
"worker_extension_cls",
"_api_process_count",
"_api_process_rank",
}
from vllm.config.utils import get_hash_factors, hash_factors
factors = get_hash_factors(self, ignored_factors)
return hash_factors(factors)
def __post_init__(self) -> None:
# Continue with the rest of the initialization
self.world_size = (
self.pipeline_parallel_size
* self.tensor_parallel_size
* self.prefill_context_parallel_size
)
if self.distributed_executor_backend == "external_launcher":
logger.info("Using external launcher for distributed inference.")
self.world_size *= self.data_parallel_size
if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
# Data parallel was specified in the engine args.
if self.distributed_executor_backend == "external_launcher":
# For external launcher,
# we need to set the data parallel rank automatically
self.data_parallel_rank = int(os.environ["RANK"]) // (
self.world_size // self.data_parallel_size
)
logger.info(
"Set data_parallel_rank to %d automatically.",
self.data_parallel_rank,
)
if not self._data_parallel_master_port_list:
self._data_parallel_master_port_list = get_open_ports_list(5)
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
if not (0 <= self.data_parallel_rank < self.data_parallel_size):
raise ValueError(
f"data_parallel_rank ({self.data_parallel_rank})"
f" must be in the range [0, {self.data_parallel_size})"
)
else:
# Otherwise fall back to env vars (e.g. for offline SPMD case).
self.data_parallel_size = envs.VLLM_DP_SIZE
self.data_parallel_rank = envs.VLLM_DP_RANK
self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
if self.data_parallel_size > 1 and self.is_moe_model is False:
raise ValueError(
"Offline data parallel mode is not supported/useful"
" for dense models."
)
self.data_parallel_index = self.data_parallel_rank
if self.distributed_executor_backend == "external_launcher":
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
logger.info("Disabling V1 multiprocessing for external launcher.")
if self.distributed_executor_backend is None and self.world_size > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
from vllm.v1.executor import ray_utils
backend: DistributedExecutorBackend = "mp"
ray_found = ray_utils.ray_is_available()
if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
backend = "uni"
elif current_platform.is_cuda() and self.nnodes > 1:
backend = "mp"
elif (
current_platform.is_cuda()
and cuda_device_count_stateless() < self.world_size
):
gpu_count = cuda_device_count_stateless()
raise ValueError(
f"World size ({self.world_size}) is larger than the number of "
f"available GPUs ({gpu_count}) in this node. If this is "
"intentional and you are using:\n"
"- ray, set '--distributed-executor-backend ray'.\n"
"- multiprocessing, set '--nnodes' appropriately."
)
elif self.data_parallel_backend == "ray":
logger.info(
"Using ray distributed inference because "
"data_parallel_backend is ray"
)
backend = "ray"
elif ray_found:
if self.placement_group:
backend = "ray"
else:
from ray import is_initialized as ray_is_initialized
if ray_is_initialized():
from ray.util import get_current_placement_group
if get_current_placement_group():
backend = "ray"
self.distributed_executor_backend = backend
logger.debug("Defaulting to use %s for distributed inference", backend)
if self.distributed_executor_backend is None and self.world_size == 1:
self.distributed_executor_backend = "uni"
if self.max_parallel_loading_workers is not None:
logger.warning(
"max_parallel_loading_workers is currently "
"not supported and will be ignored."
)
allowed_backends = ("mp", "uni", "external_launcher")
if (
self.distributed_executor_backend not in allowed_backends
and self.nnodes > 1
):
raise ValueError(
"nnodes > 1 can only be set when distributed executor "
"backend is mp, uni or external_launcher."
)
@property
def use_ray(self) -> bool:
return self.distributed_executor_backend == "ray" or (
isinstance(self.distributed_executor_backend, type)
and getattr(self.distributed_executor_backend, "uses_ray", False)
)
@model_validator(mode="after")
def _verify_args(self) -> Self:
# Lazy import to avoid circular import
from vllm.v1.executor import Executor
# Enable batch invariance settings if requested
if vllm_is_batch_invariant():
self.disable_custom_all_reduce = True
if (
self.distributed_executor_backend is not None
and not isinstance(self.distributed_executor_backend, str)
and not (
isinstance(self.distributed_executor_backend, type)
and issubclass(self.distributed_executor_backend, Executor)
)
):
raise ValueError(
"Unrecognized distributed executor backend "
f"{self.distributed_executor_backend}. Supported "
"values are 'ray', 'mp' 'uni', 'external_launcher', "
" custom Executor subclass or its import path."
)
if self.use_ray:
from vllm.v1.executor import ray_utils
ray_utils.assert_ray_available()
if not current_platform.use_custom_allreduce():
self.disable_custom_all_reduce = True
logger.debug(
"Disabled the custom all-reduce kernel because it is not "
"supported on current platform."
)
if self.nnodes > 1:
self.disable_custom_all_reduce = True
logger.debug(
"Disabled the custom all-reduce since we are running on multi-node."
)
if self.ray_workers_use_nsight and not self.use_ray:
raise ValueError(
"Unable to use nsight profiling unless workers run with Ray."
)
return self
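# Usage sketch (illustrative): a minimal, single-process example of the
# derived fields above. With the defaults (TP = PP = PCP = DP = 1),
# __post_init__ computes world_size = 1 and selects the "uni" executor
# backend, so no multi-GPU setup is assumed.
if __name__ == "__main__":
    cfg = ParallelConfig()
    print(cfg.world_size)                    # 1 (TP * PP * PCP)
    print(cfg.world_size_across_dp)          # 1 (world_size * DP)
    print(cfg.distributed_executor_backend)  # "uni" for a single-worker world
    # EPLB options live in the nested EPLBConfig:
    print(cfg.eplb_config.window_size, cfg.eplb_config.step_interval)  # 1000 3000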

146
vllm/config/pooler.py Normal file
View File

@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal, get_args
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
logger = init_logger(__name__)
SequencePoolingType = Literal["CLS", "LAST", "MEAN"]
SEQ_POOLING_TYPES: tuple[SequencePoolingType, ...] = get_args(SequencePoolingType)
TokenPoolingType = Literal["ALL", "STEP"]
TOK_POOLING_TYPES: tuple[TokenPoolingType, ...] = get_args(TokenPoolingType)
@config
class PoolerConfig:
"""Controls the behavior of output pooling in pooling models."""
pooling_type: SequencePoolingType | TokenPoolingType | None = None
"""
The pooling method used for pooling.
If set, `seq_pooling_type` or `tok_pooling_type` is automatically populated
with this field. Alternatively, users can set `seq_pooling_type` and
`tok_pooling_type` explicitly.
This field is mainly for user convenience. Internal code should always use
`seq_pooling_type` or `tok_pooling_type` instead of `pooling_type`.
"""
seq_pooling_type: SequencePoolingType | None = None
"""
The pooling method used for sequence pooling.
"""
tok_pooling_type: TokenPoolingType | None = None
"""
The pooling method used for tokenwise pooling.
"""
use_activation: bool | None = None
"""
Whether to apply activation function to the pooler outputs.
`None` uses the pooler's default, which is `True` in most cases.
"""
## for embedding models
dimensions: int | None = None
"""
Reduce the dimensionality of embeddings if the model supports
matryoshka representations. Defaults to None.
"""
enable_chunked_processing: bool = False
"""
Whether to enable chunked processing for long inputs that exceed the model's
maximum position embeddings. When enabled, long inputs will be split into
chunks, processed separately, and then aggregated using weighted averaging.
This allows embedding models to handle arbitrarily long text without CUDA
errors. Defaults to False.
"""
max_embed_len: int | None = None
"""
Maximum input length allowed for embedding generation. When set, inputs
longer than max_model_len (up to max_embed_len) are accepted for embedding
models. When an input exceeds max_embed_len, it is handled according to
the original max_model_len validation logic.
Defaults to None (i.e. set to max_model_len).
"""
## for classification models
logit_bias: float | None = None
"""
If provided, apply classification logit biases. Defaults to None.
"""
## for reward models
step_tag_id: int | None = None
"""
If set, only the score corresponding to the `step_tag_id` in the
generated sentence should be returned. Otherwise, the scores for all tokens
are returned.
"""
returned_token_ids: list[int] | None = None
"""
A list of indices for the vocabulary dimensions to be extracted,
such as the token IDs of `good_token` and `bad_token` in the
`math-shepherd-mistral-7b-prm` model.
"""
def __post_init__(self) -> None:
if pooling_type := self.pooling_type:
if self.seq_pooling_type is not None:
raise ValueError(
"Cannot set both `pooling_type` and `seq_pooling_type`"
)
if self.tok_pooling_type is not None:
raise ValueError(
"Cannot set both `pooling_type` and `tok_pooling_type`"
)
if pooling_type in SEQ_POOLING_TYPES:
logger.debug(
"Resolved `pooling_type=%r` to `seq_pooling_type=%r`.",
pooling_type,
pooling_type,
)
self.seq_pooling_type = pooling_type
elif pooling_type in TOK_POOLING_TYPES:
logger.debug(
"Resolved `pooling_type=%r` to `tok_pooling_type=%r`.",
pooling_type,
pooling_type,
)
self.tok_pooling_type = pooling_type
else:
raise NotImplementedError(pooling_type)
def get_seq_pooling_type(self) -> SequencePoolingType:
assert self.seq_pooling_type is not None, "Should be resolved by ModelConfig"
return self.seq_pooling_type
def get_tok_pooling_type(self) -> TokenPoolingType:
assert self.tok_pooling_type is not None, "Should be resolved by ModelConfig"
return self.tok_pooling_type
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
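# Usage sketch (illustrative): how `pooling_type` is resolved in
# __post_init__ above. Sequence-level values populate `seq_pooling_type`,
# token-level values populate `tok_pooling_type`, and combining the shorthand
# with an explicit field raises a ValueError.
if __name__ == "__main__":
    seq_cfg = PoolerConfig(pooling_type="MEAN")
    print(seq_cfg.get_seq_pooling_type())  # "MEAN"
    tok_cfg = PoolerConfig(pooling_type="STEP")
    print(tok_cfg.get_tok_pooling_type())  # "STEP"
    try:
        PoolerConfig(pooling_type="CLS", seq_pooling_type="LAST")
    except ValueError as e:
        print(e)  # Cannot set both `pooling_type` and `seq_pooling_type`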

124
vllm/config/profiler.py Normal file
View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Any, Literal
from pydantic import Field, model_validator
from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
logger = init_logger(__name__)
ProfilerKind = Literal["torch", "cuda"]
def _is_uri_path(path: str) -> bool:
"""Check if path is a URI (scheme://...), excluding Windows drive letters.
Supports custom URI schemes like gs://, s3://, hdfs://, etc.
These paths should not be converted to absolute paths.
"""
if "://" in path:
scheme = path.split("://")[0]
# Windows drive letters are single characters (e.g., C://)
# Valid URI schemes have more than one character
return len(scheme) > 1
return False
@config
class ProfilerConfig:
"""Dataclass which contains profiler config for the engine."""
profiler: ProfilerKind | None = None
"""Which profiler to use. Defaults to None. Options are:
- 'torch': Use PyTorch profiler.\n
- 'cuda': Use CUDA profiler."""
torch_profiler_dir: str = ""
"""Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
worker's traces (CPU & GPU) will be saved under this directory. Note that
it must be an absolute path."""
torch_profiler_with_stack: bool = True
"""If `True`, enables stack tracing in the torch profiler. Enabled by default."""
torch_profiler_with_flops: bool = False
"""If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""
torch_profiler_use_gzip: bool = True
"""If `True`, saves torch profiler traces in gzip format. Enabled by default"""
torch_profiler_dump_cuda_time_total: bool = True
"""If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""
torch_profiler_record_shapes: bool = False
"""If `True`, records tensor shapes in the torch profiler. Disabled by default."""
torch_profiler_with_memory: bool = False
"""If `True`, enables memory profiling in the torch profiler.
Disabled by default."""
ignore_frontend: bool = False
"""If `True`, disables the front-end profiling of AsyncLLM when using the
'torch' profiler. This is needed to reduce overhead when using delay/limit options,
since the front-end profiling does not track iterations and will capture the
entire range.
"""
delay_iterations: int = Field(default=0, ge=0)
"""Number of engine iterations to skip before starting profiling.
Defaults to 0, meaning profiling starts immediately after receiving /start_profile.
"""
max_iterations: int = Field(default=0, ge=0)
"""Maximum number of engine iterations to profile after starting profiling.
Defaults to 0, meaning no limit.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@model_validator(mode="after")
def _validate_profiler_config(self) -> Self:
has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
logger.warning_once(
"Using 'torch' profiler with delay_iterations or max_iterations "
"while ignore_frontend is False may result in high overhead."
)
profiler_dir = self.torch_profiler_dir
if profiler_dir and self.profiler != "torch":
raise ValueError(
"torch_profiler_dir is only applicable when profiler is set to 'torch'"
)
if self.profiler == "torch" and not profiler_dir:
raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")
# Support any URI scheme (gs://, s3://, hdfs://, etc.)
# These paths should not be converted to absolute paths
if profiler_dir and not _is_uri_path(profiler_dir):
self.torch_profiler_dir = os.path.abspath(os.path.expanduser(profiler_dir))
return self
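# Usage sketch (illustrative): the path handling implemented above. Relative
# local directories are expanded to absolute paths, while URI-style paths
# (gs://, s3://, ...) are left untouched. The directories below are
# hypothetical placeholders.
if __name__ == "__main__":
    local = ProfilerConfig(profiler="torch", torch_profiler_dir="~/vllm_traces")
    print(local.torch_profiler_dir)   # absolute path under the user's home dir
    remote = ProfilerConfig(profiler="torch", torch_profiler_dir="gs://bucket/traces")
    print(remote.torch_profiler_dir)  # "gs://bucket/traces", kept as a URI
    print(_is_uri_path("C://Users"))  # False: Windows drive letter, not a URI scheme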

300
vllm/config/scheduler.py Normal file
View File

@@ -0,0 +1,300 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from dataclasses import InitVar
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
from pydantic import Field, field_validator
from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import resolve_obj_by_qualname
if TYPE_CHECKING:
from vllm.v1.core.sched.interface import SchedulerInterface
logger = init_logger(__name__)
RunnerType = Literal["generate", "pooling", "draft"]
SchedulerPolicy = Literal["fcfs", "priority"]
@config
class SchedulerConfig:
"""Scheduler configuration."""
max_model_len: InitVar[int]
"""Maximum length of a sequence (including prompt and generated text).
Note: This is stored in the ModelConfig, and is used only here to
provide fallbacks and validate other attributes."""
is_encoder_decoder: InitVar[bool]
"""True if the model is an encoder-decoder model.
Note: This is stored in the ModelConfig, and is used only here to
disable chunked prefill and prefix caching for encoder-decoder models.
"""
DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048
DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128
runner_type: RunnerType = "generate"
"""The runner type to launch for the model."""
max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1)
"""Maximum number of tokens that can be processed in a single iteration.
The default value here is mainly for convenience when testing.
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""
max_num_scheduled_tokens: int | None = Field(default=None)
"""Maximum number of tokens that the scheduler may issue in a single iteration.
This is usually equal to max_num_batched_tokens, but can be smaller in cases
when the model might append tokens into the batch (such as speculative decoding).
Defaults to max_num_batched_tokens."""
max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1)
"""Maximum number of sequences to be processed in a single iteration.
The default value here is mainly for convenience when testing.
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""
max_num_partial_prefills: int = Field(default=1, ge=1)
"""For chunked prefill, the maximum number of sequences that can be
partially prefilled concurrently."""
max_long_partial_prefills: int = Field(default=1, ge=1)
"""For chunked prefill, the maximum number of prompts longer than
long_prefill_token_threshold that will be prefilled concurrently. Setting
this less than max_num_partial_prefills will allow shorter prompts to jump
the queue in front of longer prompts in some cases, improving latency."""
long_prefill_token_threshold: int = 0
"""For chunked prefill, a request is considered long if the prompt is
longer than this number of tokens."""
enable_chunked_prefill: bool = True
"""If True, prefill requests can be chunked based
on the remaining `max_num_batched_tokens`.
The default value here is mainly for convenience when testing.
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""
is_multimodal_model: bool = False
"""True if the model is multimodal."""
# TODO (ywang96): Make this configurable.
max_num_encoder_input_tokens: int = Field(init=False)
"""Multimodal encoder compute budget, only used in V1.
NOTE: This is not currently configurable. It will be overridden by
max_num_batched_tokens in case max multimodal embedding size is larger."""
# TODO (ywang96): Make this configurable.
encoder_cache_size: int = Field(init=False)
"""Multimodal encoder cache size, only used in V1.
NOTE: This is not currently configurable. It will be overridden by
max_num_batched_tokens in case max multimodal embedding size is larger."""
policy: SchedulerPolicy = "fcfs"
"""The scheduling policy to use:\n
- "fcfs" means first come first served, i.e. requests are handled in order
of arrival.\n
- "priority" means requests are handled based on given priority (lower
value means earlier handling) and time of arrival deciding any ties)."""
disable_chunked_mm_input: bool = False
"""If set to true and chunked prefill is enabled, we do not want to
partially schedule a multimodal item. Only used in V1.
This ensures that if a request has a mixed prompt
(like text tokens TTTT followed by image tokens IIIIIIIIII) where only
some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
it will be scheduled as TTTT in one step and IIIIIIIIII in the next."""
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
# (default) or "mod.custom_class".
scheduler_cls: str | type[object] | None = Field(default=None)
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class"."""
disable_hybrid_kv_cache_manager: bool | None = None
"""If set to True, KV cache manager will allocate the same size of KV cache
for all attention layers even if there are multiple types of attention layers
like full attention and sliding window attention.
If set to None, the default value will be determined based on the environment
and starting configuration.
"""
async_scheduling: bool | None = Field(default=None)
"""If set to False, disable async scheduling. Async scheduling helps to
avoid gaps in GPU utilization, leading to better latency and throughput.
"""
stream_interval: int = Field(default=1, ge=1)
"""The interval (or buffer size) for streaming in terms of token length.
A smaller value (1) makes streaming smoother by sending each token immediately,
while a larger value (e.g., 10) reduces host overhead and may increase throughput
by batching multiple tokens before sending."""
@staticmethod
def default_factory(**kwargs):
"""
Factory method to create `SchedulerConfig` with default values for `InitVar`s.
"""
if "max_model_len" not in kwargs:
kwargs["max_model_len"] = 8192
if "is_encoder_decoder" not in kwargs:
kwargs["is_encoder_decoder"] = False
return SchedulerConfig(**kwargs)
def get_scheduler_cls(self) -> type["SchedulerInterface"]:
if self.scheduler_cls is None:
if self.async_scheduling:
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
return AsyncScheduler
from vllm.v1.core.sched.scheduler import Scheduler
return Scheduler
# This warning can be removed once the Scheduler interface is
# finalized and we can maintain support for scheduler classes that
# implement it
logger.warning_once(
"Using custom scheduler class %s. This scheduler interface is "
"not public and compatibility may not be maintained.",
self.scheduler_cls,
)
if not isinstance(self.scheduler_cls, str):
return cast(type["SchedulerInterface"], self.scheduler_cls)
return resolve_obj_by_qualname(self.scheduler_cls)
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
# max_num_batched_tokens need to be included in the hash due
# to two reasons:
# 1. LoRA creates static buffers based on max_num_batched_tokens.
# The tensor sizes and strides get captured in the torch.compile
# graph explicitly.
# 2. Inductor decides whether using 32-bit or 64-bit indexing integer
# based on the data sizes. `max_num_batched_tokens` has an
# impact on that. For more details, please check
# https://github.com/vllm-project/vllm/issues/29585
factors.append(self.max_num_batched_tokens)
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("scheduler_cls", "async_scheduling", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialisation is delayed."""
return None if value is None else handler(value)
def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
if is_encoder_decoder:
# Chunked prefill should be disabled for encoder-decoder models.
self.disable_chunked_mm_input = True
self.enable_chunked_prefill = False
self.long_prefill_token_threshold = 0
logger.info(
"Encoder-decoder models do not support chunked prefill nor"
" prefix caching; disabling both."
)
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
self.encoder_cache_size = self.max_num_batched_tokens
if self.enable_chunked_prefill:
logger.info(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
)
if self.max_num_partial_prefills > 1:
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(max_model_len * 0.04)
logger.info(
"Concurrent partial prefills enabled with "
"max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
"long_prefill_token_threshold=%d",
self.max_num_partial_prefills,
self.max_long_partial_prefills,
self.long_prefill_token_threshold,
)
self.verify_max_model_len(max_model_len)
def verify_max_model_len(self, max_model_len: int) -> Self:
if (
self.max_num_batched_tokens < max_model_len
and not self.enable_chunked_prefill
):
raise ValueError(
f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
f"smaller than max_model_len ({max_model_len}). "
"This effectively limits the maximum sequence length to "
"max_num_batched_tokens and makes vLLM reject longer "
"sequences. Please increase max_num_batched_tokens or "
"decrease max_model_len."
)
if self.max_num_batched_tokens < self.max_num_seqs:
raise ValueError(
f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
"be greater than or equal to max_num_seqs "
f"({self.max_num_seqs})."
)
if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * max_model_len,
)
if self.max_num_partial_prefills > 1:
if not self.enable_chunked_prefill:
raise ValueError(
"Chunked prefill must be enabled to set "
"max_num_partial_prefills > 1."
)
if self.long_prefill_token_threshold > max_model_len:
raise ValueError(
"long_prefill_token_threshold "
f"({self.long_prefill_token_threshold}) cannot be greater "
f"than the max_model_len ({max_model_len})."
)
if self.max_long_partial_prefills > self.max_num_partial_prefills:
raise ValueError(
f"{self.max_long_partial_prefills=} must be less than or equal to "
f"{self.max_num_partial_prefills=}."
)
return self
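# Usage sketch (illustrative): `default_factory` fills the required InitVars
# (max_model_len, is_encoder_decoder) with defaults, so it is the easiest way
# to build a standalone SchedulerConfig. The numbers below are illustrative,
# not tuned recommendations.
if __name__ == "__main__":
    cfg = SchedulerConfig.default_factory(
        max_num_batched_tokens=8192,
        max_num_seqs=256,
        max_num_partial_prefills=4,
        max_long_partial_prefills=2,
    )
    # With the default max_model_len of 8192, long_prefill_token_threshold is
    # auto-set to int(8192 * 0.04) == 327 because it was left at 0.
    print(cfg.long_prefill_token_threshold)  # 327
    print(cfg.max_num_encoder_input_tokens)  # 8192 (mirrors max_num_batched_tokens)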

789
vllm/config/speculative.py Normal file
View File

@@ -0,0 +1,789 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
from typing_extensions import Self
from vllm.config import LoadConfig
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_hf_text_config
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import LazyLoader, has_arctic_inference
if TYPE_CHECKING:
from transformers import PretrainedConfig
import vllm.model_executor.layers.quantization as me_quant
else:
PretrainedConfig = Any
me_quant = LazyLoader(
"model_executor", globals(), "vllm.model_executor.layers.quantization"
)
logger = init_logger(__name__)
MTPModelTypes = Literal[
"deepseek_mtp",
"mimo_mtp",
"glm4_moe_mtp",
"glm4_moe_lite_mtp",
"glm_ocr_mtp",
"ernie_mtp",
"nemotron_h_mtp",
"exaone_moe_mtp",
"qwen3_next_mtp",
"qwen3_5_mtp",
"longcat_flash_mtp",
"mtp",
"pangu_ultra_moe_mtp",
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
SpeculativeMethod = Literal[
"ngram",
"medusa",
"mlp_speculator",
"draft_model",
"suffix",
EagleModelTypes,
]
@config
class SpeculativeConfig:
"""Configuration for speculative decoding."""
enforce_eager: bool | None = None
"""Override the default enforce_eager from model_config"""
# General speculative decoding control
num_speculative_tokens: int | None = Field(default=None, gt=0)
"""The number of speculative tokens, if provided. It will default to the
number in the draft model config if present, otherwise, it is required."""
model: str | None = None
"""The name of the draft model, eagle head, or additional weights, if
provided."""
method: SpeculativeMethod | None = None
"""The name of the speculative method to use. If users provide and set the
`model` param, the speculative method type will be detected automatically
if possible, if `model` param is not provided, the method name must be
provided.
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
tensor_parallel_size: int | None = None
"""Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
warn users when they mistakenly provide the wrong argument."""
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
"""Quantization method that was used to quantize the draft model weights.
If `None`, we assume the model weights are not quantized. Note that it only
takes effect when using the draft model-based speculative method."""
max_model_len: int | None = Field(default=None, ge=1)
"""The maximum model length of the draft model. Used when testing the
ability to skip speculation for some sequences."""
revision: str | None = None
"""The specific model version to use for the draft model. It can be a
branch name, a tag name, or a commit id. If unspecified, will use the
default version."""
code_revision: str | None = None
"""The specific revision to use for the draft model code on Hugging Face
Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
will use the default version."""
# Advanced control
disable_padded_drafter_batch: bool = False
"""Disable input padding for speculative decoding. If set to True,
speculative input batches can contain sequences of different lengths,
which may only be supported by certain attention backends. This currently
only affects the EAGLE method of speculation."""
use_local_argmax_reduction: bool = False
"""Use vocab-parallel local argmax instead of all-gathering full logits
for draft token generation. Reduces communication from O(vocab_size) to
O(2 * tp_size) per token. Only applies to greedy draft selection in
non-tree speculation."""
# Ngram proposer configuration
prompt_lookup_max: int | None = Field(default=None, ge=1)
"""Maximum size of ngram token window when using Ngram proposer, required
when method is set to ngram."""
prompt_lookup_min: int | None = Field(default=None, ge=1)
"""Minimum size of ngram token window when using Ngram proposer, if
provided. Defaults to 1."""
# Alternative drafting strategies
speculative_token_tree: str | None = None
"""Specifies the tree structure for speculative token generation.
"""
parallel_drafting: bool = False
"""Enable parallel drafting, where all speculative tokens are generated
in parallel rather than sequentially. This can improve performance but
requires the speculative model be trained to support parallel drafting.
Only compatible with EAGLE and draft model methods."""
# required configuration params passed from engine
target_model_config: SkipValidation[ModelConfig] = None # type: ignore
"""The configuration of the target model."""
target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
"""The parallel configuration for the target model."""
# params generated in the post-init stage
draft_model_config: SkipValidation[ModelConfig] = None # type: ignore
"""The configuration of the draft model initialized internal."""
draft_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
"""The parallel configuration for the draft model initialized internal."""
# Suffix decoding configuration
suffix_decoding_max_tree_depth: int = 24
"""The maximum depth of the suffix decoding global and prompt trees. The
tree depth limits the sum of the prefix match and speculation lengths."""
suffix_decoding_max_cached_requests: int = 10000
"""The maximum number of requests to cache in the global suffix tree. If
exceeded, will trigger eviction in FIFO order. If set to 0, the global
suffix tree is disabled and past responses are not cached (prompt trees
are still used)."""
suffix_decoding_max_spec_factor: float = 1.0
"""The maximum spec factor for suffix decoding. The spec factor controls
speculation lengths based on the prefix match length: max_spec_tokens =
max_spec_factor * prefix_match_length."""
suffix_decoding_min_token_prob: float = 0.1
"""The minimum token probability for suffix decoding. Will only speculate
tokens with estimated probability (based on frequency counts) greater than
or equal to this value."""
draft_load_config: LoadConfig | None = None
"""Load config for the draft model. If not specified, will use the load
config from the target model."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
initial_architecture = hf_config.architectures[0]
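# Each branch below rewrites a supported base-model config into its drafter
# variant (MTP or EAGLE): model_type/architectures are replaced and n_predict
# is filled from the checkpoint's next-n-prediction layer count, so the draft
# weights load under the correct speculative model class.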
if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
)
if hf_config.model_type in ("pangu_ultra_moe"):
hf_config.model_type = "pangu_ultra_moe_mtp"
if hf_config.model_type == "pangu_ultra_moe_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
)
if hf_config.architectures[0] == "MiMoForCausalLM":
hf_config.model_type = "mimo_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"num_hidden_layers": 0,
"n_predict": n_predict,
"architectures": ["MiMoMTPModel"],
}
)
if hf_config.architectures[0] == "Glm4MoeForCausalLM":
hf_config.model_type = "glm4_moe_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"n_predict": n_predict,
"architectures": ["Glm4MoeMTPModel"],
}
)
if hf_config.architectures[0] == "Glm4MoeLiteForCausalLM":
hf_config.model_type = "glm4_moe_lite_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"num_hidden_layers": 0,
"n_predict": n_predict,
"architectures": ["Glm4MoeLiteMTPModel"],
}
)
if hf_config.architectures[0] == "GlmOcrForConditionalGeneration":
hf_config.model_type = "glm_ocr_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{
"num_hidden_layers": 0,
"n_predict": n_predict,
"architectures": ["GlmOcrMTPModel"],
}
)
if hf_config.model_type == "ernie4_5_moe":
hf_config.model_type = "ernie_mtp"
if hf_config.model_type == "ernie_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
)
if (
hf_config.model_type == "nemotron_h"
and hasattr(hf_config, "num_nextn_predict_layers")
and hf_config.num_nextn_predict_layers > 0
):
# Check if this is an MTP variant
hf_config.model_type = "nemotron_h_mtp"
if hf_config.model_type == "nemotron_h_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
hf_config.update(
{"n_predict": n_predict, "architectures": ["NemotronHMTPModel"]}
)
if hf_config.model_type == "qwen3_next":
hf_config.model_type = "qwen3_next_mtp"
if hf_config.model_type == "qwen3_next_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
)
if hf_config.model_type == "exaone_moe":
hf_config.model_type = "exaone_moe_mtp"
if hf_config.model_type == "exaone_moe_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update(
{"n_predict": n_predict, "architectures": ["ExaoneMoeMTP"]}
)
if hf_config.model_type in ("qwen3_5", "qwen3_5_moe"):
is_moe = hf_config.model_type == "qwen3_5_moe"
hf_config.model_type = "qwen3_5_mtp"
n_predict = getattr(hf_config, "mtp_num_hidden_layers", None)
hf_config.update(
{
"n_predict": n_predict,
"architectures": ["Qwen3_5MoeMTP" if is_moe else "Qwen3_5MTP"],
}
)
if hf_config.model_type == "longcat_flash":
hf_config.model_type = "longcat_flash_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
hf_config.update(
{"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
)
if hf_config.model_type == "step3p5":
hf_config.model_type = "step3p5_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
hf_config.update({"n_predict": n_predict, "architectures": ["Step3p5MTP"]})
if initial_architecture == "MistralLarge3ForCausalLM":
hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]})
return hf_config
def __post_init__(self):
# Note: "method" is a new parameter that helps to extend the
# configuration of non-model-based proposers, and the "model" parameter
# will be used to set the draft model, eagle head, or additional weight
# when needed. If users do not specify "method", the speculative method
# will be detected automatically if possible. If the speculative method
# cannot be detected, it will be treated as the "draft_model" method by
# default.
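# Example of the inference below: {"model": "ngram"} is normalized to
# method="ngram", while an unrecognized draft checkpoint falls back to
# method="draft_model".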
# infer method from user args
if self.method is None:
if self.model in ("ngram", "[ngram]"):
self.method = "ngram"
else:
self.method = "draft_model"
if self.method in get_args(MTPModelTypes) and self.method != "mtp":
logger.warning(
"method `%s` is deprecated and replaced with mtp.", self.method
)
self.method = "mtp"
if self.model is None and self.num_speculative_tokens is not None:
if self.method == "mtp":
if self.target_model_config is None:
raise ValueError("target_model_config must be present for mtp")
if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
# FIXME(luccafong): cudagraph with v32 MTP is not supported,
# remove this when the issue is fixed.
self.enforce_eager = True
# use the draft model from the same model:
self.model = self.target_model_config.model
# Align the quantization of draft model for cases such as
# --quantization fp8 with a bf16 checkpoint.
if not self.quantization:
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "suffix":
self.model = "suffix"
else:
raise ValueError(
"num_speculative_tokens was provided but without speculative model."
)
if self.method in ("ngram", "[ngram]"):
# Unified to "ngram" internally
self.method = "ngram"
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
self.prompt_lookup_min = 5
self.prompt_lookup_max = 5
elif self.prompt_lookup_min is None:
if self.prompt_lookup_max is None:
raise ValueError(
"Either prompt_lookup_max or prompt_lookup_min must be "
"provided when using the ngram method."
)
self.prompt_lookup_min = self.prompt_lookup_max
elif self.prompt_lookup_max is None:
if self.prompt_lookup_min is None:
raise ValueError(
"Either prompt_lookup_max or prompt_lookup_min must be "
"provided when using the ngram method."
)
self.prompt_lookup_max = self.prompt_lookup_min
# Validate values
if self.prompt_lookup_min > self.prompt_lookup_max:
raise ValueError(
f"prompt_lookup_min={self.prompt_lookup_min} must "
f"be <= prompt_lookup_max={self.prompt_lookup_max}"
)
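# Illustration (proposer semantics, not enforced here): with
# prompt_lookup_min=2 and prompt_lookup_max=4, the ngram proposer looks for a
# matching suffix n-gram of length 4 down to 2 in the context.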
# TODO: currently we still need to extract vocab_size from the target
# model config; in the future, we may refactor this out and set the
# draft-related config to None here.
self.draft_model_config = self.target_model_config
self.draft_parallel_config = self.target_parallel_config
elif self.method == "suffix":
self._validate_suffix_decoding()
else:
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
if self.model is not None:
self.draft_model_config = ModelConfig(
model=self.model,
runner="draft",
tokenizer=self.target_model_config.tokenizer,
tokenizer_mode=self.target_model_config.tokenizer_mode,
trust_remote_code=self.target_model_config.trust_remote_code,
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
allowed_media_domains=self.target_model_config.allowed_media_domains,
dtype=self.target_model_config.dtype,
seed=self.target_model_config.seed,
revision=self.revision,
code_revision=self.code_revision,
tokenizer_revision=self.target_model_config.tokenizer_revision,
spec_target_max_model_len=self.target_model_config.max_model_len,
quantization=self.quantization,
enforce_eager=self.target_model_config.enforce_eager,
max_logprobs=self.target_model_config.max_logprobs,
hf_overrides=SpeculativeConfig.hf_config_override,
config_format=self.target_model_config.config_format,
)
# Automatically detect the method
if self.method in ("eagle", "eagle3"):
pass
# examples:
# yuhuili/EAGLE-LLaMA3-Instruct-8B
# yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
# AngelSlim/Qwen3-8B_eagle3
elif "eagle-" in self.draft_model_config.model.lower():
self.method = "eagle"
elif "eagle3" in self.draft_model_config.model.lower():
self.method = "eagle3"
elif self.draft_model_config.hf_config.model_type == "medusa":
self.method = "medusa"
elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
self.method = "mlp_speculator"
elif self.draft_model_config.hf_config.model_type in get_args(
MTPModelTypes
):
self.method = "mtp"
if self.num_speculative_tokens > 1:
logger.warning(
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
",which may result in lower acceptance rate"
)
elif self.draft_model_config.hf_config.model_type == "longcat_flash_mtp":
self.method = "longcat_flash_mtp"
if self.num_speculative_tokens > 1:
logger.warning(
"LongCat MTP models only have "
"one layer. Might need some code changes "
"to support multiple layers."
)
elif self.method == "draft_model":
pass
else:
raise NotImplementedError(
f"Unsupported speculative method: '{self.method}'"
)
# Replace hf_config for EAGLE draft_model
if self.method in ("eagle", "eagle3"):
from vllm.transformers_utils.configs import SpeculatorsConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
if isinstance(
self.draft_model_config.hf_config,
(EAGLEConfig, SpeculatorsConfig),
):
pass
else:
eagle_config = EAGLEConfig(
self.draft_model_config.hf_config,
method=self.method,
model_type="eagle",
)
# EAGLEConfig primarily updates architectures, so update
# all architectures-related fields in draft_model_config
self.draft_model_config.hf_config = eagle_config
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = (
self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
if self.num_speculative_tokens is not None and hasattr(
self.draft_model_config.hf_config, "num_lookahead_tokens"
):
self.draft_model_config.hf_config.num_lookahead_tokens = (
self.num_speculative_tokens
)
n_predict = getattr(
self.draft_model_config.hf_config, "n_predict", None
)
if n_predict is not None:
if self.num_speculative_tokens is None:
# Default to max value defined in draft model config.
self.num_speculative_tokens = n_predict
elif (
self.num_speculative_tokens > n_predict
and self.num_speculative_tokens % n_predict != 0
):
# Ensure divisibility for MTP module reuse.
raise ValueError(
f"num_speculative_tokens:{self.num_speculative_tokens}"
f" must be divisible by {n_predict=}"
)
if self.speculative_token_tree is None:
if self.num_speculative_tokens is None:
raise ValueError(
"A speculative model was provided, but neither "
"`speculative_token_tree` nor `num_speculative_tokens` "
"was provided"
)
# Generate chain of tokens.
self.speculative_token_tree = str(
[(i + 1) * (0,) for i in range(self.num_speculative_tokens)]
)
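# e.g. num_speculative_tokens=3 -> "[(0,), (0, 0), (0, 0, 0)]", i.e. a
# single chain of depth 3.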
else:
# Sort the token tree breadth-first.
tree_choices = ast.literal_eval(self.speculative_token_tree)
self.speculative_token_tree = str(
sorted(tree_choices, key=lambda t: (len(t), t))
)
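# e.g. "[(0,), (0, 0), (1,)]" becomes "[(0,), (1,), (0, 0)]": all depth-1
# nodes come before any depth-2 node.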
self.draft_tensor_parallel_size = (
SpeculativeConfig._verify_and_get_draft_tp(
self.target_parallel_config,
self.draft_tensor_parallel_size,
self.draft_model_config.hf_config,
)
)
self.draft_model_config.max_model_len = (
SpeculativeConfig._maybe_override_draft_max_model_len(
self.max_model_len,
self.draft_model_config.max_model_len,
self.target_model_config.max_model_len,
)
)
self.draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
self.target_parallel_config, self.draft_tensor_parallel_size
)
)
return self
def _validate_suffix_decoding(self):
if not has_arctic_inference():
raise ImportError(
"Arctic Inference is required for suffix decoding. "
"Install via `pip install arctic-inference==0.1.1`."
)
if self.num_speculative_tokens is None:
# Suffix decoding decides the actual number of speculative tokens
# dynamically and treats num_speculative_tokens as a maximum limit.
self.num_speculative_tokens = self.suffix_decoding_max_tree_depth
logger.warning(
"Defaulted num_speculative_tokens to %s for suffix decoding.",
self.num_speculative_tokens,
)
# Validate values
if self.suffix_decoding_max_tree_depth < 1:
raise ValueError(
f"suffix_decoding_max_tree_depth="
f"{self.suffix_decoding_max_tree_depth} must be >= 1"
)
if self.suffix_decoding_max_cached_requests < 0:
raise ValueError(
f"suffix_decoding_max_cached_requests="
f"{self.suffix_decoding_max_cached_requests} must be >= 0"
)
if self.suffix_decoding_max_spec_factor < 0:
raise ValueError(
f"suffix_decoding_max_spec_factor="
f"{self.suffix_decoding_max_spec_factor} must be >= 0"
)
if not 0 <= self.suffix_decoding_min_token_prob <= 1:
raise ValueError(
f"suffix_decoding_min_token_prob="
f"{self.suffix_decoding_min_token_prob} must be in [0, 1]"
)
@staticmethod
def _maybe_override_draft_max_model_len(
speculative_max_model_len: int | None,
draft_max_model_len: int,
target_max_model_len: int,
) -> int:
"""Determine the max sequence len for the draft model. This is usually
the draft_max_model_len, but may be the target_max_model_len if it is
less than the draft_max_model_len, or may be speculative_max_model_len
if it is specified.
This is necessary so that sequences do not exceed the capacity of the
draft model or the target model.
speculative_max_model_len is mainly used for testing that sequences can
skip speculation.
"""
if speculative_max_model_len is not None:
if speculative_max_model_len > draft_max_model_len:
raise ValueError(
f"{speculative_max_model_len=} cannot be "
f"larger than {draft_max_model_len=}"
)
if speculative_max_model_len > target_max_model_len:
raise ValueError(
f"{speculative_max_model_len=} cannot be "
f"larger than {target_max_model_len=}"
)
return speculative_max_model_len
return min(
draft_max_model_len,
target_max_model_len,
)
@staticmethod
def _verify_and_get_draft_tp(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int | None,
draft_hf_config: PretrainedConfig,
) -> int:
"""
Verifies and adjusts the tensor parallel size for a draft model
specified using speculative_draft_tensor_parallel_size.
"""
# If speculative_draft_tensor_parallel_size is unset, set it
# appropriately; otherwise, verify that it is set correctly.
if speculative_draft_tensor_parallel_size is None:
if draft_hf_config.model_type == "mlp_speculator":
speculative_draft_tensor_parallel_size = 1
if target_parallel_config.tensor_parallel_size > 1:
logger.warning(
"%s cannot currently be run with tp>1; "
"setting speculative_draft_tensor_parallel_size=1",
draft_hf_config.model_type,
)
else:
speculative_draft_tensor_parallel_size = (
target_parallel_config.tensor_parallel_size
)
elif speculative_draft_tensor_parallel_size not in (
1,
target_parallel_config.tensor_parallel_size,
):
raise ValueError(
f"{speculative_draft_tensor_parallel_size=} cannot be "
f"other value than 1 or target model tensor_parallel_size"
)
return speculative_draft_tensor_parallel_size
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
"""
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce,
ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight,
placement_group=target_parallel_config.placement_group,
)
return draft_parallel_config
@model_validator(mode="after")
def _verify_args(self) -> Self:
if self.tensor_parallel_size is not None:
raise ValueError(
"'tensor_parallel_size' is not a valid argument in the "
"speculative_config. Please pass 'draft_tensor_parallel_size' instead."
)
if self.num_speculative_tokens is None:
raise ValueError(
"num_speculative_tokens must be provided with "
"speculative model unless the draft model config contains an "
"n_predict parameter."
)
if self.num_speculative_tokens <= 0:
raise ValueError(
"Expected num_speculative_tokens to be greater "
f"than zero ({self.num_speculative_tokens})."
)
if self.draft_model_config:
self.draft_model_config.verify_with_parallel_config(
self.draft_parallel_config
)
eagle3_target_supported = [
"llama",
"qwen",
"minicpm",
"gpt_oss",
"hunyuan_vl",
"hunyuan_v1_dense",
"afmoe",
"nemotron_h",
]
if (
self.method == "eagle3"
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
)
self.verify_equal_vocab_size_if_draft_model()
return self
def verify_equal_vocab_size_if_draft_model(self):
if (
self.method == "draft_model"
and self.target_model_config is not None
and self.draft_model_config is not None
):
target_vocab_size = self.target_model_config.get_vocab_size()
draft_vocab_size = self.draft_model_config.get_vocab_size()
if target_vocab_size != draft_vocab_size:
raise ValueError(
f"Target and draft model should have the same vocabulary size. "
f"Target model vocab_size={target_vocab_size}. "
f"Draft model vocab_size={draft_vocab_size}. "
f"Using models with different tokenizers can cause out-of-bounds "
f"errors during speculative decoding."
)
@property
def max_num_new_slots_for_drafting(self) -> int:
"""
Calculate the maximum number of new slots that might be added to the batch
when drafting.
"""
slots_per_req = 0 # for serial non-draft-model methods, no change needed
if self.parallel_drafting:
# For parallel drafting, we need one new slot per 'masked' token
slots_per_req = self.num_speculative_tokens - 1
if self.uses_draft_model():
# For draft model-based speculation, we need one new slot per request
# Since we do not slice the draft tokens
slots_per_req += 1
return slots_per_req
def use_eagle(self) -> bool:
return self.method in ("eagle", "eagle3", "mtp")
def uses_draft_model(self) -> bool:
return self.method == "draft_model"
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

39
vllm/config/speech_to_text.py Normal file
View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.config.utils import config
@config
class SpeechToTextConfig:
"""Configuration for speech-to-text models."""
sample_rate: float = 16_000
"""Sample rate (Hz) to resample input audio to. Most speech models expect
16kHz audio input. The input audio will be automatically resampled to this
rate before processing."""
max_audio_clip_s: int | None = 30
"""Maximum duration in seconds for a single audio clip without chunking.
Audio longer than this will be split into smaller chunks if
`allow_audio_chunking` evaluates to True, otherwise it will be rejected.
`None` means audio duration can be unlimited and won't be chunked."""
overlap_chunk_second: int = 1
"""Overlap duration in seconds between consecutive audio chunks when
splitting long audio. This helps maintain context across chunk boundaries
and improves transcription quality at split points."""
min_energy_split_window_size: int | None = 1600
"""Window size in samples for finding low-energy (quiet) regions to split
audio chunks. The algorithm looks for the quietest moment within this
window to minimize cutting through speech. Default 1600 samples ≈ 100ms
at 16kHz. If None, no chunking will be done."""
@property
def allow_audio_chunking(self) -> bool:
return (
self.min_energy_split_window_size is not None
and self.max_audio_clip_s is not None
)
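# Example: with the defaults (max_audio_clip_s=30,
# min_energy_split_window_size=1600), allow_audio_chunking is True; setting
# either to None disables chunking.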

76
vllm/config/structured_outputs.py Normal file
View File

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import model_validator
from typing_extensions import Self
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
StructuredOutputsBackend = Literal[
"auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
]
@config
class StructuredOutputsConfig:
"""Dataclass which contains structured outputs config for the engine."""
backend: StructuredOutputsBackend = "auto"
"""Which engine will be used for structured outputs (e.g. JSON schema,
regex, etc) by default. With "auto", we will make opinionated choices
based on request contents and what the backend libraries currently support,
so the behavior is subject to change in each release."""
disable_fallback: bool = False
"""If `True`, vLLM will not fallback to a different backend on error."""
disable_any_whitespace: bool = False
"""If `True`, json output will always be compact without any whitespace.
If `False`, the model may generate whitespace between JSON fields,
which is still valid JSON. This is only supported for xgrammar
and guidance backends."""
disable_additional_properties: bool = False
"""If `True`, the `guidance` backend will not use `additionalProperties`
in the JSON schema. This is only supported for the `guidance` backend and
is used to better align its behaviour with `outlines` and `xgrammar`."""
reasoning_parser: str = ""
"""Select the reasoning parser depending on the model that you're using.
This is used to parse the reasoning content into OpenAI API format."""
reasoning_parser_plugin: str = ""
"""Path to a dynamically reasoning parser plugin that can be dynamically
loaded and registered."""
enable_in_reasoning: bool = False
"""Whether to use structured input for reasoning."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@model_validator(mode="after")
def _validate_structured_output_config(self) -> Self:
if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"):
raise ValueError(
"disable_any_whitespace is only supported for "
"xgrammar and guidance backends."
)
if self.disable_additional_properties and self.backend != "guidance":
raise ValueError(
"disable_additional_properties is only supported "
"for the guidance backend."
)
return self
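# Example: StructuredOutputsConfig(backend="outlines", disable_any_whitespace=True)
# is rejected by the validator above, since only the xgrammar and guidance
# backends support that option.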

447
vllm/config/utils.py Normal file
View File

@@ -0,0 +1,447 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for vLLM config dataclasses."""
import ast
import enum
import hashlib
import inspect
import json
import os
import pathlib
import textwrap
from collections.abc import Callable, Mapping, Sequence, Set
from dataclasses import MISSING, field, fields, is_dataclass
from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
import torch
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
from pydantic.fields import Field as PydanticField
from pydantic.fields import FieldInfo
from typing_extensions import dataclass_transform, runtime_checkable
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
if TYPE_CHECKING:
from _typeshed import DataclassInstance
else:
DataclassInstance = Any
ConfigType = type[DataclassInstance]
ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
@dataclass_transform(field_specifiers=(PydanticField,))
def config(
cls: type[ConfigT] | None = None,
*,
config: ConfigDict | None = None,
**kwargs: Any,
) -> type[ConfigT] | Callable[[type[ConfigT]], type[ConfigT]]:
"""Decorator to create a pydantic dataclass with default config. The default config
for the dataclass forbids extra fields.
All config classes in vLLM should use this decorator.
Args:
cls: The class to decorate
config: The pydantic ConfigDict to use. If provided, it will be merged with
the default config.
**kwargs: Additional arguments to pass to pydantic.dataclass."""
# Extra fields are forbidden by default
merged_config = ConfigDict(extra="forbid")
if config is not None:
merged_config.update(config)
def decorator(cls):
return dataclass(cls, config=merged_config, **kwargs)
# Called with arguments: @config(config=...)
if cls is None:
return decorator
# Called without arguments: @config
return decorator(cls)
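# Usage sketch (hypothetical class): extra keys are rejected because the
# generated pydantic dataclass uses extra="forbid".
#
#     @config
#     class _Example:
#         foo: int = 0
#
#     _Example(foo=1)  # ok
#     _Example(bar=2)  # raises a pydantic ValidationError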
def get_field(cls: ConfigType, name: str) -> Any:
"""Get the default factory field of a dataclass by name. Used for getting
default factory fields in `EngineArgs`."""
if not is_dataclass(cls):
raise TypeError("The given class is not a dataclass.")
try:
named_field = next(f for f in fields(cls) if f.name == name)
except StopIteration as e:
raise ValueError(f"Field '{name}' not found in {cls.__name__}.") from e
# The arguments to copy to the new field
default = named_field.default
default_factory = named_field.default_factory
init = named_field.init
# Handle pydantic.Field
if isinstance(default, FieldInfo):
if default.init is not None:
init = default.init
if default.default_factory is not None:
default_factory = cast(Callable[[], Any], default.default_factory)
default = MISSING
else:
default = default.default
if default is MISSING and default_factory is MISSING:
logger.warning_once(
"%s.%s has no default or default factory.", cls.__name__, name
)
return field(default=default, default_factory=default_factory, init=init)
def is_init_field(cls: ConfigType, name: str) -> bool:
return get_field(cls, name).init
def replace(dataclass_instance: ConfigT, /, **kwargs) -> ConfigT:
"""Like [`dataclasses.replace`](https://docs.python.org/3/library/dataclasses.html#dataclasses.replace),
but compatible with Pydantic dataclasses which use `pydantic.fields.Field` instead
of `dataclasses.field`"""
cls = type(dataclass_instance)
dataclass_dict = dataclass_instance.__dict__
dataclass_dict = {k: v for k, v in dataclass_dict.items() if is_init_field(cls, k)}
dataclass_dict.update(kwargs)
return cls(**dataclass_dict)
def getattr_iter(
object: object,
names: Sequence[str],
default: Any | None = None,
default_factory: Callable[[], Any] | None = None,
warn: bool = False,
) -> Any:
"""
A helper function that retrieves an attribute from an object which may
have multiple possible names. This is useful when fetching attributes from
arbitrary `transformers.PretrainedConfig` instances.
In the case where the first name in `names` is the preferred name, and
any other names are deprecated aliases, setting `warn=True` will log a
warning when a deprecated name is used.
"""
for i, name in enumerate(names):
if hasattr(object, name):
if warn and i > 0:
logger.warning_once(
"%s contains a deprecated attribute name '%s'. "
"Please use the preferred attribute name '%s' instead.",
type(object).__name__,
name,
names[0],
)
return getattr(object, name)
return default_factory() if default_factory is not None else default
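# e.g. getattr_iter(hf_config, ("num_key_value_heads", "num_kv_heads"),
# default=None) returns the first attribute that exists on hf_config
# (attribute names here are illustrative).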
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
"""
Get any docstrings placed after attribute assignments in a class body.
https://davidism.com/mit-license/
"""
cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
if not isinstance(cls_node, ast.ClassDef):
raise TypeError("Given object was not a class.")
out = {}
# Consider each pair of nodes.
for a, b in pairwise(cls_node.body):
# Must be an assignment then a constant string.
if (
not isinstance(a, (ast.Assign, ast.AnnAssign))
or not isinstance(b, ast.Expr)
or not isinstance(b.value, ast.Constant)
or not isinstance(b.value.value, str)
):
continue
doc = inspect.cleandoc(b.value.value)
# An assignment can have multiple targets (a = b = v), but an
# annotated assignment only has one target.
targets = a.targets if isinstance(a, ast.Assign) else [a.target]
for target in targets:
# Must be assigning to a plain name.
if not isinstance(target, ast.Name):
continue
out[target.id] = doc
return out
@runtime_checkable
class SupportsHash(Protocol):
def compute_hash(self) -> str: ...
class SupportsMetricsInfo(Protocol):
def metrics_info(self) -> dict[str, str]: ...
def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
processed_overrides = {}
for field_name, value in overrides.items():
assert hasattr(config, field_name), (
f"{type(config)} has no field `{field_name}`"
)
current_value = getattr(config, field_name)
if is_dataclass(current_value) and not is_dataclass(value):
assert isinstance(value, dict), (
f"Overrides to {type(config)}.{field_name} must be a dict"
f" or {type(current_value)}, but got {type(value)}"
)
value = update_config(
current_value, # type: ignore[type-var]
value,
)
processed_overrides[field_name] = value
return replace(config, **processed_overrides)
def normalize_value(x):
"""Return a stable, JSON-serializable canonical form for hashing.
Order: primitives, special types (Enum, callable, torch.dtype, Path), then
generic containers (Mapping/Set/Sequence) with recursion.
"""
# Fast path
if x is None or isinstance(x, (bool, int, float, str)):
return x
# Enums: tag with FQN to avoid primitive collisions.
# Ex: Enum(1) vs int(1) -> ("module.QualName", value).
if isinstance(x, enum.Enum):
enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
return (enum_type, normalize_value(x.value))
# Classes (types) are accepted and canonicalized by their fully-qualified
# name (module.qualname) for a stable identifier.
# Instances are only accepted if they expose uuid(); otherwise they are
# rejected to avoid under-hashing object state.
# Callables: accept classes only; reject funcs/lambdas/methods.
# Used by LogitsProcessor types and ModelConfig.hf_overrides.
if isinstance(x, type):
module = getattr(x, "__module__", "")
qual = getattr(x, "__qualname__", getattr(x, "__name__", ""))
return ".".join([p for p in (module, qual) if p]) or repr(x)
# Prefer stable uuid identifiers for objects that provide them, even if
# they are callable instances (e.g., InductorPass wrappers).
if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)):
return x.uuid()
if callable(x):
raise TypeError("normalize_value: function or callable instance unsupported")
# Torch dtype: stringify (torch.float64 -> "torch.float64").
# We rely on the string form here; dtype-bearing fields that need additional
# disambiguation should encode that at the config layer.
if isinstance(x, torch.dtype):
return str(x)
# Bytes
if isinstance(x, (bytes, bytearray)):
return x.hex()
# Paths (canonicalize)
if isinstance(x, pathlib.Path):
try:
return str(x.expanduser().resolve())
except Exception:
return str(x)
# Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability.
if is_dataclass(x):
type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}"
items = tuple(
(f.name, normalize_value(getattr(x, f.name)))
for f in sorted(fields(x), key=lambda f: f.name)
)
return (type_fqn, items)
# Containers (generic)
if isinstance(x, Mapping):
return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items()))
if isinstance(x, Set):
return tuple(sorted(repr(normalize_value(v)) for v in x))
if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
return tuple(normalize_value(v) for v in x)
# PretrainedConfig
if hasattr(x, "to_json_string") and callable(x.to_json_string):
return x.to_json_string()
# Unsupported type: e.g., modules, generators, open files, or objects
# without a stable JSON/UUID representation. Hard-error to avoid
# under-hashing.
# If you hit this, either reshape your config to use supported primitives
# and containers, or extend normalize_value to provide a stable encoding
# (e.g., via uuid() or to_json_string()) for this type.
raise TypeError(
f"normalize_value: unsupported type '{type(x).__name__}'. "
"Ensure config values use supported primitives/containers or add a "
"stable representation for this type."
)
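# Example: normalize_value({"dtype": torch.float16, "sizes": [1, 2]})
# -> (("dtype", "torch.float16"), ("sizes", (1, 2)))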
def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]:
"""Gets the factors used for hashing a config class.
- Includes all dataclass fields not in `ignored_factors`.
- Errors on non-normalizable values.
"""
factors: dict[str, object] = {}
for dc_field in fields(config):
factor = dc_field.name
if factor in ignored_factors:
continue
value = getattr(config, factor, None)
try:
factors[factor] = normalize_value(value)
except TypeError as e:
raise TypeError(
f"get_hash_factors: unsupported type for key '{factor}' "
f"({type(value).__name__})"
) from e
return factors
def hash_factors(items: dict[str, object]) -> str:
"""Return a SHA-256 hex digest of the canonical items structure."""
return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest()
@dataclass
class Range:
"""
A range of numbers.
Inclusive of start, inclusive of end.
"""
start: int
end: int
def is_single_size(self) -> bool:
return self.start == self.end
def __contains__(self, size: int) -> bool:
# Inclusive of start, inclusive of end
return self.start <= size <= self.end
def __eq__(self, other: object) -> bool:
if not isinstance(other, Range):
return False
return self.start == other.start and self.end == other.end
def __hash__(self) -> int:
return hash((self.start, self.end))
def __str__(self) -> str:
return f"({self.start}, {self.end})"
def __repr__(self) -> str:
return self.__str__()
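# Example: 5 in Range(start=1, end=8) -> True (bounds are inclusive);
# Range(3, 3).is_single_size() -> True.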
def handle_deprecated(
config: ConfigT,
old_name: str,
new_name_or_names: str | list[str],
removal_version: str,
) -> None:
old_val = getattr(config, old_name)
if old_val is None:
return
if isinstance(new_name_or_names, str):
new_names = [new_name_or_names]
else:
new_names = new_name_or_names
msg = (
f"{old_name} is deprecated and will be removed in {removal_version}. "
f"Use {', '.join(new_names)} instead."
)
logger.warning(msg)
for new_name in new_names:
setattr(config, new_name, old_val)
def get_from_deprecated_env_if_set(
env_name: str,
removal_version: str,
field_name: str | None = None,
) -> str | None:
"""
Get value from deprecated environment variable with warning.
Args:
env_name: Name of the deprecated environment variable
removal_version: Version when it will be removed
field_name: Name of the field to suggest as alternative
Returns:
The environment variable value if set, None otherwise
"""
if envs.is_set(env_name):
value = os.environ.get(env_name)
alt_msg = f" Please use {field_name} instead." if field_name else ""
logger.warning_once(
"Using %s environment variable is deprecated and will be removed in %s.%s",
env_name,
removal_version,
alt_msg,
)
return value
return None
def set_from_deprecated_env_if_set(
config: ConfigT,
env_name: str,
removal_version: str,
field_name: str,
to_bool: bool = False,
to_int: bool = False,
) -> None:
"""
Set object field from deprecated environment variable with warning.
Args:
config: Config object to set the field on
env_name: Name of the deprecated environment variable
removal_version: Version when the env var will be removed
field_name: Name of the field to set
to_bool: Whether to convert the environment variable value to boolean
to_int: Whether to convert the environment variable value to integer
Returns:
None
"""
if to_bool and to_int:
raise ValueError("Cannot convert to both boolean and integer.")
env_value = get_from_deprecated_env_if_set(env_name, removal_version, field_name)
if env_value is not None:
field_value: str | bool | int = env_value
if to_bool:
field_value = env_value.lower() in ("1", "true")
elif to_int:
field_value = int(env_value)
setattr(config, field_name, field_value)
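# Usage sketch (hypothetical env var and field names):
#     set_from_deprecated_env_if_set(
#         cfg, "VLLM_SOME_OLD_FLAG", removal_version="v0.12.0",
#         field_name="some_new_flag", to_bool=True,
#     )
# which sets cfg.some_new_flag from the env var and logs a deprecation warning.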

1758
vllm/config/vllm.py Normal file

File diff suppressed because it is too large

13
vllm/config/weight_transfer.py Normal file
View File

@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Literal
from vllm.config.utils import config
@config
class WeightTransferConfig:
"""Configuration for weight transfer during RL training."""
backend: Literal["nccl"] = "nccl"
"""The backend to use for weight transfer."""