Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -16,8 +16,8 @@ class AttentionConfig:
backend: AttentionBackendEnum | None = None
"""Attention backend to use. If None, will be selected automatically."""
flash_attn_version: Literal[2, 3] | None = None
"""Force vllm to use a specific flash-attention version (2 or 3).
flash_attn_version: Literal[2, 3, 4] | None = None
"""Force vllm to use a specific flash-attention version (2, 3, or 4).
Only valid when using the flash-attention backend."""
use_prefill_decode_attention: bool = False
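
For reference, a minimal sketch of opting into the newly allowed version 4, assuming AttentionConfig is importable from vllm.config (the exact import path is an assumption):

from vllm.config import AttentionConfig  # import path assumed, may differ per release

# Force FlashAttention v4; only honored when the flash-attention backend is selected.
attn_cfg = AttentionConfig(flash_attn_version=4)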

View File

@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):
def separate_routine(self) -> bool:
return isinstance(self.value, tuple)
def decode_use_graph(self) -> bool:
return self.decode_mode() == CUDAGraphMode.FULL
def valid_runtime_modes(self) -> bool:
return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
@classmethod
def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
def is_valid_runtime_mode(self) -> bool:
return self in CUDAGraphMode.valid_runtime_modes()
def __str__(self) -> str:
return self.name
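
A small sketch of how the new classmethod and instance helper fit together, assuming CUDAGraphMode is importable from vllm.config and FULL_DECODE_ONLY remains a composite (tuple-valued) member:

from vllm.config import CUDAGraphMode  # import path assumed

# Only the single-phase modes count as runtime modes.
assert CUDAGraphMode.FULL in CUDAGraphMode.valid_runtime_modes()
assert CUDAGraphMode.NONE.is_valid_runtime_mode()
# Composite modes such as FULL_DECODE_ONLY are configuration-only and fail the check.
assert not CUDAGraphMode.FULL_DECODE_ONLY.is_valid_runtime_mode()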
@@ -385,7 +392,7 @@ class CompilationConfig:
Please use mode. Currently all levels are mapped to mode.
"""
# Top-level Compilation control
mode: CompilationMode = Field(default=None)
mode: CompilationMode = Field(default=CompilationMode.NONE)
"""The compilation approach used for torch.compile-based compilation of the
model.
@@ -503,7 +510,7 @@ class CompilationConfig:
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
# CudaGraph compilation
cudagraph_mode: CUDAGraphMode = Field(default=None)
cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
"""
The mode of the cudagraph:
@@ -1003,6 +1010,7 @@ class CompilationConfig:
# https://github.com/vllm-project/vllm/issues/33267
if not self.use_inductor_graph_partition:
self.splitting_ops.append("vllm::unified_kv_cache_update")
self.splitting_ops.append("vllm::unified_mla_kv_cache_update")
elif len(self.splitting_ops) == 0:
if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
"are optimized for prefill and are incompatible with CUDA Graphs. "
"In order to use CUDA Graphs for decode-optimized workloads, "
"use --all2all-backend with another option, such as "
"deepep_low_latency, pplx, or allgather_reducescatter."
"deepep_low_latency or allgather_reducescatter."
)
self.cudagraph_mode = CUDAGraphMode.NONE

View File

@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.v1.attention.backends.registry import AttentionBackendEnum
import os
if TYPE_CHECKING:
from transformers import PretrainedConfig
@@ -128,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
@@ -463,8 +462,6 @@ class ModelConfig:
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform
if self.override_attention_dtype is not None and not current_platform.is_rocm():
warnings.warn(
"override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:
if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
raise ValueError("Sleep mode is not supported on current platform.")
temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
hf_config = get_config(
temp_hf_config_path or self.hf_config_path or self.model,
self.hf_config_path or self.model,
self.trust_remote_code,
self.revision,
self.code_revision,
@@ -622,6 +618,16 @@ class ModelConfig:
self._try_verify_and_update_model_config()
self._verify_quantization()
self._verify_cuda_graph()
import os
enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH")
if enforce_cuda_graph in ("1", "y", "Y"):
self.enforce_eager = False
else:
self.enforce_eager = True
logger.warning_once(
"Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. "
"For now, cuda graph is not used and eager execution is enforced; "
"we are working towards making cuda graph the default mode.")
self._verify_bnb_config()
def get_model_arch_config(
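
A minimal sketch of opting into cuda graphs under this overlay, assuming the environment variable is read at ModelConfig construction time as shown above:

import os

# Must be set before the engine / ModelConfig is created.
os.environ["VLLM_ENFORCE_CUDA_GRAPH"] = "1"
# With the flag set, enforce_eager stays False and cuda graph capture is allowed;
# without it, the overlay forces enforce_eager=True and logs the warning above.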
@@ -886,6 +892,7 @@ class ModelConfig:
"modelopt",
"modelopt_fp4",
"modelopt_mxfp8",
"modelopt_mixed",
"petit_nvfp4",
# Ensure heavy backends are probed last to avoid unnecessary
# imports during override detection (e.g., MXFP4 imports Triton)
@@ -942,8 +949,6 @@ class ModelConfig:
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}."
)
from vllm.platforms import current_platform
current_platform.verify_quantization(self.quantization)
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
*,
is_pooling_model: bool,
):
from vllm.platforms import current_platform
supported_dtypes = [
dtype
for dtype in current_platform.supported_dtypes

View File

@@ -152,7 +152,6 @@ class ParallelConfig:
- "naive": Naive all2all implementation using broadcasts\n
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
- "pplx": Use pplx kernels\n
- "deepep_high_throughput": Use deepep high-throughput kernels\n
- "deepep_low_latency": Use deepep low-latency kernels\n
- "mori": Use mori kernels\n
@@ -166,6 +165,9 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_elastic_ep: bool = False
"""Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
enable_dbo: bool = False
"""Enable dual batch overlap for the model executor."""
ubatch_size: int = 0
@@ -245,6 +247,34 @@ class ParallelConfig:
Set to be private as it's not intended to be configured by users.
"""
_stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless DP groups when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
It is a list of list[int], where each inner list contains a set of 3 ports
to be used for setting up the stateless CPU/device/TCPStore groups
in StatelessGroupCoordinator. The number of inner lists is equal to
the number of DP groups,
i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
"""
_stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless EP groups when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size.
"""
_stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless EPLB groups when enable_elastic_ep is True.
Same topology as EP but separate NCCL communicator to avoid deadlocks.
"""
_stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless world group when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
len(self._stateless_world_group_port_list) == 1.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -310,6 +340,13 @@ class ParallelConfig:
f"but found: {self._api_process_rank}"
)
if self.all2all_backend == "pplx":
logger.warning(
"The 'pplx' all2all backend has been removed. "
"Falling back to 'allgather_reducescatter'."
)
self.all2all_backend = "allgather_reducescatter"
if self.data_parallel_size_local > self.data_parallel_size:
raise ValueError(
f"data_parallel_size_local ({self.data_parallel_size_local}) "
@@ -396,7 +433,67 @@ class ParallelConfig:
return answer
def stateless_init_dp_group(self) -> ProcessGroup:
def allocate_elastic_ep_ports(self) -> None:
"""Allocate all ports for elastic EP (stateless groups + DP master).
Must be called AFTER ray.init() so that ports claimed by Ray's
idle worker pool are already in use and won't be returned by
get_open_ports_list().
"""
if not self.enable_elastic_ep:
return
if self._stateless_world_group_port_list:
return
num_world_groups = 1
dp_size = self.data_parallel_size
ep_size = self.data_parallel_size * self.world_size_across_dp
num_dp_groups = max(1, self.world_size_across_dp // dp_size)
num_ep_groups = max(1, self.world_size_across_dp // ep_size)
num_eplb_groups = num_ep_groups
total_stateless_ports = (
num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
) * 3
num_dp_master_ports = 5
all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
all_ports = all_ports[:-num_dp_master_ports]
self._stateless_world_group_port_list = [
all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
]
start_idx = num_world_groups * 3
self._stateless_dp_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
]
start_idx += num_dp_groups * 3
self._stateless_ep_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
]
start_idx += num_ep_groups * 3
self._stateless_eplb_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
]
def get_next_stateless_world_group_port(self) -> list[int]:
return self._stateless_world_group_port_list.pop()
def get_next_stateless_dp_group_port(self) -> list[int]:
return self._stateless_dp_group_port_list.pop()
def get_next_stateless_ep_group_port(self) -> list[int]:
return self._stateless_ep_group_port_list.pop()
def get_next_stateless_eplb_group_port(self) -> list[int]:
return self._stateless_eplb_group_port_list.pop()
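
A worked example of the port budget implied by allocate_elastic_ep_ports(), using hypothetical sizes:

# world_size_across_dp = 8, data_parallel_size = 4 (hypothetical)
# num_world_groups = 1
# num_dp_groups    = max(1, 8 // 4)  = 2
# ep_size          = 4 * 8           = 32
# num_ep_groups    = max(1, 8 // 32) = 1
# num_eplb_groups  = 1
# stateless ports  = (1 + 2 + 1 + 1) * 3 = 15
# total requested  = 15 + 5 DP master ports = 20 open ports from get_open_ports_list()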
def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
# NOTE: In high-concurrency scenarios multiple processes
# can pick the same (currently free) port through a race
# condition when calling `get_open_port()`. When the first
@@ -420,7 +517,8 @@ class ParallelConfig:
self.get_next_dp_init_port(),
self.data_parallel_rank,
self.data_parallel_size,
backend=current_platform.dist_backend,
backend="gloo",
return_store=return_store,
)
except DistNetworkError as e:
# We only want to retry when the root cause is EADDRINUSE.
@@ -442,7 +540,6 @@ class ParallelConfig:
# In this case, ensure the input to the experts is sequence parallel
# to avoid the excess work.
#
# Not needed for pplx-kernels as it can handle duplicate input tokens.
@property
def use_sequence_parallel_moe(self) -> bool:
return (
@@ -556,6 +653,21 @@ class ParallelConfig:
logger.info("Using external launcher for distributed inference.")
self.world_size *= self.data_parallel_size
if self.enable_elastic_ep:
if not self.enable_eplb:
raise ValueError("Elastic EP is only supported with enable_eplb=True.")
if self.pipeline_parallel_size > 1:
raise ValueError(
"Elastic EP is not supported with pipeline parallelism "
f"(pipeline_parallel_size={self.pipeline_parallel_size})."
)
if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
raise NotImplementedError(
"Elastic EP is not compatible with data_parallel_external_lb "
"or data_parallel_hybrid_lb. Elastic EP relies on a single API "
"server and core client to coordinate scale up/down."
)
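
A sketch of a parallel layout that passes the elastic EP checks above; constructor field names other than those referenced in this diff are assumptions:

from vllm.config import ParallelConfig  # import path assumed

parallel_config = ParallelConfig(
    data_parallel_size=4,
    tensor_parallel_size=2,
    pipeline_parallel_size=1,   # PP > 1 is rejected for elastic EP
    enable_elastic_ep=True,
    enable_eplb=True,           # elastic EP requires EPLB
)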
if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
# Data parallel was specified in the engine args.
if self.distributed_executor_backend == "external_launcher":
@@ -568,9 +680,12 @@ class ParallelConfig:
"Set data_parallel_rank to %d automatically.",
self.data_parallel_rank,
)
if not self._data_parallel_master_port_list:
self._data_parallel_master_port_list = get_open_ports_list(5)
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
if not self.enable_elastic_ep:
if not self._data_parallel_master_port_list:
self._data_parallel_master_port_list = get_open_ports_list(5)
self.data_parallel_master_port = (
self._data_parallel_master_port_list.pop()
)
if not (0 <= self.data_parallel_rank < self.data_parallel_size):
raise ValueError(
@@ -597,7 +712,7 @@ class ParallelConfig:
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
logger.info("Disabling V1 multiprocessing for external launcher.")
if self.distributed_executor_backend is None and self.world_size > 1:
if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
@@ -659,6 +774,17 @@ class ParallelConfig:
"backend is mp, uni or external_launcher."
)
if (
self.all2all_backend in ("allgather_reducescatter", "naive")
and self.eplb_config.use_async
):
logger.warning(
"Async EPLB causes hangs with the '%s' all2all backend. "
"Forcing synchronous EPLB.",
self.all2all_backend,
)
self.eplb_config.use_async = False
@property
def use_ray(self) -> bool:
return self.distributed_executor_backend == "ray" or (

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import copy
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
"pangu_ultra_moe_mtp",
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
SpeculativeMethod = Literal[
"ngram",
"medusa",
@@ -77,12 +78,24 @@ class SpeculativeConfig:
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
enable_multi_layers_mtp: bool = False
"""If set to True, the MTP method will run multiple layers of MTP
speculator. If set to False, it will run only one layer of MTP speculator.
This is only effective when the method is set to `mtp`."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of pipeline parallelism for the draft model.
Defaults to the target model's pipeline parallel size. Set this to 1 to
run the drafter locally on the last target PP stage."""
tensor_parallel_size: int | None = None
"""Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
warn users when they mistakenly provide the wrong argument."""
pipeline_parallel_size: int | None = None
"""Users should pass "draft_pipeline_parallel_size". This parameter's
purpose is to warn users when they mistakenly provide the wrong argument."""
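
A hedged sketch of the intended usage, passing the draft_* names rather than the bare tensor/pipeline sizes; the model path and token count are hypothetical:

speculative_config = {
    "method": "eagle3",
    "model": "path/to/eagle3-draft",      # hypothetical draft model
    "num_speculative_tokens": 3,
    "draft_tensor_parallel_size": 1,
    "draft_pipeline_parallel_size": 1,    # run the drafter on the last target PP stage
}
# Passing "tensor_parallel_size" or "pipeline_parallel_size" here raises a ValueError
# pointing at the draft_* variants (see the checks later in this file).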
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
@@ -181,9 +194,22 @@ class SpeculativeConfig:
the final hidden states.
"""
factors: list[Any] = []
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
# Eagle3 and extract_hidden_states affect the computation graph because
# they return intermediate hidden states in addition to the final hidden state.
uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
factors.append(uses_aux_hidden_states)
# The specific layers used also affect the computation graph
if uses_aux_hidden_states and self.draft_model_config is not None:
layer_ids = getattr(
self.draft_model_config.hf_config,
"eagle_aux_hidden_state_layer_ids",
None,
)
if layer_ids is not None:
# Convert to tuple to make it hashable
factors.append(tuple(layer_ids))
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@@ -352,6 +378,8 @@ class SpeculativeConfig:
self.model = "ngram"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
self.model = "extract_hidden_states"
else:
raise ValueError(
"num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
self.draft_parallel_config = self.target_parallel_config
elif self.method == "suffix":
self._validate_suffix_decoding()
elif self.method == "extract_hidden_states":
from vllm.transformers_utils.configs.extract_hidden_states import (
ExtractHiddenStatesConfig,
)
# ExtractHiddenStatesModel is instantiated manually in load_model()
# We just need to store the target model config for KV cache shape info
self.model = "extract_hidden_states"
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
if hasattr(self.draft_model_config, "hf_config"):
hf_config = self.draft_model_config.hf_config.to_dict()
elif (
isinstance(self.draft_model_config, dict)
and "hf_config" in self.draft_model_config
):
hf_config = self.draft_model_config["hf_config"]
else:
hf_config = {}
self.draft_model_config = copy.copy(self.target_model_config)
self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
self.draft_model_config.hf_config, **hf_config
)
self.update_arch_()
self.draft_parallel_config = self.target_parallel_config
else:
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
@@ -439,7 +495,10 @@ class SpeculativeConfig:
MTPModelTypes
):
self.method = "mtp"
if self.num_speculative_tokens > 1:
if (
not self.enable_multi_layers_mtp
and self.num_speculative_tokens > 1
):
logger.warning(
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
@@ -478,23 +537,8 @@ class SpeculativeConfig:
method=self.method,
model_type="eagle",
)
# EAGLEConfig primarily updates architectures, so update
# all architectures-related fields in draft_model_config
self.draft_model_config.hf_config = eagle_config
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = (
self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
self.update_arch_()
if self.num_speculative_tokens is not None and hasattr(
self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
if self.num_speculative_tokens is None:
# Default to max value defined in draft model config.
self.num_speculative_tokens = n_predict
elif (
self.method == "mtp"
and self.enable_multi_layers_mtp
and self.num_speculative_tokens > n_predict
):
logger.warning_once(
"For multi_layer_eagle, num_speculative_tokens "
"is greater than the layer_num, adjusting to "
"layer_num"
)
self.num_speculative_tokens = n_predict
elif (
self.num_speculative_tokens > n_predict
and self.num_speculative_tokens % n_predict != 0
@@ -555,9 +610,17 @@ class SpeculativeConfig:
)
)
self.draft_pipeline_parallel_size = (
SpeculativeConfig._verify_and_get_draft_pp(
self.target_parallel_config,
self.draft_pipeline_parallel_size,
)
)
self.draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
self.target_parallel_config, self.draft_tensor_parallel_size
self.target_parallel_config,
self.draft_tensor_parallel_size,
self.draft_pipeline_parallel_size,
)
)
return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
)
return speculative_draft_tensor_parallel_size
@staticmethod
def _verify_and_get_draft_pp(
target_parallel_config: ParallelConfig,
speculative_draft_pipeline_parallel_size: int | None,
) -> int:
"""
Verifies and adjusts the pipeline parallel size for a draft model
specified using speculative_draft_pipeline_parallel_size.
"""
if speculative_draft_pipeline_parallel_size is None:
return target_parallel_config.pipeline_parallel_size
if speculative_draft_pipeline_parallel_size not in (
1,
target_parallel_config.pipeline_parallel_size,
):
raise ValueError(
f"{speculative_draft_pipeline_parallel_size=} cannot be "
"other value than 1 or target model "
f"pipeline_parallel_size="
f"{target_parallel_config.pipeline_parallel_size}"
)
return speculative_draft_pipeline_parallel_size
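
Following the check above, with a target pipeline_parallel_size of 4:

# draft_pipeline_parallel_size = None -> 4 (inherit the target PP size)
# draft_pipeline_parallel_size = 1    -> 1 (drafter runs only on the last PP stage)
# draft_pipeline_parallel_size = 2    -> ValueError (only 1 or the target size is allowed)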
def update_arch_(self):
"""
EAGLEConfig and ExtractHiddenStatesConfig update `architectures`, so refresh all
architecture-related fields in self.draft_model_config.
"""
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
speculative_draft_pipeline_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
This is mostly a copy of the target parallel config, except the tp/pp
sizes used by the draft model.
"""
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
@@ -699,6 +806,12 @@ class SpeculativeConfig:
"'tensor_parallel_size' is not a valid argument in the "
"speculative_config. Please pass 'draft_tensor_parallel_size' instead."
)
if self.pipeline_parallel_size is not None:
raise ValueError(
"'pipeline_parallel_size' is not a valid argument in the "
"speculative_config. Please pass "
"'draft_pipeline_parallel_size' instead."
)
if self.num_speculative_tokens is None:
raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
self.draft_parallel_config
)
eagle3_target_supported = [
aux_hidden_states_supported = [
"llama",
"qwen",
"minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
"nemotron_h",
]
if (
self.method == "eagle3"
self.method in ("eagle3", "extract_hidden_states")
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
for supported_model in aux_hidden_states_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
f"{self.method} is only supported for {aux_hidden_states_supported}"
f" models. Got {self.target_model_config.hf_text_config.model_type=}"
)
self.verify_equal_vocab_size_if_draft_model()
return self
@@ -782,8 +895,65 @@ class SpeculativeConfig:
def uses_draft_model(self) -> bool:
return self.method == "draft_model"
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"
def needs_partial_pp_draft_remap(
self, target_parallel_config: ParallelConfig
) -> bool:
"""Whether draft PP is smaller than target PP and needs rank remap."""
if self.draft_parallel_config is None:
return False
return (
target_parallel_config.pipeline_parallel_size
> self.draft_parallel_config.pipeline_parallel_size
)
def resolve_partial_pp_draft_rank(
self, target_parallel_config: ParallelConfig
) -> int:
"""Map a target rank to the local draft rank for partial-PP drafting.
Currently this only supports running the draft model with `draft_pp=1`
on the last target PP stage.
"""
if not self.needs_partial_pp_draft_remap(target_parallel_config):
return target_parallel_config.rank
assert self.draft_parallel_config is not None
draft_pp = self.draft_parallel_config.pipeline_parallel_size
if draft_pp != 1:
raise ValueError(
"Partial pp drafter rank remapping only supports "
"draft_pipeline_parallel_size=1 when target PP is larger."
)
target_tp = target_parallel_config.tensor_parallel_size
draft_tp = self.draft_parallel_config.tensor_parallel_size
if draft_tp != target_tp:
raise ValueError(
"Partial pp drafter rank remapping requires "
"draft_tensor_parallel_size to equal target tensor_parallel_size. "
f"Got draft_tp={draft_tp}, target_tp={target_tp}."
)
target_pp = target_parallel_config.pipeline_parallel_size
target_rank = target_parallel_config.rank
target_pp_rank = target_rank // target_tp
target_tp_rank = target_rank % target_tp
if target_pp_rank != target_pp - 1:
raise ValueError(
"Partial pp drafter should only run on the last "
f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
)
return target_tp_rank
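
A worked example of the remapping, with hypothetical target_pp=4, target_tp=2, draft_pp=1, draft_tp=2:

# target rank 7: pp_rank = 7 // 2 = 3 (last stage), tp_rank = 7 % 2 = 1 -> draft rank 1
# target rank 6: pp_rank = 3, tp_rank = 0                               -> draft rank 0
# target rank 2: pp_rank = 1, not the last stage                        -> ValueError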
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
model = (
None
if method in ("ngram", "suffix", "extract_hidden_states")
else self.draft_model_config.model
)
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

View File

@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
# tp-dp combination broken:
# https://github.com/vllm-project/vllm/issues/34458
and cfg.parallel_config.data_parallel_size == 1
# tp-pp combination broken:
# https://github.com/vllm-project/vllm/issues/35426
and cfg.parallel_config.pipeline_parallel_size == 1
)
@@ -857,7 +860,7 @@ class VllmConfig:
self.compilation_config.pass_config.fuse_gemm_comms = False
else:
# Compute SP threshold early; disable if None (model too
# small) before +rms_norm gets forced into custom_ops.
# small for SP to be beneficial).
pass_config = self.compilation_config.pass_config
if pass_config.sp_min_token_num is None:
from vllm.compilation.passes.fusion.sequence_parallelism import (
@@ -880,15 +883,13 @@ class VllmConfig:
self.compilation_config.pass_config.enable_sp = False
self.compilation_config.pass_config.fuse_gemm_comms = False
if self.compilation_config.pass_config.enable_sp:
if "-rms_norm" in self.compilation_config.custom_ops:
logger.warning(
"RMS norm force disabled, sequence parallelism might break"
)
else:
self.compilation_config.custom_ops.append("+rms_norm")
from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
if self.compilation_config.fast_moe_cold_start is None:
if HAS_OPAQUE_TYPE:
# On torch >= 2.11 the hoisted OpaqueObject approach supersedes
# fast_moe_cold_start, so force it off.
self.compilation_config.fast_moe_cold_start = False
elif self.compilation_config.fast_moe_cold_start is None:
# resolve default behavior: try to be as safe as possible
# this config is unsafe if any spec decoding draft model has a MOE.
# We'll conservatively turn it off if we see spec decoding.
@@ -907,9 +908,9 @@ class VllmConfig:
):
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
"Overriding cudagraph_mode to NONE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif (
model_config.is_encoder_decoder
and self.compilation_config.cudagraph_mode
@@ -924,6 +925,33 @@ class VllmConfig:
CUDAGraphMode.FULL_DECODE_ONLY
)
# Check if KV connector requires PIECEWISE mode for CUDA graphs
if (
self.kv_transfer_config is not None
and self.kv_transfer_config.is_kv_transfer_instance
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
):
# Lazy import to avoid circular dependencies
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory,
)
connector_cls = KVConnectorFactory.get_connector_class(
self.kv_transfer_config
)
if connector_cls.requires_piecewise_for_cudagraph(
self.kv_transfer_config.kv_connector_extra_config
):
logger.warning_once(
"KV connector %s requires PIECEWISE CUDA graph mode "
"due to layerwise async operations that cannot be "
"captured in CUDA graphs. "
"Overriding cudagraph_mode from %s to PIECEWISE.",
connector_cls.__name__,
self.compilation_config.cudagraph_mode.name,
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforcing eager execution
if self.model_config is not None and self.model_config.enforce_eager:
logger.info("Cudagraph is disabled under eager mode")
@@ -1113,6 +1141,20 @@ class VllmConfig:
if not self.instance_id:
self.instance_id = random_uuid()[:5]
def is_ixserver_connector(kv_transfer_config) -> bool:
if kv_transfer_config is not None and hasattr(
kv_transfer_config, "kv_connector"
):
connector = kv_transfer_config.kv_connector
if isinstance(connector, str):
connector_name = connector
else:
connector_name = getattr(
type(connector), "__name__", str(connector)
)
return "IxServer" in connector_name
return False
# Hybrid KV cache manager (HMA) runtime rules:
# - Explicit enable (--no-disable-kv-cache-manager): error if runtime
@@ -1154,21 +1196,29 @@ class VllmConfig:
if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to disable HMA, but only if the user didn't express a preference.
if self.kv_transfer_config is not None:
if is_ixserver_connector(self.kv_transfer_config):
pass
# NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
need_disable_hybrid_kv_cache_manager = True
logger.warning(
"Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the "
"performance of vLLM on LLMs with sliding window attention "
"or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
else:
need_disable_hybrid_kv_cache_manager = True
logger.warning(
"Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the "
"performance of vLLM on LLMs with sliding window attention "
"or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
)
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
else:
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
elif (
self.scheduler_config.disable_hybrid_kv_cache_manager is False
and need_disable_hybrid_kv_cache_manager
@@ -1466,22 +1516,22 @@ class VllmConfig:
if compile_range_end is not None:
computed_compile_ranges_split_points.append(compile_range_end)
# # Add the compile ranges for flashinfer
# if compilation_config.pass_config.fuse_allreduce_rms:
# tp_size = self.parallel_config.tensor_parallel_size
# max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
# if max_size is not None:
# max_token_num = max_size // (
# self.model_config.get_hidden_size()
# * self.model_config.dtype.itemsize
# )
# if compile_range_end is not None and max_token_num < compile_range_end:
# computed_compile_ranges_split_points.append(max_token_num)
# else:
# logger.debug(
# "Max num batched tokens below allreduce-rms fusion threshold, "
# "allreduce-rms fusion will be enabled for all num_tokens."
# )
# Add the compile ranges for flashinfer
if compilation_config.pass_config.fuse_allreduce_rms:
tp_size = self.parallel_config.tensor_parallel_size
max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
if max_size is not None:
max_token_num = max_size // (
self.model_config.get_hidden_size()
* self.model_config.dtype.itemsize
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_split_points.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below allreduce-rms fusion threshold, "
"allreduce-rms fusion will be enabled for all num_tokens."
)
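
A worked example of the re-enabled threshold computation (all numbers hypothetical):

# hidden_size = 4096, dtype = bfloat16 (itemsize 2), flashinfer max_size = 8 MiB for this TP size
# max_token_num = 8 * 1024 * 1024 // (4096 * 2) = 1024
# -> 1024 is appended as a compile-range split point when it is below compile_range_end;
#    otherwise allreduce-rms fusion stays enabled for all token counts.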
# Add the compile ranges for sequence parallelism
if compilation_config.pass_config.enable_sp:
@@ -1618,6 +1668,7 @@ class VllmConfig:
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa
f"kv_cache_dtype={self.cache_config.cache_dtype}, "

View File

@@ -9,5 +9,5 @@ from vllm.config.utils import config
class WeightTransferConfig:
"""Configuration for weight transfer during RL training."""
backend: Literal["nccl"] = "nccl"
backend: Literal["nccl", "ipc"] = "nccl"
"""The backend to use for weight transfer."""