Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -16,8 +16,8 @@ class AttentionConfig:
     backend: AttentionBackendEnum | None = None
     """Attention backend to use. If None, will be selected automatically."""

-    flash_attn_version: Literal[2, 3] | None = None
-    """Force vllm to use a specific flash-attention version (2 or 3).
+    flash_attn_version: Literal[2, 3, 4] | None = None
+    """Force vllm to use a specific flash-attention version (2, 3, or 4).
     Only valid when using the flash-attention backend."""

     use_prefill_decode_attention: bool = False
@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):

    def separate_routine(self) -> bool:
        return isinstance(self.value, tuple)

-    def valid_runtime_modes(self) -> bool:
-        return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
+    def decode_use_graph(self) -> bool:
+        return self.decode_mode() == CUDAGraphMode.FULL
+
+    @classmethod
+    def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
+        return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
+
+    def is_valid_runtime_mode(self) -> bool:
+        return self in CUDAGraphMode.valid_runtime_modes()

    def __str__(self) -> str:
        return self.name
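Review note: the old instance method `valid_runtime_modes` becomes a classmethod returning a frozenset, plus an instance predicate `is_valid_runtime_mode`. A minimal standalone sketch of the migrated call pattern (the enum values here are illustrative, not the real ones):

    import enum

    class CUDAGraphMode(enum.Enum):
        # Illustrative values; the real enum also has tuple-valued mixed modes.
        NONE = 0
        PIECEWISE = 1
        FULL = 2

        @classmethod
        def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
            return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})

        def is_valid_runtime_mode(self) -> bool:
            return self in CUDAGraphMode.valid_runtime_modes()

    # Old call sites of the form `mode.valid_runtime_modes()` become:
    assert CUDAGraphMode.PIECEWISE.is_valid_runtime_mode()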
@@ -385,7 +392,7 @@ class CompilationConfig:
    Please use mode. Currently all levels are mapped to mode.
    """
    # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)
+    mode: CompilationMode = Field(default=CompilationMode.NONE)
    """The compilation approach used for torch.compile-based compilation of the
    model.

@@ -503,7 +510,7 @@ class CompilationConfig:
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)
+    cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
    """
    The mode of the cudagraph:

@@ -1003,6 +1010,7 @@ class CompilationConfig:
            # https://github.com/vllm-project/vllm/issues/33267
            if not self.use_inductor_graph_partition:
                self.splitting_ops.append("vllm::unified_kv_cache_update")
+                self.splitting_ops.append("vllm::unified_mla_kv_cache_update")

        elif len(self.splitting_ops) == 0:
            if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
                "are optimized for prefill and are incompatible with CUDA Graphs. "
                "In order to use CUDA Graphs for decode-optimized workloads, "
                "use --all2all-backend with another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter."
+                "deepep_low_latency or allgather_reducescatter."
            )
            self.cudagraph_mode = CUDAGraphMode.NONE

@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.import_utils import LazyLoader
 from vllm.v1.attention.backends.registry import AttentionBackendEnum

-import os
-
 if TYPE_CHECKING:
     from transformers import PretrainedConfig

@@ -128,6 +126,7 @@ class ModelConfig:
    - "slow" will always use the slow tokenizer.\n
    - "mistral" will always use the tokenizer from `mistral_common`.\n
    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
+    - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
    - Other custom values can be supported via plugins."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
@@ -463,8 +462,6 @@ class ModelConfig:

        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

-        from vllm.platforms import current_platform
-
        if self.override_attention_dtype is not None and not current_platform.is_rocm():
            warnings.warn(
                "override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:

        if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
            raise ValueError("Sleep mode is not supported on current platform.")

-        temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
        hf_config = get_config(
-            temp_hf_config_path or self.hf_config_path or self.model,
+            self.hf_config_path or self.model,
            self.trust_remote_code,
            self.revision,
            self.code_revision,
@@ -622,6 +618,16 @@ class ModelConfig:
        self._try_verify_and_update_model_config()
        self._verify_quantization()
        self._verify_cuda_graph()
+        import os
+        enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH", None)
+        if enforce_cuda_graph is not None and enforce_cuda_graph in ["1", "y", "Y"]:
+            self.enforce_eager = False
+        else:
+            self.enforce_eager = True
+            logger.warning_once(
+                "Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. "
+                "For now, cuda graph is not used and --enforce-eager is enabled; "
+                "we are trying to make cuda graph the default mode.")
        self._verify_bnb_config()

    def get_model_arch_config(
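Review note: this makes eager execution the default unless the user opts in. A standalone sketch of the same gating, reading the same environment variable as the hunk above:

    import os

    # CUDA graphs stay off unless VLLM_ENFORCE_CUDA_GRAPH is set to 1/y/Y;
    # unset (None) or any other value falls back to eager execution.
    enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH")
    enforce_eager = enforce_cuda_graph not in ("1", "y", "Y")
    print(f"enforce_eager={enforce_eager}")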
@@ -886,6 +892,7 @@ class ModelConfig:
            "modelopt",
            "modelopt_fp4",
            "modelopt_mxfp8",
+            "modelopt_mixed",
            "petit_nvfp4",
            # Ensure heavy backends are probed last to avoid unnecessary
            # imports during override detection (e.g., MXFP4 imports Triton)
@@ -942,8 +949,6 @@ class ModelConfig:
                f"Unknown quantization method: {self.quantization}. Must "
                f"be one of {supported_quantization}."
            )
-        from vllm.platforms import current_platform
-
        current_platform.verify_quantization(self.quantization)

        if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
    *,
    is_pooling_model: bool,
 ):
-    from vllm.platforms import current_platform
-
    supported_dtypes = [
        dtype
        for dtype in current_platform.supported_dtypes

@@ -152,7 +152,6 @@ class ParallelConfig:

    - "naive": Naive all2all implementation using broadcasts\n
    - "allgather_reducescatter": All2all based on allgather and reducescatter\n
-    - "pplx": Use pplx kernels\n
    - "deepep_high_throughput": Use deepep high-throughput kernels\n
    - "deepep_low_latency": Use deepep low-latency kernels\n
    - "mori": Use mori kernels\n
@@ -166,6 +165,9 @@ class ParallelConfig:
    disable_custom_all_reduce: bool = False
    """Disable the custom all-reduce kernel and fall back to NCCL."""

+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
    enable_dbo: bool = False
    """Enable dual batch overlap for the model executor."""
    ubatch_size: int = 0
@@ -245,6 +247,34 @@ class ParallelConfig:
    Set to be private as it's not intended to be configured by users.
    """

+    _stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    It is a list of list[int], with each inner list containing a set of 3 ports
+    to be used for setting up the stateless CPU/device/TCPStore groups
+    in StatelessGroupCoordinator. The number of inner lists is equal to
+    the number of DP groups,
+    i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
+    and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
+    """
+
+    _stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size.
+    """
+
+    _stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EPLB groups when enable_elastic_ep is True.
+    Same topology as EP but separate NCCL communicator to avoid deadlocks.
+    """
+
+    _stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_world_group_port_list) == 1.
+    """
+
    decode_context_parallel_size: int = 1
    """Number of decode context parallel groups, because the world size does
    not change by dcp, it simply reuse the GPUs of TP group, and tp_size
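Worked example of the documented invariant, with hypothetical sizes (world_size_across_dp=8, data_parallel_size=2):

    world_size_across_dp = 8
    dp_size = 2
    num_dp_groups = world_size_across_dp // dp_size
    assert num_dp_groups == 4
    # _stateless_dp_group_port_list would then hold 4 inner lists of 3 ports
    # each (CPU group, device group, TCPStore), 12 ports in total.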
@@ -310,6 +340,13 @@ class ParallelConfig:
                f"but found: {self._api_process_rank}"
            )

+        if self.all2all_backend == "pplx":
+            logger.warning(
+                "The 'pplx' all2all backend has been removed. "
+                "Falling back to 'allgather_reducescatter'."
+            )
+            self.all2all_backend = "allgather_reducescatter"
+
        if self.data_parallel_size_local > self.data_parallel_size:
            raise ValueError(
                f"data_parallel_size_local ({self.data_parallel_size_local}) "
@@ -396,7 +433,67 @@ class ParallelConfig:

        return answer

-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def allocate_elastic_ep_ports(self) -> None:
+        """Allocate all ports for elastic EP (stateless groups + DP master).
+
+        Must be called AFTER ray.init() so that ports claimed by Ray's
+        idle worker pool are already in use and won't be returned by
+        get_open_ports_list().
+        """
+        if not self.enable_elastic_ep:
+            return
+        if self._stateless_world_group_port_list:
+            return
+
+        num_world_groups = 1
+        dp_size = self.data_parallel_size
+        ep_size = self.data_parallel_size * self.world_size_across_dp
+        num_dp_groups = max(1, self.world_size_across_dp // dp_size)
+        num_ep_groups = max(1, self.world_size_across_dp // ep_size)
+        num_eplb_groups = num_ep_groups
+        total_stateless_ports = (
+            num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
+        ) * 3
+        num_dp_master_ports = 5
+
+        all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
+
+        self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
+        self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+        all_ports = all_ports[:-num_dp_master_ports]
+
+        self._stateless_world_group_port_list = [
+            all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+        ]
+        start_idx = num_world_groups * 3
+        self._stateless_dp_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+        ]
+        start_idx += num_dp_groups * 3
+        self._stateless_ep_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+        ]
+        start_idx += num_ep_groups * 3
+        self._stateless_eplb_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
+        ]
+
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop()
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop()
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop()
+
+    def get_next_stateless_eplb_group_port(self) -> list[int]:
+        return self._stateless_eplb_group_port_list.pop()
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
        # NOTE: In high-concurrency scenarios multiple processes
        # can pick the same (currently free) port through a race
        # condition when calling `get_open_port()`. When the first
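Review note: every stateless group (world, DP, EP, EPLB) consumes 3 ports (CPU group, device group, TCPStore), so the allocator asks for one flat list and slices it into triplets. A self-contained sketch of the same partitioning, with a stub standing in for get_open_ports_list():

    def partition_ports(num_world: int, num_dp: int, num_ep: int, num_eplb: int):
        # Stub: the real code calls get_open_ports_list() for OS-assigned free ports.
        total = (num_world + num_dp + num_ep + num_eplb) * 3 + 5  # +5 DP master ports
        ports = list(range(20000, 20000 + total))

        dp_master, ports = ports[-5:], ports[:-5]
        groups, idx = [], 0
        for count in (num_world, num_dp, num_ep, num_eplb):
            groups.append([ports[i : i + 3] for i in range(idx, idx + count * 3, 3)])
            idx += count * 3
        world, dp, ep, eplb = groups
        return world, dp, ep, eplb, dp_master

    # e.g. 1 world group, 2 DP groups, 1 EP group, 1 EPLB group:
    world, dp, ep, eplb, master = partition_ports(1, 2, 1, 1)
    assert len(world) == 1 and len(dp) == 2 and all(len(p) == 3 for p in dp)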
@@ -420,7 +517,8 @@ class ParallelConfig:
                self.get_next_dp_init_port(),
                self.data_parallel_rank,
                self.data_parallel_size,
-                backend=current_platform.dist_backend,
+                backend="gloo",
+                return_store=return_store,
            )
        except DistNetworkError as e:
            # We only want to retry when the root cause is EADDRINUSE.
@@ -442,7 +540,6 @@ class ParallelConfig:
    # In this case, ensure the input to the experts is sequence parallel
    # to avoid the excess work.
    #
-    # Not needed for pplx-kernels as it can handle duplicate input tokens.
    @property
    def use_sequence_parallel_moe(self) -> bool:
        return (
@@ -556,6 +653,21 @@ class ParallelConfig:
            logger.info("Using external launcher for distributed inference.")
            self.world_size *= self.data_parallel_size

+        if self.enable_elastic_ep:
+            if not self.enable_eplb:
+                raise ValueError("Elastic EP is only supported with enable_eplb=True.")
+            if self.pipeline_parallel_size > 1:
+                raise ValueError(
+                    "Elastic EP is not supported with pipeline parallelism "
+                    f"(pipeline_parallel_size={self.pipeline_parallel_size})."
+                )
+            if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
+                raise NotImplementedError(
+                    "Elastic EP is not compatible with data_parallel_external_lb "
+                    "or data_parallel_hybrid_lb. Elastic EP relies on a single API "
+                    "server and core client to coordinate scale up/down."
+                )
+
        if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
            # Data parallel was specified in the engine args.
            if self.distributed_executor_backend == "external_launcher":
@@ -568,9 +680,12 @@ class ParallelConfig:
                    "Set data_parallel_rank to %d automatically.",
                    self.data_parallel_rank,
                )
-            if not self._data_parallel_master_port_list:
-                self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+            if not self.enable_elastic_ep:
+                if not self._data_parallel_master_port_list:
+                    self._data_parallel_master_port_list = get_open_ports_list(5)
+                self.data_parallel_master_port = (
+                    self._data_parallel_master_port_list.pop()
+                )

            if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                raise ValueError(
@@ -597,7 +712,7 @@ class ParallelConfig:
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

-        if self.distributed_executor_backend is None and self.world_size > 1:
+        if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.

@@ -659,6 +774,17 @@ class ParallelConfig:
                "backend is mp, uni or external_launcher."
            )

+        if (
+            self.all2all_backend in ("allgather_reducescatter", "naive")
+            and self.eplb_config.use_async
+        ):
+            logger.warning(
+                "Async EPLB causes hangs with the '%s' all2all backend. "
+                "Forcing synchronous EPLB.",
+                self.all2all_backend,
+            )
+            self.eplb_config.use_async = False
+
    @property
    def use_ray(self) -> bool:
        return self.distributed_executor_backend == "ray" or (
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import ast
+import copy
 from typing import TYPE_CHECKING, Any, Literal, get_args

 from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
    "pangu_ultra_moe_mtp",
    "step3p5_mtp",
 ]
-EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
+EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
 SpeculativeMethod = Literal[
    "ngram",
    "medusa",
@@ -77,12 +78,24 @@ class SpeculativeConfig:

    If using `ngram` method, the related configuration `prompt_lookup_max` and
    `prompt_lookup_min` should be considered."""
+    enable_multi_layers_mtp: bool = False
+    """If set to True, the MTP method will run multiple layers of MTP
+    speculator. If set to False, it will run only one layer of MTP speculator.
+    This is only effective when the method is set to `mtp`."""
    draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
    """The degree of the tensor parallelism for the draft model. Can only be 1
    or the same as the target model's tensor parallel size."""
+    draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
+    """The degree of pipeline parallelism for the draft model.
+
+    Defaults to the target model's pipeline parallel size. Set this to 1 to
+    run the drafter locally on the last target PP stage."""
    tensor_parallel_size: int | None = None
    """Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
    warn users when they mistakenly provide the wrong argument."""
+    pipeline_parallel_size: int | None = None
+    """Users should pass "draft_pipeline_parallel_size". This parameter's
+    purpose is to warn users when they mistakenly provide the wrong argument."""

    # Draft model configuration
    quantization: me_quant.QuantizationMethods | None = None
@@ -181,9 +194,22 @@ class SpeculativeConfig:
        the final hidden states.
        """
        factors: list[Any] = []
-        # Eagle3 affects the computation graph because it returns intermediate
-        # hidden states in addition to the final hidden state.
-        factors.append(self.method == "eagle3")
+        # Eagle3 and extract_hidden_states affect the computation graph because
+        # they return intermediate hidden states in addition to the final hidden state.
+        uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
+        factors.append(uses_aux_hidden_states)
+
+        # The specific layers used also affect the computation graph
+        if uses_aux_hidden_states and self.draft_model_config is not None:
+            layer_ids = getattr(
+                self.draft_model_config.hf_config,
+                "eagle_aux_hidden_state_layer_ids",
+                None,
+            )
+            if layer_ids is not None:
+                # Convert to tuple to make it hashable
+                factors.append(tuple(layer_ids))

        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str
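Review note: because the aux-layer ids now enter the factor list, two otherwise-identical configs with different eagle_aux_hidden_state_layer_ids get different compilation hashes. A toy illustration, with hashlib standing in for vllm's safe_hash:

    import hashlib

    def toy_hash(uses_aux: bool, layer_ids: list[int] | None) -> str:
        factors: list = [uses_aux]
        if uses_aux and layer_ids is not None:
            # Tuples, unlike lists, are hashable and print stably
            # (same conversion as in the hunk above).
            factors.append(tuple(layer_ids))
        return hashlib.sha256(str(factors).encode()).hexdigest()

    assert toy_hash(True, [2, 5, 8]) != toy_hash(True, [2, 5, 9])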
@@ -352,6 +378,8 @@ class SpeculativeConfig:
            self.model = "ngram"
        elif self.method == "suffix":
            self.model = "suffix"
+        elif self.method == "extract_hidden_states":
+            self.model = "extract_hidden_states"
        else:
            raise ValueError(
                "num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
            self.draft_parallel_config = self.target_parallel_config
        elif self.method == "suffix":
            self._validate_suffix_decoding()
+        elif self.method == "extract_hidden_states":
+            from vllm.transformers_utils.configs.extract_hidden_states import (
+                ExtractHiddenStatesConfig,
+            )
+
+            # ExtractHiddenStatesModel is instantiated manually in load_model()
+            # We just need to store the target model config for KV cache shape info
+            self.model = "extract_hidden_states"
+            self.prompt_lookup_max = 0
+            self.prompt_lookup_min = 0
+
+            if hasattr(self.draft_model_config, "hf_config"):
+                hf_config = self.draft_model_config.hf_config.to_dict()
+            elif (
+                isinstance(self.draft_model_config, dict)
+                and "hf_config" in self.draft_model_config
+            ):
+                hf_config = self.draft_model_config["hf_config"]
+            else:
+                hf_config = {}
+
+            self.draft_model_config = copy.copy(self.target_model_config)
+            self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
+                self.draft_model_config.hf_config, **hf_config
+            )
+            self.update_arch_()
+            self.draft_parallel_config = self.target_parallel_config
+
        else:
            self.prompt_lookup_max = 0
            self.prompt_lookup_min = 0
@@ -439,7 +495,10 @@ class SpeculativeConfig:
                MTPModelTypes
            ):
                self.method = "mtp"
-                if self.num_speculative_tokens > 1:
+                if (
+                    self.enable_multi_layers_mtp is False
+                    and self.num_speculative_tokens > 1
+                ):
                    logger.warning(
                        "Enabling num_speculative_tokens > 1 will run "
                        "multiple times of forward on same MTP layer"
@@ -478,23 +537,8 @@ class SpeculativeConfig:
                    method=self.method,
                    model_type="eagle",
                )
-                # EAGLEConfig primarily updates architectures, so update
-                # all architectures-related fields in draft_model_config
                self.draft_model_config.hf_config = eagle_config
-                self.draft_model_config.hf_text_config = get_hf_text_config(
-                    self.draft_model_config.hf_config
-                )
-                self.draft_model_config.model_arch_config = (
-                    self.draft_model_config.get_model_arch_config()
-                )
-                model_info, arch = (
-                    self.draft_model_config.registry.inspect_model_cls(
-                        self.draft_model_config.architectures,
-                        self.draft_model_config,
-                    )
-                )
-                self.draft_model_config._model_info = model_info
-                self.draft_model_config._architecture = arch
+                self.update_arch_()

            if self.num_speculative_tokens is not None and hasattr(
                self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
            if self.num_speculative_tokens is None:
                # Default to max value defined in draft model config.
                self.num_speculative_tokens = n_predict
+            elif (
+                self.method == "mtp"
+                and self.enable_multi_layers_mtp
+                and self.num_speculative_tokens > n_predict
+            ):
+                logger.warning_once(
+                    "For multi_layer_eagle, num_speculative_tokens "
+                    "is greater than the layer_num, adjusting to "
+                    "layer_num"
+                )
+                self.num_speculative_tokens = n_predict
            elif (
                self.num_speculative_tokens > n_predict
                and self.num_speculative_tokens % n_predict != 0
@@ -555,9 +610,17 @@ class SpeculativeConfig:
                    )
                )

+            self.draft_pipeline_parallel_size = (
+                SpeculativeConfig._verify_and_get_draft_pp(
+                    self.target_parallel_config,
+                    self.draft_pipeline_parallel_size,
+                )
+            )
            self.draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
-                    self.target_parallel_config, self.draft_tensor_parallel_size
+                    self.target_parallel_config,
+                    self.draft_tensor_parallel_size,
+                    self.draft_pipeline_parallel_size,
                )
            )
        return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
            )
        return speculative_draft_tensor_parallel_size

+    @staticmethod
+    def _verify_and_get_draft_pp(
+        target_parallel_config: ParallelConfig,
+        speculative_draft_pipeline_parallel_size: int | None,
+    ) -> int:
+        """
+        Verifies and adjusts the pipeline parallel size for a draft model
+        specified using speculative_draft_pipeline_parallel_size.
+        """
+        if speculative_draft_pipeline_parallel_size is None:
+            return target_parallel_config.pipeline_parallel_size
+
+        if speculative_draft_pipeline_parallel_size not in (
+            1,
+            target_parallel_config.pipeline_parallel_size,
+        ):
+            raise ValueError(
+                f"{speculative_draft_pipeline_parallel_size=} cannot be "
+                "any value other than 1 or the target model's "
+                f"pipeline_parallel_size="
+                f"{target_parallel_config.pipeline_parallel_size}"
+            )
+        return speculative_draft_pipeline_parallel_size
+
+    def update_arch_(self):
+        """
+        EagleConfig and ExtractHiddenStatesConfig update architectures, so update all
+        architectures-related fields in self.draft_model_config
+        """
+        self.draft_model_config.hf_text_config = get_hf_text_config(
+            self.draft_model_config.hf_config
+        )
+        self.draft_model_config.model_arch_config = (
+            self.draft_model_config.get_model_arch_config()
+        )
+        model_info, arch = self.draft_model_config.registry.inspect_model_cls(
+            self.draft_model_config.architectures,
+            self.draft_model_config,
+        )
+        self.draft_model_config._model_info = model_info
+        self.draft_model_config._architecture = arch
+
    @staticmethod
    def create_draft_parallel_config(
        target_parallel_config: ParallelConfig,
        speculative_draft_tensor_parallel_size: int,
+        speculative_draft_pipeline_parallel_size: int,
    ) -> ParallelConfig:
        """Create a parallel config for use by the draft worker.

-        This is mostly a copy of the target parallel config, except the tp_size.
+        This is mostly a copy of the target parallel config, except the tp/pp
+        sizes used by the draft model.
        """
        draft_parallel_config = ParallelConfig(
-            pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
+            pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
            tensor_parallel_size=speculative_draft_tensor_parallel_size,
            distributed_executor_backend=target_parallel_config.distributed_executor_backend,
            max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
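Review note: _verify_and_get_draft_pp only accepts a draft PP of 1 or the target's PP size, defaulting to the target. A minimal standalone restatement of that rule:

    def verify_draft_pp(target_pp: int, draft_pp: int | None) -> int:
        # Mirrors _verify_and_get_draft_pp: default to the target's PP size;
        # otherwise only 1 or the target size is accepted.
        if draft_pp is None:
            return target_pp
        if draft_pp not in (1, target_pp):
            raise ValueError(f"{draft_pp=} must be 1 or {target_pp=}")
        return draft_pp

    assert verify_draft_pp(4, None) == 4
    assert verify_draft_pp(4, 1) == 1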
@@ -699,6 +806,12 @@ class SpeculativeConfig:
                "'tensor_parallel_size' is not a valid argument in the "
                "speculative_config. Please pass 'draft_tensor_parallel_size' instead."
            )
+        if self.pipeline_parallel_size is not None:
+            raise ValueError(
+                "'pipeline_parallel_size' is not a valid argument in the "
+                "speculative_config. Please pass "
+                "'draft_pipeline_parallel_size' instead."
+            )

        if self.num_speculative_tokens is None:
            raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
            self.draft_parallel_config
        )

-        eagle3_target_supported = [
+        aux_hidden_states_supported = [
            "llama",
            "qwen",
            "minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
            "nemotron_h",
        ]
        if (
-            self.method == "eagle3"
+            self.method in ("eagle3", "extract_hidden_states")
            and self.target_model_config
            and not any(
                supported_model in self.target_model_config.hf_text_config.model_type
-                for supported_model in eagle3_target_supported
+                for supported_model in aux_hidden_states_supported
            )
        ):
            raise ValueError(
-                f"Eagle3 is only supported for {eagle3_target_supported} models. "  # noqa: E501
-                f"Got {self.target_model_config.hf_text_config.model_type=}"
+                f"{self.method} is only supported for {aux_hidden_states_supported}"
+                f" models. Got {self.target_model_config.hf_text_config.model_type=}"
            )
        self.verify_equal_vocab_size_if_draft_model()
        return self
@@ -782,8 +895,65 @@ class SpeculativeConfig:
    def uses_draft_model(self) -> bool:
        return self.method == "draft_model"

+    def uses_extract_hidden_states(self) -> bool:
+        return self.method == "extract_hidden_states"
+
+    def needs_partial_pp_draft_remap(
+        self, target_parallel_config: ParallelConfig
+    ) -> bool:
+        """Whether draft PP is smaller than target PP and needs rank remap."""
+        if self.draft_parallel_config is None:
+            return False
+        return (
+            target_parallel_config.pipeline_parallel_size
+            > self.draft_parallel_config.pipeline_parallel_size
+        )
+
+    def resolve_partial_pp_draft_rank(
+        self, target_parallel_config: ParallelConfig
+    ) -> int:
+        """Map a target rank to the local draft rank for partial-PP drafting.
+
+        Currently this only supports running the draft model with `draft_pp=1`
+        on the last target PP stage.
+        """
+        if not self.needs_partial_pp_draft_remap(target_parallel_config):
+            return target_parallel_config.rank
+
+        assert self.draft_parallel_config is not None
+        draft_pp = self.draft_parallel_config.pipeline_parallel_size
+        if draft_pp != 1:
+            raise ValueError(
+                "Partial pp drafter rank remapping only supports "
+                "draft_pipeline_parallel_size=1 when target PP is larger."
+            )
+
+        target_tp = target_parallel_config.tensor_parallel_size
+        draft_tp = self.draft_parallel_config.tensor_parallel_size
+        if draft_tp != target_tp:
+            raise ValueError(
+                "Partial pp drafter rank remapping requires "
+                "draft_tensor_parallel_size to equal target tensor_parallel_size. "
+                f"Got draft_tp={draft_tp}, target_tp={target_tp}."
+            )
+
+        target_pp = target_parallel_config.pipeline_parallel_size
+        target_rank = target_parallel_config.rank
+        target_pp_rank = target_rank // target_tp
+        target_tp_rank = target_rank % target_tp
+        if target_pp_rank != target_pp - 1:
+            raise ValueError(
+                "Partial pp drafter should only run on the last "
+                f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
+            )
+        return target_tp_rank
+
    def __repr__(self) -> str:
        method = self.method
-        model = None if method in ("ngram", "suffix") else self.draft_model_config.model
+        model = (
+            None
+            if method in ("ngram", "suffix", "extract_hidden_states")
+            else self.draft_model_config.model
+        )
        num_spec_tokens = self.num_speculative_tokens
        return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
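Review note on the rank arithmetic in resolve_partial_pp_draft_rank: with a hypothetical target of PP=4, TP=2 (8 ranks) and draft_pp=1, only the last stage's ranks 6 and 7 run the drafter, keeping their TP rank. A standalone sketch under those assumptions:

    def resolve_draft_rank(target_rank: int, target_pp: int, target_tp: int) -> int:
        # Mirrors the hunk above for draft_pp=1 and draft_tp == target_tp:
        # the drafter runs on the last PP stage only, keeping the TP rank.
        target_pp_rank = target_rank // target_tp
        target_tp_rank = target_rank % target_tp
        if target_pp_rank != target_pp - 1:
            raise ValueError("drafter only runs on the last pipeline stage")
        return target_tp_rank

    assert resolve_draft_rank(6, target_pp=4, target_tp=2) == 0
    assert resolve_draft_rank(7, target_pp=4, target_tp=2) == 1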
@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
        # tp-dp combination broken:
        # https://github.com/vllm-project/vllm/issues/34458
        and cfg.parallel_config.data_parallel_size == 1
+        # tp-pp combination broken:
+        # https://github.com/vllm-project/vllm/issues/35426
+        and cfg.parallel_config.pipeline_parallel_size == 1
    )


@@ -857,7 +860,7 @@ class VllmConfig:
                self.compilation_config.pass_config.fuse_gemm_comms = False
            else:
                # Compute SP threshold early; disable if None (model too
-                # small) before +rms_norm gets forced into custom_ops.
+                # small for SP to be beneficial).
                pass_config = self.compilation_config.pass_config
                if pass_config.sp_min_token_num is None:
                    from vllm.compilation.passes.fusion.sequence_parallelism import (
@@ -880,15 +883,13 @@ class VllmConfig:
                self.compilation_config.pass_config.enable_sp = False
                self.compilation_config.pass_config.fuse_gemm_comms = False

-        if self.compilation_config.pass_config.enable_sp:
-            if "-rms_norm" in self.compilation_config.custom_ops:
-                logger.warning(
-                    "RMS norm force disabled, sequence parallelism might break"
-                )
-            else:
-                self.compilation_config.custom_ops.append("+rms_norm")
+        from vllm.utils.torch_utils import HAS_OPAQUE_TYPE

-        if self.compilation_config.fast_moe_cold_start is None:
+        if HAS_OPAQUE_TYPE:
+            # On torch >= 2.11 the hoisted OpaqueObject approach supersedes
+            # fast_moe_cold_start, so force it off.
+            self.compilation_config.fast_moe_cold_start = False
+        elif self.compilation_config.fast_moe_cold_start is None:
            # resolve default behavior: try to be as safe as possible
            # this config is unsafe if any spec decoding draft model has a MOE.
            # We'll conservatively turn it off if we see spec decoding.
@@ -907,9 +908,9 @@ class VllmConfig:
            ):
                logger.warning_once(
                    "Pooling models do not support full cudagraphs. "
-                    "Overriding cudagraph_mode to PIECEWISE."
+                    "Overriding cudagraph_mode to NONE."
                )
-                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            elif (
                model_config.is_encoder_decoder
                and self.compilation_config.cudagraph_mode
@@ -924,6 +925,33 @@ class VllmConfig:
                    CUDAGraphMode.FULL_DECODE_ONLY
                )

+            # Check if KV connector requires PIECEWISE mode for CUDA graphs
+            if (
+                self.kv_transfer_config is not None
+                and self.kv_transfer_config.is_kv_transfer_instance
+                and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+            ):
+                # Lazy import to avoid circular dependencies
+                from vllm.distributed.kv_transfer.kv_connector.factory import (
+                    KVConnectorFactory,
+                )
+
+                connector_cls = KVConnectorFactory.get_connector_class(
+                    self.kv_transfer_config
+                )
+                if connector_cls.requires_piecewise_for_cudagraph(
+                    self.kv_transfer_config.kv_connector_extra_config
+                ):
+                    logger.warning_once(
+                        "KV connector %s requires PIECEWISE CUDA graph mode "
+                        "due to layerwise async operations that cannot be "
+                        "captured in CUDA graphs. "
+                        "Overriding cudagraph_mode from %s to PIECEWISE.",
+                        connector_cls.__name__,
+                        self.compilation_config.cudagraph_mode.name,
+                    )
+                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
            # disable cudagraph when enforce eager execution
            if self.model_config is not None and self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
@@ -1113,6 +1141,20 @@ class VllmConfig:

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

+        def is_ixserver_connector(kv_transfer_config) -> bool:
+            if kv_transfer_config is not None and hasattr(
+                kv_transfer_config, "kv_connector"
+            ):
+                connector = kv_transfer_config.kv_connector
+                if isinstance(connector, str):
+                    connector_name = connector
+                else:
+                    connector_name = getattr(
+                        type(connector), "__name__", str(connector)
+                    )
+                return "IxServer" in connector_name
+            return False
+
        # Hybrid KV cache manager (HMA) runtime rules:
        # - Explicit enable (--no-disable-kv-cache-manager): error if runtime
@@ -1154,21 +1196,29 @@ class VllmConfig:
        if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
            # Default to disable HMA, but only if the user didn't express a preference.
            if self.kv_transfer_config is not None:
-                # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
-                need_disable_hybrid_kv_cache_manager = True
-                logger.warning(
-                    "Turning off hybrid kv cache manager because "
-                    "`--kv-transfer-config` is set. This will reduce the "
-                    "performance of vLLM on LLMs with sliding window attention "
-                    "or Mamba attention. If you are a developer of kv connector"
-                    ", please consider supporting hybrid kv cache manager for "
-                    "your connector by making sure your connector is a subclass"
-                    " of `SupportsHMA` defined in kv_connector/v1/base.py and"
-                    " use --no-disable-hybrid-kv-cache-manager to start vLLM."
-                )
-            self.scheduler_config.disable_hybrid_kv_cache_manager = (
-                need_disable_hybrid_kv_cache_manager
-            )
+                if is_ixserver_connector(self.kv_transfer_config):
+                    pass
+                else:
+                    # NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
+                    need_disable_hybrid_kv_cache_manager = True
+                    logger.warning(
+                        "Turning off hybrid kv cache manager because "
+                        "`--kv-transfer-config` is set. This will reduce the "
+                        "performance of vLLM on LLMs with sliding window attention "
+                        "or Mamba attention. If you are a developer of kv connector"
+                        ", please consider supporting hybrid kv cache manager for "
+                        "your connector by making sure your connector is a subclass"
+                        " of `SupportsHMA` defined in kv_connector/v1/base.py and"
+                        " use --no-disable-hybrid-kv-cache-manager to start vLLM."
+                    )
+                    self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                        need_disable_hybrid_kv_cache_manager
+                    )
+
+            else:
+                self.scheduler_config.disable_hybrid_kv_cache_manager = (
+                    need_disable_hybrid_kv_cache_manager
+                )
        elif (
            self.scheduler_config.disable_hybrid_kv_cache_manager is False
            and need_disable_hybrid_kv_cache_manager
@@ -1466,22 +1516,22 @@ class VllmConfig:
        if compile_range_end is not None:
            computed_compile_ranges_split_points.append(compile_range_end)

-        # # Add the compile ranges for flashinfer
-        # if compilation_config.pass_config.fuse_allreduce_rms:
-        #     tp_size = self.parallel_config.tensor_parallel_size
-        #     max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
-        #     if max_size is not None:
-        #         max_token_num = max_size // (
-        #             self.model_config.get_hidden_size()
-        #             * self.model_config.dtype.itemsize
-        #         )
-        #         if compile_range_end is not None and max_token_num < compile_range_end:
-        #             computed_compile_ranges_split_points.append(max_token_num)
-        #         else:
-        #             logger.debug(
-        #                 "Max num batched tokens below allreduce-rms fusion threshold, "
-        #                 "allreduce-rms fusion will be enabled for all num_tokens."
-        #             )
+        # Add the compile ranges for flashinfer
+        if compilation_config.pass_config.fuse_allreduce_rms:
+            tp_size = self.parallel_config.tensor_parallel_size
+            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
+            if max_size is not None:
+                max_token_num = max_size // (
+                    self.model_config.get_hidden_size()
+                    * self.model_config.dtype.itemsize
+                )
+                if compile_range_end is not None and max_token_num < compile_range_end:
+                    computed_compile_ranges_split_points.append(max_token_num)
+                else:
+                    logger.debug(
+                        "Max num batched tokens below allreduce-rms fusion threshold, "
+                        "allreduce-rms fusion will be enabled for all num_tokens."
+                    )

        # Add the compile ranges for sequence parallelism
        if compilation_config.pass_config.enable_sp:
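Review note: re-enabling this block turns a byte budget into a token-count split point, max_size // (hidden_size * dtype.itemsize). A worked example with hypothetical numbers (hidden size 4096, bf16, a 64 MiB flashinfer cap):

    hidden_size = 4096
    dtype_itemsize = 2            # bfloat16
    max_size = 64 * 1024 * 1024   # hypothetical flashinfer byte cap for this tp_size

    # One token's hidden state costs hidden_size * itemsize bytes, so:
    max_token_num = max_size // (hidden_size * dtype_itemsize)
    assert max_token_num == 8192  # added as a split point if below compile_range_end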
@@ -1618,6 +1668,7 @@ class VllmConfig:
            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
+            f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, "  # noqa
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "

@@ -9,5 +9,5 @@ from vllm.config.utils import config
 class WeightTransferConfig:
    """Configuration for weight transfer during RL training."""

-    backend: Literal["nccl"] = "nccl"
+    backend: Literal["nccl", "ipc"] = "nccl"
    """The backend to use for weight transfer."""