Upgrade to vLLM 0.17.0 (corex v4.1 overlay)

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import copy
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
"pangu_ultra_moe_mtp",
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
SpeculativeMethod = Literal[
"ngram",
"medusa",
@@ -77,12 +78,24 @@ class SpeculativeConfig:
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
enable_multi_layers_mtp: bool = False
"""If set to True, the MTP method will run multiple layers of MTP
speculator. If set to False, it will run only one layer of MTP speculator.
This is only effective when the method is set to `mtp`."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of pipeline parallelism for the draft model.
Defaults to the target model's pipeline parallel size. Set this to 1 to
run the drafter locally on the last target PP stage."""
tensor_parallel_size: int | None = None
"""Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
warn users when they mistakenly provide the wrong argument."""
pipeline_parallel_size: int | None = None
"""Users should pass "draft_pipeline_parallel_size". This parameter's
purpose is to warn users when they mistakenly provide the wrong argument."""
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
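
The two new draft_* fields above are passed through the speculative_config dict. A minimal usage sketch, assuming the vLLM `LLM` entry point; the model names and token count are illustrative, and only the `draft_tensor_parallel_size` / `draft_pipeline_parallel_size` keys come from this change (the PP key requires this overlay):

```python
from vllm import LLM

# Hypothetical target and draft models; values are illustrative.
# draft_pipeline_parallel_size=1 runs the drafter locally on the last
# target PP stage, per the field docstring above.
llm = LLM(
    model="meta-llama/Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
    pipeline_parallel_size=2,
    speculative_config={
        "method": "eagle3",
        "model": "org/eagle3-draft-for-llama-70b",  # hypothetical draft model
        "num_speculative_tokens": 3,
        "draft_tensor_parallel_size": 1,
        "draft_pipeline_parallel_size": 1,
    },
)
```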
@@ -181,9 +194,22 @@ class SpeculativeConfig:
the final hidden states.
"""
factors: list[Any] = []
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
# Eagle3 and extract_hidden_states affect the computation graph because
# they return intermediate hidden states in addition to the final hidden state.
uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
factors.append(uses_aux_hidden_states)
# The specific layers used also affect the computation graph
if uses_aux_hidden_states and self.draft_model_config is not None:
layer_ids = getattr(
self.draft_model_config.hf_config,
"eagle_aux_hidden_state_layer_ids",
None,
)
if layer_ids is not None:
# Convert to tuple to make it hashable
factors.append(tuple(layer_ids))
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
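
For context, the cache-key construction above boils down to stringifying a list of factors and hashing it, with the layer-id list converted to a tuple as in the change. A standalone sketch, with hashlib standing in for vLLM's safe_hash helper and illustrative values:

```python
import hashlib

factors: list = []
factors.append(True)          # uses_aux_hidden_states
factors.append((2, 15, 28))   # hypothetical eagle_aux_hidden_state_layer_ids, as a tuple
hash_str = hashlib.sha256(str(factors).encode()).hexdigest()
print(hash_str[:16])          # changes whenever the layer ids change
```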
@@ -352,6 +378,8 @@ class SpeculativeConfig:
self.model = "ngram"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
self.model = "extract_hidden_states"
else:
raise ValueError(
"num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
self.draft_parallel_config = self.target_parallel_config
elif self.method == "suffix":
self._validate_suffix_decoding()
elif self.method == "extract_hidden_states":
from vllm.transformers_utils.configs.extract_hidden_states import (
ExtractHiddenStatesConfig,
)
# ExtractHiddenStatesModel is instantiated manually in load_model()
# We just need to store the target model config for KV cache shape info
self.model = "extract_hidden_states"
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
if hasattr(self.draft_model_config, "hf_config"):
hf_config = self.draft_model_config.hf_config.to_dict()
elif (
isinstance(self.draft_model_config, dict)
and "hf_config" in self.draft_model_config
):
hf_config = self.draft_model_config["hf_config"]
else:
hf_config = {}
self.draft_model_config = copy.copy(self.target_model_config)
self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
self.draft_model_config.hf_config, **hf_config
)
self.update_arch_()
self.draft_parallel_config = self.target_parallel_config
else:
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
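
The three-way hf_config fallback in the extract_hidden_states branch above (object with to_dict, plain dict, or nothing) can be read in isolation. A minimal standalone sketch with stand-in classes and illustrative values, not vLLM types:

```python
class _FakeHFConfig:
    """Stand-in for a transformers config object."""
    def to_dict(self):
        return {"eagle_aux_hidden_state_layer_ids": [2, 15, 28]}

class _FakeDraftModelConfig:
    hf_config = _FakeHFConfig()

# Mirrors the hasattr / dict / empty fallback chain above.
for draft in (_FakeDraftModelConfig(), {"hf_config": {"num_hidden_layers": 4}}, None):
    if hasattr(draft, "hf_config"):
        overrides = draft.hf_config.to_dict()
    elif isinstance(draft, dict) and "hf_config" in draft:
        overrides = draft["hf_config"]
    else:
        overrides = {}
    print(overrides)
```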
@@ -439,7 +495,10 @@ class SpeculativeConfig:
MTPModelTypes
):
self.method = "mtp"
if self.num_speculative_tokens > 1:
if (
self.enable_multi_layers_mtp is False
and self.num_speculative_tokens > 1
):
logger.warning(
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
@@ -478,23 +537,8 @@ class SpeculativeConfig:
method=self.method,
model_type="eagle",
)
# EAGLEConfig primarily updates architectures, so update
# all architectures-related fields in draft_model_config
self.draft_model_config.hf_config = eagle_config
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = (
self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
self.update_arch_()
if self.num_speculative_tokens is not None and hasattr(
self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
if self.num_speculative_tokens is None:
# Default to max value defined in draft model config.
self.num_speculative_tokens = n_predict
elif (
self.method == "mtp"
and self.enable_multi_layers_mtp
and self.num_speculative_tokens > n_predict
):
logger.warning_once(
"For multi_layer_eagle, num_speculative_tokens "
"is greater than the layer_num, adjusting to "
"layer_num"
)
self.num_speculative_tokens = n_predict
elif (
self.num_speculative_tokens > n_predict
and self.num_speculative_tokens % n_predict != 0
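
Concretely, with multi-layer MTP the requested speculative length is clamped to the number of MTP layers reported by the draft config (n_predict). A toy illustration with made-up numbers:

```python
num_speculative_tokens, n_predict = 5, 3   # illustrative values
enable_multi_layers_mtp = True
if enable_multi_layers_mtp and num_speculative_tokens > n_predict:
    # Mirrors the warning_once branch above: clamp instead of erroring out.
    num_speculative_tokens = n_predict
assert num_speculative_tokens == 3
```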
@@ -555,9 +610,17 @@ class SpeculativeConfig:
)
)
self.draft_pipeline_parallel_size = (
SpeculativeConfig._verify_and_get_draft_pp(
self.target_parallel_config,
self.draft_pipeline_parallel_size,
)
)
self.draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
self.target_parallel_config, self.draft_tensor_parallel_size
self.target_parallel_config,
self.draft_tensor_parallel_size,
self.draft_pipeline_parallel_size,
)
)
return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
)
return speculative_draft_tensor_parallel_size
@staticmethod
def _verify_and_get_draft_pp(
target_parallel_config: ParallelConfig,
speculative_draft_pipeline_parallel_size: int | None,
) -> int:
"""
Verifies the pipeline parallel size requested for the draft model via
speculative_draft_pipeline_parallel_size, defaulting to the target model's
pipeline parallel size when it is unset.
"""
if speculative_draft_pipeline_parallel_size is None:
return target_parallel_config.pipeline_parallel_size
if speculative_draft_pipeline_parallel_size not in (
1,
target_parallel_config.pipeline_parallel_size,
):
raise ValueError(
f"{speculative_draft_pipeline_parallel_size=} cannot be "
"other value than 1 or target model "
f"pipeline_parallel_size="
f"{target_parallel_config.pipeline_parallel_size}"
)
return speculative_draft_pipeline_parallel_size
def update_arch_(self):
"""
EAGLEConfig and ExtractHiddenStatesConfig primarily update architectures, so
refresh all architecture-related fields in self.draft_model_config.
"""
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
speculative_draft_pipeline_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
This is mostly a copy of the target parallel config, except the tp/pp
sizes used by the draft model.
"""
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
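
The constraint enforced by _verify_and_get_draft_pp above can be restated standalone, with plain ints instead of ParallelConfig; this is purely illustrative:

```python
def check_draft_pp(draft_pp: int | None, target_pp: int) -> int:
    """Mirror of the rule above: unset inherits the target PP;
    otherwise only 1 or the target PP itself is accepted."""
    if draft_pp is None:
        return target_pp
    if draft_pp not in (1, target_pp):
        raise ValueError(f"draft PP must be 1 or {target_pp}, got {draft_pp}")
    return draft_pp

assert check_draft_pp(None, 4) == 4   # default: match the target
assert check_draft_pp(1, 4) == 1      # drafter only on the last stage
```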
@@ -699,6 +806,12 @@ class SpeculativeConfig:
"'tensor_parallel_size' is not a valid argument in the "
"speculative_config. Please pass 'draft_tensor_parallel_size' instead."
)
if self.pipeline_parallel_size is not None:
raise ValueError(
"'pipeline_parallel_size' is not a valid argument in the "
"speculative_config. Please pass "
"'draft_pipeline_parallel_size' instead."
)
if self.num_speculative_tokens is None:
raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
self.draft_parallel_config
)
eagle3_target_supported = [
aux_hidden_states_supported = [
"llama",
"qwen",
"minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
"nemotron_h",
]
if (
self.method == "eagle3"
self.method in ("eagle3", "extract_hidden_states")
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
for supported_model in aux_hidden_states_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
f"{self.method} is only supported for {aux_hidden_states_supported}"
f" models. Got {self.target_model_config.hf_text_config.model_type=}"
)
self.verify_equal_vocab_size_if_draft_model()
return self
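
Note the support check above is substring-based on hf_text_config.model_type, so model-type variants match their base family. A quick illustration with a hypothetical target model_type:

```python
# Subset of the supported-family list above.
aux_hidden_states_supported = ["llama", "qwen", "minicpm", "nemotron_h"]
model_type = "qwen2_moe"   # hypothetical target model_type
assert any(supported in model_type for supported in aux_hidden_states_supported)
```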
@@ -782,8 +895,65 @@ class SpeculativeConfig:
def uses_draft_model(self) -> bool:
return self.method == "draft_model"
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"
def needs_partial_pp_draft_remap(
self, target_parallel_config: ParallelConfig
) -> bool:
"""Whether draft PP is smaller than target PP and needs rank remap."""
if self.draft_parallel_config is None:
return False
return (
target_parallel_config.pipeline_parallel_size
> self.draft_parallel_config.pipeline_parallel_size
)
def resolve_partial_pp_draft_rank(
self, target_parallel_config: ParallelConfig
) -> int:
"""Map a target rank to the local draft rank for partial-PP drafting.
Currently this only supports running the draft model with `draft_pp=1`
on the last target PP stage.
"""
if not self.needs_partial_pp_draft_remap(target_parallel_config):
return target_parallel_config.rank
assert self.draft_parallel_config is not None
draft_pp = self.draft_parallel_config.pipeline_parallel_size
if draft_pp != 1:
raise ValueError(
"Partial pp drafter rank remapping only supports "
"draft_pipeline_parallel_size=1 when target PP is larger."
)
target_tp = target_parallel_config.tensor_parallel_size
draft_tp = self.draft_parallel_config.tensor_parallel_size
if draft_tp != target_tp:
raise ValueError(
"Partial pp drafter rank remapping requires "
"draft_tensor_parallel_size to equal target tensor_parallel_size. "
f"Got draft_tp={draft_tp}, target_tp={target_tp}."
)
target_pp = target_parallel_config.pipeline_parallel_size
target_rank = target_parallel_config.rank
target_pp_rank = target_rank // target_tp
target_tp_rank = target_rank % target_tp
if target_pp_rank != target_pp - 1:
raise ValueError(
"Partial pp drafter should only run on the last "
f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
)
return target_tp_rank
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
model = (
None
if method in ("ngram", "suffix", "extract_hidden_states")
else self.draft_model_config.model
)
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"