Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import ast
+import copy
 from typing import TYPE_CHECKING, Any, Literal, get_args
 
 from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
     "pangu_ultra_moe_mtp",
     "step3p5_mtp",
 ]
-EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
+EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
 SpeculativeMethod = Literal[
     "ngram",
     "medusa",
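
Note on the type change above: Python flattens nested Literal types, so adding "extract_hidden_states" to EagleModelTypes makes it visible anywhere the union's members are enumerated via get_args. A minimal sketch, with MTPModelTypes abbreviated to the two members visible in this diff:

from typing import Literal, get_args

MTPModelTypes = Literal["pangu_ultra_moe_mtp", "step3p5_mtp"]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]

# typing flattens nested Literals, so get_args() yields every allowed string:
print(get_args(EagleModelTypes))
# ('eagle', 'eagle3', 'extract_hidden_states', 'pangu_ultra_moe_mtp', 'step3p5_mtp')
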
@@ -77,12 +78,24 @@ class SpeculativeConfig:
 
     If using `ngram` method, the related configuration `prompt_lookup_max` and
     `prompt_lookup_min` should be considered."""
+    enable_multi_layers_mtp: bool = False
+    """If set to True, the MTP method will run multiple layers of MTP
+    speculator. If set to False, it will run only one layer of MTP speculator.
+    This is only effective when the method is set to `mtp`."""
     draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
     """The degree of the tensor parallelism for the draft model. Can only be 1
     or the same as the target model's tensor parallel size."""
+    draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
+    """The degree of pipeline parallelism for the draft model.
+
+    Defaults to the target model's pipeline parallel size. Set this to 1 to
+    run the drafter locally on the last target PP stage."""
     tensor_parallel_size: int | None = None
     """Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
     warn users when they mistakenly provide the wrong argument."""
+    pipeline_parallel_size: int | None = None
+    """Users should pass "draft_pipeline_parallel_size". This parameter's
+    purpose is to warn users when they mistakenly provide the wrong argument."""
 
     # Draft model configuration
     quantization: me_quant.QuantizationMethods | None = None
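
The tensor_parallel_size / pipeline_parallel_size fields above exist only to catch a common misconfiguration. A minimal standalone sketch of that guard pattern, using plain pydantic (class and field names here are illustrative, not vLLM's):

from pydantic import BaseModel, model_validator

class DraftParallelArgs(BaseModel):
    draft_tensor_parallel_size: int | None = None
    tensor_parallel_size: int | None = None  # trap for the wrong argument name

    @model_validator(mode="after")
    def _reject_wrong_name(self):
        if self.tensor_parallel_size is not None:
            raise ValueError(
                "'tensor_parallel_size' is not a valid argument; "
                "pass 'draft_tensor_parallel_size' instead."
            )
        return self

# DraftParallelArgs(draft_tensor_parallel_size=2) is accepted;
# DraftParallelArgs(tensor_parallel_size=2) fails validation with the hint above.
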
@@ -181,9 +194,22 @@ class SpeculativeConfig:
         the final hidden states.
         """
         factors: list[Any] = []
-        # Eagle3 affects the computation graph because it returns intermediate
-        # hidden states in addition to the final hidden state.
-        factors.append(self.method == "eagle3")
+        # Eagle3 and extract_hidden_states affect the computation graph because
+        # they return intermediate hidden states in addition to the final hidden state.
+        uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
+        factors.append(uses_aux_hidden_states)
+
+        # The specific layers used also affect the computation graph
+        if uses_aux_hidden_states and self.draft_model_config is not None:
+            layer_ids = getattr(
+                self.draft_model_config.hf_config,
+                "eagle_aux_hidden_state_layer_ids",
+                None,
+            )
+            if layer_ids is not None:
+                # Convert to tuple to make it hashable
+                factors.append(tuple(layer_ids))
 
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
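
The hunk above widens the compilation-cache key so that the choice of auxiliary hidden-state layers changes the hash. A standalone sketch of the same factors-to-hash pattern, with hashlib.sha256 standing in for vLLM's safe_hash helper (an assumption):

import hashlib
from typing import Any

def config_hash(method: str, layer_ids: list[int] | None) -> str:
    factors: list[Any] = []
    uses_aux_hidden_states = method in ("eagle3", "extract_hidden_states")
    factors.append(uses_aux_hidden_states)
    if uses_aux_hidden_states and layer_ids is not None:
        factors.append(tuple(layer_ids))  # tuples are hashable and order-stable
    return hashlib.sha256(str(factors).encode()).hexdigest()

# Different aux layer choices produce different cache keys:
assert config_hash("eagle3", [1, 2]) != config_hash("eagle3", [2, 3])
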
@@ -352,6 +378,8 @@ class SpeculativeConfig:
                 self.model = "ngram"
             elif self.method == "suffix":
                 self.model = "suffix"
+            elif self.method == "extract_hidden_states":
+                self.model = "extract_hidden_states"
             else:
                 raise ValueError(
                     "num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
                 self.draft_parallel_config = self.target_parallel_config
             elif self.method == "suffix":
                 self._validate_suffix_decoding()
+            elif self.method == "extract_hidden_states":
+                from vllm.transformers_utils.configs.extract_hidden_states import (
+                    ExtractHiddenStatesConfig,
+                )
+
+                # ExtractHiddenStatesModel is instantiated manually in load_model()
+                # We just need to store the target model config for KV cache shape info
+                self.model = "extract_hidden_states"
+                self.prompt_lookup_max = 0
+                self.prompt_lookup_min = 0
+
+                if hasattr(self.draft_model_config, "hf_config"):
+                    hf_config = self.draft_model_config.hf_config.to_dict()
+                elif (
+                    isinstance(self.draft_model_config, dict)
+                    and "hf_config" in self.draft_model_config
+                ):
+                    hf_config = self.draft_model_config["hf_config"]
+                else:
+                    hf_config = {}
+
+                self.draft_model_config = copy.copy(self.target_model_config)
+                self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
+                    self.draft_model_config.hf_config, **hf_config
+                )
+                self.update_arch_()
+                self.draft_parallel_config = self.target_parallel_config
+
             else:
                 self.prompt_lookup_max = 0
                 self.prompt_lookup_min = 0
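
The extract_hidden_states branch reuses the target model config instead of loading a real drafter: it shallow-copies the target config and swaps in a wrapper hf_config, so KV-cache shape info stays intact while the architecture fields change. A minimal sketch of that copy-and-wrap pattern (WrapperConfig and the namespace objects are illustrative, not vLLM's ExtractHiddenStatesConfig):

import copy
from types import SimpleNamespace

class WrapperConfig:
    """Wraps a target hf_config so the drafter reuses its shape info."""
    def __init__(self, base_hf_config, **overrides):
        self.base = base_hf_config
        self.__dict__.update(overrides)

target_model_config = SimpleNamespace(hf_config=SimpleNamespace(hidden_size=4096))
# Shallow-copy the target config, then rebind only hf_config to the wrapper:
draft_model_config = copy.copy(target_model_config)
draft_model_config.hf_config = WrapperConfig(draft_model_config.hf_config)

assert draft_model_config.hf_config.base.hidden_size == 4096
assert target_model_config.hf_config.hidden_size == 4096  # target untouched
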
@@ -439,7 +495,10 @@ class SpeculativeConfig:
                     MTPModelTypes
                 ):
                     self.method = "mtp"
-                    if self.num_speculative_tokens > 1:
+                    if (
+                        self.enable_multi_layers_mtp is False
+                        and self.num_speculative_tokens > 1
+                    ):
                         logger.warning(
                             "Enabling num_speculative_tokens > 1 will run "
                             "multiple times of forward on same MTP layer"
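
The narrowed warning reflects the two drafting modes: with enable_multi_layers_mtp=False, requesting several speculative tokens reruns the single MTP layer, while enabling it lets each step use its own layer. An illustrative sketch of that distinction (not vLLM code, just the implied semantics):

def mtp_layers_to_run(num_speculative_tokens: int, enable_multi_layers: bool) -> list[int]:
    if enable_multi_layers:
        # one distinct MTP layer per speculative step
        return list(range(num_speculative_tokens))
    # a single MTP layer applied repeatedly
    return [0] * num_speculative_tokens

assert mtp_layers_to_run(3, True) == [0, 1, 2]
assert mtp_layers_to_run(3, False) == [0, 0, 0]
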
@@ -478,23 +537,8 @@ class SpeculativeConfig:
                     method=self.method,
                     model_type="eagle",
                 )
-                # EAGLEConfig primarily updates architectures, so update
-                # all architectures-related fields in draft_model_config
                 self.draft_model_config.hf_config = eagle_config
-                self.draft_model_config.hf_text_config = get_hf_text_config(
-                    self.draft_model_config.hf_config
-                )
-                self.draft_model_config.model_arch_config = (
-                    self.draft_model_config.get_model_arch_config()
-                )
-                model_info, arch = (
-                    self.draft_model_config.registry.inspect_model_cls(
-                        self.draft_model_config.architectures,
-                        self.draft_model_config,
-                    )
-                )
-                self.draft_model_config._model_info = model_info
-                self.draft_model_config._architecture = arch
+                self.update_arch_()
 
                 if self.num_speculative_tokens is not None and hasattr(
                     self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
                 if self.num_speculative_tokens is None:
                     # Default to max value defined in draft model config.
                     self.num_speculative_tokens = n_predict
+                elif (
+                    self.method == "mtp"
+                    and self.enable_multi_layers_mtp
+                    and self.num_speculative_tokens > n_predict
+                ):
+                    logger.warning_once(
+                        "For multi_layer_eagle, num_speculative_tokens "
+                        "is greater than the layer_num, adjusting to "
+                        "layer_num"
+                    )
+                    self.num_speculative_tokens = n_predict
                 elif (
                     self.num_speculative_tokens > n_predict
                     and self.num_speculative_tokens % n_predict != 0
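
The new branch reconciles num_speculative_tokens with the drafter's n_predict: multi-layer MTP clamps to the layer count, while the pre-existing rule still requires a clean multiple otherwise. A standalone sketch of that decision logic (the multi_layer flag folds in the method check for brevity):

def reconcile(num_tokens: int | None, n_predict: int, multi_layer: bool) -> int:
    if num_tokens is None:
        return n_predict                  # default to the drafter's maximum
    if multi_layer and num_tokens > n_predict:
        return n_predict                  # clamp to the number of MTP layers
    if num_tokens > n_predict and num_tokens % n_predict != 0:
        raise ValueError("num_speculative_tokens must be a multiple of n_predict")
    return num_tokens

assert reconcile(None, 2, multi_layer=False) == 2
assert reconcile(5, 2, multi_layer=True) == 2   # clamped
assert reconcile(6, 2, multi_layer=False) == 6  # clean multiple is allowed
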
@@ -555,9 +610,17 @@ class SpeculativeConfig:
                     )
                 )
 
+                self.draft_pipeline_parallel_size = (
+                    SpeculativeConfig._verify_and_get_draft_pp(
+                        self.target_parallel_config,
+                        self.draft_pipeline_parallel_size,
+                    )
+                )
                 self.draft_parallel_config = (
                     SpeculativeConfig.create_draft_parallel_config(
-                        self.target_parallel_config, self.draft_tensor_parallel_size
+                        self.target_parallel_config,
+                        self.draft_tensor_parallel_size,
+                        self.draft_pipeline_parallel_size,
                     )
                 )
         return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
             )
         return speculative_draft_tensor_parallel_size
 
+    @staticmethod
+    def _verify_and_get_draft_pp(
+        target_parallel_config: ParallelConfig,
+        speculative_draft_pipeline_parallel_size: int | None,
+    ) -> int:
+        """
+        Verifies and adjusts the pipeline parallel size for a draft model
+        specified using speculative_draft_pipeline_parallel_size.
+        """
+        if speculative_draft_pipeline_parallel_size is None:
+            return target_parallel_config.pipeline_parallel_size
+
+        if speculative_draft_pipeline_parallel_size not in (
+            1,
+            target_parallel_config.pipeline_parallel_size,
+        ):
+            raise ValueError(
+                f"{speculative_draft_pipeline_parallel_size=} cannot be "
+                "other value than 1 or target model "
+                f"pipeline_parallel_size="
+                f"{target_parallel_config.pipeline_parallel_size}"
+            )
+        return speculative_draft_pipeline_parallel_size
+
+    def update_arch_(self):
+        """
+        EagleConfig and ExtractHiddenStatesConfig update architectures, so update all
+        architectures-related fields in self.draft_model_config
+        """
+        self.draft_model_config.hf_text_config = get_hf_text_config(
+            self.draft_model_config.hf_config
+        )
+        self.draft_model_config.model_arch_config = (
+            self.draft_model_config.get_model_arch_config()
+        )
+        model_info, arch = self.draft_model_config.registry.inspect_model_cls(
+            self.draft_model_config.architectures,
+            self.draft_model_config,
+        )
+        self.draft_model_config._model_info = model_info
+        self.draft_model_config._architecture = arch
 
     @staticmethod
     def create_draft_parallel_config(
         target_parallel_config: ParallelConfig,
         speculative_draft_tensor_parallel_size: int,
+        speculative_draft_pipeline_parallel_size: int,
     ) -> ParallelConfig:
         """Create a parallel config for use by the draft worker.
 
-        This is mostly a copy of the target parallel config, except the tp_size.
+        This is mostly a copy of the target parallel config, except the tp/pp
+        sizes used by the draft model.
         """
         draft_parallel_config = ParallelConfig(
-            pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
+            pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
             tensor_parallel_size=speculative_draft_tensor_parallel_size,
             distributed_executor_backend=target_parallel_config.distributed_executor_backend,
            max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
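
_verify_and_get_draft_pp enforces that the draft PP degree is either inherited from the target, 1, or equal to the target's. The same rule, sketched standalone:

def verify_draft_pp(target_pp: int, draft_pp: int | None) -> int:
    if draft_pp is None:
        return target_pp                  # default: mirror the target
    if draft_pp not in (1, target_pp):
        raise ValueError(
            f"draft_pp={draft_pp} must be 1 or the target's "
            f"pipeline_parallel_size={target_pp}"
        )
    return draft_pp

assert verify_draft_pp(4, None) == 4  # inherit target PP
assert verify_draft_pp(4, 1) == 1     # drafter on the last stage only
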
@@ -699,6 +806,12 @@ class SpeculativeConfig:
                 "'tensor_parallel_size' is not a valid argument in the "
                 "speculative_config. Please pass 'draft_tensor_parallel_size' instead."
             )
+        if self.pipeline_parallel_size is not None:
+            raise ValueError(
+                "'pipeline_parallel_size' is not a valid argument in the "
+                "speculative_config. Please pass "
+                "'draft_pipeline_parallel_size' instead."
+            )
 
         if self.num_speculative_tokens is None:
             raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
             self.draft_parallel_config
         )
 
-        eagle3_target_supported = [
+        aux_hidden_states_supported = [
            "llama",
            "qwen",
            "minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
            "nemotron_h",
         ]
         if (
-            self.method == "eagle3"
+            self.method in ("eagle3", "extract_hidden_states")
             and self.target_model_config
             and not any(
                 supported_model in self.target_model_config.hf_text_config.model_type
-                for supported_model in eagle3_target_supported
+                for supported_model in aux_hidden_states_supported
             )
         ):
             raise ValueError(
-                f"Eagle3 is only supported for {eagle3_target_supported} models. "  # noqa: E501
-                f"Got {self.target_model_config.hf_text_config.model_type=}"
+                f"{self.method} is only supported for {aux_hidden_states_supported}"
+                f" models. Got {self.target_model_config.hf_text_config.model_type=}"
             )
         self.verify_equal_vocab_size_if_draft_model()
         return self
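
Note that the supported-target check above is a substring match on model_type, not an equality test, so e.g. "qwen3_moe" matches "qwen". Sketched standalone with an abbreviated support list:

aux_hidden_states_supported = ["llama", "qwen", "minicpm"]

def target_supported(model_type: str) -> bool:
    return any(name in model_type for name in aux_hidden_states_supported)

assert target_supported("qwen3_moe")    # substring match, not equality
assert not target_supported("gpt_oss")
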
@@ -782,8 +895,65 @@ class SpeculativeConfig:
     def uses_draft_model(self) -> bool:
         return self.method == "draft_model"
 
+    def uses_extract_hidden_states(self) -> bool:
+        return self.method == "extract_hidden_states"
+
+    def needs_partial_pp_draft_remap(
+        self, target_parallel_config: ParallelConfig
+    ) -> bool:
+        """Whether draft PP is smaller than target PP and needs rank remap."""
+        if self.draft_parallel_config is None:
+            return False
+        return (
+            target_parallel_config.pipeline_parallel_size
+            > self.draft_parallel_config.pipeline_parallel_size
+        )
+
+    def resolve_partial_pp_draft_rank(
+        self, target_parallel_config: ParallelConfig
+    ) -> int:
+        """Map a target rank to the local draft rank for partial-PP drafting.
+
+        Currently this only supports running the draft model with `draft_pp=1`
+        on the last target PP stage.
+        """
+        if not self.needs_partial_pp_draft_remap(target_parallel_config):
+            return target_parallel_config.rank
+
+        assert self.draft_parallel_config is not None
+        draft_pp = self.draft_parallel_config.pipeline_parallel_size
+        if draft_pp != 1:
+            raise ValueError(
+                "Partial pp drafter rank remapping only supports "
+                "draft_pipeline_parallel_size=1 when target PP is larger."
+            )
+
+        target_tp = target_parallel_config.tensor_parallel_size
+        draft_tp = self.draft_parallel_config.tensor_parallel_size
+        if draft_tp != target_tp:
+            raise ValueError(
+                "Partial pp drafter rank remapping requires "
+                "draft_tensor_parallel_size to equal target tensor_parallel_size. "
+                f"Got draft_tp={draft_tp}, target_tp={target_tp}."
+            )
+
+        target_pp = target_parallel_config.pipeline_parallel_size
+        target_rank = target_parallel_config.rank
+        target_pp_rank = target_rank // target_tp
+        target_tp_rank = target_rank % target_tp
+        if target_pp_rank != target_pp - 1:
+            raise ValueError(
+                "Partial pp drafter should only run on the last "
+                f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
+            )
+        return target_tp_rank
+
     def __repr__(self) -> str:
         method = self.method
-        model = None if method in ("ngram", "suffix") else self.draft_model_config.model
+        model = (
+            None
+            if method in ("ngram", "suffix", "extract_hidden_states")
+            else self.draft_model_config.model
+        )
         num_spec_tokens = self.num_speculative_tokens
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
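
A worked example of the rank remap implemented above, for target TP=2, PP=2 (world size 4): ranks 2 and 3 form the last pipeline stage and map to draft TP ranks 0 and 1. Standalone sketch of the arithmetic:

def resolve_draft_rank(rank: int, target_tp: int, target_pp: int) -> int:
    target_pp_rank = rank // target_tp  # which pipeline stage this rank is on
    target_tp_rank = rank % target_tp   # position within the TP group
    if target_pp_rank != target_pp - 1:
        raise ValueError("drafter only runs on the last pipeline stage")
    return target_tp_rank               # local rank inside the draft TP group

# Ranks 2 and 3 are the last stage; they become draft ranks 0 and 1:
assert resolve_draft_rank(2, target_tp=2, target_pp=2) == 0
assert resolve_draft_rank(3, target_tp=2, target_pp=2) == 1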