### What this PR does / why we need it?

1. Clean up v0.10.2 support in the ut and e2e tests.
2. Remove the v0.11.0 periodic job; we're on v0.11.0 now.
3. Remove the useless patch for DeepSeek V3.2; the changes are already in vLLM.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
import ast

import vllm.envs as envs
from vllm.config.speculative import SpeculativeConfig
from vllm.logger import logger


def __post_init__(self):
    # Note: "method" is a new parameter that helps to extend the
    # configuration of non-model-based proposers, and the "model" parameter
    # will be used to set the draft model, eagle head, or additional weight
    # when needed. If users do not specify "method", the speculative method
    # will be detected automatically if possible. If the speculative method
    # can not be detected, it will be considered as the "draft_model" by
    # default.

    if self.model is None and self.num_speculative_tokens is not None:
        # TODO(Shangming): Refactor mtp configuration logic when supporting
        if (self.target_model_config
                and self.target_model_config.hf_text_config.model_type
                in ("deepseek_v3", "deepseek_v32", "mimo", "ernie4_5_moe",
                    "qwen3_next")):
            # use the draft model from the same model:
            self.model = self.target_model_config.model
            # Align the quantization of draft model for cases such as
            # --quantization fp8 with a bf16 checkpoint.
            if not self.quantization:
                self.quantization = self.target_model_config.quantization
        elif self.method in ("ngram", "[ngram]"):
            self.model = "ngram"
        else:
            raise ValueError("num_speculative_tokens was provided but without "
                             "speculative model.")

    # Automatically configure the method for ngram when "model" is used
    # instead of "method"
    if self.method is None and (self.model is not None
                                and self.model in ("ngram", "[ngram]")):
        self.method = "ngram"

    if self.method in ("ngram", "[ngram]"):
        # Unified to "ngram" internally
        self.method = "ngram"
        # Set default values if not provided
        if (self.prompt_lookup_min is None
                and self.prompt_lookup_max is None):
            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
            self.prompt_lookup_min = 5
            self.prompt_lookup_max = 5
        elif self.prompt_lookup_min is None:
            assert self.prompt_lookup_max is not None
            self.prompt_lookup_min = self.prompt_lookup_max
        elif self.prompt_lookup_max is None:
            assert self.prompt_lookup_min is not None
            self.prompt_lookup_max = self.prompt_lookup_min

        # Validate values
        if self.prompt_lookup_min < 1:
            raise ValueError(
                f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
        if self.prompt_lookup_max < 1:
            raise ValueError(
                f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
        if self.prompt_lookup_min > self.prompt_lookup_max:
            raise ValueError(
                f"prompt_lookup_min={self.prompt_lookup_min} must "
                f"be <= prompt_lookup_max={self.prompt_lookup_max}")

        # TODO: current we still need extract vocab_size from target model
        # config, in future, we may try refactor it out, and set
        # draft related config as None here.
        self.draft_model_config = self.target_model_config
        self.draft_parallel_config = self.target_parallel_config
    else:
        self.prompt_lookup_max = 0
        self.prompt_lookup_min = 0

        if self.model is not None:
            # TODO: Move this import to the top once `ModelConfig`
            # lives in `vllm.config.model`.
            from vllm.config import ModelConfig
            self.draft_model_config = ModelConfig(
                model=self.model,
                runner="draft",
                tokenizer=self.target_model_config.tokenizer,
                tokenizer_mode=self.target_model_config.tokenizer_mode,
                trust_remote_code=self.target_model_config.trust_remote_code,
                allowed_local_media_path=self.target_model_config.
                allowed_local_media_path,
                allowed_media_domains=self.target_model_config.
                allowed_media_domains,
                dtype=self.target_model_config.dtype,
                seed=self.target_model_config.seed,
                revision=self.revision,
                code_revision=self.code_revision,
                tokenizer_revision=self.target_model_config.tokenizer_revision,
                spec_target_max_model_len=self.target_model_config.
                max_model_len,
                quantization=self.quantization,
                enforce_eager=self.target_model_config.enforce_eager,
                max_logprobs=self.target_model_config.max_logprobs,
                hf_overrides=SpeculativeConfig.hf_config_override,
            )

            # Automatically detect the method
            if self.method in ('eagle', 'eagle3'):
                pass
            # examples:
            # yuhuili/EAGLE-LLaMA3-Instruct-8B
            # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
            # AngelSlim/Qwen3-8B_eagle3
            elif "eagle-" in self.draft_model_config.model.lower():
                self.method = "eagle"
            elif "eagle3" in self.draft_model_config.model.lower():
                self.method = "eagle3"
            elif self.draft_model_config.hf_config.model_type == "medusa":
                self.method = "medusa"
            elif (self.draft_model_config.hf_config.model_type ==
                  "mlp_speculator"):
                self.method = "mlp_speculator"
            elif (self.draft_model_config.hf_config.model_type
                  in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")):
                self.method = "deepseek_mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
                        "All Deepseek MTP models only have " \
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            elif (self.draft_model_config.hf_config.model_type == "ernie_mtp"):
                self.method = "ernie_mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
                        "All Ernie MTP models only have " \
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            elif (self.draft_model_config.hf_config.model_type ==
                  "qwen3_next_mtp"):
                self.method = "qwen3_next_mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
                        "All Qwen3Next MTP models only have " \
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            elif (self.draft_model_config.hf_config.model_type
                  in ("longcat_flash_mtp")):
                self.method = "longcat_flash_mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
                        "LongCat MTP models only have " \
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            else:
                self.method = "draft_model"
                raise NotImplementedError(
                    "Speculative decoding with draft model is not "
                    "supported yet. Please consider using other "
                    "speculative decoding methods such as ngram, medusa, "
                    "eagle, or deepseek_mtp.")

            # Replace hf_config for EAGLE draft_model
            if self.method in ("eagle", "eagle3"):
                if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
                    raise ValueError(
                        "Chunked prefill and EAGLE are not compatible "
                        "when using V0.")

                from vllm.transformers_utils.configs import SpeculatorsConfig
                from vllm.transformers_utils.configs.eagle import EAGLEConfig

                if isinstance(self.draft_model_config.hf_config,
                              (EAGLEConfig, SpeculatorsConfig)):
                    pass
                else:
                    eagle_config = EAGLEConfig(
                        self.draft_model_config.hf_config,
                        method=self.method,
                        model_type="eagle")
                    self.draft_model_config.hf_config = eagle_config

            if (self.num_speculative_tokens is not None
                    and hasattr(self.draft_model_config.hf_config,
                                "num_lookahead_tokens")):
                self.draft_model_config.hf_config.num_lookahead_tokens = \
                    self.num_speculative_tokens

            n_predict = getattr(self.draft_model_config.hf_config,
                                "n_predict", None)
            if n_predict is not None:
                if self.num_speculative_tokens is None:
                    # Default to max value defined in draft model config.
                    self.num_speculative_tokens = n_predict
                elif self.num_speculative_tokens > n_predict and \
                        self.num_speculative_tokens % n_predict != 0:
                    # Ensure divisibility for MTP module reuse.
                    raise ValueError(
                        f"num_speculative_tokens:{self.num_speculative_tokens}"
                        f" must be divisible by {n_predict=}")

            if self.speculative_token_tree is None:
                # Generate chain of tokens.
                self.speculative_token_tree = str([
                    (i + 1) * (0, )
                    for i in range(self.num_speculative_tokens)
                ])
            else:
                # Sort the token tree breadth-first.
                tree_choices = ast.literal_eval(self.speculative_token_tree)
                self.speculative_token_tree = str(
                    sorted(tree_choices, key=lambda t: (len(t), t)))

            self.draft_tensor_parallel_size = \
                SpeculativeConfig._verify_and_get_draft_tp(
                    self.target_parallel_config,
                    self.draft_tensor_parallel_size,
                    self.draft_model_config.hf_config
                )

            self.draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    self.max_model_len,
                    self.draft_model_config.max_model_len,
                    self.target_model_config.max_model_len,
                ))

            self.draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
                    self.target_parallel_config,
                    self.draft_tensor_parallel_size))


SpeculativeConfig.__post_init__ = __post_init__
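For reference, a minimal sketch of how the patched `__post_init__` is exercised once this module has been imported. The model name and token count below are placeholder assumptions for illustration, not values this patch prescribes; the `speculative_config` dict simply mirrors the fields the function above handles (`method`, `num_speculative_tokens`, `prompt_lookup_min`/`prompt_lookup_max`).

```python
# Minimal usage sketch, assuming this patch module has already been imported
# so that SpeculativeConfig.__post_init__ is the function defined above.
# "Qwen/Qwen2.5-1.5B-Instruct" is only a placeholder target model.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder target model
    speculative_config={
        "method": "ngram",               # takes the ngram branch above
        "num_speculative_tokens": 4,
        # prompt_lookup_min / prompt_lookup_max are intentionally omitted:
        # the patched __post_init__ defaults both to 5.
    },
)
outputs = llm.generate("The quick brown fox")
```

Because the patch only rebinds `SpeculativeConfig.__post_init__` at import time, it takes effect only if this module is imported before any `SpeculativeConfig` instance is created.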