Upgrade to vLLM 0.17.0 (corex v4.1 overlay)

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import copy
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
"pangu_ultra_moe_mtp",
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
SpeculativeMethod = Literal[
"ngram",
"medusa",
@@ -77,12 +78,24 @@ class SpeculativeConfig:
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
enable_multi_layers_mtp: bool = False
"""If set to True, the MTP method will run multiple layers of MTP
speculator. If set to False, it will run only one layer of MTP speculator.
This is only effective when the method is set to `mtp`."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of pipeline parallelism for the draft model.
Defaults to the target model's pipeline parallel size. Set this to 1 to
run the drafter locally on the last target PP stage."""
tensor_parallel_size: int | None = None
"""Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
warn users when they mistakenly provide the wrong argument."""
pipeline_parallel_size: int | None = None
"""Users should pass "draft_pipeline_parallel_size". This parameter's
purpose is to warn users when they mistakenly provide the wrong argument."""
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
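
The two new draft_* fields above are passed through the speculative_config dict. A minimal usage sketch, assuming the vLLM `LLM` entry point; the model names and token count are illustrative, and only the `draft_tensor_parallel_size` / `draft_pipeline_parallel_size` keys come from this change (the PP key requires this overlay):

```python
from vllm import LLM

# Hypothetical target and draft models; values are illustrative.
# draft_pipeline_parallel_size=1 runs the drafter locally on the last
# target PP stage, per the field docstring above.
llm = LLM(
    model="meta-llama/Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
    pipeline_parallel_size=2,
    speculative_config={
        "method": "eagle3",
        "model": "org/eagle3-draft-for-llama-70b",  # hypothetical draft model
        "num_speculative_tokens": 3,
        "draft_tensor_parallel_size": 1,
        "draft_pipeline_parallel_size": 1,
    },
)
```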
@@ -181,9 +194,22 @@ class SpeculativeConfig:
the final hidden states.
"""
factors: list[Any] = []
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
# Eagle3 and extract_hidden_states affect the computation graph because
# they return intermediate hidden states in addition to the final hidden state.
uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
factors.append(uses_aux_hidden_states)
# The specific layers used also affect the computation graph
if uses_aux_hidden_states and self.draft_model_config is not None:
layer_ids = getattr(
self.draft_model_config.hf_config,
"eagle_aux_hidden_state_layer_ids",
None,
)
if layer_ids is not None:
# Convert to tuple to make it hashable
factors.append(tuple(layer_ids))
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
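
For context, the cache-key construction above boils down to stringifying a list of factors and hashing it, with the layer-id list converted to a tuple as in the change. A standalone sketch, with hashlib standing in for vLLM's safe_hash helper and illustrative values:

```python
import hashlib

factors: list = []
factors.append(True)          # uses_aux_hidden_states
factors.append((2, 15, 28))   # hypothetical eagle_aux_hidden_state_layer_ids, as a tuple
hash_str = hashlib.sha256(str(factors).encode()).hexdigest()
print(hash_str[:16])          # changes whenever the layer ids change
```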
@@ -352,6 +378,8 @@ class SpeculativeConfig:
self.model = "ngram"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
self.model = "extract_hidden_states"
else:
raise ValueError(
"num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
self.draft_parallel_config = self.target_parallel_config
elif self.method == "suffix":
self._validate_suffix_decoding()
elif self.method == "extract_hidden_states":
from vllm.transformers_utils.configs.extract_hidden_states import (
ExtractHiddenStatesConfig,
)
# ExtractHiddenStatesModel is instantiated manually in load_model()
# We just need to store the target model config for KV cache shape info
self.model = "extract_hidden_states"
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
if hasattr(self.draft_model_config, "hf_config"):
hf_config = self.draft_model_config.hf_config.to_dict()
elif (
isinstance(self.draft_model_config, dict)
and "hf_config" in self.draft_model_config
):
hf_config = self.draft_model_config["hf_config"]
else:
hf_config = {}
self.draft_model_config = copy.copy(self.target_model_config)
self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
self.draft_model_config.hf_config, **hf_config
)
self.update_arch_()
self.draft_parallel_config = self.target_parallel_config
else:
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
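
The three-way hf_config fallback in the extract_hidden_states branch above (object with to_dict, plain dict, or nothing) can be read in isolation. A minimal standalone sketch with stand-in classes and illustrative values, not vLLM types:

```python
class _FakeHFConfig:
    """Stand-in for a transformers config object."""
    def to_dict(self):
        return {"eagle_aux_hidden_state_layer_ids": [2, 15, 28]}

class _FakeDraftModelConfig:
    hf_config = _FakeHFConfig()

# Mirrors the hasattr / dict / empty fallback chain above.
for draft in (_FakeDraftModelConfig(), {"hf_config": {"num_hidden_layers": 4}}, None):
    if hasattr(draft, "hf_config"):
        overrides = draft.hf_config.to_dict()
    elif isinstance(draft, dict) and "hf_config" in draft:
        overrides = draft["hf_config"]
    else:
        overrides = {}
    print(overrides)
```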
@@ -439,7 +495,10 @@ class SpeculativeConfig:
MTPModelTypes
):
self.method = "mtp"
if self.num_speculative_tokens > 1:
if (
self.enable_multi_layers_mtp is False
and self.num_speculative_tokens > 1
):
logger.warning(
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
@@ -478,23 +537,8 @@ class SpeculativeConfig:
method=self.method,
model_type="eagle",
)
# EAGLEConfig primarily updates architectures, so update
# all architectures-related fields in draft_model_config
self.draft_model_config.hf_config = eagle_config
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = (
self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
self.update_arch_()
if self.num_speculative_tokens is not None and hasattr(
self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
if self.num_speculative_tokens is None:
# Default to max value defined in draft model config.
self.num_speculative_tokens = n_predict
elif (
self.method == "mtp"
and self.enable_multi_layers_mtp
and self.num_speculative_tokens > n_predict
):
logger.warning_once(
"For multi_layer_eagle, num_speculative_tokens "
"is greater than the layer_num, adjusting to "
"layer_num"
)
self.num_speculative_tokens = n_predict
elif (
self.num_speculative_tokens > n_predict
and self.num_speculative_tokens % n_predict != 0
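
Concretely, with multi-layer MTP the requested speculative length is clamped to the number of MTP layers reported by the draft config (n_predict). A toy illustration with made-up numbers:

```python
num_speculative_tokens, n_predict = 5, 3   # illustrative values
enable_multi_layers_mtp = True
if enable_multi_layers_mtp and num_speculative_tokens > n_predict:
    # Mirrors the warning_once branch above: clamp instead of erroring out.
    num_speculative_tokens = n_predict
assert num_speculative_tokens == 3
```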
@@ -555,9 +610,17 @@ class SpeculativeConfig:
)
)
self.draft_pipeline_parallel_size = (
SpeculativeConfig._verify_and_get_draft_pp(
self.target_parallel_config,
self.draft_pipeline_parallel_size,
)
)
self.draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
self.target_parallel_config, self.draft_tensor_parallel_size
self.target_parallel_config,
self.draft_tensor_parallel_size,
self.draft_pipeline_parallel_size,
)
)
return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
)
return speculative_draft_tensor_parallel_size
@staticmethod
def _verify_and_get_draft_pp(
target_parallel_config: ParallelConfig,
speculative_draft_pipeline_parallel_size: int | None,
) -> int:
"""
Verifies the pipeline parallel size requested for the draft model via
speculative_draft_pipeline_parallel_size, defaulting to the target model's
pipeline parallel size when it is unset.
"""
if speculative_draft_pipeline_parallel_size is None:
return target_parallel_config.pipeline_parallel_size
if speculative_draft_pipeline_parallel_size not in (
1,
target_parallel_config.pipeline_parallel_size,
):
raise ValueError(
f"{speculative_draft_pipeline_parallel_size=} cannot be "
"other value than 1 or target model "
f"pipeline_parallel_size="
f"{target_parallel_config.pipeline_parallel_size}"
)
return speculative_draft_pipeline_parallel_size
def update_arch_(self):
"""
EAGLEConfig and ExtractHiddenStatesConfig primarily update architectures, so
refresh all architecture-related fields in self.draft_model_config.
"""
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
speculative_draft_pipeline_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
This is mostly a copy of the target parallel config, except the tp/pp
sizes used by the draft model.
"""
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
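
The constraint enforced by _verify_and_get_draft_pp above can be restated standalone, with plain ints instead of ParallelConfig; this is purely illustrative:

```python
def check_draft_pp(draft_pp: int | None, target_pp: int) -> int:
    """Mirror of the rule above: unset inherits the target PP;
    otherwise only 1 or the target PP itself is accepted."""
    if draft_pp is None:
        return target_pp
    if draft_pp not in (1, target_pp):
        raise ValueError(f"draft PP must be 1 or {target_pp}, got {draft_pp}")
    return draft_pp

assert check_draft_pp(None, 4) == 4   # default: match the target
assert check_draft_pp(1, 4) == 1      # drafter only on the last stage
```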
@@ -699,6 +806,12 @@ class SpeculativeConfig:
"'tensor_parallel_size' is not a valid argument in the "
"speculative_config. Please pass 'draft_tensor_parallel_size' instead."
)
if self.pipeline_parallel_size is not None:
raise ValueError(
"'pipeline_parallel_size' is not a valid argument in the "
"speculative_config. Please pass "
"'draft_pipeline_parallel_size' instead."
)
if self.num_speculative_tokens is None:
raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
self.draft_parallel_config
)
eagle3_target_supported = [
aux_hidden_states_supported = [
"llama",
"qwen",
"minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
"nemotron_h",
]
if (
self.method == "eagle3"
self.method in ("eagle3", "extract_hidden_states")
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
for supported_model in aux_hidden_states_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
f"{self.method} is only supported for {aux_hidden_states_supported}"
f" models. Got {self.target_model_config.hf_text_config.model_type=}"
)
self.verify_equal_vocab_size_if_draft_model()
return self
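
Note the support check above is substring-based on hf_text_config.model_type, so model-type variants match their base family. A quick illustration with a hypothetical target model_type:

```python
# Subset of the supported-family list above.
aux_hidden_states_supported = ["llama", "qwen", "minicpm", "nemotron_h"]
model_type = "qwen2_moe"   # hypothetical target model_type
assert any(supported in model_type for supported in aux_hidden_states_supported)
```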
@@ -782,8 +895,65 @@ class SpeculativeConfig:
def uses_draft_model(self) -> bool:
return self.method == "draft_model"
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"
def needs_partial_pp_draft_remap(
self, target_parallel_config: ParallelConfig
) -> bool:
"""Whether draft PP is smaller than target PP and needs rank remap."""
if self.draft_parallel_config is None:
return False
return (
target_parallel_config.pipeline_parallel_size
> self.draft_parallel_config.pipeline_parallel_size
)
def resolve_partial_pp_draft_rank(
self, target_parallel_config: ParallelConfig
) -> int:
"""Map a target rank to the local draft rank for partial-PP drafting.
Currently this only supports running the draft model with `draft_pp=1`
on the last target PP stage.
"""
if not self.needs_partial_pp_draft_remap(target_parallel_config):
return target_parallel_config.rank
assert self.draft_parallel_config is not None
draft_pp = self.draft_parallel_config.pipeline_parallel_size
if draft_pp != 1:
raise ValueError(
"Partial pp drafter rank remapping only supports "
"draft_pipeline_parallel_size=1 when target PP is larger."
)
target_tp = target_parallel_config.tensor_parallel_size
draft_tp = self.draft_parallel_config.tensor_parallel_size
if draft_tp != target_tp:
raise ValueError(
"Partial pp drafter rank remapping requires "
"draft_tensor_parallel_size to equal target tensor_parallel_size. "
f"Got draft_tp={draft_tp}, target_tp={target_tp}."
)
target_pp = target_parallel_config.pipeline_parallel_size
target_rank = target_parallel_config.rank
target_pp_rank = target_rank // target_tp
target_tp_rank = target_rank % target_tp
if target_pp_rank != target_pp - 1:
raise ValueError(
"Partial pp drafter should only run on the last "
f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
)
return target_tp_rank
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
model = (
None
if method in ("ngram", "suffix", "extract_hidden_states")
else self.draft_model_config.model
)
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"