Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -16,8 +16,8 @@ class AttentionConfig:
backend: AttentionBackendEnum | None = None
"""Attention backend to use. If None, will be selected automatically."""
flash_attn_version: Literal[2, 3] | None = None
"""Force vllm to use a specific flash-attention version (2 or 3).
flash_attn_version: Literal[2, 3, 4] | None = None
"""Force vllm to use a specific flash-attention version (2, 3, or 4).
Only valid when using the flash-attention backend."""
use_prefill_decode_attention: bool = False
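
For reference, a minimal sketch of opting into the newly allowed version 4, assuming AttentionConfig is importable from vllm.config (the exact import path is an assumption):

from vllm.config import AttentionConfig  # import path assumed, may differ per release

# Force FlashAttention v4; only honored when the flash-attention backend is selected.
attn_cfg = AttentionConfig(flash_attn_version=4)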

View File

@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):
def separate_routine(self) -> bool:
return isinstance(self.value, tuple)
def decode_use_graph(self) -> bool:
return self.decode_mode() == CUDAGraphMode.FULL
def valid_runtime_modes(self) -> bool:
return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
@classmethod
def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
def is_valid_runtime_mode(self) -> bool:
return self in CUDAGraphMode.valid_runtime_modes()
def __str__(self) -> str:
return self.name
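
A small sketch of how the new classmethod and instance helper fit together, assuming CUDAGraphMode is importable from vllm.config and FULL_DECODE_ONLY remains a composite (tuple-valued) member:

from vllm.config import CUDAGraphMode  # import path assumed

# Only the single-phase modes count as runtime modes.
assert CUDAGraphMode.FULL in CUDAGraphMode.valid_runtime_modes()
assert CUDAGraphMode.NONE.is_valid_runtime_mode()
# Composite modes such as FULL_DECODE_ONLY are configuration-only and fail the check.
assert not CUDAGraphMode.FULL_DECODE_ONLY.is_valid_runtime_mode()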
@@ -385,7 +392,7 @@ class CompilationConfig:
Please use mode. Currently all levels are mapped to mode.
"""
# Top-level Compilation control
mode: CompilationMode = Field(default=None)
mode: CompilationMode = Field(default=CompilationMode.NONE)
"""The compilation approach used for torch.compile-based compilation of the
model.
@@ -503,7 +510,7 @@ class CompilationConfig:
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
# CudaGraph compilation
cudagraph_mode: CUDAGraphMode = Field(default=None)
cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
"""
The mode of the cudagraph:
@@ -1003,6 +1010,7 @@ class CompilationConfig:
# https://github.com/vllm-project/vllm/issues/33267
if not self.use_inductor_graph_partition:
self.splitting_ops.append("vllm::unified_kv_cache_update")
self.splitting_ops.append("vllm::unified_mla_kv_cache_update")
elif len(self.splitting_ops) == 0:
if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
"are optimized for prefill and are incompatible with CUDA Graphs. "
"In order to use CUDA Graphs for decode-optimized workloads, "
"use --all2all-backend with another option, such as "
"deepep_low_latency, pplx, or allgather_reducescatter."
"deepep_low_latency or allgather_reducescatter."
)
self.cudagraph_mode = CUDAGraphMode.NONE

View File

@@ -50,8 +50,6 @@ from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils.import_utils import LazyLoader
from vllm.v1.attention.backends.registry import AttentionBackendEnum
import os
if TYPE_CHECKING:
from transformers import PretrainedConfig
@@ -128,6 +126,7 @@ class ModelConfig:
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
@@ -463,8 +462,6 @@ class ModelConfig:
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform
if self.override_attention_dtype is not None and not current_platform.is_rocm():
warnings.warn(
"override-attention-dtype is set but not using ROCm platform",
@@ -473,10 +470,9 @@ class ModelConfig:
if self.enable_sleep_mode and not current_platform.is_sleep_mode_available():
raise ValueError("Sleep mode is not supported on current platform.")
temp_hf_config_path = os.environ.get("CUSTOM_QUANT_CONFIG", None)
hf_config = get_config(
temp_hf_config_path or self.hf_config_path or self.model,
self.hf_config_path or self.model,
self.trust_remote_code,
self.revision,
self.code_revision,
@@ -622,6 +618,16 @@ class ModelConfig:
self._try_verify_and_update_model_config()
self._verify_quantization()
self._verify_cuda_graph()
import os
enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH")
if enforce_cuda_graph in ("1", "y", "Y"):
self.enforce_eager = False
else:
self.enforce_eager = True
logger.warning_once(
"Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. "
"For now, cuda graph is not used and eager execution is enforced; "
"we are working towards making cuda graph the default mode.")
self._verify_bnb_config()
def get_model_arch_config(
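
A minimal sketch of opting into cuda graphs under this overlay, assuming the environment variable is read at ModelConfig construction time as shown above:

import os

# Must be set before the engine / ModelConfig is created.
os.environ["VLLM_ENFORCE_CUDA_GRAPH"] = "1"
# With the flag set, enforce_eager stays False and cuda graph capture is allowed;
# without it, the overlay forces enforce_eager=True and logs the warning above.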
@@ -886,6 +892,7 @@ class ModelConfig:
"modelopt",
"modelopt_fp4",
"modelopt_mxfp8",
"modelopt_mixed",
"petit_nvfp4",
# Ensure heavy backends are probed last to avoid unnecessary
# imports during override detection (e.g., MXFP4 imports Triton)
@@ -942,8 +949,6 @@ class ModelConfig:
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}."
)
from vllm.platforms import current_platform
current_platform.verify_quantization(self.quantization)
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
@@ -1813,8 +1818,6 @@ def _resolve_auto_dtype(
*,
is_pooling_model: bool,
):
from vllm.platforms import current_platform
supported_dtypes = [
dtype
for dtype in current_platform.supported_dtypes

View File

@@ -152,7 +152,6 @@ class ParallelConfig:
- "naive": Naive all2all implementation using broadcasts\n
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
- "pplx": Use pplx kernels\n
- "deepep_high_throughput": Use deepep high-throughput kernels\n
- "deepep_low_latency": Use deepep low-latency kernels\n
- "mori": Use mori kernels\n
@@ -166,6 +165,9 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_elastic_ep: bool = False
"""Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
enable_dbo: bool = False
"""Enable dual batch overlap for the model executor."""
ubatch_size: int = 0
@@ -245,6 +247,34 @@ class ParallelConfig:
Set to be private as it's not intended to be configured by users.
"""
_stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless DP groups when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
It is a list of list[int], where each inner list contains a set of 3 ports
to be used for setting up the stateless CPU/device/TCPStore groups
in StatelessGroupCoordinator. The number of inner lists is equal to
the number of DP groups,
i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
"""
_stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless EP groups when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size.
"""
_stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless EPLB groups when enable_elastic_ep is True.
Same topology as EP but separate NCCL communicator to avoid deadlocks.
"""
_stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
"""List of open ports for stateless world group when enable_elastic_ep is True.
Set to be private as it's not intended to be configured by users.
len(self._stateless_world_group_port_list) == 1.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -310,6 +340,13 @@ class ParallelConfig:
f"but found: {self._api_process_rank}"
)
if self.all2all_backend == "pplx":
logger.warning(
"The 'pplx' all2all backend has been removed. "
"Falling back to 'allgather_reducescatter'."
)
self.all2all_backend = "allgather_reducescatter"
if self.data_parallel_size_local > self.data_parallel_size:
raise ValueError(
f"data_parallel_size_local ({self.data_parallel_size_local}) "
@@ -396,7 +433,67 @@ class ParallelConfig:
return answer
def stateless_init_dp_group(self) -> ProcessGroup:
def allocate_elastic_ep_ports(self) -> None:
"""Allocate all ports for elastic EP (stateless groups + DP master).
Must be called AFTER ray.init() so that ports claimed by Ray's
idle worker pool are already in use and won't be returned by
get_open_ports_list().
"""
if not self.enable_elastic_ep:
return
if self._stateless_world_group_port_list:
return
num_world_groups = 1
dp_size = self.data_parallel_size
ep_size = self.data_parallel_size * self.world_size_across_dp
num_dp_groups = max(1, self.world_size_across_dp // dp_size)
num_ep_groups = max(1, self.world_size_across_dp // ep_size)
num_eplb_groups = num_ep_groups
total_stateless_ports = (
num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
) * 3
num_dp_master_ports = 5
all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
all_ports = all_ports[:-num_dp_master_ports]
self._stateless_world_group_port_list = [
all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
]
start_idx = num_world_groups * 3
self._stateless_dp_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
]
start_idx += num_dp_groups * 3
self._stateless_ep_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
]
start_idx += num_ep_groups * 3
self._stateless_eplb_group_port_list = [
all_ports[i : i + 3]
for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
]
def get_next_stateless_world_group_port(self) -> list[int]:
return self._stateless_world_group_port_list.pop()
def get_next_stateless_dp_group_port(self) -> list[int]:
return self._stateless_dp_group_port_list.pop()
def get_next_stateless_ep_group_port(self) -> list[int]:
return self._stateless_ep_group_port_list.pop()
def get_next_stateless_eplb_group_port(self) -> list[int]:
return self._stateless_eplb_group_port_list.pop()
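
A worked example of the port budget implied by allocate_elastic_ep_ports(), using hypothetical sizes:

# world_size_across_dp = 8, data_parallel_size = 4 (hypothetical)
# num_world_groups = 1
# num_dp_groups    = max(1, 8 // 4)  = 2
# ep_size          = 4 * 8           = 32
# num_ep_groups    = max(1, 8 // 32) = 1
# num_eplb_groups  = 1
# stateless ports  = (1 + 2 + 1 + 1) * 3 = 15
# total requested  = 15 + 5 DP master ports = 20 open ports from get_open_ports_list()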
def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
# NOTE: In high-concurrency scenarios multiple processes
# can pick the same (currently free) port through a race
# condition when calling `get_open_port()`. When the first
@@ -420,7 +517,8 @@ class ParallelConfig:
self.get_next_dp_init_port(),
self.data_parallel_rank,
self.data_parallel_size,
backend=current_platform.dist_backend,
backend="gloo",
return_store=return_store,
)
except DistNetworkError as e:
# We only want to retry when the root cause is EADDRINUSE.
@@ -442,7 +540,6 @@ class ParallelConfig:
# In this case, ensure the input to the experts is sequence parallel
# to avoid the excess work.
#
# Not needed for pplx-kernels as it can handle duplicate input tokens.
@property
def use_sequence_parallel_moe(self) -> bool:
return (
@@ -556,6 +653,21 @@ class ParallelConfig:
logger.info("Using external launcher for distributed inference.")
self.world_size *= self.data_parallel_size
if self.enable_elastic_ep:
if not self.enable_eplb:
raise ValueError("Elastic EP is only supported with enable_eplb=True.")
if self.pipeline_parallel_size > 1:
raise ValueError(
"Elastic EP is not supported with pipeline parallelism "
f"(pipeline_parallel_size={self.pipeline_parallel_size})."
)
if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
raise NotImplementedError(
"Elastic EP is not compatible with data_parallel_external_lb "
"or data_parallel_hybrid_lb. Elastic EP relies on a single API "
"server and core client to coordinate scale up/down."
)
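
A sketch of a parallel layout that passes the elastic EP checks above; constructor field names other than those referenced in this diff are assumptions:

from vllm.config import ParallelConfig  # import path assumed

parallel_config = ParallelConfig(
    data_parallel_size=4,
    tensor_parallel_size=2,
    pipeline_parallel_size=1,   # PP > 1 is rejected for elastic EP
    enable_elastic_ep=True,
    enable_eplb=True,           # elastic EP requires EPLB
)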
if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
# Data parallel was specified in the engine args.
if self.distributed_executor_backend == "external_launcher":
@@ -568,9 +680,12 @@ class ParallelConfig:
"Set data_parallel_rank to %d automatically.",
self.data_parallel_rank,
)
if not self._data_parallel_master_port_list:
self._data_parallel_master_port_list = get_open_ports_list(5)
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
if not self.enable_elastic_ep:
if not self._data_parallel_master_port_list:
self._data_parallel_master_port_list = get_open_ports_list(5)
self.data_parallel_master_port = (
self._data_parallel_master_port_list.pop()
)
if not (0 <= self.data_parallel_rank < self.data_parallel_size):
raise ValueError(
@@ -597,7 +712,7 @@ class ParallelConfig:
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
logger.info("Disabling V1 multiprocessing for external launcher.")
if self.distributed_executor_backend is None and self.world_size > 1:
if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
@@ -659,6 +774,17 @@ class ParallelConfig:
"backend is mp, uni or external_launcher."
)
if (
self.all2all_backend in ("allgather_reducescatter", "naive")
and self.eplb_config.use_async
):
logger.warning(
"Async EPLB causes hangs with the '%s' all2all backend. "
"Forcing synchronous EPLB.",
self.all2all_backend,
)
self.eplb_config.use_async = False
@property
def use_ray(self) -> bool:
return self.distributed_executor_backend == "ray" or (

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import copy
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
@@ -45,7 +46,7 @@ MTPModelTypes = Literal[
"pangu_ultra_moe_mtp",
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
SpeculativeMethod = Literal[
"ngram",
"medusa",
@@ -77,12 +78,24 @@ class SpeculativeConfig:
If using `ngram` method, the related configuration `prompt_lookup_max` and
`prompt_lookup_min` should be considered."""
enable_multi_layers_mtp: bool = False
"""If set to True, the MTP method will run multiple layers of MTP
speculator. If set to False, it will run only one layer of MTP speculator.
This is only effective when the method is set to `mtp`."""
draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
draft_pipeline_parallel_size: int | None = Field(default=None, ge=1)
"""The degree of pipeline parallelism for the draft model.
Defaults to the target model's pipeline parallel size. Set this to 1 to
run the drafter locally on the last target PP stage."""
tensor_parallel_size: int | None = None
"""Users should pass "draft_tensor_parallel_size". This parameter's purpose is to
warn users when they mistakenly provide the wrong argument."""
pipeline_parallel_size: int | None = None
"""Users should pass "draft_pipeline_parallel_size". This parameter's
purpose is to warn users when they mistakenly provide the wrong argument."""
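
A hedged sketch of the intended usage, passing the draft_* names rather than the bare tensor/pipeline sizes; the model path and token count are hypothetical:

speculative_config = {
    "method": "eagle3",
    "model": "path/to/eagle3-draft",      # hypothetical draft model
    "num_speculative_tokens": 3,
    "draft_tensor_parallel_size": 1,
    "draft_pipeline_parallel_size": 1,    # run the drafter on the last target PP stage
}
# Passing "tensor_parallel_size" or "pipeline_parallel_size" here raises a ValueError
# pointing at the draft_* variants (see the checks later in this file).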
# Draft model configuration
quantization: me_quant.QuantizationMethods | None = None
@@ -181,9 +194,22 @@ class SpeculativeConfig:
the final hidden states.
"""
factors: list[Any] = []
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
# Eagle3 and extract_hidden_states affect the computation graph because
# they return intermediate hidden states in addition to the final hidden state.
uses_aux_hidden_states = self.method in ("eagle3", "extract_hidden_states")
factors.append(uses_aux_hidden_states)
# The specific layers used also affect the computation graph
if uses_aux_hidden_states and self.draft_model_config is not None:
layer_ids = getattr(
self.draft_model_config.hf_config,
"eagle_aux_hidden_state_layer_ids",
None,
)
if layer_ids is not None:
# Convert to tuple to make it hashable
factors.append(tuple(layer_ids))
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@@ -352,6 +378,8 @@ class SpeculativeConfig:
self.model = "ngram"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
self.model = "extract_hidden_states"
else:
raise ValueError(
"num_speculative_tokens was provided but without speculative model."
@@ -394,6 +422,34 @@ class SpeculativeConfig:
self.draft_parallel_config = self.target_parallel_config
elif self.method == "suffix":
self._validate_suffix_decoding()
elif self.method == "extract_hidden_states":
from vllm.transformers_utils.configs.extract_hidden_states import (
ExtractHiddenStatesConfig,
)
# ExtractHiddenStatesModel is instantiated manually in load_model()
# We just need to store the target model config for KV cache shape info
self.model = "extract_hidden_states"
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
if hasattr(self.draft_model_config, "hf_config"):
hf_config = self.draft_model_config.hf_config.to_dict()
elif (
isinstance(self.draft_model_config, dict)
and "hf_config" in self.draft_model_config
):
hf_config = self.draft_model_config["hf_config"]
else:
hf_config = {}
self.draft_model_config = copy.copy(self.target_model_config)
self.draft_model_config.hf_config = ExtractHiddenStatesConfig(
self.draft_model_config.hf_config, **hf_config
)
self.update_arch_()
self.draft_parallel_config = self.target_parallel_config
else:
self.prompt_lookup_max = 0
self.prompt_lookup_min = 0
@@ -439,7 +495,10 @@ class SpeculativeConfig:
MTPModelTypes
):
self.method = "mtp"
if self.num_speculative_tokens > 1:
if (
not self.enable_multi_layers_mtp
and self.num_speculative_tokens > 1
):
logger.warning(
"Enabling num_speculative_tokens > 1 will run "
"multiple times of forward on same MTP layer"
@@ -478,23 +537,8 @@ class SpeculativeConfig:
method=self.method,
model_type="eagle",
)
# EAGLEConfig primarily updates architectures, so update
# all architectures-related fields in draft_model_config
self.draft_model_config.hf_config = eagle_config
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = (
self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
self.update_arch_()
if self.num_speculative_tokens is not None and hasattr(
self.draft_model_config.hf_config, "num_lookahead_tokens"
@@ -510,6 +554,17 @@ class SpeculativeConfig:
if self.num_speculative_tokens is None:
# Default to max value defined in draft model config.
self.num_speculative_tokens = n_predict
elif (
self.method == "mtp"
and self.enable_multi_layers_mtp
and self.num_speculative_tokens > n_predict
):
logger.warning_once(
"For multi_layer_eagle, num_speculative_tokens "
"is greater than the layer_num, adjusting to "
"layer_num"
)
self.num_speculative_tokens = n_predict
elif (
self.num_speculative_tokens > n_predict
and self.num_speculative_tokens % n_predict != 0
@@ -555,9 +610,17 @@ class SpeculativeConfig:
)
)
self.draft_pipeline_parallel_size = (
SpeculativeConfig._verify_and_get_draft_pp(
self.target_parallel_config,
self.draft_pipeline_parallel_size,
)
)
self.draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
self.target_parallel_config, self.draft_tensor_parallel_size
self.target_parallel_config,
self.draft_tensor_parallel_size,
self.draft_pipeline_parallel_size,
)
)
return self
@@ -671,17 +734,61 @@ class SpeculativeConfig:
)
return speculative_draft_tensor_parallel_size
@staticmethod
def _verify_and_get_draft_pp(
target_parallel_config: ParallelConfig,
speculative_draft_pipeline_parallel_size: int | None,
) -> int:
"""
Verifies and adjusts the pipeline parallel size for a draft model
specified using speculative_draft_pipeline_parallel_size.
"""
if speculative_draft_pipeline_parallel_size is None:
return target_parallel_config.pipeline_parallel_size
if speculative_draft_pipeline_parallel_size not in (
1,
target_parallel_config.pipeline_parallel_size,
):
raise ValueError(
f"{speculative_draft_pipeline_parallel_size=} cannot be "
"other value than 1 or target model "
f"pipeline_parallel_size="
f"{target_parallel_config.pipeline_parallel_size}"
)
return speculative_draft_pipeline_parallel_size
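
Following the check above, with a target pipeline_parallel_size of 4:

# draft_pipeline_parallel_size = None -> 4 (inherit the target PP size)
# draft_pipeline_parallel_size = 1    -> 1 (drafter runs only on the last PP stage)
# draft_pipeline_parallel_size = 2    -> ValueError (only 1 or the target size is allowed)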
def update_arch_(self):
"""
EAGLEConfig and ExtractHiddenStatesConfig update `architectures`, so refresh all
architecture-related fields in self.draft_model_config.
"""
self.draft_model_config.hf_text_config = get_hf_text_config(
self.draft_model_config.hf_config
)
self.draft_model_config.model_arch_config = (
self.draft_model_config.get_model_arch_config()
)
model_info, arch = self.draft_model_config.registry.inspect_model_cls(
self.draft_model_config.architectures,
self.draft_model_config,
)
self.draft_model_config._model_info = model_info
self.draft_model_config._architecture = arch
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: int,
speculative_draft_pipeline_parallel_size: int,
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.
This is mostly a copy of the target parallel config, except the tp_size.
This is mostly a copy of the target parallel config, except the tp/pp
sizes used by the draft model.
"""
draft_parallel_config = ParallelConfig(
pipeline_parallel_size=target_parallel_config.pipeline_parallel_size,
pipeline_parallel_size=speculative_draft_pipeline_parallel_size,
tensor_parallel_size=speculative_draft_tensor_parallel_size,
distributed_executor_backend=target_parallel_config.distributed_executor_backend,
max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers,
@@ -699,6 +806,12 @@ class SpeculativeConfig:
"'tensor_parallel_size' is not a valid argument in the "
"speculative_config. Please pass 'draft_tensor_parallel_size' instead."
)
if self.pipeline_parallel_size is not None:
raise ValueError(
"'pipeline_parallel_size' is not a valid argument in the "
"speculative_config. Please pass "
"'draft_pipeline_parallel_size' instead."
)
if self.num_speculative_tokens is None:
raise ValueError(
@@ -718,7 +831,7 @@ class SpeculativeConfig:
self.draft_parallel_config
)
eagle3_target_supported = [
aux_hidden_states_supported = [
"llama",
"qwen",
"minicpm",
@@ -729,16 +842,16 @@ class SpeculativeConfig:
"nemotron_h",
]
if (
self.method == "eagle3"
self.method in ("eagle3", "extract_hidden_states")
and self.target_model_config
and not any(
supported_model in self.target_model_config.hf_text_config.model_type
for supported_model in eagle3_target_supported
for supported_model in aux_hidden_states_supported
)
):
raise ValueError(
f"Eagle3 is only supported for {eagle3_target_supported} models. " # noqa: E501
f"Got {self.target_model_config.hf_text_config.model_type=}"
f"{self.method} is only supported for {aux_hidden_states_supported}"
f" models. Got {self.target_model_config.hf_text_config.model_type=}"
)
self.verify_equal_vocab_size_if_draft_model()
return self
@@ -782,8 +895,65 @@ class SpeculativeConfig:
def uses_draft_model(self) -> bool:
return self.method == "draft_model"
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"
def needs_partial_pp_draft_remap(
self, target_parallel_config: ParallelConfig
) -> bool:
"""Whether draft PP is smaller than target PP and needs rank remap."""
if self.draft_parallel_config is None:
return False
return (
target_parallel_config.pipeline_parallel_size
> self.draft_parallel_config.pipeline_parallel_size
)
def resolve_partial_pp_draft_rank(
self, target_parallel_config: ParallelConfig
) -> int:
"""Map a target rank to the local draft rank for partial-PP drafting.
Currently this only supports running the draft model with `draft_pp=1`
on the last target PP stage.
"""
if not self.needs_partial_pp_draft_remap(target_parallel_config):
return target_parallel_config.rank
assert self.draft_parallel_config is not None
draft_pp = self.draft_parallel_config.pipeline_parallel_size
if draft_pp != 1:
raise ValueError(
"Partial pp drafter rank remapping only supports "
"draft_pipeline_parallel_size=1 when target PP is larger."
)
target_tp = target_parallel_config.tensor_parallel_size
draft_tp = self.draft_parallel_config.tensor_parallel_size
if draft_tp != target_tp:
raise ValueError(
"Partial pp drafter rank remapping requires "
"draft_tensor_parallel_size to equal target tensor_parallel_size. "
f"Got draft_tp={draft_tp}, target_tp={target_tp}."
)
target_pp = target_parallel_config.pipeline_parallel_size
target_rank = target_parallel_config.rank
target_pp_rank = target_rank // target_tp
target_tp_rank = target_rank % target_tp
if target_pp_rank != target_pp - 1:
raise ValueError(
"Partial pp drafter should only run on the last "
f"pipeline stage, but got pp rank {target_pp_rank} / {target_pp}"
)
return target_tp_rank
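
A worked example of the remapping, with hypothetical target_pp=4, target_tp=2, draft_pp=1, draft_tp=2:

# target rank 7: pp_rank = 7 // 2 = 3 (last stage), tp_rank = 7 % 2 = 1 -> draft rank 1
# target rank 6: pp_rank = 3, tp_rank = 0                               -> draft rank 0
# target rank 2: pp_rank = 1, not the last stage                        -> ValueError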
def __repr__(self) -> str:
method = self.method
model = None if method in ("ngram", "suffix") else self.draft_model_config.model
model = (
None
if method in ("ngram", "suffix", "extract_hidden_states")
else self.draft_model_config.model
)
num_spec_tokens = self.num_speculative_tokens
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"

View File

@@ -126,6 +126,9 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
# tp-dp combination broken:
# https://github.com/vllm-project/vllm/issues/34458
and cfg.parallel_config.data_parallel_size == 1
# tp-pp combination broken:
# https://github.com/vllm-project/vllm/issues/35426
and cfg.parallel_config.pipeline_parallel_size == 1
)
@@ -857,7 +860,7 @@ class VllmConfig:
self.compilation_config.pass_config.fuse_gemm_comms = False
else:
# Compute SP threshold early; disable if None (model too
# small) before +rms_norm gets forced into custom_ops.
# small for SP to be beneficial).
pass_config = self.compilation_config.pass_config
if pass_config.sp_min_token_num is None:
from vllm.compilation.passes.fusion.sequence_parallelism import (
@@ -880,15 +883,13 @@ class VllmConfig:
self.compilation_config.pass_config.enable_sp = False
self.compilation_config.pass_config.fuse_gemm_comms = False
if self.compilation_config.pass_config.enable_sp:
if "-rms_norm" in self.compilation_config.custom_ops:
logger.warning(
"RMS norm force disabled, sequence parallelism might break"
)
else:
self.compilation_config.custom_ops.append("+rms_norm")
from vllm.utils.torch_utils import HAS_OPAQUE_TYPE
if self.compilation_config.fast_moe_cold_start is None:
if HAS_OPAQUE_TYPE:
# On torch >= 2.11 the hoisted OpaqueObject approach supersedes
# fast_moe_cold_start, so force it off.
self.compilation_config.fast_moe_cold_start = False
elif self.compilation_config.fast_moe_cold_start is None:
# resolve default behavior: try to be as safe as possible
# this config is unsafe if any spec decoding draft model has a MOE.
# We'll conservatively turn it off if we see spec decoding.
@@ -907,9 +908,9 @@ class VllmConfig:
):
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
"Overriding cudagraph_mode to NONE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
elif (
model_config.is_encoder_decoder
and self.compilation_config.cudagraph_mode
@@ -924,6 +925,33 @@ class VllmConfig:
CUDAGraphMode.FULL_DECODE_ONLY
)
# Check if KV connector requires PIECEWISE mode for CUDA graphs
if (
self.kv_transfer_config is not None
and self.kv_transfer_config.is_kv_transfer_instance
and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
):
# Lazy import to avoid circular dependencies
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory,
)
connector_cls = KVConnectorFactory.get_connector_class(
self.kv_transfer_config
)
if connector_cls.requires_piecewise_for_cudagraph(
self.kv_transfer_config.kv_connector_extra_config
):
logger.warning_once(
"KV connector %s requires PIECEWISE CUDA graph mode "
"due to layerwise async operations that cannot be "
"captured in CUDA graphs. "
"Overriding cudagraph_mode from %s to PIECEWISE.",
connector_cls.__name__,
self.compilation_config.cudagraph_mode.name,
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforcing eager execution
if self.model_config is not None and self.model_config.enforce_eager:
logger.info("Cudagraph is disabled under eager mode")
@@ -1113,6 +1141,20 @@ class VllmConfig:
if not self.instance_id:
self.instance_id = random_uuid()[:5]
def is_ixserver_connector(kv_transfer_config) -> bool:
if kv_transfer_config is not None and hasattr(
kv_transfer_config, "kv_connector"
):
connector = kv_transfer_config.kv_connector
if isinstance(connector, str):
connector_name = connector
else:
connector_name = getattr(
type(connector), "__name__", str(connector)
)
return "IxServer" in connector_name
return False
# Hybrid KV cache manager (HMA) runtime rules:
# - Explicit enable (--no-disable-kv-cache-manager): error if runtime
@@ -1154,21 +1196,29 @@ class VllmConfig:
if self.scheduler_config.disable_hybrid_kv_cache_manager is None:
# Default to disable HMA, but only if the user didn't express a preference.
if self.kv_transfer_config is not None:
if is_ixserver_connector(self.kv_transfer_config):
pass
# NOTE(Kuntai): turn HMA off for connector unless specifically enabled.
need_disable_hybrid_kv_cache_manager = True
logger.warning(
"Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the "
"performance of vLLM on LLMs with sliding window attention "
"or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
else:
need_disable_hybrid_kv_cache_manager = True
logger.warning(
"Turning off hybrid kv cache manager because "
"`--kv-transfer-config` is set. This will reduce the "
"performance of vLLM on LLMs with sliding window attention "
"or Mamba attention. If you are a developer of kv connector"
", please consider supporting hybrid kv cache manager for "
"your connector by making sure your connector is a subclass"
" of `SupportsHMA` defined in kv_connector/v1/base.py and"
" use --no-disable-hybrid-kv-cache-manager to start vLLM."
)
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
else:
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
self.scheduler_config.disable_hybrid_kv_cache_manager = (
need_disable_hybrid_kv_cache_manager
)
elif (
self.scheduler_config.disable_hybrid_kv_cache_manager is False
and need_disable_hybrid_kv_cache_manager
@@ -1466,22 +1516,22 @@ class VllmConfig:
if compile_range_end is not None:
computed_compile_ranges_split_points.append(compile_range_end)
# # Add the compile ranges for flashinfer
# if compilation_config.pass_config.fuse_allreduce_rms:
# tp_size = self.parallel_config.tensor_parallel_size
# max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
# if max_size is not None:
# max_token_num = max_size // (
# self.model_config.get_hidden_size()
# * self.model_config.dtype.itemsize
# )
# if compile_range_end is not None and max_token_num < compile_range_end:
# computed_compile_ranges_split_points.append(max_token_num)
# else:
# logger.debug(
# "Max num batched tokens below allreduce-rms fusion threshold, "
# "allreduce-rms fusion will be enabled for all num_tokens."
# )
# Add the compile ranges for flashinfer
if compilation_config.pass_config.fuse_allreduce_rms:
tp_size = self.parallel_config.tensor_parallel_size
max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
if max_size is not None:
max_token_num = max_size // (
self.model_config.get_hidden_size()
* self.model_config.dtype.itemsize
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_split_points.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below allreduce-rms fusion threshold, "
"allreduce-rms fusion will be enabled for all num_tokens."
)
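
A worked example of the re-enabled threshold computation (all numbers hypothetical):

# hidden_size = 4096, dtype = bfloat16 (itemsize 2), flashinfer max_size = 8 MiB for this TP size
# max_token_num = 8 * 1024 * 1024 // (4096 * 2) = 1024
# -> 1024 is appended as a compile-range split point when it is below compile_range_end;
#    otherwise allreduce-rms fusion stays enabled for all token counts.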
# Add the compile ranges for sequence parallelism
if compilation_config.pass_config.enable_sp:
@@ -1618,6 +1668,7 @@ class VllmConfig:
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"enable_return_routed_experts={self.model_config.enable_return_routed_experts}, " # noqa
f"kv_cache_dtype={self.cache_config.cache_dtype}, "

View File

@@ -9,5 +9,5 @@ from vllm.config.utils import config
class WeightTransferConfig:
"""Configuration for weight transfer during RL training."""
backend: Literal["nccl"] = "nccl"
backend: Literal["nccl", "ipc"] = "nccl"
"""The backend to use for weight transfer."""