From 60d9398f6d6dfd0720018b3b291c46c755996946 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 22 Dec 2025 15:24:54 +0800 Subject: [PATCH] [1/N][Eagle3] Aligns auxiliary hidden state usage for eagle3 models (#5162) ### What this PR does / why we need it? This is to prepare for the migration to vLLM's `EagleProposer`, which does not have a `name` attribute. It is also a breakdown of #5100. Introduces logic to determine whether eagle3 heads require auxiliary hidden states based on configuration, ensuring consistent handling across related components. Prevents incorrect assumptions for eagle3 variants that do not use auxiliary outputs, improving compatibility and correctness. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? None. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: Yizhou Liu --- vllm_ascend/spec_decode/eagle_proposer.py | 26 +++++++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 11 ++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 356efad6..7c4f2e3b 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -45,6 +45,9 @@ class EagleProposer(Proposer): self.vllm_config = vllm_config self.device = device self.runner = runner + self.speculative_config = vllm_config.speculative_config + self.draft_model_config = self.speculative_config.draft_model_config + self.method = self.speculative_config.method self.block_size = vllm_config.cache_config.block_size # We need to get the hidden size from the draft model config because @@ -99,6 +102,29 @@ class EagleProposer(Proposer): device="cpu", dtype=torch.int32) self.attn_mask_builder = AttentionMaskBuilder(self.device) + self.eagle3_use_aux_hidden_state: bool = ( + 
self._get_eagle3_use_aux_hidden_state_from_config()) + + def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool: + """ + NOTE(2025-12-18): This is an explicit copy from vLLM EagleProposer, only added + to align with its logic. + + Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary + hidden states and directly use the last layer output just like eagle1. + They might indicate this by setting "use_aux_hidden_state" to False + inside the "eagle_config" dict of their hf_config. + """ + if self.method != "eagle3": + return False + # Assume that eagle3 heads use aux hidden states by default + use_aux_hidden_state = True + eagle_config = getattr(self.draft_model_config.hf_config, + "eagle_config", None) + if eagle_config is not None: + use_aux_hidden_state = eagle_config.get("use_aux_hidden_state", + True) + return use_aux_hidden_state def load_model(self, model: nn.Module) -> None: target_attn_layer_names = set( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 86036a17..5e1cc0fc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,6 @@ from vllm_ascend.sample.logits_processor import build_logitsprocs from vllm_ascend.sample.sampler import AscendSampler from vllm_ascend.spec_decode import get_spec_decode_method from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.spec_decode.interface import SpecDcodeType from vllm_ascend.spec_decode.mtp_proposer import MtpProposer from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, enable_sp, get_ascend_device_type, is_moe_model, @@ -385,6 +384,10 @@ class NPUModelRunner(GPUModelRunner): ) if get_pp_group().is_last_rank: self.drafter = self._get_drafter() + if self.speculative_config.method == "eagle3": + assert isinstance(self.drafter, EagleProposer) + self.use_aux_hidden_state_outputs = ( + self.drafter.eagle3_use_aux_hidden_state) 
self.rejection_sampler = RejectionSampler(self.sampler) self.actual_seq_lengths_q = list( range(self.decode_token_per_req, self.max_num_tokens + 1, @@ -1417,7 +1420,7 @@ class NPUModelRunner(GPUModelRunner): scheduler_output) aux_hidden_states = None - if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = hidden_states kv_connector_output = KVConnectorOutput( @@ -1929,7 +1932,7 @@ class NPUModelRunner(GPUModelRunner): update_attn_params(self.update_stream, forward_context, num_tokens, self.vllm_config) - if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: hidden_states, _ = hidden_states else: hidden_states = hidden_states @@ -2196,7 +2199,7 @@ class NPUModelRunner(GPUModelRunner): if self.drafter: logger.info("Loading drafter model...") self.drafter.load_model(self.model) - if self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: self.model.set_aux_hidden_state_layers( self.model.get_eagle3_aux_hidden_state_layers())