diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 356efad6..7c4f2e3b 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -45,6 +45,9 @@ class EagleProposer(Proposer): self.vllm_config = vllm_config self.device = device self.runner = runner + self.speculative_config = vllm_config.speculative_config + self.draft_model_config = self.speculative_config.draft_model_config + self.method = self.speculative_config.method self.block_size = vllm_config.cache_config.block_size # We need to get the hidden size from the draft model config because @@ -99,6 +102,29 @@ class EagleProposer(Proposer): device="cpu", dtype=torch.int32) self.attn_mask_builder = AttentionMaskBuilder(self.device) + self.eagle3_use_aux_hidden_state: bool = ( + self._get_eagle3_use_aux_hidden_state_from_config()) + + def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool: + """ + NOTE(2025-12-18): This is an explicit copy from vLLM EagleProposer, only added + to align with its logic. + + Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary + hidden states and directly use the last layer output just like eagle1. + They might indicate this by setting "use_aux_hidden_state" to False + inside the "eagle_config" dict of their hf_config.
+ """ + if self.method != "eagle3": + return False + # Assume that eagle3 heads use aux hidden states by default + use_aux_hidden_state = True + eagle_config = getattr(self.draft_model_config.hf_config, + "eagle_config", None) + if eagle_config is not None: + use_aux_hidden_state = eagle_config.get("use_aux_hidden_state", + True) + return use_aux_hidden_state def load_model(self, model: nn.Module) -> None: target_attn_layer_names = set( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 86036a17..5e1cc0fc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -112,7 +112,6 @@ from vllm_ascend.sample.logits_processor import build_logitsprocs from vllm_ascend.sample.sampler import AscendSampler from vllm_ascend.spec_decode import get_spec_decode_method from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.spec_decode.interface import SpecDcodeType from vllm_ascend.spec_decode.mtp_proposer import MtpProposer from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration, enable_sp, get_ascend_device_type, is_moe_model, @@ -385,6 +384,10 @@ class NPUModelRunner(GPUModelRunner): ) if get_pp_group().is_last_rank: self.drafter = self._get_drafter() + if self.speculative_config.method == "eagle3": + assert isinstance(self.drafter, EagleProposer) + self.use_aux_hidden_state_outputs = ( + self.drafter.eagle3_use_aux_hidden_state) self.rejection_sampler = RejectionSampler(self.sampler) self.actual_seq_lengths_q = list( range(self.decode_token_per_req, self.max_num_tokens + 1, @@ -1417,7 +1420,7 @@ class NPUModelRunner(GPUModelRunner): scheduler_output) aux_hidden_states = None - if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = hidden_states kv_connector_output = KVConnectorOutput( @@ -1929,7 +1932,7 @@ class NPUModelRunner(GPUModelRunner): 
update_attn_params(self.update_stream, forward_context, num_tokens, self.vllm_config) - if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: hidden_states, _ = hidden_states else: hidden_states = hidden_states @@ -2196,7 +2199,7 @@ class NPUModelRunner(GPUModelRunner): if self.drafter: logger.info("Loading drafter model...") self.drafter.load_model(self.model) - if self.drafter.name == SpecDcodeType.EAGLE3: + if self.use_aux_hidden_state_outputs: self.model.set_aux_hidden_state_layers( self.model.get_eagle3_aux_hidden_state_layers())