From 4fa7cf6f506d798c06fc17e1817f31f9a841d09d Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Sat, 7 Feb 2026 09:30:10 +0800 Subject: [PATCH] [Bugfix] Fix problematic dummy_run & improper input_batch_size in eagle (#6517) ### What this PR does / why we need it? This PR fixes a problematic dummy_run that causes excessive NPU memory usage, and fixes an improper input_batch_size that degrades runtime performance. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Tested by CI. - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 --------- Signed-off-by: Zetong Li Signed-off-by: lilinsiman Co-authored-by: lilinsiman --- vllm_ascend/spec_decode/eagle_proposer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index e784bbb6..17abdc42 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -347,7 +347,7 @@ class EagleProposer(VllmEagleProposer): model_positions = self._get_positions(num_tokens) - batch_size = num_tokens // (self.num_speculative_tokens + 1) + batch_size = num_tokens // (self.num_speculative_tokens + 1) if not is_profile else self.runner.max_num_reqs with set_ascend_forward_context( multi_steps_attn_metadata[0] if multi_steps_attn_metadata else None, @@ -613,7 +613,7 @@ class EagleProposer(VllmEagleProposer): hidden_states = hidden_states[last_token_indices] last_token_indices = self.arange[:batch_size] - input_batch_size = num_input_tokens + input_batch_size = num_input_tokens if (self.method == "mtp" or self.use_cuda_graph) else batch_size forward_context = get_forward_context() forward_context.num_tokens = input_batch_size