From 4fa7cf6f506d798c06fc17e1817f31f9a841d09d Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Sat, 7 Feb 2026 09:30:10 +0800 Subject: [PATCH] [Bugfix] Fix problematic dummy_run & improper input_batch_size in eagle (#6517) ### What this PR does / why we need it? This PR fixes a problematic dummy_run that causes excessive NPU memory usage, and fixes an improper input_batch_size that degrades runtime performance. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Tested by CI. - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 --------- Signed-off-by: Zetong Li Signed-off-by: lilinsiman Co-authored-by: lilinsiman --- vllm_ascend/spec_decode/eagle_proposer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index e784bbb6..17abdc42 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -347,7 +347,7 @@ class EagleProposer(VllmEagleProposer): model_positions = self._get_positions(num_tokens) - batch_size = num_tokens // (self.num_speculative_tokens + 1) + batch_size = num_tokens // (self.num_speculative_tokens + 1) if not is_profile else self.runner.max_num_reqs with set_ascend_forward_context( multi_steps_attn_metadata[0] if multi_steps_attn_metadata else None, @@ -613,7 +613,7 @@ class EagleProposer(VllmEagleProposer): hidden_states = hidden_states[last_token_indices] last_token_indices = self.arange[:batch_size] - input_batch_size = num_input_tokens + input_batch_size = num_input_tokens if (self.method == "mtp" or self.use_cuda_graph) else batch_size forward_context = get_forward_context() forward_context.num_tokens = input_batch_size