[Bugfix] Fix problematic dummy_run & improper input_batch_size in eagle (#6517)
### What this PR does / why we need it?
This PR fixes a problematic dummy_run that caused excessive NPU memory usage, and an improper input_batch_size that degraded running performance.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
By CI.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: Zetong Li <slippersss@126.com>
Signed-off-by: lilinsiman <lilinsiman@gmail.com>
Co-authored-by: lilinsiman <lilinsiman@gmail.com>
@@ -347,7 +347,7 @@ class EagleProposer(VllmEagleProposer):
 model_positions = self._get_positions(num_tokens)
-batch_size = num_tokens // (self.num_speculative_tokens + 1)
+batch_size = num_tokens // (self.num_speculative_tokens + 1) if not is_profile else self.runner.max_num_reqs
 with set_ascend_forward_context(
     multi_steps_attn_metadata[0] if multi_steps_attn_metadata else None,
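A minimal, self-contained sketch of why this hunk matters (the numbers below are illustrative assumptions, not values from the repository): during a profiling dummy_run, num_tokens is presumably the padded token budget rather than a real batch, so dividing it by (num_speculative_tokens + 1) yields far more "requests" than the scheduler can ever issue, and the dummy run allocates NPU buffers for all of them. Capping the profile batch at self.runner.max_num_reqs keeps those buffers bounded.

```python
# Illustrative sketch only; max_num_tokens / max_num_reqs values are assumptions.
max_num_tokens = 8192          # dummy_run profiles with the full token budget
max_num_reqs = 256             # scheduler never runs more concurrent requests than this
num_speculative_tokens = 1

# Old behaviour: batch size derived from the padded token count.
old_batch_size = max_num_tokens // (num_speculative_tokens + 1)    # 4096 "requests"

# Patched behaviour: a profile run is capped at max_num_reqs.
is_profile = True
new_batch_size = (max_num_tokens // (num_speculative_tokens + 1)
                  if not is_profile else max_num_reqs)              # 256 requests

print(old_batch_size, new_batch_size)  # 4096 vs 256 -> much smaller dummy buffers
```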
@@ -613,7 +613,7 @@ class EagleProposer(VllmEagleProposer):
 hidden_states = hidden_states[last_token_indices]
 last_token_indices = self.arange[:batch_size]
-input_batch_size = num_input_tokens
+input_batch_size = num_input_tokens if (self.method == "mtp" or self.use_cuda_graph) else batch_size
 forward_context = get_forward_context()
 forward_context.num_tokens = input_batch_size
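A similar sketch for the second hunk (the values and the stated rationale are assumptions drawn from the condition in the diff, not confirmed by the repository): when CUDA graphs are used, or for the "mtp" method, the padded num_input_tokens is presumably kept so tensor shapes stay fixed, while the plain eager eagle path can feed the real batch_size into the forward context and avoid working over padded tokens.

```python
# Illustrative sketch only; the numeric values are made up.
num_input_tokens = 512   # token count padded to a graph-capture-friendly size
batch_size = 48          # actual number of requests in this step

method = "eagle"
use_cuda_graph = False

# Mirrors the selection logic in the diff.
input_batch_size = (num_input_tokens
                    if (method == "mtp" or use_cuda_graph)
                    else batch_size)

# forward_context.num_tokens would then be 48 here instead of 512,
# so the eager eagle path no longer pays for the padding.
print(input_batch_size)
```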