From 9cc41c94573af69580029c835b6d6ddf5ba6207a Mon Sep 17 00:00:00 2001
From: wangbj127 <256472688+wangbj127@users.noreply.github.com>
Date: Sun, 29 Mar 2026 12:23:44 +0800
Subject: [PATCH] [v0.18.0][Bugfix][EAGLE] Fix FIA pad bug under max
 concurrency (#7754)

Cherry-picked from https://github.com/vllm-project/vllm-ascend/pull/7740.

Fixes padding problems of the FIA op under max concurrency.

- vLLM version: v0.18.0
- vLLM main: https://github.com/vllm-project/vllm/commit/35141a7eeda941a60ad5a4956670c60fd5a77029

Signed-off-by: Wangbingjie
---
 .../spec_decode/test_v1_spec_decode.py        | 29 +++++++++++++++++++
 vllm_ascend/spec_decode/eagle_proposer.py     |  2 ++
 2 files changed, 31 insertions(+)

diff --git a/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
index a24808cb..a011516d 100644
--- a/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
@@ -534,3 +534,32 @@ def test_parallel_drafting_acceptance(
     print(f"golden: {golden}")

     assert match
+
+
+@pytest.mark.parametrize("method", MODELS.keys())
+@pytest.mark.parametrize("num_speculative_tokens", [3])
+def test_eagle3_fia_pad_under_max_concurrency(
+    method: str,
+    num_speculative_tokens: int,
+):
+    main_model_name = MODELS[method]["main"]
+    spec_model_name = MODELS[method]["spec"]
+    prompts = [
+        "Hello, I am",
+    ]
+    speculative_config = {
+        "method": method,
+        "num_speculative_tokens": num_speculative_tokens,
+        "model": spec_model_name,
+    }
+    max_num_tokens = 1 + num_speculative_tokens
+    compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[max_num_tokens])
+    with VllmRunner(
+            main_model_name,
+            max_model_len=2048,
+            tensor_parallel_size=1,
+            speculative_config=speculative_config,
+            max_num_batched_tokens=max_num_tokens,
+            compilation_config=compilation_config,
+    ) as llm:
+        _ = llm.generate_greedy(prompts, max_tokens=10)
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 20a82ebc..e47f4590 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -168,6 +168,8 @@ class SpecDecodeBaseProposer(EagleProposer):
         # RoPE need (max_num_tokens,)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
                                      device=device)
+        self.token_arange_np = np.arange(self.max_num_tokens + 1)
+
     def _get_model(self) -> nn.Module:
         """Default method to call get_model(). Can be overridden by subclasses which
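
Below is a minimal sketch (not part of the patch, and not the actual vllm-ascend code) of the kind of off-by-one that sizing the buffer as np.arange(max_num_tokens + 1) would avoid: cumulative token-boundary arrays over a fully packed batch of max_num_tokens tokens have max_num_tokens + 1 entries. The name cu_num_tokens and the values here are illustrative assumptions, not taken from the patch:

    import numpy as np

    max_num_tokens = 4  # e.g. 1 + num_speculative_tokens with 3 draft tokens
    token_arange_np = np.arange(max_num_tokens + 1)  # [0, 1, 2, 3, 4]

    # Cumulative per-request token boundaries over a fully packed
    # (max-concurrency) batch; the final boundary equals max_num_tokens,
    # so an arange of only max_num_tokens elements could not cover it.
    cu_num_tokens = np.array([0, 1, 4])  # two requests: 1 token + 3 tokens
    assert cu_num_tokens[-1] == max_num_tokens
    assert token_arange_np[cu_num_tokens[-1]] == max_num_tokens  # in range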