From 169e434f78f03b963efb2779d8c64675313f9481 Mon Sep 17 00:00:00 2001
From: Dijurido <256472688+wangbj127@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:26:01 +0800
Subject: [PATCH] [CI] Fix EAGLE CI problems (#6702)

### What this PR does / why we need it?
The new FIA operator requires `queryT` to equal the last element of
`actualSequenceLengthQ`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Passed the existing test (`test_mtp_eagle_correctness.py`).

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007

---------

Signed-off-by: Wangbingjie
Co-authored-by: Wangbingjie
---
 .../spec_decode/test_mtp_eagle_correctness.py |  2 --
 vllm_ascend/spec_decode/eagle_proposer.py     | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py b/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
index 21d09512..77447e8f 100644
--- a/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
+++ b/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
@@ -120,7 +120,6 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
     del spec_llm
 
 
-@pytest.mark.skip(reason="Failed with CANN8.5, fix me")
 @pytest.mark.parametrize("model_name", MODELS_EAGLE)
 @pytest.mark.parametrize("model_name_main", MODELS_MAIN)
 @pytest.mark.parametrize("num_speculative_tokens", [1, 2])
@@ -169,7 +168,6 @@ def test_llama_qwen3_eagle_correctness(
             "draft_tensor_parallel_size": draft_tensor_parallel_size,
             "max_model_len": 128,
-            "draft_vocab_size": 128256,
         },
         compilation_config=CompilationConfig(
             cudagraph_mode="FULL_DECODE_ONLY",
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 73a080b4..7609e681 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -417,6 +417,21 @@ class EagleProposer(VllmEagleProposer):
         self.input_ids[last_token_indices] = next_token_ids
         if self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[num_tokens]
+            if not (self.speculative_config.disable_padded_drafter_batch and
+                    self.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE):
+                # TODO: Due to the inconsistency between the proposer dispatcher and
+                # the model runner, this padding should be done in the model runner but
+                # currently is not: at the prefill stage the target model runs in eager
+                # mode, so `_pad_query_start_loc_for_fia` is never called, while the
+                # draft model runs in graph mode, so pad `query_start_loc` here instead.
+                num_reqs_padded = self.runner._pad_query_start_loc_for_fia(
+                    num_input_tokens, common_attn_metadata.num_reqs,
+                    common_attn_metadata.num_reqs)
+                common_attn_metadata.num_reqs = num_reqs_padded
+                common_attn_metadata.query_start_loc = \
+                    self.runner.query_start_loc.gpu[:num_reqs_padded + 1]
+                common_attn_metadata.query_start_loc_cpu = \
+                    self.runner.query_start_loc.cpu[:num_reqs_padded + 1]
         else:
             num_input_tokens = num_tokens
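
To make the `queryT == actualSequenceLengthQ[-1]` requirement concrete, here is a minimal sketch of the invariant the patch enforces. `pad_query_start_loc_for_fia` below is a hypothetical standalone stand-in for the runner's `_pad_query_start_loc_for_fia` (whose real signature and behavior live in the model runner, not here); it only illustrates why `query_start_loc` must be extended so its last element equals the padded token count when the graph dispatcher rounds the batch up to a captured size.

```python
import torch


def pad_query_start_loc_for_fia(
        query_start_loc: torch.Tensor,  # [num_reqs + 1] cumulative query lengths
        num_reqs: int,
        num_input_tokens: int) -> tuple[torch.Tensor, int]:
    """Extend query_start_loc so its last element equals num_input_tokens.

    Hypothetical illustration only: the FIA kernel requires
    queryT == actualSequenceLengthQ[-1], so when the graph dispatcher pads
    the token count past the real total, append one dummy request that
    absorbs the padding tokens.
    """
    num_real_tokens = int(query_start_loc[num_reqs])
    if num_real_tokens == num_input_tokens:
        # Invariant already holds; nothing to pad.
        return query_start_loc[:num_reqs + 1], num_reqs
    # One extra "request" covers the padding tokens
    # [num_real_tokens, num_input_tokens); real requests keep their offsets.
    padded = torch.empty(num_reqs + 2, dtype=query_start_loc.dtype)
    padded[:num_reqs + 1] = query_start_loc[:num_reqs + 1]
    padded[num_reqs + 1] = num_input_tokens
    return padded, num_reqs + 1


# Two real requests (3 + 4 = 7 tokens) padded to a captured graph size of 8:
loc, n = pad_query_start_loc_for_fia(torch.tensor([0, 3, 7]),
                                     num_reqs=2, num_input_tokens=8)
assert loc.tolist() == [0, 3, 7, 8] and n == 3
```

Attributing the padding tokens to a single appended dummy request keeps the cumulative-lengths layout intact: every real request keeps its original offsets, and only the tail segment grows to cover the graph-capture padding.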