diff --git a/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py b/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
index 21d09512..77447e8f 100644
--- a/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
+++ b/tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
@@ -120,7 +120,6 @@ def test_deepseek_mtp_correctness(model_name: str, num_speculative_tokens: int,
     del spec_llm
 
 
-@pytest.mark.skip(reason="Failed with CANN8.5, fix me")
 @pytest.mark.parametrize("model_name", MODELS_EAGLE)
 @pytest.mark.parametrize("model_name_main", MODELS_MAIN)
 @pytest.mark.parametrize("num_speculative_tokens", [1, 2])
@@ -169,7 +168,6 @@ def test_llama_qwen3_eagle_correctness(
                         "draft_tensor_parallel_size":
                         draft_tensor_parallel_size,
                         "max_model_len": 128,
-                        "draft_vocab_size": 128256,
                     },
                     compilation_config=CompilationConfig(
                         cudagraph_mode="FULL_DECODE_ONLY",
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 73a080b4..7609e681 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -417,6 +417,21 @@ class EagleProposer(VllmEagleProposer):
         self.input_ids[last_token_indices] = next_token_ids
         if self.use_cuda_graph and num_tokens <= self.runner.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.runner.cudagraph_dispatcher._bs_to_padded_graph_size[num_tokens]
+            if not (
+                self.speculative_config.disable_padded_drafter_batch
+                and self.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+            ):
+                # TODO: Because the proposer's `dispatcher` and the model runner are inconsistent, this padding
+                # should have been done in the model runner but is not. For example, at the prefill stage the
+                # target model currently runs in eager mode, so `_pad_query_start_loc_for_fia` is not called,
+                # while the draft model runs in graph mode, so we must pad `query_start_loc` here.
+                # This needs to be fixed in the future.
+                num_reqs_padded = self.runner._pad_query_start_loc_for_fia(
+                    num_input_tokens, common_attn_metadata.num_reqs, common_attn_metadata.num_reqs
+                )
+                common_attn_metadata.num_reqs = num_reqs_padded
+                common_attn_metadata.query_start_loc = self.runner.query_start_loc.gpu[: num_reqs_padded + 1]
+                common_attn_metadata.query_start_loc_cpu = self.runner.query_start_loc.cpu[: num_reqs_padded + 1]
         else:
             num_input_tokens = num_tokens