From 1140789e83f76ae96bf7b64f1047ea01924fe06c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E8=84=B8=E7=94=B7?= <244036962@qq.com>
Date: Wed, 7 Jan 2026 15:57:16 +0800
Subject: [PATCH] [Bugfix] Fix the graph capture failure in the eagle3+full
 scenario. (#5553)

### What this PR does / why we need it?
When the service is launched with `cudagraph_mode` set to FULL and Eagle3 speculative decoding enabled, an error in the FIA (fused infer attention) operator causes graph capture to fail. This PR fixes that issue by giving the Eagle proposer its own `query_start_loc` buffer instead of overwriting the runner's shared one.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731

Signed-off-by: WithHades <244036962@qq.com>
---
 vllm_ascend/spec_decode/eagle_proposer.py | 13 +++++++++----
 vllm_ascend/worker/model_runner_v1.py     |  9 +--------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index b88a5ba9..7fede206 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -73,7 +73,8 @@ class EagleProposer(VllmEagleProposer):
         self.pcp_size = self.runner.pcp_size
 
         self.decode_threshold = 1 + self.num_speculative_tokens
-
+        self.query_start_loc = self.runner._make_buffer(
+            self.runner.max_num_reqs + 1, dtype=torch.int32)
         self.arange_cpu = torch.arange(self.arange.shape[0],
                                        device="cpu",
                                        dtype=torch.int32)
@@ -200,10 +201,14 @@ class EagleProposer(VllmEagleProposer):
             num_computed_tokens_cpu = (
                 self.runner.input_batch.
                 num_computed_tokens_cpu_tensor[:num_reqs])
+            self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
+                [0] + self.runner.actual_seq_lengths_q[:num_reqs],
+                device="cpu",
+                dtype=torch.int32)
+            self.query_start_loc.copy_to_gpu()
             common_attn_metadata = AscendCommonAttentionMetadata(
-                query_start_loc=self.runner.query_start_loc.gpu[:num_reqs + 1],
-                query_start_loc_cpu=self.runner.query_start_loc.cpu[:num_reqs +
-                                                                    1],
+                query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
+                query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1],
                 seq_lens_cpu=self.runner.seq_lens.cpu,
                 seq_lens=self.runner.seq_lens.gpu[:num_reqs],
                 num_reqs=num_reqs,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index d4d901b2..70f60012 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1939,14 +1939,7 @@ class NPUModelRunner(GPUModelRunner):
                 [0] * dcp_world_size for _ in range(pcp_world_size)
             ] for _ in range(num_tokens)]
             long_seq_metadata.num_computed_tokens_of_pcp_dcp = num_computed_tokens_of_pcp_dcp
-        # QUESTION: Why do we separately set query_start_loc for spec in the first place?
-        # While in _prepare_inputs we don't?
-        if self.speculative_config:
-            self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
-                [0] + self.actual_seq_lengths_q[:num_reqs],
-                device="cpu",
-                dtype=torch.int32)
-            self.query_start_loc.copy_to_gpu()
+
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
             query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
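
For context, here is a minimal sketch of the kind of configuration that exercises this path, using the vLLM Python API. The model names and token count are illustrative placeholders, not the setup used to validate this patch.

```python
from vllm import LLM, SamplingParams

# Hypothetical target/draft pairing; substitute real Eagle3 draft weights.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    speculative_config={
        "method": "eagle3",
        "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
        "num_speculative_tokens": 2,
    },
    # FULL captures the entire forward pass, attention included, so the
    # FIA operator runs inside graph capture: the failing path this fixes.
    compilation_config={"cudagraph_mode": "FULL"},
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)
```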
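The core of the fix is buffer isolation: the drafting pass writes its own `query_start_loc` rather than mutating the runner's shared buffer, whose contents the captured FULL graph replays. Below is a minimal, self-contained sketch of that pattern. `CpuGpuBuffer` is an illustrative stand-in for the host/device buffer pair returned by the runner's `_make_buffer`; the real class in vllm-ascend may differ.

```python
import torch


class CpuGpuBuffer:
    """Paired host staging tensor and device tensor of equal shape.

    Illustrative stand-in, not the actual vllm-ascend buffer class.
    """

    def __init__(self, size: int, dtype: torch.dtype, device: str = "cpu"):
        self.cpu = torch.zeros(size, dtype=dtype)
        # "cpu" keeps the sketch runnable anywhere; an NPU build would
        # allocate this tensor on the "npu" device instead.
        self.gpu = torch.zeros(size, dtype=dtype, device=device)

    def copy_to_gpu(self) -> None:
        self.gpu.copy_(self.cpu, non_blocking=True)


class Proposer:
    def __init__(self, max_num_reqs: int):
        # Proposer-owned buffer, mirroring the patch: writes here cannot
        # clobber the runner's query_start_loc between capture and replay.
        self.query_start_loc = CpuGpuBuffer(max_num_reqs + 1, torch.int32)

    def fill(self, cumulative_q_ends: list[int]) -> torch.Tensor:
        # Prepend 0 to the cumulative per-request query end offsets,
        # mirroring `[0] + actual_seq_lengths_q[:num_reqs]` in the patch.
        num_reqs = len(cumulative_q_ends)
        self.query_start_loc.cpu[:num_reqs + 1] = torch.tensor(
            [0] + cumulative_q_ends, dtype=torch.int32)
        self.query_start_loc.copy_to_gpu()
        return self.query_start_loc.gpu[:num_reqs + 1]


proposer = Proposer(max_num_reqs=8)
# e.g. three requests whose cumulative query end offsets are 3, 5, 9
print(proposer.fill([3, 5, 9]))  # tensor([0, 3, 5, 9], dtype=torch.int32)
```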