From a6f6e919e6928e8a23137970cf570eba27524fb1 Mon Sep 17 00:00:00 2001
From: drslark <96540755+drslark@users.noreply.github.com>
Date: Mon, 16 Mar 2026 20:41:36 +0800
Subject: [PATCH] [main][bugfix] Fixed the problem that eagle3 will crash in
 FULL_DECODE_ONLY (#7290)

### What this PR does / why we need it?

This PR fixes two problems. Both occur in `FULL_DECODE_ONLY` mode, where `num_tokens` must be padded to a value in `cudagraph_capture_sizes`.

1. The `seq_lens_list` in the drafter's `attn_metadata` is one element shorter than expected, which raises a kernel exception and crashes vLLM. E.g., with `num_reqs` = 3 and `cudagraph_capture_sizes` = [20], `actual_seq_lengths_q` is correctly padded to [4, 8, 12, 20], but `seq_lens_list` = [5742, 4700, 7996] is not padded.
2. Although the `seq_lens_list` in the target's `attn_metadata` has the expected length in `FULL_DECODE_ONLY`, the data at the end of the list is corrupted. E.g., with `num_reqs` = 3 and `cudagraph_capture_sizes` = [20], `actual_seq_lengths_q` is correctly padded to [4, 8, 12, 20], but `seq_lens_list` = [5742, 4700, 7996, 5738] ends with a corrupted value.

- vLLM version: v0.17.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

Signed-off-by: drslark
---
 vllm_ascend/spec_decode/eagle_proposer.py | 2 ++
 vllm_ascend/worker/model_runner_v1.py     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index aa86823f..00c26e30 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -559,6 +559,8 @@ class SpecDecodeBaseProposer(EagleProposer):
             common_attn_metadata.num_reqs = num_reqs_padded
             common_attn_metadata.query_start_loc = self.runner.query_start_loc.gpu[: num_reqs_padded + 1]
             common_attn_metadata.query_start_loc_cpu = self.runner.query_start_loc.cpu[: num_reqs_padded + 1]
+            common_attn_metadata.seq_lens = self.runner.seq_lens.gpu[:num_reqs_padded]
+            common_attn_metadata.seq_lens_cpu = self.runner.seq_lens.cpu[:num_reqs_padded]
         else:
             num_input_tokens = num_tokens
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 859cef73..fc02fc0a 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -758,11 +758,11 @@ class NPUModelRunner(GPUModelRunner):
         self.gdn_query_start_loc.copy_to_gpu()
 
         self.seq_lens.np[:num_reqs] = self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens
+        self.seq_lens.cpu[num_reqs:].fill_(0)
         self.seq_lens.copy_to_gpu()
 
         # Fill unused with -1. Needed for reshape_and_cache in attention_cp
         self.query_start_loc.gpu[num_reqs + 1 :].fill_(-1)
-        self.seq_lens.gpu[num_reqs:].fill_(0)
 
         # Copy the tensors to the NPU.
         self._prepare_input_ids(scheduler_output, total_num_scheduled_tokens, cu_num_tokens)
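
Reviewer note: below is a minimal, self-contained sketch of the invariant both fixes restore. It is plain Python, not vllm-ascend code; `pad_seq_lens` and the literal lists are hypothetical illustrations. The point: when `num_reqs` is padded up for a captured graph size, every per-request metadata buffer must be cut to the same padded length, with the padded tail zero-filled on the host before the copy to the device, so the attention kernel never sees a short or stale tail.

```python
# Hypothetical sketch of the padding invariant; not vllm-ascend code.

def pad_seq_lens(seq_lens: list[int], num_reqs: int, num_reqs_padded: int) -> list[int]:
    """Keep the first num_reqs real entries, zero-fill the padded tail."""
    assert num_reqs <= num_reqs_padded
    return seq_lens[:num_reqs] + [0] * (num_reqs_padded - num_reqs)

if __name__ == "__main__":
    # Mirrors the example in the description: 3 real requests, padded to 4.
    seq_lens = [5742, 4700, 7996]
    padded = pad_seq_lens(seq_lens, num_reqs=3, num_reqs_padded=4)
    # Without padding, the drafter crashed (problem 1); with a stale device
    # value in the tail, the target saw e.g. [..., 5738] (problem 2).
    assert padded == [5742, 4700, 7996, 0]
    print(padded)
```

Zero-filling the tail on the CPU side before `copy_to_gpu` (rather than patching the GPU tensor afterwards, as the removed `self.seq_lens.gpu[num_reqs:].fill_(0)` did) is what prevents the corrupted trailing value described in problem 2.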