diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index aa86823f..00c26e30 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -559,6 +559,8 @@ class SpecDecodeBaseProposer(EagleProposer): common_attn_metadata.num_reqs = num_reqs_padded common_attn_metadata.query_start_loc = self.runner.query_start_loc.gpu[: num_reqs_padded + 1] common_attn_metadata.query_start_loc_cpu = self.runner.query_start_loc.cpu[: num_reqs_padded + 1] + common_attn_metadata.seq_lens = self.runner.seq_lens.gpu[:num_reqs_padded] + common_attn_metadata.seq_lens_cpu = self.runner.seq_lens.cpu[:num_reqs_padded] else: num_input_tokens = num_tokens diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 859cef73..fc02fc0a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -758,11 +758,11 @@ class NPUModelRunner(GPUModelRunner): self.gdn_query_start_loc.copy_to_gpu() self.seq_lens.np[:num_reqs] = self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens + self.seq_lens.cpu[num_reqs:].fill_(0) self.seq_lens.copy_to_gpu() # Fill unused with -1. Needed for reshape_and_cache in attention_cp self.query_start_loc.gpu[num_reqs + 1 :].fill_(-1) - self.seq_lens.gpu[num_reqs:].fill_(0) # Copy the tensors to the NPU. self._prepare_input_ids(scheduler_output, total_num_scheduled_tokens, cu_num_tokens)