diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index c56a4562..83e10521 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -594,6 +594,8 @@ class NPUModelRunner(GPUModelRunner):
                                                       req_indices, positions_np)
         self.input_batch.block_table.commit_slot_mapping(
             total_num_scheduled_tokens)
+
+        total_num_pcp_pads = 0
         if self.pcp_size > 1:
             if not self.vllm_config.model_config.use_mla:
                 self.generate_kv_idx(scheduler_output)
@@ -601,18 +603,21 @@ class NPUModelRunner(GPUModelRunner):
                 tokens)
             num_scheduled_tokens = np.array(tokens, dtype=np.int32)
             total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
+            total_num_pcp_pads = torch.sum(self.num_pcp_pads).item()
         else:
             position_pcp, pcp_unpad_mask = None, None
         self.num_pcp_pads = self.num_pcp_pads[:num_reqs]
-        total_num_pcp_pads = sum(self.num_pcp_pads)
         max_num_scheduled_tokens = max(tokens)
-        num_valid_tokens = np.array([
-            num_tokens -
-            len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))
-            for num_tokens, i in zip(tokens, req_ids)
-        ],
-                                    dtype=np.int32)
+        if not scheduler_output.scheduled_spec_decode_tokens:
+            num_valid_tokens = np.array(tokens, dtype=np.int32)
+        else:
+            num_valid_tokens = np.array([
+                num_tokens -
+                len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))
+                for num_tokens, i in zip(tokens, req_ids)
+            ],
+                                        dtype=np.int32)
         if (self.use_aclgraph and total_num_scheduled_tokens
                 <= self.cudagraph_batch_sizes[-1]):
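
Notes on the change (not part of the patch): the diff hoists `total_num_pcp_pads = 0` above the `pcp_size` branch so the variable is always defined, computes it via `torch.sum(self.num_pcp_pads).item()` only inside the branch that needs it (instead of a Python-level `sum()` over the tensor on every step), and adds a fast path for `num_valid_tokens` that skips the per-request dict lookups when no speculative-decode tokens are scheduled. A minimal sketch of that fast path, using made-up toy values in place of the runner's real state:

    import numpy as np

    # Toy stand-ins for the runner's state (hypothetical values, for
    # illustration only).
    tokens = [8, 4, 16]                    # tokens scheduled per request
    req_ids = ["r0", "r1", "r2"]
    scheduled_spec_decode_tokens = {}      # empty on the common non-spec path

    if not scheduled_spec_decode_tokens:
        # Fast path: every scheduled token is valid, so one vectorized
        # conversion replaces the per-request dict lookups.
        num_valid_tokens = np.array(tokens, dtype=np.int32)
    else:
        # Slow path: subtract each request's spec-decode tokens.
        num_valid_tokens = np.array([
            n - len(scheduled_spec_decode_tokens.get(i, []))
            for n, i in zip(tokens, req_ids)
        ], dtype=np.int32)

    assert num_valid_tokens.tolist() == [8, 4, 16]

Since speculative decoding is off in the common case, the empty-dict check makes the frequent path a single vectorized `np.array` call while preserving the original per-request arithmetic when spec-decode tokens are present.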