From 3b7eb5179f2b5d52e9bd693095d51de604ba5ece Mon Sep 17 00:00:00 2001 From: wangx700 Date: Mon, 15 Dec 2025 19:22:40 +0800 Subject: [PATCH] [Bugfix] fix the incorrect use of Python's sum on tensors. (#4655) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Fix the incorrect use of Python's sum function on PyTorch tensors. 1. Using Python's sum() function on a tensor self.num_pcp_pads resulted in 6ms execution time. Optimization: replacing with PyTorch's torch.sum() reduced execution time to 474µs. 2. scheduler_output.scheduled_spec_decode_tokens undergoes repeated loop processing even when speculative decoding is not used. Optimization: added conditional logic to skip processing loops when speculative decoding is disabled, eliminating unnecessary computational overhead. - vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 - vLLM main: https://github.com/vllm-project/vllm/commit/86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 Signed-off-by: wangx700 Co-authored-by: weijinqian0 <1184188277@qq.com> --- vllm_ascend/worker/model_runner_v1.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c56a4562..83e10521 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -594,6 +594,8 @@ class NPUModelRunner(GPUModelRunner): req_indices, positions_np) self.input_batch.block_table.commit_slot_mapping( total_num_scheduled_tokens) + + total_num_pcp_pads = 0 if self.pcp_size > 1: if not self.vllm_config.model_config.use_mla: self.generate_kv_idx(scheduler_output) @@ -601,18 +603,21 @@ class NPUModelRunner(GPUModelRunner): tokens) num_scheduled_tokens = np.array(tokens, dtype=np.int32) total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) + total_num_pcp_pads = torch.sum(self.num_pcp_pads).item() else: position_pcp, pcp_unpad_mask = 
None, None self.num_pcp_pads = self.num_pcp_pads[:num_reqs] - total_num_pcp_pads = sum(self.num_pcp_pads) max_num_scheduled_tokens = max(tokens) - num_valid_tokens = np.array([ - num_tokens - - len(scheduler_output.scheduled_spec_decode_tokens.get(i, [])) - for num_tokens, i in zip(tokens, req_ids) - ], - dtype=np.int32) + if not scheduler_output.scheduled_spec_decode_tokens: + num_valid_tokens = np.array(tokens, dtype=np.int32) + else: + num_valid_tokens = np.array([ + num_tokens - + len(scheduler_output.scheduled_spec_decode_tokens.get(i, [])) + for num_tokens, i in zip(tokens, req_ids) + ], + dtype=np.int32) if (self.use_aclgraph and total_num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):