[Bugfix] fix the incorrect use of python's sum on tensors. (#4655)

### What this PR does / why we need it?
Fix the incorrect use of python's sum function on PyTorch tensors.
1. Calling Python's built-in `sum()` on the tensor `self.num_pcp_pads`
took ~6 ms per call.
Optimization: replacing it with PyTorch's `torch.sum()` reduced the
execution time to ~474 µs.
2. `scheduler_output.scheduled_spec_decode_tokens` was looped over on
every step, even when speculative decoding is not in use.

Optimization: added a conditional check that skips the per-request loop
when speculative decoding is disabled, eliminating unnecessary
computational overhead.


- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main:
86e178f7c4

Signed-off-by: wangx700 <wangxin700@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
wangx700
2025-12-15 19:22:40 +08:00
committed by GitHub
parent 6029bea480
commit 3b7eb5179f

View File

@@ -594,6 +594,8 @@ class NPUModelRunner(GPUModelRunner):
req_indices, positions_np)
self.input_batch.block_table.commit_slot_mapping(
total_num_scheduled_tokens)
total_num_pcp_pads = 0
if self.pcp_size > 1:
if not self.vllm_config.model_config.use_mla:
self.generate_kv_idx(scheduler_output)
@@ -601,12 +603,15 @@ class NPUModelRunner(GPUModelRunner):
tokens) tokens)
num_scheduled_tokens = np.array(tokens, dtype=np.int32)
total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
total_num_pcp_pads = torch.sum(self.num_pcp_pads).item()
else:
position_pcp, pcp_unpad_mask = None, None
self.num_pcp_pads = self.num_pcp_pads[:num_reqs]
total_num_pcp_pads = sum(self.num_pcp_pads)
max_num_scheduled_tokens = max(tokens)
if not scheduler_output.scheduled_spec_decode_tokens:
num_valid_tokens = np.array(tokens, dtype=np.int32)
else:
num_valid_tokens = np.array([
num_tokens -
len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))