[Bugfix] Fix the incorrect use of Python's sum on tensors (#4655)
### What this PR does / why we need it?
Fix the incorrect use of Python's `sum()` function on PyTorch tensors.
1. Calling Python's `sum()` on the tensor `self.num_pcp_pads` took about 6 ms, because it iterates the tensor element by element.
Optimization: replacing it with PyTorch's `torch.sum()` reduced the execution time to about 474 µs (a minimal illustration follows this list).
2. `scheduler_output.scheduled_spec_decode_tokens` was looped over on every step, even when speculative decoding is not in use.
Optimization: added conditional logic that skips the processing loop when speculative decoding is disabled, eliminating the unnecessary computational overhead (see the second sketch after this list).
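
To illustrate point (1), here is a minimal, self-contained sketch. This is not code from the PR: the tensor contents and size are made up, and the absolute timings will differ by machine. It shows why Python's built-in `sum()` is slow on a tensor — it performs one 0-d tensor add per element — while `torch.sum()` is a single reduction.

```python
import time

import torch

# Hypothetical stand-in for self.num_pcp_pads; the real tensor's size and
# dtype depend on the scheduled batch, so timings here are illustrative only.
num_pcp_pads = torch.arange(1024, dtype=torch.int32)

start = time.perf_counter()
slow = sum(num_pcp_pads)  # Python sum: one 0-d tensor add per element
t_slow = time.perf_counter() - start

start = time.perf_counter()
fast = torch.sum(num_pcp_pads).item()  # single reduction, then one host read
t_fast = time.perf_counter() - start

assert slow.item() == fast
print(f"python sum: {t_slow * 1e3:.3f} ms, torch.sum: {t_fast * 1e6:.0f} us")
```

The 6 ms → 474 µs figures quoted above were measured on the NPU model-runner path; the gap elsewhere will differ, but the shape of the problem (per-element dispatch vs. one fused reduction) is the same.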
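And a sketch of point (2). The helper name `compute_num_valid_tokens` is invented for illustration — in the real change the logic is inline in `NPUModelRunner` (see the diff below) — but the fast path is the same: when no speculative tokens are scheduled, every scheduled token is valid, so the per-request loop can be skipped entirely.

```python
import numpy as np

def compute_num_valid_tokens(tokens, scheduled_spec_decode_tokens):
    """tokens: per-request scheduled token counts.
    scheduled_spec_decode_tokens: dict mapping request index -> draft tokens."""
    if not scheduled_spec_decode_tokens:
        # Speculative decoding disabled: nothing to subtract, skip the loop.
        return np.array(tokens, dtype=np.int32)
    return np.array([
        num_tokens - len(scheduled_spec_decode_tokens.get(i, []))
        for i, num_tokens in enumerate(tokens)
    ], dtype=np.int32)

# Example: three requests, no speculative decoding scheduled.
print(compute_num_valid_tokens([8, 4, 1], {}))  # -> [8 4 1]
```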
- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main: 86e178f7c4
Signed-off-by: wangx700 <wangxin700@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
```diff
@@ -594,6 +594,8 @@ class NPUModelRunner(GPUModelRunner):
                 req_indices, positions_np)
         self.input_batch.block_table.commit_slot_mapping(
             total_num_scheduled_tokens)
+
+        total_num_pcp_pads = 0
         if self.pcp_size > 1:
             if not self.vllm_config.model_config.use_mla:
                 self.generate_kv_idx(scheduler_output)
@@ -601,12 +603,15 @@ class NPUModelRunner(GPUModelRunner):
                 tokens)
             num_scheduled_tokens = np.array(tokens, dtype=np.int32)
             total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
+            total_num_pcp_pads = torch.sum(self.num_pcp_pads).item()
         else:
             position_pcp, pcp_unpad_mask = None, None
         self.num_pcp_pads = self.num_pcp_pads[:num_reqs]
 
-        total_num_pcp_pads = sum(self.num_pcp_pads)
         max_num_scheduled_tokens = max(tokens)
+        if not scheduler_output.scheduled_spec_decode_tokens:
+            num_valid_tokens = np.array(tokens, dtype=np.int32)
+        else:
+            num_valid_tokens = np.array([
+                num_tokens -
+                len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))
```