[Bugfix] fix the incorrect use of python's sum on tensors. (#4655)
### What this PR does / why we need it?
Fix the incorrect use of python's sum function on PyTorch tensors.
1. Using Python's built-in `sum()` on the PyTorch tensor `self.num_pcp_pads`
took ~6 ms per call.
Optimization: replacing it with `torch.sum(...).item()` reduced the
execution time to ~474 µs.
2. `scheduler_output.scheduled_spec_decode_tokens` was looped over on every
step even when speculative decoding is not in use.
Optimization: added a conditional fast path that skips the per-request loop
entirely when speculative decoding is disabled, eliminating the unnecessary
computational overhead.
- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main:
86e178f7c4
Signed-off-by: wangx700 <wangxin700@huawei.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
This commit is contained in:
@@ -594,6 +594,8 @@ class NPUModelRunner(GPUModelRunner):
             req_indices, positions_np)
         self.input_batch.block_table.commit_slot_mapping(
             total_num_scheduled_tokens)
+
+        total_num_pcp_pads = 0
         if self.pcp_size > 1:
             if not self.vllm_config.model_config.use_mla:
                 self.generate_kv_idx(scheduler_output)
||||||
@@ -601,12 +603,15 @@ class NPUModelRunner(GPUModelRunner):
                 tokens)
             num_scheduled_tokens = np.array(tokens, dtype=np.int32)
             total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs])
+            total_num_pcp_pads = torch.sum(self.num_pcp_pads).item()
         else:
             position_pcp, pcp_unpad_mask = None, None
         self.num_pcp_pads = self.num_pcp_pads[:num_reqs]

-        total_num_pcp_pads = sum(self.num_pcp_pads)
         max_num_scheduled_tokens = max(tokens)
+        if not scheduler_output.scheduled_spec_decode_tokens:
+            num_valid_tokens = np.array(tokens, dtype=np.int32)
+        else:
             num_valid_tokens = np.array([
                 num_tokens -
                 len(scheduler_output.scheduled_spec_decode_tokens.get(i, []))
|||||||
Reference in New Issue
Block a user