[Perf] fix async copy for async scheduling (#4113)
### What this PR does / why we need it?
Only CPU tensors allocated with `pin_memory=True` can be copied to the
device asynchronously. There are currently two places where non-pinned
CPU tensors are copied to the device; each of these copies degenerates
into a synchronous operation and erodes the expected benefit of
asynchronous scheduling.
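As a minimal illustration of the underlying behaviour (not code from this PR, and assuming a CUDA device for brevity; the same reasoning applies to NPU via `torch_npu`), `non_blocking=True` only makes a host-to-device copy truly asynchronous when the source tensor lives in pinned memory:

```python
import torch

# A pageable (default) CPU tensor: .to(..., non_blocking=True) falls back
# to a synchronous copy, so the host stalls until the transfer finishes.
pageable = torch.zeros(1 << 20, dtype=torch.int32)
a = pageable.to("cuda", non_blocking=True)

# A pinned CPU tensor: the copy is enqueued on the current stream and
# returns immediately, letting the host keep scheduling the next step.
pinned = torch.zeros(1 << 20, dtype=torch.int32, pin_memory=True)
b = pinned.to("cuda", non_blocking=True)
```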
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
```diff
@@ -100,7 +100,7 @@ class BlockTable:
         self.slot_mapping_cpu = torch.zeros(
             self.max_num_batched_tokens +
             2 * self.pcp_world_size * self.max_num_reqs,
-            dtype=torch.int64,
+            dtype=torch.int32,
             device="cpu",
             pin_memory=self.pin_memory)
         self.slot_mapping_np = self.slot_mapping_cpu.numpy()
```
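Two things are worth noting about the `BlockTable` hunk above. First, switching the dtype from `int64` to `int32` halves the bytes moved per slot-mapping transfer. Second, the buffer is already allocated pinned, and `.numpy()` returns a zero-copy view of it, so values written through NumPy land directly in pinned memory and are ready for an async copy. A standalone sketch of that pattern (the size here is hypothetical):

```python
import torch

# Pinned CPU staging buffer, as in BlockTable (1024 is a made-up size).
buf = torch.zeros(1024, dtype=torch.int32, device="cpu", pin_memory=True)

# .numpy() is a zero-copy view: writes through NumPy go straight
# into the pinned buffer's storage.
view = buf.numpy()
view[:4] = [7, 8, 9, 10]
assert buf[0].item() == 7  # the tensor sees the NumPy writes
```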
```diff
@@ -1866,11 +1866,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 logits_indices = torch.from_numpy(
                     cu_num_tokens
                 ) * self.pcp_size - self.num_pcp_pads[:num_reqs] - 1
-                logits_indices = logits_indices.to(self.device,
-                                                   non_blocking=True)
             else:
-                logits_indices = torch.from_numpy(cu_num_tokens - 1).to(
-                    self.device, non_blocking=True)
+                logits_indices = torch.from_numpy(cu_num_tokens - 1)
+            logits_indices = logits_indices.pin_memory().to(
+                self.device, non_blocking=True)
         else:
             logits_indices = self.query_start_loc[1:num_reqs + 1] - 1
     else:
         # Get the number of draft tokens for each request.
         # Iterate over the dictionary rather than all requests since not all
```
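The key change in this hunk is that both branches now route the freshly built `logits_indices` through `Tensor.pin_memory()` before the device copy. Tensors produced by `torch.from_numpy()` wrap pageable host memory, so the previous direct `.to(self.device, non_blocking=True)` was silently synchronous. A condensed sketch of the fixed pattern, with placeholder data and the surrounding control flow omitted:

```python
import numpy as np
import torch

cu_num_tokens = np.arange(1, 9, dtype=np.int64)  # placeholder data

# from_numpy() wraps pageable host memory; copying it to the device
# with non_blocking=True would still block.
logits_indices = torch.from_numpy(cu_num_tokens - 1)

# pin_memory() stages the data into freshly allocated pinned memory
# (requires an accelerator-enabled build); from there the transfer can
# genuinely overlap with other host work.
logits_indices = logits_indices.pin_memory()
# logits_indices = logits_indices.to(device, non_blocking=True)  # now truly async
```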