From 6bc770cd78eefd9cb2a9ac10fe4cae9ce4a6e700 Mon Sep 17 00:00:00 2001 From: realliujiaxu Date: Thu, 13 Nov 2025 09:11:26 +0800 Subject: [PATCH] [Perf] fix async copy for async scheduling (#4113) ### What this PR does / why we need it? Only CPU tensors with `pin_memory=True` can be asynchronously copied to the device. Currently, there are two instances where non-pinned CPU tensors are copied to the device, which triggers synchronous operations and reduces the expected benefits of asynchronous scheduling. The fix pins `logits_indices` before its non-blocking copy in the branch that scales by `self.pcp_size`, computes it as `self.query_start_loc[1:num_reqs + 1] - 1` in the other branch instead of copying `cu_num_tokens - 1` from the host, and changes the dtype of `slot_mapping_cpu` from `torch.int64` to `torch.int32`. - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac Signed-off-by: realliujiaxu --- vllm_ascend/worker/block_table.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/worker/block_table.py b/vllm_ascend/worker/block_table.py index c33a8afa..da0cb543 100644 --- a/vllm_ascend/worker/block_table.py +++ b/vllm_ascend/worker/block_table.py @@ -100,7 +100,7 @@ class BlockTable: self.slot_mapping_cpu = torch.zeros( self.max_num_batched_tokens + 2 * self.pcp_world_size * self.max_num_reqs, - dtype=torch.int64, + dtype=torch.int32, device="cpu", pin_memory=self.pin_memory) self.slot_mapping_np = self.slot_mapping_cpu.numpy() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bf013c28..a332ac79 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1866,11 +1866,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): logits_indices = torch.from_numpy( cu_num_tokens ) * self.pcp_size - self.num_pcp_pads[:num_reqs] - 1 - logits_indices = logits_indices.to(self.device, - non_blocking=True) - else: - logits_indices = torch.from_numpy(cu_num_tokens - 1).to( + logits_indices = logits_indices.pin_memory().to( self.device, non_blocking=True) + else: + logits_indices = self.query_start_loc[1:num_reqs + 1] - 1 else: # Get the number of draft 
tokens for each request. # Iterate over the dictionary rather than all requests since not all