From aef9d4249de8dc1ebf7d48dded296d7183086a27 Mon Sep 17 00:00:00 2001
From: Cao Yi
Date: Mon, 9 Mar 2026 14:46:37 +0800
Subject: [PATCH] [Perf] Avoid CPU sync in mrope_positions copy by using full
 tensor copy (#7014)

### What this PR does / why we need it?
The index-select operation
`mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(...)` triggers a
CPU-NPU synchronization, which blocks subsequent operator dispatch and causes
bubbles visible in Profiling. This PR changes to a full tensor copy
(`mrope_positions.gpu.copy_(mrope_positions.cpu)`) to eliminate the sync
point. The trade-off is a negligible amount of extra data copied, since
`mrope_positions.cpu` is a small tensor.

**Result:** ~2-3% TPOT improvement with the profiling bubbles eliminated.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Verified via Profiling that the CPU sync bubble is eliminated and TPOT is
reduced by 2-3%.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7

Signed-off-by: SlightwindSec
Co-authored-by: wanghuanjun2113
---
 vllm_ascend/worker/model_runner_v1.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 51b345a7..47c7c5a6 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -723,8 +723,8 @@ class NPUModelRunner(GPUModelRunner):
         if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self._calc_mrope_positions(scheduler_output)
-            self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
-                self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
+            self.mrope_positions.gpu.copy_(
+                self.mrope_positions.cpu,
                 non_blocking=True,
             )
         elif self.uses_xdrope_dim > 0: