From e7a13beedb74b39dffd9f9dd1dd47bed32360206 Mon Sep 17 00:00:00 2001 From: IWantFight <43377438+IWantFight@users.noreply.github.com> Date: Wed, 4 Feb 2026 10:59:45 +0800 Subject: [PATCH] [Bugfix] Synchronize only the current stream to avoid device sync (#6432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Following [PR #4233](https://github.com/vllm-project/vllm-ascend/pull/4233), a synchronization mechanism was introduced between steps in asynchronous scheduling with ACL Graph to address a hanging issue. However, full device-level synchronization is unnecessary—only the operations on the current stream need to be synchronized. Otherwise, if other background operations (such as send and recv) are running concurrently, they may negatively impact inference performance for the instance. Hang problem: ![c4bbfac9a9088acec0ad335b4c2af437](https://github.com/user-attachments/assets/b7c8c612-4d45-48ec-9465-954869f9643d) Synchronizing only the current stream can also resolve the hang issue. ### Does this PR introduce any user-facing change? No ### How was this patch tested? - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: For_YL Co-authored-by: For_YL --- vllm_ascend/compilation/acl_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py index ed8673d2..93c32ce7 100644 --- a/vllm_ascend/compilation/acl_graph.py +++ b/vllm_ascend/compilation/acl_graph.py @@ -196,7 +196,7 @@ class ACLGraphWrapper: else False ) if self.runtime_mode != CUDAGraphMode.FULL or not forward_context.is_draft_model or not use_eagle: - torch.npu.synchronize() + torch.npu.current_stream().synchronize() entry.aclgraph.replay() return entry.output