diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
index 29ec5793..97bfb03d 100644
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -192,12 +192,13 @@ class ACLGraphWrapper:
                     f"got {new_input_addresses}")

         logger.info_once("Replaying aclgraph")
-        # In async scheduling or multi-threaded (MT) scenarios, it is possible that
+        # In async scheduling or multi-threaded (MT) scenarios when graph mode is FULL, it is possible that
         # the CPU's record event (from update_attn_params) for iteration i completes
         # before the graph replay of iteration i-1.
         # To ensure proper ordering, we must call synchronize here before replaying,
         # so that update_attn_params only executes after the previous graph replay has fully completed.
-        torch.npu.synchronize()
+        if self.runtime_mode == CUDAGraphMode.FULL:
+            torch.npu.synchronize()
         entry.aclgraph.replay()

         return entry.output
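
For context, below is a minimal self-contained sketch of the replay path after this change. The names `runtime_mode`, `CUDAGraphMode.FULL`, `torch.npu.synchronize()`, `entry.aclgraph.replay()`, `entry.output`, and `update_attn_params` come from the diff; the class scaffolding, the method name `replay`, and the locally defined `CUDAGraphMode` enum values are illustrative assumptions, not the actual vllm-ascend implementation.

# Illustrative sketch only. CUDAGraphMode is redefined here as a stand-in
# for vLLM's enum so the example is self-contained; torch.npu is available
# once the torch_npu plugin is installed (as in vllm-ascend).
import enum

import torch


class CUDAGraphMode(enum.Enum):  # stand-in for vLLM's CUDAGraphMode
    NONE = 0
    PIECEWISE = 1
    FULL = 2


class ACLGraphWrapperSketch:  # hypothetical scaffolding, not the real wrapper

    def __init__(self, runtime_mode: CUDAGraphMode):
        self.runtime_mode = runtime_mode

    def replay(self, entry):
        # With async scheduling, the CPU can reach update_attn_params for
        # iteration i while the NPU is still replaying iteration i-1. Only
        # FULL mode replays attention inside the captured graph, so only
        # FULL mode needs the barrier; other modes now skip the device-wide
        # synchronize and avoid its cost.
        if self.runtime_mode == CUDAGraphMode.FULL:
            torch.npu.synchronize()  # wait for the previous replay to finish
        entry.aclgraph.replay()
        return entry.output

The design choice here is to pay the full-device synchronization cost only in the one mode (FULL) where the CPU-side attention-parameter update can actually race with an in-flight graph replay, rather than unconditionally on every replay as before.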