diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 39bba024..e9173588 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -128,7 +128,7 @@ def test_chunked_prefill_with_scheduler_dynamic_batch(
     )
 
 
-def test_async_scheduling() -> None:
+def test_async_scheduling_eager() -> None:
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -148,3 +148,25 @@ def test_async_scheduling() -> None:
             async_scheduling=True,
     ) as vllm_model:
         vllm_model.generate(prompts, sampling_params=sampling_params)
+
+
+def test_async_scheduling_with_full_graph() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner("Qwen/Qwen3-8B",
+                    max_model_len=4096,
+                    max_num_seqs=50,
+                    dtype="bfloat16",
+                    gpu_memory_utilization=0.9,
+                    async_scheduling=True,
+                    compilation_config={"cudagraph_mode":
+                                        "FULL"}) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
index 3cb0613f..6aaccc63 100644
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -186,6 +186,12 @@ class ACLGraphWrapper:
                     f"got {new_input_addresses}")
 
         logger.info_once("Replaying aclgraph")
+        # In async scheduling or multi-threaded (MT) scenarios, the CPU's record
+        # event (from update_attn_params) for iteration i can complete before the
+        # graph replay of iteration i-1. To ensure proper ordering, synchronize
+        # here before replaying, so that update_attn_params only executes after
+        # the previous graph replay has fully completed.
+        torch.npu.synchronize()
         entry.aclgraph.replay()
         return entry.output
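
Illustration (not part of the patch): the comment added to acl_graph.py reasons about host/device ordering under async scheduling, where the host-side attention-parameter update for iteration i can overtake the still-running graph replay of iteration i-1. The sketch below is a minimal, hypothetical stand-in, not vllm-ascend code: FakeStream, replay, and update_attn_params are invented names, and a plain worker thread plays the role of the NPU stream, to show why a host-side synchronize before each replay restores the ordering that the new torch.npu.synchronize() call enforces.

# Minimal sketch of the ordering hazard; names and timings are hypothetical,
# not vllm-ascend APIs.
import queue
import threading
import time


class FakeStream:
    """Worker thread that runs enqueued work in order, standing in for the NPU stream."""

    def __init__(self) -> None:
        self._work: queue.Queue = queue.Queue()
        threading.Thread(target=self._run, daemon=True).start()

    def _run(self) -> None:
        while True:
            fn = self._work.get()
            fn()
            self._work.task_done()

    def launch(self, fn) -> None:
        # Returns immediately, like an asynchronous graph-replay launch.
        self._work.put(fn)

    def synchronize(self) -> None:
        # Blocks the host until all previously enqueued work has finished,
        # analogous to torch.npu.synchronize().
        self._work.join()


stream = FakeStream()
log = []


def replay(step: int) -> None:
    time.sleep(0.05)  # pretend the graph replay takes a while on the device
    log.append(f"replay[{step}] done")


def update_attn_params(step: int) -> None:
    # Host-side update of the graph's attention buffers for this step.
    log.append(f"update_attn_params[{step}]")


for step in range(3):
    # Analogue of the patched code path: wait for replay[step - 1] to finish
    # before the host touches the attention params for this step. Without this
    # synchronize, update_attn_params[step] could land mid-replay of step - 1.
    stream.synchronize()
    update_attn_params(step)
    stream.launch(lambda s=step: replay(s))

stream.synchronize()
print("\n".join(log))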