From 1cdf9ffa73c9ebde8e576363c9db694272b81762 Mon Sep 17 00:00:00 2001
From: realliujiaxu
Date: Wed, 19 Nov 2025 14:47:19 +0800
Subject: [PATCH] [Bugfix] fix hang in async scheduling (#4233)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

After https://github.com/vllm-project/vllm-ascend/pull/4113, there is no
synchronization between steps. However, in async scheduling with aclgraph, the
CPU's record event for the current iteration can complete before the previous
iteration's graph execution has finished. If the CPU runs far enough ahead, the
device hangs on event_wait in iteration i+1 (assuming that event_record
executes immediately on the device's update stream):

[image: timeline showing the device hang on event_wait in iteration i+1]

After adding the synchronization, the record is launched only after the graph
replay:

[image: timeline after the fix, with event_record following graph replay]

The bubble introduced by the synchronization is about 85 us on G8600:

[image: profile showing the ~85 us synchronization bubble]

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Added the e2e test `test_async_scheduling_with_full_graph`, which exercises
async scheduling with aclgraph in FULL mode (the configuration that hung
before this fix).

- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

---------

Signed-off-by: realliujiaxu
Co-authored-by: hwhaokun
---
 tests/e2e/singlecard/test_ascend_scheduler.py | 24 +++++++++++++++++++++++-
 vllm_ascend/compilation/acl_graph.py          |  6 ++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 39bba024..e9173588 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -128,7 +128,7 @@ def test_chunked_prefill_with_scheduler_dynamic_batch(
     )
 
 
-def test_async_scheduling() -> None:
+def test_async_scheduling_eager() -> None:
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -148,3 +148,25 @@ def test_async_scheduling() -> None:
         async_scheduling=True,
     ) as vllm_model:
         vllm_model.generate(prompts, sampling_params=sampling_params)
+
+
+def test_async_scheduling_with_full_graph() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner("Qwen/Qwen3-8B",
+                    max_model_len=4096,
+                    max_num_seqs=50,
+                    dtype="bfloat16",
+                    gpu_memory_utilization=0.9,
+                    async_scheduling=True,
+                    compilation_config={"cudagraph_mode":
+                                        "FULL"}) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
index 3cb0613f..6aaccc63 100644
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -186,6 +186,12 @@ class ACLGraphWrapper:
                 f"got {new_input_addresses}")
 
         logger.info_once("Replaying aclgraph")
+        # In async scheduling or multi-threaded (MT) scenarios, the CPU's record
+        # event (from update_attn_params) for iteration i may complete before
+        # the graph replay of iteration i-1 has finished.
+        # To ensure proper ordering, synchronize here before replaying, so that
+        # update_attn_params only executes after the previous replay has completed.
+        torch.npu.synchronize()
         entry.aclgraph.replay()
 
         return entry.output
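
Reviewer note: below is a minimal, heavily simplified sketch of the ordering
this patch enforces. It is an illustration only, not the vllm-ascend
implementation: `entry`, `update_stream`, `params_ready`, and `run_one_step`
are hypothetical stand-ins, and it assumes `torch_npu` exposes the CUDA-style
`torch.npu.Stream`/`torch.npu.Event` API.

```python
import torch
import torch_npu  # noqa: F401  # assumed installed; registers torch.npu

update_stream = torch.npu.Stream()  # stand-in for the device's update stream
params_ready = torch.npu.Event()    # the event the captured graph waits on


def run_one_step(entry):
    """One decode step, simplified; `entry.aclgraph` is a captured graph
    whose internal event_wait pairs with the record issued below."""
    # Without this barrier, an async-scheduling CPU can reach the record for
    # step i+1 while the device is still replaying step i; the event_wait
    # captured inside replay i+1 then has no matching record and the device
    # hangs. Synchronizing pins this step's record after the previous replay.
    torch.npu.synchronize()

    with torch.npu.stream(update_stream):
        params_ready.record()  # stands in for update_attn_params' record

    entry.aclgraph.replay()  # the graph event_waits internally, then runs
```

The regression is covered end to end by the new test, e.g.
`pytest tests/e2e/singlecard/test_ascend_scheduler.py -k test_async_scheduling_with_full_graph`.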