diff --git a/tests/e2e/singlecard/test_ascend_scheduler.py b/tests/e2e/singlecard/test_ascend_scheduler.py
index 39bba024..e9173588 100644
--- a/tests/e2e/singlecard/test_ascend_scheduler.py
+++ b/tests/e2e/singlecard/test_ascend_scheduler.py
@@ -128,7 +128,7 @@ def test_chunked_prefill_with_scheduler_dynamic_batch(
     )
 
 
-def test_async_scheduling() -> None:
+def test_async_scheduling_eager() -> None:
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -148,3 +148,25 @@ def test_async_scheduling() -> None:
             async_scheduling=True,
     ) as vllm_model:
         vllm_model.generate(prompts, sampling_params=sampling_params)
+
+
+def test_async_scheduling_with_full_graph() -> None:
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 10
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=10,
+                                     stop_token_ids=None)
+
+    with VllmRunner("Qwen/Qwen3-8B",
+                    max_model_len=4096,
+                    max_num_seqs=50,
+                    dtype="bfloat16",
+                    gpu_memory_utilization=0.9,
+                    async_scheduling=True,
+                    compilation_config={"cudagraph_mode":
+                                        "FULL"}) as vllm_model:
+        vllm_model.generate(prompts, sampling_params=sampling_params)
diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py
index 3cb0613f..6aaccc63 100644
--- a/vllm_ascend/compilation/acl_graph.py
+++ b/vllm_ascend/compilation/acl_graph.py
@@ -186,6 +186,12 @@ class ACLGraphWrapper:
                     f"got {new_input_addresses}")
 
         logger.info_once("Replaying aclgraph")
+        # In async scheduling or multi-threaded (MT) scenarios, the CPU's record
+        # event (from update_attn_params) for iteration i can complete before the
+        # graph replay of iteration i-1. To ensure proper ordering, synchronize
+        # here before replaying, so that update_attn_params only executes after
+        # the previous graph replay has fully completed.
+        torch.npu.synchronize()
         entry.aclgraph.replay()
         return entry.output
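
Illustration (not part of the patch): the comment added to acl_graph.py reasons about host/device ordering under async scheduling, where the host-side attention-parameter update for iteration i can overtake the still-running graph replay of iteration i-1. The sketch below is a minimal, hypothetical stand-in, not vllm-ascend code: FakeStream, replay, and update_attn_params are invented names, and a plain worker thread plays the role of the NPU stream, to show why a host-side synchronize before each replay restores the ordering that the new torch.npu.synchronize() call enforces.

# Minimal sketch of the ordering hazard; names and timings are hypothetical,
# not vllm-ascend APIs.
import queue
import threading
import time


class FakeStream:
    """Worker thread that runs enqueued work in order, standing in for the NPU stream."""

    def __init__(self) -> None:
        self._work: queue.Queue = queue.Queue()
        threading.Thread(target=self._run, daemon=True).start()

    def _run(self) -> None:
        while True:
            fn = self._work.get()
            fn()
            self._work.task_done()

    def launch(self, fn) -> None:
        # Returns immediately, like an asynchronous graph-replay launch.
        self._work.put(fn)

    def synchronize(self) -> None:
        # Blocks the host until all previously enqueued work has finished,
        # analogous to torch.npu.synchronize().
        self._work.join()


stream = FakeStream()
log = []


def replay(step: int) -> None:
    time.sleep(0.05)  # pretend the graph replay takes a while on the device
    log.append(f"replay[{step}] done")


def update_attn_params(step: int) -> None:
    # Host-side update of the graph's attention buffers for this step.
    log.append(f"update_attn_params[{step}]")


for step in range(3):
    # Analogue of the patched code path: wait for replay[step - 1] to finish
    # before the host touches the attention params for this step. Without this
    # synchronize, update_attn_params[step] could land mid-replay of step - 1.
    stream.synchronize()
    update_attn_params(step)
    stream.launch(lambda s=step: replay(s))

stream.synchronize()
print("\n".join(log))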