[BugFix] fix hang in async scheduling when ENPU is enabled (#8354)

### What this PR does / why we need it?
1. There is no synchronization between steps. In async scheduling with
aclgraph, the CPU can therefore enqueue the event record for the current
iteration before the previous iteration's graph execution has finished on
the device. If the CPU runs far enough ahead (assuming event_record
executes immediately on the device's update stream), the device hangs on
event_wait in iteration i+1; see the sketch below.
2. Under ENPU, the eagle proposers also need to follow the same ordering:
event.record first, then event.wait.
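
A minimal sketch of the hazard and the fix, assuming the `torch_npu` plugin (whose stream/event API mirrors `torch.cuda`); `main_stream`, `update_stream`, and `run_step` are illustrative names, not the actual vllm-ascend identifiers:

```python
import torch
import torch_npu  # registers the torch.npu namespace (assumes torch_npu is installed)

main_stream = torch.npu.current_stream()
update_stream = torch.npu.Stream()
event = torch.npu.Event()

def run_step(graph):
    # Hazard: with async scheduling, the CPU may reach this record() for step
    # i+1 while step i's graph replay is still running on the device. Since
    # the update stream is idle, the record executes immediately, and the
    # wait() below can then block the device indefinitely.
    #
    # Fix (as in this PR): synchronize before replay so the record is ordered
    # after the previous step's graph execution.
    torch.npu.current_stream().synchronize()
    event.record(update_stream)  # record first ...
    event.wait(main_stream)      # ... then wait (same order for eagle proposers under ENPU)
    graph.replay()
```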

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?

---------

Signed-off-by: 1zzk <785396250@qq.com>
Author: 1kzk
Date: 2026-04-18 00:07:15 +08:00
Committed by: GitHub
Parent: f81f9a3c89
Commit: c995a959e6
3 changed files with 72 additions and 28 deletions


@@ -20,8 +20,7 @@ from vllm.logger import logger
 from vllm.platforms import current_platform
 from vllm_ascend.ascend_forward_context import _EXTRA_CTX
-from ..utils import weak_ref_tensors
+from vllm_ascend.utils import weak_ref_tensors

 @dataclasses.dataclass
@@ -66,6 +65,9 @@ class ACLGraphWrapper:
         vllm_config: VllmConfig,
         runtime_mode: CUDAGraphMode,
         cudagraph_options: CUDAGraphOptions | None = None,
+        *,
+        use_eagle: bool = False,
+        enable_enpu: bool = False,
     ):
         self.runnable = runnable
         self.vllm_config = vllm_config
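
For context, a hypothetical call site using the new keyword-only flags might look like the following; only `use_eagle` and `enable_enpu` come from this diff (the `use_eagle` derivation mirrors the code removed below), while `model_forward` and `ascend_config` are illustrative:

```python
# Sketch of a call site; model_forward / ascend_config are assumed names.
spec_cfg = vllm_config.speculative_config
wrapper = ACLGraphWrapper(
    model_forward,
    vllm_config,
    CUDAGraphMode.FULL,
    use_eagle=spec_cfg is not None and spec_cfg.method in ("eagle", "eagle3"),
    enable_enpu=ascend_config.enable_enpu,  # assumed config knob
)
```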
@@ -87,6 +89,8 @@ class ACLGraphWrapper:
         # the entries for different batch descriptors that we need to capture
         # aclgraphs for.
         self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry] = {}
+        self.enable_enpu = enable_enpu
+        self.use_eagle = use_eagle

     def __getattr__(self, key: str):
         # allow accessing the attributes of the runnable.
@@ -197,12 +201,11 @@ class ACLGraphWrapper:
             # so that update_attn_params only executes after the previous graph replay has fully completed.
             # If we are not in the main model but are in full-graph mode when using merge-eagle-graph,
             # we do not need to synchronize.
-            use_eagle = (
-                self.vllm_config.speculative_config.method in ("eagle", "eagle3")
-                if self.vllm_config.speculative_config
-                else False
-            )
-            if self.runtime_mode != CUDAGraphMode.FULL or not _EXTRA_CTX.is_draft_model or not use_eagle:
+            # When enable_enpu is on, model_runner orders update vs replay; skip here.
+            # When FULL + EAGLE draft (merge path), replay does not need this barrier.
+            is_draft_eagle = _EXTRA_CTX.is_draft_model and self.use_eagle
+            need_sync = self.runtime_mode == CUDAGraphMode.FULL and not is_draft_eagle
+            if not self.enable_enpu and need_sync:
                 torch.npu.current_stream().synchronize()
             entry.aclgraph.replay()
         return entry.output
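
To make the new gating explicit, here is a standalone restatement of the condition above as a small decision table; `CUDAGraphMode` is stubbed for self-containment, and the logic mirrors the diff:

```python
from enum import Enum

class CUDAGraphMode(Enum):  # minimal stand-in for the vllm enum
    NONE = 0
    PIECEWISE = 1
    FULL = 2

def need_pre_replay_sync(runtime_mode: CUDAGraphMode,
                         is_draft_model: bool,
                         use_eagle: bool,
                         enable_enpu: bool) -> bool:
    """Restates the guard around torch.npu.current_stream().synchronize().

    - enable_enpu=True: model_runner already orders update vs replay, no sync.
    - FULL mode with an EAGLE draft model (merge-eagle-graph path): no sync.
    - FULL mode otherwise: sync before replay.
    """
    is_draft_eagle = is_draft_model and use_eagle
    need_sync = runtime_mode == CUDAGraphMode.FULL and not is_draft_eagle
    return not enable_enpu and need_sync

# Examples:
assert need_pre_replay_sync(CUDAGraphMode.FULL, False, False, False) is True
assert need_pre_replay_sync(CUDAGraphMode.FULL, True, True, False) is False  # EAGLE draft
assert need_pre_replay_sync(CUDAGraphMode.FULL, False, False, True) is False  # ENPU path
```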