[BugFix] fix hang in async scheduling when ENPU is enabled (#8354)
### What this PR does / why we need it?
1. There is no synchronization between steps. In async scheduling with aclgraph, the CPU's record of the event for the current iteration can complete before the previous iteration's graph execution has finished (event_record is issued immediately on the device's update stream). If the CPU runs far enough ahead, the device hangs on event_wait in iteration i+1.
2. Under ENPU, eagle proposers must also follow the same ordering: event.record first, then event.wait.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?

---------

Signed-off-by: 1zzk <785396250@qq.com>
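For context, a minimal sketch of the race and of the synchronization the PR adds. All names here (`update_stream`, `params_ready`, `step`, `enable_enpu`) are illustrative stand-ins, not code from this repository, and it assumes the `torch.npu` stream/event API provided by `torch_npu` mirrors `torch.cuda`:

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu namespace (assumption: torch_npu installed)

update_stream = torch.npu.Stream()   # stream on which event_record is issued
params_ready = torch.npu.Event()

def step(i: int, enable_enpu: bool) -> None:
    # The fix: under ENPU, drain the compute stream first so the record below
    # cannot be enqueued ahead of iteration i-1's still-running graph replay.
    if enable_enpu:
        torch.npu.current_stream().synchronize()

    # CPU side: enqueue the parameter update and record the event on the update
    # stream. With async scheduling the CPU can reach this point for iteration i
    # while iteration i-1 is still replaying on the device.
    with torch.npu.stream(update_stream):
        ...  # update full-graph params for iteration i
        params_ready.record(update_stream)

    # Device side: the captured graph waits on the event before replaying.
    # Without the synchronize above, this wait can hang in iteration i+1.
    params_ready.wait(torch.npu.current_stream())
    ...  # replay the captured graph for iteration i
```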
@@ -336,6 +336,15 @@ class NPUModelRunner(GPUModelRunner):
         self.input_ids = self._make_buffer(max_buffer_num_tokens, dtype=torch.int32)
         self.positions = self._make_buffer(max_buffer_num_tokens, dtype=torch.int64)
 
+        self.use_eagle = (
+            vllm_config.speculative_config.method in ("eagle", "eagle3", "mtp")
+            if vllm_config.speculative_config
+            else False
+        )
+        # When True, run update_full_graph_params before self.model (ENPU / graph capture order).
+        # Internal / non-public toggle: read C getenv ``ENPU_ENABLE`` from enpu code (not in envs.py).
+        _enpu = get_c_env("ENPU_ENABLE")
+        self.enable_enpu = _enpu is not None and _enpu.lower() == "true"
         self._set_up_drafter()
 
         # kv role
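The toggle is read through the project's `get_c_env` helper rather than `envs.py`. A rough Python-level equivalent of the check, where the `os.environ` lookup is only an illustrative stand-in for the real C-level getenv:

```python
import os

def get_c_env_sketch(name: str) -> str | None:
    # Stand-in for get_c_env: the real helper queries the C-level getenv so the
    # toggle never appears in vllm's envs.py; os.environ is close enough here.
    return os.environ.get(name)

_enpu = get_c_env_sketch("ENPU_ENABLE")
enable_enpu = _enpu is not None and _enpu.lower() == "true"
```

In other words, `ENPU_ENABLE=true` (case-insensitive) turns the new ordering on; any other value, or an unset variable, leaves it off.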
@@ -424,9 +433,6 @@ class NPUModelRunner(GPUModelRunner):
         self.cudagraph_batch_sizes = []
         self.mamba_state_idx: dict[str, int] = {}
         self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
-        env_enpu_enable = get_c_env("ENPU_ENABLE")
-        # When True, run update_full_graph_params before self.model (ENPU / graph capture order).
-        self.enable_enpu = env_enpu_enable is not None and env_enpu_enable.lower() == "true"
 
     @property
     def use_cp(self) -> bool:
@@ -1795,6 +1801,9 @@ class NPUModelRunner(GPUModelRunner):
             and not forward_context.capturing
             and not self.use_sparse
         ):
+            if self.enable_enpu:
+                torch.npu.current_stream().synchronize()
+
             assert positions is not None
             update_full_graph_params(
                 self.attn_backend,
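Point 2 of the description (eagle proposers must record before they wait under ENPU) follows the same pattern as the fix above. A hypothetical sketch of the required ordering, again assuming the `torch.npu` event API mirrors `torch.cuda`; `drafter_step` and its arguments are illustrative names, not from this repository:

```python
import torch

def drafter_step(drafter, input_ids, positions, update_stream):
    done = torch.npu.Event()
    done.record(update_stream)               # 1) record on the update stream first
    done.wait(torch.npu.current_stream())    # 2) then make the compute stream wait
    return drafter(input_ids, positions)     # 3) the proposer runs after the wait
```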
@@ -2592,7 +2601,13 @@ class NPUModelRunner(GPUModelRunner):
         # wrap the model with full graph wrapper if needed.
         if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
             self.update_stream: torch.npu.Stream = torch.npu.Stream()
-            self.model = ACLGraphWrapper(self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL)
+            self.model = ACLGraphWrapper(
+                self.model,
+                self.vllm_config,
+                runtime_mode=CUDAGraphMode.FULL,
+                use_eagle=self.use_eagle,
+                enable_enpu=self.enable_enpu,
+            )
 
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
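The call site now forwards the two new flags to the wrapper. Below is a hypothetical skeleton of how a graph wrapper could consume them; `ACLGraphWrapper`'s real implementation lives in vllm-ascend and is not shown in this diff, so this is a sketch only:

```python
class GraphWrapperSketch:
    """Illustrative only: stores the flags the call site now passes and would
    consult them when ordering record/wait around graph replay."""

    def __init__(self, model, vllm_config, runtime_mode,
                 use_eagle: bool = False, enable_enpu: bool = False):
        self.model = model
        self.vllm_config = vllm_config
        self.runtime_mode = runtime_mode
        self.use_eagle = use_eagle        # an eagle/eagle3/mtp proposer is configured
        self.enable_enpu = enable_enpu    # follow the ENPU record-then-wait ordering

    def __call__(self, *args, **kwargs):
        # The real wrapper replays a captured ACL graph here; the flags let it
        # choose the event ordering described in the PR. This sketch just calls
        # through to the wrapped model.
        return self.model(*args, **kwargs)
```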