[Feature] support aclgraph for model runner v2 (#7110)

### What this PR does / why we need it?

This PR adds aclgraph support for model runner v2; see RFC #5208 for details. It contains the following modifications:

- Adapt to the latest commit of the vLLM main branch.
- Provide a unified interface for the extra forward context, shared by model runner v1 and model runner v2.
- Implement graph mode for the main model.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2026-03-13 09:11:46 +08:00
parent 1f71da80eb
commit c980e68d40
52 changed files with 840 additions and 309 deletions
--- a/vllm_ascend/worker/v2/utils.py
+++ b/vllm_ascend/worker/v2/utils.py
@@ -1,6 +1,11 @@
 from contextlib import contextmanager

 import torch
+import vllm
+from vllm.logger import logger
+
+from vllm_ascend.worker.v2.block_table import AscendBlockTables
+from vllm_ascend.worker.v2.model_states import init_asecnd_model_state


@contextmanager
@@ -15,6 +20,34 @@ def torch_cuda_wrapper():
        torch.cuda.CUDAGraph = torch.npu.NPUGraph
        torch.cuda.graph = torch.npu.graph
        torch.cuda.synchronize = torch.npu.synchronize
+        torch.cuda.set_stream = torch.npu.set_stream
+        torch.cuda.current_device = torch.npu.current_device
+        torch.cuda.mem_get_info = torch.npu.mem_get_info
+        logger.info_once("Wrapping torch.cuda with torch.npu.")
+        yield
+    finally:
+        pass
+
+
+@contextmanager
+def block_table_wrapper():
+    try:
+        # vllm-ascend needs the slot mapping initialized as torch.int32 (vLLM defaults to
+        # torch.int64), so BlockTables is swapped for AscendBlockTables. NOTE: not restored on exit.
+        vllm.v1.worker.gpu.model_runner.BlockTables = AscendBlockTables
+        logger.info_once("Wrapping BlockTables with AscendBlockTables.")
+        yield
+    finally:
+        pass
+
+
+@contextmanager
+def model_states_wrapper():
+    try:
+        # AscendModelState implements prepare_attn differently from vLLM, so
+        # init_model_state is overridden here. NOTE: the override is not restored on exit.
+        vllm.v1.worker.gpu.model_runner.init_model_state = init_asecnd_model_state
+        logger.info_once("Wrapping init_model_state with init_asecnd_model_state.")
         yield
     finally:
         pass