implement model runner v2 basic framework (#5051)
### What this PR does / why we need it?
This PR aims to implement the basic framework of model runner v2 in vllm-ascend;
end-to-end functionality is not guaranteed by this PR.
### Does this PR introduce _any_ user-facing change?
Yes. Use `envs.VLLM_USE_V2_MODEL_RUNNER` to decide whether to choose model_runner_v2.
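A hypothetical usage sketch (assumption: the flag is read as a boolean-style environment variable via `envs.VLLM_USE_V2_MODEL_RUNNER`; the exact accepted values are not shown in this PR):

```python
# Hypothetical sketch: opt in to model runner v2 before vLLM initializes.
# Assumes the flag is parsed as a boolean environment variable ("1" to enable).
import os

os.environ["VLLM_USE_V2_MODEL_RUNNER"] = "1"

# ... start vllm-ascend as usual; the worker then selects model_runner_v2.
```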
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
71
vllm_ascend/worker/v2/aclgraph_utils.py
Normal file
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import contextmanager
from typing import Any

import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.block_table import BlockTables
from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
from vllm.v1.worker.gpu.cudagraph_utils import \
    prepare_inputs_to_capture as prepare_inputs_to_capture_gpu
from vllm.v1.worker.gpu.input_batch import InputBuffers

from vllm_ascend.worker.v2.utils import torch_cuda_wrapper


class AclGraphManager(CudaGraphManager):
    """ACL Graph Manager for Ascend NPUs."""

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        with torch_cuda_wrapper():
            super().__init__(vllm_config, device)

    def capture_graph(
        self,
        num_tokens: int,
        model: nn.Module,
        input_buffers: InputBuffers,
        block_tables: BlockTables,
        attn_metadata_builders: list[AttentionMetadataBuilder],
        kv_cache_config: KVCacheConfig,
    ) -> None:
        with (torch_cuda_wrapper(), prepare_capture_inputs_wrapper()):
            super().capture_graph(
                num_tokens,
                model,
                input_buffers,
                block_tables,
                attn_metadata_builders,
                kv_cache_config,
            )


@contextmanager
def prepare_capture_inputs_wrapper():
    """Context manager to override input preparation for NPU graph capture."""
    # TODO(Ronald1995): make prepare_inputs_to_capture as static method
    # in CudaGraphManager.
    global prepare_inputs_to_capture_gpu
    try:
        ori_func = prepare_inputs_to_capture_gpu
        prepare_inputs_to_capture_gpu = prepare_inputs_to_capture
        yield
    finally:
        prepare_inputs_to_capture_gpu = ori_func


def prepare_inputs_to_capture(
    num_reqs: int,
    num_tokens: int,
    input_buffers: InputBuffers,
    block_tables: BlockTables,
    attn_metadata_builders: list[AttentionMetadataBuilder],
    max_model_len: int,
    kv_cache_config: KVCacheConfig,
) -> dict[str, Any]:
    # TODO(Ronald1995): Implement NPU specific input preparation.
    return {}
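For context, `torch_cuda_wrapper` is imported from `vllm_ascend/worker/v2/utils.py`, which is not part of this file. A minimal sketch of what such a wrapper could look like, assuming it temporarily redirects `torch.cuda` to the NPU backend so the CUDA-oriented `CudaGraphManager` code paths run on Ascend (the real helper may differ):

```python
# Minimal sketch only; the actual torch_cuda_wrapper in
# vllm_ascend/worker/v2/utils.py is not shown in this diff.
from contextlib import contextmanager

import torch
import torch_npu  # noqa: F401  # registers the torch.npu namespace


@contextmanager
def torch_cuda_wrapper():
    """Temporarily alias torch.cuda to torch.npu for CUDA-oriented code."""
    original_cuda = torch.cuda
    try:
        # Assumption: torch_npu mirrors enough of the torch.cuda API surface.
        torch.cuda = torch.npu
        yield
    finally:
        torch.cuda = original_cuda
```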