implement model runner v2 basic framework (#5051)
### What this PR does / why we need it?
This PR aims to implement the basic framework of model runner v2 in vllm-ascend;
end-to-end functionality is not guaranteed by this PR.
### Does this PR introduce _any_ user-facing change?
Yes. The environment variable `envs.VLLM_USE_V2_MODEL_RUNNER` decides whether `model_runner_v2` is used.
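For illustration only, a minimal, self-contained sketch of how such an env flag might gate runner selection; the class names and the selection helper below are placeholders, not the PR's actual code:

```python
# Hypothetical sketch of env-flag-gated runner selection.
# NPUModelRunner / NPUModelRunnerV2 are placeholder classes, not the PR's.
import os


class NPUModelRunner:      # placeholder for the existing (v1) runner
    ...


class NPUModelRunnerV2:    # placeholder for the new v2 runner
    ...


def select_model_runner_cls() -> type:
    # vLLM-style envs modules parse boolean flags from the environment;
    # we emulate envs.VLLM_USE_V2_MODEL_RUNNER with a direct lookup here.
    if os.environ.get("VLLM_USE_V2_MODEL_RUNNER", "0") == "1":
        return NPUModelRunnerV2
    return NPUModelRunner
```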
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
vllm_ascend/worker/v2/input_batch.py (new file, 37 lines)
@@ -0,0 +1,37 @@
```python
import numpy as np
import torch
from vllm.v1.worker.gpu.input_batch import InputBuffers


class AscendInputBuffers(InputBuffers):
    """Input buffers for Ascend NPUs."""

    def __init__(
        self,
        max_num_reqs: int,
        max_num_tokens: int,
        inputs_embeds_size: int,
        vocab_size: int,
        dtype: torch.dtype,
        device: torch.device,
        pin_memory: bool,
    ):
        super().__init__(
            max_num_reqs,
            max_num_tokens,
            inputs_embeds_size,
            vocab_size,
            dtype,
            device,
            pin_memory,
        )
        # Create seq_lens_cpu and seq_lens_np.
        # The NPU attention backend still needs seq_lens on the CPU side.
        self.seq_lens_cpu: torch.Tensor = torch.zeros(
            max_num_reqs,
            dtype=torch.int32,
            device="cpu",
        )
        # seq_lens_np and seq_lens_cpu share the same memory;
        # seq_lens_np is defined for easier calculation with numpy.
        self.seq_lens_np: np.ndarray = self.seq_lens_cpu.numpy()
```
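For context, a CPU torch tensor and its `.numpy()` view share the same underlying storage, which is why the comment above says the two share memory. This small standalone sketch (not part of the PR) shows why `seq_lens_np` can be updated cheaply with numpy while `seq_lens_cpu` stays in sync:

```python
import torch

# Same construction as in AscendInputBuffers: a CPU int32 tensor
# and its zero-copy numpy view.
seq_lens_cpu = torch.zeros(4, dtype=torch.int32, device="cpu")
seq_lens_np = seq_lens_cpu.numpy()

# Writing through the numpy view also updates the tensor:
# both wrap the same memory buffer, so no copy is involved.
seq_lens_np[:2] = [7, 3]
print(seq_lens_cpu)  # tensor([7, 3, 0, 0], dtype=torch.int32)
```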