implement model runner v2 basic framework (#5051)
### What this PR does / why we need it?
This PR aims to implement the model runner v2 basic framework in vllm-ascend;
end-to-end functionality is not guaranteed by this PR.
### Does this PR introduce _any_ user-facing change?
Use `envs.VLLM_USE_V2_MODEL_RUNNER` to decide whether to choose model_runner_v2.
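
For illustration only, here is a minimal sketch of how such an env-driven switch might look. The helper and the placeholder classes below are assumptions, not the actual vllm-ascend wiring.

```python
import os


def use_v2_model_runner() -> bool:
    # Hypothetical: read the flag directly from the environment.
    return os.environ.get("VLLM_USE_V2_MODEL_RUNNER", "0").lower() in ("1", "true")


class RunnerV1:  # stand-in for the existing NPU model runner
    pass


class RunnerV2:  # stand-in for the model runner v2 added by this PR
    pass


def select_runner_cls() -> type:
    # Pick the runner class based on the flag; defaults to the v1 runner.
    return RunnerV2 if use_v2_model_runner() else RunnerV1
```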
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
vllm_ascend/worker/v2/states.py (new file, 88 lines)
@@ -0,0 +1,88 @@
from contextlib import contextmanager

import torch
import vllm.v1.worker.gpu.states as gpu_states
from vllm.v1.utils import CpuGpuBuffer
from vllm.v1.worker.gpu.states import RequestState


class AscendRequestState(RequestState):
    """Request state for Ascend NPUs."""

    def __init__(
        self,
        max_num_reqs: int,
        max_model_len: int,
        max_num_batched_tokens: int,
        num_speculative_steps: int,
        vocab_size: int,
        device: torch.device,
        pin_memory: bool,
    ):
        with uva_wrapper():
            super().__init__(
                max_num_reqs,
                max_model_len,
                max_num_batched_tokens,
                num_speculative_steps,
                vocab_size,
                device,
                pin_memory,
            )
        # We override this attribute below, so delete it first to make sure
        # the buffer allocated by the base class is collected by the Python
        # GC immediately.
        del self.prefill_token_ids
        # vLLM's gpu_model_runner_v2 deprecates the seq_lens_cpu attribute
        # because most attention backends do not need it. However, the Ascend
        # attention backend must use seq_lens_cpu, so we keep
        # num_computed_tokens_cpu here; seq_lens_cpu has to be computed
        # outside as num_computed_tokens_cpu + decode_token_per_req.
        self.num_computed_tokens_cpu: torch.Tensor = torch.zeros(
            self.max_num_reqs,
            dtype=torch.int32,
            device="cpu",
        )
        # NOTE(Ronald1995): Ascend NPUs do not support UVA yet, so we use a
        # CpuGpuBuffer to allocate the prefill_token_ids buffer instead.
        self.prefill_token_ids: CpuGpuBuffer = self._make_buffer(  # type: ignore
            (self.max_num_reqs, self.max_model_len),
            dtype=torch.int32)

    def add_request(
        self,
        req_id,
        prompt_len,
        prefill_token_ids,
        num_computed_tokens,
        sampling_params,
        lora_request,
    ):
        super().add_request(
            req_id,
            prompt_len,
            prefill_token_ids,
            num_computed_tokens,
            sampling_params,
            lora_request,
        )
        req_idx = self.req_id_to_index[req_id]
        self.num_computed_tokens_cpu[req_idx] = num_computed_tokens


@contextmanager
def uva_wrapper():
    """Context manager to disable UVA for Ascend NPUs."""

    class UvaBufferWrapper:

        def __init__(self, *args, **kwargs):
            pass

    # TODO(Ronald1995): rectify this when the NPU supports UVA.
    # Patch UvaBuffer in the module where RequestState resolves the name, so
    # that the base-class constructor picks up the no-op wrapper instead of
    # allocating UVA memory.
    ori_class = gpu_states.UvaBuffer
    try:
        gpu_states.UvaBuffer = UvaBufferWrapper
        yield
    finally:
        gpu_states.UvaBuffer = ori_class
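
As a side note on the `__init__` comment about deriving sequence lengths outside this class, here is a minimal runnable sketch of that computation; the tensor values and `decode_token_per_req = 1` are illustrative assumptions, not values taken from the PR.

```python
import torch

# Illustrative values only: a stand-in for num_computed_tokens_cpu and an
# assumed decode_token_per_req of 1 (one new token per request per step).
num_computed_tokens_cpu = torch.tensor([10, 0, 7, 3], dtype=torch.int32)
decode_token_per_req = 1

# seq_lens_cpu is derived outside AscendRequestState, as noted in __init__:
seq_lens_cpu = num_computed_tokens_cpu + decode_token_per_req
print(seq_lens_cpu)  # tensor([11,  1,  8,  4], dtype=torch.int32)
```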