[Feature] adapt to uva buffer and main2main (#6657)
### What this PR does / why we need it?
The vLLM model runner v2 uses a UVA buffer to prepare input data, but the NPU
doesn't support UVA yet, so this PR implements a UVA wrapper class to mimic
the GPU's UVA backend. In addition, it makes some modifications to adapt to
the newer main branch.
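
To illustrate the idea (a hand-written sketch, not the actual wrapper code from this PR; the class and method names below are assumptions), a UVA-style buffer can be emulated on a device without UVA support by pairing a pinned host tensor, plus a zero-copy NumPy view for host-side writes, with an explicit copy to a device-side tensor:

```python
# Sketch only: emulating a UVA-style buffer with pinned host memory.
# None of these names are taken from this PR.
import torch


class UvaLikeBuffer:
    def __init__(self, size: int, dtype: torch.dtype, device: torch.device):
        # Pinned (page-locked) host memory enables fast async H2D copies.
        self.cpu = torch.zeros(size, dtype=dtype, pin_memory=True)
        # Zero-copy NumPy view for cheap host-side preparation.
        self.host_np = self.cpu.numpy()
        self.device_buf = torch.zeros(size, dtype=dtype, device=device)

    def commit(self, n: int) -> torch.Tensor:
        # An explicit copy stands in for the transparent device access
        # that a real UVA mapping would provide.
        self.device_buf[:n].copy_(self.cpu[:n], non_blocking=True)
        return self.device_buf[:n]
```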
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM main: 13397841ab
---------
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
```diff
@@ -16,10 +16,11 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
+from dataclasses import asdict, dataclass
 
 import numpy as np
 import torch
-from vllm.v1.worker.gpu.input_batch import InputBuffers
+from vllm.v1.worker.gpu.input_batch import InputBatch, InputBuffers
 
 
 class AscendInputBuffers(InputBuffers):
@@ -29,20 +30,12 @@ class AscendInputBuffers(InputBuffers):
         self,
         max_num_reqs: int,
         max_num_tokens: int,
         inputs_embeds_size: int,
         vocab_size: int,
         dtype: torch.dtype,
         device: torch.device,
         pin_memory: bool,
     ):
         super().__init__(
             max_num_reqs,
             max_num_tokens,
             inputs_embeds_size,
             vocab_size,
             dtype,
             device,
             pin_memory,
         )
         # Create seq_lens_cpu and seq_lens_np.
         # npu's attention backend still needs seq_lens on CPU side.
@@ -54,3 +47,36 @@ class AscendInputBuffers(InputBuffers):
         # seq_len_np and seq_lens_cpu share the same memory.
         # define seq_lens_np for easier calculation with numpy.
         self.seq_lens_np: np.ndarray = self.seq_lens_cpu.numpy()
+
+
+@dataclass
+class AscendInputBatch(InputBatch):
+    """Input batch for Ascend NPUs."""
+
+    # Create seq_lens_np.
+    # npu's attention backend still needs seq_lens on CPU side.
+    seq_lens_np: np.ndarray
+
+    @classmethod
+    def make_dummy(
+        cls,
+        num_reqs: int,
+        num_tokens: int,
+        input_buffers: AscendInputBuffers,
+        device: torch.device,
+    ) -> "AscendInputBatch":
+        """Override the make_dummy method to calculate seq_lens_np."""
+        input_batch = InputBatch.make_dummy(
+            num_reqs,
+            num_tokens,
+            input_buffers,
+            device,
+        )
+        # seq_len equals to query_len
+        input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs
+        input_buffers.seq_lens_np[num_reqs - 1] += num_tokens % num_reqs
+        # Pad for full CUDA graph mode.
+        input_buffers.seq_lens_np[num_reqs:] = 0
+        seq_lens_np = input_buffers.seq_lens_np[:num_reqs]
+        input_batch.seq_lens_np = seq_lens_np
+        return cls(**asdict(input_batch), seq_lens_np=seq_lens_np)
```
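
The `seq_lens_cpu` / `seq_lens_np` pairing above relies on `torch.Tensor.numpy()` returning a zero-copy view of a CPU tensor's storage. A minimal standalone check of that behavior (illustrative, independent of the classes in this diff):

```python
import torch

# A CPU tensor and the NumPy array returned by .numpy() share storage,
# so writes through either side are visible to the other without copies.
seq_lens_cpu = torch.zeros(8, dtype=torch.int32)
seq_lens_np = seq_lens_cpu.numpy()

seq_lens_np[:4] = 5                 # write through the NumPy view...
assert seq_lens_cpu[0].item() == 5  # ...visible through the tensor
```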
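
For reference, a small worked check of the dummy `seq_lens` distribution in `make_dummy`: each request gets `num_tokens // num_reqs` tokens, the remainder goes to the last request, and the padded slots are zeroed (the numbers here are made up for illustration):

```python
import numpy as np

num_reqs, num_tokens = 3, 10
seq_lens = np.zeros(8, dtype=np.int32)           # buffer larger than num_reqs
seq_lens[:num_reqs] = num_tokens // num_reqs     # [3, 3, 3, 0, ...]
seq_lens[num_reqs - 1] += num_tokens % num_reqs  # last request: 3 + 1 = 4
seq_lens[num_reqs:] = 0                          # padded slots stay zero
assert seq_lens[:num_reqs].sum() == num_tokens   # totals 10, as required
```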