[Feature] adapt to uva buffer and main2main (#6657)

### What this PR does / why we need it?
vLLM model runner v2 uses a UVA buffer to prepare input data, but the NPU
does not support UVA yet. This PR implements a UVA wrapper class to mimic
the GPU's UVA backend. In addition, this PR makes some modifications to
adapt to the newer main branch.

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM main:
13397841ab

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
This commit is contained in:
Ronald
2026-02-12 10:36:31 +08:00
committed by GitHub
parent 56269eae0e
commit f1ffb5fb19
14 changed files with 407 additions and 179 deletions

View File

@@ -16,10 +16,11 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from dataclasses import asdict, dataclass
import numpy as np
import torch
from vllm.v1.worker.gpu.input_batch import InputBuffers
from vllm.v1.worker.gpu.input_batch import InputBatch, InputBuffers
class AscendInputBuffers(InputBuffers):
@@ -29,20 +30,12 @@ class AscendInputBuffers(InputBuffers):
self,
max_num_reqs: int,
max_num_tokens: int,
inputs_embeds_size: int,
vocab_size: int,
dtype: torch.dtype,
device: torch.device,
pin_memory: bool,
):
super().__init__(
max_num_reqs,
max_num_tokens,
inputs_embeds_size,
vocab_size,
dtype,
device,
pin_memory,
)
# Create seq_lens_cpu and seq_lens_np.
# npu's attention backend still needs seq_lens on CPU side.
@@ -54,3 +47,36 @@ class AscendInputBuffers(InputBuffers):
# seq_lens_np and seq_lens_cpu share the same memory.
# Define seq_lens_np for easier calculation with numpy.
self.seq_lens_np: np.ndarray = self.seq_lens_cpu.numpy()
@dataclass
class AscendInputBatch(InputBatch):
    """Input batch for Ascend NPUs.

    Extends ``InputBatch`` with ``seq_lens_np``, a NumPy array of sequence
    lengths.
    """

    # npu's attention backend still needs seq_lens on CPU side.
    seq_lens_np: np.ndarray

    @classmethod
    def make_dummy(
        cls,
        num_reqs: int,
        num_tokens: int,
        input_buffers: AscendInputBuffers,
        device: torch.device,
    ) -> "AscendInputBatch":
        """Build a dummy batch and additionally fill in ``seq_lens_np``.

        Delegates to ``InputBatch.make_dummy`` for the common fields, then
        writes the dummy sequence lengths into ``input_buffers.seq_lens_np``.

        Args:
            num_reqs: Number of dummy requests in the batch.
            num_tokens: Total number of dummy tokens, spread over the requests.
            input_buffers: Buffers whose ``seq_lens_np`` array is written in
                place; the returned batch holds a view of its first
                ``num_reqs`` entries.
            device: Forwarded unchanged to ``InputBatch.make_dummy``.

        Returns:
            An ``AscendInputBatch`` carrying the base batch's field values
            plus ``seq_lens_np``.
        """
        # Function-scope import so this block does not depend on a change to
        # the file-level import list.
        from dataclasses import fields

        input_batch = InputBatch.make_dummy(
            num_reqs,
            num_tokens,
            input_buffers,
            device,
        )

        # seq_len equals to query_len: spread the tokens evenly over the
        # requests and give the remainder to the last request.
        input_buffers.seq_lens_np[:num_reqs] = num_tokens // num_reqs
        input_buffers.seq_lens_np[num_reqs - 1] += num_tokens % num_reqs
        # Pad for full CUDA graph mode.
        input_buffers.seq_lens_np[num_reqs:] = 0
        seq_lens_np = input_buffers.seq_lens_np[:num_reqs]

        # Shallow field copy. dataclasses.asdict() would recurse into nested
        # dataclasses (turning them into dicts) and copy.deepcopy() every
        # other value, so the new batch would no longer reference the objects
        # produced by InputBatch.make_dummy.
        base_kwargs = {
            f.name: getattr(input_batch, f.name) for f in fields(input_batch)
        }
        return cls(**base_kwargs, seq_lens_np=seq_lens_np)