xc-llm-ascend/vllm_ascend/worker/v2/input_batch.py

import numpy as np
import torch
from vllm.v1.worker.gpu.input_batch import InputBuffers


class AscendInputBuffers(InputBuffers):
    """Ascend NPU variant of ``InputBuffers``.

    In addition to whatever the base class allocates, this class keeps a
    CPU-resident buffer of per-request sequence lengths, exposed both as a
    torch tensor (``seq_lens_cpu``) and as a numpy array (``seq_lens_np``).
    """

    def __init__(
        self,
        max_num_reqs: int,
        max_num_tokens: int,
        inputs_embeds_size: int,
        vocab_size: int,
        dtype: torch.dtype,
        device: torch.device,
        pin_memory: bool,
    ):
        """Initialise the base buffers, then add the CPU seq-lens buffer.

        Every argument is forwarded positionally, in the order received, to
        ``InputBuffers.__init__``. ``max_num_reqs`` is additionally used as
        the length of the CPU sequence-length buffer created here.
        """
        base_args = (
            max_num_reqs,
            max_num_tokens,
            inputs_embeds_size,
            vocab_size,
            dtype,
            device,
            pin_memory,
        )
        super().__init__(*base_args)

        # The NPU attention backend still needs seq_lens on the CPU side,
        # so keep a zero-initialised int32 buffer with one slot per request.
        cpu_seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device="cpu")
        self.seq_lens_cpu: torch.Tensor = cpu_seq_lens

        # seq_lens_np is a numpy view over the same memory as seq_lens_cpu;
        # it exists to make numpy-based calculations more convenient.
        self.seq_lens_np: np.ndarray = cpu_seq_lens.numpy()
implement model runner v2 basic framework (#5051) ### What this PR does / why we need it? This PR aims to implement the model runner v2 basic framework in vllm-ascend; end-to-end functionality is not guaranteed by this PR. ### Does this PR introduce _any_ user-facing change? Use envs.VLLM_USE_V2_MODEL_RUNNER to decide whether to choose model_runner_v2. ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: Ronald1995 <ronaldautomobile@163.com> 2025-12-18 15:51:54 +08:00			`import numpy as np`
			`import torch`
			`from vllm.v1.worker.gpu.input_batch import InputBuffers`


			`class AscendInputBuffers(InputBuffers):`
			`"""Input buffers for Ascend NPUs."""`

			`def __init__(`
			`self,`
			`max_num_reqs: int,`
			`max_num_tokens: int,`
			`inputs_embeds_size: int,`
			`vocab_size: int,`
			`dtype: torch.dtype,`
			`device: torch.device,`
			`pin_memory: bool,`
			`):`
			`super().__init__(`
			`max_num_reqs,`
			`max_num_tokens,`
			`inputs_embeds_size,`
			`vocab_size,`
			`dtype,`
			`device,`
			`pin_memory,`
			`)`
			`# Create seq_lens_cpu and seq_lens_np.`
			`# npu's attention backend still needs seq_lens on CPU side.`
			`self.seq_lens_cpu: torch.Tensor = torch.zeros(`
			`max_num_reqs,`
			`dtype=torch.int32,`
			`device="cpu",`
			`)`
			`# seq_len_np and seq_lens_cpu share the same memory.`
			`# define seq_lens_np for easier calculation with numpy.`
			`self.seq_lens_np: np.ndarray = self.seq_lens_cpu.numpy()`