[Feature] model_runner refactor (#4764)

### What this PR does / why we need it? Refactor npu_modelrunner so that it stays close to gpu_modelrunner. ### Does this PR introduce _any_ user-facing change? NO - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com> Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
2025-12-12 17:27:09 +08:00
parent 5b12c068f9
commit f708d919f8
10 changed files with 676 additions and 1815 deletions
--- a/vllm_ascend/worker/block_table.py
+++ b/vllm_ascend/worker/block_table.py
@@ -4,6 +4,7 @@ import numpy as np
 import torch
 from vllm.distributed import get_dcp_group, get_pcp_group
 from vllm.utils.math_utils import cdiv
+from vllm.v1.utils import CpuGpuBuffer


 class BlockTable:
@@ -76,32 +77,14 @@ class BlockTable:
        duplicate_size = 1
        if self.pcp_world_size > 1:
            duplicate_size += num_speculative_tokens
-        self.block_table = torch.zeros(
-            (max_num_reqs * duplicate_size, logical_table_size),
-            device=self.device,
-            dtype=torch.int32,
-        )
-        self.block_table_cpu = torch.zeros(
-            (max_num_reqs * duplicate_size, logical_table_size),
-            device="cpu",
-            dtype=torch.int32,
-            pin_memory=pin_memory,
-        )
-        self.block_table_np = self.block_table_cpu.numpy()
+        self.block_table = self._make_buffer(max_num_reqs * duplicate_size,
+                                             logical_table_size,
+                                             dtype=torch.int32)
        self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)
-
-        self.slot_mapping_cpu = torch.zeros(
+        self.slot_mapping = self._make_buffer(
            self.max_num_batched_tokens +
            2 * self.pcp_world_size * self.max_num_reqs,
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory)
-        self.slot_mapping_np = self.slot_mapping_cpu.numpy()
-        self.slot_mapping = torch.zeros(
-            self.max_num_batched_tokens +
-            2 * self.pcp_world_size * self.max_num_reqs,
-            dtype=torch.int32,
-            device=self.device)
+            dtype=torch.int32)

        self.kernel_sizes = kernel_sizes
        self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size
@@ -120,7 +103,7 @@ class BlockTable:
        num_blocks = len(block_ids)
        start = self.num_blocks_per_row[row_idx]

-        self.block_table_np[row_idx, start:start + num_blocks] = block_ids
+        self.block_table.np[row_idx, start:start + num_blocks] = block_ids
        self.num_blocks_per_row[row_idx] += num_blocks

    def add_row(self, block_ids: list[int], row_idx: int) -> None:
@@ -129,7 +112,7 @@ class BlockTable:

    def move_row(self, src: int, tgt: int) -> None:
        num_blocks = self.num_blocks_per_row[src]
-        self.block_table_np[tgt, :num_blocks] = self.block_table_np[
+        self.block_table.np[tgt, :num_blocks] = self.block_table.np[
            src, :num_blocks]
        self.num_blocks_per_row[tgt] = num_blocks

@@ -139,7 +122,7 @@ class BlockTable:
        self.num_blocks_per_row[src] = num_blocks_tgt
        self.num_blocks_per_row[tgt] = num_blocks_src

-        self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]]
+        self.block_table.np[[src, tgt]] = self.block_table.np[[tgt, src]]

    def compute_slot_mapping(self, req_indices: np.ndarray,
                             positions: np.ndarray) -> None:
@@ -171,7 +154,7 @@ class BlockTable:
                                   self.blocks_per_phys_block +
                                   logical_block_idx)

-            block_numbers = self.block_table_np.ravel()[block_table_indices]
+            block_numbers = self.block_table.np.ravel()[block_table_indices]
            # Use virtual_block_size for mask calculation, which marks local
            # tokens.
            virtual_block_offsets = positions % virtual_block_size
@@ -186,7 +169,7 @@ class BlockTable:
            # Calculate slot_mapping
            slot_mapping = block_numbers * self.block_size + block_offsets
            # Write final slots, use -1 for not-local
-            self.slot_mapping_np[:req_indices.shape[0]] = np.where(
+            self.slot_mapping.np[:req_indices.shape[0]] = np.where(
                mask, slot_mapping, -1)
        else:
            assert self.kernel_sizes is not None
@@ -203,24 +186,22 @@ class BlockTable:
                    req_indices * self.max_num_blocks_per_req *
                    self.blocks_per_phys_block + logical_block_idx)

-                block_numbers = self.block_table_np.ravel(
+                block_numbers = self.block_table.np.ravel(
                )[block_table_indices]
                block_offsets = positions % self.block_size
                np.add(block_numbers * self.block_size,
                       block_offsets,
-                       out=self.slot_mapping_np[:req_indices.shape[0]])
+                       out=self.slot_mapping.np[:req_indices.shape[0]])

    def commit_block_table(self, num_reqs: int) -> None:
-        self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
-                                          non_blocking=True)
+        self.block_table.copy_to_gpu(num_reqs)

    def commit_slot_mapping(self, num_tokens: int) -> None:
-        self.slot_mapping[:num_tokens].copy_(
-            self.slot_mapping_cpu[:num_tokens], non_blocking=True)
+        self.slot_mapping.copy_to_gpu(num_tokens)

    def clear(self) -> None:
-        self.block_table.fill_(0)
-        self.block_table_cpu.fill_(0)
+        self.block_table.gpu.fill_(0)
+        self.block_table.cpu.fill_(0)

    def _convert_physical_to_logical_blocks(
            self, physical_blocks: np.ndarray) -> np.ndarray:
@@ -243,15 +224,22 @@ class BlockTable:

    def get_device_tensor(self) -> torch.Tensor:
        """Returns the device tensor of the block table."""
-        return self.block_table
+        return self.block_table.gpu

    def get_cpu_tensor(self) -> torch.Tensor:
        """Returns the CPU tensor of the block table."""
-        return self.block_table_cpu
+        return self.block_table.cpu

    def get_numpy_array(self) -> np.ndarray:
        """Returns the numpy array of the block table."""
-        return self.block_table_np
+        return self.block_table.np
+
+    def _make_buffer(self, *size: int | torch.SymInt,
+                     dtype: torch.dtype) -> CpuGpuBuffer:
+        return CpuGpuBuffer(*size,
+                            dtype=dtype,
+                            device=self.device,
+                            pin_memory=self.pin_memory)


 class MultiGroupBlockTable: