[Feature] model_runner refactor (#4764)
### What this PR does / why we need it?
Refactor the NPU model runner to stay close to vLLM's GPU model runner: `BlockTable` now manages its block table and slot mapping through vLLM's `CpuGpuBuffer` instead of hand-maintained device, pinned-CPU, and numpy tensors.
### Does this PR introduce _any_ user-facing change?
No.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com>
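The core of the change is swapping the hand-rolled triple of device tensor, pinned CPU tensor, and numpy view for vLLM's `CpuGpuBuffer`. As a mental model, here is a minimal sketch of the interface the hunks below rely on (`.cpu`, `.np`, `.gpu`, `copy_to_gpu`); this is an illustrative stand-in, not vLLM's actual implementation:

```python
import torch


class CpuGpuBufferSketch:
    """Illustrative stand-in for vllm.v1.utils.CpuGpuBuffer (assumption)."""

    def __init__(self, *size: int, dtype: torch.dtype, device: str,
                 pin_memory: bool):
        # One CPU tensor for staging (pinned when requested), a numpy view
        # sharing its memory, and a same-shape tensor on the device.
        self.cpu = torch.zeros(*size, dtype=dtype, device="cpu",
                               pin_memory=pin_memory)
        self.np = self.cpu.numpy()  # aliases self.cpu's storage
        self.gpu = torch.zeros(*size, dtype=dtype, device=device)

    def copy_to_gpu(self, n: int | None = None) -> torch.Tensor:
        # Sliced, non-blocking host-to-device copy; pinned host memory
        # lets the copy overlap with device compute.
        if n is None:
            return self.gpu.copy_(self.cpu, non_blocking=True)
        return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
```

The relevant hunks of the `BlockTable` refactor follow.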
```diff
@@ -4,6 +4,7 @@ import numpy as np
 import torch
 from vllm.distributed import get_dcp_group, get_pcp_group
 from vllm.utils.math_utils import cdiv
+from vllm.v1.utils import CpuGpuBuffer
 
 
 class BlockTable:
```
```diff
@@ -76,32 +77,14 @@ class BlockTable:
         duplicate_size = 1
         if self.pcp_world_size > 1:
             duplicate_size += num_speculative_tokens
-        self.block_table = torch.zeros(
-            (max_num_reqs * duplicate_size, logical_table_size),
-            device=self.device,
-            dtype=torch.int32,
-        )
-        self.block_table_cpu = torch.zeros(
-            (max_num_reqs * duplicate_size, logical_table_size),
-            device="cpu",
-            dtype=torch.int32,
-            pin_memory=pin_memory,
-        )
-        self.block_table_np = self.block_table_cpu.numpy()
+        self.block_table = self._make_buffer(max_num_reqs * duplicate_size,
+                                             logical_table_size,
+                                             dtype=torch.int32)
         self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32)
 
-        self.slot_mapping_cpu = torch.zeros(
-            self.max_num_batched_tokens +
-            2 * self.pcp_world_size * self.max_num_reqs,
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=self.pin_memory)
-        self.slot_mapping_np = self.slot_mapping_cpu.numpy()
-        self.slot_mapping = torch.zeros(
-            self.max_num_batched_tokens +
-            2 * self.pcp_world_size * self.max_num_reqs,
-            dtype=torch.int32,
-            device=self.device)
+        self.slot_mapping = self._make_buffer(
+            self.max_num_batched_tokens +
+            2 * self.pcp_world_size * self.max_num_reqs,
+            dtype=torch.int32)
 
         self.kernel_sizes = kernel_sizes
         self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size
```
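With the buffer in place, per-step state is staged through the numpy view and published with one sliced copy. A toy run of the sketch above (kept on CPU with pinning off so it runs anywhere; in `BlockTable` the device side lives on the NPU and the host side is pinned):

```python
buf = CpuGpuBufferSketch(8, 4, dtype=torch.int32, device="cpu",
                         pin_memory=False)
buf.np[0, :3] = [10, 11, 12]       # stage block ids via the numpy view...
assert buf.cpu[0, 1].item() == 11  # ...which aliases the CPU tensor
buf.copy_to_gpu(1)                 # one sliced copy replaces the manual copy_
assert buf.gpu[0, 2].item() == 12
```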
```diff
@@ -120,7 +103,7 @@ class BlockTable:
         num_blocks = len(block_ids)
         start = self.num_blocks_per_row[row_idx]
 
-        self.block_table_np[row_idx, start:start + num_blocks] = block_ids
+        self.block_table.np[row_idx, start:start + num_blocks] = block_ids
         self.num_blocks_per_row[row_idx] += num_blocks
 
     def add_row(self, block_ids: list[int], row_idx: int) -> None:
@@ -129,7 +112,7 @@ class BlockTable:
 
     def move_row(self, src: int, tgt: int) -> None:
         num_blocks = self.num_blocks_per_row[src]
-        self.block_table_np[tgt, :num_blocks] = self.block_table_np[
+        self.block_table.np[tgt, :num_blocks] = self.block_table.np[
             src, :num_blocks]
         self.num_blocks_per_row[tgt] = num_blocks
 
@@ -139,7 +122,7 @@ class BlockTable:
         self.num_blocks_per_row[src] = num_blocks_tgt
         self.num_blocks_per_row[tgt] = num_blocks_src
 
-        self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]]
+        self.block_table.np[[src, tgt]] = self.block_table.np[[tgt, src]]
 
     def compute_slot_mapping(self, req_indices: np.ndarray,
                              positions: np.ndarray) -> None:
```
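The row operations stay pure numpy on the CPU view. The fancy-indexed assignment in `swap_row` swaps two rows in one statement because numpy materializes the right-hand side before writing. A quick standalone check:

```python
import numpy as np

table = np.arange(12, dtype=np.int32).reshape(4, 3)
src, tgt = 0, 2
# Fancy indexing on the right-hand side returns a copy, so the swap needs
# no explicit temporary.
table[[src, tgt]] = table[[tgt, src]]
assert (table[0] == [6, 7, 8]).all() and (table[2] == [0, 1, 2]).all()
```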
```diff
@@ -171,7 +154,7 @@ class BlockTable:
                                    self.blocks_per_phys_block +
                                    logical_block_idx)
 
-            block_numbers = self.block_table_np.ravel()[block_table_indices]
+            block_numbers = self.block_table.np.ravel()[block_table_indices]
             # Use virtual_block_size for mask calculation, which marks local
             # tokens.
             virtual_block_offsets = positions % virtual_block_size
@@ -186,7 +169,7 @@ class BlockTable:
             # Calculate slot_mapping
             slot_mapping = block_numbers * self.block_size + block_offsets
             # Write final slots, use -1 for not-local
-            self.slot_mapping_np[:req_indices.shape[0]] = np.where(
+            self.slot_mapping.np[:req_indices.shape[0]] = np.where(
                 mask, slot_mapping, -1)
         else:
             assert self.kernel_sizes is not None
```
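Stripped of the context-parallel masking, the slot-mapping math is unchanged by the refactor: a token at position `p` of request `r` lives in physical block `block_table[r, p // block_size]` at offset `p % block_size`, so its flat slot is `block * block_size + offset`. A standalone miniature (block size and table contents are made up):

```python
import numpy as np

block_size = 4
# Hypothetical block table: request 0 owns blocks [7, 2], request 1 owns [5].
block_table = np.array([[7, 2], [5, 0]], dtype=np.int32)

req_indices = np.array([0, 0, 1], dtype=np.int32)  # token -> request
positions = np.array([1, 5, 2], dtype=np.int32)    # position within request

block_numbers = block_table[req_indices, positions // block_size]
slot_mapping = block_numbers * block_size + positions % block_size
assert slot_mapping.tolist() == [29, 9, 22]  # 7*4+1, 2*4+1, 5*4+2
```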
```diff
@@ -203,24 +186,22 @@ class BlockTable:
                 req_indices * self.max_num_blocks_per_req *
                 self.blocks_per_phys_block + logical_block_idx)
 
-            block_numbers = self.block_table_np.ravel(
+            block_numbers = self.block_table.np.ravel(
             )[block_table_indices]
             block_offsets = positions % self.block_size
             np.add(block_numbers * self.block_size,
                    block_offsets,
-                   out=self.slot_mapping_np[:req_indices.shape[0]])
+                   out=self.slot_mapping.np[:req_indices.shape[0]])
 
     def commit_block_table(self, num_reqs: int) -> None:
-        self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
-                                          non_blocking=True)
+        self.block_table.copy_to_gpu(num_reqs)
 
     def commit_slot_mapping(self, num_tokens: int) -> None:
-        self.slot_mapping[:num_tokens].copy_(
-            self.slot_mapping_cpu[:num_tokens], non_blocking=True)
+        self.slot_mapping.copy_to_gpu(num_tokens)
 
     def clear(self) -> None:
-        self.block_table.fill_(0)
-        self.block_table_cpu.fill_(0)
+        self.block_table.gpu.fill_(0)
+        self.block_table.cpu.fill_(0)
 
     def _convert_physical_to_logical_blocks(
             self, physical_blocks: np.ndarray) -> np.ndarray:
```
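`clear()` still resets both sides explicitly because the staged host contents and the last committed device contents are decoupled; zeroing `.cpu` also zeroes `.np`, since they alias. Continuing the toy buffer from above:

```python
buf.np[1, :2] = [3, 4]    # staged but never committed
buf.cpu.fill_(0)
buf.gpu.fill_(0)
assert buf.np.sum() == 0  # the numpy view aliases the CPU tensor
assert buf.gpu.sum().item() == 0
```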
```diff
@@ -243,15 +224,22 @@ class BlockTable:
 
     def get_device_tensor(self) -> torch.Tensor:
         """Returns the device tensor of the block table."""
-        return self.block_table
+        return self.block_table.gpu
 
     def get_cpu_tensor(self) -> torch.Tensor:
         """Returns the CPU tensor of the block table."""
-        return self.block_table_cpu
+        return self.block_table.cpu
 
     def get_numpy_array(self) -> np.ndarray:
         """Returns the numpy array of the block table."""
-        return self.block_table_np
+        return self.block_table.np
 
+    def _make_buffer(self, *size: int | torch.SymInt,
+                     dtype: torch.dtype) -> CpuGpuBuffer:
+        return CpuGpuBuffer(*size,
+                            dtype=dtype,
+                            device=self.device,
+                            pin_memory=self.pin_memory)
+
 
 class MultiGroupBlockTable:
```
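After the refactor the three getters are just views into one buffer: a caller that writes through `get_numpy_array()` sees the write through `get_cpu_tensor()` as well, and nothing reaches the device until a `commit_*` call. With the sketch standing in for the real buffer:

```python
bt = CpuGpuBufferSketch(2, 3, dtype=torch.int32, device="cpu",
                        pin_memory=False)
bt.np[0, 0] = 42
assert bt.cpu[0, 0].item() == 42  # numpy view and CPU tensor alias
assert bt.gpu[0, 0].item() == 0   # device copy is stale until copy_to_gpu
```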