first commit
This commit is contained in:
0
vllm_br/v0/__init__.py
Normal file
0
vllm_br/v0/__init__.py
Normal file
BIN
vllm_br/v0/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm_br/v0/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
15
vllm_br/v0/attention/__init__.py
Normal file
15
vllm_br/v0/attention/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
BIN
vllm_br/v0/attention/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm_br/v0/attention/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
15
vllm_br/v0/attention/backends/__init__.py
Normal file
15
vllm_br/v0/attention/backends/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
Binary file not shown.
Binary file not shown.
570
vllm_br/v0/attention/backends/attention_v0.py
Normal file
570
vllm_br/v0/attention/backends/attention_v0.py
Normal file
@@ -0,0 +1,570 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
"""Attention layer with FlashAttention."""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import torch_br
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
|
||||
get_flash_attn_version)
|
||||
from vllm.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder)
|
||||
|
||||
from collections import defaultdict
|
||||
from itertools import accumulate
|
||||
|
||||
from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
|
||||
compute_slot_mapping,
|
||||
compute_slot_mapping_start_idx,
|
||||
is_block_tables_empty)
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||
|
||||
|
||||
class SUPAFlashAttentionBackend(AttentionBackend):
    """Backend descriptor for the SUPA flash-attention path (vLLM V0).

    Every member is a static accessor: the backend name, the implementation,
    metadata, metadata-builder and state classes, plus helpers that compute
    KV-cache shapes.
    """

    # NOTE: When piecewise cudagraph is enabled, this
    # makes sure the output tensor is allocated inside the cudagraph.
    # NOTE: currently, we do not support accept_output_buffer=True
    accept_output_buffer: bool = False

    @staticmethod
    def get_supported_head_sizes() -> list[int]:
        """Return the head sizes that ``SUPAFlashAttentionImpl`` accepts."""
        return [32, 64, 96, 128, 160, 192, 224, 256]

    @staticmethod
    def get_name() -> str:
        """Return the identifier string of this backend."""
        return "SUPAFLASH_ATTN_VLLM_V0"

    @staticmethod
    def get_impl_cls() -> type["SUPAFlashAttentionImpl"]:
        """Return the attention implementation class."""
        return SUPAFlashAttentionImpl

    @staticmethod
    def get_metadata_cls() -> type["SUPAFlashAttentionMetadata"]:
        """Return the attention metadata class."""
        return SUPAFlashAttentionMetadata

    @staticmethod
    def get_builder_cls() -> type["SUPAFlashAttentionMetadataBuilder"]:
        """Return the attention metadata builder class."""
        return SUPAFlashAttentionMetadataBuilder

    @staticmethod
    def get_state_cls() -> Type["CommonAttentionState"]:
        """Return the attention state class."""
        return CommonAttentionState

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> tuple[int, ...]:
        """Return the KV-cache shape.

        Returns:
            ``(2, num_blocks, block_size, num_kv_heads, head_size)``.

        Raises:
            ValueError: If ``block_size`` is not a multiple of 16.
        """
        if block_size % 16 != 0:
            raise ValueError("Block size must be a multiple of 16.")
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def get_kv_cache_usharp_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        """Return the regrouped KV-cache shape used for the SUPA speed-up.

        ``th_gran`` blocks (see ``get_kv_cache_usharp_alignment``) are merged
        along the token axis, and the head and head-size axes are flattened.

        Returns:
            ``(2, n_block, th_gran * block_size, num_kv_heads * head_size)``
            where ``n_block = max(1, ceil(num_blocks / th_gran))``.

        Raises:
            ValueError: If the alignment granularity derived from
                ``block_size`` is smaller than 1.
        """
        th_gran = SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment(
            block_size)
        if th_gran < 1:
            # Without this check the ceil-division below fails with a bare
            # ZeroDivisionError whenever block_size exceeds the row limit.
            raise ValueError(
                f"Block size {block_size} is not supported by the SUPA "
                "usharp KV cache layout.")
        n_block = max(1, (num_blocks + th_gran - 1) // th_gran)
        merged_rows = th_gran * block_size
        hidden_size = num_kv_heads * head_size
        # Lazy %-style arguments: nothing is formatted unless DEBUG is on.
        logger.debug(
            "Origin kv cache shape is [2, %s, %s, %s, %s]. "
            "For SUPA Speed up, use [2, %s, %s, %s]", num_blocks, block_size,
            num_kv_heads, head_size, n_block, merged_rows, hidden_size)
        return (2, n_block, merged_rows, hidden_size)

    @staticmethod
    def get_kv_cache_usharp_alignment(block_size: int) -> int:
        """Return how many blocks of ``block_size`` fit into 2048 rows."""
        max_h_limit = 2048
        return max_h_limit // block_size
|
||||
|
||||
|
||||
@dataclass
class SUPAFlashAttentionMetadata:
    """Per-batch attention metadata handed to ``SUPAFlashAttentionImpl``."""

    # NOTE(sang): Definition of context_len, query_len, and seq_len.
    # |---------- N-1 iteration --------|
    # |---------------- N iteration ---------------------|
    # |- tokenA -|......................|-- newTokens ---|
    # |---------- context_len ----------|
    # |-------------------- seq_len ---------------------|
    #                                   |-- query_len ---|

    num_actual_tokens: int  # Number of tokens excluding padding.
    max_query_len: int
    query_start_loc: torch.Tensor
    max_seq_len: int
    seq_lens: torch.Tensor
    seq_lens_tensor: torch.Tensor
    block_table: torch.Tensor
    slot_mapping: torch.Tensor

    # BIREN Attention Params
    seq_start_loc: torch.Tensor
    context_lens: torch.Tensor
    max_decode_seq_len: int
    num_prefills: int
    num_decodes: int
    num_prefills_tokens: int
    do_cache: bool  # when use attentionsplit, do cache = False

    # For cascade attention.
    use_cascade: bool
    common_prefix_len: int
    cu_prefix_query_lens: Optional[torch.Tensor]
    prefix_kv_lens: Optional[torch.Tensor]
    suffix_kv_lens: Optional[torch.Tensor]

    # Optional aot scheduling
    scheduler_metadata: Optional[torch.Tensor] = None
    prefix_scheduler_metadata: Optional[torch.Tensor] = None
    _cached_prefill_metadata: Optional["SUPAFlashAttentionMetadata"] = None
    _cached_decode_metadata: Optional["SUPAFlashAttentionMetadata"] = None

    # for local attention
    @dataclass
    class LocalAttentionMetadata:
        """Metadata bundle for local attention."""

        local_query_start_loc: torch.Tensor
        local_seqused_k: torch.Tensor
        local_block_table: torch.Tensor
        local_max_query_len: int
        local_max_seq_len: int
        local_scheduler_metadata: Optional[torch.Tensor]

    local_attn_metadata: Optional[LocalAttentionMetadata] = None

    @property
    def do_prefill(self) -> bool:
        """Whether the batch holds at least one prefill."""
        return self.num_prefills > 0

    @property
    def do_decode(self) -> bool:
        """Whether ``num_decodes`` is positive."""
        return self.num_decodes > 0

    @property
    def prefill_metadata(self) -> Optional["SUPAFlashAttentionMetadata"]:
        """Return the cached prefill metadata, if any.

        ``None`` is returned when there are no prefills or when nothing has
        been cached in ``_cached_prefill_metadata``.
        """
        has_no_prefill = self.num_prefills == 0
        return None if has_no_prefill else self._cached_prefill_metadata
|
||||
|
||||
|
||||
class SUPAFlashAttentionMetadataBuilder:
    """Collects per-sequence state and builds ``SUPAFlashAttentionMetadata``.

    ``prepare()`` resets the accumulators; ``build()`` then walks
    ``input_builder.inter_data_list`` and produces the metadata object.
    """

    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
        self.input_builder = input_builder
        self.runner = input_builder.runner
        self.sliding_window = input_builder.sliding_window
        self.block_size = input_builder.block_size

    def prepare(self):
        """Reset every accumulator filled in by ``_add_seq_group``."""
        self.slot_mapping: List[int] = []
        self.prefill_seq_lens: List[int] = []
        self.context_lens: List[int] = []
        self.block_tables: List[List[int]] = []
        self.curr_seq_lens: List[int] = []
        self.multimodal_placeholder_maps: Dict[
            str,
            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
        self.num_prefills = 0
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0
        self.has_prefix_cache_hit = False

    def _add_seq_group(
            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
        3. slot mapping.
        """
        is_prompt = inter_data.is_prompt
        block_tables = inter_data.block_tables

        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
             curr_sliding_window_block) in zip(
                 inter_data.seq_ids,
                 [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens,
                 inter_data.seq_lens,
                 inter_data.query_lens,
                 inter_data.context_lens,
                 inter_data.curr_sliding_window_blocks,
                 strict=False):
            self.context_lens.append(context_len)

            if is_prompt:
                mm_maps = inter_data.multi_modal_placeholder_maps
                if mm_maps:
                    for modality, placeholders in mm_maps.items():
                        self.multimodal_placeholder_maps[modality].extend(
                            placeholders)

                self.num_prefills += 1
                self.num_prefill_tokens += token_len
                self.prefill_seq_lens.append(seq_len)
            else:
                self.num_decode_tokens += query_len
                self.curr_seq_lens.append(curr_seq_len)

            # Compute block table.
            # TODO(sang): Combine chunked prefill and prefix caching by
            # only allowing multiple of block_size chunk size.
            # NOTE: This only works for oooooooxxx style attention.
            block_table = []
            if prefix_cache_hit:
                # NOTE(woosuk): For flash-attn, the block table should
                # include the entries for the incoming prefill tokens.
                block_table = block_tables[seq_id]
            elif ((chunked_prefill_enabled or not is_prompt)
                  and block_tables is not None):
                if curr_sliding_window_block == 0:
                    block_table = block_tables[seq_id]
                else:
                    # Keep only the trailing blocks of the sliding window.
                    block_table = block_tables[seq_id][
                        -curr_sliding_window_block:]
            self.block_tables.append(block_table)

            # Compute slot mapping.
            is_profile_run = is_block_tables_empty(block_tables)
            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
                                                       context_len,
                                                       self.sliding_window)
            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
                                 seq_len, context_len, start_idx,
                                 self.block_size, inter_data.block_tables)

    def _get_graph_runner_block_tables(
            self, num_seqs: int,
            block_tables: List[List[int]]) -> torch.Tensor:
        """Copy ``block_tables`` into the runner's preallocated buffer.

        Args:
            num_seqs: Number of rows of ``runner.graph_block_tables`` to use.
            block_tables: One block-id list per sequence; empty lists leave
                the corresponding row untouched.

        Returns:
            A tensor built from the first ``num_seqs`` rows of the buffer,
            moved to ``runner.device``.
        """
        # The shape of graph_block_tables is
        # [max batch size, max context len // block size].
        max_batch_size, max_blocks = self.runner.graph_block_tables.shape
        assert max_batch_size >= num_seqs

        graph_block_tables = self.runner.graph_block_tables[:num_seqs]
        for i, block_table in enumerate(block_tables):
            if not block_table:
                continue
            num_blocks = len(block_table)
            if num_blocks <= max_blocks:
                graph_block_tables[i, :num_blocks] = block_table
            else:
                # It may be possible to have more blocks allocated due
                # to lookahead slots of multi-step, however, they are
                # not used anyway, so can be safely ignored.
                graph_block_tables[i, :max_blocks] = block_table[:max_blocks]

        return torch.from_numpy(graph_block_tables).to(
            device=self.runner.device, non_blocking=True)

    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int):
        """Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.

        Returns:
            A ``SUPAFlashAttentionMetadata`` for the batch, created with
            ``do_cache=False``.
        """
        inter_data_list = self.input_builder.inter_data_list
        prefix_cache_hit = any(inter_data.prefix_cache_hit
                               for inter_data in inter_data_list)
        for inter_data in inter_data_list:
            self._add_seq_group(inter_data,
                                self.input_builder.chunked_prefill_enabled,
                                prefix_cache_hit)

        device = self.runner.device
        use_captured_graph = cuda_graph_pad_size != -1

        max_query_len = max(query_lens)
        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
        max_decode_seq_len = max(self.curr_seq_lens, default=0)
        num_decode_tokens = self.num_decode_tokens
        # Prefix sums with a leading 0, kept as host-side Python lists.
        query_start_loc = list(accumulate(query_lens, initial=0))
        seq_start_loc = list(accumulate(seq_lens, initial=0))

        num_seqs = len(seq_lens)
        if use_captured_graph:
            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
            # One empty block table per padded sequence. The previous
            # ``[] * cuda_graph_pad_size`` always evaluated to ``[]`` and
            # therefore appended nothing.
            self.block_tables.extend([] for _ in range(cuda_graph_pad_size))
            num_decode_tokens = batch_size - self.num_prefill_tokens
            block_tables = self._get_graph_runner_block_tables(
                num_seqs, self.block_tables)
        else:
            block_tables = make_tensor_with_pad(
                self.block_tables,
                pad=0,
                dtype=torch.int,
                device=device,
            )
        assert max_query_len > 0, f"query_lens: {query_lens}"

        assert device is not None
        pin_memory = self.runner.pin_memory
        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                               device, pin_memory)
        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                           pin_memory)
        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                               device, pin_memory)
        query_start_loc_tensor = async_tensor_h2d(query_start_loc,
                                                  torch.int32, device,
                                                  pin_memory)
        return SUPAFlashAttentionMetadata(
            num_actual_tokens=batch_size,
            max_query_len=max_query_len,
            query_start_loc=query_start_loc_tensor,
            max_seq_len=max_prefill_seq_len,
            # NOTE(review): seq_lens and seq_start_loc are passed as Python
            # lists although the metadata annotates them as tensors.
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            block_table=block_tables,
            slot_mapping=slot_mapping_tensor,
            use_cascade=False,
            common_prefix_len=0,
            # NOTE(review): 0 rather than None is kept from the original;
            # confirm no consumer relies on the difference.
            scheduler_metadata=0,
            cu_prefix_query_lens=None,
            prefix_kv_lens=None,
            suffix_kv_lens=None,
            local_attn_metadata=None,
            prefix_scheduler_metadata=None,
            # Biren Attention Params
            seq_start_loc=seq_start_loc,
            context_lens=context_lens_tensor,
            max_decode_seq_len=max_decode_seq_len,
            num_prefills=self.num_prefills,
            # This is a decode *token* count, not a sequence count.
            num_decodes=num_decode_tokens,
            num_prefills_tokens=self.num_prefill_tokens,
            do_cache=False)
|
||||
|
||||
|
||||
class SUPAFlashAttentionImpl(AttentionImpl):
    """Attention implementation that dispatches to ``torch_br`` SUPA kernels."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[list[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        blocksparse_params: Optional[dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: AttentionType = AttentionType.DECODER,
        use_irope: bool = False,
    ) -> None:
        """Validate the configuration and store it on the instance.

        Raises:
            ValueError: If ``blocksparse_params`` is given or ``head_size``
                is not one of the supported head sizes.
            NotImplementedError: If the KV cache dtype is quantized and
                ``flash_attn_supports_fp8()`` is false.
        """
        if blocksparse_params is not None:
            raise ValueError(
                "FlashAttention does not support block-sparse attention.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        self.attn_type = attn_type
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
        self.alibi_slopes = alibi_slopes
        if sliding_window is None:
            self.sliding_window = (-1, -1)
        else:
            self.sliding_window = (sliding_window - 1, 0)
        self.kv_cache_dtype = kv_cache_dtype
        if logits_soft_cap is None:
            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
            logits_soft_cap = 0
        self.logits_soft_cap = logits_soft_cap

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        support_head_sizes = (
            SUPAFlashAttentionBackend.get_supported_head_sizes())
        if head_size not in support_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by FlashAttention. "
                f"Supported head sizes are: {support_head_sizes}. "
                "Set VLLM_USE_V1=1 to use another attention backend.")

        self.use_irope = use_irope
        self.vllm_flash_attn_version = get_flash_attn_version()
        if is_quantized_kv_cache(self.kv_cache_dtype) \
                and not flash_attn_supports_fp8():
            raise NotImplementedError(
                "FlashAttention does not support fp8 kv-cache on this device.")

    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: SUPAFlashAttentionMetadata,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Forward pass with the SUPA attention kernels.

        Args:
            layer: The attention layer; not read by this method.
            query: Query tensor. Tokens are sliced along dim 1 below.
            key: Key tensor, sliced like ``query``.
            value: Value tensor, sliced like ``query``.
            kv_cache: Paged KV cache, or ``None``.
            attn_metadata: Metadata for attention; ``None`` on a profiling
                run.
            output: Must be ``None``; output buffers are not supported.

        Returns:
            The attention output. On a profiling run ``query`` is returned
            unchanged.

        NOTE(review): upstream documents ``[num_tokens, num_heads,
        head_size]`` inputs, whereas the slicing here takes tokens from
        dim 1, matching the inline note about
        ``[batch_size, num_tokens, num_heads * head_size]`` - confirm with
        the callers.
        """
        assert output is None, "Output tensor should not be provided."
        if attn_metadata is None:
            # FIXME: this may lead to wrong block estimation
            # Profiling run.
            return query

        # NOTE: supa attn use [batch_size, num_tokens, num_heads * head_size]
        # as shape
        if kv_cache is not None and attn_metadata.do_cache:
            torch_br.supa_kvcache_store_infer_v2(
                kv_cache,
                key,
                value,  # type: ignore
                attn_metadata.slot_mapping,
                self.head_size)

        do_prefill = attn_metadata.do_prefill
        do_decode = attn_metadata.do_decode
        output_prefill = output_decode = None
        # Bound up front so that it is never referenced while undefined.
        decode_query = None

        if do_prefill and do_decode:
            # chunked: prefill tokens come first, decode tokens follow.
            num_prefill_tokens = attn_metadata.num_prefills_tokens
            # The merged buffer is only needed on this mixed path, so it is
            # allocated here (from the unsliced query) rather than on every
            # call.
            output = torch.empty_like(query)
            decode_query = query[:, num_prefill_tokens:]
            query = query[:, :num_prefill_tokens]

            key = key[:, :num_prefill_tokens]
            value = value[:, :num_prefill_tokens]
        elif do_decode:
            decode_query = query

        if do_prefill:
            if kv_cache is None or attn_metadata.block_table.numel() == 0:
                # has do_decode should go into prefix-enabled branch
                assert not do_decode

                # in this branch, query_start_loc = seq_start_loc
                use_sueager = os.getenv('USE_BR_SUEAGER_SDPA',
                                        'False').lower() not in {
                                            'false', '0', ''
                                        }
                if use_sueager:
                    # The second return value is not used.
                    output_prefill, _ = (
                        torch_br.sueager_scaled_dot_product_attention_fwd(
                            query=query,
                            key=key,
                            value=value,
                            mask=None,
                            dropout_prob=0.0,
                            is_causal=_get_causal_option(self.attn_type),
                            scale=self.scale,
                            algorithm="FMHA",
                        ))
                    output_prefill = torch_br.supa_shape_transform_qkv(
                        output_prefill, 1, query.shape[1], self.num_kv_heads,
                        self.head_size)
                else:
                    output_prefill = torch_br.supa_flash_attention_infer(  # type: ignore
                        query,
                        key,
                        value,
                        attn_metadata.query_start_loc,
                        self.head_size,
                        len(attn_metadata.query_start_loc),  # type: ignore
                        self.alibi_slopes,
                        softmax_scale=self.scale,
                        is_causal=_get_causal_option(self.attn_type))
            else:
                # prefix-enabled attention
                output_prefill = torch_br.supa_flash_attn_cache_infer(  # type: ignore
                    query,
                    kv_cache,
                    attn_metadata.query_start_loc,
                    attn_metadata.seq_start_loc,
                    attn_metadata.block_table,
                    attn_metadata.context_lens,
                    attn_metadata.slot_mapping,
                    attn_metadata.max_seq_len,
                    self.head_size,
                    self.alibi_slopes,
                    softmax_scale=self.scale)

        if do_decode:
            output_decode = torch_br.supa_attention_decoder_infer_v2(  # type: ignore
                decode_query,  # type: ignore
                kv_cache,
                attn_metadata.block_table,
                attn_metadata.seq_lens,
                attn_metadata.max_decode_seq_len,
                self.head_size,
                attn_metadata.num_prefills,
                self.alibi_slopes,
                softmax_scale=self.scale)

        if do_prefill and do_decode:
            num_prefill_tokens = attn_metadata.num_prefills_tokens
            output[:, :num_prefill_tokens] = output_prefill
            output[:, num_prefill_tokens:] = output_decode
        elif do_prefill:
            output = output_prefill
        else:
            output = output_decode

        return output
|
||||
|
||||
|
||||
def _get_causal_option(attn_type: str) -> bool:
    """
    Tell whether causal attention applies to the given attention type.

    Args:
        attn_type (AttentionType): The type of attention being evaluated

    Returns:
        bool: `False` when the type is encoder, encoder-only or
        encoder-decoder; `True` for every other attention type.
    """
    non_causal_types = (
        AttentionType.ENCODER,
        AttentionType.ENCODER_ONLY,
        AttentionType.ENCODER_DECODER,
    )
    return attn_type not in non_causal_types
|
||||
15
vllm_br/v0/worker/__init__.py
Normal file
15
vllm_br/v0/worker/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
BIN
vllm_br/v0/worker/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm_br/v0/worker/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
vllm_br/v0/worker/__pycache__/worker.cpython-310.pyc
Normal file
BIN
vllm_br/v0/worker/__pycache__/worker.cpython-310.pyc
Normal file
Binary file not shown.
255
vllm_br/v0/worker/pooling_model_runner.py
Normal file
255
vllm_br/v0/worker/pooling_model_runner.py
Normal file
@@ -0,0 +1,255 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import dataclasses
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.pooling_metadata import PoolingMetadata
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
|
||||
SequenceGroupMetadata)
|
||||
from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU,
|
||||
ModelInputForGPUBuilder)
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU):
    """Model input type consumed by the PoolingModelRunner.

    Adds one optional field on top of ``ModelInputForGPU``.
    """

    # Pooling metadata for the batch; defaults to None when not supplied.
    pooling_metadata: Optional["PoolingMetadata"] = None
|
||||
|
||||
|
||||
class PoolingModelRunner(
|
||||
GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]):
|
||||
_model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = (
|
||||
ModelInputForGPUWithPoolingMetadata)
|
||||
_builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_dtype: Optional[str] = "auto",
|
||||
is_driver_worker: bool = False,
|
||||
):
|
||||
super().__init__(vllm_config=vllm_config,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
is_driver_worker=is_driver_worker)
|
||||
|
||||
@torch.inference_mode()
|
||||
def execute_model(
|
||||
self,
|
||||
model_input: ModelInputForGPUWithPoolingMetadata,
|
||||
kv_caches: List[torch.Tensor],
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
num_steps: int = 1,
|
||||
) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
|
||||
if num_steps > 1:
|
||||
raise ValueError(
|
||||
"PoolingModelRunner does not support multi-step execution.")
|
||||
|
||||
if self.lora_config:
|
||||
assert model_input.lora_requests is not None
|
||||
assert model_input.lora_mapping is not None
|
||||
self.set_active_loras(model_input.lora_requests,
|
||||
model_input.lora_mapping)
|
||||
|
||||
if self.prompt_adapter_config:
|
||||
assert model_input.prompt_adapter_requests is not None
|
||||
assert model_input.prompt_adapter_mapping is not None
|
||||
self.set_active_prompt_adapters(
|
||||
model_input.prompt_adapter_requests,
|
||||
model_input.prompt_adapter_mapping)
|
||||
|
||||
# Currently cuda graph is only supported by the decode phase.
|
||||
assert model_input.attn_metadata is not None
|
||||
prefill_meta = model_input.attn_metadata.prefill_metadata if hasattr(
|
||||
model_input.attn_metadata, 'prefill_metadata') else None
|
||||
decode_meta = model_input.attn_metadata.decode_metadata if hasattr(
|
||||
model_input.attn_metadata, 'decode_metadata') else None
|
||||
virtual_engine = model_input.virtual_engine
|
||||
# Pooling models are (ab-)used also to integrate non text models that
|
||||
# are not autoregressive (PrithviGeosaptialMAE).
|
||||
# These model might not use attention and do not really have a prefill
|
||||
# and decode phase. The model input is processed in one shot and both
|
||||
# decode_metadata and prefill_metadata would be None for such models.
|
||||
# See the PlaceholderAttentionMetadata class.
|
||||
# TODO: Figure out if cuda_graph is of any use for these models and
|
||||
# explore how to leverage it.
|
||||
if (prefill_meta is None and decode_meta is not None
|
||||
and decode_meta.use_cuda_graph):
|
||||
if model_input.inputs_embeds is None:
|
||||
assert model_input.input_tokens is not None
|
||||
graph_batch_size = model_input.input_tokens.shape[0]
|
||||
model_executable = (
|
||||
self.graph_runners[model_input.virtual_engine][(
|
||||
graph_batch_size, False)])
|
||||
else:
|
||||
graph_batch_size = model_input.inputs_embeds.shape[0]
|
||||
model_executable = (
|
||||
self.graph_runners[model_input.virtual_engine][(
|
||||
graph_batch_size, True)])
|
||||
else:
|
||||
model_executable = self.model
|
||||
|
||||
multi_modal_kwargs = model_input.multi_modal_kwargs or {}
|
||||
seqlen_agnostic_kwargs = {
|
||||
"finished_requests_ids": model_input.finished_requests_ids,
|
||||
"request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
|
||||
} if self.has_inner_state else {}
|
||||
if (self.observability_config is not None
|
||||
and self.observability_config.collect_model_forward_time):
|
||||
model_forward_start = torch.cuda.Event(enable_timing=True)
|
||||
model_forward_end = torch.cuda.Event(enable_timing=True)
|
||||
model_forward_start.record()
|
||||
|
||||
cross_enc_kwargs = {}
|
||||
if model_input.token_types is not None:
|
||||
cross_enc_kwargs["token_type_ids"] = model_input.token_types
|
||||
|
||||
import os
|
||||
use_graph = bool(
|
||||
os.getenv('ENABLE_VLLM_BR_GRAPH_MODE',
|
||||
'False').lower() not in {'false', '0', ''}
|
||||
and model_input.input_tokens.shape[0] % 256 == 0)
|
||||
if use_graph:
|
||||
batch_size = int(model_input.input_tokens.shape[0] / 256)
|
||||
self.model_input_in = self.graph_inputs.get(batch_size)
|
||||
graph = self.graphs.get(batch_size)
|
||||
if graph is None or self.model_input_in is None:
|
||||
use_graph = False
|
||||
# logger.info(f"!!! No graph captured for batch_size={batch_size}, fallback to normal execution")
|
||||
if use_graph:
|
||||
# logger.info(f"use graph captured for batch_size={batch_size}")
|
||||
# Copy the input tensors to the input buffers.
|
||||
self.model_input_in.input_tokens.copy_(model_input.input_tokens,
|
||||
non_blocking=True)
|
||||
self.model_input_in.input_positions.copy_(
|
||||
model_input.input_positions, non_blocking=True)
|
||||
# self.intermediate_tensors.copy_(intermediate_tensors) if intermediate_tensors is not None else None
|
||||
self.default_stream.record_event(self.copy_done_event)
|
||||
|
||||
with torch.supa.stream(self.graph_stream):
|
||||
self.graph_stream.wait_event(self.copy_done_event)
|
||||
graph.replay()
|
||||
self.graph_stream.record_event(self.graph_done_event)
|
||||
self.default_stream.wait_event(self.graph_done_event)
|
||||
hidden_or_intermediate_states = self.graph_outputs.get(batch_size)
|
||||
else:
|
||||
with set_forward_context(model_input.attn_metadata,
|
||||
self.vllm_config, virtual_engine):
|
||||
hidden_or_intermediate_states = model_executable(
|
||||
input_ids=model_input.input_tokens,
|
||||
positions=model_input.input_positions,
|
||||
intermediate_tensors=intermediate_tensors,
|
||||
**MultiModalKwargs.as_kwargs(
|
||||
multi_modal_kwargs,
|
||||
dtype=self.model_config.dtype,
|
||||
device=self.device,
|
||||
),
|
||||
**cross_enc_kwargs,
|
||||
**seqlen_agnostic_kwargs,
|
||||
)
|
||||
|
||||
if (self.observability_config is not None
|
||||
and self.observability_config.collect_model_forward_time):
|
||||
model_forward_end.record()
|
||||
|
||||
# Only perform pooling in the last pipeline stage.
|
||||
if not get_pp_group().is_last_rank:
|
||||
if (self.is_driver_worker
|
||||
and hidden_or_intermediate_states is not None
|
||||
and isinstance(hidden_or_intermediate_states,
|
||||
IntermediateTensors)
|
||||
and self.observability_config is not None
|
||||
and self.observability_config.collect_model_forward_time):
|
||||
model_forward_end.synchronize()
|
||||
model_forward_time = model_forward_start.elapsed_time(
|
||||
model_forward_end)
|
||||
orig_model_forward_time = 0.0
|
||||
if intermediate_tensors is not None:
|
||||
orig_model_forward_time = intermediate_tensors.tensors.get(
|
||||
"model_forward_time", torch.tensor(0.0)).item()
|
||||
hidden_or_intermediate_states.tensors["model_forward_time"] = (
|
||||
torch.tensor(model_forward_time + orig_model_forward_time))
|
||||
return hidden_or_intermediate_states
|
||||
|
||||
# Only perform pooling in the driver worker.
|
||||
if not self.is_driver_worker:
|
||||
return []
|
||||
|
||||
return [
|
||||
self.model.pooler(hidden_states=hidden_or_intermediate_states,
|
||||
pooling_metadata=model_input.pooling_metadata)
|
||||
]
|
||||
|
||||
def make_model_input_from_broadcasted_tensor_dict(
        self,
        tensor_dict: Dict[str,
                          Any]) -> ModelInputForGPUWithPoolingMetadata:
    """Rebuild a pooling model input from a broadcasted tensor dict.

    Delegates to
    ``ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict``,
    forwarding this runner's ``attn_backend``.

    Args:
        tensor_dict: The tensor dictionary to rebuild the input from.

    Returns:
        The model input produced by the delegate.
    """
    input_cls = ModelInputForGPUWithPoolingMetadata
    rebuilt_input = input_cls.from_broadcasted_tensor_dict(
        tensor_dict, attn_backend=self.attn_backend)
    return rebuilt_input
|
||||
|
||||
def prepare_model_input(
    self,
    seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    virtual_engine: int = 0,
    finished_requests_ids: Optional[List[str]] = None
) -> ModelInputForGPUWithPoolingMetadata:
    """Build the model input, including pooling metadata, for one step.

    Args:
        seq_group_metadata_list: Sequence groups to run; must not be None.
        virtual_engine: Accepted for interface compatibility; not read by
            this method.
        finished_requests_ids: Forwarded to
            ``_prepare_model_input_tensors``.

    Returns:
        A copy of the prepared tensor input with ``pooling_metadata`` set.
    """
    assert seq_group_metadata_list is not None
    tensor_input = self._prepare_model_input_tensors(
        seq_group_metadata_list, finished_requests_ids)

    # The sequence lengths double as the prompt lengths for pooling.
    seq_lens = tensor_input.seq_lens
    assert seq_lens is not None
    pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
                                             seq_lens)

    return dataclasses.replace(tensor_input,
                               pooling_metadata=pooling_metadata)
|
||||
|
||||
def _prepare_pooling(
    self,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    prompt_lens: List[int],
) -> PoolingMetadata:
    """Assemble the PoolingMetadata for a list of sequence groups.

    Args:
        seq_group_metadata_list: Sequence groups in the current batch.
        prompt_lens: Stored unchanged as ``prompt_lens`` on the result.

    Returns:
        A PoolingMetadata holding one ``(seq_ids, pooling_params)`` pair per
        group plus the merged ``seq_data`` of all groups.
    """
    # One (sequence ids, pooling params) entry per sequence group.
    seq_groups: List[Tuple[List[int], PoolingParams]] = [
        (list(group.seq_data.keys()), group.pooling_params)
        for group in seq_group_metadata_list
    ]

    # Merge the per-group sequence data into a single mapping.
    seq_data: Dict[int, SequenceData] = {}
    for group in seq_group_metadata_list:
        seq_data.update(group.seq_data)

    return PoolingMetadata(
        seq_groups=seq_groups,
        seq_data=seq_data,
        prompt_lens=prompt_lens,
    )
|
||||
720
vllm_br/v0/worker/worker.py
Normal file
720
vllm_br/v0/worker/worker.py
Normal file
@@ -0,0 +1,720 @@
|
||||
################################################################################
|
||||
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
"""A GPU worker class."""
|
||||
import gc
|
||||
import os
|
||||
from typing import Optional # SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Dict, List, Set, Tuple, Type, Union
|
||||
|
||||
import torch
|
||||
import torch_br
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment,
|
||||
set_custom_all_reduce)
|
||||
from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
|
||||
from vllm.distributed.parallel_state import get_world_group
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.logger import logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
|
||||
SequenceGroupMetadata, SequenceGroupMetadataDelta)
|
||||
from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
|
||||
memory_profiling)
|
||||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
|
||||
from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
|
||||
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
|
||||
WorkerInput)
|
||||
from vllm_br.platform import SUPAPlatform
|
||||
from vllm_br.v0.attention.backends.attention_v0 import (
|
||||
SUPAFlashAttentionMetadata)
|
||||
from vllm_br.v0.worker.pooling_model_runner import (
|
||||
ModelInputForGPUWithPoolingMetadata, PoolingModelRunner)
|
||||
|
||||
_NUM_WARMUP_ITERS = 2
|
||||
|
||||
|
||||
def build_batch_input(batch_size, seq_len=256, device="supa"):
    """Build a synthetic prefill-only model input for graph capture.

    The batch holds ``batch_size`` sequences of ``seq_len`` tokens each,
    flattened into one token dimension. Token ids are random integers in
    ``[0, 200)``; positions restart at 0 for every sequence.

    Args:
        batch_size: Number of sequences in the synthetic batch.
        seq_len: Number of tokens per sequence.
        device: Device on which the input tensors are created.

    Returns:
        A ModelInputForGPUWithPoolingMetadata whose attention metadata
        describes ``batch_size`` prefills and no decodes, with
        ``pooling_metadata`` left as None.
    """
    total_tokens = batch_size * seq_len

    # Token ids are drawn one sequence at a time and then concatenated.
    per_sequence_tokens = [
        torch.randint(0, 200, (seq_len, ), device=device)
        for _ in range(batch_size)
    ]
    input_tokens = torch.cat(per_sequence_tokens)
    input_positions = torch.arange(seq_len, device=device).repeat(batch_size)

    # Kept as two distinct list objects, as the two fields are independent.
    seq_lens = [seq_len] * batch_size
    query_lens = [seq_len] * batch_size

    # Cumulative start offset of every sequence, plus the end offset.
    seq_start_loc = [idx * seq_len for idx in range(batch_size + 1)]
    query_start_loc = torch.tensor(seq_start_loc,
                                   dtype=torch.int32,
                                   device=device)
    context_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
    # Every slot is set to -1.
    slot_mapping = torch.full((total_tokens, ),
                              -1,
                              dtype=torch.int32,
                              device=device)
    seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32, device=device)
    # Created without an explicit device, exactly like the original code.
    block_table = torch.empty((batch_size, 0), dtype=torch.int32)

    attn_metadata = SUPAFlashAttentionMetadata(
        num_actual_tokens=total_tokens,
        max_query_len=seq_len,
        query_start_loc=query_start_loc,
        max_seq_len=seq_len,
        seq_lens=seq_lens,
        seq_lens_tensor=seq_lens_tensor,
        block_table=block_table,
        slot_mapping=slot_mapping,
        seq_start_loc=seq_start_loc,
        context_lens=context_lens,
        max_decode_seq_len=0,
        num_prefills=batch_size,
        num_decodes=0,
        num_prefills_tokens=total_tokens,
        do_cache=False,
        use_cascade=False,
        common_prefix_len=0,
        cu_prefix_query_lens=None,
        prefix_kv_lens=None,
        suffix_kv_lens=None,
        scheduler_metadata=0,
        prefix_scheduler_metadata=None,
        _cached_prefill_metadata=None,
        _cached_decode_metadata=None,
        local_attn_metadata=None)

    # One synthetic request id per sequence, each mapped to its own index.
    request_ids_to_seq_ids = {
        f'embd-{idx}': [idx]
        for idx in range(batch_size)
    }

    return ModelInputForGPUWithPoolingMetadata(
        input_tokens=input_tokens,
        inputs_embeds=None,
        input_positions=input_positions,
        token_types=None,
        seq_lens=seq_lens,
        query_lens=query_lens,
        lora_mapping=None,
        lora_requests=set(),
        attn_metadata=attn_metadata,
        prompt_adapter_mapping=None,
        prompt_adapter_requests=set(),
        multi_modal_kwargs={},
        request_ids_to_seq_ids=request_ids_to_seq_ids,
        finished_requests_ids=[],
        virtual_engine=0,
        async_callback=None,
        scheduler_outputs=None,
        previous_hidden_states=None,
        pooling_metadata=None)
|
||||
|
||||
|
||||
class SUPAWorker(LocalOrDistributedWorkerBase):
|
||||
"""A worker class that executes (a partition of) the model on a GPU.
|
||||
|
||||
Each worker is associated with a single GPU. The worker is responsible for
|
||||
maintaining the KV cache and executing the model on the GPU. In case of
|
||||
distributed inference, each worker is assigned a partition of the model.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    vllm_config: VllmConfig,
    local_rank: int,
    rank: int,
    distributed_init_method: str,
    is_driver_worker: bool = False,
    model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
) -> None:
    """Set up the worker state, its model runner and the optional profiler.

    Args:
        vllm_config: Configuration passed to ``WorkerBase.__init__``.
        local_rank: Stored as ``self.local_rank``.
        rank: Stored on the worker and on ``parallel_config.rank``.
        distributed_init_method: Stored for later distributed setup.
        is_driver_worker: Stored and forwarded to the model runner.
        model_runner_cls: Optional callable that receives the constructed
            model runner; its return value replaces ``self.model_runner``.
    """
    WorkerBase.__init__(self, vllm_config)
    self.parallel_config.rank = rank
    self.local_rank = local_rank
    self.rank = rank
    self.distributed_init_method = distributed_init_method
    self.is_driver_worker = is_driver_worker
    if self.model_config.trust_remote_code:
        # note: lazy import to avoid importing torch before initializing
        from vllm.utils import init_cached_hf_modules
        init_cached_hf_modules()

    # Ask the runner to return hidden states only when a speculative
    # config exists, the draft model type differs from the target model
    # type, and the draft model type is one of the listed types.
    speculative_config = self.speculative_config
    model_config = self.model_config
    speculative_args = {}
    if speculative_config is not None:
        draft_type = (
            speculative_config.draft_model_config.hf_config.model_type)
        hidden_state_draft_types = ("medusa", "mlp_speculator", "eagle",
                                    "deepseek_mtp", "mimo_mtp")
        if (draft_type != model_config.hf_config.model_type
                and draft_type in hidden_state_draft_types):
            speculative_args = {"return_hidden_states": True}

    # Pooling takes precedence over encoder-decoder; otherwise fall back
    # to the plain ModelRunner.
    runner_cls: Type[GPUModelRunnerBase]
    if model_config.runner_type == "pooling":
        runner_cls = PoolingModelRunner
    elif self.model_config.is_encoder_decoder:
        runner_cls = EncoderDecoderModelRunner
    else:
        runner_cls = ModelRunner
    self.model_runner: GPUModelRunnerBase = runner_cls(
        vllm_config=self.vllm_config,
        kv_cache_dtype=self.cache_config.cache_dtype,
        is_driver_worker=is_driver_worker,
        **speculative_args,
    )
    if model_runner_cls is not None:
        self.model_runner = model_runner_cls(self.model_runner)

    # Uninitialized cache engine. Will be initialized by
    # initialize_cache.
    self.cache_engine: List[CacheEngine]
    # Initialize gpu_cache as pooling models don't initialize kv_caches
    self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
    self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}

    # Buffers saved before sleep
    self._sleep_saved_buffers: Dict[str, torch.Tensor] = {}

    # Torch profiler. Enabled and configured through env vars:
    # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
    torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
    if not torch_profiler_trace_dir:
        self.profiler = None
    else:
        logger.info(
            "Profiling enabled. Traces will be saved to: %s",
            torch_profiler_trace_dir,
        )
        trace_handler = torch.profiler.tensorboard_trace_handler(
            torch_profiler_trace_dir, use_gzip=True)
        profile_schedule = torch.profiler.schedule(wait=0,
                                                   warmup=0,
                                                   active=1,
                                                   repeat=1)
        self.profiler = torch.profiler.profile(
            on_trace_ready=trace_handler,
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.SUPA,  # type: ignore
            ],
            schedule=profile_schedule,
            profile_memory=False,
            record_shapes=True,
            with_stack=False,
            use_supa_simple=True,  # type: ignore
        )
|
||||
|
||||
def start_profile(self):
    """Start the torch profiler.

    Raises:
        RuntimeError: If no profiler is set on this worker.
    """
    profiler = self.profiler
    if profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    profiler.start()
|
||||
|
||||
def stop_profile(self):
    """Stop the torch profiler.

    Raises:
        RuntimeError: If no profiler is set on this worker.
    """
    profiler = self.profiler
    if profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    profiler.stop()
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
    """Sleep-mode entry point; not implemented by this worker.

    Args:
        level: Requested sleep level (not read).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|
||||
|
||||
def wake_up(self, tags: Optional[list[str]] = None) -> None:
    """Wake-up entry point; not implemented by this worker.

    Args:
        tags: Optional list of tags (not read).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|
||||
|
||||
def init_device(self):
    """Bind this worker to its SUPA device and set up distributed state.

    Selects ``supa:<local_rank>``, records the free device memory and a
    baseline MemorySnapshot, initializes the distributed environment and
    finally seeds the random number generators.

    Raises:
        RuntimeError: If the configured device type is not ``"supa"``.
    """
    # Reject unsupported devices up front.
    if self.device_config.device.type != "supa":
        raise RuntimeError(
            f"Not support device type: {self.device_config.device}")

    self.device = torch.device(f"supa:{self.local_rank}")
    SUPAPlatform.set_device(self.device)

    _check_if_gpu_supports_dtype(self.model_config.dtype)
    # Release cached memory before the baseline measurements below.
    gc.collect()
    SUPAPlatform.empty_cache()
    free_bytes = SUPAPlatform.mem_get_info()[0]
    self.init_gpu_memory = free_bytes
    self.baseline_snapshot = MemorySnapshot()

    # Initialize the distributed environment.
    init_worker_distributed_environment(self.vllm_config, self.rank,
                                        self.distributed_init_method,
                                        self.local_rank)
    # Set random seed.
    set_random_seed(self.model_config.seed)
|
||||
|
||||
def load_model(self):
    """Load the model and, if enabled, pre-capture SUPA graphs.

    After the model runner has loaded the model, graph capture runs when
    the ``ENABLE_VLLM_BR_GRAPH_MODE`` environment variable is set to
    anything other than ``false``, ``0`` or the empty string
    (case-insensitive). One graph is captured per batch size in 1..8, each
    on a synthetic input of 256 tokens per sequence produced by
    ``build_batch_input``. The graphs, their input objects and their
    outputs are stored on the model runner as ``graphs``, ``graph_inputs``
    and ``graph_outputs``, keyed by batch size.

    Raises:
        NotImplementedError: If sleep mode is enabled in the model config.
    """
    if self.vllm_config.model_config.enable_sleep_mode:
        raise NotImplementedError('SUPA do not support sleep mode')
    else:
        # Sleep mode is unsupported, so loading runs under a no-op context.
        from contextlib import nullcontext
        context = nullcontext()
    with context:
        self.model_runner.load_model()

    ### capture graphs ###
    if os.getenv('ENABLE_VLLM_BR_GRAPH_MODE',
                 'False').lower() not in {'false', '0', ''}:
        logger.info("Start capturing graphs...")
        # The flag lives on the model runner so capture happens only once
        # per runner instance.
        if not hasattr(self.model_runner, "graph_captured"):
            self.model_runner.graph_captured = False
        if not self.model_runner.graph_captured:
            # Support capturing graphs under multiple batch sizes.
            batch_sizes = [1, 2, 3, 4, 5, 6, 7, 8]
            self.model_runner.graphs = {}
            self.model_runner.graph_inputs = {}
            self.model_runner.graph_outputs = {}
            for bs in batch_sizes:
                # NOTE(review): the stream and event attributes below are
                # reassigned on every iteration, so only the objects from
                # the last batch size stay on the runner -- confirm that
                # this is intended.
                if self.model_runner.parallel_config.world_size != 1:
                    # prevent SCCL capturing by using the same stream with SCCL
                    self.model_runner.graph_stream = torch.distributed.get_group_stream(
                        get_world_group().device_group)
                else:
                    self.model_runner.graph_stream = torch_br.supa.Stream()

                self.model_runner.default_stream = torch.supa.default_stream(
                )
                self.model_runner.copy_done_event = torch_br.supa.Event()
                self.model_runner.graph_done_event = torch_br.supa.Event()
                graph = torch.supa.SUPAGraph()
                # Synthetic input for this batch size; it is stored in
                # graph_inputs[bs] once capture finishes.
                self.model_runner.model_input_in = build_batch_input(
                    bs, seq_len=256, device=self.device)
                self.model_runner.intermediate_tensors = None

                model_executable = self.model_runner.model
                multi_modal_kwargs = self.model_runner.model_input_in.multi_modal_kwargs or {}
                # Request bookkeeping kwargs are passed only when the
                # runner reports has_inner_state.
                seqlen_agnostic_kwargs = {
                    "finished_requests_ids":
                    self.model_runner.model_input_in.finished_requests_ids,
                    "request_ids_to_seq_ids":
                    self.model_runner.model_input_in.
                    request_ids_to_seq_ids,
                } if self.model_runner.has_inner_state else {}

                cross_enc_kwargs = {}
                if self.model_runner.model_input_in.token_types is not None:
                    cross_enc_kwargs[
                        "token_type_ids"] = self.model_runner.model_input_in.token_types

                # Run the model a few times without capturing the graph.
                # This is to make sure that the captured graph does not include the
                # kernel launches for initial benchmarking (e.g., Triton autotune).
                # Note one iteration is not enough for torch.compile
                for _ in range(_NUM_WARMUP_ITERS):
                    with set_forward_context(
                            self.model_runner.model_input_in.attn_metadata,
                            self.model_runner.vllm_config, self.
                            model_runner.model_input_in.virtual_engine):
                        model_executable(
                            input_ids=self.model_runner.model_input_in.
                            input_tokens,
                            positions=self.model_runner.model_input_in.
                            input_positions,
                            intermediate_tensors=None,
                            **MultiModalKwargs.as_kwargs(
                                multi_modal_kwargs,
                                dtype=self.model_runner.model_config.dtype,
                                device=self.model_runner.device,
                            ),
                            **cross_enc_kwargs,
                            **seqlen_agnostic_kwargs,
                        )
                # Wait for the warm up operations to finish before proceeding with
                # Graph Capture.
                torch.supa.synchronize()

                # Capture one forward pass on graph_stream under the same
                # forward context that the warm-up runs used.
                with torch.supa.graph(
                    graph, stream=self.model_runner.graph_stream), \
                    set_forward_context(
                        self.model_runner.model_input_in.attn_metadata,
                        self.model_runner.vllm_config, self.
                        model_runner.model_input_in.virtual_engine):
                    hidden_or_intermediate_states = model_executable(
                        input_ids=self.model_runner.model_input_in.
                        input_tokens,
                        positions=self.model_runner.model_input_in.
                        input_positions,
                        intermediate_tensors=self.model_runner.
                        intermediate_tensors,
                        **MultiModalKwargs.as_kwargs(
                            multi_modal_kwargs,
                            dtype=self.model_runner.model_config.dtype,
                            device=self.model_runner.device,
                        ),
                        **cross_enc_kwargs,
                        **seqlen_agnostic_kwargs,
                    )
                torch.supa.synchronize()
                # Register the graph together with the input object it was
                # captured on and the output it produced.
                self.model_runner.graphs[bs] = graph
                self.model_runner.graph_inputs[
                    bs] = self.model_runner.model_input_in
                self.model_runner.graph_outputs[
                    bs] = hidden_or_intermediate_states
            self.model_runner.graph_captured = True
        logger.info("capturing graphs Done.")
|
||||
|
||||
def save_sharded_state(
    self,
    path: str,
    pattern: Optional[str] = None,
    max_size: Optional[int] = None,
) -> None:
    """Forward a sharded-state save request to the model runner.

    Args:
        path: Passed through as the first positional argument.
        pattern: Passed through as the ``pattern`` keyword.
        max_size: Passed through as the ``max_size`` keyword.
    """
    runner = self.model_runner
    runner.save_sharded_state(path, pattern=pattern, max_size=max_size)
|
||||
|
||||
def save_tensorized_model(
    self,
    tensorizer_config: TensorizerConfig,
) -> None:
    """Forward a tensorized-model save request to the model runner.

    Args:
        tensorizer_config: Passed through as the ``tensorizer_config``
            keyword.
    """
    runner = self.model_runner
    runner.save_tensorized_model(tensorizer_config=tensorizer_config)
|
||||
|
||||
@torch.inference_mode()
def determine_num_available_blocks(self) -> Tuple[int, int]:
    """Profile a dummy run and derive how many KV cache blocks fit.

    The memory budget for this instance is the total device memory times
    ``cache_config.gpu_memory_utilization``. Whatever remains after
    subtracting the profiled non-KV-cache memory is divided by the cache
    block size to get the device block count; the CPU block count comes
    from ``cache_config.swap_space_bytes``.

    Returns:
        ``(num_gpu_blocks, num_cpu_blocks)``, both clamped to be
        non-negative and both 0 when the cache block size is 0.
    """
    # Drop cached allocations so they do not skew the measurement.
    SUPAPlatform.empty_cache()

    _free_gpu_memory, total_gpu_memory = SUPAPlatform.mem_get_info()

    # Execute a forward pass with dummy inputs to profile the memory usage
    # of the model.
    with memory_profiling(
            self.baseline_snapshot,
            weights_memory=self.model_runner.model_memory_usage) as result:
        self.model_runner.profile_run()

    self._assert_memory_footprint_increased_during_profiling()

    memory_for_current_instance = (
        total_gpu_memory * self.cache_config.gpu_memory_utilization)
    available_kv_cache_memory = (memory_for_current_instance -
                                 result.non_kv_cache_memory)

    # Convert the memory budgets into block counts; a zero block size
    # yields zero blocks instead of a division error.
    num_gpu_blocks = 0
    num_cpu_blocks = 0
    cache_block_size = self.get_cache_block_size_bytes()
    if cache_block_size != 0:
        num_gpu_blocks = max(
            int(available_kv_cache_memory // cache_block_size), 0)
        num_cpu_blocks = max(
            int(self.cache_config.swap_space_bytes // cache_block_size), 0)

    msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n"
           "the current vLLM instance can use "
           "total_gpu_memory "
           f"({(total_gpu_memory / GiB_bytes):.2f}GiB)"
           " x gpu_memory_utilization "
           f"({self.cache_config.gpu_memory_utilization:.2f})"
           f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n"
           "model weights take "
           f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
           " non_torch_memory takes "
           f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
           " PyTorch activation peak memory takes "
           f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
           " the rest of the memory reserved for KV Cache is "
           f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")

    logger.info(msg)
    # Final cleanup
    gc.collect()

    return num_gpu_blocks, num_cpu_blocks
|
||||
|
||||
def _assert_memory_footprint_increased_during_profiling(self):
    """Assert that more device memory is in use now than at the baseline.

    Compares the currently used memory (total minus free, as reported by
    ``SUPAPlatform.mem_get_info``) with the ``supa_memory`` value of the
    baseline snapshot.
    """
    # NOTE(woosuk): Here we assume that the other processes using the same
    # GPU did not change their memory usage during the profiling.
    free_bytes, total_bytes = SUPAPlatform.mem_get_info()
    supa_memory = total_bytes - free_bytes
    assert supa_memory > self.baseline_snapshot.supa_memory, (
        "Error in memory profiling. "
        f"Initial used memory {self.baseline_snapshot.supa_memory}, "
        f"currently used memory {supa_memory}. "
        f"This happens when the GPU memory was "
        "not properly cleaned up before initializing the vLLM instance.")
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    """Record the block counts, build the cache engines and warm up.

    Args:
        num_gpu_blocks: Number of device blocks; validated by
            ``raise_if_cache_size_invalid`` before being stored.
        num_cpu_blocks: Number of CPU blocks to store on the cache config.

    Raises:
        NotImplementedError: If sleep mode is enabled in the model config.
    """
    cache_config = self.cache_config
    raise_if_cache_size_invalid(
        num_gpu_blocks, cache_config.block_size,
        cache_config.is_attention_free, self.model_config.max_model_len,
        self.parallel_config.pipeline_parallel_size)

    cache_config.num_gpu_blocks = num_gpu_blocks
    cache_config.num_cpu_blocks = num_cpu_blocks

    if self.vllm_config.model_config.enable_sleep_mode:
        raise NotImplementedError('SUPA do not support sleep mode')

    # Without sleep mode the cache engine is created under a no-op context.
    from contextlib import nullcontext
    with nullcontext():
        self._init_cache_engine()
    self._warm_up_model()
|
||||
|
||||
def _init_cache_engine(self):
    """Create one CacheEngine per pipeline stage and bind their caches.

    Populates ``self.cache_engine`` and ``self.gpu_cache`` and passes the
    caches to ``bind_kv_cache`` together with the static forward context.
    """
    assert self.cache_config.num_gpu_blocks is not None
    num_stages = self.parallel_config.pipeline_parallel_size
    self.cache_engine = [
        CacheEngine(self.cache_config, self.model_config,
                    self.parallel_config, self.device_config)
        for _ in range(num_stages)
    ]
    # Collect each engine's device cache, in engine order.
    self.gpu_cache = [engine.gpu_cache for engine in self.cache_engine]
    bind_kv_cache(self.compilation_config.static_forward_context,
                  self.gpu_cache)
|
||||
|
||||
def _warm_up_model(self) -> None:
    """Run dummy passes for the compile sizes and optionally capture graphs.

    Unless ``enforce_eager`` is set, sizes that also appear in
    ``vllm_config.cuda_graph_sizes`` are skipped during warm-up and
    ``capture_model`` is invoked afterwards. The random seed is reset at
    the end.
    """
    # warm up sizes that are not in cudagraph capture sizes,
    # but users still want to compile for better performance,
    # e.g. for the max-num-batched token size in chunked prefill.
    compile_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
    eager_only = self.model_config.enforce_eager
    if eager_only:
        warmup_sizes = compile_sizes
    else:
        warmup_sizes = [
            size for size in compile_sizes
            if size not in self.vllm_config.cuda_graph_sizes
        ]

    # Largest size first.
    for size in sorted(warmup_sizes, reverse=True):
        logger.info("Compile and warming up model for size %d", size)
        self.model_runner._dummy_run(size)

    if not eager_only:
        self.model_runner.capture_model(self.gpu_cache)

    # Reset the seed to ensure that the random state is not affected by
    # the model initialization and profiling.
    set_random_seed(self.model_config.seed)
|
||||
|
||||
@property
def do_metadata_broadcast(self) -> bool:
    """Whether ``tensor_parallel_size`` is greater than one."""
    tp_size = self.parallel_config.tensor_parallel_size
    return tp_size > 1
|
||||
|
||||
@property
def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
    """The worker's ``gpu_cache`` attribute, returned unchanged."""
    cache = self.gpu_cache
    return cache
|
||||
|
||||
@torch.inference_mode()
def prepare_worker_input(
        self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
    """Convert an execute-model request into a WorkerInput.

    The three block lists of the request are turned into int64 tensors
    viewed as ``(-1, 2)``. The swap lists are created on the CPU and the
    copy list on this worker's device.

    Args:
        execute_model_req: The request to convert.

    Returns:
        A WorkerInput carrying the block tensors, the number of sequence
        groups, the virtual engine and the number of steps.
    """

    def _as_pairs(blocks, device):
        # int64 tensor with one two-element row per entry.
        return torch.tensor(blocks, device=device,
                            dtype=torch.int64).view(-1, 2)

    virtual_engine = execute_model_req.virtual_engine
    num_steps = execute_model_req.num_steps
    num_seq_groups = len(execute_model_req.seq_group_metadata_list)

    # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors.
    # they contain parameters to launch cudamemcpyasync.
    swap_in_pairs = _as_pairs(execute_model_req.blocks_to_swap_in, "cpu")
    swap_out_pairs = _as_pairs(execute_model_req.blocks_to_swap_out, "cpu")
    # `blocks_to_copy` is a gpu tensor. The src and tgt of
    # blocks to copy are in the same device, and `blocks_to_copy`
    # can be used directly within cuda kernels.
    copy_pairs = _as_pairs(execute_model_req.blocks_to_copy, self.device)

    return WorkerInput(
        num_seq_groups=num_seq_groups,
        blocks_to_swap_in=swap_in_pairs,
        blocks_to_swap_out=swap_out_pairs,
        blocks_to_copy=copy_pairs,
        virtual_engine=virtual_engine,
        num_steps=num_steps,
    )
|
||||
|
||||
@torch.inference_mode()
def execute_worker(self, worker_input: WorkerInput) -> None:
    """Issue the swap-in, swap-out and copy cache operations, in that order.

    An operation is skipped when its block tensor is None or empty. The
    cache engine is only looked up when there is work to do, so a call
    with no pending blocks never touches ``self.cache_engine``.

    Args:
        worker_input: Carries the block tensors and the virtual engine
            index that selects the cache engine.
    """
    virtual_engine = worker_input.virtual_engine

    # Issue cache operations.
    swap_in_blocks = worker_input.blocks_to_swap_in
    if swap_in_blocks is not None and swap_in_blocks.numel() > 0:
        self.cache_engine[virtual_engine].swap_in(swap_in_blocks)

    swap_out_blocks = worker_input.blocks_to_swap_out
    if swap_out_blocks is not None and swap_out_blocks.numel() > 0:
        self.cache_engine[virtual_engine].swap_out(swap_out_blocks)

    copy_blocks = worker_input.blocks_to_copy
    if copy_blocks is not None and copy_blocks.numel() > 0:
        self.cache_engine[virtual_engine].copy(copy_blocks)
|
||||
|
||||
def _get_cached_seq_group_metadata(
        self,
        seq_group_metadata_list: List[Union[SequenceGroupMetadata,
                                            SequenceGroupMetadataDelta]],
        finished_request_ids: List[str]) -> List[SequenceGroupMetadata]:
    """Resolve full metadata or deltas against the per-request cache.

    The scheduler may send only a delta for a request it has sent before.
    Each entry is either applied to the cached metadata (delta) or stored
    as the new cached metadata (full snapshot). Finished requests are
    removed from the cache afterwards.

    Args:
        seq_group_metadata_list: Full metadata objects or deltas.
        finished_request_ids: Request ids to delete from the cache; each
            must currently be cached.

    Returns:
        The cached, up-to-date metadata object for every input entry, in
        input order.
    """
    cache = self._seq_group_metadata_cache
    resolved = []
    for entry in seq_group_metadata_list:
        request_id = entry.request_id
        if request_id in cache and isinstance(entry,
                                              SequenceGroupMetadataDelta):
            # A delta for a request that is already cached.
            cache[request_id].apply_delta(entry)
        else:
            # Either the first prefill, or a snapshot sent again after a
            # preemption; in both cases the cache entry is (re)set from
            # scratch.
            assert isinstance(entry, SequenceGroupMetadata)
            cache[request_id] = entry

        resolved.append(cache[request_id])

    # Clean up finished ids
    for finished_id in finished_request_ids:
        del cache[finished_id]

    return resolved
|
||||
|
||||
def _execute_model_spmd(
    self,
    execute_model_req: ExecuteModelRequest,
    intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Optional[List[SamplerOutput]]:
    """Resolve cached metadata, then defer to the parent implementation.

    Args:
        execute_model_req: The request to run; when not None, its
            ``seq_group_metadata_list`` is replaced in place by the
            resolved metadata.
        intermediate_tensors: Forwarded unchanged to the parent method.

    Returns:
        Whatever the parent ``_execute_model_spmd`` returns.
    """
    if execute_model_req is not None:
        resolved_metadata = self._get_cached_seq_group_metadata(
            execute_model_req.seq_group_metadata_list,
            execute_model_req.finished_requests_ids)
        execute_model_req.seq_group_metadata_list = resolved_metadata

    return super()._execute_model_spmd(execute_model_req,
                                       intermediate_tensors)
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
    """Forward the LoRA request to the model runner's ``add_lora``."""
    runner = self.model_runner
    return runner.add_lora(lora_request)
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
    """Forward the LoRA id to the model runner's ``remove_lora``."""
    runner = self.model_runner
    return runner.remove_lora(lora_id)
|
||||
|
||||
def pin_lora(self, lora_id: int) -> bool:
    """Forward the LoRA id to the model runner's ``pin_lora``."""
    runner = self.model_runner
    return runner.pin_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> Set[int]:
    """Return the result of the model runner's ``list_loras``."""
    runner = self.model_runner
    return runner.list_loras()
|
||||
|
||||
def add_prompt_adapter(
        self, prompt_adapter_request: PromptAdapterRequest) -> bool:
    """Forward the request to the model runner's ``add_prompt_adapter``."""
    runner = self.model_runner
    return runner.add_prompt_adapter(prompt_adapter_request)
|
||||
|
||||
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
    """Remove a prompt adapter via the model runner.

    Args:
        prompt_adapter_id: Id of the prompt adapter to remove.

    Returns:
        The result of the model runner's ``remove_prompt_adapter``.
    """
    # Use the prompt-adapter API like the add/pin/list siblings do; this
    # method previously passed the prompt adapter id to remove_lora().
    return self.model_runner.remove_prompt_adapter(prompt_adapter_id)
|
||||
|
||||
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
    """Forward a prompt-adapter pin request to the model runner.

    Returns:
        The value returned by ``self.model_runner.pin_prompt_adapter``.
    """
    runner = self.model_runner
    return runner.pin_prompt_adapter(prompt_adapter_id)
|
||||
|
||||
def list_prompt_adapters(self) -> Set[int]:
    """Return the prompt-adapter ids reported by the model runner.

    Returns:
        The value returned by ``self.model_runner.list_prompt_adapters``.
    """
    runner = self.model_runner
    return runner.list_prompt_adapters()
|
||||
|
||||
@property
def max_model_len(self) -> int:
    """Maximum model length, read from ``self.model_config``."""
    config = self.model_config
    return config.max_model_len
|
||||
|
||||
@property
def vocab_size(self) -> int:
    """Vocabulary size, read from ``self.model_runner``."""
    runner = self.model_runner
    return runner.vocab_size
|
||||
|
||||
def get_cache_block_size_bytes(self) -> int:
    """Return the size of a KV cache block in bytes.

    The value is computed by ``CacheEngine.get_cache_block_size`` from this
    worker's cache, model and parallel configs.
    """
    compute_block_size = CacheEngine.get_cache_block_size
    return compute_block_size(
        self.cache_config,
        self.model_config,
        self.parallel_config,
    )
|
||||
|
||||
|
||||
def init_worker_distributed_environment(
    vllm_config: VllmConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
    local_rank: int = -1,
) -> None:
    """Initialize the distributed environment for a worker.

    Steps, in order:
      1. Enable or disable custom all-reduce according to the parallel
         config's ``disable_custom_all_reduce`` flag.
      2. Initialize the distributed environment with the configured world
         size, this worker's ranks and the init method.
      3. Ensure model parallelism is initialized with the configured
         tensor- and pipeline-parallel sizes.
      4. Ensure KV transfer is initialized for ``vllm_config``.
    """
    par_cfg = vllm_config.parallel_config

    custom_all_reduce_enabled = not par_cfg.disable_custom_all_reduce
    set_custom_all_reduce(custom_all_reduce_enabled)

    # NOTE(review): "sccl" is presumably the communication backend name
    # expected by init_distributed_environment -- confirm against its
    # signature.
    init_distributed_environment(
        par_cfg.world_size,
        rank,
        distributed_init_method,
        local_rank,
        "sccl",
    )

    tp_size = par_cfg.tensor_parallel_size
    pp_size = par_cfg.pipeline_parallel_size
    ensure_model_parallel_initialized(tp_size, pp_size)

    ensure_kv_transfer_initialized(vllm_config)
|
||||
|
||||
|
||||
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
|
||||
# Check if the GPU supports the dtype.
|
||||
# TODO: add checkers
|
||||
return
|
||||
|
||||
|
||||
def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
                                max_model_len, pipeline_parallel_size) -> None:
    """Validate the number of allocated KV cache blocks.

    Raises:
        ValueError: If an attention-free model has a non-zero number of
            blocks, if a model that is not attention-free has no blocks,
            or if ``max_model_len`` exceeds the number of tokens that fit
            in the cache, computed as
            ``block_size * (num_gpu_blocks // pipeline_parallel_size)``.
    """
    if is_attention_free:
        if num_gpu_blocks != 0:
            raise ValueError(
                "No memory should be allocated for the cache blocks "
                f"for an attention-free model, but {num_gpu_blocks} "
                "blocks are allocated.")
    elif num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")

    # The blocks are divided across pipeline stages before computing the
    # token capacity.
    blocks_per_stage = num_gpu_blocks // pipeline_parallel_size
    max_seq_len = block_size * blocks_per_stage

    # The capacity check only applies to models that use attention.
    if is_attention_free:
        return
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) "
            "is larger than the maximum number of tokens that can be "
            f"stored in KV cache ({max_seq_len}). Try increasing "
            "`gpu_memory_utilization` or decreasing `max_model_len` when "
            "initializing the engine.")
|
||||
Reference in New Issue
Block a user